def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object. Augments this with an UNK embedding
    that is the 0 vector. Reads in all embeddings with no filtering -- you should only use this for relativized
    word embedding files.
    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    f = open(embeddings_file)
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful if you need to pad inputs to a fixed length later
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    for line in f:
        if line.strip() != "":
            space_idx = line.find(' ')
            word = line[:space_idx]
            numbers = line[space_idx + 1:]
            float_numbers = [float(number_str) for number_str in numbers.split()]
            vector = np.array(float_numbers)
            word_indexer.add_and_get_index(word)
            # Append the PAD and UNK vectors first. This has to happen inside the loop because we need to read
            # the first line of the file to see what the embedding dim is
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    f.close()
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))
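# Several snippets in this section (including the one above) rely on an Indexer that
# maintains a bijection between objects and dense integer ids. This is a minimal sketch
# inferred from how it is used here (add_and_get_index / get_object / len), not the
# original implementation:
class Indexer:
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __len__(self):
        return len(self.objs_to_ints)

    def add_and_get_index(self, obj, add=True):
        # Return the id for obj, assigning the next free id on first sight;
        # with add=False, unseen objects return -1 instead of being added
        if obj not in self.objs_to_ints:
            if not add:
                return -1
            new_idx = len(self.objs_to_ints)
            self.objs_to_ints[obj] = new_idx
            self.ints_to_objs[new_idx] = obj
        return self.objs_to_ints[obj]

    def get_object(self, index):
        # Inverse lookup: id -> object (None if unseen)
        return self.ints_to_objs.get(index)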
def __init__(self, args, reduced_size=None, info={}):
    super(CNN, self).__init__()
    # disc_type=DISC_TYPE_MATRIX
    self.disc_type = disc_type = args.disc_type
    self.layer1 = nn.Sequential(
        nn.Conv2d(1, 4, kernel_size=2, padding=0),
        nn.ReLU())  # 1,4,3,3
    self.layer2 = nn.Sequential(
        nn.Conv2d(4, 8, kernel_size=2),
        nn.ReLU())  # 1,8,2,2 -- but for 5 lines, it is 1,8,3,3
    if args.data_type == "sonnet_endings":
        self.scorer = nn.Linear(2 * 2 * 8, 1)
    elif args.data_type == "limerick":
        self.scorer = nn.Linear(3 * 3 * 8, 1)
    self.predictor = nn.Sigmoid()
    self.args = args
    self.use_cuda = args.use_cuda

    ##
    self.g_indexer = Indexer(args)
    self.g_indexer.load('tmp/tmp_' + args.g2p_model_name + '/solver_g_indexer')
    self.g2pmodel = Model(H=info['H'],
                          args=args,
                          i_size=self.g_indexer.w_cnt,
                          o_size=self.g_indexer.w_cnt,
                          start_idx=self.g_indexer.w2idx[utils.START])
    if not args.learn_g2p_encoder_from_scratch:
        print("=====" * 7, "LOADING g2p ENCODER PRETRAINED")
        model_dir = 'tmp/tmp_' + args.g2p_model_name + '/'
        state_dict_best = torch.load(model_dir + 'model_best')
        self.g2pmodel.load_state_dict(state_dict_best)
    if not args.trainable_g2p:
        assert not args.learn_g2p_encoder_from_scratch
        for param in self.g2pmodel.parameters():
            param.requires_grad = False
def from_word_vectors(cls, word_vectors, unique_labels):
    """Instantiate the vectorizer"""
    review_vocab = word_vectors
    rating_vocab = Indexer()
    # Add ratings
    for label in unique_labels:
        rating_vocab.add_and_get_index(label)
    return cls(review_vocab, rating_vocab)
def __init__(self, max_word_length):
    vocab = [chr(ord('a') + i) for i in range(26)] + [' ']
    self.char_vocab_index = Indexer()
    self.char_vocab_index.add_and_get_index(PAD_TOKEN)  # PAD is 0
    self.char_vocab_index.add_and_get_index(UNK_TOKEN)  # UNK is 1
    for char in vocab:
        self.char_vocab_index.add_and_get_index(char)
    self.max_word_length = max_word_length
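# A hypothetical helper (not from the source) showing how the character indexer above
# might be used: convert a word to a fixed-length list of char ids, with characters
# outside the a-z/space vocab mapped to UNK (id 1) and PAD (id 0) filling the remainder.
# It assumes the add_and_get_index(obj, add=False) lookup form from the Indexer sketch
# earlier in this section.
def word_to_char_ids(char_tokenizer, word):
    char_ids = []
    for char in word.lower()[:char_tokenizer.max_word_length]:
        idx = char_tokenizer.char_vocab_index.add_and_get_index(char, add=False)
        char_ids.append(idx if idx >= 0 else 1)  # fall back to UNK
    # Pad with PAD (0) up to max_word_length
    char_ids += [0] * (char_tokenizer.max_word_length - len(char_ids))
    return char_ids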
def __init__(self, args):
    super().__init__()
    self.args = args
    self.pad_token_id = args.pad_token_id

    # Initialize embedding layer (1)
    self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)

    # Initialize char embedding layer
    self.char_embedding = nn.Embedding(args.char_vocab_size, args.char_embedding_dim)

    # Initialize Context2Query (2)
    self.aligned_att = AlignedAttention(args.embedding_dim, args.char_embedding_dim)

    rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

    # Initialize passage encoder (3)
    self.passage_rnn = rnn_cell(
        args.embedding_dim * 2,
        args.hidden_dim,
        bidirectional=args.bidirectional,
        batch_first=True,
    )

    # Initialize question encoder (4)
    self.question_rnn = rnn_cell(
        args.embedding_dim,
        args.hidden_dim,
        bidirectional=args.bidirectional,
        batch_first=True,
    )

    self.dropout = nn.Dropout(self.args.dropout)

    # Adjust hidden dimension if bidirectional RNNs are used
    _hidden_dim = (args.hidden_dim * 2 if args.bidirectional else args.hidden_dim)

    # Initialize attention layer for question attentive sum (5)
    self.question_att = SpanAttention(_hidden_dim)

    # Initialize bilinear layer for start positions (6)
    self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)

    # Initialize bilinear layer for end positions (7)
    self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)

    # Initialize char indexer
    vocab = [chr(ord('a') + i) for i in range(26)] + [' ']
    self.char_vocab_index = Indexer()
    for char in vocab:
        self.char_vocab_index.add_and_get_index(char)
def load(self, specialTokenList=None):
    indexer = Indexer(specialTokenList)
    print("... loading training data.")
    trainPairs, trainLens = self._load_pairs(indexer,
                                             self.dataDict['train_source'],
                                             self.dataDict['train_target'])
    print("... loading test data.")
    testPairs, testLens = self._load_pairs(indexer,
                                           self.dataDict['test_source'],
                                           self.dataDict['test_target'])
    print("Done!\n")
    return indexer, trainPairs, trainLens, testPairs, testLens
def generate_indexer(usr_dataset, usr_bm_tg, feature_begin, feature_end):
    logging.info('generating indexer ...')
    indexer = Indexer(['user', 'tag', 'bookmark'])
    min_time = 1e30
    max_time = -1

    for line in usr_dataset[1:]:
        line_items = line.split('\t')
        contact_timestamp = float(line_items[2]) / 1000
        min_time = min(min_time, contact_timestamp)
        max_time = max(max_time, contact_timestamp)
        if feature_begin < contact_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('user', line_items[1])

    for line in usr_bm_tg[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('bookmark', line_items[1])
            indexer.index('tag', line_items[2])

    with open('delicious/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Bookmarks: %d\n' % indexer.indices['bookmark'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Contact: %d\n' % len(usr_dataset))
        output.write('#Save: %d\n' % len(usr_bm_tg))
        output.write('#Attach: %d\n' % len(usr_bm_tg))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer
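# The delicious snippet above, and the dblp/movielens snippets below, use a different,
# typed Indexer that keeps a separate id space per node type. This is a minimal sketch
# inferred from the calls made here (index / get_index / indices / mapping), not the
# original implementation:
class Indexer:
    def __init__(self, node_types):
        self.mapping = {t: {} for t in node_types}  # type -> {key: id}
        self.indices = {t: 0 for t in node_types}   # type -> next free id, doubling as a count

    def index(self, node_type, key):
        # Get-or-create: assign the next id within this type's id space
        if key not in self.mapping[node_type]:
            self.mapping[node_type][key] = self.indices[node_type]
            self.indices[node_type] += 1
        return self.mapping[node_type][key]

    def get_index(self, node_type, key):
        # Lookup only; returns None for unseen keys
        return self.mapping[node_type].get(key)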
from solver import Solver
from preprocess.tacotron.utils import spectrogram2wav
#from preprocess.tacotron.audio import inv_spectrogram, save_wav
from scipy.io.wavfile import write
from preprocess.tacotron.mcep import mc2wav

if __name__ == '__main__':
    feature = 'sp'
    hps = Hps()
    hps.load('./hps/v19.json')
    hps_tuple = hps.get_tuple()
    solver = Solver(hps_tuple, None)
    solver.load_model('/storage/model/voice_conversion/v19/model.pkl-59999')
    if feature == 'mc':
        # indexer to extract data
        indexer = Indexer()
        src_mc = indexer.index(speaker_id='225', utt_id='366', dset='test', feature='norm_mc')
        tar_mc = indexer.index(speaker_id='226', utt_id='366', dset='test', feature='norm_mc')
        expand_src_mc = np.expand_dims(src_mc, axis=0)
        expand_tar_mc = np.expand_dims(tar_mc, axis=0)
        src_mc_tensor = torch.from_numpy(expand_src_mc).type(torch.FloatTensor)
        tar_mc_tensor = torch.from_numpy(expand_tar_mc).type(torch.FloatTensor)
        c1 = Variable(torch.from_numpy(np.array([0]))).cuda()
        c2 = Variable(torch.from_numpy(np.array([1]))).cuda()
        results = [src_mc]
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True
tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")
train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]
X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []
def __init__(self):
    self.indexer = Indexer()
def generate_papers(datafile, feature_begin, feature_end, observation_begin, observation_end, conf_list):
    logging.info('generating papers ...')
    # try:
    #     result = pickle.load(open('dblp/data/papers_%s.pkl' % path, 'rb'))
    #     return result
    # except IOError:
    #     pass

    indexer = Indexer(['author', 'paper', 'term', 'venue'])

    index, authors, title, year, venue = None, None, None, None, None
    references = []

    write = 0
    cite = 0
    include = 0
    published = 0

    min_year = 3000
    max_year = 0

    papers_feature_window = []
    papers_observation_window = []

    with open(datafile) as file:
        dataset = file.read().splitlines()

    for line in dataset:
        if not line:
            if year and venue:
                year = int(year)
                if year > 0 and authors and venue in conf_list:
                    min_year = min(min_year, year)
                    max_year = max(max_year, year)
                    authors = authors.split(',')
                    terms = parse_term(title)
                    write += len(authors)
                    cite += len(references)
                    include += len(terms)
                    published += 1

                    p = Paper(year)
                    if feature_begin < year <= feature_end:
                        p.id = indexer.index('paper', index)
                        p.terms = [indexer.index('term', term) for term in terms]
                        p.references = [indexer.index('paper', paper_id) for paper_id in references]
                        p.authors = [indexer.index('author', author_name) for author_name in authors]
                        p.venue = indexer.index('venue', venue)
                        bisect.insort(papers_feature_window, p)
                    elif observation_begin < year <= observation_end:
                        p.references = references
                        p.authors = authors
                        papers_observation_window.append(p)

            index, authors, title, year, venue = None, None, None, None, None
            references = []
        else:
            begin = line[1]
            if begin == '*':
                title = line[2:]
            elif begin == '@':
                authors = line[2:]
            elif begin == 't':
                year = line[2:]
            elif begin == 'c':
                venue = line[2:]
            elif begin == 'i':
                index = line[6:]
            elif begin == '%':
                references.append(line[2:])

    for p in papers_observation_window:
        authors = []
        references = []
        for author in p.authors:
            author_id = indexer.get_index('author', author)
            if author_id is not None:
                authors.append(author_id)
        for ref in p.references:
            paper_id = indexer.get_index('paper', ref)
            if paper_id is not None:
                references.append(paper_id)
        p.authors = authors
        p.references = references

    with open('dblp/data/metadata_%s.txt' % path, 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Authors: %d\n' % indexer.indices['author'])
        output.write('#Papers: %d\n' % indexer.indices['paper'])
        output.write('#Venues: %d\n' % indexer.indices['venue'])
        output.write('#Terms: %d\n' % indexer.indices['term'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Write: %d\n' % write)
        output.write('#Cite: %d\n' % cite)
        output.write('#Publish: %d\n' % published)
        output.write('#Contain: %d\n' % include)
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % min_year)
        output.write('To: %s\n' % max_year)

    result = papers_feature_window, papers_observation_window, indexer.indices
    # pickle.dump(result, open('dblp/data/papers_%s.pkl' % path, 'wb'))
    return result
contains a header line and 45463 data lines, each line includes a mId and its overview (some sentences).
'''
movies.to_csv("processed_data/overviews.csv", columns=['mId', 'overview'], index=False)
movies.to_csv("processed_data/mId2Title.csv", columns=['mId', 'tmdbId', 'title'], index=False)

'''
create genres
mId2Genre: 45463 lines, each line includes (mId, num of genres, gIds)
Genre2Id: 20 lines, each line includes (gId, genre name)
gId ranges from 45843 to 45862
'''
f = open("processed_data/mId2Genre.txt", "w")
genreIdx = Indexer()
for idx, row in movies.iterrows():
    mId, raw_genres = row['mId'], row['genres']
    raw_genres = raw_genres.replace("\'", "\"")
    genres_l = json.loads(raw_genres)
    f.write("%d %d" % (mId, len(genres_l)))
    for g in genres_l:
        f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
    f.write("\n")
f.close()

f = open("processed_data/Genre2Id.txt", "w")
num_genres = len(genreIdx)
for i in range(num_genres):
    f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
f.close()
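# A hypothetical reader (not in the source) for the mId2Genre.txt format written above,
# where each line is "mId num_genres gId1 gId2 ...":
def read_mid2genre(path):
    mid2genres = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            mid, num = int(parts[0]), int(parts[1])
            mid2genres[mid] = [int(gid) for gid in parts[2:2 + num]]
    return mid2genres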
def generate_indexer(user_rates_movies_ds, user_tags_movies_ds, movie_actor_ds, movie_director_ds,
                     movie_genre_ds, movie_countries_ds, feature_begin, feature_end):
    logging.info('generating indexer ...')
    min_time = 1e30
    max_time = -1
    indexer = Indexer(['user', 'tag', 'movie', 'actor', 'director', 'genre', 'country'])

    for line in user_rates_movies_ds[1:]:
        line_items = line.split('\t')
        rating_timestamp = float(line_items[3]) / 1000
        min_time = min(min_time, rating_timestamp)
        max_time = max(max_time, rating_timestamp)
        rating = float(line_items[2])
        if feature_begin < rating_timestamp <= feature_end and rating > rating_threshold:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])

    for line in user_tags_movies_ds[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])
            indexer.index('tag', line_items[2])

    for line in movie_actor_ds[1:]:
        line_items = line.split('\t')
        ranking = int(line_items[3])
        if ranking < actor_threshold and line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('actor', line_items[1])

    for line in movie_director_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('director', line_items[1])

    for line in movie_genre_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('genre', line_items[1])

    for line in movie_countries_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('country', line_items[1])

    with open('movielens/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Movies: %d\n' % indexer.indices['movie'])
        output.write('#Actors: %d\n' % indexer.indices['actor'])
        output.write('#Directors: %d\n' % indexer.indices['director'])
        output.write('#Genres: %d\n' % indexer.indices['genre'])
        output.write('#Countries: %d\n' % indexer.indices['country'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Rate: %d\n' % len(user_rates_movies_ds))
        output.write('#Attach: %d\n' % len(user_tags_movies_ds))
        output.write('#Played_by: %d\n' % len(movie_actor_ds))
        output.write('#Directed_by: %d\n' % len(movie_director_ds))
        output.write('#Has: %d\n' % len(movie_genre_ds))
        output.write('#Produced_in: %d\n' % len(movie_countries_ds))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer