def test_vocab():
    user_vocab = Vocab(vocab_file='data/prediction/embeddings/user_vecs.vocab')
    print(f'user vocab length: {len(user_vocab)}')
    print([user_vocab[i] for i in range(5)])
    user_vectors = np.load('data/prediction/embeddings/user_vecs.npy')
    # prepend two zero rows so the vector matrix lines up with the vocab indices
    user_vectors = np.concatenate(
        (np.zeros((2, user_vectors.shape[1]), dtype=float), user_vectors), axis=0)
    print(f'user vectors shape: {user_vectors.shape}')
    print('-' * 30)

    sub_vocab = Vocab(vocab_file='data/prediction/embeddings/sub_vecs.vocab')
    print(f'sub vocab length: {len(sub_vocab)}')
    print([sub_vocab[i] for i in range(5)])
    sub_vectors = np.load('data/prediction/embeddings/sub_vecs.npy')
    sub_vectors = np.concatenate(
        (np.zeros((2, sub_vectors.shape[1]), dtype=float), sub_vectors), axis=0)
    print(f'sub vectors shape: {sub_vectors.shape}')
    print('-' * 30)

    words, word_vectors = load_glove_emb(
        'data/prediction/embeddings/glove_word_embeds.txt')
    word_vectors = np.concatenate(
        (np.zeros((2, word_vectors.shape[1]), dtype=float), word_vectors), axis=0)
    word_vocab = Vocab(words=words)
    print(f'word vocab length: {len(word_vocab)}')
    print([word_vocab[i] for i in range(5)])
    print(f'word vectors shape: {word_vectors.shape}')
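# A minimal sketch of why two zero rows are prepended above: they can back reserved
# indices (e.g. PAD and UNK) in an embedding layer. Treating index 0 as PAD and index 1
# as UNK is an assumption for illustration, not confirmed by the snippet; the random
# matrix stands in for user_vecs.npy.
import numpy as np
import torch
import torch.nn as nn

pretrained = np.random.rand(1000, 300)
weights = np.concatenate((np.zeros((2, 300)), pretrained), axis=0)
embedding = nn.Embedding.from_pretrained(
    torch.tensor(weights, dtype=torch.float), padding_idx=0, freeze=False)
print(embedding(torch.tensor([0, 1, 2])).shape)  # torch.Size([3, 300])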
def build_vocab(name, dataset_list, cache_path, word_vec_path=None, feat_dim=None):
    logging.info(' building a language model...')
    if not os.path.exists(cache_path):
        lang_model = Vocab(name)
        for dataset in dataset_list:
            logging.info(' indexing words from {}'.format(dataset.lmdb_dir))
            index_words(lang_model, dataset.lmdb_dir)

        if word_vec_path is not None:
            lang_model.load_word_vectors(word_vec_path, feat_dim)

        with open(cache_path, 'wb') as f:
            pickle.dump(lang_model, f)
    else:
        logging.info(' loaded from {}'.format(cache_path))
        with open(cache_path, 'rb') as f:
            lang_model = pickle.load(f)

        if word_vec_path is None:
            lang_model.word_embedding_weights = None
        elif lang_model.word_embedding_weights.shape[0] != lang_model.n_words:
            logging.warning(' failed to load word embedding weights. check this')
            assert False

    return lang_model
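# The build-or-load caching idiom used above, as a self-contained sketch; the cache
# file name and the trivial stand-in builder are illustrative assumptions.
import os
import pickle

def load_or_build(cache_path, build_fn):
    # build once, pickle to disk, and reuse the pickle on later runs
    if not os.path.exists(cache_path):
        obj = build_fn()
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f)
    else:
        with open(cache_path, 'rb') as f:
            obj = pickle.load(f)
    return obj

counts = load_or_build('counts.pkl', lambda: {'hello': 3, 'world': 1})
print(counts)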
def __init__(self, vocab_path, data_path_list, max_length):
    super(Dataset, self).__init__()
    self.vocab_path = vocab_path
    self.data_path_list = data_path_list
    self.max_length = max_length
    self.data = None
    self.vocab = Vocab(self.vocab_path)
    self._prepareData()
def build_vocab(self, embed_file):
    word_counts = Counter()
    count_words(word_counts, [src + tgt for src, tgt in self.pairs])
    vocab = Vocab()
    # keep only the most frequent words, capped at config.max_vocab_size
    for word, count in word_counts.most_common(config.max_vocab_size):
        vocab.add_words([word])
    if embed_file is not None:
        count = vocab.load_embeddings(embed_file)
        print('%d pre-trained embeddings loaded.' % count)
    return vocab
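# A self-contained sketch of the counting step above with collections.Counter;
# count_words in the original project presumably does something equivalent, and the
# sample pairs are made up for illustration.
from collections import Counter

pairs = [(['the', 'cat'], ['sat']), (['the', 'dog'], ['ran'])]
word_counts = Counter()
for src, tgt in pairs:
    word_counts.update(src + tgt)
print(word_counts.most_common(3))  # [('the', 2), ('cat', 1), ('sat', 1)]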
def build_model(config):
    vocab = Vocab(config['vocab'])
    device = config['device']

    model = OCR(len(vocab), config)
    model = model.to(device)

    return model, vocab
def build_model(config):
    vocab = Vocab(config['vocab'])
    device = config['device']

    model = VietOCR(len(vocab),
                    config['backbone'],
                    config['cnn'],
                    config['transformer'],
                    config['seq_modeling'])
    model = model.to(device)

    return model, vocab
def _make_speaker_model(self, lmdb_dir, cache_path):
    logging.info(' building a speaker model...')
    speaker_model = Vocab('vid', insert_default_tokens=False)

    lmdb_env = lmdb.open(lmdb_dir, readonly=True, lock=False)
    txn = lmdb_env.begin(write=False)
    cursor = txn.cursor()
    for key, value in cursor:
        video = pyarrow.deserialize(value)
        vid = video['vid']
        speaker_model.index_word(vid)

    lmdb_env.close()
    logging.info(' indexed %d videos' % speaker_model.n_words)
    self.speaker_model = speaker_model

    # cache
    with open(cache_path, 'wb') as f:
        pickle.dump(self.speaker_model, f)
# sampling
if settings.DOWN_SAMPLING:
    train_df_burst = train_df[train_df['label'] == 'burst']
    train_df_non_burst = train_df[train_df['label'] == 'non-burst']
    # down-sample the majority (non-burst) class to LABEL_RATIO times the burst count
    train_df = shuffle(
        pd.concat((train_df_non_burst.sample(
            n=int(len(train_df_burst) * settings.LABEL_RATIO)), train_df_burst),
            ignore_index=True))
    # print(len(train_df[train_df['label'] == 'burst']))
    # print(len(train_df[train_df['label'] == 'non-burst']))

# load vocab
user_vocab = Vocab(vocab_file=settings.USER_VOCAB_FN)
sub_vocab = Vocab(vocab_file=settings.SUB_VOCAB_FN)
words, word_vectors = load_glove_emb(fn=settings.GLOVE_EMBEDDING_FN)
word_vocab = Vocab(words=list(range(len(words))), additional_terms=False)
label_vocab = Vocab(words=['non-burst', 'burst'], additional_terms=False)

# make dataset
train_ds = RedditDataset(
    df=train_df,
    user_vocab=user_vocab,
    sub_vocab=sub_vocab,
    word_vocab=word_vocab,
    label_vocab=label_vocab,
    content_col='content',
)
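# The down-sampling pattern above as a self-contained sketch; the toy frame and a 1:1
# label ratio are illustrative assumptions, not values from the original settings.
import pandas as pd
from sklearn.utils import shuffle

df = pd.DataFrame({'label': ['burst'] * 2 + ['non-burst'] * 8, 'content': range(10)})
burst = df[df['label'] == 'burst']
non_burst = df[df['label'] == 'non-burst']
balanced = shuffle(
    pd.concat((non_burst.sample(n=len(burst)), burst), ignore_index=True))
print(balanced['label'].value_counts())  # 2 burst, 2 non-burst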
class Dataset(Dataset):
    """
    A dataset basically supports iteration over all the examples it contains.
    We currently support only text data with this class.
    This class inherits the Dataset class in torch.utils.data.
    """

    def __init__(self, vocab_path, data_path_list, max_length):
        super(Dataset, self).__init__()
        self.vocab_path = vocab_path
        self.data_path_list = data_path_list
        self.max_length = max_length
        self.data = None
        self.vocab = Vocab(self.vocab_path)
        self._prepareData()

    def __getitem__(self, index):
        item_list = []
        for item in self.data[index]:
            item_list.append(self.vocab.sentence_to_indices(item))
            item_list.append(len(item))
        return item_list

    def __len__(self):
        return len(self.data)

    def _prepareData(self):
        data = self._readData()
        print("Read {} sentence pairs".format(len(data)))
        data = self._filterDatas(data)
        print("Trim data to {} sentence pairs \n".format(len(data)))
        print("[*] Success to preprocess data! \n")
        self.data = data

    def _readData(self):
        print("[*] Reading lines...")
        # Read each file and split it into lines of whitespace-separated tokens
        lines_list = [[self._preprocessing(l).split(' ')
                       for l in open(file_path, 'r', encoding='utf-8').readlines()]
                      for file_path in self.data_path_list]
        data = list(zip(*lines_list))

        # Print statistics
        for i, lines in enumerate(lines_list):
            print("Avg length of data {} : {:.2f}".format(
                i, sum([len(l) for l in lines]) / len(data)))
        print()
        return data

    def _preprocessing(self, s):
        return s.strip().lower()

    def _filterDatas(self, data):
        data = [d for d in data if self._chkMaxLength(d)]
        return data

    def _chkMaxLength(self, p):
        return (len(p[0]) <= self.max_length and len(p[1]) <= self.max_length
                and len(p[0]) > 0 and len(p[1]) > 0)

    def getInstanceSize(self):
        return len(self.data)

    def getVocabSize(self):
        return len(self.vocab)
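# Items returned by __getitem__ above have variable lengths, so a DataLoader would need
# padding at collate time. A hedged sketch with torch.nn.utils.rnn.pad_sequence; the
# dummy index lists stand in for vocab.sentence_to_indices output.
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [torch.tensor([4, 9, 2]), torch.tensor([7, 1]), torch.tensor([3, 5, 6, 8])]
padded = pad_sequence(batch, batch_first=True, padding_value=0)
lengths = torch.tensor([len(seq) for seq in batch])
print(padded.shape, lengths)  # torch.Size([3, 4]) tensor([3, 2, 4])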
)

if not os.path.exists(settings.TRAIN_DF_DUMP):
    with open(settings.TRAIN_DF_DUMP, mode='wb') as f:
        pickle.dump(train, f)
if not os.path.exists(settings.TEST_DF_DUMP):
    with open(settings.TEST_DF_DUMP, mode='wb') as f:
        pickle.dump(test, f)
if not os.path.exists(settings.DEV_DF_DUMP):
    with open(settings.DEV_DF_DUMP, mode='wb') as f:
        pickle.dump(dev, f)

print('Loading vocab...')
user_vocab = Vocab(vocab_file=settings.USER_VOCAB_FN)
sub_vocab = Vocab(vocab_file=settings.SUB_VOCAB_FN)
words, word_vecs = load_glove_emb(fn=settings.GLOVE_EMBEDDING_FN)
word_vocab = Vocab(words=list(range(len(words))))
label_vocab = Vocab(words=['non-burst', 'burst'])

ds = RedditDataset(
    df=cross_label_tokenized,
    user_vocab=user_vocab,
    sub_vocab=sub_vocab,
    word_vocab=word_vocab,
    label_vocab=label_vocab,
    content_col='content',
)
# ds_it = iter(ds)
# Data Path Arguments
parser.add_argument(
    '--acronyms_fn',
    default='../expansion_etl/data/derived/prototype_acronym_expansions.csv')
parser.add_argument(
    '--semgroups_fn',
    default='../expansion_etl/data/original/umls_semantic_groups.txt')

# Model Distribution Hyperparameters
parser.add_argument('--document_topic_prior', type=float, default=1.0)
parser.add_argument('--topic_expansion_prior', type=float, default=1.0)

args = parser.parse_args()

semgroup_vocab = Vocab('semgroups')
with open(args.semgroups_fn, 'r') as semgroup_fd:
    # each line is pipe-delimited; field 1 holds the semantic group name
    semgroup_vocab.add_tokens(
        list(map(lambda x: x.strip().split('|')[1], semgroup_fd)))

acronyms = pd.read_csv(args.acronyms_fn)
sf_vocab = {}
sfs = acronyms['sf'].unique()
print('Creating expansion vocabularies for {} short forms'.format(len(sfs)))
for sf in sfs:
    sf_vocab[sf] = Vocab(sf)
    sf_vocab[sf].add_tokens(acronyms[acronyms['sf'] == sf]['lf'].tolist())
    print('\tVocabulary size of {} for {}'.format(sf_vocab[sf].size(), sf))

# Model Dimensions & Hyperparameters
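# A tiny sketch of the pipe-delimited parse applied to the semantic groups file above;
# the sample line mimics the UMLS SemGroups layout and is illustrative only.
line = 'ACTI|Activities & Behaviors|T052|Activity\n'
group_name = line.strip().split('|')[1]
print(group_name)  # Activities & Behaviors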