def main():
    # Paths for the raw training text, vocab outputs, and pretrained GloVe vectors.
    input_file = "data/train.txt"
    vocab_file = "data/vocab"
    embedding_file = "data/glove.npz"
    glove_file = "data/glove.840B.300d.txt"
    dict_file = "data/dict.p"
    max_vocab_size = int(5e4)  # cap the vocabulary at 50,000 entries
    Vocab.build_vocab(input_file, vocab_file, dict_file, glove_file,
                      embedding_file, max_vocab_size)
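# Hedged usage sketch (not part of the original script): how the artifact
# written by Vocab.build_vocab above might be loaded back into a PyTorch
# embedding layer. The npz key name 'embedding' is an assumption; adjust it
# to whatever key build_vocab actually saves under.
import numpy as np
import torch
import torch.nn as nn

def load_glove_embedding(path="data/glove.npz", key="embedding", freeze=False):
    weights = np.load(path)[key]  # expected shape: (vocab_size, 300) for glove.840B.300d
    return nn.Embedding.from_pretrained(torch.from_numpy(weights).float(), freeze=freeze)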
if __name__ == '__main__':
    vocab_num = 100000
    pubmed_w2v_path = 'pubmed_w2v.txt'
    emb_path = 'emb_cnn.pt'

    opt = Options(config_vocab=False)
    pubmedreader = PubMedReader(opt)

    print('loading text data')
    train_sents, train_labels, test_sents, test_labels, valid_sents, valid_labels = \
        pubmedreader.get_data()

    print('read vocab')
    # Restrict the vocabulary to tokens that have a pretrained PubMed vector.
    fixed_vocab_set = read_vocab(pubmed_w2v_path)
    print('fixed vocab set size {}'.format(len(fixed_vocab_set)))

    print('build vocab')
    vocab = Vocab.build_vocab(train_sents, fixed_vocab_set=fixed_vocab_set)
    # vocab.append_sents(valid_sents, fixed_vocab_set=fixed_vocab_set)
    vocab.append_sents(test_sents, fixed_vocab_set=fixed_vocab_set)
    # print('vocab size {} before shrink'.format(vocab.vocab_len))
    vocab.shrink_vocab(2)  # presumably drops tokens below a count threshold of 2
    print('vocab size {} after shrink'.format(vocab.vocab_len))

    print('read vec')
    # Load pretrained vectors in vocabulary index order.
    word_list = [vocab.idx2word[i] for i in range(len(vocab.idx2word))]
    vec = read_vec(pubmed_w2v_path, word_list)
    assert vec.shape[0] == vocab.vocab_len

    print('build emb layer')
    # The original line is truncated after the first argument; completing it
    # with the pretrained vector width as the embedding dimension is an assumption.
    emb = Embedding(vocab.vocab_len, vec.shape[1])
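# Hedged sketch of the helpers the script above assumes (read_vocab / read_vec).
# Their original implementations are not shown; this sketch assumes
# pubmed_w2v.txt is in the standard word2vec text format: an optional
# "<count> <dim>" header line, then one "<token> <v1> <v2> ..." entry per line.
import numpy as np

def read_vocab(path):
    """Collect the set of tokens that have a pretrained vector."""
    vocab = set()
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) > 2:  # skip a "<count> <dim>" header line
                vocab.add(parts[0])
    return vocab

def read_vec(path, word_list):
    """Build an embedding matrix aligned with word_list; unseen words get zeros."""
    table = {}
    dim = None
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) <= 2:
                continue
            table[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
            dim = len(parts) - 1
    assert dim is not None, 'no vectors found in ' + path
    vec = np.zeros((len(word_list), dim), dtype=np.float32)
    for i, w in enumerate(word_list):
        if w in table:
            vec[i] = table[w]
    return vec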