@classmethod
def from_corpus(cls, corpus, vocab_size):
    # build a vocabulary over the corpus, keep the `vocab_size` most
    # frequent tokens, and shuffle them before constructing the model
    vocab = Vocabulary()
    for token in corpus:
        vocab.add(token)
    vocab_subset = vocab.get_topk_subset(vocab_size)
    vocab_subset.shuffle()
    return cls(vocab_subset)
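# A minimal sketch of the Vocabulary interface that from_corpus assumes; the
# real class is not shown here, and get_topk_subset/shuffle are inferred from
# their call sites above.
import random
from collections import Counter


class Vocabulary(object):
    def __init__(self):
        self.counts = Counter()

    def add(self, token):
        self.counts[token] += 1

    def get_topk_subset(self, k):
        # keep only the k most frequent tokens
        subset = Vocabulary()
        subset.counts = Counter(dict(self.counts.most_common(k)))
        return subset

    def shuffle(self):
        # randomize iteration order, e.g. before assigning integer ids
        items = list(self.counts.items())
        random.shuffle(items)
        self.counts = Counter(dict(items))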
import json
import os
from os import path

import numpy as np
from tqdm import tqdm
# Vocabulary, hash_word, and glorot_uniform come from the surrounding project
# (glorot_uniform here behaves like the Keras 1.x initializer: it returns a
# tensor variable whose .eval() yields a numpy array).


def make_hash_embeddings(igor, vocab):
    assert os.path.exists(igor.target_glove), "You need to specify a real file"
    fileiter = open(igor.target_glove).readlines()

    # hash vocabulary: every word maps to several hash-bucket ids
    hash_vocab = Vocabulary()
    hash_vocab.use_mask = True
    hash_vocab.add(hash_vocab.mask_symbol)
    hash_vocab.add(hash_vocab.unk_symbol)
    word2hash = {}
    for word, v_id in vocab.items():
        ids = hash_vocab.add_many(hash_word(word))
        word2hash[v_id] = ids

    # accumulate each GloVe vector into its word's buckets, splitting the
    # vector evenly across the word's hash ids
    embeddings = np.zeros((len(hash_vocab), igor.embedding_size))
    remaining_vocab = set(vocab.keys())
    remaining_hashes = set(hash_vocab.values())
    for line in tqdm(fileiter):
        line = line.replace("\n", "").split(" ")
        word, nums = line[0], [float(x.strip()) for x in line[1:]]
        if word in remaining_vocab:
            hash_ids = word2hash[vocab[word]]
            remaining_vocab.remove(word)
            remaining_hashes.difference_update(hash_ids)
            embeddings[hash_ids] += np.array(nums) / len(hash_ids)
    print("{} words were not seen. {} hashes were not seen".format(
        len(remaining_vocab), len(remaining_hashes)))

    # buckets never touched by a GloVe vector get a random Glorot-uniform init
    for hash_id in remaining_hashes:
        embeddings[hash_id] = np.asarray(glorot_uniform((igor.embedding_size,)).eval())

    glove_name = igor.target_glove[igor.target_glove.find("glove"):].replace("/", "")
    hash_vocab.save('hash_embedding_{}.vocab'.format(glove_name))
    with open(path.join(igor.save_dir, "hash_embedding_{}.npy".format(glove_name)), "wb") as fp:
        np.save(fp, embeddings)
    with open(path.join(igor.save_dir, "word2hash_{}.json".format(glove_name)), "w") as fp:
        json.dump(word2hash, fp)
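# hash_word is not defined in this snippet. In the hash-embedding scheme it
# maps a word to a small set of bucket tokens via independent hash functions.
# A minimal sketch, assuming seeded hashes and a fixed bucket count
# (num_hashes and num_buckets are hypothetical parameters):
import hashlib


def hash_word(word, num_hashes=3, num_buckets=10000):
    buckets = []
    for seed in range(num_hashes):
        # seeded, stable hash of the word
        digest = hashlib.md5("{}:{}".format(seed, word).encode("utf-8")).hexdigest()
        buckets.append("HASH_{}_{}".format(seed, int(digest, 16) % num_buckets))
    return buckets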
for q, qid in zip(questions, qids):
    if qid not in seen_qid:
        seen_qid.add(qid)
        unique_questions.append(q)
# print len(unique_questions), len(questions)

# treat the document collection as all answers plus the unique questions
docs = answers + unique_questions
# document frequency of each word
word2dfs = compute_dfs(docs)
# print word2dfs.items()[:10]

# create the vocabulary
vocabulary = Vocabulary(start_feature_id=0)
vocabulary.add('UNKNOWN_WORD_IDX')
add_to_vocab(answers, vocabulary)
add_to_vocab(questions, vocabulary)

basename = os.path.basename(train)
cPickle.dump(vocabulary, open(os.path.join(outdir, 'vocab.pickle'), 'wb'))
# print "vocabulary", len(vocabulary)
dummy_word_idx = vocabulary.fid

# longest question and answer lengths (in tokens)
q_max_length = max(map(len, questions))
a_max_length = max(map(len, answers))
print q_max_length, a_max_length
'''
for fname in [train, dev, test]:
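# compute_dfs is defined elsewhere. A minimal sketch of the document-frequency
# computation implied by the call site, assuming each doc is a list of tokens:
from collections import Counter


def compute_dfs(docs):
    # number of documents in which each word occurs at least once
    word2dfs = Counter()
    for doc in docs:
        for word in set(doc):
            word2dfs[word] += 1
    return dict(word2dfs)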
word_vocab = Vocabulary(
    os.path.join(args.wiki_preprocess, 'word_vocab.txt'))
entity_vocab = Vocabulary(
    os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))
print(f"# word in dataset: {len(word_vocab)}")
print(f"# entity in dataset: {len(entity_vocab)}")

# extend the word vocabulary with every word that appears in an inlink
path = os.path.join(args.wiki_preprocess, 'inlinks.txt')
with open(path, 'r') as f:
    for line in tqdm(f, leave=False, dynamic_ncols=True,
                     desc="Read inlinks"):
        links = json.loads(line)
        for word in links['inlinks']:
            word_vocab.add(word)
print(f"# word in dataset + inlinks: {len(word_vocab)}")

# keep only words that have a Wikipedia2Vec vector, rewriting word_vocab.txt
wiki2vec = Wikipedia2Vec.load(args.wiki2vec)
inwiki_words_num = 0
word_vecs = []
word_vocab_path = os.path.join(args.wiki_preprocess, 'word_vocab.txt')
with open(word_vocab_path, 'w') as f:
    for word in tqdm(sorted(list(word_vocab)), leave=False,
                     dynamic_ncols=True, desc="Filter in-wiki words"):
        try:
            vec = wiki2vec.get_word_vector(word)
            word_vecs.append(vec)
            f.write(word + "\n")
        except KeyError:
            # assumed handling: get_word_vector raises KeyError for
            # out-of-vocabulary words, so skip them
            continue
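# The original breaks off here. A plausible continuation (assumed, not from
# the source) would stack the surviving vectors into an embedding matrix and
# save it next to the rewritten vocabulary; 'word_embed.npy' is a hypothetical
# filename.
import numpy as np

word_embed = np.vstack(word_vecs).astype(np.float32)
np.save(os.path.join(args.wiki_preprocess, 'word_embed.npy'), word_embed)
print(f"# word with Wikipedia2Vec vector: {len(word_vecs)}")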