Example #1
    @classmethod
    def from_corpus(cls, corpus, vocab_size):
        # count every token in the corpus, keep the vocab_size most frequent,
        # shuffle that subset, and build the instance from it
        vocab = Vocabulary()
        for token in corpus:
            vocab.add(token)
        vocab_subset = vocab.get_topk_subset(vocab_size)
        vocab_subset.shuffle()
        return cls(vocab_subset)
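
For reference, here is a minimal sketch of the Vocabulary interface this example relies on (add, get_topk_subset, shuffle, __len__). The class body below is hypothetical and Counter-backed, meant only to show how the same operations could be exercised on a token list; it is not the project's implementation.

from collections import Counter
import random

class Vocabulary:
    # Hypothetical stand-in: only the method names come from the example above.
    def __init__(self):
        self.counts = Counter()

    def add(self, token):
        self.counts[token] += 1

    def get_topk_subset(self, k):
        # keep the k most frequent tokens, preserving their counts
        subset = Vocabulary()
        subset.counts = Counter(dict(self.counts.most_common(k)))
        return subset

    def shuffle(self):
        # randomize iteration order; a real implementation would also reassign ids
        items = list(self.counts.items())
        random.shuffle(items)
        self.counts = Counter(dict(items))

    def __len__(self):
        return len(self.counts)

corpus = "the cat sat on the mat and the dog sat too".split()
vocab = Vocabulary()
for token in corpus:
    vocab.add(token)
top = vocab.get_topk_subset(3)
top.shuffle()
print(len(top))  # -> 3
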
Example #2
def make_hash_embeddings(igor, vocab):
    assert os.path.exists(igor.target_glove), "You need to specify a real file"
    fileiter = open(igor.target_glove).readlines()

    hash_vocab = Vocabulary()
    hash_vocab.use_mask = True
    hash_vocab.add(hash_vocab.mask_symbol)
    hash_vocab.add(hash_vocab.unk_symbol)
    word2hash = {}
    for word, v_id in vocab.items():
        ids = hash_vocab.add_many(hash_word(word))
        word2hash[v_id] = ids

    embeddings = np.zeros((len(hash_vocab), igor.embedding_size))
    remaining_vocab = set(vocab.keys())
    remaining_hashes = set(hash_vocab.values())
    for line in tqdm(fileiter):
        line = line.replace("\n","").split(" ")
        word, nums = line[0], [float(x.strip()) for x in line[1:]]
        word_hash = hash_word(word)
        if word in remaining_vocab:
            hash_ids = word2hash[vocab[word]]
            remaining_vocab.remove(word)
            remaining_hashes.difference_update(hash_ids)
            embeddings[hash_ids] += np.array(nums) / len(hash_ids)
    print("{} words were not seen.  {} hashes were not seen".format(len(remaining_vocab),
                                                                    len(remaining_hashes)))
    for hash_id in remaining_hashes:
        embeddings[hash_id] = np.asarray(glorot_uniform((igor.embedding_size,)).eval())

    glove_name = igor.target_glove[igor.target_glove.find("glove"):].replace("/","")

    hash_vocab.save('hash_embedding_{}.vocab'.format(glove_name))
    with open(path.join(igor.save_dir, "hash_embedding_{}.npy".format(glove_name)), "wb") as fp:
        np.save(fp, embeddings)
    with open(path.join(igor.save_dir, "word2hash.json".format(glove_name)), "w") as fp:
        json.dump(word2hash, fp)
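
hash_word here is project-specific and not shown; a common choice, and only an assumption here, is hashing character n-grams into a fixed number of buckets so that each word maps to several ids. The sketch below illustrates that idea together with the even split of a word vector across its buckets, mirroring the embeddings[hash_ids] += np.array(nums) / len(hash_ids) line above. NUM_BUCKETS and EMBEDDING_SIZE are illustrative values.

import zlib
import numpy as np

NUM_BUCKETS = 1000       # illustrative bucket count, not from the example
EMBEDDING_SIZE = 50

def hash_word(word, n=3, num_buckets=NUM_BUCKETS):
    # hash the word's character trigrams into deduplicated bucket ids
    padded = "#" + word + "#"
    ngrams = {padded[i:i + n] for i in range(max(1, len(padded) - n + 1))}
    return sorted({zlib.crc32(g.encode()) % num_buckets for g in ngrams})

embeddings = np.zeros((NUM_BUCKETS, EMBEDDING_SIZE))
vec = np.random.randn(EMBEDDING_SIZE)        # stands in for one GloVe row
hash_ids = hash_word("vocabulary")
# each bucket receives an equal share of the word's vector
embeddings[hash_ids] += vec / len(hash_ids)
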
Example #3
        for q, qid in zip(questions, qids):
            if qid not in seen_qid:
                seen_qid.add(qid)
                unique_questions.append(q)
        # print len(unique_questions), len(questions)

        # consider the document collection to be all answers plus the unique questions
        docs = answers + unique_questions

        #stores document frequency of each word 
        word2dfs = compute_dfs(docs)
        # print word2dfs.items()[:10]

        #creating vocabulary
        vocabulary = Vocabulary(start_feature_id=0)
        vocabulary.add('UNKNOWN_WORD_IDX')
        add_to_vocab(answers, vocabulary)
        add_to_vocab(questions, vocabulary)

        basename = os.path.basename(train)
        with open(os.path.join(outdir, 'vocab.pickle'), 'wb') as fp:
            cPickle.dump(vocabulary, fp)
        # print "vocabulary", len(vocabulary)

        dummy_word_idx = vocabulary.fid

        #longest answer and question length
        q_max_length = max(map(lambda x: len(x), questions))
        a_max_length = max(map(lambda x: len(x), answers))
        print q_max_length, a_max_length
        '''
        for fname in [train, dev, test]:
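
compute_dfs and add_to_vocab are helpers from the surrounding project and are not shown above. A plausible document-frequency helper, purely as an assumption about what compute_dfs does, would count each word at most once per document:

from collections import defaultdict

def compute_dfs(docs):
    # docs is assumed to be a list of token lists
    word2df = defaultdict(int)
    for doc in docs:
        for word in set(doc):        # count each word once per document
            word2df[word] += 1
    return dict(word2df)

docs = [["what", "is", "df"], ["df", "counts", "documents"]]
print(compute_dfs(docs)["df"])  # -> 2
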
Example #4
    word_vocab = Vocabulary(
        os.path.join(args.wiki_preprocess, 'word_vocab.txt'))
    entity_vocab = Vocabulary(
        os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))
    print(f"# word in dataset: {len(word_vocab)}")
    print(f"# entity in dataset: {len(entity_vocab)}")

    path = os.path.join(args.wiki_preprocess, 'inlinks.txt')
    with open(path, 'r') as f:
        for line in tqdm(f,
                         leave=False,
                         dynamic_ncols=True,
                         desc="Read inlniks"):
            links = json.loads(line)
            for word in links['inlinks']:
                word_vocab.add(word)
    print(f"# word in dataset + inlinks: {len(word_vocab)}")

    wiki2vec = Wikipedia2Vec.load(args.wiki2vec)
    inwiki_words_num = 0
    word_vecs = []
    word_vocab_path = os.path.join(args.wiki_preprocess, 'word_vocab.txt')
    with open(word_vocab_path, 'w') as f:
        for word in tqdm(sorted(list(word_vocab)),
                         leave=False,
                         dynamic_ncols=True,
                         desc="Filter in-wiki words"):
            try:
                vec = wiki2vec.get_word_vector(word)
                word_vecs.append(vec)
                f.write(word + "\n")
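
The try block above is presumably paired with an except KeyError, since Wikipedia2Vec's get_word_vector raises KeyError for words absent from the pretrained model, and the loop skips those words. Below is a self-contained sketch of the same filter-and-stack pattern with a toy lookup in place of wiki2vec.get_word_vector; all names here are illustrative.

import numpy as np

pretrained = {"paris": np.ones(3), "tokyo": np.full(3, 2.0)}

def lookup(word):
    # stand-in for wiki2vec.get_word_vector: raises KeyError for unknown words
    return pretrained[word]

word_vocab = ["paris", "atlantis", "tokyo"]
kept_words, word_vecs = [], []
for word in sorted(word_vocab):
    try:
        vec = lookup(word)
    except KeyError:
        continue                     # skip words without a pretrained vector
    kept_words.append(word)
    word_vecs.append(vec)

embedding_matrix = np.stack(word_vecs)   # shape: (len(kept_words), dim)
print(kept_words, embedding_matrix.shape)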