Example 1
def load_voca_embs(voca_path, embs_path):
    """Load a vocabulary and its embedding matrix, keeping them aligned.

    Args:
        voca_path: path to a serialized Vocabulary.
        embs_path: path to a ``.npy`` embedding matrix (one row per word).

    Returns:
        (voca, embs) tuple with ``embs.shape[0] == voca.size()``.

    Raises:
        Exception: if the embedding row count cannot be reconciled with
            the vocabulary size.
    """
    voca = Vocabulary.load(voca_path)
    embs = np.load(embs_path)

    # If the matrix is exactly one row short, the vocabulary presumably
    # carries an extra <unk> entry -- TODO confirm; synthesize its
    # embedding as the mean of all known vectors.
    if embs.shape[0] == voca.size() - 1:
        print(embs.shape)
        unk_emb = np.mean(embs, axis=0, keepdims=True)
        embs = np.append(embs, unk_emb, axis=0)
    elif embs.shape[0] != voca.size():
        print(embs.shape, voca.size())
        # Fixed typo ("differnt") and stray trailing space in the message.
        raise Exception("embeddings and vocabulary have different number of items")

    return voca, embs
Example 2
def read_ent2id(ent_dic_path):
    """Load the entity dictionary and return its word-to-id mapping."""
    print('load ent dic from', ent_dic_path)
    entity_vocab = Vocabulary.load(ent_dic_path)
    return entity_vocab.word2id
Example 3
import sys
from nel.vocabulary import Vocabulary
import nel.utils as utils
import numpy as np

if __name__ == "__main__":
    # CLI: <core vocabulary path> <directory holding the full embeddings>
    core_voca_path = sys.argv[1]
    word_embs_dir = sys.argv[2]

    print('load core voca from', core_voca_path)
    core_vocab = Vocabulary.load(core_voca_path)

    print('load full voca and embs')
    full_vocab, full_embs = utils.load_voca_embs(
        word_embs_dir + '/all_dict.word',
        word_embs_dir + '/all_word_embeddings.npy')

    print('select word ids')
    # Keep only the rows of the full embedding matrix whose word also
    # appears in the core vocabulary, preserving core-vocab order.
    selected = [
        full_vocab.word2id[word]
        for word in core_vocab.id2word
        if word in full_vocab.word2id
    ]

    print('save...')
    np.save(word_embs_dir + '/word_embeddings', full_embs[selected, :])

    # Write the matching dictionary; the fixed count "1000" mirrors the
    # original all_dict.word format.
    with open(word_embs_dir + '/dict.word', 'w', encoding='utf8') as f:
        for word_id in selected:
            f.write(full_vocab.id2word[word_id] + '\t1000\n')