def load(path, model_class, suffix=''):
    """Restore a serialized model.

    Reads `path + '.config'` (JSON), rebuilds the Vocabulary objects that
    were dumped as plain __dict__ snapshots, instantiates `model_class`
    with the resulting config, and loads the weights from
    `path + '.state_dict' + suffix` onto CPU.
    """
    with io.open(path + '.config', 'r', encoding='utf8') as f:
        config = json.load(f)

    # The vocabularies were serialized as raw attribute dicts; wrap them
    # back into Vocabulary instances in place.
    for key in ('word_voca', 'entity_voca'):
        voca = Vocabulary()
        voca.__dict__ = config[key]
        config[key] = voca

    # Optional second word vocabulary (only present in some model configs).
    if 'snd_word_voca' in config:
        snd_voca = Vocabulary()
        snd_voca.__dict__ = config['snd_word_voca']
        config['snd_word_voca'] = snd_voca

    model = model_class(config)
    state = torch.load(path + '.state_dict' + suffix,
                       map_location=torch.device('cpu'))
    model.load_state_dict(state)
    return model
# Beispiel #2
def load_voca_embs(voca_path, embs_path):
    """Load a vocabulary and its embedding matrix, keeping them aligned.

    Args:
        voca_path: path understood by ``Vocabulary.load``.
        embs_path: path to a ``.npy`` embedding matrix (one row per word).

    Returns:
        (voca, embs) with ``embs.shape[0] == voca.size()``.

    Raises:
        Exception: if the row count can not be reconciled with the
            vocabulary size.
    """
    voca = Vocabulary.load(voca_path)
    embs = np.load(embs_path)

    # check if sizes are matched
    if embs.shape[0] == voca.size() - 1:
        # Matrix is one row short: the vocabulary presumably has an <unk>
        # entry with no stored vector — synthesize it as the mean embedding.
        print(embs.shape)
        unk_emb = np.mean(embs, axis=0, keepdims=True)
        embs = np.append(embs, unk_emb, axis=0)
    elif embs.shape[0] != voca.size():
        print(embs.shape, voca.size())
        # Fixed typo in the original message ("differnt").
        raise Exception("embeddings and vocabulary have different number of items ")

    return voca, embs
# Beispiel #3
def read_ent2id(ent_dic_path):
    """Load the entity Vocabulary at `ent_dic_path` and return its
    word -> id mapping."""
    print('load ent dic from', ent_dic_path)
    dictionary = Vocabulary.load(ent_dic_path)
    return dictionary.word2id
# Beispiel #4
import sys
from nel.vocabulary import Vocabulary
import nel.utils as utils
import numpy as np

if __name__ == "__main__":
    # Usage: script.py <core_voca_path> <word_embs_dir>
    # Extracts from the full embedding table only the vectors whose words
    # appear in the core vocabulary, then writes the reduced matrix and
    # the matching dictionary file.
    core_voca_path = sys.argv[1]
    word_embs_dir = sys.argv[2]

    print('load core voca from', core_voca_path)
    core_voca = Vocabulary.load(core_voca_path)

    print('load full voca and embs')
    full_voca, full_embs = utils.load_voca_embs(
        word_embs_dir + '/all_dict.word',
        word_embs_dir + '/all_word_embeddings.npy')

    print('select word ids')
    # Keep ids of core words found in the full vocabulary (-1 marks a miss).
    lookup = full_voca.word2id
    selected = [wid
                for wid in (lookup.get(w, -1) for w in core_voca.id2word)
                if wid >= 0]

    print('save...')
    np.save(word_embs_dir + '/word_embeddings', full_embs[selected, :])

    with open(word_embs_dir + '/dict.word', 'w', encoding='utf8') as f:
        f.writelines(full_voca.id2word[i] + '\t1000\n' for i in selected)