Example #1
import os

import torch
from torch.nn import Embedding  # assumed to be torch.nn.Embedding in this example


def load_embedding_model(args, vocab):
    # load_word_vectors, apply_not_known_words and the vocab object are project-specific helpers
    embedding_model = Embedding(vocab.size(), args.input_dim)

    if args.cuda:
        embedding_model = embedding_model.cuda()
    # reuse a cached embedding tensor when it exists and matches the requested dimension
    emb_file = os.path.join(
        args.data,
        args.emb_dir.split("/")[-1] + "_" + args.emb_file + '_emb.pth')
    if os.path.isfile(emb_file) and torch.load(emb_file).size(1) == args.input_dim:
        emb = torch.load(emb_file)
    else:
        glove_vocab, glove_emb = load_word_vectors(os.path.join(args.emb_dir,args.emb_file))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))
        not_known = []
        for word in vocab.token_to_idx.keys():
            # copy the GloVe vector when the word is known, otherwise initialise the row randomly
            if glove_vocab.get_index(word):
                emb[vocab.get_index(word)] = glove_emb[glove_vocab.get_index(word)]
            else:
                not_known.append(word)
                emb[vocab.get_index(word)] = torch.Tensor(
                    emb[vocab.get_index(word)].size()).normal_(-0.05, 0.05)
        if args.calculate_new_words:
            emb = apply_not_known_words(emb, args, not_known, vocab)

        torch.save(emb, emb_file)

    if args.cuda:
        emb = emb.cuda()
    # plug these into embedding matrix inside model
    embedding_model.state_dict()['weight'].copy_(emb)
    return embedding_model
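
A hypothetical call site for load_embedding_model is sketched below; the args fields and paths are illustrative, and vocab is assumed to be the project's vocabulary object exposing size() and get_index() as used above.

# Illustrative usage of load_embedding_model (field names and paths are assumptions).
from argparse import Namespace

args = Namespace(data='data/sick/', emb_dir='data/glove/', emb_file='glove.840B.300d',
                 input_dim=300, cuda=False, calculate_new_words=False)
embedding_model = load_embedding_model(args, vocab)
print(embedding_model.weight.size())  # torch.Size([vocab.size(), 300])
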
Example #2
def prepare_embeddings(vectors, word2index):
    # size the embedding to the vocabulary; config is the project's global configuration
    embedding_model = Embedding(len(word2index), config.tree_config["input_dim"])

    if config.tree_config["cuda"]:
        embedding_model = embedding_model.cuda()

    torch_vectors = torch.tensor(vectors)

    if config.tree_config["cuda"]:
        torch_vectors = torch_vectors.cuda()
    embedding_model.state_dict()['weight'].copy_(torch_vectors)
    return embedding_model
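
A somewhat more idiomatic alternative to copying into state_dict()['weight'] is torch.nn.Embedding.from_pretrained; the sketch below is not taken from the original examples and only illustrates the same idea.

# Alternative sketch: build the embedding layer directly from a pretrained weight matrix.
import torch
from torch.nn import Embedding

def prepare_embeddings_from_pretrained(vectors, use_cuda=False):
    weights = torch.as_tensor(vectors, dtype=torch.float32)
    embedding_model = Embedding.from_pretrained(weights, freeze=False)
    return embedding_model.cuda() if use_cuda else embedding_model
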
Example #3
def load_embedding_model(data: str,
                         emb_dir: str,
                         emb_file: str,
                         input_dim: int,
                         vocab: Vocab,
                         cuda: bool = False):

    new_emb_file = os.path.join(
        data,
        emb_dir.split("/")[-1] + "_" + emb_file + '_emb.pth')
    if os.path.isfile(new_emb_file) and torch.load(
            new_emb_file).size()[1] == input_dim:
        emb: Tensor = torch.load(new_emb_file)

    else:
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(emb_dir, emb_file))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))
        not_known = []
        for word in vocab.token_to_idx.keys():
            if glove_vocab.get_index(word):
                emb[vocab.get_index(word)] = glove_emb[glove_vocab.get_index(
                    word)]
            else:
                not_known.append(word)
                emb[vocab.get_index(word)] = torch.Tensor(
                    emb[vocab.get_index(word)].size()).normal_(-0.05, 0.05)

        # cache the freshly built tensor under the derived path (not the raw GloVe file)
        # so the check above finds it on the next run
        torch.save(emb, new_emb_file)

    embedding_model = Embedding(emb.shape[0], input_dim)
    if cuda:
        emb = emb.cuda()
        embedding_model = embedding_model.cuda()

    # plug these into embedding matrix inside model
    embedding_model.state_dict()['weight'].copy_(emb)
    return embedding_model
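
The keyword-argument loader above could be invoked as in the sketch below; the paths are illustrative and vocab is a project-specific Vocab instance built elsewhere.

# Illustrative call of load_embedding_model (paths and dimensions are assumptions).
embedding_model = load_embedding_model(data='data/sick/',
                                       emb_dir='data/glove/',
                                       emb_file='glove.840B.300d',
                                       input_dim=300,
                                       vocab=vocab,
                                       cuda=False)
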
Example #4
import os
import pickle
from collections import Counter

import numpy as np
import torch
from torch.nn import Embedding
from torchtext.vocab import Vocab  # assumption: the legacy torchtext Vocab used below


def main(vocab_dir, embed_path):
    with open(embed_path, 'rb') as fread:
        # word2vec-style binary file: a text header "<vocab_size> <embed_size>", then
        # "<word> " followed by embed_size float32 values for each word
        vocab_size, embed_size = map(int, fread.readline().strip().split())
        embed = np.zeros((vocab_size + 2, embed_size))  # <unk>, <pad> added
        embed_stoi = {}

        binary_len = np.dtype('float32').itemsize * embed_size
        for i in range(vocab_size):
            word = []
            while True:
                ch = fread.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8')
                    break
                if ch != b'\n':
                    word.append(ch)

            embed_stoi[word] = i + 2
            # np.frombuffer replaces the deprecated binary mode of np.fromstring
            embed[i + 2] = np.frombuffer(fread.read(binary_len),
                                         dtype='float32')

    vocab = Vocab(Counter(list(embed_stoi)),
                  specials=['<unk>', '<pad>'],
                  specials_first=True)

    embed_torch = Embedding(vocab_size + 2, embed_size)
    # copy the pretrained rows into the layer; in-place writes to a leaf parameter
    # must happen under no_grad, and the float64 numpy rows need a cast to float32
    with torch.no_grad():
        for idx, key in enumerate(vocab.itos):
            if embed_stoi.get(key, 0) != 0:
                embed_torch.weight[idx] = torch.from_numpy(embed[embed_stoi[key]]).float()
            else:
                embed_torch.weight[idx] = torch.zeros(embed_size)

    with open(os.path.join(vocab_dir, 'words'), 'wb') as fwrite:
        pickle.dump(vocab, fwrite)

    torch.save(embed_torch.state_dict(), os.path.join(vocab_dir, 'embed.pt'))
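
The artifacts written by main() can be read back later; the sketch below is a minimal reload, assuming the same directory layout and reconstructing the Embedding from the shape stored in the saved state_dict.

# Hypothetical reload of the vocab pickle and embedding checkpoint produced by main().
import os
import pickle

import torch
from torch.nn import Embedding

def load_saved_embeddings(vocab_dir):
    with open(os.path.join(vocab_dir, 'words'), 'rb') as fread:
        vocab = pickle.load(fread)
    state = torch.load(os.path.join(vocab_dir, 'embed.pt'))
    num_embeddings, embed_size = state['weight'].shape
    embed_torch = Embedding(num_embeddings, embed_size)
    embed_torch.load_state_dict(state)
    return vocab, embed_torch
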
Example #5
        loss.backward()
        optim.step()
        model_optim.step()
        emb_optim.step()

        if i % 100 == 0:
            stats = open('stats.txt', 'a')
            eval_performance(stats, epoch, i)
            stats.close()
        if i % 1000 == 0:
            print('Saving')
            with open('lstm_lstm.pickle', 'wb') as f:
                pickle.dump(lstm, f)
            with open('lstm_lstm.model', 'wb') as f:
                pickle.dump(model, f)
            # a regular torch checkpoint despite the .txt extension
            torch.save(emb.state_dict(), 'emb_post_lstm.txt')
            #with open('emb_post_lstm.pickle', 'wb') as f:
            #     pickle.dump(emb, f)
        i += 1

with open('lstm_lstm.pickle', 'wb') as f:
    pickle.dump(lstm, f)

stats.close()
#with open('lstm_sequential.pickle', 'wb') as f:
#    pickle.dump(model, f)

#    for a in acts:
#        spin, _, timestamp, words = a
#        loss = None
#        lbl = [pred1, pred2] if spin == 1 else [pred2, pred1]
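
The checkpoints written inside this loop could be restored along the lines sketched below; this is only an assumption, since the classes behind lstm, model and emb are not shown in the excerpt and emb must be rebuilt with the same shape before load_state_dict is called.

# Hypothetical restore of the checkpoints saved above.
import pickle
import torch

with open('lstm_lstm.pickle', 'rb') as f:
    lstm = pickle.load(f)
with open('lstm_lstm.model', 'rb') as f:
    model = pickle.load(f)
emb.load_state_dict(torch.load('emb_post_lstm.txt'))  # emb constructed elsewhere with matching shape
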