import torchwordemb


def vector_loader(text_field_words):
    path = 'word_embedding/glove.sentiment.conj.pretrained.txt'
    words_dict, vec = torchwordemb.load_word2vec_text(path)
    embed_size = vec.size(1)

    # look up each word; remember the positions of out-of-vocabulary words
    oov_positions = []
    dict_cat = []
    for idx, word in enumerate(text_field_words):
        if word in words_dict:
            dict_cat.append(vec[words_dict[word]].tolist())
        else:
            dict_cat.append([0.0] * embed_size)
            oov_positions.append(idx)
    count_data = len(text_field_words) - len(oov_positions)

    # replace the zero placeholders with the column-wise mean of the known vectors
    mean_vec = []
    for j in range(embed_size):
        sum_col = 0.0
        for i in range(len(dict_cat)):
            sum_col += dict_cat[i][j]
        mean_vec.append(round(sum_col / count_data, 6))

    for pos in oov_positions:
        dict_cat[pos] = mean_vec

    return dict_cat
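
The list returned by vector_loader lines up row-for-row with text_field_words, so it can be turned directly into a frozen embedding layer. The following is a minimal usage sketch, assuming PyTorch (0.4.1 or later, for nn.Embedding.from_pretrained) and that the GloVe file referenced above is present; the word list is only a placeholder.

import torch
import torch.nn as nn

# placeholder vocabulary; in practice this would be the ordered word list of the text field
text_field_words = ['<pad>', 'good', 'movie']

pretrained = torch.tensor(vector_loader(text_field_words), dtype=torch.float)
embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)

# row i of the layer is now the vector for text_field_words[i]
indices = torch.tensor([1, 2])
print(embedding(indices).size())  # (2, embed_size)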
Example 2
    def test_word2vec_text(self):
        word, vec = torchwordemb.load_word2vec_text("resource/word2vec.test.txt")

        self.assertEqual(len(word), 10)

        self.assertEqual(vec.size(0), 10)
        self.assertEqual(vec.size(1), 300)
Example 3
def test_torch_load_word2vec(embedding_path):
    '''
    Test whether torch works well for loading a word2vec model. It turns out it works
    well, and in fact gives the same result as word2vec's built-in API.
    :param embedding_path: path to the pretrained word2vec text file
    :return: None
    '''
    vocab, embedding = torchwordemb.load_word2vec_text(embedding_path)
    print(embedding[vocab[u'中国']])
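
As a rough cross-check of the equivalence claimed in the docstring, the row returned by torchwordemb can be compared against the vector gensim reads from the same file. A minimal sketch, assuming gensim is installed and the file is in plain word2vec text format; the file name is a placeholder.

import numpy as np
import torchwordemb
from gensim.models import KeyedVectors

embedding_path = 'word_vec.txt'  # placeholder path

# load the same text-format word2vec file with both libraries
vocab, embedding = torchwordemb.load_word2vec_text(embedding_path)
kv = KeyedVectors.load_word2vec_format(embedding_path, binary=False)

word = u'中国'
# both loaders should yield the same row, up to floating-point precision
print(np.allclose(embedding[vocab[word]].numpy(), kv[word], atol=1e-6))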
Example 4
def read_embed(embed_path, LM):

    if LM == "glove":
        vocab, vec = torchwordemb.load_glove_text(embed_path)
    else:
        vocab, vec = torchwordemb.load_word2vec_text(embed_path)

    return vocab, vec
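
A short usage sketch for read_embed, with placeholder file names; whichever branch runs, the returned tensor can be wrapped in an nn.Embedding layer with its rows kept frozen.

import torch.nn as nn

# placeholder paths; substitute real embedding files
glove_vocab, glove_vec = read_embed('glove.6B.100d.txt', 'glove')
w2v_vocab, w2v_vec = read_embed('word_vec.txt', 'word2vec')

# wrap the pretrained GloVe matrix in a frozen embedding layer
embedding = nn.Embedding.from_pretrained(glove_vec, freeze=True)
print(embedding.num_embeddings, embedding.embedding_dim)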
Example 5
    def initialize_embed(self, word2vec_model, word2id):
        w2v_vocab, w2v_vectors = torchwordemb.load_word2vec_text(
            word2vec_model)
        for word, i in word2id.items():
            # ignore the unicode conversion/comparison warning
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                if word in w2v_vocab:
                    self.embedding.weight.data[i].copy_(
                        w2v_vectors[w2v_vocab[word]])
Example 6
    def __init__(self,
                 corpus_path,
                 db_path,
                 embedding_path='../data/embedding_model_t2s/vector_t2s',
                 N=1,
                 seq_max_len=10):
        self.N = N
        self.embedding_path = embedding_path
        self.embedding_vocab_t2n, self.embedding_vectors = torchwordemb.load_word2vec_text(
            self.embedding_path)
        self.embedding_size = self.embedding_vectors.size()[-1]
        self.UNK_EMBEDDING = self.embedding_vectors.mean(0).squeeze()
        self.BAK_EMBEDDING = torch.zeros(self.UNK_EMBEDDING.size())

        # turn the features into 2-D features, implemented as a list of embeddings
        self.seq_max_len = seq_max_len

        save_path = db_path.rsplit('/', 1)[0] + '/fdict_%d.p' % N
        pure_grams_save_path = db_path.rsplit('/', 1)[0] + '/fdict_%d.p' % 2
        # print(save_path, pure_grams_save_path)
        if os.path.isfile(save_path):
            # load pre-dumped grams and N from file
            # print("dict exist")
            f = open(save_path, 'rb')
            self.grams = pkl.load(f)
            self.n = pkl.load(f)
            f.close()
        else:
            # print("build dicts")
            # if there doesn't exist any pre-dumped file, generate grams from corpus or database text file.
            self.grams = {}
            self.n = 0
            if corpus_path is not None:
                self._build_vocab_from_corpus(corpus_path)
            # print("vocab size after corpus building: {}".format(self.n))
            if db_path is not None: self._build_vocab_from_db(db_path)
            # print("vocab size after db building:{}".format(self.n))
            with open(save_path, 'wb') as f:
                pkl.dump(self.grams, f)
                pkl.dump(self.n, f)
Example 7
import random
import time
from collections import Counter

import numpy as np
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchwordemb

DATA_DIR = 'data/v1/'

print('Loading word2vec')
vocab, w2v = torchwordemb.load_word2vec_text('/home/sakya.cxy/wkp/gitlab/item_rank_model/model/tensorflow/data/word_vec.txt')
vocab_size = w2v.size(0)
embedding_dim = w2v.size(1)
embeddings = nn.Embedding(vocab_size, embedding_dim).cuda()
embeddings.weight.data.copy_(w2v)
embeddings.weight.requires_grad = False
print('Loaded word2vec. Time cost:', time.perf_counter())

total = 0
ban_title = 0

train_corpus = []
valid_corpus = []
for i, line in enumerate(open('./pairs.txt')):
    total += 1
    components = line.strip().split('\t')