import torchwordemb

def vector_loader(text_field_words):
    path = 'word_embedding/glove.sentiment.conj.pretrained.txt'
    # words_dict maps word -> row index into vec
    words_dict, vec = torchwordemb.load_word2vec_text(path)
    embed_size = vec.size(1)
    # look up each token; remember the positions of out-of-vocabulary tokens
    count_list2 = []
    count = 0
    dict_cat = []
    for word in text_field_words:
        if word in words_dict:
            count += 1
            dict_cat.append(vec[words_dict[word]].tolist())
        else:
            dict_cat.append([0.0] * embed_size)
            count += 1
            count_list2.append(count - 1)
    count_data = len(text_field_words) - len(count_list2)  # number of in-vocabulary tokens
    # replace the zero rows of OOV tokens with the column-wise mean of the matched vectors
    mean_vec = []
    for j in range(embed_size):
        sum_col = 0.0
        for i in range(len(dict_cat)):
            sum_col += dict_cat[i][j]
        sum_col = float(sum_col / count_data)
        sum_col = round(sum_col, 6)
        mean_vec.append(sum_col)
    for i in range(len(count_list2)):
        dict_cat[count_list2[i]] = mean_vec
    return dict_cat
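# Usage sketch (not from the original source; the token list and the tensor conversion
# below are illustrative assumptions): vector_loader returns a plain Python list of rows,
# with every OOV row replaced by the mean of the in-vocabulary vectors, so it can be
# stacked straight into a tensor.
import torch

tokens = ['good', 'movie', 'totallyunknownword']    # hypothetical input tokens
rows = vector_loader(tokens)                        # list of lists, shape (len(tokens), embed_size)
weight = torch.tensor(rows, dtype=torch.float)      # ready to copy into an nn.Embedding, for example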
def test_word2vec_text(self):
    word, vec = torchwordemb.load_word2vec_text("resource/word2vec.test.txt")
    self.assertEqual(len(word), 10)
    self.assertEqual(vec.size(0), 10)
    self.assertEqual(vec.size(1), 300)
import torchwordemb

def test_torch_load_word2vec(embedding_path):
    '''
    Check that torch can load a word2vec model this way; it turns out to work well
    and gives the same result as word2vec's built-in API.
    :param embedding_path: path to the pretrained word2vec text file
    :return: None
    '''
    vocab, embedding = torchwordemb.load_word2vec_text(embedding_path)
    print(embedding[vocab[u'中国']])
def read_embed(embed_path, LM):
    if LM == "glove":
        vocab, vec = torchwordemb.load_glove_text(embed_path)
    else:
        vocab, vec = torchwordemb.load_word2vec_text(embed_path)
    return vocab, vec
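# Usage sketch (the paths below are illustrative assumptions, not from the original
# source): read_embed dispatches on the LM flag, so GloVe-format and word2vec-format
# files are loaded through the same interface, each returning a (word -> row index)
# dict plus a FloatTensor of vectors.
glove_vocab, glove_vec = read_embed('embeddings/glove.6B.300d.txt', LM='glove')
w2v_vocab, w2v_vec = read_embed('embeddings/word_vec.txt', LM='word2vec')
print(glove_vec.size(), w2v_vec.size())  # each is (vocab_size, embedding_dim)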
def initialize_embed(self, word2vec_model, word2id):
    w2v_vocab, w2v_vectors = torchwordemb.load_word2vec_text(word2vec_model)
    for word, i in word2id.items():
        # ignore the unicode conversion/comparison warning
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            if word in w2v_vocab.keys():
                self.embedding.weight.data[i].copy_(
                    w2v_vectors[w2v_vocab[word]])
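# Sketch of the assumed surrounding context (not shown in the original snippet):
# initialize_embed expects `self.embedding` to be an nn.Embedding whose rows line up
# with `word2id`, and `word2vec_model` to be the path of a word2vec-format text file.
# It also relies on `import warnings` and `import torchwordemb` at module level.
# The class and names below are hypothetical.
import torch.nn as nn

class SentenceEncoder(nn.Module):
    def __init__(self, word2id, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(len(word2id), embedding_dim)

# word2id = {'the': 0, 'cat': 1}                               # hypothetical word-to-index map
# model = SentenceEncoder(word2id, embedding_dim=300)
# initialize_embed(model, 'embeddings/word_vec.txt', word2id)  # fills matching rows in place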
def __init__(self, corpus_path, db_path,
             embedding_path='../data/embedding_model_t2s/vector_t2s',
             N=1, seq_max_len=10):
    self.N = N
    self.embedding_path = embedding_path
    self.embedding_vocab_t2n, self.embedding_vectors = torchwordemb.load_word2vec_text(
        self.embedding_path)
    self.embedding_size = self.embedding_vectors.size()[-1]
    self.UNK_EMBEDDING = self.embedding_vectors.mean(0).squeeze()
    self.BAK_EMBEDDING = torch.zeros(self.UNK_EMBEDDING.size())
    # features are two-dimensional, implemented as a list of embeddings
    self.seq_max_len = seq_max_len
    save_path = db_path.rsplit('/', 1)[0] + '/fdict_%d.p' % N
    pure_grams_save_path = db_path.rsplit('/', 1)[0] + '/fdict_%d.p' % 2
    # print(save_path, pure_grams_save_path)
    if os.path.isfile(save_path):
        # load pre-dumped grams and N from file
        # print("dict exist")
        f = open(save_path, 'rb')
        self.grams = pkl.load(f)
        self.n = pkl.load(f)
        f.close()
    else:
        # print("build dicts")
        # if no pre-dumped file exists, generate grams from the corpus or database text file
        self.grams = {}
        self.n = 0
        if corpus_path is not None:
            self._build_vocab_from_corpus(corpus_path)
            # print("vocab size after corpus building: {}".format(self.n))
        if db_path is not None:
            self._build_vocab_from_db(db_path)
            # print("vocab size after db building: {}".format(self.n))
        with open(save_path, 'wb') as f:
            pkl.dump(self.grams, f)
            pkl.dump(self.n, f)
import random
import time
from collections import Counter

import numpy as np
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchwordemb

DATA_DIR = 'data/v1/'

print('Loading word2vec')
start_time = time.time()
vocab, w2v = torchwordemb.load_word2vec_text('/home/sakya.cxy/wkp/gitlab/item_rank_model/model/tensorflow/data/word_vec.txt')
vocab_size = w2v.size(0)
embedding_dim = w2v.size(1)

# copy the pretrained vectors into an nn.Embedding layer and freeze them
embeddings = nn.Embedding(vocab_size, embedding_dim).cuda()
embeddings.weight.data.copy_(w2v)
embeddings.weight.requires_grad = False
print('Loaded word2vec. Time cost:', time.time() - start_time)

total = 0
ban_title = 0
train_corpus = []
valid_corpus = []
for i, line in enumerate(open('./pairs.txt')):
    total += 1
    components = line.strip().split('\t')