import gc

import numpy as np
import torchwordemb


def buildEmbMatrixFromGoogle(vocabulary, emb_size):
    print('importing embeddings')
    vocab, vec = torchwordemb.load_word2vec_bin(
        "./GoogleNews-vectors-negative300.bin")
    print('imported embeddings')
    emb_mat = np.zeros((len(vocabulary), emb_size))
    for i, word in enumerate(vocabulary.keys()):
        if i % 1000 == 0:
            print("Reading word ", i, "/", len(vocabulary))
        if word in vocab:
            # Copy the pretrained vector for words the model knows.
            emb_mat[vocabulary[word]] = vec[vocab[word]].numpy()
        else:
            # Initialize out-of-vocabulary words randomly.
            emb_mat[vocabulary[word]] = np.random.normal(0, 1, emb_size)
    print('train matrices built')
    del vec
    del vocab
    gc.collect()
    print('garbage collected')
    return emb_mat
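A minimal usage sketch for the function above, assuming the GoogleNews .bin file sits in the working directory; toy_vocab and emb_layer are hypothetical names introduced for illustration.

import torch
import torch.nn as nn

# Hypothetical word-to-index vocabulary, for illustration only.
toy_vocab = {'pasta': 0, 'tomato': 1, 'basil': 2}
emb_mat = buildEmbMatrixFromGoogle(toy_vocab, 300)

# Wrap the matrix in a frozen embedding layer.
emb_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(emb_mat).float(), freeze=True)
print(emb_layer(torch.tensor([0, 1, 2])).shape)  # torch.Size([3, 300])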
def test_word2vec_bin(self):
    word, vec = torchwordemb.load_word2vec_bin("resource/word2vec.test.bin")
    self.assertEqual(len(word), 113)
    self.assertEqual(vec.size(0), 113)
    self.assertEqual(vec.size(1), 100)
def create_emb_matrix(self, vocabulary):
    print('importing embeddings')
    vocab, vec = torchwordemb.load_word2vec_bin(
        "./GoogleNews-vectors-negative300.bin")
    print('imported embeddings')
    emb_mat = np.zeros((self.ntoken, self.emsize))
    for word in vocabulary.keys():
        if word in vocab:
            emb_mat[vocabulary[word]] = vec[vocab[word]].numpy()
        else:
            emb_mat[vocabulary[word]] = np.random.normal(0, 1, self.emsize)
    # hypothetically, the one for <unk>
    # emb_mat[-1] = np.random.normal(0, 1, self.emsize)
    print('train matrices built')
    del vec
    del vocab
    gc.collect()
    print('garbage collected')
    return emb_mat
def __init__(self, img_path, transform=None, target_transform=None,
             loader=default_loader, square=False, data_path=None,
             partition=None, sem_reg=None, ingrW2V=None):
    ingr_id, _ = torchwordemb.load_word2vec_bin(ingrW2V)
    self.ingr_id = ingr_id

    if data_path is None:
        raise Exception('No data path specified.')
    if partition is None:
        raise Exception('Unknown partition type %s.' % partition)
    else:
        self.partition = partition

    with open(os.path.join(data_path, partition + '_images.p'), 'rb') as f:
        self.ids = pickle.load(f)

    self.square = square
    self.imgPath = img_path
    self.mismtch = 0.8
    self.maxInst = 20

    with open(os.path.join(data_path, 'ingredients_dict.p'), 'rb') as f:
        self.ingr_dic = pickle.load(f)
    with open(os.path.join(data_path, 'recipe_class.p'), 'rb') as f:
        self.recipe_class = pickle.load(f)

    if sem_reg is not None:
        self.semantic_reg = sem_reg
    else:
        self.semantic_reg = False

    self.transform = transform
    self.target_transform = target_transform
    self.loader = loader
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=300, hidden_size=300,
                        bidirectional=True, batch_first=True)
    _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), 300, padding_idx=0)
    self.embs.weight.data.copy_(vec)
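A hedged sketch of how these two layers are typically wired together in a forward pass; the padded index batch is illustrative, and instantiating ingRNN assumes the module-level ingrW2V path is set.

import torch

model = ingRNN()  # assumes the module-level ingrW2V path is valid
ids = torch.zeros(4, 20, dtype=torch.long)  # hypothetical padded batch
emb = model.embs(ids)          # (4, 20, 300)
out, (h, c) = model.irnn(emb)  # (4, 20, 600): bidirectional, hidden 300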
def __init__(self):
    super(segmentRNN, self).__init__()
    self.segnn = nn.LSTM(input_size=opts.segmentW2VDim,
                         hidden_size=opts.srnnDim,
                         bidirectional=True, batch_first=True)
    _, vec = torchwordemb.load_word2vec_bin(opts.segmentW2V)
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), opts.segmentW2VDim, padding_idx=0)
    self.embs.weight.data.copy_(vec)
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=ingrW2VDim, hidden_size=irnnDim,
                        bidirectional=True, batch_first=True,
                        num_layers=2, dropout=0.2)
    _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
    # self.embs = nn.Embedding(vec.size(0), ingrW2VDim, padding_idx=0)
    # not sure about the padding idx
    self.embs = nn.Embedding.from_pretrained(vec, freeze=True)
def loadPretrained(self, pretrained_path):
    count = 0
    vocab, vec = torchwordemb.load_word2vec_bin(pretrained_path)
    for i, word in enumerate(self.index2word):
        if word in vocab:
            count += 1
            # copy_ must be called, not assigned to.
            self.embedding.weight.data[i, :].copy_(vec[vocab[word]])
        else:
            # copy_ expects a tensor, so convert the numpy array first.
            self.embedding.weight.data[i, :].copy_(
                torch.from_numpy(np.random.uniform(-0.25, 0.25, 300)))
    print("Loaded " + str(count) + " pretrained vectors")
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=opts.ingrW2VDim, hidden_size=opts.irnnDim,
                        bidirectional=True, batch_first=True)
    # Jan: changed ingrW2VDim to ingrW2V
    _, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
    print(opts.ingrW2V)
    # num_embeddings is the vocabulary size, embedding_dim the vector size.
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), opts.ingrW2VDim, padding_idx=0)
    self.embs.weight.data.copy_(vec)
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=opts.ingrW2VDim, hidden_size=opts.irnnDim,
                        bidirectional=True, batch_first=True)
    _, vec = torchwordemb.load_word2vec_bin(
        os.path.join(opts.data_path, opts.tag, 'vocab.bin'))
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), opts.ingrW2VDim, padding_idx=0)
    self.embs.weight.data.copy_(vec)
import pickle

import torchwordemb
from torchtext import data


def load_dataset(train_path, dev_path, max_text_length, embedding_dim,
                 tokenizer=tokenizer, dev_ratio=0.1,
                 pretrained_word_embedding_name="glove.6B.300d",
                 pretrained_word_embedding_path=None,
                 saved_text_vocab_path="text_vocab.pkl",
                 saved_label_vocab_path="label_vocab.pkl"):
    text_field = data.Field(lower=True, tokenize=tokenizer,
                            fix_length=max_text_length)
    label_field = data.Field(sequential=False)

    print('loading data')
    train_data = data.TabularDataset(path=train_path, format='csv',
                                     skip_header=True,
                                     fields=[("text", text_field),
                                             ('label', label_field)])
    dev_data = data.TabularDataset(path=dev_path, format='csv',
                                   skip_header=True,
                                   fields=[("text", text_field),
                                           ('label', label_field)])

    print('building vocab')
    text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)

    vectors = None
    if pretrained_word_embedding_name == "word2vec":
        vocab, vec = torchwordemb.load_word2vec_bin(
            pretrained_word_embedding_path)
        text_field.vocab.set_vectors(vocab, vec, embedding_dim)
        vectors = text_field.vocab.vectors
    elif "glove" in pretrained_word_embedding_name:
        text_field.vocab.load_vectors(pretrained_word_embedding_name)
        vectors = text_field.vocab.vectors

    pickle.dump(text_field, open(saved_text_vocab_path, 'wb'))
    pickle.dump(label_field, open(saved_label_vocab_path, 'wb'))

    vocab_size = len(text_field.vocab)
    print("vocab size ", vocab_size)
    # Subtract 1 for the <unk> entry so labels are indexed from zero.
    label_size = len(label_field.vocab) - 1

    return train_data, dev_data, vocab_size, label_size, \
        label_field.vocab.itos, vectors
def __init__(self):
    super(ingredient_RNN, self).__init__()
    self.irnn = nn.LSTM(input_size=ING_WORD2VEC_DIM, hidden_size=ING_RNN_DIM,
                        bidirectional=True, batch_first=True)
    # Get the vocabulary size from the pretrained vectors (size 300 each).
    _, vec = torchwordemb.load_word2vec_bin(ING_WORD2VEC_PATH)
    # Create the embedding matrix and copy in the vectors from the Google
    # word2vec model.
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), ING_WORD2VEC_DIM, padding_idx=0)
    self.embs.weight.data.copy_(vec)
def get_word_embeddings_bin(self, embedding_file,
                            save_name='debiased_embeddings.mod',
                            embedding_dim=300):
    """Get word embeddings, assuming the input is a word2vec .bin file."""
    print("Loading word embeddings from {}".format(embedding_file))
    assert os.path.exists(embedding_file)
    embeddings = torch.Tensor(len(self.dictionary), embedding_dim)
    vocab, vecs = torchwordemb.load_word2vec_bin(embedding_file)
    ct = 0
    for word in self.dictionary.word2idx:
        if word in vocab:
            v = vecs[vocab[word]]
            embeddings[self.dictionary.word2idx[word]].copy_(v)
            ct += 1
    print('Copied {}/{} words'.format(ct, len(self.dictionary.word2idx)))
    torch.save(embeddings, open(save_name, 'wb'))
    return embeddings
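A short hedged follow-up: the tensor written by get_word_embeddings_bin above can be read back with torch.load, since it was written with torch.save.

import torch

embeddings = torch.load('debiased_embeddings.mod')
print(embeddings.size())  # (len(dictionary), embedding_dim)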
import torchwordemb


def load_word_vec(model_path):
    vocab, vec = torchwordemb.load_word2vec_bin(model_path)
    return vocab, vec
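A hedged usage sketch for the wrapper above; the model path and the word "king" are placeholders, and the 300-dimensional size assumes the GoogleNews vectors.

vocab, vec = load_word_vec("./GoogleNews-vectors-negative300.bin")
if "king" in vocab:
    print(vec[vocab["king"]].size())  # torch.Size([300]) for this model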
def load_corpus(self):
    print("Loading corpus...")
    # Check that the corpus files exist.
    if not os.path.exists(q_file):
        print("Please name the question corpus file q.txt; "
              "the question corpus path should be %s" % q_file)
        sys.exit()
    if not os.path.exists(a_file):
        print("Please name the answer corpus file a.txt; "
              "the answer corpus path should be %s" % a_file)
        sys.exit()

    # word2vec
    self.word2vec()
    vocab, _ = torchwordemb.load_word2vec_bin(WORD2VEC_PATH)

    # Load and preprocess the corpus.
    self.en_lang = self.Lang()  # encoder_lang
    self.de_lang = self.Lang()  # decoder_lang
    q_lines = []
    a_lines = []
    with open(q_file, 'r') as f:
        for line in f.readlines():
            q_lines.append(line.strip('\n'))
    with open(a_file, 'r') as f:
        for line in f.readlines():
            a_lines.append(line.strip('\n'))

    self.pairs = []
    for i in range(len(q_lines)):
        self.pairs.append({0: q_lines[i], 1: a_lines[i]})

    self.word_dict = {}
    for k, v in vocab.items():
        # Shift word ids by two so that indices 0 and 1 stay free
        # for SOS and EOS.
        self.word_dict[k] = int(v) + 2

    word_index = 0
    for en_line in q_lines:
        word_list = wordseg(en_line)
        for word in word_list:
            if word in self.word_dict and word not in self.en_lang.word2index:
                word_index = self.word_dict[word]
                self.en_lang.word2index[word] = word_index
                self.en_lang.index2word[word_index] = word
                self.en_lang.n_words += 1
    for de_line in a_lines:
        word_list = wordseg(de_line)
        for word in word_list:
            if word in self.word_dict and word not in self.de_lang.word2index:
                word_index = self.word_dict[word]
                # Fill the decoder language, not the encoder one.
                self.de_lang.word2index[word] = word_index
                self.de_lang.index2word[word_index] = word
                self.de_lang.n_words += 1

    print("Corpus statistics:")
    print("Q: %d words" % self.en_lang.n_words)
    print("A: %d words" % self.de_lang.n_words)
def __init__(self, path):
    self.vocab, self.embedding_matrix = torchwordemb.load_word2vec_bin(path)
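A hedged usage sketch; W2V is a hypothetical name for the class that owns this __init__, and the path and word are placeholders.

w2v = W2V("./GoogleNews-vectors-negative300.bin")  # hypothetical class name
row = w2v.vocab.get("apple")
if row is not None:
    print(w2v.embedding_matrix[row][:5])  # first five dimensions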
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torchwordemb

from args import get_parser

parser = get_parser()
opts = parser.parse_args()

irnn = nn.LSTM(input_size=opts.ingrW2VDim, hidden_size=opts.irnnDim,
               bidirectional=True, batch_first=True)
_, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
print(vec)
import pickle

import torchwordemb

ingr_id, _ = torchwordemb.load_word2vec_bin("data/vocab.bin")

with open('data/ingredients_dict.p', 'rb') as f:
    ingr_dic = pickle.load(f)
with open('data/test_images.p', 'rb') as f:
    ids = pickle.load(f)

for index in range(5):
    print("CASE: ", index)
    recipeId = ids[index][:-4]
    print("recipeid = ", recipeId)
    for item in ingr_dic[recipeId]['ingr']:
        print(item)
        if item not in ingr_id:
            print("but it's not in dict!")