Example #1
def buildEmbMatrixFromGoogle(vocaboulary, emb_size):
    print('importing embeddings')
    vocab, vec = torchwordemb.load_word2vec_bin(
        "./GoogleNews-vectors-negative300.bin")
    print('imported embeddings')

    emb_mat = np.zeros((len(vocaboulary), emb_size))

    for i, word in enumerate(vocaboulary.keys()):
        if i % 1000 == 0:
            print("Reading word ", i, "/", len(vocaboulary))
        if word in vocab:
            emb_mat[vocaboulary[word]] = vec[vocab[word]].numpy()
        else:
            emb_mat[vocaboulary[word]] = np.random.normal(0, 1, emb_size)

    print('train matrices built')

    del vec
    del vocab
    gc.collect()

    print('garbage collected')

    return emb_mat
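A minimal usage sketch for the snippet above, assuming the GoogleNews binary is on disk; the imports, the toy vocabulary dict, and the variable names are illustrative and not part of the original example:

import gc

import numpy as np
import torch
import torch.nn as nn
import torchwordemb

# hypothetical vocabulary mapping each word to its row in the matrix
vocab_dict = {"pasta": 0, "tomato": 1, "basil": 2}

# build a (len(vocab_dict), 300) numpy matrix from the pretrained vectors
emb_mat = buildEmbMatrixFromGoogle(vocab_dict, 300)

# load the matrix into an embedding layer for a PyTorch model
emb_layer = nn.Embedding.from_pretrained(torch.from_numpy(emb_mat).float())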
Example #2
    def test_word2vec_bin(self):
        word, vec = torchwordemb.load_word2vec_bin("resource/word2vec.test.bin") 

        self.assertEqual(len(word), 113)

        self.assertEqual(vec.size(0), 113)
        self.assertEqual(vec.size(1), 100)
Example #3
    def create_emb_matrix(self, vocabulary):
        print('importing embeddings')
        vocab, vec = torchwordemb.load_word2vec_bin(
            "./GoogleNews-vectors-negative300.bin")
        print('imported embeddings')

        emb_mat = np.zeros((self.ntoken, self.emsize))

        for word in vocabulary.keys():
            if word in vocab:
                emb_mat[vocabulary[word]] = vec[vocab[word]].numpy()
            else:
                emb_mat[vocabulary[word]] = np.random.normal(0, 1, self.emsize)

        # hypothetically, the row for <unk>
        # emb_mat[-1] = np.random.normal(0, 1, self.emsize)

        print('train matrices built')

        del vec
        del vocab
        gc.collect()

        print('garbage collected')

        return emb_mat
Example #4
    def __init__(self, img_path, transform=None, target_transform=None,
                 loader=default_loader, square=False, data_path=None,
                 partition=None, sem_reg=None, ingrW2V=None):
        ingr_id, _ = torchwordemb.load_word2vec_bin(ingrW2V)
        self.ingr_id = ingr_id

        if data_path is None:
            raise Exception('No data path specified.')

        if partition is None:
            raise Exception('Unknown partition type %s.' % partition)
        else:
            self.partition=partition

        with open(os.path.join(data_path,partition+'_images.p'),'rb') as f:
            self.ids = pickle.load(f)

        self.square  = square

        self.imgPath = img_path
        self.mismtch = 0.8
        self.maxInst = 20
        with open(os.path.join(data_path,'ingredients_dict.p'),'rb') as f:
            self.ingr_dic = pickle.load(f)
        with open(os.path.join(data_path,'recipe_class.p'),'rb') as f:
            self.recipe_class = pickle.load(f)

        if sem_reg is not None:
            self.semantic_reg = sem_reg
        else:
            self.semantic_reg = False

        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader
Example #5
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=300,
                         hidden_size=300,
                         bidirectional=True,
                         batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
     self.embs = nn.Embedding(
         vec.size(0), 300, padding_idx=0)  # not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #6
 def __init__(self):
     super(segmentRNN, self).__init__()
     self.segnn = nn.LSTM(input_size=opts.segmentW2VDim,
                          hidden_size=opts.srnnDim,
                          bidirectional=True,
                          batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(opts.segmentW2V)
     self.embs = nn.Embedding(
         vec.size(0), opts.segmentW2VDim,
         padding_idx=0)  #not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #7
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=ingrW2VDim,
                         hidden_size=irnnDim,
                         bidirectional=True,
                         batch_first=True,
                         num_layers=2,
                         dropout=0.2)
     _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
     #self.embs = nn.Embedding(vec.size(0), ingrW2VDim, padding_idx=0) # not sure about the padding idx
     self.embs = nn.Embedding.from_pretrained(vec, freeze=True)
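For comparison with the commented-out line above, `nn.Embedding.from_pretrained(vec, freeze=True)` behaves roughly like building the layer by hand and disabling gradients; a minimal sketch (not from the original repo):

# rough equivalent of nn.Embedding.from_pretrained(vec, freeze=True)
embs = nn.Embedding(vec.size(0), vec.size(1))  # one row per pretrained word
embs.weight.data.copy_(vec)                    # copy the word2vec weights in
embs.weight.requires_grad = False              # freeze them during training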
Example #8
 def loadPretrained(self, pretrained_path):
     count = 0
     vocab, vec = torchwordemb.load_word2vec_bin(pretrained_path)
     for i, word in enumerate(self.index2word):
         if word in vocab:
             count += 1
             self.embedding.weight.data[i, :].copy_(vec[vocab[word]])
         else:
             # copy_ expects a tensor, so wrap the numpy array
             self.embedding.weight.data[i, :].copy_(
                 torch.from_numpy(np.random.uniform(-0.25, 0.25, 300)))
     print("Loaded " + str(count) + " pretrained vectors")
Example #9
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=opts.ingrW2VDim,
                         hidden_size=opts.irnnDim,
                         bidirectional=True,
                         batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
     print(opts.ingrW2V)
     self.embs = nn.Embedding(
         vec.size(0), opts.ingrW2VDim, padding_idx=0
     )  # num_embeddings comes first so copy_(vec) below matches; not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #10
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=opts.ingrW2VDim,
                         hidden_size=opts.irnnDim,
                         bidirectional=True,
                         batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(
         os.path.join(opts.data_path, opts.tag, 'vocab.bin'))
     self.embs = nn.Embedding(
         vec.size(0), opts.ingrW2VDim,
         padding_idx=0)  # not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #11
def load_dataset(train_path,
                 dev_path,
                 max_text_length,
                 embedding_dim,
                 tokenizer=tokenizer,
                 dev_ratio=0.1,
                 pretrained_word_embedding_name="glove.6B.300d",
                 pretrained_word_embedding_path=None,
                 saved_text_vocab_path="text_vocab.pkl",
                 saved_label_vocab_path="label_vocab.pkl"):
    text_field = data.Field(lower=True,
                            tokenize=tokenizer,
                            fix_length=max_text_length)
    label_field = data.Field(sequential=False)

    print('loading data')
    train_data = data.TabularDataset(path=train_path,
                                     format='csv',
                                     skip_header=True,
                                     fields=[("text", text_field),
                                             ('label', label_field)])
    dev_data = data.TabularDataset(path=dev_path,
                                   format='csv',
                                   skip_header=True,
                                   fields=[("text", text_field),
                                           ('label', label_field)])

    print('building vocab')
    text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)

    vectors = None

    if pretrained_word_embedding_name == "word2vec":
        vocab, vec = torchwordemb.load_word2vec_bin(
            pretrained_word_embedding_path)
        text_field.vocab.set_vectors(vocab, vec, embedding_dim)
        vectors = text_field.vocab.vectors
    elif "glove" in pretrained_word_embedding_name:
        text_field.vocab.load_vectors(pretrained_word_embedding_name)
        vectors = text_field.vocab.vectors

    pickle.dump(text_field, open(saved_text_vocab_path, 'wb'))
    pickle.dump(label_field, open(saved_label_vocab_path, 'wb'))

    vocab_size = len(text_field.vocab)
    print("vocab size ", vocab_size)
    # labels are indexed from zero; subtract 1, presumably for the <unk> entry torchtext adds
    label_size = len(label_field.vocab) - 1

    return train_data, dev_data, vocab_size, label_size, label_field.vocab.itos, vectors
Example #12
    def __init__(self):
        super(ingredient_RNN, self).__init__()

        self.irnn = nn.LSTM(input_size=ING_WORD2VEC_DIM,
                            hidden_size=ING_RNN_DIM,
                            bidirectional=True,
                            batch_first=True)

        #Load the pretrained vectors; vec.size(0) gives the vocabulary size
        _, vec = torchwordemb.load_word2vec_bin(
            ING_WORD2VEC_PATH)  # gives vectors of size 300

        #Create the embedding layer and copy the vectors from the Google word2vec model into its weights
        self.embs = nn.Embedding(
            vec.size(0), ING_WORD2VEC_DIM,
            padding_idx=0)  # not sure about the padding idx

        self.embs.weight.data.copy_(vec)
Example #13
 def get_word_embeddings_bin(self,
                             embedding_file,
                             save_name='debiased_embeddings.mod',
                             embedding_dim=300):
     """ Get word embeddings, where it assumes input as a bin file
     """
     print "Loading word embeddings from {}".format(embedding_file)
     assert os.path.exists(embedding_file)
     embeddings = torch.Tensor(len(self.dictionary), embedding_dim)
     vocab, vecs = torchwordemb.load_word2vec_bin(embedding_file)
     ct = 0
     for word in self.dictionary.word2idx:
         if word in vocab:
             v = vecs[vocab[word]]
             embeddings[self.dictionary.word2idx[word]].copy_(v)
             ct += 1
     print('Copied {}/{} words'.format(ct, len(self.dictionary.word2idx)))
     torch.save(embeddings, open(save_name, 'wb'))
     return embeddings
Example #14
def load_word_vec(model_path):
    vocab, vec = torchwordemb.load_word2vec_bin(model_path)
    return vocab, vec
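For context, `load_word2vec_bin` returns a dict mapping each word to its row index plus a FloatTensor holding the vectors; a small lookup sketch (the path and the query word are placeholders):

import torchwordemb

vocab, vec = torchwordemb.load_word2vec_bin("./GoogleNews-vectors-negative300.bin")
print(vec.size())          # (vocabulary size, embedding dimension)
print(vec[vocab["king"]])  # the pretrained vector for "king", if present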
Example #15
	def load_corpus(self):
		print("正在加载语料库...")
		
		# Check that the corpus files exist
		if not os.path.exists(q_file):
			print("请将question库文件名命名为q.txt,quesion库目录应为%s" %  q_file)
			sys.exit()
		if not os.path.exists(a_file):
			print("请将answer库文件名命名为q.txt,anwser库目录应为%s" % a_file)
			sys.exit()
		
		'''word2vec'''

		self.word2vec()

		vocab, _ = torchwordemb.load_word2vec_bin(WORD2VEC_PATH)

		# Load and preprocess the corpus
		self.en_lang = self.Lang() # encoder_lang
		self.de_lang = self.Lang() # decoder_lang
		
		q_lines = []
		a_lines = []
		with open(q_file, 'r') as f:
			for line in f.readlines():
				q_lines.append(line.strip('\n'))
		with open(a_file, 'r') as f:
			for line in f.readlines():
				a_lines.append(line.strip('\n'))
		self.pairs = []
		for i in range(len(q_lines)):
			self.pairs.append({0: q_lines[i], 1: a_lines[i]})

		self.word_dict = {}

		for k,v in vocab.items():
			self.word_dict[k] = int(v)+2 # shift indices by 2 so that 0 and 1 are reserved for SOS and EOS

		word_index = 0
		for en_line in q_lines:
			word_list = wordseg(en_line)
			for word in word_list:
				if word in self.word_dict and word not in self.en_lang.word2index:
					word_index = self.word_dict[word]
					self.en_lang.word2index[word] = word_index
					self.en_lang.index2word[word_index] = word
					self.en_lang.n_words+=1


		for de_line in a_lines:
			word_list = wordseg(de_line)
			for word in word_list:
				if word in self.word_dict and word not in self.de_lang.word2index:
					word_index = self.word_dict[word]
					self.de_lang.word2index[word] = word_index
					self.de_lang.index2word[word_index] = word
					self.de_lang.n_words+=1


		print("语料库统计:")
		print("Q: %d 词" % self.en_lang.n_words)
		print("A: %d 词" % self.de_lang.n_words)
Example #16
 def __init__(self, path):
     self.vocab, self.embedding_matrix = torchwordemb.load_word2vec_bin(path)
Example #17
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torchwordemb
from args import get_parser

parser = get_parser()
opts = parser.parse_args()

irnn = nn.LSTM(input_size=opts.ingrW2VDim,
               hidden_size=opts.irnnDim,
               bidirectional=True,
               batch_first=True)
_, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
print(vec)
Example #18
import torchwordemb
import pickle

ingr_id, _ = torchwordemb.load_word2vec_bin("data/vocab.bin")

with open('data/ingredients_dict.p', 'rb') as f:
    ingr_dic = pickle.load(f)
with open('data/test_images.p', 'rb') as f:
    ids = pickle.load(f)

for index in range(5):
    print("CASE: ", index)
    recipeId = ids[index][:-4]
    print("recipeid = ", recipeId)

    for item in ingr_dic[recipeId]['ingr']:
        print(item)
        if item not in ingr_id:
            print("but it's not in dict!")