Example #1
def buildEmbMatrixFromGoogle(vocaboulary, emb_size):
    print('importing embeddings')
    vocab, vec = torchwordemb.load_word2vec_bin(
        "./GoogleNews-vectors-negative300.bin")
    print('imported embeddings')

    emb_mat = np.zeros((len(vocaboulary), emb_size))

    for i, word in enumerate(vocaboulary.keys()):
        if i % 1000 == 0:
            print("Reading word ", i, "/", len(vocaboulary))
        if word in vocab:
            emb_mat[vocaboulary[word]] = vec[vocab[word]].numpy()
        else:
            emb_mat[vocaboulary[word]] = np.random.normal(0, 1, emb_size)

    print('train matrices built')

    del vec
    del vocab
    gc.collect()

    print('garbage collected')

    return emb_mat
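A minimal usage sketch for the snippet above, assuming the GoogleNews binary is on disk; the imports, the toy vocabulary dict, and the variable names are illustrative and not part of the original example:

import gc

import numpy as np
import torch
import torch.nn as nn
import torchwordemb

# hypothetical vocabulary mapping each word to its row in the matrix
vocab_dict = {"pasta": 0, "tomato": 1, "basil": 2}

# build a (len(vocab_dict), 300) numpy matrix from the pretrained vectors
emb_mat = buildEmbMatrixFromGoogle(vocab_dict, 300)

# load the matrix into an embedding layer for a PyTorch model
emb_layer = nn.Embedding.from_pretrained(torch.from_numpy(emb_mat).float())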
Example #2
    def test_word2vec_bin(self):
        word, vec = torchwordemb.load_word2vec_bin("resource/word2vec.test.bin") 

        self.assertEqual(len(word), 113)

        self.assertEqual(vec.size(0), 113)
        self.assertEqual(vec.size(1), 100)
Example #3
    def create_emb_matrix(self, vocabulary):
        print('importing embeddings')
        vocab, vec = torchwordemb.load_word2vec_bin(
            "./GoogleNews-vectors-negative300.bin")
        print('imported embeddings')

        emb_mat = np.zeros((self.ntoken, self.emsize))

        for word in vocabulary.keys():
            if word in vocab:
                emb_mat[vocabulary[word]] = vec[vocab[word]].numpy()
            else:
                emb_mat[vocabulary[word]] = np.random.normal(0, 1, self.emsize)

        # hypothetically, the row for <unk>
        # emb_mat[-1] = np.random.normal(0, 1, self.emsize)

        print('train matrices built')

        del vec
        del vocab
        gc.collect()

        print('garbage collected')

        return emb_mat
Example #4
    def __init__(self, img_path, transform=None, target_transform=None,
                 loader=default_loader, square=False, data_path=None,
                 partition=None, sem_reg=None, ingrW2V=None):
        ingr_id, _ = torchwordemb.load_word2vec_bin(ingrW2V)
        self.ingr_id = ingr_id

        if data_path is None:
            raise Exception('No data path specified.')

        if partition is None:
            raise Exception('Unknown partition type %s.' % partition)
        else:
            self.partition=partition

        with open(os.path.join(data_path,partition+'_images.p'),'rb') as f:
            self.ids = pickle.load(f)

        self.square  = square

        self.imgPath = img_path
        self.mismtch = 0.8
        self.maxInst = 20
        with open(os.path.join(data_path,'ingredients_dict.p'),'rb') as f:
            self.ingr_dic = pickle.load(f)
        with open(os.path.join(data_path,'recipe_class.p'),'rb') as f:
            self.recipe_class = pickle.load(f)

        if sem_reg is not None:
            self.semantic_reg = sem_reg
        else:
            self.semantic_reg = False

        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader
Example #5
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=300,
                         hidden_size=300,
                         bidirectional=True,
                         batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
     self.embs = nn.Embedding(
         vec.size(0), 300, padding_idx=0)  # not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #6
 def __init__(self):
     super(segmentRNN, self).__init__()
     self.segnn = nn.LSTM(input_size=opts.segmentW2VDim,
                          hidden_size=opts.srnnDim,
                          bidirectional=True,
                          batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(opts.segmentW2V)
     self.embs = nn.Embedding(
         vec.size(0), opts.segmentW2VDim,
         padding_idx=0)  #not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #7
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=ingrW2VDim,
                         hidden_size=irnnDim,
                         bidirectional=True,
                         batch_first=True,
                         num_layers=2,
                         dropout=0.2)
     _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
     #self.embs = nn.Embedding(vec.size(0), ingrW2VDim, padding_idx=0) # not sure about the padding idx
     self.embs = nn.Embedding.from_pretrained(vec, freeze=True)
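For comparison with the commented-out line above, `nn.Embedding.from_pretrained(vec, freeze=True)` behaves roughly like building the layer by hand and disabling gradients; a minimal sketch (not from the original repo):

# rough equivalent of nn.Embedding.from_pretrained(vec, freeze=True)
embs = nn.Embedding(vec.size(0), vec.size(1))  # one row per pretrained word
embs.weight.data.copy_(vec)                    # copy the word2vec weights in
embs.weight.requires_grad = False              # freeze them during training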
Example #8
 def loadPretrained(self, pretrained_path):
     count = 0
     vocab, vec = torchwordemb.load_word2vec_bin(pretrained_path)
     for i, word in enumerate(self.index2word):
         if word in vocab:
             count += 1
             self.embedding.weight.data[i, :].copy_(vec[vocab[word]])
         else:
             # copy_ expects a tensor, so wrap the numpy array
             self.embedding.weight.data[i, :].copy_(
                 torch.from_numpy(np.random.uniform(-0.25, 0.25, 300)))
     print("Loaded " + str(count) + " pretrained vectors")
Example #9
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=opts.ingrW2VDim,
                         hidden_size=opts.irnnDim,
                         bidirectional=True,
                         batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
     print(opts.ingrW2V)
     self.embs = nn.Embedding(
         vec.size(0), opts.ingrW2VDim, padding_idx=0
     )  # num_embeddings comes first so copy_(vec) below matches; not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #10
 def __init__(self):
     super(ingRNN, self).__init__()
     self.irnn = nn.LSTM(input_size=opts.ingrW2VDim,
                         hidden_size=opts.irnnDim,
                         bidirectional=True,
                         batch_first=True)
     _, vec = torchwordemb.load_word2vec_bin(
         os.path.join(opts.data_path, opts.tag, 'vocab.bin'))
     self.embs = nn.Embedding(
         vec.size(0), opts.ingrW2VDim,
         padding_idx=0)  # not sure about the padding idx
     self.embs.weight.data.copy_(vec)
Example #11
def load_dataset(train_path,
                 dev_path,
                 max_text_length,
                 embedding_dim,
                 tokenizer=tokenizer,
                 dev_ratio=0.1,
                 pretrained_word_embedding_name="glove.6B.300d",
                 pretrained_word_embedding_path=None,
                 saved_text_vocab_path="text_vocab.pkl",
                 saved_label_vocab_path="label_vocab.pkl"):
    text_field = data.Field(lower=True,
                            tokenize=tokenizer,
                            fix_length=max_text_length)
    label_field = data.Field(sequential=False)

    print('loading data')
    train_data = data.TabularDataset(path=train_path,
                                     format='csv',
                                     skip_header=True,
                                     fields=[("text", text_field),
                                             ('label', label_field)])
    dev_data = data.TabularDataset(path=dev_path,
                                   format='csv',
                                   skip_header=True,
                                   fields=[("text", text_field),
                                           ('label', label_field)])

    print('building vocab')
    text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)

    vectors = None

    if pretrained_word_embedding_name == "word2vec":
        vocab, vec = torchwordemb.load_word2vec_bin(
            pretrained_word_embedding_path)
        text_field.vocab.set_vectors(vocab, vec, embedding_dim)
        vectors = text_field.vocab.vectors
    elif "glove" in pretrained_word_embedding_name:
        text_field.vocab.load_vectors(pretrained_word_embedding_name)
        vectors = text_field.vocab.vectors

    pickle.dump(text_field, open(saved_text_vocab_path, 'wb'))
    pickle.dump(label_field, open(saved_label_vocab_path, 'wb'))

    vocab_size = len(text_field.vocab)
    print("vocab size ", vocab_size)
    # labels are indexed from zero; subtract 1, presumably for the <unk> entry torchtext adds
    label_size = len(label_field.vocab) - 1

    return train_data, dev_data, vocab_size, label_size, label_field.vocab.itos, vectors
Example #12
    def __init__(self):
        super(ingredient_RNN, self).__init__()

        self.irnn = nn.LSTM(input_size=ING_WORD2VEC_DIM,
                            hidden_size=ING_RNN_DIM,
                            bidirectional=True,
                            batch_first=True)

        #Load the pretrained vectors; vec.size(0) gives the vocabulary size
        _, vec = torchwordemb.load_word2vec_bin(
            ING_WORD2VEC_PATH)  # gives vectors of size 300

        #Create the embedding layer and copy the vectors from the Google word2vec model into its weights
        self.embs = nn.Embedding(
            vec.size(0), ING_WORD2VEC_DIM,
            padding_idx=0)  # not sure about the padding idx

        self.embs.weight.data.copy_(vec)
Example #13
 def get_word_embeddings_bin(self,
                             embedding_file,
                             save_name='debiased_embeddings.mod',
                             embedding_dim=300):
     """ Get word embeddings, where it assumes input as a bin file
     """
     print "Loading word embeddings from {}".format(embedding_file)
     assert os.path.exists(embedding_file)
     embeddings = torch.Tensor(len(self.dictionary), embedding_dim)
     vocab, vecs = torchwordemb.load_word2vec_bin(embedding_file)
     ct = 0
     for word in self.dictionary.word2idx:
         if word in vocab:
             v = vecs[vocab[word]]
             embeddings[self.dictionary.word2idx[word]].copy_(v)
             ct += 1
     print('Copied {}/{} words'.format(ct, len(self.dictionary.word2idx)))
     torch.save(embeddings, open(save_name, 'wb'))
     return embeddings
Example #14
def load_word_vec(model_path):
    vocab, vec = torchwordemb.load_word2vec_bin(model_path)
    return vocab, vec
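For context, `load_word2vec_bin` returns a dict mapping each word to its row index plus a FloatTensor holding the vectors; a small lookup sketch (the path and the query word are placeholders):

import torchwordemb

vocab, vec = torchwordemb.load_word2vec_bin("./GoogleNews-vectors-negative300.bin")
print(vec.size())          # (vocabulary size, embedding dimension)
print(vec[vocab["king"]])  # the pretrained vector for "king", if present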
Example #15
	def load_corpus(self):
		print("正在加载语料库...")
		
		# Check that the corpus files exist
		if not os.path.exists(q_file):
			print("请将question库文件名命名为q.txt,quesion库目录应为%s" %  q_file)
			sys.exit()
		if not os.path.exists(a_file):
			print("请将answer库文件名命名为q.txt,anwser库目录应为%s" % a_file)
			sys.exit()
		
		'''word2vec'''

		self.word2vec()

		vocab, _ = torchwordemb.load_word2vec_bin(WORD2VEC_PATH)

		# Load and preprocess the corpus
		self.en_lang = self.Lang() # encoder_lang
		self.de_lang = self.Lang() # decoder_lang
		
		q_lines = []
		a_lines = []
		with open(q_file, 'r') as f:
			for line in f.readlines():
				q_lines.append(line.strip('\n'))
		with open(a_file, 'r') as f:
			for line in f.readlines():
				a_lines.append(line.strip('\n'))
		self.pairs = []
		for i in range(len(q_lines)):
			self.pairs.append({0: q_lines[i], 1: a_lines[i]})

		self.word_dict = {}

		for k,v in vocab.items():
			self.word_dict[k] = int(v)+2 # shift indices by 2 so that 0 and 1 are reserved for SOS and EOS

		word_index = 0
		for en_line in q_lines:
			word_list = wordseg(en_line)
			for word in word_list:
				if word in self.word_dict and word not in self.en_lang.word2index:
					word_index = self.word_dict[word]
					self.en_lang.word2index[word] = word_index
					self.en_lang.index2word[word_index] = word
					self.en_lang.n_words+=1


		for de_line in a_lines:
			word_list = wordseg(de_line)
			for word in word_list:
				if word in self.word_dict and word not in self.de_lang.word2index:
					word_index = self.word_dict[word]
					self.de_lang.word2index[word] = word_index
					self.de_lang.index2word[word_index] = word
					self.de_lang.n_words+=1


		print("语料库统计:")
		print("Q: %d 词" % self.en_lang.n_words)
		print("A: %d 词" % self.de_lang.n_words)
Example #16
 def __init__(self, path):
     self.vocab, self.embedding_matrix = torchwordemb.load_word2vec_bin(path)
Example #17
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torchwordemb
from args import get_parser

parser = get_parser()
opts = parser.parse_args()

irnn = nn.LSTM(input_size=opts.ingrW2VDim,
               hidden_size=opts.irnnDim,
               bidirectional=True,
               batch_first=True)
_, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
print(vec)
Example #18
import torchwordemb
import pickle

ingr_id, _ = torchwordemb.load_word2vec_bin("data/vocab.bin")

with open('data/ingredients_dict.p', 'rb') as f:
    ingr_dic = pickle.load(f)
with open('data/test_images.p', 'rb') as f:
    ids = pickle.load(f)

for index in range(5):
    print("CASE: ", index)
    recipeId = ids[index][:-4]
    print("recipeid = ", recipeId)

    for item in ingr_dic[recipeId]['ingr']:
        print(item)
        if item not in ingr_id:
            print("but it's not in dict!")