import gc

import numpy as np
import torchwordemb


def buildEmbMatrixFromGoogle(vocabulary, emb_size):
    print('importing embeddings')
    vocab, vec = torchwordemb.load_word2vec_bin(
        "./GoogleNews-vectors-negative300.bin")
    print('imported embeddings')
    emb_mat = np.zeros((len(vocabulary), emb_size))
    for i, word in enumerate(vocabulary.keys()):
        if i % 1000 == 0:
            print("Reading word ", i, "/", len(vocabulary))
        if word in vocab:
            # Copy the pretrained vector for words the model knows.
            emb_mat[vocabulary[word]] = vec[vocab[word]].numpy()
        else:
            # Initialize out-of-vocabulary words randomly.
            emb_mat[vocabulary[word]] = np.random.normal(0, 1, emb_size)
    print('train matrices built')
    del vec
    del vocab
    gc.collect()
    print('garbage collected')
    return emb_mat
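A minimal usage sketch for the function above, assuming the GoogleNews .bin file sits in the working directory; toy_vocab and emb_layer are hypothetical names introduced for illustration.

import torch
import torch.nn as nn

# Hypothetical word-to-index vocabulary, for illustration only.
toy_vocab = {'pasta': 0, 'tomato': 1, 'basil': 2}
emb_mat = buildEmbMatrixFromGoogle(toy_vocab, 300)

# Wrap the matrix in a frozen embedding layer.
emb_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(emb_mat).float(), freeze=True)
print(emb_layer(torch.tensor([0, 1, 2])).shape)  # torch.Size([3, 300])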
def test_word2vec_bin(self):
    word, vec = torchwordemb.load_word2vec_bin("resource/word2vec.test.bin")
    self.assertEqual(len(word), 113)
    self.assertEqual(vec.size(0), 113)
    self.assertEqual(vec.size(1), 100)
def create_emb_matrix(self, vocabulary):
    print('importing embeddings')
    vocab, vec = torchwordemb.load_word2vec_bin(
        "./GoogleNews-vectors-negative300.bin")
    print('imported embeddings')
    emb_mat = np.zeros((self.ntoken, self.emsize))
    for word in vocabulary.keys():
        if word in vocab:
            emb_mat[vocabulary[word]] = vec[vocab[word]].numpy()
        else:
            emb_mat[vocabulary[word]] = np.random.normal(0, 1, self.emsize)
    # hypothetically, the one for <unk>
    # emb_mat[-1] = np.random.normal(0, 1, self.emsize)
    print('train matrices built')
    del vec
    del vocab
    gc.collect()
    print('garbage collected')
    return emb_mat
def __init__(self, img_path, transform=None, target_transform=None,
             loader=default_loader, square=False, data_path=None,
             partition=None, sem_reg=None, ingrW2V=None):
    ingr_id, _ = torchwordemb.load_word2vec_bin(ingrW2V)
    self.ingr_id = ingr_id

    if data_path is None:
        raise Exception('No data path specified.')
    if partition is None:
        raise Exception('Unknown partition type %s.' % partition)
    else:
        self.partition = partition

    with open(os.path.join(data_path, partition + '_images.p'), 'rb') as f:
        self.ids = pickle.load(f)

    self.square = square
    self.imgPath = img_path
    self.mismtch = 0.8
    self.maxInst = 20

    with open(os.path.join(data_path, 'ingredients_dict.p'), 'rb') as f:
        self.ingr_dic = pickle.load(f)
    with open(os.path.join(data_path, 'recipe_class.p'), 'rb') as f:
        self.recipe_class = pickle.load(f)

    if sem_reg is not None:
        self.semantic_reg = sem_reg
    else:
        self.semantic_reg = False

    self.transform = transform
    self.target_transform = target_transform
    self.loader = loader
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=300, hidden_size=300,
                        bidirectional=True, batch_first=True)
    _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), 300, padding_idx=0)
    self.embs.weight.data.copy_(vec)
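A hedged sketch of how these two layers are typically wired together in a forward pass; the padded index batch is illustrative, and instantiating ingRNN assumes the module-level ingrW2V path is set.

import torch

model = ingRNN()  # assumes the module-level ingrW2V path is valid
ids = torch.zeros(4, 20, dtype=torch.long)  # hypothetical padded batch
emb = model.embs(ids)          # (4, 20, 300)
out, (h, c) = model.irnn(emb)  # (4, 20, 600): bidirectional, hidden 300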
def __init__(self):
    super(segmentRNN, self).__init__()
    self.segnn = nn.LSTM(input_size=opts.segmentW2VDim,
                         hidden_size=opts.srnnDim,
                         bidirectional=True, batch_first=True)
    _, vec = torchwordemb.load_word2vec_bin(opts.segmentW2V)
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), opts.segmentW2VDim, padding_idx=0)
    self.embs.weight.data.copy_(vec)
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=ingrW2VDim, hidden_size=irnnDim,
                        bidirectional=True, batch_first=True,
                        num_layers=2, dropout=0.2)
    _, vec = torchwordemb.load_word2vec_bin(ingrW2V)
    # self.embs = nn.Embedding(vec.size(0), ingrW2VDim, padding_idx=0)
    # not sure about the padding idx
    self.embs = nn.Embedding.from_pretrained(vec, freeze=True)
def loadPretrained(self, pretrained_path):
    count = 0
    vocab, vec = torchwordemb.load_word2vec_bin(pretrained_path)
    for i, word in enumerate(self.index2word):
        if word in vocab:
            count += 1
            # copy_ must be called, not assigned to.
            self.embedding.weight.data[i, :].copy_(vec[vocab[word]])
        else:
            # copy_ expects a tensor, so convert the numpy array first.
            self.embedding.weight.data[i, :].copy_(
                torch.from_numpy(np.random.uniform(-0.25, 0.25, 300)))
    print("Loaded " + str(count) + " pretrained vectors")
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=opts.ingrW2VDim, hidden_size=opts.irnnDim,
                        bidirectional=True, batch_first=True)
    # Jan: changed ingrW2VDim to ingrW2V
    _, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
    print(opts.ingrW2V)
    # num_embeddings is the vocabulary size, embedding_dim the vector size.
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), opts.ingrW2VDim, padding_idx=0)
    self.embs.weight.data.copy_(vec)
def __init__(self):
    super(ingRNN, self).__init__()
    self.irnn = nn.LSTM(input_size=opts.ingrW2VDim, hidden_size=opts.irnnDim,
                        bidirectional=True, batch_first=True)
    _, vec = torchwordemb.load_word2vec_bin(
        os.path.join(opts.data_path, opts.tag, 'vocab.bin'))
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), opts.ingrW2VDim, padding_idx=0)
    self.embs.weight.data.copy_(vec)
import pickle

import torchwordemb
from torchtext import data


def load_dataset(train_path, dev_path, max_text_length, embedding_dim,
                 tokenizer=tokenizer, dev_ratio=0.1,
                 pretrained_word_embedding_name="glove.6B.300d",
                 pretrained_word_embedding_path=None,
                 saved_text_vocab_path="text_vocab.pkl",
                 saved_label_vocab_path="label_vocab.pkl"):
    text_field = data.Field(lower=True, tokenize=tokenizer,
                            fix_length=max_text_length)
    label_field = data.Field(sequential=False)

    print('loading data')
    train_data = data.TabularDataset(path=train_path, format='csv',
                                     skip_header=True,
                                     fields=[("text", text_field),
                                             ('label', label_field)])
    dev_data = data.TabularDataset(path=dev_path, format='csv',
                                   skip_header=True,
                                   fields=[("text", text_field),
                                           ('label', label_field)])

    print('building vocab')
    text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)

    vectors = None
    if pretrained_word_embedding_name == "word2vec":
        vocab, vec = torchwordemb.load_word2vec_bin(
            pretrained_word_embedding_path)
        text_field.vocab.set_vectors(vocab, vec, embedding_dim)
        vectors = text_field.vocab.vectors
    elif "glove" in pretrained_word_embedding_name:
        text_field.vocab.load_vectors(pretrained_word_embedding_name)
        vectors = text_field.vocab.vectors

    pickle.dump(text_field, open(saved_text_vocab_path, 'wb'))
    pickle.dump(label_field, open(saved_label_vocab_path, 'wb'))

    vocab_size = len(text_field.vocab)
    print("vocab size ", vocab_size)
    # Subtract 1 for the <unk> entry so labels are indexed from zero.
    label_size = len(label_field.vocab) - 1

    return train_data, dev_data, vocab_size, label_size, \
        label_field.vocab.itos, vectors
def __init__(self):
    super(ingredient_RNN, self).__init__()
    self.irnn = nn.LSTM(input_size=ING_WORD2VEC_DIM, hidden_size=ING_RNN_DIM,
                        bidirectional=True, batch_first=True)
    # Get the vocabulary size from the pretrained vectors (size 300 each).
    _, vec = torchwordemb.load_word2vec_bin(ING_WORD2VEC_PATH)
    # Create the embedding matrix and copy in the vectors from the Google
    # word2vec model.
    # not sure about the padding idx
    self.embs = nn.Embedding(vec.size(0), ING_WORD2VEC_DIM, padding_idx=0)
    self.embs.weight.data.copy_(vec)
def get_word_embeddings_bin(self, embedding_file,
                            save_name='debiased_embeddings.mod',
                            embedding_dim=300):
    """Get word embeddings, assuming the input is a word2vec .bin file."""
    print("Loading word embeddings from {}".format(embedding_file))
    assert os.path.exists(embedding_file)
    embeddings = torch.Tensor(len(self.dictionary), embedding_dim)
    vocab, vecs = torchwordemb.load_word2vec_bin(embedding_file)
    ct = 0
    for word in self.dictionary.word2idx:
        if word in vocab:
            v = vecs[vocab[word]]
            embeddings[self.dictionary.word2idx[word]].copy_(v)
            ct += 1
    print('Copied {}/{} words'.format(ct, len(self.dictionary.word2idx)))
    torch.save(embeddings, open(save_name, 'wb'))
    return embeddings
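A short hedged follow-up: the tensor written by get_word_embeddings_bin above can be read back with torch.load, since it was written with torch.save.

import torch

embeddings = torch.load('debiased_embeddings.mod')
print(embeddings.size())  # (len(dictionary), embedding_dim)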
import torchwordemb


def load_word_vec(model_path):
    vocab, vec = torchwordemb.load_word2vec_bin(model_path)
    return vocab, vec
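A hedged usage sketch for the wrapper above; the model path and the word "king" are placeholders, and the 300-dimensional size assumes the GoogleNews vectors.

vocab, vec = load_word_vec("./GoogleNews-vectors-negative300.bin")
if "king" in vocab:
    print(vec[vocab["king"]].size())  # torch.Size([300]) for this model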
def load_corpus(self):
    print("Loading corpus...")
    # Check that the corpus files exist.
    if not os.path.exists(q_file):
        print("Please name the question corpus file q.txt; "
              "the question corpus path should be %s" % q_file)
        sys.exit()
    if not os.path.exists(a_file):
        print("Please name the answer corpus file a.txt; "
              "the answer corpus path should be %s" % a_file)
        sys.exit()

    # word2vec
    self.word2vec()
    vocab, _ = torchwordemb.load_word2vec_bin(WORD2VEC_PATH)

    # Load and preprocess the corpus.
    self.en_lang = self.Lang()  # encoder_lang
    self.de_lang = self.Lang()  # decoder_lang
    q_lines = []
    a_lines = []
    with open(q_file, 'r') as f:
        for line in f.readlines():
            q_lines.append(line.strip('\n'))
    with open(a_file, 'r') as f:
        for line in f.readlines():
            a_lines.append(line.strip('\n'))

    self.pairs = []
    for i in range(len(q_lines)):
        self.pairs.append({0: q_lines[i], 1: a_lines[i]})

    self.word_dict = {}
    for k, v in vocab.items():
        # Shift word ids by two so that indices 0 and 1 stay free
        # for SOS and EOS.
        self.word_dict[k] = int(v) + 2

    word_index = 0
    for en_line in q_lines:
        word_list = wordseg(en_line)
        for word in word_list:
            if word in self.word_dict and word not in self.en_lang.word2index:
                word_index = self.word_dict[word]
                self.en_lang.word2index[word] = word_index
                self.en_lang.index2word[word_index] = word
                self.en_lang.n_words += 1
    for de_line in a_lines:
        word_list = wordseg(de_line)
        for word in word_list:
            if word in self.word_dict and word not in self.de_lang.word2index:
                word_index = self.word_dict[word]
                # Fill the decoder language, not the encoder one.
                self.de_lang.word2index[word] = word_index
                self.de_lang.index2word[word_index] = word
                self.de_lang.n_words += 1

    print("Corpus statistics:")
    print("Q: %d words" % self.en_lang.n_words)
    print("A: %d words" % self.de_lang.n_words)
def __init__(self, path):
    self.vocab, self.embedding_matrix = torchwordemb.load_word2vec_bin(path)
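A hedged usage sketch; W2V is a hypothetical name for the class that owns this __init__, and the path and word are placeholders.

w2v = W2V("./GoogleNews-vectors-negative300.bin")  # hypothetical class name
row = w2v.vocab.get("apple")
if row is not None:
    print(w2v.embedding_matrix[row][:5])  # first five dimensions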
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torchwordemb

from args import get_parser

parser = get_parser()
opts = parser.parse_args()

irnn = nn.LSTM(input_size=opts.ingrW2VDim, hidden_size=opts.irnnDim,
               bidirectional=True, batch_first=True)
_, vec = torchwordemb.load_word2vec_bin(opts.ingrW2V)
print(vec)
import pickle

import torchwordemb

ingr_id, _ = torchwordemb.load_word2vec_bin("data/vocab.bin")

with open('data/ingredients_dict.p', 'rb') as f:
    ingr_dic = pickle.load(f)
with open('data/test_images.p', 'rb') as f:
    ids = pickle.load(f)

for index in range(5):
    print("CASE: ", index)
    recipeId = ids[index][:-4]
    print("recipeid = ", recipeId)
    for item in ingr_dic[recipeId]['ingr']:
        print(item)
        if item not in ingr_id:
            print("but it's not in dict!")