def load_word_vectors(word_vectors_name, embedding_size, word_vectors_cache='../data/word_vectors_cache'):
    implemented_vector_embeddings = ('GloVe_6B', 'GloVe_42B', 'GloVe_840B', 'GloVe_twitter.27B', 'FastText_en')
    assert word_vectors_name in implemented_vector_embeddings
    word_vectors = None
    if word_vectors_name == 'GloVe_6B':
        assert embedding_size in (50, 100, 200, 300)
        word_vectors = GloVe(name='6B', dim=embedding_size, cache=word_vectors_cache)
    elif word_vectors_name == 'GloVe_42B':
        embedding_size = 300
        word_vectors = GloVe(name='42B', cache=word_vectors_cache)
    elif word_vectors_name == 'GloVe_840B':
        embedding_size = 300
        word_vectors = GloVe(name='840B', cache=word_vectors_cache)
    elif word_vectors_name == 'GloVe_twitter.27B':
        assert embedding_size in (25, 50, 100, 200)
        word_vectors = GloVe(name='twitter.27B', dim=embedding_size, cache=word_vectors_cache)
    elif word_vectors_name == 'FastText_en':
        embedding_size = 300
        word_vectors = FastText(language='en', cache=word_vectors_cache)
    return word_vectors, embedding_size

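# Illustrative usage sketch (not part of the original snippet). It assumes the
# imports the function above relies on; GloVe and FastText come from
# torchnlp.word_to_vector.
from torchnlp.word_to_vector import GloVe, FastText

word_vectors, dim = load_word_vectors('GloVe_6B', 100)
print(word_vectors['hello'].shape)  # torch.Size([100])
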
def load_embedding(self):
    word2id = self.opt.word2id
    logging.info('Load embedding from pytorch-nlp.')
    if self.opt.embedding_cache:
        embedding_dict = GloVe(cache=self.opt.embedding_cache)  # load embedding cache from a specific place
    else:
        embedding_dict = GloVe()  # load embedding cache from local dir or download now
    logging.info('Load embedding finished.')
    self.embedding_layer.weight.data.uniform_(-0.25, 0.25)
    for word, idx in word2id.items():
        if word in embedding_dict.stoi:
            self.embedding_layer.weight.data[idx] = embedding_dict[word]
    logging.info('Word embedding size: {0}'.format(self.embedding_layer.weight.data.size()))

def get_input_embeddings(tokens_seq, tokens_vocab):
    vocab_size = len(tokens_vocab)
    print("Get pretrained embeddings")
    embeddings_vectors = {}
    for name_, size_ in [("6B", 100), ("840B", 300)]:
        glove_vectors = GloVe(
            name=name_,
            dim=size_,
            cache="/home/[email protected]/resources/embeddings/glove")
        vocab_vectors = np.zeros((vocab_size, size_))
        # If a token is OOV (not covered by the pre-trained vectors),
        # fall back to a random vector.
        for token, id_ in tokens_vocab.vocab.items():
            if not (vocab_vectors[id_, :] == 0).all():
                continue
            vector = glove_vectors[token]
            if not (vector == 0).all():
                vocab_vectors[id_, :] = vector
            else:
                vocab_vectors[id_, :] = [
                    random.uniform(-1.5, 1.7) for _ in range(size_)
                ]
        vocab_vectors = torch.tensor(vocab_vectors)
        embeddings_vectors[(name_, size_)] = vocab_vectors
    print("Finished")
    return embeddings_vectors

def form_dataset(create_new, path_full, path_processed, need_hist, use_glove: bool, data_range=0):
    # Preprocess data if needed, else open the already-processed file
    if create_new:
        all_quotes, vocabulary, \
            word_count, total_word_count, \
            end_token, quote_count = dp().preprocess(path_data_full=path_full,
                                                     path_data_processed=path_processed,
                                                     data_range=data_range)
    else:
        all_quotes, vocabulary, \
            word_count, total_word_count, \
            end_token, quote_count = dp().open_preprocessed(path_data_processed=path_processed)
    # If histograms are needed, draw them
    if need_hist:
        util.draw_histograms(all_quotes, vocabulary, word_count)
    # If using GloVe, prepare the pre-trained embeddings
    if use_glove:
        glove = GloVe('6B')
        # Collect one embedding per vocabulary word
        embeddings = []
        # Append the first embedding as a pad embedding for intuition later on
        # embeddings.append(torch.zeros_like(glove['word']))
        for word in word_count.keys():
            embeddings.append(glove[word])
    else:
        embeddings = None
    all_quotes = util.words_to_label(all_quotes, vocabulary)
    x_data = all_quotes[:int(len(all_quotes) * 0.8)]
    y_data = all_quotes[int(len(all_quotes) * 0.8):]
    dataset_train = QuoteDataset(x_data, end_token)
    dataset_test = QuoteDataset(y_data, end_token)
    return dataset_train, dataset_test, vocabulary, word_count, total_word_count, end_token, quote_count, embeddings

def __init__(
    self,
    vocab=None,
    name='840B',
    dim=300,
    trainable=False,
):
    super(GloveEmbedding, self).__init__()
    self.vocab_size = len(vocab)
    self.vocab = vocab
    self.name = name
    self.dim = dim
    vectors = GloVe(name=self.name, dim=self.dim)
    self.weights = torch.zeros(self.vocab_size, vectors.dim)
    for idx in range(self.vocab_size):
        self.weights[idx, :] = vectors[self.vocab[idx]]
    self.embedding = nn.Embedding(self.vocab_size, self.dim)
    self.embedding.weight.data = torch.Tensor(self.weights)
    if not trainable:
        self.embedding.weight.requires_grad = False

def load_pretrained_embeddings(self, dim):
    vectors = GloVe(name="6B", dim=dim)
    embeddings = torch.stack([
        vectors[self.id_to_word[ind]]
        for ind in tqdm(range(len(self.id_to_word)))
    ])
    embeddings = embeddings.type(torch.float64)
    return embeddings

def word_vec(list_idx):
    list_word_vec = []
    vec = GloVe(name='840B', dim=300)
    for w in list_idx:
        list_word_vec.append(vec[w])
    sentence_word_vec = torch.stack(list_word_vec)
    # print(sentence_word_vec.shape)
    return sentence_word_vec

def convert_to_vector_representation(data):
    glv = GloVe()
    vectorized_data = []
    for document, y in data:
        vector = []
        for word in document:
            word_embed = glv[word]
            vector.append(word_embed)
        vectorized_data.append((vector, y))
    return vectorized_data

def load_embedding(opt, word2id):
    if opt.word_embedding:
        logging.info('Load embedding from file.')
        raise NotImplementedError
    else:
        logging.info('Load embedding from pytorch-nlp.')
        if opt.embedding_cache:
            embedding_dict = GloVe(cache=opt.embedding_cache)  # load embedding cache from a specific place
        else:
            embedding_dict = GloVe()  # load embedding cache from local dir or download now
        logging.info('Load embedding finished.')
    pad_id = word2id['<pad>']
    n_v = len(word2id)
    n_d = opt.word_dim
    embedding_layer = nn.Embedding(n_v, n_d, padding_idx=pad_id)
    embedding_layer.weight.data.uniform_(-0.25, 0.25)
    for word, idx in word2id.items():
        if word in embedding_dict.stoi:
            embedding_layer.weight.data[idx] = embedding_dict[word]
    logging.info('Word embedding size: {0}'.format(embedding_layer.weight.data.size()))
    return embedding_layer

def get_w2v_input(self):
    from torchnlp.word_to_vector import GloVe
    vectors = GloVe()
    indexeds = []
    for idx in self.batch_index:
        text = self.raw_input[idx]
        pad_num = self.max_len - len(text)
        indexeds.append(
            torch.cat((vectors[text], torch.zeros([pad_num, 300])), 0))
    return torch.stack(indexeds)

def make_embedding(vocab, d):
    '''Create and save a (|V| x dim) embedding matrix under directory d.'''
    vectors = GloVe()
    dim = vectors['hi'].shape[0]
    embeddings = []
    for word, _ in vocab.items():
        if word == "PAD":
            vec = torch.zeros(dim).float()
        else:
            vec = vectors[word]
        embeddings.append(vec)
    embd = torch.stack(embeddings)
    torch.save(embd, '%s/embd.pt' % d)

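# Illustrative usage sketch (not part of the original snippet). It assumes
# `torch` is imported as in the function above and that `vocab` is an ordered
# mapping of word -> count that contains a "PAD" entry.
vocab = {"PAD": 0, "the": 120, "cat": 7}
make_embedding(vocab, ".")        # writes ./embd.pt
embd = torch.load("./embd.pt")    # (|V| x dim) matrix, row order follows vocab
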
def test_glove_6b_50(mock_urlretrieve):
    directory = 'tests/_test_data/glove/'

    # Make sure URL has a 200 status
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Attempt to parse a subset of GloVe
    vectors = GloVe(name="6B", dim="50", cache=directory)
    assert len(vectors['the']) == 50

    # Test with unknown characters
    assert len(vectors['漢字']) == 50

    # Clean up
    os.remove(directory + 'glove.6B.50d.txt.pt')

def load_w2v_models(self):
    original_path = os.getcwd()
    # os.chdir(self.cf.folder_of_data + 'all_data/')  # this is where the data models are stored
    os.chdir(self.cf.folder_of_data)  # this is where the data models are stored
    print('loading google w2v')
    IMDB_dataset.google_model = gensim.models.KeyedVectors.load_word2vec_format(
        self.cf.folder_of_data + '/models/GoogleNews-vectors-negative300.bin', binary=True)
    print('loading fasttext w2v')
    IMDB_dataset.fasttext_model = FastText(language="simple")
    print('loading glove w2v')
    IMDB_dataset.glove_model = GloVe(name='6B', dim=300)
    print('building/loading custom w2v')
    IMDB_dataset.custom_model = build_w2v_model(self.cf, self.data)
    os.chdir(original_path)  # restore the original path

def get_lstm_parsed_sentences_and_embeddings(data):
    processed_data = ProcessDataset(data, max_vocab_size=len(data))
    words_counter = processed_data.build_counter()
    vocab, index_to_vocab = processed_data.build_vocab(
        words_counter=words_counter, max_vocab_size=len(data))
    sentences = np.array([
        pad_features(processed_data[i][0]) for i in range(len(data))
    ])
    pretrained_embedding = GloVe(name='6B', dim=300,
                                 is_include=lambda w: w in vocab.keys())
    embedding_weights = torch.Tensor(len(vocab.keys()), pretrained_embedding.dim)
    for num, word in index_to_vocab.items():
        embedding_weights[num] = pretrained_embedding[word]
    return sentences, embedding_weights

def __init__(self, n_vocab, n_embed, n_hidden, n_layer, vocab=None):
    super(KnowledgeEncoder, self).__init__()
    self.n_vocab = n_vocab
    self.n_embed = n_embed
    self.n_hidden = n_hidden
    self.n_layer = n_layer
    if vocab is None:
        self.embedding = nn.Embedding(n_vocab, n_embed)
    else:
        embedding = torch.Tensor(n_vocab, n_embed)
        vectors = GloVe()
        for word in vocab.stoi:
            if word in vectors.stoi:
                embedding[vocab.stoi[word]] = vectors[word]
        self.embedding = nn.Embedding.from_pretrained(embedding)
        print("Kencoder embedding is initialized with Glove")
    self.gru = nn.GRU(input_size=n_embed, hidden_size=n_hidden,
                      num_layers=n_layer, bidirectional=True)

def get_comment_embed(
        string,
        glove_embedding=None,
        corpus_vocab_prob_file='pandas_objects/corpus_vocab_prob.pkl'):
    """
    Parameters
    ----------
    string : str
        Comment associated (or not) with an image.
    glove_embedding : GloVe, optional
        Pre-loaded GloVe vectors; loaded on demand when None.
    corpus_vocab_prob_file : str, optional
        File location of the corpus vocabulary probability pickle file.
        The default is 'pandas_objects/corpus_vocab_prob.pkl'.

    Returns
    -------
    comment_embedding : torch.Tensor of shape [100]
    """
    string_list = preprocess_comments(string, input_type='string').split(" ")
    string_list = list(filter(lambda x: x != "", string_list))
    if glove_embedding is None:
        glove_embedding = GloVe(name="6B", dim=100,
                                is_include=lambda w: w in set(string_list))
    corpus_vocab_prob = pd.read_pickle(corpus_vocab_prob_file)
    comment_embedding = torch.zeros([100])  # Summary vector
    for word in string_list:
        word_embedding = glove_embedding[word]
        try:
            word_prob = corpus_vocab_prob[word]
            comment_embedding = comment_embedding + (1e-3 / (1e-3 + word_prob) * word_embedding)
        except KeyError:
            print('Word not in Flickr Corpus. WORD: ', word)
    return comment_embedding

def main(vocab_file):
    GloVe_vectors = GloVe(name='6B', dim=cfg.EMBEDDING_SIZE)
    # Rows must match the GloVe dimensionality
    embeddings = torch.Tensor(cfg.VOCAB_SIZE + 3, cfg.EMBEDDING_SIZE)
    word2idx, idx2word = {}, {}
    word2idx[cfg.SENTENCE_START] = cfg.VOCAB_SIZE
    word2idx[cfg.SENTENCE_END] = cfg.VOCAB_SIZE + 1
    word2idx[cfg.UNKNOWN] = cfg.VOCAB_SIZE + 2
    idx2word[cfg.VOCAB_SIZE] = cfg.SENTENCE_START
    idx2word[cfg.VOCAB_SIZE + 1] = cfg.SENTENCE_END
    idx2word[cfg.VOCAB_SIZE + 2] = cfg.UNKNOWN
    with open(vocab_file, 'r') as reader:
        for i, line in enumerate(reader):
            token, count = line.split(' ')
            embeddings[i] = GloVe_vectors[token]
            word2idx[token] = i
            idx2word[i] = token
    # Hand-crafted embeddings for the start, end and unknown tokens
    embeddings[-3] = torch.cat((torch.zeros(cfg.EMBEDDING_SIZE // 2),
                                torch.ones(cfg.EMBEDDING_SIZE // 2)), 0)
    embeddings[-2] = torch.cat((torch.ones(cfg.EMBEDDING_SIZE // 2),
                                torch.zeros(cfg.EMBEDDING_SIZE // 2)), 0)
    embeddings[-1] = torch.zeros(cfg.EMBEDDING_SIZE)
    torch.save(embeddings, 'GloVe_embeddings.pt')
    word2idx = json.dumps(word2idx)
    idx2word = json.dumps(idx2word)
    with open("word2idx.json", 'w') as json_writer:
        json_writer.write(word2idx)
    with open("idx2word.json", 'w') as json_writer:
        json_writer.write(idx2word)

def __init__(self, vocab=None, name="840B", dim=300, trainable=False):
    # Note: the 840B GloVe release ships only 300-d vectors, so dim must be 300
    # whenever name is "840B".
    super(GloveEmbedding, self).__init__()
    self.vocab_size = len(vocab)
    self.vocab = vocab
    self.name = name
    self.dim = dim
    # pdb.set_trace()
    vectors = GloVe(name=self.name, dim=self.dim)
    self.weights = torch.zeros(self.vocab_size, vectors.dim)
    for i, idx in enumerate(list(self.vocab.idx2word.keys())):
        self.weights[i, :] = vectors[self.vocab[idx]]
    self.embedding = nn.Embedding(self.vocab_size, self.dim)
    self.embedding.weight.data = torch.Tensor(self.weights)
    if not trainable:
        self.embedding.weight.requires_grad = False

def get_word_vec_model(cf, reviews):
    print('--- Loading', cf.word_corpus_4_text_understanding, " pre-trained Word2Vec model")
    word_vec_model = None
    if cf.word_corpus_4_text_understanding == 'Google_news':
        word_vec_model = gensim.models.KeyedVectors.load_word2vec_format(
            cf.folder_of_data + '/models/GoogleNews-vectors-negative300.bin', binary=True)
    elif cf.word_corpus_4_text_understanding == 'Brown':
        word_vec_model = gensim.models.Word2Vec(brown.sents())  # using the Brown corpus
    elif cf.word_corpus_4_text_understanding == 'Fasttext':
        word_vec_model = FastText(language="simple")
    elif cf.word_corpus_4_text_understanding == 'Glove':
        word_vec_model = GloVe(name='6B', dim=300)
    elif cf.word_corpus_4_text_understanding == 'CharNGram':
        word_vec_model = CharNGram()
    elif cf.word_corpus_4_text_understanding == 'Custom':
        word_vec_model = build_w2v_model(cf, reviews)
    else:
        print('Error: Please select a word vector model')
    print('--- Loading', cf.word_corpus_4_text_understanding, ' done')
    return word_vec_model  # , custom_model

def get_torch_glove(torch_glove_type):
    """A helper function that uses the torchnlp built-in GloVe getter.

    :param torch_glove_type: a string, name of the GloVe embedding
    :return: bool, whether the get operation worked.
    """
    logging.info(
        f'Downloading GloVe vectors from TorchNLP for {torch_glove_type}')
    # Set the path for the download (cached by torchnlp)
    period = "."
    underscore = "_"
    if period in torch_glove_type:
        torch_glove_path = torch_glove_type.replace(period, underscore)
    else:
        torch_glove_path = torch_glove_type
    torch_glove_folder = os.sep.join(
        [EMBEDDING_FOLDER, f'torch_glove_{torch_glove_path}'])
    # Run the torchnlp method for the GloVe download
    directories = []
    if os.path.exists(torch_glove_folder):
        directories = os.listdir(torch_glove_folder)
    if len(directories) == 0:
        GloVe(name=torch_glove_type, cache=torch_glove_folder)
        directories = os.listdir(torch_glove_folder)
    if len(directories) > 0:
        directories = [
            x for x in directories
            if not x.endswith('.pt') and not x.endswith('.zip')
        ]
        write_pickle(directories, torch_glove_folder)
    if directories:
        return True
    else:
        return False

def __init__(self, n_vocab, n_embed, n_hidden, n_layer, vocab=None):
    super(Decoder, self).__init__()
    self.n_vocab = n_vocab
    self.n_embed = n_embed
    self.n_hidden = n_hidden
    self.n_layer = n_layer
    if vocab is None:
        self.embedding = nn.Embedding(n_vocab, n_embed)
    else:
        embedding = torch.Tensor(n_vocab, n_embed)
        vectors = GloVe()
        for word in vocab.stoi:
            if word in vectors.stoi:
                embedding[vocab.stoi[word]] = vectors[word]
        self.embedding = nn.Embedding.from_pretrained(embedding)
        print("decoder embedding is initialized with Glove")
    self.attention = Attention(n_hidden)
    self.y_weight = nn.Linear(n_hidden, n_hidden)
    self.k_weight = nn.Linear(n_hidden, n_hidden)
    self.z_weight = nn.Linear(2 * n_hidden, n_hidden)
    self.y_gru = nn.GRU(n_embed + n_hidden, n_hidden, n_layer)
    self.k_gru = nn.GRU(3 * n_hidden, n_hidden, n_layer)
    self.out = nn.Linear(2 * n_hidden, n_vocab)

def __init__(self, vocab, sentences, model="elmo", attention="vanilla", k=0):
    super().__init__()
    self.name = f"model={model}_attention={attention}_k={k}"
    self.k = k
    self.bert = model == "bert"
    if self.bert:
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.bert_model.train()
        hidden_size = 768
        double = hidden_size
        self.dropout_on_input_to_linear_layer = nn.Dropout(0.1)
    else:
        hidden_size = 128
        self.cached = dict()
        self.init_elmo(vocab, sentences)
        self.glove = GloVe()
        self.lstm1 = nn.LSTM(input_size=1324, hidden_size=hidden_size,
                             batch_first=True, bidirectional=True)
        double = hidden_size * 2
        self.dropout_on_input_to_LSTM = nn.Dropout(0.5)
        self.dropout_on_input_to_linear_layer = nn.Dropout(0.3)
    # Initialise parameters for the classification & attention layers
    self.output_projection = nn.Linear(double * (2 if k > -1 else 1), 2)
    if attention == "general":
        self.attention = GeneralAttention(double)
    else:
        self.attention = HierarchicalAttention(double, self.bert)

amazon_test_csv = pd.read_csv(test_csv, header=None, names=['sentiment', 'title', 'review'])
test_data = get_data(amazon_test_csv)
test_texts = retrieve_texts(test_data)
_, test_indices, _ = docs2idxs(test_texts, max_len=training_seq_len, encoder=enc)
test_indices_and_sentiment = [(idxs, d[0]) for (idxs, d) in zip(test_indices, test_data)]

training_titles = set(amazon_training_csv['title'])
training_reviews = set(amazon_training_csv['review'])
for idx, item in amazon_test_csv.iterrows():
    if item['review'] in training_reviews:
        if item['title'] in training_titles:
            raise AssertionError("Row w/ title {} redundant.".format(item['title']))

vecs = GloVe(cache=config.get('PREPARATION', 'word_vector_cache'))
embedding_weights = weights(enc, vecs)
embedding_model = Sequential()
embedding_model.add(Embedding(enc.vocab_size, vecs.dim, weights=[embedding_weights],
                              input_length=training_seq_len, trainable=False))
embedding_model.compile('rmsprop', 'mse')

input_shape = (training_seq_len, vecs.dim, 1)
x_train_unshaped = [embedding_model.predict(np.array(sample[0]).reshape(1, -1))
                    for sample in training_indices_and_sentiment]  # shape n * (1 * seq_len * vector_dim)
x_test_unshaped = [embedding_model.predict(np.array(sample[0]).reshape(1, -1))
                   for sample in test_indices_and_sentiment]

import torch
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.word_to_vector import GloVe

encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"])
vocab = set(encoder.vocab)
pretrained_embedding = GloVe(name='6B', dim=300, is_include=lambda w: w in vocab)
embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim)
for i, token in enumerate(encoder.vocab):
    embedding_weights[i] = pretrained_embedding[token]
print("")

#!/usr/bin/env python
# encoding: utf-8
"""
@author: HuRuiFeng
@file: lesson85-time-series-expression.py
@time: 2020/7/24 14:13
@project: deeplearning-with-pytorch-notes
@desc: Lesson 85: time-series representation
"""
import torch
from torch import nn
import torchnlp
from torchnlp import word_to_vector
from torchnlp.word_to_vector import GloVe

print("-----word2vec vs GloVe-----")
print("====word2vec=====")
word_to_ix = {"hello": 0, "world": 1}
lookup_tensor = torch.tensor([word_to_ix['hello']], dtype=torch.long)
embeds = nn.Embedding(2, 5)
hello_embed = embeds(lookup_tensor)
print(hello_embed)
print()

print("====GloVe=====")
vectors = GloVe()
print(vectors['hello'])

def __init__(self):
    self.stop_words = stopwords.words("english")
    self.glove_vectors = GloVe(name='6B')
    self.lemma = WordNetLemmatizer()
    self.token_index = {}

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

from torchnlp.word_to_vector import GloVe
import numpy as np

glove_embedding = GloVe(
    cache="../../Twitter_Ideology_Prediction/data/post_processing/.word_vectors_cache"
)
# print(vars(glove_embedding).keys())
# print(dir(glove_embedding))
# print(type(glove_embedding.stoi))
vocabulary = list(glove_embedding.stoi.keys())
dictionary = dict(zip(list(range(len(vocabulary))), vocabulary))
word2idx = glove_embedding.stoi
# reversed_dict
'''
print(len(vocabulary))
print(word2idx["trump"])
print(word2idx["Trump"])
print(word2idx["obama"])
print(word2idx["Obama"])
print(word2idx["democratic"])
print(word2idx["Democratic"])
print(word2idx["republican"])
print(word2idx["Republican"])
'''
# n = 1000

            comment_embedding = comment_embedding + (1e-3 / (1e-3 + word_prob) * word_embedding)
        except KeyError:
            print('Word not in Flickr Corpus. WORD: ', word)
    return comment_embedding


if __name__ == "__main__":
    for string in strings:
        similarities = []
        string_list = preprocess_comments(string, input_type='string').split(" ")
        string_list = list(filter(lambda x: x != "", string_list))
        glove_embedding = GloVe(name="6B", dim=100,
                                is_include=lambda w: w in set(string_list))
        for img_name in img_names:
            similarities.append(
                [img_name, main(string, img_name, glove_embedding)])
        similarities.sort(key=lambda x: x[1])
        similarities.reverse()
        print('From most similar to least similar...')
        for i in range(1, len(similarities) + 1):
            print('IMG: ', similarities[i - 1], 'IMG RANK:', i)

def _build_dataloader(self):
    self.val_loader = self.corpus = None
    if self.dataset_kind == "mnist":
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
        self.dataset = MNISTBufferedDataset(self.data_dir, download=True,
                                            train=True, transform=transform)
        self.val_dataset = MNISTBufferedDataset(self.data_dir, download=True,
                                                transform=transform)
        self.train_sampler = MNISTSequenceSampler(
            self.dataset,
            sequences=self.sequences,
            batch_size=self.batch_size,
            random_mnist_images=not self.static_digit,
            randomize_sequence_cursors=self.randomize_sequence_cursors,
            noise_buffer=self.noise_buffer,
            use_mnist_pct=self.use_mnist_pct,
            max_batches=self.batches_in_epoch,
        )
        if self.static_digit:
            # For the static-digit paradigm, the val & train samplers must
            # match to ensure the same digit prototype is used for each sequence item.
            self.val_sampler = self.train_sampler
        else:
            self.val_sampler = MNISTSequenceSampler(
                self.val_dataset,
                sequences=self.sequences,
                batch_size=self.batch_size,
                random_mnist_images=not self.static_digit,
                randomize_sequence_cursors=self.randomize_sequence_cursors,
                noise_buffer=self.noise_buffer,
                use_mnist_pct=self.use_mnist_pct,
                max_batches=self.eval_batches_in_epoch,
            )
        self.train_loader = DataLoader(
            self.dataset,
            batch_sampler=self.train_sampler,
            collate_fn=pred_sequence_collate,
        )
        self.val_loader = DataLoader(
            self.val_dataset,
            batch_sampler=self.val_sampler,
            collate_fn=pred_sequence_collate,
        )
    elif self.dataset_kind == "ptb":
        # Download the "Penn Treebank" dataset
        from torchnlp.datasets import penn_treebank_dataset
        print("Maybe download PTB...")
        penn_treebank_dataset(self.data_dir + "/PTB", train=True, test=True)
        corpus = lang_util.Corpus(self.data_dir + "/PTB")
        train_sampler = PTBSequenceSampler(
            corpus.train,
            batch_size=self.batch_size,
            max_batches=self.batches_in_epoch,
        )
        if self.embedding_kind == "rsm_bitwise":
            embedding = lang_util.BitwiseWordEmbedding().embedding_dict
        elif self.embedding_kind in ["bpe", "glove"]:
            from torchnlp.word_to_vector import BPEmb, GloVe
            cache_dir = self.data_dir + "/torchnlp/.word_vectors_cache"
            if self.embedding_kind == "bpe":
                vectors = BPEmb(dim=self.embed_dim, cache=cache_dir)
            else:
                vectors = GloVe(name="6B", dim=self.embed_dim, cache=cache_dir)
            embedding = {}
            for word_id, word in enumerate(corpus.dictionary.idx2word):
                embedding[word_id] = vectors[word]
        elif "ptb_fasttext" in self.embedding_kind:
            import fasttext
            # Generated via notebooks/ptb_embeddings.ipynb
            embedding = {}
            ft_model = fasttext.load_model(self.data_dir + "/embeddings/%s.bin"
                                           % self.embedding_kind)
            for word_id, word in enumerate(corpus.dictionary.idx2word):
                embedding[word_id] = torch.tensor(ft_model[word])
        if self.embedding_kind:
            print("Loaded embedding dict (%s) with %d entries"
                  % (self.embedding_kind, len(embedding)))
        collate_fn = partial(ptb_pred_sequence_collate, vector_dict=embedding)
        self.train_loader = DataLoader(corpus.train,
                                       batch_sampler=train_sampler,
                                       collate_fn=collate_fn)
        val_sampler = PTBSequenceSampler(
            corpus.test,
            batch_size=self.eval_batch_size,
            max_batches=self.eval_batches_in_epoch,
            uniform_offsets=True,
        )
        self.val_loader = DataLoader(corpus.test,
                                     batch_sampler=val_sampler,
                                     collate_fn=collate_fn)
        self.corpus = corpus
    print("Built dataloaders...")

def glove_embedding(size):
    glove = GloVe('6B', size, cache=CACHE)
    stoi = {tok: i for i, tok in enumerate(glove.itos)}
    rows, cols = glove.vectors.shape
    embedding = nn.Embedding(rows, cols, _weight=glove.vectors)
    return embedding, stoi, glove.itos
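
# Illustrative usage sketch (not part of the original snippet). It assumes
# `torch`, `torch.nn as nn`, and a CACHE directory are defined as the function
# above expects.
embedding, stoi, itos = glove_embedding(100)
idx = torch.tensor([stoi['hello']])
vec = embedding(idx)  # shape: (1, 100)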