def load_pretrained_embeddings(self, vocabulary, path):
    """Loads GloVe vectors and initializes the embedding matrix.

    Args:
        vocabulary: `Vocabulary` object.
        path: Embedding path, e.g. "glove/glove.6B.300d.txt".
    """
    embedding_map = load_cached_embeddings(path)

    # Create embedding matrix. By default, embeddings are randomly
    # initialized from Uniform(-0.1, 0.1).
    embeddings = torch.zeros(
        (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

    # Initialize pre-trained embeddings.
    num_pretrained = 0
    for (i, word) in enumerate(vocabulary.words):
        if word in embedding_map:
            embeddings[i] = torch.tensor(embedding_map[word])
            num_pretrained += 1

    # Place embedding matrix on GPU.
    self.embedding.weight.data = cuda(self.args, embeddings)

    return num_pretrained
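# The method above relies on two helpers that are not shown in this snippet.
# The sketches below are assumptions about their expected behavior (a GloVe-
# style text parser and a GPU-placement shim), not the project's actual
# implementations; `args.use_cuda` in particular is an assumed attribute name.
def load_cached_embeddings(path):
    """Parses embeddings stored one entry per line: `word v1 v2 ... vd`."""
    embedding_map = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            pieces = line.rstrip().split(' ')
            embedding_map[pieces[0]] = [float(x) for x in pieces[1:]]
    return embedding_map


def cuda(args, tensor):
    """Moves a tensor to the GPU when requested, otherwise returns it as-is."""
    return tensor.cuda() if getattr(args, 'use_cuda', False) else tensor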
def load_pretrained_embeddings(self, vocabulary, path): """ Loads GloVe vectors and initializes the embedding matrix. Args: vocabulary: `Vocabulary` object. path: Embedding path, e.g. "glove/glove.6B.300d.txt". """ if path == 'glove/biowordvec_train.vec.bin': print('Using Bio2Word Embedding') embedding_map = KeyedVectors.load_word2vec_format(path, binary=True, limit=100000) else: print('Using GloVe') embedding_map = load_cached_embeddings(path) # Create embedding matrix. By default, embeddings are randomly # initialized from Uniform(-0.1, 0.1). embeddings = torch.zeros( (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1) # Initialize pre-trained embeddings. num_pretrained = 0 for (i, word) in enumerate(vocabulary.words): if word in embedding_map: embeddings[i] = torch.tensor(embedding_map[word]) num_pretrained += 1 # Place embedding matrix on GPU. self.embedding.weight.data = cuda(self.args, embeddings) return num_pretrained
def load_input_data(self, dataset_root_folder, word_vectors_cache_file,
                    train_set_folder, dev_set_folder, test_set_folder,
                    load_ext_feats=True):
    for set_folder in [test_set_folder, dev_set_folder, train_set_folder]:
        if set_folder:
            questions, sentences, labels, maxlen_q, maxlen_s, vocab = \
                utils.read_in_dataset(dataset_root_folder, set_folder)

            self.data_splits[set_folder] = [
                questions, sentences, labels, maxlen_q, maxlen_s
            ]
            # Append a zero feature vector per question as the default
            # external features.
            default_ext_feats = [np.zeros(4)] * len(
                self.data_splits[set_folder][0])
            self.data_splits[set_folder].append(default_ext_feats)

            utils.load_cached_embeddings(
                word_vectors_cache_file, vocab, self.embeddings,
                [] if "train" in set_folder else self.unk_term)
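# Side note on the default external features above (illustration only, not part
# of the original code): `[np.zeros(4)] * n` repeats one array object n times,
# which is fine for read-only placeholders but shares state if modified.
import numpy as np

shared = [np.zeros(4)] * 3
shared[0][0] = 1.0                    # visible through shared[1] and shared[2]

independent = [np.zeros(4) for _ in range(3)]
independent[0][0] = 1.0               # only the first array changes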
def load_input_data(self, dataset_root_folder, word_vectors_cache_file,
                    train_set_folder, dev_set_folder, test_set_folder):
    for set_folder in [test_set_folder, dev_set_folder, train_set_folder]:
        if set_folder:
            # NOTE: self.datasets[set_folder] holds (questions, sentences,
            # labels, vocab, maxlen_q, maxlen_s, ext_feats).
            self.datasets[set_folder] = utils.read_in_dataset(
                dataset_root_folder, set_folder)

            self.embeddings[set_folder] = utils.load_cached_embeddings(
                word_vectors_cache_file, self.datasets[set_folder][3],
                [] if "train" in set_folder else self.unk_term)
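# Hypothetical call sketch (the folder and cache-file names are assumptions):
# after this runs, self.datasets and self.embeddings are both keyed by the
# split folder name, with the vocabulary taken from element 3 of each dataset.
#
#   loader.load_input_data('data/TrecQA', 'data/word2vec.cache',
#                          'train', 'dev', 'test')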
def load_pretrained_embeddings(self, vocabulary, path, sentences):
    """Loads GloVe vectors and initializes the embedding matrix.

    Args:
        vocabulary: `Vocabulary` object.
        path: Embedding path, e.g. "glove/glove.6B.300d.txt".
        sentences: Tokenized sentences (only used by the ELMo experiment
            that is commented out below).
    """
    self.vocabulary = vocabulary

    # Commented-out ELMo experiment (requires allennlp's Elmo and batch_to_ids):
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    # # Note the "1", since we want only 1 output representation for each token.
    # elmo = Elmo(options_file, weight_file, 1, dropout=0)
    # character_ids = batch_to_ids(sentences[0:5])
    # embeddings = elmo(character_ids)

    embedding_map = load_cached_embeddings(path)

    # Create embedding matrix. By default, embeddings are randomly
    # initialized from Uniform(-0.1, 0.1).
    embeddings = torch.zeros(
        (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

    # Initialize pre-trained embeddings.
    num_pretrained = 0
    for (i, word) in enumerate(vocabulary.words):
        if word in embedding_map:
            embeddings[i] = torch.tensor(embedding_map[word])
            num_pretrained += 1

    # Place embedding matrix on GPU.
    self.embedding.weight.data = cuda(self.args, embeddings)

    return num_pretrained
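# A minimal sketch of how the commented-out ELMo experiment above could be
# completed, assuming an older allennlp release that ships Elmo/batch_to_ids.
# This is an illustration only, not part of the original method; the example
# batch of sentences and the way the output is consumed are assumptions.
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# One output representation per token, as in the commented-out code above.
elmo = Elmo(options_file, weight_file, 1, dropout=0)

sentences = [['What', 'is', 'the', 'capital', 'of', 'France', '?'],
             ['Paris', '.']]
character_ids = batch_to_ids(sentences)
output = elmo(character_ids)

# output['elmo_representations'] is a list with one tensor of shape
# (batch_size, max_sentence_length, 1024); output['mask'] marks real tokens.
elmo_embeddings = output['elmo_representations'][0]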