def load_glove_embeddings(sentences):
    """
    Convert every token of every sentence to its GloVe embedding and
    zero-pad the sentences to a common length.

    :param sentences: sized collection of tokenised sentences; each sentence
        is expected to be a list of string tokens (``get_vecs_by_tokens``
        treats any non-list argument as a single token).
    :return: tensor of shape ``(len(sentences), max_len, dim)`` where
        ``max_len`` is the length of the longest sentence and ``dim`` is the
        dimensionality of the loaded vectors (300 for ``glove.6B.300d``).
        Positions past the end of a sentence are filled with zeros. An empty
        batch yields an empty ``(0, 0, dim)`` tensor.
    """
    # Load the glove vectors saved locally.
    glove_vectors = Vectors('glove.6B.300d.txt', './pretrained_weights/')
    # Take the width from the loaded vectors instead of hard-coding 300, so
    # padding and the final reshape always agree with the embeddings.
    embedding_dim = glove_vectors.dim

    batch_size = len(sentences)
    if batch_size == 0:
        # max() over an empty batch would raise; return an empty batch.
        return torch.zeros([0, 0, embedding_dim]).float()

    max_len = max(len(sentence) for sentence in sentences)

    # Convert the input sentences to embeddings.
    final_sentences = []
    for sentence in sentences:
        if len(sentence) == 0:
            # get_vecs_by_tokens stacks one vector per token and fails on an
            # empty token list, so an empty sentence becomes pure padding.
            embedded = torch.zeros([0, embedding_dim]).float()
        else:
            embedded = glove_vectors.get_vecs_by_tokens(sentence)

        # Add padding for words; measure the embedded tensor itself so the
        # padding length always matches what is concatenated.
        missing = max_len - len(embedded)
        if missing > 0:
            padding = torch.zeros([missing, embedding_dim],
                                  dtype=embedded.dtype)
            embedded = torch.cat([embedded, padding], dim=0)

        final_sentences.append(embedded)

    return torch.stack(final_sentences).view(
        batch_size, max_len, embedding_dim)
def tfidf_fasttext_pretrained_vectorize(conf: dict, preprocessed_text: pd.Series, name: str):
    """
    Embed each text as the mean of its pretrained fastText word vectors.

    Each text is split on whitespace, every token is looked up in the
    vectors loaded from ``data/06_models/crawl-300d-2M.vec``, and the token
    vectors are averaged with equal weight (no TF-IDF weighting is applied
    in this function, despite its name).

    Args:
        conf: Not used by this function; kept for interface compatibility.
        preprocessed_text: Series of whitespace-tokenisable strings.
        name: Not used by this function; kept for interface compatibility.

    Returns:
        Array of shape ``(len(preprocessed_text), dim)`` with one row per
        text. Texts without any token get an all-zero row, and an empty
        Series yields an empty ``(0, dim)`` array.
    """
    vectors = Vectors(name='data/06_models/crawl-300d-2M.vec', cache='cache')

    if len(preprocessed_text) == 0:
        # np.stack refuses an empty sequence; return an empty matrix instead.
        return np.zeros((0, vectors.dim), dtype=np.float32)

    def embed(text):
        tokens = text.split()
        if not tokens:
            # get_vecs_by_tokens stacks one vector per token and fails on an
            # empty list, so blank texts are represented by a zero vector.
            return np.zeros(vectors.dim, dtype=np.float32)
        return np.asarray(vectors.get_vecs_by_tokens(tokens).mean(axis=0))

    X_fasttext = np.stack([embed(text) for text in preprocessed_text], axis=0)
    return X_fasttext
def load_pretrained_embeddings(path, vocab=None):
    """
    Load pretrained word vectors from the file at the given path.

    The file format is the same as
    https://www.kaggle.com/danielwillgeorge/glove6b100dtxt
    The vectors are also available at:
    https://www.dropbox.com/s/qxak38ybjom696y/glove.6B.100d.txt?dl=0

    For efficiency (time and memory), pass ``vocab`` to keep only the
    vectors you need. The format of the returned object is deliberately left
    unspecified, because it is only used internally by the calling code.

    Args:
        path (str): full path to the embeddings file.
        vocab (list): a list of words to have embeddings for.
            Defaults to None.

    Returns:
        The loaded ``Vectors`` object when ``vocab`` is None; otherwise the
        result of looking up every word of ``vocab`` (in order) with
        lower-case fallback enabled.
    """
    # The current working directory is used as the cache location.
    embeddings = Vectors(name=path, cache=os.getcwd())

    if vocab is None:
        return embeddings

    # Keep only the requested words, trying the lower-cased form of a word
    # when the word itself is not among the loaded tokens.
    return embeddings.get_vecs_by_tokens(vocab, lower_case_backup=True)
dataloader_dev: DataLoader = DataLoader(dataset_dev, batch_size=BATCH_SIZE_VALID_TEST) if USE_GLOVE: ''' Load the GloVe embeddings ''' from torchtext.vocab import Vectors vectors = Vectors(GLOVE_PATH, cache="./") pretrained_embeddings = torch.randn(len(vocab_words), vectors.dim) initialised = 0 for i, w in enumerate(vocab_words.itos): if w in vectors.stoi: initialised += 1 vec = vectors.get_vecs_by_tokens(w) pretrained_embeddings[i] = vec pretrained_embeddings[vocab_words[pad_token]] = torch.zeros(vectors.dim) hyperparameters.embedding_dim = vectors.dim hyperparameters.glove_embeddings = pretrained_embeddings hyperparameters.vocab_size_words = len(vocab_words) print("VECTOR DIM", vectors.dim) print("initialised embeddings {}".format(initialised)) print("random initialised embeddings {} ".format( len(vocab_words) - initialised)) print(hyperparameters) print(net_configuration) model: SRL_final_MODEL = SRL_final_MODEL(