Example #1
import torch
from torchtext.vocab import Vectors


def load_glove_embeddings(sentences):
    """
    Converts each word of the sentences to the respective Glove embeddings.

    :param sentences
    return:
    """

    # Load the GloVe vectors saved locally.
    glove_vectors = Vectors('glove.6B.300d.txt', './pretrained_weights/')

    # Convert the input sentences to embeddings.
    final_sentences = []
    batch_size = len(sentences)
    max_len = max([len(sentence) for sentence in sentences])
    for sentence in sentences:
        sentence_with_embeddings = glove_vectors.get_vecs_by_tokens(sentence)

        # Pad shorter sentences with zero vectors up to max_len.
        if len(sentence_with_embeddings) < max_len:
            padding = torch.zeros(
                [max_len - len(sentence_with_embeddings), 300]).float()
            sentence_with_embeddings = torch.cat(
                [sentence_with_embeddings, padding], dim=0)

        final_sentences.append(sentence_with_embeddings)
    return torch.stack(final_sentences).view(batch_size, max_len, 300)
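
A minimal usage sketch (the tokenized sentences are made-up inputs; it assumes glove.6B.300d.txt has already been downloaded to ./pretrained_weights/ as the function expects):

batch = [["the", "cat", "sat"], ["hello", "world"]]
embedded = load_glove_embeddings(batch)
print(embedded.shape)  # torch.Size([2, 3, 300])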
Example #2
import numpy as np
import pandas as pd
from torchtext.vocab import Vectors


def tfidf_fasttext_pretrained_vectorize(conf: dict, preprocessed_text: pd.Series, name: str):
    # Average the pretrained fastText vectors of each document's tokens into
    # a single 300-dimensional sentence embedding.
    vectors = Vectors(name='data/06_models/crawl-300d-2M.vec', cache='cache')
    sent_emb = preprocessed_text.apply(
        lambda text:
            vectors.get_vecs_by_tokens(text.split()).mean(axis=0)
    )
    X_fasttext = np.stack(sent_emb.values, axis=0)
    return X_fasttext
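
A hedged usage sketch (the sample texts and empty config dict are placeholders; it assumes crawl-300d-2M.vec is present at the path hard-coded above):

texts = pd.Series(["the quick brown fox", "jumps over the lazy dog"])
X = tfidf_fasttext_pretrained_vectorize({}, texts, "fasttext_pretrained")
print(X.shape)  # (2, 300): one averaged fastText embedding per document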
Example #3
import os

from torchtext.vocab import Vectors


def load_pretrained_embeddings(path, vocab=None):
    """ Returns an object with the the pretrained vectors, loaded from the
        file at the specified path. The file format is the same as
        https://www.kaggle.com/danielwillgeorge/glove6b100dtxt
        You can also access the vectors at:
         https://www.dropbox.com/s/qxak38ybjom696y/glove.6B.100d.txt?dl=0
         (for efficiency (time and memory) - load only the vectors you need)
        The format of the vectors object is not specified as it will be used
        internaly in your code, so you can use the datastructure of your choice.

    Args:
        path (str): full path to the embeddings file
        vocab (list): a list of words to have embeddings for. Defaults to None.

    """
    vectors = Vectors(name=path, cache=os.getcwd())
    if vocab is not None:
        vectors = vectors.get_vecs_by_tokens(vocab, lower_case_backup=True)
    return vectors
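
A minimal usage sketch (the local file name and the two-word vocab are illustrative assumptions):

word_vectors = load_pretrained_embeddings("glove.6B.100d.txt", vocab=["hello", "world"])
print(word_vectors.shape)  # torch.Size([2, 100]): one row per requested word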
dataloader_dev: DataLoader = DataLoader(dataset_dev,
                                        batch_size=BATCH_SIZE_VALID_TEST)

if USE_GLOVE:
    # Load the GloVe embeddings.
    from torchtext.vocab import Vectors

    vectors = Vectors(GLOVE_PATH, cache="./")
    # Start from random embeddings, then overwrite the rows of words that
    # have a pretrained GloVe vector.
    pretrained_embeddings = torch.randn(len(vocab_words), vectors.dim)
    initialised = 0
    for i, w in enumerate(vocab_words.itos):
        if w in vectors.stoi:
            initialised += 1
            pretrained_embeddings[i] = vectors.get_vecs_by_tokens(w)

    # The padding token keeps an all-zero embedding.
    pretrained_embeddings[vocab_words[pad_token]] = torch.zeros(vectors.dim)
    hyperparameters.embedding_dim = vectors.dim
    hyperparameters.glove_embeddings = pretrained_embeddings
    hyperparameters.vocab_size_words = len(vocab_words)
    print("VECTOR DIM", vectors.dim)
    print("initialised embeddings {}".format(initialised))
    print("random initialised embeddings {} ".format(
        len(vocab_words) - initialised))

print(hyperparameters)
print(net_configuration)

model: SRL_final_MODEL = SRL_final_MODEL(