Example #1
0
def get_glove_embeds(glove_path, dim, corpus):
    """Return the GloVe embedding of every term in ``corpus``.

    Args:
        glove_path: Path of the GloVe file, forwarded unchanged to
            ``utils.create_embedding_dictionary``.
        dim: Embedding dimensionality, forwarded unchanged to
            ``utils.create_embedding_dictionary``.
        corpus: Iterable of terms to look up.

    Returns:
        A NumPy array built from one embedding per term, in corpus order
        (presumably of shape ``(len(corpus), dim)`` -- TODO confirm against
        ``utils.create_embedding_dictionary``).
    """
    # The whole corpus is handed over as a single-element list.
    word2idx, idx2word = utils.create_word_idx_matrices([corpus])
    print("word idx matrices created")

    glove = utils.create_embedding_dictionary(
        glove_path, dim, word2idx, idx2word)
    print("glove matrices created")

    # Term -> vocabulary index -> embedding, preserving corpus order.
    return np.array([glove[word2idx[term]] for term in corpus])
Example #2
0
def get_glove_embeds(sent_list,
                     glove_path,
                     dim,
                     idxs,
                     rand_words,
                     idx_of_interest=None):
    """Collect GloVe embeddings for selected positions of every sentence.

    For each sentence, the words at the positions in ``idxs`` are embedded
    and stored under their position. In addition, one word drawn with
    ``random.choice`` from ``rand_words`` (excluding the words just selected
    from the sentence) is embedded and stored under the key ``-1``.

    When ``idx_of_interest`` is given, every stored vector is the
    concatenation of the embedding of ``sent[idx_of_interest]`` followed by
    the embedding of the selected (or random) word.

    Args:
        sent_list: Sequence of sentences; each sentence is indexed by
            integer position.
        glove_path: Path of the GloVe file, forwarded unchanged to
            ``utils.create_embedding_dictionary``.
        dim: Embedding dimensionality, forwarded unchanged to
            ``utils.create_embedding_dictionary``.
        idxs: Sequence of positions to extract from each sentence.
        rand_words: Pool of candidate words for the random control word.
        idx_of_interest: Optional position whose embedding is prepended to
            every stored vector.

    Returns:
        A ``defaultdict(list)`` mapping each position in ``idxs`` (and
        ``-1`` for the random word) to a list with one vector per sentence.
    """
    word2idx, idx2word = utils.create_word_idx_matrices(sent_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(glove_path, dim, word2idx,
                                              idx2word)
    print("glove matrices created")

    def embed(token):
        # Token -> vocabulary index -> embedding.
        return glove[word2idx[token]]

    embeds_dict = defaultdict(list)
    pair_with_anchor = idx_of_interest is not None

    for sentence in sent_list:
        selected = [sentence[position] for position in idxs]

        # Sorting fixes the candidate order before sampling, so the draw
        # does not depend on set iteration order.
        candidates = sorted(set(rand_words) - set(selected))
        control_word = random.choice(candidates)

        if pair_with_anchor:
            anchor_vec = embed(sentence[idx_of_interest])

            for position, token in zip(idxs, selected):
                embeds_dict[position].append(
                    np.concatenate((anchor_vec, embed(token))))

            embeds_dict[-1].append(
                np.concatenate((anchor_vec, embed(control_word))))
        else:
            for position, token in zip(idxs, selected):
                embeds_dict[position].append(embed(token))

            embeds_dict[-1].append(embed(control_word))

    return embeds_dict
Example #3
0
# Script entry point: builds BERT and GloVe inputs for the pronominal corpus
# and prepares shuffled feature/label arrays.
# NOTE(review): `noun_list`, `pro_idx`, `get_diagnostic_input`, `RSA` and
# `utils` are defined outside the lines visible here.
if __name__ == "__main__":

    # Seed both NumPy's and the stdlib's global RNGs.
    np.random.seed(seed=9)
    random.seed(9)

    # Returns two lists: one fed to the GloVe pipeline, one to BERT.
    glove_list, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Pronominal/pronominal_corpus.txt', noun_list)
    print("data processed")

    # NOTE(review): the `+ 1` presumably accounts for a leading special
    # token added by BERT tokenization -- confirm in RSA.get_bert_embeds.
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")

    # Build the vocabulary from the GloVe-side sentences, then load the
    # 300-dimensional GloVe vectors for it.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # Returns two datasets; they are unpacked as (nonant_data, ant_data).
    nonant_data, ant_data = get_diagnostic_input(glove_list,
                                                 glove,
                                                 word2idx,
                                                 bert_embeds,
                                                 anaphor=False)
    print("glove embeds generated")

    # Re-seed NumPy right before shuffling (presumably so the shuffle order
    # does not depend on how much randomness the steps above consumed).
    # Both datasets are shuffled in place.
    np.random.seed(seed=9)
    np.random.shuffle(nonant_data)
    np.random.shuffle(ant_data)

    # Each ant_data entry is indexed as [features, label]: column 0 is
    # concatenated and reshaped into rows of 1068 values, column 1 becomes
    # a flat integer label vector.
    # NOTE(review): 1068 is presumably 768 (BERT) + 300 (GloVe) -- confirm
    # against get_diagnostic_input.
    ant_X = np.concatenate(np.array(ant_data)[:, 0]).reshape(-1, 1068)
    ant_Y = np.array(ant_data)[:, 1].reshape(-1).astype("int")