# Beispiel #1 (Example #1, score: 0) — snippet separator from the original scrape
                glove_embeds_nonant_id.append([cat_embeds, 1])
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_ant_id.append([cat_embeds, 0])
                glove_embeds_nonant_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonant_id), np.array(glove_embeds_ant_id)


if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookups plus the raw form fed to BERT.
    glove_list, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Pronominal/pronominal_corpus.txt', noun_list)
    print("data processed")

    # BERT embedding taken at position pro_idx + 1 — presumably the pronoun
    # token after a leading [CLS]; TODO confirm against RSA.get_bert_embeds.
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the GloVe table restricted to that vocabulary.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    nonant_data, ant_data = get_diagnostic_input(glove_list,
                                                 glove,
                                                 word2idx,
                                                 bert_embeds,
# Beispiel #2 (Example #2, score: 0) — snippet separator from the original scrape
import random

# Sentence positions handed to RSA.get_glove_embeds — presumably token
# indices of adj1/subj/verb/adj2/obj in each corpus sentence (see the
# unpacking in __main__); TODO confirm against the corpus format.
lexical_idxs = [1, 2, 3, 5, 6]

# Transitive verbs used by RSA.get_glove_embeds (the -1 entry it returns
# is read as a random-verb control below).
verb_list = [
    'loves', 'hates', 'likes', 'smells', 'touches', 'pushes', 'moves', 'sees',
    'lifts', 'hits'
]

if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess Corpus: tokenized sentences for GloVe plus raw text for BERT.
    glove_list, bert_list = RSA.preprocess_data('./head_adj_trans_corpus.txt')
    print("data processed")

    # Generate glove hypothesis models: embed_dict maps each lexical index to
    # per-sentence GloVe vectors; key -1 appears to hold a random-verb control.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        lexical_idxs, verb_list)
    adj1 = np.array(embed_dict[lexical_idxs[0]])
    subj = np.array(embed_dict[lexical_idxs[1]])
    verb = np.array(embed_dict[lexical_idxs[2]])
    adj2 = np.array(embed_dict[lexical_idxs[3]])
    obj = np.array(embed_dict[lexical_idxs[4]])
    rand_verb = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Generate BERT reference model
                glove_embeds_nonarg_id.append([cat_embeds, 1])
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_subj_id.append([cat_embeds, 0])
                glove_embeds_nonarg_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonarg_id), np.array(glove_embeds_subj_id)


if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookups plus the raw form fed to BERT.
    glove_list, bert_list = RSA.preprocess_data(
        '../Subject_Tracking/Relative_Clauses/copula_RC_corpus.txt', noun_list)
    print("data processed")

    # BERT embedding taken at position verb_idx + 1 — presumably the verb
    # token after a leading [CLS]; TODO confirm against RSA.get_bert_embeds.
    # (A stray "glove embeds generated" print used to fire here, before any
    # GloVe work had happened, duplicating the message printed at the end.)
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the GloVe table restricted to that vocabulary.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # Concatenated BERT+GloVe diagnostic inputs — labels 0/1 distinguish
    # subject vs non-argument examples (see get_diagnostic_input).
    nonarg_data, subj_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                  bert_embeds)
    print("glove embeds generated")
    'person', 'painter', 'cop', 'student', 'teacher', 'lawyer', 'peasant',
    'chef', 'pilot', 'athlete', 'farmer', 'boys', 'girls', 'men', 'women',
    'guys', 'doctors', 'artists', 'robots', 'people', 'painters', 'cops',
    'students', 'teachers', 'lawyers', 'peasants', 'chefs', 'pilots',
    'athletes', 'farmers', 'house', 'building', 'chair', 'table', 'door',
    'window', 'plane', 'car', 'truck', 'houses', 'buildings', 'chairs',
    'tables', 'doors', 'windows', 'planes', 'cars', 'trucks'
]

if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus: tokenized sentences for GloVe plus raw text for BERT.
    glove_list, bert_list = RSA.preprocess_data('./copula_PP_corpus.txt',
                                                noun_list)
    print("data processed")

    # Get dictionary of Glove embedding hypothesis models — noun_idxs[0] is
    # read as the subject position, noun_idxs[1] as the non-argument noun, and
    # key -1 as a random control; TODO confirm against RSA.get_glove_embeds.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, verb_idx)
    glove_subj = np.array(embed_dict[noun_idxs[0]])
    glove_nonarg = np.array(embed_dict[noun_idxs[1]])
    glove_rand = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Get BERT embedding reference models at position verb_idx + 1
    # (presumably the verb token after a leading [CLS] — verify).
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")
# Beispiel #5 (Example #5, score: 0) — snippet separator from the original scrape
    print(np.min(encoding))
    print(np.max(encoding))

    probplot(encoding, plot=plt)
    plt.ylim(-12, 12)
    plt.xlim(-5, 5)
    plt.title(f"{corpus} QQ Plot: {word}", fontsize=17)
    plt.xlabel('Theoretical Quantiles', fontsize=17)
    plt.ylabel('Ordered Values', fontsize=17)
    plt.savefig(f"{corpus}_qq_plot_{word}")


if __name__ == "__main__":

    # For each corpus: extract BERT embeddings, draw a QQ plot for one token
    # position, and report how many embedding dimensions look non-normal.
    print('Analyze Anaphor Corpus')
    _, bert_list = RSA.preprocess_data('../Binding_Theory/Anaphor/anaphor_corpus.txt')
    print("data processed")
    # -2 selects a token position from the end — presumably the anaphor; verify.
    qq_bert(bert_list, -2, 'Anaphor')
    # NOTE(review): `means` is not used in the visible code — it may be
    # consumed further down in the original script.
    prop, total, means = test_bert_embeds(bert_list)
    print(f'Percentage non-normal: {prop}')
    print(f'Total embeds in unique contexts: {total}')

    print('Analyze Pronominal Corpus')
    _, bert_list = RSA.preprocess_data('../Binding_Theory/Pronominal/pronominal_corpus.txt')
    print("data processed")
    qq_bert(bert_list, -2, 'Pronominal')
    prop, total, means = test_bert_embeds(bert_list)
    print(f'Percentage non-normal: {prop}')
    print(f'Total embeds in unique contexts: {total}')

    print('Analyze Prepositional Phrase Corpus')
# Sentence position of the pronoun token — presumably a 0-based index into
# the tokenized corpus sentence; TODO confirm against the corpus format.
pro_idx = 7

# Nouns appearing in the corpus; passed to RSA.preprocess_data and
# RSA.get_glove_embeds below.
noun_list = [
    'doctor', 'artist', 'robot', 'person', 'dancer', 'painter', 'cop',
    'politician', 'student', 'teacher', 'farmer', 'banker', 'lawyer',
    'peasant', 'chef', 'pilot', 'athlete', 'fairy', 'monster', 'alien',
    'ghost', 'vampire', 'mummy'
]

if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess Corpus: tokenized sentences for GloVe plus raw text for BERT.
    glove_list, bert_list = RSA.preprocess_data('./pronominal_corpus.txt',
                                                noun_list)
    print("data processed")

    # Generate dictionary of Glove embedding hypothesis models —
    # noun_idxs[0] is read as the antecedent noun, noun_idxs[1] as the
    # non-antecedent, and key -1 as a random control; TODO confirm against
    # RSA.get_glove_embeds.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, pro_idx)
    glove_ant = np.array(embed_dict[noun_idxs[0]])
    glove_nonant = np.array(embed_dict[noun_idxs[1]])
    glove_rand = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Generate BERT embedding reference model at position pro_idx + 1
    # (presumably the pronoun token after a leading [CLS] — verify).
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")
# Sentence position of the pronoun/anaphor token — presumably a 0-based index
# into the tokenized corpus sentence; TODO confirm against the corpus format.
pro_idx = 7

# Nouns appearing in the corpus; passed to RSA.preprocess_data and
# RSA.get_glove_embeds below.
noun_list = [
    'doctor', 'artist', 'robot', 'person', 'dancer', 'painter', 'cop',
    'politician', 'student', 'teacher', 'farmer', 'banker', 'lawyer',
    'peasant', 'chef', 'pilot', 'athlete', 'fairy', 'monster', 'alien',
    'ghost', 'vampire', 'mummy'
]

if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus: tokenized sentences for GloVe plus raw text for BERT.
    glove_list, bert_list = RSA.preprocess_data('./anaphor_corpus.txt',
                                                noun_list)
    print("data processed")

    # Get dictionary of Glove embedding hypothesis models — noun_idxs[0] is
    # read as the antecedent noun, noun_idxs[1] as the non-antecedent, and
    # key -1 as a random control; TODO confirm against RSA.get_glove_embeds.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, pro_idx)
    glove_ant = np.array(embed_dict[noun_idxs[0]])
    glove_nonant = np.array(embed_dict[noun_idxs[1]])
    glove_rand = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Get BERT embedding reference models at position pro_idx + 1
    # (presumably the anaphor token after a leading [CLS] — verify).
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")
# Beispiel #8 (Example #8, score: 0) — snippet separator from the original scrape
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_subj_id.append([cat_embeds, 0])
                glove_embeds_nonarg_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonarg_id), np.array(glove_embeds_subj_id)


if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookups plus the raw form fed to BERT.
    glove_list, bert_list = RSA.preprocess_data(
        '../Subject_Tracking/Prepositional_Phrases/copula_PP_corpus.txt',
        noun_list)
    print("data processed")

    # BERT embedding taken at position verb_idx + 1 — presumably the verb
    # token after a leading [CLS]; TODO confirm against RSA.get_bert_embeds.
    # (A stray "glove embeds generated" print used to fire here, before any
    # GloVe work had happened; it now follows get_diagnostic_input, matching
    # the sibling copula_RC script.)
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the GloVe table restricted to that vocabulary.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # Concatenated BERT+GloVe diagnostic inputs — labels 0/1 distinguish
    # subject vs non-argument examples (see get_diagnostic_input).
    nonarg_data, subj_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                  bert_embeds)
    print("glove embeds generated")
# Beispiel #9 (Example #9, score: 0) — snippet separator from the original scrape
                glove_embeds_nonant_id.append([cat_embeds, 1])
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_ant_id.append([cat_embeds, 0])
                glove_embeds_nonant_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonant_id), np.array(glove_embeds_ant_id)


if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookups plus the raw form fed to BERT.
    glove_list, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Anaphor/anaphor_corpus.txt', noun_list)
    print("data processed")

    # BERT embedding taken at position pro_idx + 1 — presumably the anaphor
    # token after a leading [CLS]; TODO confirm against RSA.get_bert_embeds.
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the GloVe table restricted to that vocabulary.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    nonant_data, ant_data = get_diagnostic_input(glove_list,
                                                 glove,
                                                 word2idx,
                                                 bert_embeds,
import random

# Sentence positions handed to RSA.get_glove_embeds — presumably token
# indices of subject and verb in each corpus sentence (see the unpacking in
# __main__); TODO confirm against the corpus format.
lexical_idxs = [1, 2]

# Intransitive verbs used by RSA.get_glove_embeds (the -1 entry it returns
# is read as a random-verb control below).
verb_list = [
    'talks', 'swims', 'walks', 'screams', 'fights', 'hides', 'eats', 'runs',
    'thinks', 'works'
]

if __name__ == "__main__":

    # Fix both NumPy and stdlib RNG seeds so runs are reproducible.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess Corpus: tokenized sentences for GloVe plus raw text for BERT.
    glove_list, bert_list = RSA.preprocess_data('./head_simple_corpus.txt')
    print("data processed")

    # Generate Glove embedding hypothesis models: embed_dict maps each lexical
    # index to per-sentence GloVe vectors; key -1 appears to hold a
    # random-verb control.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        lexical_idxs, verb_list)
    subj = np.array(embed_dict[lexical_idxs[0]])
    verb = np.array(embed_dict[lexical_idxs[1]])
    rand_verb = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Generate BERT embedding reference models at position 0 — presumably the
    # sentence-initial ([CLS]) embedding here; verify against RSA.get_bert_embeds.
    bert_embeds = RSA.get_bert_embeds(bert_list, 0)
    print("BERT embeds generated")