예제 #1
0
                glove_embeds_nonant_id.append([cat_embeds, 1])
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_ant_id.append([cat_embeds, 0])
                glove_embeds_nonant_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonant_id), np.array(glove_embeds_ant_id)


# Driver: build labeled diagnostic-classifier inputs for the pronominal
# binding-theory corpus (BERT pronoun embedding paired with GloVe noun embeddings).
if __name__ == "__main__":

    # Seed both NumPy and the stdlib RNG for reproducible runs.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookup plus raw sentences for BERT.
    # NOTE(review): noun_list / pro_idx are presumably module constants defined
    # above this excerpt — confirm.
    glove_list, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Pronominal/pronominal_corpus.txt', noun_list)
    print("data processed")

    # Contextual BERT embedding at the pronoun position; the +1 offset
    # presumably accounts for BERT's prepended [CLS] token — TODO confirm
    # against RSA.get_bert_embeds.
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the 300-d GloVe table restricted to this corpus.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # Antecedent vs. non-antecedent training pairs; this call continues
    # beyond the visible excerpt.
    nonant_data, ant_data = get_diagnostic_input(glove_list,
                                                 glove,
                                                 word2idx,
                                                 bert_embeds,
예제 #2
0
import random

# Sentence positions of the lexical (content) tokens probed in each corpus
# sentence — NOTE(review): exact positions depend on the corpus template; confirm.
lexical_idxs = [1, 2, 3, 5, 6]

# Transitive verbs used to build the GloVe hypothesis models for this corpus.
verb_list = [
    'loves', 'hates', 'likes', 'smells', 'touches', 'pushes', 'moves', 'sees',
    'lifts', 'hits'
]

# Driver: RSA comparison for the adjective-modified transitive corpus —
# GloVe hypothesis models per lexical slot vs. a BERT reference model.
if __name__ == "__main__":

    # Seed both RNGs for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus into GloVe-tokenized and BERT-ready sentence lists.
    glove_list, bert_list = RSA.preprocess_data('./head_adj_trans_corpus.txt')
    print("data processed")

    # GloVe hypothesis models: one embedding matrix per lexical slot,
    # keyed by token position; -1 holds a random-verb control model.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        lexical_idxs, verb_list)
    adj1 = np.array(embed_dict[lexical_idxs[0]])    # first adjective slot
    subj = np.array(embed_dict[lexical_idxs[1]])    # subject noun slot
    verb = np.array(embed_dict[lexical_idxs[2]])    # main verb slot
    adj2 = np.array(embed_dict[lexical_idxs[3]])    # second adjective slot
    obj = np.array(embed_dict[lexical_idxs[4]])     # object noun slot
    rand_verb = np.array(embed_dict[-1])            # random-verb baseline
    print("glove embeds generated")

    # Generate BERT reference model (continues beyond this excerpt).
                glove_embeds_nonarg_id.append([cat_embeds, 1])
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_subj_id.append([cat_embeds, 0])
                glove_embeds_nonarg_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonarg_id), np.array(glove_embeds_subj_id)


# Driver: build labeled diagnostic-classifier inputs for the copular
# relative-clause subject-tracking corpus (BERT verb embedding paired with
# GloVe noun embeddings).
if __name__ == "__main__":

    # Seed both NumPy and the stdlib RNG for reproducible runs.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookup plus raw sentences for BERT.
    # NOTE(review): noun_list / verb_idx are presumably module constants
    # defined above this excerpt — confirm.
    glove_list, bert_list = RSA.preprocess_data(
        '../Subject_Tracking/Relative_Clauses/copula_RC_corpus.txt', noun_list)
    print("data processed")

    # Fix: removed a stray 'print("glove embeds generated")' that fired here,
    # before any GloVe work had been done (the GloVe table is only built
    # below) — it duplicated the accurate message after get_diagnostic_input.
    # Contextual BERT embedding at the verb position; +1 presumably offsets
    # for BERT's prepended [CLS] token — TODO confirm.
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the 300-d GloVe table restricted to this corpus.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # Subject vs. non-argument training pairs for the diagnostic classifier.
    nonarg_data, subj_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                  bert_embeds)
    print("glove embeds generated")
예제 #4
0
    'person', 'painter', 'cop', 'student', 'teacher', 'lawyer', 'peasant',
    'chef', 'pilot', 'athlete', 'farmer', 'boys', 'girls', 'men', 'women',
    'guys', 'doctors', 'artists', 'robots', 'people', 'painters', 'cops',
    'students', 'teachers', 'lawyers', 'peasants', 'chefs', 'pilots',
    'athletes', 'farmers', 'house', 'building', 'chair', 'table', 'door',
    'window', 'plane', 'car', 'truck', 'houses', 'buildings', 'chairs',
    'tables', 'doors', 'windows', 'planes', 'cars', 'trucks'
]

# Driver: RSA comparison for the copular prepositional-phrase corpus —
# GloVe noun hypothesis models vs. a BERT reference model at the verb.
if __name__ == "__main__":

    # Seed both RNGs for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus into GloVe-tokenized and BERT-ready sentence lists.
    glove_list, bert_list = RSA.preprocess_data('./copula_PP_corpus.txt',
                                                noun_list)
    print("data processed")

    # GloVe hypothesis models keyed by token position; -1 holds a random
    # control model. NOTE(review): noun_idxs / verb_idx are presumably
    # module constants defined above this excerpt — confirm.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, verb_idx)
    glove_subj = np.array(embed_dict[noun_idxs[0]])      # subject-noun model
    glove_nonarg = np.array(embed_dict[noun_idxs[1]])    # non-argument-noun model
    glove_rand = np.array(embed_dict[-1])                # random baseline
    print("glove embeds generated")

    # BERT reference model at the verb position; +1 presumably offsets
    # for BERT's prepended [CLS] token — TODO confirm.
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")
예제 #5
0
    print(np.min(encoding))
    print(np.max(encoding))

    probplot(encoding, plot=plt)
    plt.ylim(-12, 12)
    plt.xlim(-5, 5)
    plt.title(f"{corpus} QQ Plot: {word}", fontsize=17)
    plt.xlabel('Theoretical Quantiles', fontsize=17)
    plt.ylabel('Ordered Values', fontsize=17)
    plt.savefig(f"{corpus}_qq_plot_{word}")


# Driver: per-corpus normality analysis of BERT embeddings — QQ plot of one
# embedding dimension plus aggregate non-normality statistics.
if __name__ == "__main__":

    print('Analyze Anaphor Corpus')
    # Only the BERT-ready sentence list is needed here; GloVe output discarded.
    _, bert_list = RSA.preprocess_data('../Binding_Theory/Anaphor/anaphor_corpus.txt')
    print("data processed")
    # QQ plot at token position -2 (second-to-last token — presumably the
    # anaphor; confirm against the corpus template).
    qq_bert(bert_list, -2, 'Anaphor')
    prop, total, means = test_bert_embeds(bert_list)
    print(f'Percentage non-normal: {prop}')
    print(f'Total embeds in unique contexts: {total}')

    print('Analyze Pronominal Corpus')
    # Same analysis repeated on the pronominal corpus.
    _, bert_list = RSA.preprocess_data('../Binding_Theory/Pronominal/pronominal_corpus.txt')
    print("data processed")
    qq_bert(bert_list, -2, 'Pronominal')
    prop, total, means = test_bert_embeds(bert_list)
    print(f'Percentage non-normal: {prop}')
    print(f'Total embeds in unique contexts: {total}')

    # Third corpus analysis continues beyond this excerpt.
    print('Analyze Prepositional Phrase Corpus')
# Sentence position of the pronoun token in each corpus sentence
# (NOTE(review): depends on the corpus template — confirm).
pro_idx = 7

# Candidate antecedent nouns appearing in the corpus; used both to filter
# sentences during preprocessing and to build GloVe hypothesis models.
noun_list = [
    'doctor', 'artist', 'robot', 'person', 'dancer', 'painter', 'cop',
    'politician', 'student', 'teacher', 'farmer', 'banker', 'lawyer',
    'peasant', 'chef', 'pilot', 'athlete', 'fairy', 'monster', 'alien',
    'ghost', 'vampire', 'mummy'
]

# Driver: RSA comparison for the pronominal binding corpus — GloVe
# antecedent/non-antecedent hypothesis models vs. a BERT reference model
# at the pronoun.
if __name__ == "__main__":

    # Seed both RNGs for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus into GloVe-tokenized and BERT-ready sentence lists.
    glove_list, bert_list = RSA.preprocess_data('./pronominal_corpus.txt',
                                                noun_list)
    print("data processed")

    # GloVe hypothesis models keyed by token position; -1 holds a random
    # control model. NOTE(review): noun_idxs is presumably a module constant
    # defined above this excerpt — confirm.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, pro_idx)
    glove_ant = np.array(embed_dict[noun_idxs[0]])       # antecedent-noun model
    glove_nonant = np.array(embed_dict[noun_idxs[1]])    # non-antecedent model
    glove_rand = np.array(embed_dict[-1])                # random baseline
    print("glove embeds generated")

    # BERT reference model at the pronoun position; +1 presumably offsets
    # for BERT's prepended [CLS] token — TODO confirm.
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")
예제 #7
0
# Sentence position of the anaphor/pronoun token in each corpus sentence
# (NOTE(review): depends on the corpus template — confirm).
pro_idx = 7

# Candidate antecedent nouns appearing in the corpus; used both to filter
# sentences during preprocessing and to build GloVe hypothesis models.
noun_list = [
    'doctor', 'artist', 'robot', 'person', 'dancer', 'painter', 'cop',
    'politician', 'student', 'teacher', 'farmer', 'banker', 'lawyer',
    'peasant', 'chef', 'pilot', 'athlete', 'fairy', 'monster', 'alien',
    'ghost', 'vampire', 'mummy'
]

# Driver: RSA comparison for the anaphor binding corpus — GloVe
# antecedent/non-antecedent hypothesis models vs. a BERT reference model
# at the anaphor.
if __name__ == "__main__":

    # Seed both RNGs for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus into GloVe-tokenized and BERT-ready sentence lists.
    glove_list, bert_list = RSA.preprocess_data('./anaphor_corpus.txt',
                                                noun_list)
    print("data processed")

    # GloVe hypothesis models keyed by token position; -1 holds a random
    # control model. NOTE(review): noun_idxs is presumably a module constant
    # defined above this excerpt — confirm.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, pro_idx)
    glove_ant = np.array(embed_dict[noun_idxs[0]])       # antecedent-noun model
    glove_nonant = np.array(embed_dict[noun_idxs[1]])    # non-antecedent model
    glove_rand = np.array(embed_dict[-1])                # random baseline
    print("glove embeds generated")

    # BERT reference model at the anaphor position; +1 presumably offsets
    # for BERT's prepended [CLS] token — TODO confirm.
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")
예제 #8
0
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_subj_id.append([cat_embeds, 0])
                glove_embeds_nonarg_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonarg_id), np.array(glove_embeds_subj_id)


# Driver: build labeled diagnostic-classifier inputs for the copular
# prepositional-phrase subject-tracking corpus (BERT verb embedding paired
# with GloVe noun embeddings).
if __name__ == "__main__":

    # Seed both NumPy and the stdlib RNG for reproducible runs.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookup plus raw sentences for BERT.
    # NOTE(review): noun_list / verb_idx are presumably module constants
    # defined above this excerpt — confirm.
    glove_list, bert_list = RSA.preprocess_data(
        '../Subject_Tracking/Prepositional_Phrases/copula_PP_corpus.txt',
        noun_list)
    print("data processed")

    # Fix: removed a stray 'print("glove embeds generated")' that fired here,
    # before any GloVe work had been done (the GloVe table is only built
    # below) — a copy-paste leftover that produced a misleading log line.
    # Contextual BERT embedding at the verb position; +1 presumably offsets
    # for BERT's prepended [CLS] token — TODO confirm.
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the 300-d GloVe table restricted to this corpus.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # Subject vs. non-argument training pairs for the diagnostic classifier.
    nonarg_data, subj_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                  bert_embeds)
예제 #9
0
                glove_embeds_nonant_id.append([cat_embeds, 1])
            else:
                cat_embeds = np.concatenate(
                    (bert_pronoun, glove[word2idx[word]]))
                glove_embeds_ant_id.append([cat_embeds, 0])
                glove_embeds_nonant_id.append([cat_embeds, 0])

    return np.array(glove_embeds_nonant_id), np.array(glove_embeds_ant_id)


# Driver: build labeled diagnostic-classifier inputs for the anaphor
# binding-theory corpus (BERT anaphor embedding paired with GloVe noun
# embeddings).
if __name__ == "__main__":

    # Seed both NumPy and the stdlib RNG for reproducible runs.
    np.random.seed(seed=9)
    random.seed(9)

    # Tokenized sentences for GloVe lookup plus raw sentences for BERT.
    # NOTE(review): noun_list / pro_idx are presumably module constants
    # defined above this excerpt — confirm.
    glove_list, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Anaphor/anaphor_corpus.txt', noun_list)
    print("data processed")

    # Contextual BERT embedding at the anaphor position; +1 presumably
    # offsets for BERT's prepended [CLS] token — TODO confirm.
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")

    # Vocabulary index maps and the 300-d GloVe table restricted to this corpus.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")
    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # Antecedent vs. non-antecedent training pairs; this call continues
    # beyond the visible excerpt.
    nonant_data, ant_data = get_diagnostic_input(glove_list,
                                                 glove,
                                                 word2idx,
                                                 bert_embeds,
예제 #10
0
import random

# Sentence positions of the subject and verb tokens in each corpus sentence
# (NOTE(review): depends on the corpus template — confirm).
lexical_idxs = [1, 2]

# Intransitive verbs used to build the GloVe hypothesis models for this corpus.
verb_list = [
    'talks', 'swims', 'walks', 'screams', 'fights', 'hides', 'eats', 'runs',
    'thinks', 'works'
]

# Driver: RSA comparison for the simple subject-verb corpus — GloVe
# hypothesis models per lexical slot vs. a BERT reference model.
if __name__ == "__main__":

    # Seed both RNGs for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus into GloVe-tokenized and BERT-ready sentence lists.
    glove_list, bert_list = RSA.preprocess_data('./head_simple_corpus.txt')
    print("data processed")

    # GloVe hypothesis models keyed by token position; -1 holds a
    # random-verb control model.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        lexical_idxs, verb_list)
    subj = np.array(embed_dict[lexical_idxs[0]])    # subject-noun model
    verb = np.array(embed_dict[lexical_idxs[1]])    # verb model
    rand_verb = np.array(embed_dict[-1])            # random-verb baseline
    print("glove embeds generated")

    # BERT reference model at index 0 — unlike the other drivers this passes
    # no +1 offset; NOTE(review): confirm which token position index 0
    # denotes in RSA.get_bert_embeds (possibly [CLS]).
    bert_embeds = RSA.get_bert_embeds(bert_list, 0)
    print("BERT embeds generated")