# NOTE(review): fragment — the header and earlier body of the enclosing
# diagnostic-input builder are not visible in this chunk. The lines below are
# its tail: the non-antecedent branch labels the concatenated embedding 1,
# the fallthrough branch labels it 0 for both collections.
        glove_embeds_nonant_id.append([cat_embeds, 1])
    else:
        # Pair the BERT pronoun embedding with the word's GloVe vector.
        cat_embeds = np.concatenate(
            (bert_pronoun, glove[word2idx[word]]))
        glove_embeds_ant_id.append([cat_embeds, 0])
        glove_embeds_nonant_id.append([cat_embeds, 0])

    # Returns (non-antecedent-labelled data, antecedent-labelled data).
    return np.array(glove_embeds_nonant_id), np.array(glove_embeds_ant_id)


if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Load and tokenize the pronominal binding-theory corpus.
    glove_list, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Pronominal/pronominal_corpus.txt', noun_list)
    print("data processed")

    # BERT embedding at the pronoun position (+1 presumably skips the [CLS]
    # token — TODO confirm against RSA.get_bert_embeds).
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")

    # Build vocabulary index maps and the GloVe lookup table.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")

    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # NOTE(review): this call is truncated at the chunk boundary — remaining
    # arguments are not visible here.
    nonant_data, ant_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                 bert_embeds,
import random

# Sentence positions of the open-class slots consumed below; presumably
# adjective-1, subject, verb, adjective-2, object — TODO confirm against the
# corpus format in head_adj_trans_corpus.txt.
lexical_idxs = [1, 2, 3, 5, 6]

# Transitive verbs appearing in the corpus.
verb_list = [
    'loves', 'hates', 'likes', 'smells', 'touches', 'pushes', 'moves', 'sees',
    'lifts', 'hits'
]

if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess Corpus
    glove_list, bert_list = RSA.preprocess_data('./head_adj_trans_corpus.txt')
    print("data processed")

    # Generate glove hypothesis models: one embedding matrix per lexical slot.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        lexical_idxs, verb_list)
    adj1 = np.array(embed_dict[lexical_idxs[0]])
    subj = np.array(embed_dict[lexical_idxs[1]])
    verb = np.array(embed_dict[lexical_idxs[2]])
    adj2 = np.array(embed_dict[lexical_idxs[3]])
    obj = np.array(embed_dict[lexical_idxs[4]])
    # Key -1 presumably holds a random-verb control model — verify in
    # RSA.get_glove_embeds.
    rand_verb = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Generate BERT reference model
    # NOTE(review): chunk is truncated here — the BERT-embedding code that
    # this comment introduces is not visible.
# NOTE(review): fragment — the header and earlier body of the enclosing
# diagnostic-input builder are not visible in this chunk. The lines below are
# its tail: the non-argument branch labels the embedding 1, the fallthrough
# branch labels it 0 for both collections.
        glove_embeds_nonarg_id.append([cat_embeds, 1])
    else:
        # Pair the BERT embedding with the word's GloVe vector.
        cat_embeds = np.concatenate(
            (bert_pronoun, glove[word2idx[word]]))
        glove_embeds_subj_id.append([cat_embeds, 0])
        glove_embeds_nonarg_id.append([cat_embeds, 0])

    # Returns (non-argument-labelled data, subject-labelled data).
    return np.array(glove_embeds_nonarg_id), np.array(glove_embeds_subj_id)


if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Load and tokenize the copula relative-clause corpus.
    glove_list, bert_list = RSA.preprocess_data(
        '../Subject_Tracking/Relative_Clauses/copula_RC_corpus.txt',
        noun_list)
    print("data processed")
    # NOTE(review): premature — no GloVe work has happened yet at this point,
    # and the same message is printed again after get_diagnostic_input below.
    # Likely a copy-paste leftover; consider removing one of the two.
    print("glove embeds generated")

    # BERT embedding at the verb position (+1 presumably skips the [CLS]
    # token — TODO confirm against RSA.get_bert_embeds).
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")

    # Build vocabulary index maps and the GloVe lookup table.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")

    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    nonarg_data, subj_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                  bert_embeds)
    print("glove embeds generated")
# NOTE(review): fragment — the opening of this list literal is not visible in
# this chunk; given the `noun_list` usage below, these items are presumably
# the tail of `noun_list` (singular and plural animate nouns, then inanimate
# nouns) — confirm against the full file.
    'person', 'painter', 'cop', 'student', 'teacher', 'lawyer', 'peasant',
    'chef', 'pilot', 'athlete', 'farmer', 'boys', 'girls', 'men', 'women',
    'guys', 'doctors', 'artists', 'robots', 'people', 'painters', 'cops',
    'students', 'teachers', 'lawyers', 'peasants', 'chefs', 'pilots',
    'athletes', 'farmers', 'house', 'building', 'chair', 'table', 'door',
    'window', 'plane', 'car', 'truck', 'houses', 'buildings', 'chairs',
    'tables', 'doors', 'windows', 'planes', 'cars', 'trucks'
]

if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus
    glove_list, bert_list = RSA.preprocess_data('./copula_PP_corpus.txt',
                                                noun_list)
    print("data processed")

    # Get dictionary of Glove embedding hypothesis models, one per noun slot.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, verb_idx)
    glove_subj = np.array(embed_dict[noun_idxs[0]])
    glove_nonarg = np.array(embed_dict[noun_idxs[1]])
    # Key -1 presumably holds a random-control model — verify in
    # RSA.get_glove_embeds.
    glove_rand = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Get BERT embedding reference models at the verb position (+1 presumably
    # skips the [CLS] token — TODO confirm).
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")
# NOTE(review): fragment — the header of the enclosing QQ-plot helper
# (called as `qq_bert(bert_list, word_idx, corpus)` below) is not visible in
# this chunk. The tail prints the encoding's range and saves a QQ plot
# against the normal distribution.
    print(np.min(encoding))
    print(np.max(encoding))

    # Quantile-quantile plot of the embedding values vs. a theoretical
    # normal distribution.
    probplot(encoding, plot=plt)
    plt.ylim(-12, 12)
    plt.xlim(-5, 5)
    plt.title(f"{corpus} QQ Plot: {word}", fontsize=17)
    plt.xlabel('Theoretical Quantiles', fontsize=17)
    plt.ylabel('Ordered Values', fontsize=17)
    plt.savefig(f"{corpus}_qq_plot_{word}")


if __name__ == "__main__":
    # Run the same normality analysis over each corpus in turn.
    print('Analyze Anaphor Corpus')
    _, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Anaphor/anaphor_corpus.txt')
    print("data processed")
    # -2 indexes a word position from the end of the sentence — presumably
    # the anaphor/pronoun slot; confirm against the corpus format.
    qq_bert(bert_list, -2, 'Anaphor')
    prop, total, means = test_bert_embeds(bert_list)
    print(f'Percentage non-normal: {prop}')
    print(f'Total embeds in unique contexts: {total}')

    print('Analyze Pronominal Corpus')
    _, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Pronominal/pronominal_corpus.txt')
    print("data processed")
    qq_bert(bert_list, -2, 'Pronominal')
    prop, total, means = test_bert_embeds(bert_list)
    print(f'Percentage non-normal: {prop}')
    print(f'Total embeds in unique contexts: {total}')

    # NOTE(review): chunk is truncated here — the prepositional-phrase
    # analysis announced below is not visible.
    print('Analyze Prepositional Phrase Corpus')
# Sentence position of the pronoun (name suggests 0-indexed word slot 7 —
# TODO confirm against pronominal_corpus.txt).
pro_idx = 7

# Animate nouns used to fill the noun slots of the corpus templates.
noun_list = [
    'doctor', 'artist', 'robot', 'person', 'dancer', 'painter', 'cop',
    'politician', 'student', 'teacher', 'farmer', 'banker', 'lawyer',
    'peasant', 'chef', 'pilot', 'athlete', 'fairy', 'monster', 'alien',
    'ghost', 'vampire', 'mummy'
]

if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess Corpus
    glove_list, bert_list = RSA.preprocess_data('./pronominal_corpus.txt',
                                                noun_list)
    print("data processed")

    # Generate dictionary of Glove embedding hypothesis models: antecedent
    # slot, non-antecedent slot, and a random control (key -1 — verify in
    # RSA.get_glove_embeds).
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, pro_idx)
    glove_ant = np.array(embed_dict[noun_idxs[0]])
    glove_nonant = np.array(embed_dict[noun_idxs[1]])
    glove_rand = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Generate BERT embedding reference model at the pronoun position
    # (+1 presumably skips the [CLS] token — TODO confirm).
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")
# Sentence position of the anaphor/pronoun (name suggests 0-indexed word
# slot 7 — TODO confirm against anaphor_corpus.txt).
pro_idx = 7

# Animate nouns used to fill the noun slots of the corpus templates.
noun_list = [
    'doctor', 'artist', 'robot', 'person', 'dancer', 'painter', 'cop',
    'politician', 'student', 'teacher', 'farmer', 'banker', 'lawyer',
    'peasant', 'chef', 'pilot', 'athlete', 'fairy', 'monster', 'alien',
    'ghost', 'vampire', 'mummy'
]

if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess corpus
    glove_list, bert_list = RSA.preprocess_data('./anaphor_corpus.txt',
                                                noun_list)
    print("data processed")

    # Get dictionary of Glove embedding hypothesis models: antecedent slot,
    # non-antecedent slot, and a random control (key -1 — verify in
    # RSA.get_glove_embeds).
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        noun_idxs, noun_list, pro_idx)
    glove_ant = np.array(embed_dict[noun_idxs[0]])
    glove_nonant = np.array(embed_dict[noun_idxs[1]])
    glove_rand = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Get BERT embedding reference models at the pronoun position
    # (+1 presumably skips the [CLS] token — TODO confirm).
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")
# NOTE(review): fragment — this chunk opens mid-way through the enclosing
# diagnostic-input builder; the `if` branch matching this `else` is not
# visible. The fallthrough branch labels the concatenated embedding 0 for
# both collections.
    else:
        # Pair the BERT embedding with the word's GloVe vector.
        cat_embeds = np.concatenate(
            (bert_pronoun, glove[word2idx[word]]))
        glove_embeds_subj_id.append([cat_embeds, 0])
        glove_embeds_nonarg_id.append([cat_embeds, 0])

    # Returns (non-argument-labelled data, subject-labelled data).
    return np.array(glove_embeds_nonarg_id), np.array(glove_embeds_subj_id)


if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Load and tokenize the copula prepositional-phrase corpus.
    glove_list, bert_list = RSA.preprocess_data(
        '../Subject_Tracking/Prepositional_Phrases/copula_PP_corpus.txt',
        noun_list)
    print("data processed")
    # NOTE(review): premature — no GloVe work has happened yet at this point;
    # likely a copy-paste leftover (the sibling RC script has the same
    # misplaced message).
    print("glove embeds generated")

    # BERT embedding at the verb position (+1 presumably skips the [CLS]
    # token — TODO confirm against RSA.get_bert_embeds).
    bert_embeds = RSA.get_bert_embeds(bert_list, verb_idx + 1)
    print("BERT embeds generated")

    # Build vocabulary index maps and the GloVe lookup table.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")

    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    nonarg_data, subj_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                  bert_embeds)
# NOTE(review): fragment — the header and earlier body of the enclosing
# diagnostic-input builder are not visible in this chunk. The lines below are
# its tail: the non-antecedent branch labels the concatenated embedding 1,
# the fallthrough branch labels it 0 for both collections.
        glove_embeds_nonant_id.append([cat_embeds, 1])
    else:
        # Pair the BERT pronoun embedding with the word's GloVe vector.
        cat_embeds = np.concatenate(
            (bert_pronoun, glove[word2idx[word]]))
        glove_embeds_ant_id.append([cat_embeds, 0])
        glove_embeds_nonant_id.append([cat_embeds, 0])

    # Returns (non-antecedent-labelled data, antecedent-labelled data).
    return np.array(glove_embeds_nonant_id), np.array(glove_embeds_ant_id)


if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Load and tokenize the anaphor binding-theory corpus.
    glove_list, bert_list = RSA.preprocess_data(
        '../Binding_Theory/Anaphor/anaphor_corpus.txt', noun_list)
    print("data processed")

    # BERT embedding at the pronoun position (+1 presumably skips the [CLS]
    # token — TODO confirm against RSA.get_bert_embeds).
    bert_embeds = RSA.get_bert_embeds(bert_list, pro_idx + 1)
    print("BERT embeds generated")

    # Build vocabulary index maps and the GloVe lookup table.
    word2idx, idx2word = utils.create_word_idx_matrices(glove_list)
    print("word idx matrices created")

    glove = utils.create_embedding_dictionary(
        "../glove_utils/glove/glove.6B.300d.txt", 300, word2idx, idx2word)
    print("glove matrices created")

    # NOTE(review): this call is truncated at the chunk boundary — remaining
    # arguments are not visible here.
    nonant_data, ant_data = get_diagnostic_input(glove_list, glove, word2idx,
                                                 bert_embeds,
import random

# Sentence positions of the open-class slots consumed below; presumably
# subject and verb — TODO confirm against head_simple_corpus.txt.
lexical_idxs = [1, 2]

# Intransitive verbs appearing in the corpus.
verb_list = [
    'talks', 'swims', 'walks', 'screams', 'fights', 'hides', 'eats', 'runs',
    'thinks', 'works'
]

if __name__ == "__main__":
    # Fix both NumPy's and the stdlib's RNG for reproducibility.
    np.random.seed(seed=9)
    random.seed(9)

    # Preprocess Corpus
    glove_list, bert_list = RSA.preprocess_data('./head_simple_corpus.txt')
    print("data processed")

    # Generate Glove embedding hypothesis models, one per lexical slot.
    embed_dict = RSA.get_glove_embeds(
        glove_list, "../../glove_utils/glove/glove.6B.300d.txt", 300,
        lexical_idxs, verb_list)
    subj = np.array(embed_dict[lexical_idxs[0]])
    verb = np.array(embed_dict[lexical_idxs[1]])
    # Key -1 presumably holds a random-verb control model — verify in
    # RSA.get_glove_embeds.
    rand_verb = np.array(embed_dict[-1])
    print("glove embeds generated")

    # Generate BERT embedding reference models
    # NOTE(review): sibling scripts pass `<word_idx> + 1` here; index 0 may
    # be intentional (e.g. the sentence-initial token) — confirm against
    # RSA.get_bert_embeds.
    bert_embeds = RSA.get_bert_embeds(bert_list, 0)
    print("BERT embeds generated")