def _filter_by_test_keys(keys, sentences):
    """Select the sentence entries named by dash-delimited test keys.

    Each key has the form "<index>-<ling_pert>-<asym_pert>".  Returns a nested
    dict mirroring ``sentences`` but containing only the addressed leaves.
    """
    filtered = {}
    for key in keys:
        index, ling_pert, asym_pert = key.split("-")[:3]
        # setdefault collapses the original index/ling_pert presence checks.
        filtered.setdefault(index, {}).setdefault(ling_pert, {})[asym_pert] = \
            sentences[index][ling_pert][asym_pert]
    return filtered


def main():
    """Score a fine-tuned roberta-large masked LM on the held-out test split
    of the physical, material and social truism probes, writing one CSV of
    results per domain.
    """
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    # Build a fine-tuned RobertaForMaskedLM and graft it onto the HappyROBERTA
    # wrapper so the pipeline can use its masked-word prediction API.
    roberta = HappyROBERTA('roberta-large')
    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()  # inference only
    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    with open("../data/finetune_data/sample_from_sets/test_keys.json", "r") as f:
        test_keys = json.load(f)

    # (test-key group name, full domain name) pairs, processed in order.
    for short_name, domain in (("phy", "physical"),
                               ("mat", "material"),
                               ("soc", "social")):
        with open("../data/truism_data/{}_data_sentences_2.json".format(domain),
                  "r") as f:
            sents = json.load(f)
        with open("../data/truism_data/{}_data_2.json".format(domain),
                  "r") as f:
            domain_config = json.load(f)

        filtered = _filter_by_test_keys(test_keys[short_name], sents)
        logger.info("finished reading in %s data", domain)

        output_df = run_pipeline(model=roberta,
                                 tokenizer=tokenizer,
                                 fictitious_entities=fictitious_entities,
                                 sentences=filtered,
                                 config=domain_config,
                                 number_of_entity_trials=number_of_entity_trials,
                                 logger=logger)
        output_df.to_csv(
            "../data/masked_word_result_data/roberta/sample_from_set/{}_perf_ft19_new_{}.csv"
            .format(domain, number_of_entity_trials),
            index=False)
        # Normalized log text; the original said e.g.
        # "finished saving physical material results" for the material domain.
        logger.info("finished saving %s results", domain)
def main():
    """Run the masked-word truism probe with BERT-base-cased over the
    physical, material and social datasets, saving one CSV per domain.
    """
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    letter_pool = string.ascii_lowercase
    trials = 10

    model = HappyBERT("bert-base-cased")
    entity_pairs = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12,
        character_set=letter_pool)

    # Each domain follows the same load -> score -> save cycle.
    for domain in ("physical", "material", "social"):
        with open("../data/truism_data/{}_data_sentences_2.json".format(domain),
                  "r") as f:
            domain_sents = json.load(f)
        with open("../data/truism_data/{}_data_2.json".format(domain),
                  "r") as f:
            domain_cfg = json.load(f)
        logger.info("finished reading in {} data".format(domain))

        results = run_pipeline(model=model,
                               fictitious_entities=entity_pairs,
                               sentences=domain_sents,
                               config=domain_cfg,
                               number_of_entity_trials=trials,
                               logger=logger)
        results.to_csv(
            "../data/masked_word_result_data/bert/bert_{}_perf_{}.csv"
            .format(domain, trials),
            index=False)
        logger.info("finished saving {} results".format(domain))
def main():
    """Probe a BART MNLI model on the "easy" entailment generation test sets
    (physical, material, social), writing one result CSV per domain.

    The original body repeated an identical load -> score -> save stanza three
    times; it is collapsed into a single loop with unchanged behavior.
    """
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    # bart.large.mnli is fetched through the fairseq torch.hub entry point.
    bart = torch.hub.load(github='pytorch/fairseq', model='bart.large.mnli')

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    for domain in ("physical", "material", "social"):
        with open("../data/generation_test_data/{}_easy_data_sentences.json"
                  .format(domain), "r") as f:
            sents = json.load(f)
        logger.info("finished reading in %s data", domain)

        output_df = run_pipeline(model=bart,
                                 fictitious_entities=fictitious_entities,
                                 sentences=sents,
                                 number_of_entity_trials=number_of_entity_trials,
                                 logger=logger)
        output_df.to_csv(
            "../data/entailment_result_data/bart_easy/{}_entail_perf_{}.csv"
            .format(domain, number_of_entity_trials),
            index=False)
        logger.info("finished saving %s results", domain)
def main():
    """Run the masked-word probe with stock roberta.large (fairseq hub) over
    the physical, material, social and temporal truism datasets.

    Fixes: the four near-identical stanzas are collapsed into one loop, and
    the inconsistent completion messages (e.g. "finished saving physical
    material results" for material) are normalized.
    """
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    roberta = torch.hub.load(github='pytorch/fairseq', model='roberta.large')
    roberta.eval()  # inference only

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    for domain in ("physical", "material", "social", "temporal"):
        with open("../data/truism_data/{}_data_sentences_2.json".format(domain),
                  "r") as f:
            sents = json.load(f)
        with open("../data/truism_data/{}_data_2.json".format(domain),
                  "r") as f:
            domain_config = json.load(f)
        logger.info("finished reading in %s data", domain)

        # Only the temporal run passed temporal=True originally; omit the
        # kwarg elsewhere so run_pipeline's own default still applies.
        extra = {"temporal": True} if domain == "temporal" else {}
        output_df = run_pipeline(model=roberta,
                                 fictitious_entities=fictitious_entities,
                                 sentences=sents,
                                 config=domain_config,
                                 number_of_entity_trials=number_of_entity_trials,
                                 logger=logger,
                                 **extra)
        output_df.to_csv(
            "../data/masked_word_result_data/roberta-large/{}_perf_2_{}.csv"
            .format(domain, number_of_entity_trials),
            index=False)
        logger.info("finished saving %s results", domain)
def main():
    """Evaluate a fine-tuned fairseq RoBERTa-base masked LM on truisms '15'
    and '18' of the physical, material and social datasets, saving one CSV of
    results per domain.
    """
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    letter_pool = string.ascii_lowercase
    trials = 10

    model = RobertaModel.from_pretrained(
        '../../fairseq/fine-tuned_roberta/',
        checkpoint_file='base_fine-tuned_best.pt',
        data_name_or_path='../../fairseq/data-bin/probes')

    entity_pairs = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12,
        character_set=letter_pool)

    # Only these two truisms are evaluated in this run.
    kept_truisms = ('15', '18')
    # Completion messages are kept exactly as originally written.
    plan = (
        ("physical", "finished saving physical dataset results"),
        ("material", "finished saving physical material results"),
        ("social", "finished saving physical social results"),
    )
    for domain, done_msg in plan:
        with open("../data/truism_data/{}_data_sentences_2.json".format(domain),
                  "r") as f:
            domain_sents = json.load(f)
        with open("../data/truism_data/{}_data_2.json".format(domain),
                  "r") as f:
            domain_cfg = json.load(f)
        domain_sents = {k: domain_sents[k] for k in kept_truisms}
        domain_cfg = {k: domain_cfg[k] for k in kept_truisms}
        logger.info("finished reading in {} data".format(domain))

        results = run_pipeline(model=model,
                               fictitious_entities=entity_pairs,
                               sentences=domain_sents,
                               config=domain_cfg,
                               number_of_entity_trials=trials,
                               logger=logger)
        results.to_csv(
            "../data/masked_word_result_data/roberta-base/{}_perf_ft_2_{}.csv"
            .format(domain, trials),
            index=False)
        logger.info(done_msg)
def main():
    """Score a no-neutral MNLI fine-tuned RoBERTa on the entailment
    generation test sets for the physical, material and social domains.
    """
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    log = logging.getLogger(__name__)
    alphabet = string.ascii_lowercase
    n_trials = 10

    entail_model = RobertaModel.from_pretrained(
        '../../fairseq/fine-tuned_roberta/',
        checkpoint_file='no_neutral_checkpoint_best.pt',
        data_name_or_path='../../fairseq/MNLI-bin')

    rand_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12,
        character_set=alphabet)

    # Identical read -> score -> write cycle for each topic.
    for topic in ("physical", "material", "social"):
        with open("../data/generation_test_data/{}_data_sentences.json"
                  .format(topic), "r") as f:
            topic_sents = json.load(f)
        log.info("finished reading in {} data".format(topic))

        result = run_pipeline(model=entail_model,
                              fictitious_entities=rand_entities,
                              sentences=topic_sents,
                              number_of_entity_trials=n_trials,
                              logger=log)
        result.to_csv(
            "../data/entailment_result_data/roberta/{}_noNeutral_entail_perf_2_{}.csv"
            .format(topic, n_trials),
            index=False)
        log.info("finished saving {} results".format(topic))