Example #1
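# Assumed imports (not shown in the original snippet). HappyROBERTA comes
# from the happytransformer package (v1.x API); `proc` and `run_pipeline`
# are this project's own helpers, so the commented paths are placeholders.
import json
import logging
import random
import string

import torch
from happytransformer import HappyROBERTA
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
# import data_processing as proc
# from pipeline import run_pipeline
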
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    # HappyROBERTA provides the masked-word-prediction interface; its
    # internal MLM is replaced below with a fine-tuned checkpoint.
    roberta = HappyROBERTA('roberta-large')

    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()

    # Route the wrapper's masked-word predictions through the fine-tuned MLM.
    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    with open("../data/finetune_data/sample_from_sets/test_keys.json",
              "r") as f:
        test_keys = json.load(f)

    # Keep only the held-out test combinations; each key encodes
    # "index-linguistic_perturbation-asymmetric_perturbation".
    phy_filtered = {}
    for key in test_keys['phy']:
        index, ling_pert, asym_pert = key.split("-")[:3]
        phy_filtered.setdefault(index, {}).setdefault(ling_pert, {})[asym_pert] = \
            physical_sents[index][ling_pert][asym_pert]

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=phy_filtered,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/physical_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    # Same filtering for the material test keys.
    mat_filtered = {}
    for key in test_keys['mat']:
        index, ling_pert, asym_pert = key.split("-")[:3]
        mat_filtered.setdefault(index, {}).setdefault(ling_pert, {})[asym_pert] = \
            material_sents[index][ling_pert][asym_pert]

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=mat_filtered,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/material_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical material results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    # Same filtering for the social test keys.
    soc_filtered = {}
    for key in test_keys['soc']:
        index, ling_pert, asym_pert = key.split("-")[:3]
        soc_filtered.setdefault(index, {}).setdefault(ling_pert, {})[asym_pert] = \
            social_sents[index][ling_pert][asym_pert]

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=soc_filtered,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/social_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical social results")
def main():
    random.seed(1012)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    # HappyBERT wraps bert-base-cased for masked-word prediction; note that
    # run_pipeline is called without a separate tokenizer for this wrapper.
    bert_base_cased = HappyBERT("bert-base-cased")

    fictitious_entities = proc.generate_pairs_of_random_strings(number_of_pairs=100, 
                                                                min_length=3,
                                                                max_length=12,
                                                                character_set=chars)
    
    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)
        
    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=bert_base_cased, 
                             fictitious_entities=fictitious_entities, 
                             sentences=physical_sents, 
                             config=physical_config, 
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv("../data/masked_word_result_data/bert/bert_physical_perf_{}.csv".format(number_of_entity_trials),
                     index=False)

    logger.info("finished saving physical results")

        
    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)
        
    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=bert_base_cased, 
                             fictitious_entities=fictitious_entities, 
                             sentences=material_sents, 
                             config=material_config, 
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv("../data/masked_word_result_data/bert/bert_material_perf_{}.csv".format(number_of_entity_trials),
                     index=False)

    logger.info("finished saving material results")
        
    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)
        
    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=bert_base_cased, 
                             fictitious_entities=fictitious_entities, 
                             sentences=social_sents, 
                             config=social_config, 
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv("../data/masked_word_result_data/bert/bert_social_perf_{}.csv".format(number_of_entity_trials),
                     index=False)

    logger.info("finished saving social results")
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    # Load BART large fine-tuned on MNLI from the fairseq torch.hub entry
    # point. (`github=` is the parameter name used by older torch.hub
    # releases; newer versions call it `repo_or_dir`.)
    bart = torch.hub.load(github='pytorch/fairseq', model='bart.large.mnli')

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    with open("../data/generation_test_data/physical_easy_data_sentences.json",
              "r") as f:
        physical_sents = json.load(f)

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=bart,
                             fictitious_entities=fictitious_entities,
                             sentences=physical_sents,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/entailment_result_data/bart_easy/physical_entail_perf_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical results")

    with open("../data/generation_test_data/material_easy_data_sentences.json",
              "r") as f:
        material_sents = json.load(f)

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=bart,
                             fictitious_entities=fictitious_entities,
                             sentences=material_sents,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/entailment_result_data/bart_easy/material_entail_perf_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving material results")

    with open("../data/generation_test_data/social_easy_data_sentences.json",
              "r") as f:
        social_sents = json.load(f)

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=bart,
                             fictitious_entities=fictitious_entities,
                             sentences=social_sents,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/entailment_result_data/bart_easy/social_entail_perf_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving social results")
Example #4
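# Assumed imports (not shown in the original snippet), as in Example #3.
import json
import logging
import random
import string

import torch
# import data_processing as proc
# from pipeline import run_pipeline
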
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    # Pretrained roberta.large from the fairseq hub; eval() disables dropout
    # for deterministic masked-word scoring.
    roberta = torch.hub.load(github='pytorch/fairseq', model='roberta.large')
    roberta.eval()

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=physical_sents,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta-large/physical_perf_2_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=material_sents,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta-large/material_perf_2_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical material results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=social_sents,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta-large/social_perf_2_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving social results")

    with open("../data/truism_data/temporal_data_sentences_2.json", "r") as f:
        temporal_sents = json.load(f)

    with open("../data/truism_data/temporal_data_2.json", "r") as f:
        temporal_config = json.load(f)

    logger.info("finished reading in temporal data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=temporal_sents,
                             config=temporal_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger,
                             temporal=True)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta-large/temporal_perf_2_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving temporal results")
Example #5
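# Assumed imports (not shown in the original snippet); RobertaModel is
# fairseq's RoBERTa wrapper. `proc` and `run_pipeline` are this project's
# own helpers, so the commented paths are placeholders.
import json
import logging
import random
import string

from fairseq.models.roberta import RobertaModel
# import data_processing as proc
# from pipeline import run_pipeline
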
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    # Load a locally fine-tuned RoBERTa-base via fairseq; data_name_or_path
    # points at the binarized data directory used during fine-tuning.
    roberta = RobertaModel.from_pretrained(
        '../../fairseq/fine-tuned_roberta/',
        checkpoint_file='base_fine-tuned_best.pt',
        data_name_or_path='../../fairseq/data-bin/probes')

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    # Restrict evaluation to truisms 15 and 18.
    physical_sents = {k: physical_sents[k] for k in ('15', '18')}
    physical_config = {k: physical_config[k] for k in ('15', '18')}

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=physical_sents,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta-base/physical_perf_ft_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    material_sents = {k: material_sents[k] for k in ('15', '18')}
    material_config = {k: material_config[k] for k in ('15', '18')}

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=material_sents,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta-base/material_perf_ft_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical material results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    social_sents = {k: social_sents[k] for k in ('15', '18')}
    social_config = {k: social_config[k] for k in ('15', '18')}

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=social_sents,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta-base/social_perf_ft_2_{}.csv".
        format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical social results")
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    # RoBERTa fine-tuned on MNLI with (judging by the checkpoint and output
    # file names) the neutral class removed, loaded via fairseq.
    roberta = RobertaModel.from_pretrained(
        '../../fairseq/fine-tuned_roberta/',
        checkpoint_file='no_neutral_checkpoint_best.pt',
        data_name_or_path='../../fairseq/MNLI-bin')

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    with open("../data/generation_test_data/physical_data_sentences.json",
              "r") as f:
        physical_sents = json.load(f)

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=physical_sents,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/entailment_result_data/roberta/physical_noNeutral_entail_perf_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical results")

    with open("../data/generation_test_data/material_data_sentences.json",
              "r") as f:
        material_sents = json.load(f)

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=material_sents,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/entailment_result_data/roberta/material_noNeutral_entail_perf_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving material results")

    with open("../data/generation_test_data/social_data_sentences.json",
              "r") as f:
        social_sents = json.load(f)

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             fictitious_entities=fictitious_entities,
                             sentences=social_sents,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/entailment_result_data/roberta/social_noNeutral_entail_perf_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving social results")