def run_pipeline(model, fictitious_entities, sentences, config, number_of_entity_trials, logger, temporal=False):
    """Build a masked-word dataset, score it with a fairseq model, and return the results as a DataFrame.

    Args:
        model: fairseq model passed through to utils.fair_seq_masked_word_prediction.
        fictitious_entities: entity substitutes used when constructing masked instances.
        sentences: raw sentences to be masked.
        config: configuration forwarded to the dataset builder.
        number_of_entity_trials: how many entity substitutions per sentence.
        logger: logger used for progress messages.
        temporal: when True, build the dataset with the temporal variant of the
            instance-preparation helper.

    Returns:
        DataFrame produced by utils.convert_bi_statistic_results_into_df.
    """
    # Pick the dataset builder once rather than duplicating the call in each branch.
    build_dataset = (proc.prepare_masked_instances_temporal if temporal
                     else proc.prepare_masked_instances)
    dataset = build_dataset(sentences=sentences,
                            config=config,
                            fictitious_entities=fictitious_entities,
                            num_entity_trials=number_of_entity_trials)
    logger.info("finished creating dataset")

    perf = utils.fair_seq_masked_word_prediction(masked_examples=dataset,
                                                 model=model,
                                                 gpu_available=torch.cuda.is_available(),
                                                 top_n=100,
                                                 logger=logger)
    logger.info("finished evaluating dataset")

    return utils.convert_bi_statistic_results_into_df(perf)
def get_scores(fictitious_entities, sentences, config, number_of_entity_trials, logger):
    """Collect AllenNLP masked-word scores, reusing cached responses where possible.

    Builds the masked dataset, loads previously collected responses from
    good_responses.json and the original sentence tuples from blah.txt, then for
    each (sentence key, trial) pair either re-queries get_scores_from_allen_nlp
    (when check_response is truthy for the cached entry) or reuses the cached
    response. Sleeps at random intervals between batches of requests,
    presumably to rate-limit calls to the remote endpoint.

    Args:
        fictitious_entities: entity substitutes used when constructing masked instances.
        sentences: raw sentences to be masked.
        config: configuration forwarded to the dataset builder.
        number_of_entity_trials: entity substitutions per sentence; also the
            stride used to index into the flat original_data list.
        logger: logger used for progress messages.

    Returns:
        (output, missed): dicts keyed like the dataset. output maps each key to
        a list of score results; missed maps each key to trial indices that
        failed to score.
    """
    dataset = proc.prepare_masked_instances(sentences=sentences,
                                            config=config,
                                            fictitious_entities=fictitious_entities,
                                            num_entity_trials=number_of_entity_trials)

    with open("good_responses.json") as f:
        good_responses = json.load(f)

    original_data = []
    with open("blah.txt") as f:
        for line in f:
            # SECURITY NOTE(review): eval() executes arbitrary code from the file.
            # If blah.txt holds literal tuples, ast.literal_eval is the safe
            # drop-in — confirm the file format before switching.
            original_data.append(eval(line))

    logger.info("finished creating dataset")

    output = {}
    missed = {}
    # Countdown until the next randomized sleep.
    random_count = random.randint(75, 250)
    logger.info("Initial Random Count {}".format(random_count))

    for i, key in enumerate(dataset.keys()):
        trials = dataset[key]
        output[key] = []
        missed[key] = []
        for j, entry in enumerate(trials):
            if check_response(good_responses[key][j]):
                # Rebuild the masked sentence from the original tuple,
                # substituting the entity pair for the A/B placeholders.
                # BUG FIX: the stride into original_data was hard-coded to 5;
                # use number_of_entity_trials so other trial counts index correctly.
                tup = original_data[(i * number_of_entity_trials) + j]
                sent = re.sub(r"\bA\b", tup[1][0], tup[0])
                sent = re.sub(r"\bB\b", tup[1][1], sent)
                new_masked_sent = sent.replace("<mask>", "[MASK]")
                try:
                    scores = get_scores_from_allen_nlp(new_masked_sent)
                    output[key].append(scores)
                except AssertionError:
                    logger.info("something is up 1")
                    missed[key].append(j)
                except Exception:
                    # Was a bare `except:`, which also swallowed SystemExit and
                    # KeyboardInterrupt; narrow to Exception and record the miss.
                    logger.info("something is up 2")
                    missed[key].append(j)

                random_count -= 1
                if random_count == 0:
                    # Back off for a random interval, then draw a new countdown.
                    logger.info("Number Completed: {}".format(len(output)))
                    logger.info("sleeping")
                    time.sleep(random.randint(3, 12))
                    logger.info("done_sleeping {}-{}".format(i, j))
                    random_count = random.randint(75, 250)
                    logger.info("New Random Count {}".format(random_count))
            else:
                # Cached response is reused as-is.
                output[key].append(good_responses[key][j])

    return output, missed
def run_pipeline(model, tokenizer, fictitious_entities, sentences, config, number_of_entity_trials, logger):
    """Evaluate a happy-transformer model on masked instances and return the results as a DataFrame.

    Args:
        model: model passed through to utils.happy_transformer_masked_word_prediction.
        tokenizer: accepted for interface parity; not used in this function.
        fictitious_entities: entity substitutes used when constructing masked instances.
        sentences: raw sentences to be masked.
        config: configuration forwarded to the dataset builder.
        number_of_entity_trials: how many entity substitutions per sentence.
        logger: logger used for progress messages.

    Returns:
        DataFrame produced by utils.convert_bi_statistic_results_into_df.
    """
    dataset = proc.prepare_masked_instances(sentences=sentences,
                                            config=config,
                                            fictitious_entities=fictitious_entities,
                                            num_entity_trials=number_of_entity_trials)
    logger.info("finished creating dataset")

    perf = utils.happy_transformer_masked_word_prediction(masked_examples=dataset,
                                                          model=model,
                                                          top_n=100,
                                                          logger=logger)
    logger.info("finished evaluating dataset")

    return utils.convert_bi_statistic_results_into_df(perf)