Example no. 1
0
def run_pipeline(model,
                 fictitious_entities,
                 sentences,
                 config,
                 number_of_entity_trials,
                 logger,
                 temporal=False):
    """Build a masked-word dataset, score it with a fairseq masked LM,
    and return the per-statistic results as a dataframe.

    Args:
        model: fairseq masked language model used for prediction.
        fictitious_entities: entity names substituted into the sentences.
        sentences: raw sentence templates to mask.
        config: dataset-preparation configuration passed through to `proc`.
        number_of_entity_trials: number of entity substitutions per sentence.
        logger: logger used for progress messages.
        temporal: when True, use the temporal dataset builder.

    Returns:
        A dataframe produced by `utils.convert_bi_statistic_results_into_df`.
    """
    # Pick the dataset builder up front; both share the same signature.
    build = (proc.prepare_masked_instances_temporal
             if temporal else proc.prepare_masked_instances)
    dataset = build(sentences=sentences,
                    config=config,
                    fictitious_entities=fictitious_entities,
                    num_entity_trials=number_of_entity_trials)

    logger.info("finished creating dataset")

    perf = utils.fair_seq_masked_word_prediction(
        masked_examples=dataset,
        model=model,
        gpu_available=torch.cuda.is_available(),
        top_n=100,
        logger=logger)

    logger.info("finished evaluating dataset")

    return utils.convert_bi_statistic_results_into_df(perf)
def get_scores(fictitious_entities, sentences, config, number_of_entity_trials,
               logger):
    """Re-score previously-collected responses with the AllenNLP endpoint.

    Rebuilds the masked dataset, then for every (key, trial) whose stored
    response fails `check_response`, reconstructs the original sentence from
    "blah.txt", queries `get_scores_from_allen_nlp`, and records the result.
    Sleeps at random intervals to throttle requests.

    Args:
        fictitious_entities: entity names substituted into the sentences.
        sentences: raw sentence templates to mask.
        config: dataset-preparation configuration passed through to `proc`.
        number_of_entity_trials: number of entity substitutions per sentence.
        logger: logger used for progress messages.

    Returns:
        A tuple `(output, missed)`: `output` maps each dataset key to its
        list of scores/responses; `missed` maps each key to the trial
        indices whose AllenNLP call failed.
    """
    dataset = proc.prepare_masked_instances(
        sentences=sentences,
        config=config,
        fictitious_entities=fictitious_entities,
        num_entity_trials=number_of_entity_trials)
    with open("good_responses.json") as f:
        good_responses = json.load(f)

    # Each line of blah.txt is expected to be a Python tuple literal of the
    # form (template, (entity_a, entity_b), ...).
    # NOTE(review): eval() executes arbitrary code from the file; if the file
    # only ever holds literals, ast.literal_eval would be the safe choice.
    original_data = []
    with open("blah.txt") as f:
        for line in f:
            original_data.append(eval(line))

    logger.info("finished creating dataset")

    output = {}
    missed = {}
    prev_output_length = 0
    stop = False
    # Throttle: sleep after a random number of requests to avoid hammering
    # the endpoint in a detectable fixed rhythm.
    random_count = random.randint(75, 250)
    logger.info("Initial Random Count {}".format(random_count))
    for i, key in enumerate(dataset.keys()):
        trials = dataset[key]
        output[key] = []
        missed[key] = []
        for j, entry in enumerate(trials):
            if check_response(good_responses[key][j]):
                # original_data is laid out as 5 trials per sentence index.
                tup = original_data[(i * 5) + j]
                # Substitute the placeholder tokens A/B with the real pair.
                sent = re.sub(r"\bA\b", tup[1][0], tup[0])
                sent = re.sub(r"\bB\b", tup[1][1], sent)
                new_masked_sent = sent.replace("<mask>", "[MASK]")

                try:
                    scores = get_scores_from_allen_nlp(new_masked_sent)
                    output[key].append(scores)
                except AssertionError:
                    logger.info("something is up 1")
                    missed[key].append(j)
                # Was a bare `except:`, which also swallowed SystemExit and
                # KeyboardInterrupt; narrow it to Exception.
                except Exception:
                    logger.info("something is up 2")
                    missed[key].append(j)

                random_count -= 1

                if random_count == 0:
                    logger.info("Number Completed: {}".format(len(output)))
                    logger.info("sleeping")
                    time.sleep(random.randint(3, 12))
                    logger.info("done_sleeping {}-{}".format(i, j))
                    random_count = random.randint(75, 250)
                    logger.info("New Random Count {}".format(random_count))
            else:
                # Stored response is already good: keep it as-is.
                output[key].append(good_responses[key][j])

    return output, missed
Example no. 3
0
def run_pipeline(model, tokenizer, fictitious_entities, sentences, config,
                 number_of_entity_trials, logger):
    """Build a masked-word dataset, score it with a happy-transformer
    masked LM, and return the per-statistic results as a dataframe.

    Args:
        model: happy-transformer masked language model used for prediction.
        tokenizer: tokenizer argument (accepted but not used by this body).
        fictitious_entities: entity names substituted into the sentences.
        sentences: raw sentence templates to mask.
        config: dataset-preparation configuration passed through to `proc`.
        number_of_entity_trials: number of entity substitutions per sentence.
        logger: logger used for progress messages.

    Returns:
        A dataframe produced by `utils.convert_bi_statistic_results_into_df`.
    """
    dataset = proc.prepare_masked_instances(
        sentences=sentences,
        config=config,
        fictitious_entities=fictitious_entities,
        num_entity_trials=number_of_entity_trials)

    logger.info("finished creating dataset")

    perf = utils.happy_transformer_masked_word_prediction(
        masked_examples=dataset, model=model, top_n=100, logger=logger)

    logger.info("finished evaluating dataset")

    return utils.convert_bi_statistic_results_into_df(perf)