Example #1
import json
import os
import pickle

import torch
from tqdm import tqdm

# RawResult, to_list, read_candidates_from_one_split, compute_pred_dict and
# get_metrics_as_dict come from the surrounding project.


def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
    model.eval()
    # eval_dataloader is assumed to be defined in the enclosing scope;
    # a sketch of how it might be built follows this example.
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {
                'input_ids': input_ids,
                'attention_mask': input_mask,
                'token_type_ids': segment_ids
            }
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]),
                               answer_type_logits=to_list(outputs[2][i]))
            all_results.append(result)

    with open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb') as f:
        pickle.dump(all_results, f)
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

    candidates_dict = read_candidates_from_one_split(args.predict_file)
    nq_pred_dict = compute_pred_dict(candidates_dict, dev_features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length)

    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps: {}'.format(global_steps))
    print(json.dumps(results, indent=2))

    model.train()

    return results
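
For context, here is a minimal sketch of the scaffolding this evaluate() assumes but
does not define. Every name below is a reconstruction for illustration, not the
repository's actual code: the RawResult record, the to_list helper, and a builder for
the eval_dataloader used in the loop above.

import collections

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

# Per-feature prediction record; the fields match the ones filled in above.
RawResult = collections.namedtuple(
    'RawResult', ['unique_id', 'start_logits', 'end_logits', 'answer_type_logits'])


def to_list(tensor):
    # Detach a tensor from the graph and convert it to a plain Python list.
    return tensor.detach().cpu().tolist()


def build_eval_dataloader(dev_features, batch_size):
    # Hypothetical builder: packs each feature's padded tensors together with
    # its index, which the loop above uses to recover the original feature.
    all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask,
                            all_segment_ids, all_example_index)
    return DataLoader(dataset, sampler=SequentialSampler(dataset),
                      batch_size=batch_size)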
Example #2
        # The head of this loop is missing from the source; it is reconstructed
        # here from the identical loop in the ensemble evaluate() further below.
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = str(eval_feature.unique_id)
            result = RawResult(
                unique_id=unique_id,
                # [topk]
                long_start_topk_logits=outputs['long_start_topk_logits'][i].cpu().numpy(),
                long_start_topk_index=outputs['long_start_topk_index'][i].cpu().numpy(),
                long_end_topk_logits=outputs['long_end_topk_logits'][i].cpu().numpy(),
                long_end_topk_index=outputs['long_end_topk_index'][i].cpu().numpy(),
                # [topk, topk]
                short_start_topk_logits=outputs['short_start_topk_logits'][i].cpu().numpy(),
                short_start_topk_index=outputs['short_start_topk_index'][i].cpu().numpy(),
                short_end_topk_logits=outputs['short_end_topk_logits'][i].cpu().numpy(),
                short_end_topk_index=outputs['short_end_topk_index'][i].cpu().numpy(),
                answer_type_logits=to_list(outputs['answer_type_logits'][i]),
                long_cls_logits=outputs['long_cls_logits'][i].cpu().numpy(),
                short_cls_logits=outputs['short_cls_logits'][i].cpu().numpy())
            all_results.append(result)
    with open(os.path.join(args.output_dir, 'RawResults_test.pkl'), 'wb') as f:
        pickle.dump(all_results, f)
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults_test.pkl'), 'rb'))

    print("Going to candidates file")
    candidates_dict = read_candidates_from_one_split(args.predict_file)

    print("Compute_pred_dict")
    nq_pred_dict = compute_pred_dict(candidates_dict,
                                     features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size,
                                     args.max_answer_length,
                                     topk_pred=True,
                                     long_n_top=5,
                                     short_n_top=5)

    output_prediction_file = os.path.join(args.output_dir,
                                          'test_predictions.json')
    print("Saving predictions to", output_prediction_file)
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)


# --- post-processing fragment from a separate script: collect long-answer
# predictions. It assumes `predictions` (an iterable of per-example prediction
# dicts, e.g. the list written above), plus `long_prediction_file`, `gold_file`,
# `model_dir` and `prediction_file`, are defined earlier in that script.
# The loop head below is a reconstruction; only its body survives in the source.
long_preds = []
for i in predictions:
    example_id = i["example_id"]
    long_answer = i["long_answer"]
    long_answer_score = i["long_answer_score"]
    long_preds.append(
        dict(example_id=example_id,
             long_answer=long_answer,
             long_answer_score=long_answer_score))
with open(long_prediction_file, "w") as f:
    json.dump(long_preds, f)

# map long answer prediction span to its long candidate index
with open(long_prediction_file, "r") as f:
    long_preds = json.load(f)
cand_dict = {}

candidates_dict = read_candidates_from_one_split(gold_file)
for long_pred in long_preds:
    # example_id = str(long_pred["example_id"])
    example_id = long_pred["example_id"]
    start = long_pred["long_answer"]["start_token"]
    end = long_pred["long_answer"]["end_token"]
    cand_dict[example_id] = -1
    for idx, c in enumerate(candidates_dict[example_id]):
        if start == c["start_token"] and end == c["end_token"]:
            cand_dict[example_id] = idx
            break

with open(os.path.join(model_dir, "long_cand_dict_" + prediction_file),
          "w") as f:
    json.dump(cand_dict, f)
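
To make the span matching concrete, here is a toy, self-contained version of the
mapping loop above (all data below is hypothetical):

candidates = [{"start_token": 0, "end_token": 10},
              {"start_token": 12, "end_token": 30}]
long_pred = {"example_id": "42",
             "long_answer": {"start_token": 12, "end_token": 30}}
cand_idx = -1
for idx, c in enumerate(candidates):
    if (long_pred["long_answer"]["start_token"] == c["start_token"]
            and long_pred["long_answer"]["end_token"] == c["end_token"]):
        cand_idx = idx
        break
assert cand_idx == 1  # predictions matching no candidate keep the sentinel -1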
def evaluate(model, args, dev_features, device, ei):
    # Eval!
    cache_file = os.path.join(args.output_dir,
                              'RawResults_ensemble{}.pkl'.format(ei))
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            all_results = pickle.load(f)
    else:
        all_results = []
        model.eval()
        # eval_dataloader is assumed to be defined in the enclosing scope
        for batch in tqdm(eval_dataloader,
                          desc="Evaluating Ensemble-{}".format(ei)):
            batch = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                input_ids, input_mask, segment_ids, example_indices = batch
                inputs = {
                    'input_ids': input_ids,
                    'attention_mask': input_mask,
                    'token_type_ids': segment_ids
                }
                outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = dev_features[example_index.item()]
                unique_id = str(eval_feature.unique_id)

                result = RawResult(
                    unique_id=unique_id,
                    # [topk]
                    long_start_topk_logits=outputs['long_start_topk_logits'][i].cpu().numpy(),
                    long_start_topk_index=outputs['long_start_topk_index'][i].cpu().numpy(),
                    long_end_topk_logits=outputs['long_end_topk_logits'][i].cpu().numpy(),
                    long_end_topk_index=outputs['long_end_topk_index'][i].cpu().numpy(),
                    # [topk, topk]
                    short_start_topk_logits=outputs['short_start_topk_logits'][i].cpu().numpy(),
                    short_start_topk_index=outputs['short_start_topk_index'][i].cpu().numpy(),
                    short_end_topk_logits=outputs['short_end_topk_logits'][i].cpu().numpy(),
                    short_end_topk_index=outputs['short_end_topk_index'][i].cpu().numpy(),
                    answer_type_logits=to_list(outputs['answer_type_logits'][i]),
                    long_cls_logits=outputs['long_cls_logits'][i].cpu().numpy(),
                    short_cls_logits=outputs['short_cls_logits'][i].cpu().numpy())
                all_results.append(result)

        with open(cache_file, 'wb') as f:
            pickle.dump(all_results, f)

    candidates_dict = read_candidates_from_one_split(args.predict_file)
    nq_pred_dict = compute_pred_dict(candidates_dict,
                                     dev_features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size,
                                     args.max_answer_length,
                                     topk_pred=True,
                                     long_n_top=5,
                                     short_n_top=5,
                                     ensemble=True)

    output_prediction_file = os.path.join(args.output_dir,
                                          'predictions{}.json'.format(ei))
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)
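
A minimal sketch of how this ensemble evaluate() might be driven; the checkpoint
list and the later merging of the per-member predictions{ei}.json files are
assumptions, not part of the source excerpt:

for ei, ckpt in enumerate(checkpoint_paths):  # hypothetical list of checkpoint paths
    model.load_state_dict(torch.load(ckpt, map_location=device))
    model.to(device)
    evaluate(model, args, dev_features, device, ei)
# Each call writes predictions{ei}.json; combining the per-member outputs
# happens in a separate step not included in this excerpt.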