Example #1
def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    # all_results = []
    # for batch in tqdm(eval_dataloader, desc="Evaluating"):
    #     model.eval()
    #     batch = tuple(t.to(device) for t in batch)
    #     with torch.no_grad():
    #         input_ids, input_mask, segment_ids, example_indices = batch
    #         inputs = {'input_ids': input_ids,
    #                   'attention_mask': input_mask,
    #                   'token_type_ids': segment_ids}
    #         outputs = model(**inputs)
    #
    #     for i, example_index in enumerate(example_indices):
    #         eval_feature = dev_features[example_index.item()]
    #         unique_id = str(eval_feature.unique_id)
    #         result = RawResult(unique_id=unique_id,
    #                            example_id=eval_feature.example_index,
    #                            start_logits=to_list(outputs[0][i]),
    #                            end_logits=to_list(outputs[1][i]),
    #                            answer_type_logits=to_list(outputs[2][i]))
    #         all_results.append(result)

    # pickle.dump(all_results, open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))
    #
    # # print("Going to candidates file")
    # candidates_dict = read_candidates_from_one_split(args.predict_file)
    #
    # # print("Compute_pred_dict")
    # nq_pred_dict = compute_pred_dict(candidates_dict, dev_features,
    #                                  [r._asdict() for r in all_results],
    #                                  args.n_best_size, args.max_answer_length)
    #
    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')
    # # print("Saving predictions to", output_prediction_file)
    # with open(output_prediction_file, 'w') as f:
    #     json.dump({'predictions': list(nq_pred_dict.values())}, f)

    # print("Computing f1 score")
    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps:{}'.format(global_steps))
    print(json.dumps(results, indent=2))

    model.train()

    return results
Example #2
def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
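    # Run the model over eval_dataloader (defined outside this function) and
    # collect raw short-answer start/end and answer-type logits per feature.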
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {
                'input_ids': input_ids,
                'attention_mask': input_mask,
                'token_type_ids': segment_ids
            }
            start_logits, end_logits, answer_type_logits = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)

            result = RawResult(
                unique_id=unique_id,
                short_start_logits=start_logits[i].cpu().numpy(),
                short_end_logits=end_logits[i].cpu().numpy(),
                answer_type_logits=answer_type_logits[i].cpu().numpy())
            all_results.append(result)

    pickle.dump(all_results,
                open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

    nq_pred_dict = compute_short_pred(dev_features, all_results,
                                      args.n_best_size, args.max_answer_length)
    nq_pred_dict = convert_short_pred(nq_pred_dict)

    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps:{}'.format(global_steps))
    print(json.dumps(results, indent=2))

    model.train()

    return results
Example #3
def evaluate(model, args, dev_features, device):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
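    # Collect raw long-answer start/end logits for every dev feature.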
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': segment_ids}
            start_logits, end_logits = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)

            result = RawResult(unique_id=unique_id,
                               long_start_logits=start_logits[i].cpu().numpy(),
                               long_end_logits=end_logits[i].cpu().numpy())
            all_results.append(result)

    if args.is_test:
        pickle.dump(all_results, open(os.path.join(args.output_dir, 'test_long_RawResults.pkl'), 'wb'))
    else:
        pickle.dump(all_results, open(os.path.join(args.output_dir, 'dev_long_RawResults.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'dev_long_RawResults.pkl'), 'rb'))

    for th in args.thresholds:
        print('UNK type threshold:', th)
        ground_truth_dict = load_all_annotations_from_dev(args.predict_file, is_test=args.is_test)
        nq_pred_dict = compute_long_pred(ground_truth_dict, dev_features, all_results, args.n_best_size, th)

        if args.is_test:
            output_prediction_file = os.path.join(args.output_dir,
                                                  'test_long_predictions.json')
        else:
            output_prediction_file = os.path.join(args.output_dir,
                                                  'dev_long_predictions.json')
        with open(output_prediction_file, 'w') as f:
            json.dump({'predictions': list(nq_pred_dict.values())}, f)

        if not args.is_test:
            results = get_metrics_as_dict(args.predict_file, output_prediction_file)
            print(json.dumps(results, indent=2))
Example #4
def get_ensemble_result(args):
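    # Combine per-model short-answer predictions. Each model is tagged with
    # the trailing '-'-separated token of its checkpoint directory name,
    # which is how its prediction file was named.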
    ensemble_names = [
        init_restore_dir.split('/')[-2].split('-')[-1]
        for init_restore_dir in args.init_restore_dir
    ]
    if args.is_test:
        all_preds = [
            os.path.join(args.output_dir,
                         'test_short_predictions_' + e + '.json')
            for e in ensemble_names
        ]
    else:
        all_preds = [
            os.path.join(args.output_dir,
                         'dev_short_predictions_' + e + '.json')
            for e in ensemble_names
        ]

    output_prediction_file = None

    for th in args.thresholds:
        for yesno_th in args.yesno_thresholds:
            print('UNK type threshold:', th, 'YESNO threshold:', yesno_th)
            ensemble_pred_dict = short_ensemble_combine(
                all_preds, th, yesno_th, args.long_pred_file)
            long_short_combined_pred = combine_long_short(
                ensemble_pred_dict, args.long_pred_file)

            if args.is_test:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_test_short_predictions.json')
            else:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_dev_short_predictions.json')
            with open(output_prediction_file, 'w') as f:
                json.dump({'predictions': long_short_combined_pred}, f)

            if not args.is_test:
                results = get_metrics_as_dict(args.predict_file,
                                              output_prediction_file)
                print(json.dumps(results, indent=2))

    if args.is_test:
        make_submission(output_prediction_file, args.output_dir)
Example #5
def get_ensemble_result(args):
    ensemble_names = [
        init_restore_dir.split('/')[-2].split('-')[-1]
        for init_restore_dir in args.init_restore_dir
    ]
    if args.is_test:
        all_preds = [
            os.path.join(args.output_dir,
                         'test_long_predictions_' + e + '.json')
            for e in ensemble_names
        ]
    else:
        all_preds = [
            os.path.join(args.output_dir,
                         'dev_long_predictions_' + e + '.json')
            for e in ensemble_names
        ]

    output_prediction_file = None
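    # Sweep both threshold grids; each combination rewrites the same
    # prediction file and (on dev) is scored immediately.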
    for th1 in args.thresholds1:
        for th2 in args.thresholds2:
            print('UNK threshold:', th1, 'SHORT threshold', th2)
            ensemble_ls_pred_dict = ls_ensemble_combine(all_preds, th1, th2)

            if args.is_test:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_test_ls_predictions.json')
            else:
                output_prediction_file = os.path.join(
                    args.output_dir, 'all_dev_ls_predictions.json')
            with open(output_prediction_file, 'w') as f:
                json.dump(
                    {'predictions': list(ensemble_ls_pred_dict.values())}, f)

            if not args.is_test:
                results = get_metrics_as_dict(args.predict_file,
                                              output_prediction_file)
                print(json.dumps(results, indent=2))
Example #6
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]),
                               answer_type_logits=to_list(outputs[2][i]))
            all_results.append(result)

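    # Cache the raw results so the post-processing below can be rerun from
    # disk (see the commented-out pickle.load line).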
    pickle.dump(all_results,
                open(os.path.join(args.output_dir, 'RawResults.pkl'), 'wb'))

    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults.pkl'), 'rb'))

    print("Going to candidates file")
    candidates_dict = read_candidates_from_one_split(args.predict_file)

    print("Compute_pred_dict")
    nq_pred_dict = compute_pred_dict(candidates_dict, features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size, args.max_answer_length)

    output_prediction_file = os.path.join(args.output_dir, 'predictions.json')
    print("Saving predictions to", output_prediction_file)
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    print("Computing f1 score")
    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print(json.dumps(results, indent=2))
Example #7
def evaluate(model, args, dev_features, device, global_steps):
    # Eval!
    print("***** Running evaluation *****")
    all_results = []
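    # This variant's model returns a dict of outputs: top-k start/end logits
    # and indices for both long and short answers, plus answer-type and CLS
    # logits.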
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, example_indices = batch
            inputs = {
                'input_ids': input_ids,
                'attention_mask': input_mask,
                'token_type_ids': segment_ids
            }
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = str(eval_feature.unique_id)

            result = RawResult(
                unique_id=unique_id,
                # [topk]
                long_start_topk_logits=outputs['long_start_topk_logits'][i].cpu().numpy(),
                long_start_topk_index=outputs['long_start_topk_index'][i].cpu().numpy(),
                long_end_topk_logits=outputs['long_end_topk_logits'][i].cpu().numpy(),
                long_end_topk_index=outputs['long_end_topk_index'][i].cpu().numpy(),
                # [topk, topk]
                short_start_topk_logits=outputs['short_start_topk_logits'][i].cpu().numpy(),
                short_start_topk_index=outputs['short_start_topk_index'][i].cpu().numpy(),
                short_end_topk_logits=outputs['short_end_topk_logits'][i].cpu().numpy(),
                short_end_topk_index=outputs['short_end_topk_index'][i].cpu().numpy(),
                answer_type_logits=to_list(outputs['answer_type_logits'][i]),
                long_cls_logits=outputs['long_cls_logits'][i].cpu().numpy(),
                short_cls_logits=outputs['short_cls_logits'][i].cpu().numpy())
            all_results.append(result)

    pickle.dump(
        all_results,
        open(os.path.join(args.output_dir, 'RawResults_dev.pkl'), 'wb'))
    # all_results = pickle.load(open(os.path.join(args.output_dir, 'RawResults_dev.pkl'), 'rb'))

    candidates_dict = read_candidates_from_one_split(args.predict_file)
    nq_pred_dict = compute_pred_dict(candidates_dict,
                                     dev_features,
                                     [r._asdict() for r in all_results],
                                     args.n_best_size,
                                     args.max_answer_length,
                                     topk_pred=True,
                                     long_n_top=5,
                                     short_n_top=5)

    output_prediction_file = os.path.join(
        args.output_dir, 'predictions' + str(global_steps) + '.json')
    with open(output_prediction_file, 'w') as f:
        json.dump({'predictions': list(nq_pred_dict.values())}, f)

    results = get_metrics_as_dict(args.predict_file, output_prediction_file)
    print('Steps:{}'.format(global_steps))
    print(json.dumps(results, indent=2))

    model.train()

    return results