Example no. 1
0
def evaluate():
    """Evaluate the model on validation dataset.
    """
    log.info('Loading dev data...')
    if version_2:
        dev_data = SQuAD('dev', version='2.0')
    else:
        dev_data = SQuAD('dev', version='1.1')
    if args.debug:
        sampled_data = [dev_data[0], dev_data[1], dev_data[2]]
        dev_data = mx.gluon.data.SimpleDataset(sampled_data)
    log.info('Number of records in dev data:{}'.format(len(dev_data)))

    dev_dataset = dev_data.transform(
        SQuADTransform(copy.copy(tokenizer),
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=False,
                       is_training=False)._transform,
        lazy=False)

    dev_data_transform, _ = preprocess_dataset(
        dev_data,
        SQuADTransform(copy.copy(tokenizer),
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=False,
                       is_training=False))
    log.info('The number of examples after preprocessing:{}'.format(
        len(dev_data_transform)))

    dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
                                              batchify_fn=batchify_fn,
                                              num_workers=4,
                                              batch_size=test_batch_size,
                                              shuffle=False,
                                              last_batch='keep')

    log.info('start prediction')

    all_results = collections.defaultdict(list)

    epoch_tic = time.time()
    total_num = 0
    for data in dev_dataloader:
        example_ids, inputs, token_types, valid_length, _, _ = data
        total_num += len(inputs)
        out = net(
            inputs.astype('float32').as_in_context(ctx),
            token_types.astype('float32').as_in_context(ctx),
            valid_length.astype('float32').as_in_context(ctx))

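        # out is expected to have shape (batch_size, seq_len, 2); split it along
        # the last axis into start/end logits and flatten each to (batch_size, seq_len).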
        output = mx.nd.split(out, axis=2, num_outputs=2)
        example_ids = example_ids.asnumpy().tolist()
        pred_start = output[0].reshape((0, -3)).asnumpy()
        pred_end = output[1].reshape((0, -3)).asnumpy()

        for example_id, start, end in zip(example_ids, pred_start, pred_end):
            all_results[example_id].append(PredResult(start=start, end=end))

    epoch_toc = time.time()
    log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
        epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    log.info('Get prediction results...')

    all_predictions = collections.OrderedDict()

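    # A long context is split by SQuADTransform into overlapping document spans,
    # so one example may hold several PredResult entries; predict() reduces them
    # to a single answer per question.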
    for features in dev_dataset:
        results = all_results[features[0].example_id]
        example_qas_id = features[0].qas_id

        prediction, _ = predict(
            features=features,
            results=results,
            tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
            max_answer_length=max_answer_length,
            null_score_diff_threshold=null_score_diff_threshold,
            n_best_size=n_best_size,
            version_2=version_2)

        all_predictions[example_qas_id] = prediction

    with io.open(os.path.join(output_dir, 'predictions.json'),
                 'w',
                 encoding='utf-8') as fout:
        data = json.dumps(all_predictions, ensure_ascii=False)
        fout.write(data)

    if version_2:
        log.info(
            'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0'
        )
    else:
        F1_EM = get_F1_EM(dev_data, all_predictions)
        log.info(F1_EM)
Example no. 2
0
def evaluate():
    """Evaluate the model on validation dataset.
    """
    log.info('Start Evaluation')

    all_results = collections.defaultdict(list)

    if VERIFIER_ID == 2:
        all_pre_na_prob = collections.defaultdict(list)

    epoch_tic = time.time()
    total_num = 0
    for data in dev_dataloader:
        example_ids, inputs, token_types, valid_length, _, _ = data
        total_num += len(inputs)

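        # Mark the special-token positions: cls_mask flags [CLS] at position 0,
        # sep_mask_1 flags the [SEP] closing the question segment (last token with
        # token_type 0), and sep_mask_2 flags the final [SEP] at valid_length - 1.
        # They are fed to net() as extra inputs (presumably for the verifier head).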
        cls_mask = mx.nd.zeros(token_types.shape)
        sep_mask_1 = mx.nd.zeros(token_types.shape)
        sep_mask_2 = mx.nd.zeros(token_types.shape)
        cls_mask[:, 0] = 1.
        range_row_index = mx.nd.array(np.arange(len(example_ids)))
        valid_query_length = (1 - token_types).sum(axis=1)
        sep_mask_1[range_row_index, valid_query_length - 1] = 1.
        sep_mask_2[range_row_index, valid_length - 1] = 1.
        additional_masks = (cls_mask.astype('float32').as_in_context(ctx),
                            sep_mask_1.astype('float32').as_in_context(ctx),
                            sep_mask_2.astype('float32').as_in_context(ctx))

        out, bert_out = net(
            inputs.astype('float32').as_in_context(ctx),
            token_types.astype('float32').as_in_context(ctx),
            valid_length.astype('float32').as_in_context(ctx),
            additional_masks)

        if VERIFIER_ID == 2:
            has_answer_tmp = verifier.evaluate(dev_features, example_ids, out,
                                               token_types,
                                               bert_out).asnumpy().tolist()

        output = mx.nd.split(out, axis=2, num_outputs=2)
        example_ids = example_ids.asnumpy().tolist()
        pred_start = output[0].reshape((0, -3)).asnumpy()
        pred_end = output[1].reshape((0, -3)).asnumpy()

        for example_id, start, end in zip(example_ids, pred_start, pred_end):
            all_results[example_id].append(PredResult(start=start, end=end))
        if VERIFIER_ID == 2:
            for example_id, has_ans_prob in zip(example_ids, has_answer_tmp):
                all_pre_na_prob[example_id].append(has_ans_prob)

    epoch_toc = time.time()
    log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
        epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    log.info('Get prediction results...')

    all_predictions = collections.OrderedDict()

    for features in dev_dataset:
        results = all_results[features[0].example_id]
        example_qas_id = features[0].qas_id
        # the prediction may be empty when running with version_2 (SQuAD 2.0)
        prediction, score_diff, best_pred = predict(
            features=features,
            results=results,
            tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
            max_answer_length=max_answer_length,
            n_best_size=n_best_size,
            version_2=version_2,
            offsets=offsets)
        # print(score_diff, null_score_diff_threshold, features[0].is_impossible) # debug
        # verifier
        if version_2 and prediction != '':
            # threshold serves as the basic verifier

            if score_diff > null_score_diff_threshold:
                answerable = 0.
            else:
                answerable = 1.

            if VERIFIER_ID == 0:
                best_pred_score = 1. if best_pred else 0.
                has_ans_prob = verifier.evaluate(score_diff, best_pred_score)
                # print(features[0].is_impossible)
            elif VERIFIER_ID == 1:
                has_ans_prob = verifier.evaluate(features, prediction)
            elif VERIFIER_ID == 2:
                has_ans_prob_list = all_pre_na_prob[features[0].example_id]
                has_ans_prob = sum(has_ans_prob_list) / max(
                    len(has_ans_prob_list), 1)
            else:
                has_ans_prob = 1.

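            # Combine the threshold decision with the verifier probability:
            # "takeover" trusts the verifier alone, "joint" multiplies the two,
            # and "all" averages them.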
            if args.verifier_mode == "takeover":
                answerable = has_ans_prob
            elif args.verifier_mode == "joint":
                answerable = answerable * has_ans_prob
            elif args.verifier_mode == "all":
                answerable = (answerable + has_ans_prob) * 0.5

            if answerable < answerable_threshold:
                prediction = ""

        all_predictions[example_qas_id] = prediction
        # stored as {qas_id (hash key): answer string}

    with io.open(os.path.join(output_dir, 'predictions.json'),
                 'w',
                 encoding='utf-8') as fout:
        data = json.dumps(all_predictions, ensure_ascii=False)
        fout.write(data)

    if version_2:
        log.info(
            'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0'
        )
    else:
        F1_EM = get_F1_EM(dev_data, all_predictions)
        log.info(F1_EM)
Example no. 3
0
def evaluate():
    """Evaluate the model on validation dataset.
    """
    log.info('Loading dev data...')
    if version_2:
        dev_data = SQuAD('dev', version='2.0')
    else:
        dev_data = SQuAD('dev', version='1.1')
    log.info('Number of records in dev data:{}'.format(len(dev_data)))

    dev_dataset = dev_data.transform(
        SQuADTransform(berttoken,
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=False,
                       is_training=False)._transform)

    dev_data_transform, _ = preprocess_dataset(
        dev_data,
        SQuADTransform(berttoken,
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=False,
                       is_training=False))
    log.info('The number of examples after preprocessing:{}'.format(
        len(dev_data_transform)))

    dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
                                              batchify_fn=batchify_fn,
                                              num_workers=4,
                                              batch_size=test_batch_size,
                                              shuffle=False,
                                              last_batch='keep')

    log.info('Start prediction')

    _Result = collections.namedtuple(
        '_Result', ['example_id', 'start_logits', 'end_logits'])
    all_results = {}

    epoch_tic = time.time()
    total_num = 0
    for data in dev_dataloader:
        example_ids, inputs, token_types, valid_length, _, _ = data
        total_num += len(inputs)
        out = net(
            inputs.astype('float32').as_in_context(ctx),
            token_types.astype('float32').as_in_context(ctx),
            valid_length.astype('float32').as_in_context(ctx))

        output = nd.split(out, axis=2, num_outputs=2)
        start_logits = output[0].reshape((0, -3)).asnumpy()
        end_logits = output[1].reshape((0, -3)).asnumpy()

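        # Accumulate logits per example_id: an example split into several
        # document spans yields one _Result per span.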
        for example_id, start, end in zip(example_ids, start_logits,
                                          end_logits):
            example_id = example_id.asscalar()
            if example_id not in all_results:
                all_results[example_id] = []
            all_results[example_id].append(
                _Result(example_id, start.tolist(), end.tolist()))
        if args.test_mode:
            log.info('Exit early in test mode')
            break
    epoch_toc = time.time()
    log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
        epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))
    log.info('Get prediction results...')

    all_predictions, all_nbest_json, scores_diff_json = predictions(
        dev_dataset=dev_dataset,
        all_results=all_results,
        tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
        max_answer_length=max_answer_length,
        null_score_diff_threshold=null_score_diff_threshold,
        n_best_size=n_best_size,
        version_2=version_2,
        test_mode=args.test_mode)

    with open(os.path.join(output_dir, 'predictions.json'),
              'w',
              encoding='utf-8') as all_predictions_write:
        all_predictions_write.write(json.dumps(all_predictions))

    with open(os.path.join(output_dir, 'nbest_predictions.json'),
              'w',
              encoding='utf-8') as all_predictions_write:
        all_predictions_write.write(json.dumps(all_nbest_json))

    if version_2:
        with open(os.path.join(output_dir, 'null_odds.json'),
                  'w',
                  encoding='utf-8') as all_predictions_write:
            all_predictions_write.write(json.dumps(scores_diff_json))
    else:
        log.info(get_F1_EM(dev_data, all_predictions))
Example no. 4
0
def evaluate():
    """Evaluate the model on validation dataset.
    """
    log.info('Loading dev data...')
    if version_2:
        dev_data = SQuAD('dev', version='2.0')
    else:
        dev_data = SQuAD('dev', version='1.1')
    if args.debug:
        sampled_data = dev_data[:10]  # [dev_data[0], dev_data[1], dev_data[2]]
        dev_data = mx.gluon.data.SimpleDataset(sampled_data)
    log.info('Number of records in dev data:{}'.format(len(dev_data)))

    dev_dataset = dev_data.transform(
        SQuADTransform(copy.copy(tokenizer),
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=True,
                       is_training=True)._transform,
        lazy=False)

    dev_data_transform, _ = preprocess_dataset(
        dev_data,
        SQuADTransform(copy.copy(tokenizer),
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=True,
                       is_training=True))

    # refer to evaluation process
    # for feat in train_dataset:
    #     print(feat[0].example_id)
    #     print(feat[0].tokens)
    #     print(feat[0].token_to_orig_map)
    #     input()
    # exit(0)

    dev_features = {
        features[0].example_id: features
        for features in dev_dataset
    }

    #for line in train_data_transform:
    #    print(line)
    #    input()

    dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
                                              batchify_fn=batchify_fn,
                                              batch_size=test_batch_size,
                                              num_workers=4,
                                              shuffle=True)
    '''

    dev_dataset = dev_data.transform(
        SQuADTransform(
            copy.copy(tokenizer),
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_pad=False,
            is_training=False)._transform, lazy=False)

    # for feat in dev_dataset:
    #     print(feat[0].example_id)
    #     print(feat[0].tokens)
    #     print(feat[0].token_to_orig_map)
    #     input()
    # exit(0)

    dev_features = {features[0].example_id: features for features in dev_dataset}

    dev_data_transform, _ = preprocess_dataset(
        dev_data, SQuADTransform(
            copy.copy(tokenizer),
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_pad=False,
            is_training=False))
    log.info('The number of examples after preprocessing:{}'.format(
        len(dev_data_transform)))

    dev_dataloader = mx.gluon.data.DataLoader(
        dev_data_transform,
        batchify_fn=batchify_fn,
        num_workers=4, batch_size=test_batch_size,
        shuffle=False, last_batch='keep')
    '''
    log.info('start prediction')

    all_results = collections.defaultdict(list)

    if args.verify and VERIFIER_ID in [2, 3]:
        all_pre_na_prob = collections.defaultdict(list)
    else:
        all_pre_na_prob = None

    epoch_tic = time.time()
    total_num = 0
    for data in dev_dataloader:
        example_ids, inputs, token_types, valid_length, _, _ = data
        total_num += len(inputs)
        out = net(
            inputs.astype('float32').as_in_context(ctx),
            token_types.astype('float32').as_in_context(ctx),
            valid_length.astype('float32').as_in_context(ctx))

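        # Verifiers 2/3 score each feature (document span) in the batch; the
        # per-span answerability estimates are collected here for averaging below.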
        if all_pre_na_prob is not None:
            has_answer_tmp = verifier.evaluate(dev_features, example_ids,
                                               out).asnumpy().tolist()

        output = mx.nd.split(out, axis=2, num_outputs=2)
        example_ids = example_ids.asnumpy().tolist()
        pred_start = output[0].reshape((0, -3)).asnumpy()
        pred_end = output[1].reshape((0, -3)).asnumpy()

        for example_id, start, end in zip(example_ids, pred_start, pred_end):
            all_results[example_id].append(PredResult(start=start, end=end))
        if all_pre_na_prob is not None:
            for example_id, has_ans_prob in zip(example_ids, has_answer_tmp):
                all_pre_na_prob[example_id].append(has_ans_prob)

    epoch_toc = time.time()
    log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
        epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    log.info('Get prediction results...')

    all_predictions = collections.OrderedDict()

    for features in dev_dataset:
        results = all_results[features[0].example_id]
        example_qas_id = features[0].qas_id

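        # Average the verifier's answer probability over all spans of the example;
        # below 0.5 the question is treated as unanswerable and left empty.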
        if all_pre_na_prob is not None:
            has_ans_prob_list = all_pre_na_prob[features[0].example_id]
            has_ans_prob = sum(has_ans_prob_list) / max(
                len(has_ans_prob_list), 1)
            if has_ans_prob < 0.5:
                prediction = ""
                all_predictions[example_qas_id] = prediction
                continue

        prediction, _ = predict(
            features=features,
            results=results,
            tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
            max_answer_length=max_answer_length,
            null_score_diff_threshold=null_score_diff_threshold,
            n_best_size=n_best_size,
            version_2=version_2)

        if args.verify and VERIFIER_ID == 1:
            if len(prediction) > 0:
                has_answer = verifier.evaluate(features, prediction)
                if not has_answer:
                    prediction = ""

        all_predictions[example_qas_id] = prediction
        # stored as {qas_id (hash key): answer string}

    with io.open(os.path.join(output_dir, 'predictions.json'),
                 'w',
                 encoding='utf-8') as fout:
        data = json.dumps(all_predictions, ensure_ascii=False)
        fout.write(data)

    if version_2:
        log.info(
            'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0'
        )
    else:
        F1_EM = get_F1_EM(dev_data, all_predictions)
        log.info(F1_EM)