Example 1
 def get_training_data(self,
                       train_features,
                       example_ids,
                       out,
                       token_types=None):
     output = mx.nd.split(out, axis=2, num_outputs=2)
     example_ids = example_ids.asnumpy().tolist()
     pred_start = output[0].reshape((0, -3)).asnumpy()
     pred_end = output[1].reshape((0, -3)).asnumpy()
     raw_data = []
     for example_id, start, end in zip(example_ids, pred_start, pred_end):
         results = [PredResult(start=start, end=end)]
         features = train_features[example_id]
         label = 0 if features[0].is_impossible else 1
         prediction, score_diff, top_predict = predict(
             features=features,
             results=results,
             tokenizer=self.tokenizer,
             max_answer_length=self.max_answer_length,
             n_best_size=self.n_best_size,
             version_2=self.version_2)
         non_empty_top = 1. if top_predict else 0.
         # print(prediction, "," , top_predict, ",", features[0].orig_answer_text)
         raw_data.append([score_diff, non_empty_top, label])
     return raw_data
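
Each row that get_training_data returns is a [score_diff, non_empty_top, label] triple, which makes the output easy to feed into a small binary answerability classifier. The sketch below is not the project's own verifier; it is a plain scikit-learn stand-in, and the fake raw_data rows exist only to make it runnable.

import numpy as np
from sklearn.linear_model import LogisticRegression

# raw_data is assumed to be the list of [score_diff, non_empty_top, label]
# rows produced by get_training_data above; a tiny fake batch stands in here.
raw_data = [[3.2, 0.0, 0], [-1.5, 1.0, 1], [0.4, 1.0, 1], [2.8, 0.0, 0]]

data = np.asarray(raw_data, dtype=np.float32)
features, labels = data[:, :2], data[:, 2]

# Fit a simple answerability classifier on (score_diff, non_empty_top).
clf = LogisticRegression()
clf.fit(features, labels)

# Probability that a new prediction is answerable (label == 1).
print(clf.predict_proba([[1.0, 1.0]])[:, 1])

In practice the rows would come from running get_training_data over the training batches, and the fitted classifier would play the role of the threshold-based verifier used in the later examples.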
Example 2
def transform_fn(model, input_data, input_content_type=None, output_content_type=None):
    """
    Transform a request using the Gluon model. Called once per request.
    :param model: A tuple of the Gluon model, the vocab and the SQuADTransform
    :param input_data: The request payload
    
        Example:
        ## (example_id, [question, content], ques_cont_token_types, valid_length, _, _)


        (2, 
        '56be4db0acb8001400a502ee', 
        'Where did Super Bowl 50 take place?', 
        
        'Super Bowl 50 was an American football game to determine the champion of the National 
        Football League (NFL) for the 2015 season. The American Football Conference (AFC) 
        champion Denver Broncos defeated the National Football Conference (NFC) champion 
        Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played 
        on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, 
        California. As this was the 50th Super Bowl, the league emphasized the "golden 
        anniversary" with various gold-themed initiatives, as well as temporarily suspending 
        the tradition of naming each Super Bowl game with Roman numerals (under which the 
        game would have been known as "Super Bowl L"), so that the logo could prominently 
        feature the Arabic numerals 50.', 
        
        ['Santa Clara, California', "Levi's Stadium", "Levi's Stadium 
        in the San Francisco Bay Area at Santa Clara, California."], 
        
        [403, 355, 355])

    :param input_content_type: The request content type, assume json
    :param output_content_type: The (desired) response content type, assume json
    :return: response payload and content type.
    """
    net, vocab, squadTransform = model
#     data = input_data
    data = json.loads(input_data)
#     test_examples_tuples = [(i, "", question, content, [], [])]
#     question, context = data #.split(" [CONTEXT] ")
#     tup = (0, "", question, context, [], [])
    test_examples_tuples = _test_example_transform(data)
    test_dataset = mx.gluon.data.SimpleDataset(test_examples_tuples)  # [tup]
    all_results = get_all_results(net, vocab, squadTransform, test_dataset, ctx=mx.cpu())
    all_predictions = collections.defaultdict(list) # collections.OrderedDict()
    data_transform = test_dataset.transform(squadTransform._transform)
    for features in data_transform:
        f_id = features[0].example_id
        results = all_results[f_id]
        prediction, nbest = bert_qa_evaluate.predict(
            features=features,
            results=results,
            tokenizer=nlp.data.BERTBasicTokenizer(vocab))        
        nbest_prediction = [] 
        for i in range(3):
            nbest_prediction.append('%.2f%% \t %s'%(nbest[i][1] * 100, nbest[i][0]))
        all_predictions[f_id] = nbest_prediction
    response_body = json.dumps(all_predictions)
    return response_body, output_content_type
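
transform_fn depends on a helper named _test_example_transform that is not shown in this excerpt. Judging from the commented-out line test_examples_tuples = [(i, "", question, content, [], [])], a plausible sketch is the following; the assumption that the incoming payload is a list of [question, context] pairs is mine, not the author's.

def _test_example_transform(test_examples):
    """Turn a list of [question, context] pairs into SQuAD-style example tuples.

    A guess at the helper used by transform_fn above: each record becomes
    (index, qas_id, question_text, context_text, answer_texts, answer_starts),
    with an empty qas_id and no answers since nothing is labelled at inference time.
    """
    test_examples_tuples = []
    for i, line in enumerate(test_examples):
        question, content = line[0], line[1]
        test_examples_tuples.append((i, "", question, content, [], []))
    return test_examples_tuples

A matching request body would then look like json.dumps([["Where did Super Bowl 50 take place?", "Super Bowl 50 was an American football game ..."]]).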
Example 3
 def parse_sentences(self,
                     train_features,
                     example_ids,
                     out,
                     token_types=None):
     output = mx.nd.split(out, axis=2, num_outputs=2)
     example_ids = example_ids.asnumpy().tolist()
     pred_start = output[0].reshape((0, -3)).asnumpy()
     pred_end = output[1].reshape((0, -3)).asnumpy()
     raw_data = []
     for example_id, start, end in zip(example_ids, pred_start, pred_end):
         results = [PredResult(start=start, end=end)]
         features = train_features[example_id]
         label = 0 if features[0].is_impossible else 1
         context_text = ' '.join(features[0].doc_tokens)
         question_text = features[0].question_text
         answer_text = features[0].orig_answer_text
         prediction, _, _ = predict(  # TODO: use this more wisely, for example, GAN
             features=features,
             results=results,
             tokenizer=self.tokenizer,
             max_answer_length=self.max_answer_length,
             n_best_size=self.n_best_size,
             version_2=self.version_2,
             offsets=self.offsets)
         # if len(prediction) == 0:
         #     continue # not validating for n/a output
         if self.extract_sentence:
             sentences = list(
                 filter(lambda x: len(x.strip()) > 0,
                        re.split(pattern, context_text)))
             if label == 1:
                 answer_sentence = self.find_sentence(
                     sentences, answer_text)
                 raw_data.append([
                     answer_sentence + '. ' + question_text, answer_text,
                     label
                 ])
             elif len(prediction) > 0:
                 sentence_text = self.find_sentence(sentences, prediction)
                 raw_data.append([
                     sentence_text + '. ' + question_text, prediction, label
                 ])
         else:
             first_part = context_text + '. ' + question_text
             if label == 1:
                 raw_data.append([first_part, answer_text, label])
             elif len(prediction) > 0:
                 raw_data.append([first_part, prediction, label])
     # dataset = VerifierDataset(raw_data)
     # return dataset
     return raw_data
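
parse_sentences refers to a module-level sentence-splitting regex named pattern and to a find_sentence helper, neither of which appears in the excerpt. A minimal sketch under those assumptions (find_sentence is shown here as a free function rather than a method):

import re

# Hypothetical sentence boundary: split after '.', '!' or '?' followed by whitespace.
pattern = r'(?<=[.!?])\s+'

def find_sentence(sentences, answer_text):
    """Return the first sentence that contains answer_text, or '' if none does."""
    for sentence in sentences:
        if answer_text in sentence:
            return sentence.strip()
    return ''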
Example 4
def evaluate():
    """Evaluate the model on validation dataset.
    """
    log.info('Loading dev data...')
    if version_2:
        dev_data = SQuAD('dev', version='2.0')
    else:
        dev_data = SQuAD('dev', version='1.1')
    if args.debug:
        sampled_data = [dev_data[0], dev_data[1], dev_data[2]]
        dev_data = mx.gluon.data.SimpleDataset(sampled_data)
    log.info('Number of records in dev data:{}'.format(len(dev_data)))

    dev_dataset = dev_data.transform(SQuADTransform(
        copy.copy(tokenizer),
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_pad=False,
        is_training=False)._transform,
                                     lazy=False)

    dev_data_transform, _ = preprocess_dataset(
        dev_data,
        SQuADTransform(copy.copy(tokenizer),
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=False,
                       is_training=False))
    log.info('The number of examples after preprocessing:{}'.format(
        len(dev_data_transform)))

    dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
                                              batchify_fn=batchify_fn,
                                              num_workers=4,
                                              batch_size=test_batch_size,
                                              shuffle=False,
                                              last_batch='keep')

    log.info('start prediction')

    all_results = collections.defaultdict(list)

    epoch_tic = time.time()
    total_num = 0
    for data in dev_dataloader:
        example_ids, inputs, token_types, valid_length, _, _ = data
        total_num += len(inputs)
        out = net(
            inputs.astype('float32').as_in_context(ctx),
            token_types.astype('float32').as_in_context(ctx),
            valid_length.astype('float32').as_in_context(ctx))

        output = mx.nd.split(out, axis=2, num_outputs=2)
        example_ids = example_ids.asnumpy().tolist()
        pred_start = output[0].reshape((0, -3)).asnumpy()
        pred_end = output[1].reshape((0, -3)).asnumpy()

        for example_id, start, end in zip(example_ids, pred_start, pred_end):
            all_results[example_id].append(PredResult(start=start, end=end))

    epoch_toc = time.time()
    log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
        epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    log.info('Get prediction results...')

    all_predictions = collections.OrderedDict()

    for features in dev_dataset:
        results = all_results[features[0].example_id]
        example_qas_id = features[0].qas_id

        prediction, _ = predict(
            features=features,
            results=results,
            tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
            max_answer_length=max_answer_length,
            null_score_diff_threshold=null_score_diff_threshold,
            n_best_size=n_best_size,
            version_2=version_2)

        all_predictions[example_qas_id] = prediction

    with io.open(os.path.join(output_dir, 'predictions.json'),
                 'w',
                 encoding='utf-8') as fout:
        data = json.dumps(all_predictions, ensure_ascii=False)
        fout.write(data)

    if version_2:
        log.info(
            'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0'
        )
    else:
        F1_EM = get_F1_EM(dev_data, all_predictions)
        log.info(F1_EM)
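
evaluate relies on a few names defined elsewhere in the script, notably PredResult and batchify_fn. In the GluonNLP BERT-for-SQuAD scripts they are usually defined roughly as below; the exact Pad/Stack arguments, and the pad value (which in the real script is vocab[vocab.padding_token]), are assumptions about this particular fork.

import collections
import gluonnlp as nlp

# Container for the raw start/end logits produced by one forward pass.
PredResult = collections.namedtuple('PredResult', ['start', 'end'])

# Batchify (example_id, inputs, token_types, valid_length, start, end) records:
# stack the scalars, pad the variable-length token sequences.
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack(),                 # example_ids
    nlp.data.batchify.Pad(axis=0, pad_val=0),  # input token ids
    nlp.data.batchify.Pad(axis=0, pad_val=0),  # token type ids
    nlp.data.batchify.Stack('float32'),        # valid_length
    nlp.data.batchify.Stack('float32'),        # start positions
    nlp.data.batchify.Stack('float32'))        # end positions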
Example 5
def evaluate():
    """Evaluate the model on validation dataset.
    """
    log.info('Start Evaluation')

    all_results = collections.defaultdict(list)

    if VERIFIER_ID == 2:
        all_pre_na_prob = collections.defaultdict(list)

    epoch_tic = time.time()
    total_num = 0
    for data in dev_dataloader:
        example_ids, inputs, token_types, valid_length, _, _ = data
        total_num += len(inputs)

        cls_mask = mx.nd.zeros(token_types.shape)
        sep_mask_1 = mx.nd.zeros(token_types.shape)
        sep_mask_2 = mx.nd.zeros(token_types.shape)
        cls_mask[:, 0] = 1.
        range_row_index = mx.nd.array(np.arange(len(example_ids)))
        valid_query_length = (1 - token_types).sum(axis=1)
        sep_mask_1[range_row_index, valid_query_length - 1] = 1.
        sep_mask_2[range_row_index, valid_length - 1] = 1.
        additional_masks = (cls_mask.astype('float32').as_in_context(ctx),
                            sep_mask_1.astype('float32').as_in_context(ctx),
                            sep_mask_2.astype('float32').as_in_context(ctx))

        out, bert_out = net(
            inputs.astype('float32').as_in_context(ctx),
            token_types.astype('float32').as_in_context(ctx),
            valid_length.astype('float32').as_in_context(ctx),
            additional_masks)

        if VERIFIER_ID == 2:
            has_answer_tmp = verifier.evaluate(dev_features, example_ids, out,
                                               token_types,
                                               bert_out).asnumpy().tolist()

        output = mx.nd.split(out, axis=2, num_outputs=2)
        example_ids = example_ids.asnumpy().tolist()
        pred_start = output[0].reshape((0, -3)).asnumpy()
        pred_end = output[1].reshape((0, -3)).asnumpy()

        for example_id, start, end in zip(example_ids, pred_start, pred_end):
            all_results[example_id].append(PredResult(start=start, end=end))
        if VERIFIER_ID == 2:
            for example_id, has_ans_prob in zip(example_ids, has_answer_tmp):
                all_pre_na_prob[example_id].append(has_ans_prob)

    epoch_toc = time.time()
    log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
        epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    log.info('Get prediction results...')

    all_predictions = collections.OrderedDict()

    for features in dev_dataset:
        results = all_results[features[0].example_id]
        example_qas_id = features[0].qas_id
        # prediction is likely to be empty when running on SQuAD 2.0 (version_2)
        prediction, score_diff, best_pred = predict(
            features=features,
            results=results,
            tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
            max_answer_length=max_answer_length,
            n_best_size=n_best_size,
            version_2=version_2,
            offsets=offsets)
        # print(score_diff, null_score_diff_threshold, features[0].is_impossible) # debug
        # verifier
        if version_2 and prediction != '':
            # threshold serves as the basic verifier

            if score_diff > null_score_diff_threshold:
                answerable = 0.
            else:
                answerable = 1.

            if VERIFIER_ID == 0:
                best_pred_score = 1. if best_pred else 0.
                has_ans_prob = verifier.evaluate(score_diff, best_pred_score)
                # print(features[0].is_impossible)
            elif VERIFIER_ID == 1:
                has_ans_prob = verifier.evaluate(features, prediction)
            elif VERIFIER_ID == 2:
                has_ans_prob_list = all_pre_na_prob[features[0].example_id]
                has_ans_prob = sum(has_ans_prob_list) / max(
                    len(has_ans_prob_list), 1)
            else:
                has_ans_prob = 1.

            if args.verifier_mode == "takeover":
                answerable = has_ans_prob
            elif args.verifier_mode == "joint":
                answerable = answerable * has_ans_prob
            elif args.verifier_mode == "all":
                answerable = (answerable + has_ans_prob) * 0.5

            if answerable < answerable_threshold:
                prediction = ""

        all_predictions[example_qas_id] = prediction
        # the form of hashkey - answer string

    with io.open(os.path.join(output_dir, 'predictions.json'),
                 'w',
                 encoding='utf-8') as fout:
        data = json.dumps(all_predictions, ensure_ascii=False)
        fout.write(data)

    if version_2:
        log.info(
            'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0'
        )
    else:
        F1_EM = get_F1_EM(dev_data, all_predictions)
        log.info(F1_EM)
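
The threshold decision and the verifier probability are merged according to args.verifier_mode. Factored out as a small helper, the logic above reads as follows (a restatement for clarity, not code taken from the project):

def combine_answerable(score_diff, has_ans_prob, null_score_diff_threshold,
                       verifier_mode='joint'):
    """Combine the null-score threshold with the verifier probability.

    Mirrors the evaluate() logic above: 'takeover' trusts the verifier alone,
    'joint' multiplies the two signals, 'all' averages them, and any other
    mode falls back to the plain threshold decision.
    """
    answerable = 0. if score_diff > null_score_diff_threshold else 1.
    if verifier_mode == 'takeover':
        return has_ans_prob
    if verifier_mode == 'joint':
        return answerable * has_ans_prob
    if verifier_mode == 'all':
        return (answerable + has_ans_prob) * 0.5
    return answerable

The prediction is then replaced by the empty string whenever the combined score falls below answerable_threshold.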
Example 6
def evaluate():
    """Evaluate the model on validation dataset.
    """
    log.info('Loading dev data...')
    if version_2:
        dev_data = SQuAD('dev', version='2.0')
    else:
        dev_data = SQuAD('dev', version='1.1')
    if args.debug:
        sampled_data = dev_data[:10]  # [dev_data[0], dev_data[1], dev_data[2]]
        dev_data = mx.gluon.data.SimpleDataset(sampled_data)
    log.info('Number of records in dev data:{}'.format(len(dev_data)))

    dev_dataset = dev_data.transform(SQuADTransform(
        copy.copy(tokenizer),
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_pad=True,
        is_training=True)._transform,
                                     lazy=False)

    dev_data_transform, _ = preprocess_dataset(
        dev_data,
        SQuADTransform(copy.copy(tokenizer),
                       max_seq_length=max_seq_length,
                       doc_stride=doc_stride,
                       max_query_length=max_query_length,
                       is_pad=True,
                       is_training=True))

    # refer to evaluation process
    # for feat in train_dataset:
    #     print(feat[0].example_id)
    #     print(feat[0].tokens)
    #     print(feat[0].token_to_orig_map)
    #     input()
    # exit(0)

    dev_features = {
        features[0].example_id: features
        for features in dev_dataset
    }

    #for line in train_data_transform:
    #    print(line)
    #    input()

    dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
                                              batchify_fn=batchify_fn,
                                              batch_size=test_batch_size,
                                              num_workers=4,
                                              shuffle=True)
    '''

    dev_dataset = dev_data.transform(
        SQuADTransform(
            copy.copy(tokenizer),
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_pad=False,
            is_training=False)._transform, lazy=False)

    # for feat in dev_dataset:
    #     print(feat[0].example_id)
    #     print(feat[0].tokens)
    #     print(feat[0].token_to_orig_map)
    #     input()
    # exit(0)

    dev_features = {features[0].example_id: features for features in dev_dataset}

    dev_data_transform, _ = preprocess_dataset(
        dev_data, SQuADTransform(
            copy.copy(tokenizer),
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_pad=False,
            is_training=False))
    log.info('The number of examples after preprocessing:{}'.format(
        len(dev_data_transform)))

    dev_dataloader = mx.gluon.data.DataLoader(
        dev_data_transform,
        batchify_fn=batchify_fn,
        num_workers=4, batch_size=test_batch_size,
        shuffle=False, last_batch='keep')
    '''
    log.info('start prediction')

    all_results = collections.defaultdict(list)

    if args.verify and VERIFIER_ID in [2, 3]:
        all_pre_na_prob = collections.defaultdict(list)
    else:
        all_pre_na_prob = None

    epoch_tic = time.time()
    total_num = 0
    for data in dev_dataloader:
        example_ids, inputs, token_types, valid_length, _, _ = data
        total_num += len(inputs)
        out = net(
            inputs.astype('float32').as_in_context(ctx),
            token_types.astype('float32').as_in_context(ctx),
            valid_length.astype('float32').as_in_context(ctx))

        if all_pre_na_prob is not None:
            has_answer_tmp = verifier.evaluate(dev_features, example_ids,
                                               out).asnumpy().tolist()

        output = mx.nd.split(out, axis=2, num_outputs=2)
        example_ids = example_ids.asnumpy().tolist()
        pred_start = output[0].reshape((0, -3)).asnumpy()
        pred_end = output[1].reshape((0, -3)).asnumpy()

        for example_id, start, end in zip(example_ids, pred_start, pred_end):
            all_results[example_id].append(PredResult(start=start, end=end))
        if all_pre_na_prob is not None:
            for example_id, has_ans_prob in zip(example_ids, has_answer_tmp):
                all_pre_na_prob[example_id].append(has_ans_prob)

    epoch_toc = time.time()
    log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
        epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))

    log.info('Get prediction results...')

    all_predictions = collections.OrderedDict()

    for features in dev_dataset:
        results = all_results[features[0].example_id]
        example_qas_id = features[0].qas_id

        if all_pre_na_prob is not None:
            has_ans_prob_list = all_pre_na_prob[features[0].example_id]
            has_ans_prob = sum(has_ans_prob_list) / max(
                len(has_ans_prob_list), 1)
            if has_ans_prob < 0.5:
                prediction = ""
                all_predictions[example_qas_id] = prediction
                continue

        prediction, _ = predict(
            features=features,
            results=results,
            tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
            max_answer_length=max_answer_length,
            null_score_diff_threshold=null_score_diff_threshold,
            n_best_size=n_best_size,
            version_2=version_2)

        if args.verify and VERIFIER_ID == 1:
            if len(prediction) > 0:
                has_answer = verifier.evaluate(features, prediction)
                if not has_answer:
                    prediction = ""

        all_predictions[example_qas_id] = prediction
        # the form of hashkey - answer string

    with io.open(os.path.join(output_dir, 'predictions.json'),
                 'w',
                 encoding='utf-8') as fout:
        data = json.dumps(all_predictions, ensure_ascii=False)
        fout.write(data)

    if version_2:
        log.info(
            'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0'
        )
    else:
        F1_EM = get_F1_EM(dev_data, all_predictions)
        log.info(F1_EM)
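
The file written by evaluate is a flat JSON object mapping each SQuAD qas_id to the predicted answer string, with an empty string standing for "unanswerable". A quick way to inspect it, assuming predictions.json sits in the working directory (the real path is os.path.join(output_dir, 'predictions.json')):

import json

with open('predictions.json', encoding='utf-8') as fin:
    preds = json.load(fin)

# Each key is a SQuAD qas_id; each value is the predicted answer string
# (empty when the question was judged unanswerable).
for qas_id, answer in list(preds.items())[:5]:
    print(qas_id, '->', repr(answer))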