Example #1
def show_prediction_for_dureader(paths,
                                 outpath,
                                 reader_exp_name,
                                 para_selection_method,
                                 decoder_dict=None):
    print('show_prediction_for_dureader')
    loader = DureaderLoader(
        paths,
        para_selection_method,
        sample_fields=['question', 'answers', 'question_id', 'question_type'])
    sample_list = loader.sample_list
    reader = ReaderFactory.from_exp_name(reader_exp_name,
                                         decoder_dict=decoder_dict)
    _preds = reader.evaluate_on_records(sample_list, batch_size=128)
    _preds = group_dict_list(_preds, 'question_id')
    pred_answers = MaxAllJudger().judge(_preds)
    pred_answer_list = RecordGrouper.from_group_dict('question_id',
                                                     pred_answers).records
    print('formatting ranked predictions')
    ranked_list_formatter = QARankedListFormater(pred_answer_list)
    formated_result = ranked_list_formatter.format_result()
    with open(outpath, 'w', encoding='utf-8') as f:
        f.write('experiment settings\n')
        f.write('reader_exp_name : %s\n' % (reader_exp_name))
        f.write('para_selection_method : %s\n' % (str(para_selection_method)))
        f.write('decoder : %s\n' % (str(decoder_dict)))
        f.write('##' * 20 + '\n')
        f.write('Content:\n\n')
        f.write(formated_result)
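
A minimal usage sketch for the function above. The paths and output file below are placeholders; the experiment name and decoder dict follow the values used in Example #5.

# hypothetical invocation; all paths are placeholders
show_prediction_for_dureader(
    ['./data/demo/devset/search.dev.json'],
    outpath='./output/dev_predictions.txt',
    reader_exp_name='reader/bert_default',
    para_selection_method='most_related_para',
    decoder_dict={'class': 'default', 'kwargs': {'k': 1}})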
Example #2
def test_preprocessing_charspan():
    from dataloader.dureader import DureaderLoader
    from .util import preprocessing_charspan
    loader = DureaderLoader("./data/demo/devset/search.dev.json" ,'most_related_para',sample_fields=['question','answers','question_id','question_type','answer_docs','answer_spans'],\
        doc_fields=['segmented_paragraphs'])
    #print(len(loader.sample_list))
    #print(loader.sample_list[1])
    for sample in loader.sample_list:
        if len(sample['answer_spans']) == 0:
            continue
        word_tokens = sample['segmented_paragraphs']
        preprocessing_charspan(sample)
        passage = sample['passage']
        start, end = sample['char_spans'][0]
        assert passage[start:end + 1] in "".join(word_tokens)
    print(loader.sample_list[3])
Example #3
def test_dureader_bert_rc(test_path,
                          reader_exp_name,
                          para_selection_method,
                          decoder_dict=None):
    print('test_dureader_bert_rc loading samples...')
    loader = DureaderLoader(
        test_path,
        para_selection_method,
        sample_fields=['question', 'answers', 'question_id', 'question_type'])
    sample_list = loader.sample_list
    reader = ReaderFactory.from_exp_name(reader_exp_name,
                                         decoder_dict=decoder_dict)
    _preds = reader.evaluate_on_records(sample_list, batch_size=128)
    _preds = group_dict_list(_preds, 'question_id')
    pred_answers = MaxAllJudger().judge(_preds)
    print('bidaf evaluation')
    evaluate_mrc_bidaf(pred_answers)
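
A minimal call sketch for the test above, assuming the demo dev file and the decoder configuration shown in Example #5; the test path is a placeholder.

# hypothetical invocation; the test path is a placeholder
test_dureader_bert_rc(
    ['./data/demo/devset/search.dev.2.json'],
    reader_exp_name='reader/bert_default',
    para_selection_method='most_related_para',
    decoder_dict={'class': 'default', 'kwargs': {'k': 1}})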
Example #4
def evaluate3(evaluate_files,
              bert_config_path,
              weight_path,
              metric_dir,
              eval_method='bidaf_script'):
    from common.util import group_dict_list
    print('load model')
    with torch.no_grad():
        reader = ReaderFactory.from_exp_name('reader/bert_default',
                                             READER_CLASS='bert_reader')
        #dataset = make_dataset(evaluate_files)
        #iterator = make_batch_iterator(dataset,bs=128)
        loader = DureaderLoader(evaluate_files,
                                'most_related_para',
                                sample_fields=[
                                    'question', 'answers', 'question_id',
                                    'question_type'
                                ])

        dataset = BertRCDataset(loader.sample_list,
                                args.max_query_length,
                                args.max_seq_length,
                                device=args.device)
        iterator = dataset.make_batchiter(batch_size=128)
        print('Iterate Batch')
        preds = reader.evaluate_on_batch(iterator)

        tmp = group_dict_list(preds, 'question_id')

        pred_result, ref_result = {}, {}
        # find the max-score prediction (dict) for each qid

        for qid in tmp:
            l = tmp[qid]
            max_answer = max(l, key=lambda d: d['span_score'])
            max_answer.update({'entity_answers': [[]], 'yesno_answers': []})
            ref = {k: v for k, v in max_answer.items()}
            ref_result[qid] = ref
            # the order must not be reversed: ref is copied before 'answers' is overwritten below

            max_answer['answers'] = [max_answer['span']]
            pred_result[qid] = max_answer

        mrc_eval.evaluate(pred_result, ref_result)
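
A hedged call sketch for evaluate3. Note that the body loads the 'reader/bert_default' experiment directly, so bert_config_path, weight_path, and metric_dir are only passed through; every path below is a placeholder.

# hypothetical invocation; every path is a placeholder
evaluate3(['./data/demo/devset/search.dev.json'],
          bert_config_path='./config/bert_config.json',
          weight_path='./model/weights.bin',
          metric_dir='./metrics',
          eval_method='bidaf_script')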
Example #5
def test_mrc_baseline():
    print('test_mrc_baseline loading samples...')
    from dataloader.dureader import DureaderLoader
    from qa.reader import ReaderFactory, BertRCDataset
    from qa.judger import MaxAllJudger
    from common.util import group_dict_list, evaluate_mrc_bidaf
    loader = DureaderLoader(
        ['./data/demo/devset/search.dev.2.json'],
        'most_related_para',
        sample_fields=['question', 'answers', 'question_id', 'question_type'])
    sample_list = loader.sample_list
    reader = ReaderFactory.from_exp_name(
        'reader/bert_default',
        decoder_dict={'class': 'default', 'kwargs': {'k': 1}})
    reader_config = reader.config
    dataset = BertRCDataset(sample_list,
                            reader_config.MAX_QUERY_LEN,
                            reader_config.MAX_SEQ_LEN,
                            device=reader.device)
    print('make batch')
    iterator = dataset.make_batchiter(batch_size=128)
    _preds = reader.evaluate_on_batch(iterator)
    _preds = group_dict_list(_preds, 'question_id')
    pred_answers = MaxAllJudger().judge(_preds)
    res_dict = evaluate_mrc_bidaf(pred_answers)
    assert res_dict == {'Bleu-1': 0.19711538461443695, 'Bleu-2': 0.15154174071281326, 'Bleu-3': 0.11637351097094059, 'Bleu-4': 0.0983666932134996, 'Rouge-L': 0.260079879764384}
Example #6
def evaluate_dureader_ranker(paths, ranker, batch_size=64, print_detail=True):
    if type(ranker) == str:
        ranker = RankerFactory.from_exp_name(ranker)
    loader = DureaderLoader(
        paths,
        'most_related_para',
        sample_fields=['question', 'question_id', 'answer_docs'])
    samples_to_evaluate = []
    for sample in loader.sample_list:
        if len(sample['answer_docs']) == 0:
            continue
        label = 0
        if sample['doc_id'] == sample['answer_docs'][0]:
            label = 1
        sample['label'] = label
        samples_to_evaluate.append(sample)
    rank_results = ranker.evaluate_on_records(samples_to_evaluate,
                                              batch_size=batch_size)
    print(len(rank_results))
    sorted_results = RecordGrouper(rank_results).group_sort(
        'question', 'rank_score', 50)
    if print_detail:
        for k, v in sorted_results.items():
            print('question:')
            print(k)
            for x in v[0:10]:
                print('\t\t' + x['passage'][0:100])
                print('\t\t %.3f' % (x['rank_score']))
                print('# #' * 10)
                print('\n')
    print('precision is ')
    print(precision(sorted_results, k=1))
    print('recall is ')
    print(recall(sorted_results, k=1))
    print('accuracy is')
    print(accuracy(sorted_results))
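
A minimal usage sketch, assuming the 'pointwise/answer_doc' ranker experiment named in Example #7 and a demo dev file; both are placeholders for whatever ranker and data you actually evaluate.

# hypothetical invocation; experiment name and path are placeholders
evaluate_dureader_ranker(['./data/demo/devset/search.dev.json'],
                         ranker='pointwise/answer_doc',
                         batch_size=64,
                         print_detail=False)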
Example #7
if __name__ == '__main__':
    # evaluate ranker
    from qa.eval import evaluate_dureader_ranker
    experiment = Experiment('reader/pg')
    #TRAIN_PATH = ["./data/trainset/search.train.json","./data/trainset/zhidao.train.json"]
    #TRAIN_PATH = ["./data/trainset/search.train.json"]
    #DEV_PATH = "./data/devset/search.dev.json"
    TRAIN_PATH = "./data/demo/devset/search.dev.2.json"
    DEV_PATH = "./data/demo/devset/search.dev.2.json"
    READER_EXP_NAME = 'reader/bert_default'
    RANKER_EXP_NAME = 'pointwise/answer_doc'
    EPOCH = 10

    TRAIN_READER = False

    train_loader = DureaderLoader(
        TRAIN_PATH,
        'most_related_para',
        sample_fields=[
            'question', 'answers', 'question_id', 'question_type',
            'answer_docs', 'answer_spans'
        ],
        doc_fields=['segmented_paragraphs'])
    print('preprocessing span for train data')
    train_loader.sample_list = list(
        filter(lambda x: len(x['answers']) > 0 and len(x['answer_docs']) > 0,
               train_loader.sample_list))
    for sample in train_loader.sample_list:
        if sample["doc_id"] == sample['answer_docs'][0]:
            preprocessing_charspan(sample)
        else:
            sample['char_spans'] = [0, 0]
            del sample['answer_spans']
            del sample['segmented_paragraphs']
    print('load ranker')
    ranker = RankerFactory.from_exp_name(experiment.config.ranker_name,
                                         eval_flag=False)
    print('load reader')
Example #8
def evaluate2(evaluate_files,
              bert_config_path,
              weight_path,
              metric_dir,
              eval_method='bidaf_script'):
    print('load model')
    with torch.no_grad():
        model = ReaderFactory.from_exp_name('reader/bert_default',
                                            READER_CLASS='bert_reader').model
        model = model.eval()
        #dataset = make_dataset(evaluate_files)
        #iterator = make_batch_iterator(dataset,bs=128)
        loader = DureaderLoader(evaluate_files,
                                'most_related_para',
                                sample_fields=[
                                    'question', 'answers', 'question_id',
                                    'question_type'
                                ])

        dataset = BertRCDataset(loader.sample_list,
                                args.max_query_length,
                                args.max_seq_length,
                                device=args.device)
        iterator = dataset.make_batchiter(batch_size=128)
        print('Iterate Batch')
        preds = []
        for i, batch in enumerate(iterator):
            if i % 20 == 0:
                print('evaluate on %d batch' % (i))
            start_probs, end_probs = model(batch.input_ids,
                                           token_type_ids=batch.segment_ids,
                                           attention_mask=batch.input_mask)
            # use a separate index so the outer batch counter i is not shadowed
            for j in range(len(start_probs)):
                sb, eb = start_probs[j].unsqueeze(0), end_probs[j].unsqueeze(0)
                span, score = find_best_span_from_probs(sb, eb)
                score = score.item()  # the output score is not a probability, so it is not bounded to 0~1
                answer = extact_answer_from_span(batch.question[j],
                                                 batch.passage[j], span)
                preds.append({
                    'question_id': batch.question_id[j],
                    'question': batch.question[j],
                    'question_type': batch.question_type[j],
                    'answers': [answer],
                    'entity_answers': [[]],
                    'yesno_answers': [],
                    'score': score,
                    'gold': batch.answers[j]
                })

        tmp = {}
        for pred in preds:
            qid = pred['question_id']
            if qid not in tmp:
                tmp[qid] = []
            tmp[qid].append(pred)

        pred_result, ref_result = {}, {}
        # find the max-score prediction (dict) for each qid

        for qid in tmp:
            l = tmp[qid]
            max_answer = max(l, key=lambda d: d['score'])
            pred_result[qid] = max_answer

            ref = {k: v for k, v in max_answer.items()}
            ref['answers'] = max_answer['gold']
            ref_result[qid] = ref

        mrc_eval.evaluate(pred_result, ref_result)
Example #9
print('build model')
model, tokenizer, device = util.model_factory(config.BERT_SERIALIZATION_DIR,
                                              device=DEVICE)
model.load_state_dict(torch.load(expe.model_path, map_location=device))
model.eval()
_num_fn = numeralize_fucntion_factory(config.NUM_FN_NAME)
num_fn = functools.partial(_num_fn,
                           max_seq_len=config.MAX_SEQ_LEN,
                           max_passage_len=config.MAX_PASSAGE_LEN,
                           tokenizer=tokenizer,
                           device=device)

if args.para_selection == 'most_related_para' and args.label_policy == 'answer_docs':
    loader = DureaderLoader(EVAL_FILE,
                            args.para_selection,
                            sample_fields=[
                                'question', 'answer_docs', 'question_id',
                                'question_type'
                            ])
    sample_list = loader.sample_list
    sample_list = list(
        filter(lambda x: len(x['answer_docs']) != 0, sample_list))
    #print(sample_list[0])
    X = list(map(lambda x: (x['question'], x['passage']), sample_list))
    y = list(
        map(lambda x: 1
            if x['answer_docs'][0] == x['doc_id'] else 0, sample_list))
    print('total %d' % (len(X)))
    with torch.no_grad():
        metrics = [('accuracy', accuracy), ('precision', precision)]
        results = evaluate_on_examples(model,
                                       num_fn,