def validate_dataset(model, split, tokenizer, topk=1):
    assert split in ('dev', 'test')
    dataloader = get_dataloader('xlnet', split, tokenizer, bwd=False,
                                batch_size=16, num_workers=16)
    em, f1, count = 0, 0, 0

    # Widen XLNet's answer beam so the QA head returns top-k start/end candidates.
    model.start_n_top = topk
    model.end_n_top = topk
    model.eval()
    for batch in dataloader:
        input_ids, attention_mask, token_type_ids, input_tokens_no_unk, answers = batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        # outputs[1] holds the top-k start indices; outputs[3] holds the end
        # indices flattened per start candidate, so reshape and keep the best
        # end for each start.
        start_index = outputs[1]
        end_index = outputs[3].view(-1, model.end_n_top,
                                    model.start_n_top).permute(0, 2, 1)[:, :, 0]
        for i, answer in enumerate(answers):
            preds = []
            for k in range(model.start_n_top):
                pred_tokens = input_tokens_no_unk[i][
                    start_index[i][k]:end_index[i][k] + 1]
                preds.append(tokenizer.convert_tokens_to_string(pred_tokens))

            norm_preds_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(pred) for pred in preds
            ]
            norm_preds = [
                norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                for norm_pred_tokens in norm_preds_tokens
            ]
            norm_answer_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(ans) for ans in answer
            ]
            norm_answer = [
                norm_tokenizer.convert_tokens_to_string(ans_tokens)
                for ans_tokens in norm_answer_tokens
            ]

            em += max(
                metric_max_over_ground_truths(exact_match_score, norm_pred,
                                              norm_answer)
                for norm_pred in norm_preds)
            f1 += max(
                metric_max_over_ground_truths(f1_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            count += 1
    del dataloader
    return em, f1, count
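
A minimal usage sketch (model, tokenizer and the module-level device /
norm_tokenizer globals are assumed to be set up elsewhere); the function
returns raw sums, so divide by count for percentages:

em, f1, count = validate_dataset(model, 'dev', tokenizer, topk=5)
print('EM: {:.2f}%  F1: {:.2f}%'.format(100.0 * em / count, 100.0 * f1 / count))
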
Example #2
def evaluate_q_types(dataset, predictions):
    q_one_grams = {}
    q_two_grams = {}

    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:

                qa_split = qa['question'].split()
                first_word = qa_split[0]
                # join() tolerates one-word questions without an IndexError
                first_2_words = " ".join(qa_split[:2])

                if first_word not in q_one_grams:
                    q_one_grams[first_word] = {'f1': 0.0, 'em': 0.0, 'count': 0}
                if first_2_words not in q_two_grams:
                    q_two_grams[first_2_words] = {'f1': 0.0, 'em': 0.0, 'count': 0}

                q_one_grams[first_word]['count'] += 1
                q_two_grams[first_2_words]['count'] += 1

                if qa['id'] not in predictions:
                    continue
                
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                
                em = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
                f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths)

                q_one_grams[first_word]['f1'] += f1
                q_one_grams[first_word]['em'] += em
                q_two_grams[first_2_words]['f1'] += f1
                q_two_grams[first_2_words]['em'] += em

    results_1 = {}
    for key in q_one_grams:
        val = q_one_grams[key]
        results_1[key] = {'f1': 100.0 * val['f1'] / val['count'],
                          'em': 100.0 * val['em'] / val['count'],
                          'count': val['count']}

    sorted_results_1 = sorted(results_1.items(),
                              key=lambda kv: kv[1]['count'], reverse=True)

    results_2 = {}
    for key in q_two_grams:
        val = q_two_grams[key]
        results_2[key] = {'f1': 100.0 * val['f1'] / val['count'],
                          'em': 100.0 * val['em'] / val['count'],
                          'count': val['count']}

    sorted_results_2 = sorted(results_2.items(),
                              key=lambda kv: kv[1]['count'], reverse=True)

    return {'one grams': sorted_results_1[:20], 'two grams': sorted_results_2[:20]}
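
A hypothetical usage sketch, assuming dataset is the 'data' list of a
SQuAD-style JSON file and predictions maps question ids to answer strings:

results = evaluate_q_types(dataset, predictions)
for gram, stats in results['two grams']:
    print('{:<20s} n={:<6d} EM={:5.1f} F1={:5.1f}'.format(
        gram, stats['count'], stats['em'], stats['f1']))
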
Example #3
def evaluation_devresult(pre_result, target_result):
    '''
    QA evaluation: average EM and F1 over parallel lists of
    predictions and ground-truth answers.
    '''
    f1 = exact_match = total = 0
    for i in range(len(pre_result)):
        total += 1
        prediction = pre_result[i]
        ground_truths = target_result[i]
        exact_match += evaluate.metric_max_over_ground_truths(
            evaluate.exact_match_score, prediction, ground_truths)
        f1 += evaluate.metric_max_over_ground_truths(
            evaluate.f1_score, prediction, ground_truths)
    exact_match = exact_match / total
    f1 = f1 / total
    return exact_match, f1
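
A short usage note, assuming pre_result is a list of predicted strings and
target_result is a parallel list of ground-truth answer lists; both scores
come back as fractions of 1:

em, f1 = evaluation_devresult(pre_result, target_result)
print('EM: {:.2%}  F1: {:.2%}'.format(em, f1))
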
Example #4
def validate_dataset(model, split, tokenizer, topk=1, prefix=None):
    assert split in ('dev', 'test')
    fwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=False,
                                    batch_size=16, num_workers=16,
                                    prefix=prefix)
    bwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=True,
                                    batch_size=16, num_workers=16,
                                    prefix=prefix)
    em, f1, count = 0, 0, 0

    model.eval()
    for fwd_batch, bwd_batch in zip(fwd_dataloader, bwd_dataloader):
        # FWD
        input_ids, attention_mask, token_type_ids, margin_mask, fwd_input_tokens_no_unks, answers = fwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        margin_mask = margin_mask.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        start_logits, end_logits = outputs[0], outputs[1]
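        # margin_mask presumably holds large negative values outside the
        # context, so the top-k below never picks question or padding tokens.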
        start_logits += margin_mask
        end_logits += margin_mask
        start_logits = start_logits.cpu().clone()
        fwd_end_logits = end_logits.cpu().clone()

        # Raw logits are used as scores: softmax is monotonic, so the top-k
        # ranking is unchanged and logit sums below act as joint log scores.
        start_probs = start_logits
        fwd_start_probs, fwd_start_index = start_probs.topk(topk * 5, dim=1)

        # BWD
        input_ids, attention_mask, token_type_ids, margin_mask, bwd_input_tokens_no_unks, answers = bwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        margin_mask = margin_mask.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        start_logits, end_logits = outputs[0], outputs[1]
        start_logits += margin_mask
        end_logits += margin_mask
        start_logits = start_logits.cpu().clone()
        bwd_end_logits = end_logits.cpu().clone()

        start_probs = start_logits  # raw logits as scores, as above
        bwd_start_probs, bwd_start_index = start_probs.topk(topk * 5, dim=1)

        # FWD-BWD
        for i, answer in enumerate(answers):
            preds, probs = [], []
            for n in range(topk * 5):
                # FWD
                start_prob = fwd_start_probs[i][n].item()
                start_ind = fwd_start_index[i][n].item()
                beam_end_logits = fwd_end_logits[i].clone().unsqueeze(0)

                end_probs = beam_end_logits
                # Restrict candidate ends to within 20 tokens after the start.
                end_probs[0, :start_ind] += -1e10
                end_probs[0, start_ind + 20:] += -1e10
                end_probs, end_index = end_probs.topk(topk * 5, dim=1)

                # topk*topk combination
                for m in range(topk * 5):
                    end_prob = end_probs[0][m].item()
                    end_ind = end_index[0][m].item()

                    prob = start_prob + end_prob  # logit sum = joint log score
                    span_tokens = fwd_input_tokens_no_unks[i][
                        start_ind:end_ind + 1]
                    pred = convert_tokens_to_string(span_tokens)

                    if pred == tokenizer.sep_token or pred == '':
                        continue
                    if pred not in preds:
                        probs.append(prob)
                        preds.append(pred)
                    else:
                        # Keep the highest score seen for a duplicate span.
                        pred_idx = preds.index(pred)
                        if prob > probs[pred_idx]:
                            probs[pred_idx] = prob

                # BWD
                start_prob = bwd_start_probs[i][n].item()
                start_ind = bwd_start_index[i][n].item()
                beam_end_logits = bwd_end_logits[i].clone().unsqueeze(0)

                end_probs = beam_end_logits
                # Restrict candidate ends to within 20 tokens after the start.
                end_probs[0, :start_ind] += -1e10
                end_probs[0, start_ind + 20:] += -1e10
                end_probs, end_index = end_probs.topk(topk * 5, dim=1)

                # topk*topk combination
                for m in range(topk * 5):
                    end_prob = end_probs[0][m].item()
                    end_ind = end_index[0][m].item()

                    prob = start_prob + end_prob  # logit sum = joint log score
                    span_tokens = bwd_input_tokens_no_unks[i][
                        start_ind:end_ind + 1]
                    pred = convert_tokens_to_string(span_tokens)

                    if pred == tokenizer.sep_token or pred == '':
                        continue
                    if pred not in preds:
                        probs.append(prob)
                        preds.append(pred)
                    else:
                        # Keep the highest score seen for a duplicate span.
                        pred_idx = preds.index(pred)
                        if prob > probs[pred_idx]:
                            probs[pred_idx] = prob

            count += 1
            if len(preds) > 0:
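                # Rank the merged forward/backward candidates by score and
                # keep only the top-k before normalization and scoring.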
                sorted_probs_preds = list(reversed(sorted(zip(probs, preds))))
                probs, preds = map(list, zip(*sorted_probs_preds))
                probs, preds = probs[:topk], preds[:topk]

                norm_preds_tokens = [
                    norm_tokenizer.basic_tokenizer.tokenize(pred)
                    for pred in preds
                ]
                norm_preds = [
                    norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                    for norm_pred_tokens in norm_preds_tokens
                ]
                norm_answer_tokens = [
                    norm_tokenizer.basic_tokenizer.tokenize(ans)
                    for ans in answer
                ]
                norm_answer = [
                    norm_tokenizer.convert_tokens_to_string(ans_tokens)
                    for ans_tokens in norm_answer_tokens
                ]

                em += max(
                    metric_max_over_ground_truths(exact_match_score, norm_pred,
                                                  norm_answer)
                    for norm_pred in norm_preds)
                f1 += max(
                    metric_max_over_ground_truths(f1_score, norm_pred,
                                                  norm_answer)
                    for norm_pred in norm_preds)

    del fwd_dataloader, bwd_dataloader
    return em, f1, count
Example #5
def validate_dataset(model, split, tokenizer, dataset, topk=1):
    assert split in ('dev', 'test')
    fwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=False,
                                    batch_size=16, num_workers=16,
                                    prefix=dataset)
    bwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=True,
                                    batch_size=16, num_workers=16,
                                    prefix=dataset)
    em, f1, count = 0, 0, 0

    model.eval()
    for fwd_batch, bwd_batch in zip(fwd_dataloader, bwd_dataloader):
        # Forward
        input_ids, attention_mask, token_type_ids, fwd_input_tokens_no_unk, answers = fwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        start_logits, end_logits = outputs[0], outputs[1]
        start_probs = softmax(start_logits, dim=1)
        end_probs = softmax(end_logits, dim=1)
        fwd_start_probs, fwd_start_index = start_probs.topk(topk, dim=1)
        fwd_end_probs, fwd_end_index = end_probs.topk(topk, dim=1)

        # Backward
        input_ids, attention_mask, token_type_ids, bwd_input_tokens_no_unk, answers = bwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        start_logits, end_logits = outputs[0], outputs[1]
        start_probs = softmax(start_logits, dim=1)
        end_probs = softmax(end_logits, dim=1)
        bwd_start_probs, bwd_start_index = start_probs.topk(topk, dim=1)
        bwd_end_probs, bwd_end_index = end_probs.topk(topk, dim=1)

        for i, answer in enumerate(answers):
            preds = []
            if topk <= 1:
                span_tokens = fwd_input_tokens_no_unk[i][
                    fwd_start_index[i][0]:fwd_end_index[i][0] + 1]
                preds.append(tokenizer.convert_tokens_to_string(span_tokens))
                span_tokens = bwd_input_tokens_no_unk[i][
                    bwd_start_index[i][0]:bwd_end_index[i][0] + 1]
                preds.append(tokenizer.convert_tokens_to_string(span_tokens))
            else:
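                # Score every (start, end) pair as p(start) * p(end), take the
                # top-k joint spans; smap/emap recover the 2-D indices.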
                joint_probs, joint_index = (
                    fwd_start_probs[i].unsqueeze(1) *
                    fwd_end_probs[i].unsqueeze(0)).view(topk * topk).topk(topk)
                for n in range(topk):
                    smap = joint_index[n] // topk
                    emap = joint_index[n] - smap * topk
                    span_tokens = fwd_input_tokens_no_unk[i][
                        fwd_start_index[i][smap]:fwd_end_index[i][emap] + 1]
                    preds.append(
                        tokenizer.convert_tokens_to_string(span_tokens))

                joint_probs, joint_index = (
                    bwd_start_probs[i].unsqueeze(1) *
                    bwd_end_probs[i].unsqueeze(0)).view(topk * topk).topk(topk)
                for n in range(topk):
                    smap = joint_index[n] // topk
                    emap = joint_index[n] - smap * topk
                    span_tokens = bwd_input_tokens_no_unk[i][
                        bwd_start_index[i][smap]:bwd_end_index[i][emap] + 1]
                    preds.append(
                        tokenizer.convert_tokens_to_string(span_tokens))

            norm_preds_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(pred) for pred in preds
            ]
            norm_preds = [
                norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                for norm_pred_tokens in norm_preds_tokens
            ]
            norm_answer_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(ans) for ans in answer
            ]
            norm_answer = [
                norm_tokenizer.convert_tokens_to_string(ans_tokens)
                for ans_tokens in norm_answer_tokens
            ]

            em += max(
                metric_max_over_ground_truths(exact_match_score, norm_pred,
                                              norm_answer)
                for norm_pred in norm_preds)
            f1 += max(
                metric_max_over_ground_truths(f1_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            count += 1
    del fwd_dataloader, bwd_dataloader
    return em, f1, count
Example #6
def validate_dataset(model, split, tokenizer, topk=5):
    assert split in ('dev', 'test')
    dataloader = get_dataloader('xlnet', split, tokenizer, bwd=False,
                                batch_size=8, num_workers=8)
    em, f1, count = 0, 0, 0

    model.start_n_top = topk
    model.end_n_top = topk
    model.eval()
    for batch in dataloader:
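        # Move the tensor fields to the GPU; the last two entries
        # (input_tokens_no_unk, answers) are Python lists and stay on the CPU.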
        batch = (*(tensor.cuda(device) for tensor in batch[:-2]), *batch[-2:])
        input_ids, attention_mask, token_type_ids, cls_index, input_tokens_no_unk, answers = batch
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            cls_index=cls_index)

        start_index = outputs[1]
        end_index = outputs[3][:, :, 0]
        op_types = outputs[4]
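        # op_type picks the decoder per candidate: 0 = plain span,
        # 1/2 = add/subtract extracted numbers, 3/4 = shift a date/duration.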
        for i, answer in enumerate(answers):
            preds = []
            for k in range(model.start_n_top):
                op_type = op_types[i][k].argmax().item()
                if op_type == 0:
                    pred_tokens = input_tokens_no_unk[i][
                        start_index[i][k]:end_index[i][k] + 1]
                    pred = tokenizer.convert_tokens_to_string(pred_tokens)
                elif op_type == 1:
                    pred = arithmetic_op(tokenizer,
                                         num_match_re,
                                         input_tokens_no_unk[i],
                                         start_index[i][k],
                                         end_index[i][k],
                                         plus=True)
                elif op_type == 2:
                    pred = arithmetic_op(tokenizer,
                                         num_match_re,
                                         input_tokens_no_unk[i],
                                         start_index[i][k],
                                         end_index[i][k],
                                         plus=False)
                elif op_type == 3:
                    pred = date_duration_op(tokenizer,
                                            date_re,
                                            dur_re,
                                            tn,
                                            input_tokens_no_unk[i],
                                            start_index[i][k],
                                            end_index[i][k],
                                            plus=True)
                elif op_type == 4:
                    pred = date_duration_op(tokenizer,
                                            date_re,
                                            dur_re,
                                            tn,
                                            input_tokens_no_unk[i],
                                            start_index[i][k],
                                            end_index[i][k],
                                            plus=False)
                preds.append(pred)

            norm_preds_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(pred) for pred in preds
            ]
            norm_preds = [
                norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                for norm_pred_tokens in norm_preds_tokens
            ]
            norm_answer_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(ans) for ans in answer
            ]
            norm_answer = [
                norm_tokenizer.convert_tokens_to_string(ans_tokens)
                for ans_tokens in norm_answer_tokens
            ]

            em += max(
                metric_max_over_ground_truths(exact_match_score, norm_pred,
                                              norm_answer)
                for norm_pred in norm_preds)
            f1 += max(
                metric_max_over_ground_truths(f1_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            count += 1
    del dataloader
    return em, f1, count
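
All of the examples above lean on the same helpers from the official SQuAD
v1.1 evaluation script; a self-contained copy (lightly condensed to a max()
over ground truths) for reference:

import re
import string
from collections import Counter


def normalize_answer(s):
    """Lowercase, strip punctuation and articles, collapse whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    return white_space_fix(remove_articles(remove_punc(s.lower())))


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    return max(metric_fn(prediction, ground_truth)
               for ground_truth in ground_truths)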