Example 1
def test_entities_at_the_end():
    words = "Shyam lives in New York".split()
    gold = "B-PER O O B-LOC I-LOC".split()
    pred = "B-PER O O B-LOC O".split()

    print("Input gold. This should be perfect.")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 1.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, pred, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0
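Most of the snippets collected here follow the same three-step conlleval pattern used in the test above: build an iterable of space-separated "token gold predicted" lines (with a blank line between sentences), feed it to evaluate(), then derive scores with metrics() and/or print them with report(). A minimal sketch, assuming the functions are importable from a conlleval module as in the later examples:

# Minimal sketch of the conlleval pipeline (assumes the Python port of
# conlleval exposes evaluate, metrics and report, as the snippets below do).
from conlleval import evaluate, metrics, report

lines = [
    "Shyam B-PER B-PER",
    "lives O O",
    "in O O",
    "New B-LOC B-LOC",
    "York I-LOC O",
    "",  # blank line marks a sentence boundary
]

counts = evaluate(lines)            # aggregate chunk-level counts
overall, by_type = metrics(counts)  # namedtuples with tp, fp, fn, prec, rec, fscore
report(counts)                      # print the usual CoNLL-style report
print(overall.fscore, by_type["LOC"].fscore)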
Example 2
    def eval(self, dataset_name, log_output=None):
        dataset = self.datasets.get(dataset_name, None)
        if dataset is None:
            return

        results = []
        logger.info('Evaluating {} ({})'.format(self.name, dataset_name))
        set_loss = 0
        for tokens, labels, chars, seq_lens, char_lens in dataset.get_dataset(
                volatile=True, gpu=self.gpu):
            preds, loss = self.model.predict(tokens, labels, seq_lens, chars,
                                             char_lens)
            set_loss += float(loss.data[0])
            for pred, gold, seq_len, ts in zip(preds, labels, seq_lens,
                                               tokens):
                l = int(seq_len.data[0])
                pred = pred.data.tolist()[:l]
                gold = gold.data.tolist()[:l]
                ts = ts.data.tolist()[:l]
                for p, g, t in zip(pred, gold, ts):
                    t = self.idx_token.get(t, 'UNK')
                    results.append('{} {} {}'.format(t, self.idx_label[g],
                                                     self.idx_label[p]))
                results.append('')
        counts = evaluate(results)
        overall, by_type = metrics(counts)
        report(counts)
        logger.info('Loss: {:.5f}'.format(set_loss))
        return SCORES(fscore=overall.fscore,
                      precision=overall.prec,
                      recall=overall.rec,
                      loss=set_loss)
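SCORES is not defined in this excerpt; a minimal sketch of a compatible container, assuming it is a plain namedtuple whose fields match the keyword arguments in the return statement:

# Assumption: SCORES is a simple namedtuple matching the keyword arguments
# used in the return statement above (its definition is not in this excerpt).
import collections

SCORES = collections.namedtuple('SCORES', ['fscore', 'precision', 'recall', 'loss'])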
Example 3
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A list in which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) for one batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object with a write() method (e.g. a file object) to which an extra copy of the report is written.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b,
                                                     tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK_INDEX)
                outputs.append('{} {} {}'.format(token, idx_label.get(g, 0),
                                                 idx_label.get(p, 0)))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
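A sketch of how the results argument described in the docstring might be assembled and passed in; the tensors, index dictionaries and values below are purely illustrative, and the surrounding module is assumed to provide the conlleval import and the C constants it already uses:

# Illustrative only: a single-batch `results` list matching the
# (predictions, gold labels, sequence lengths, tokens) layout in the docstring.
import torch

idx_token = {1: 'Shyam', 2: 'lives', 3: 'in', 4: 'New', 5: 'York'}
idx_label = {0: 'O', 1: 'B-PER', 2: 'B-LOC', 3: 'I-LOC'}

preds_b = torch.tensor([[1, 0, 0, 2, 0]])    # predicted label indices (batch of one)
golds_b = torch.tensor([[1, 0, 0, 2, 3]])    # gold label indices
len_b = torch.tensor([5])                    # sequence lengths
tokens_b = torch.tensor([[1, 2, 3, 4, 5]])   # token indices

results = [(preds_b, golds_b, len_b, tokens_b)]
fscore, prec, rec = evaluate(results, idx_token, idx_label)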
Example 4
    def get_results(self, name):
        p = (float(self.main_correct_count) / float(self.main_predicted_count)) if (self.main_predicted_count > 0) else 0.0
        r = (float(self.main_correct_count) / float(self.main_total_count)) if (self.main_total_count > 0) else 0.0
        f = (2.0 * p * r / (p + r)) if (p+r > 0.0) else 0.0
        f05 = ((1.0 + 0.5*0.5) * p * r / ((0.5*0.5 * p) + r)) if (p+r > 0.0) else 0.0

        results = collections.OrderedDict()
        results[name + "_cost_avg"] = self.cost_sum / float(self.token_count)
        results[name + "_cost_sum"] = self.cost_sum
        results[name + "_main_predicted_count"] = self.main_predicted_count
        results[name + "_main_total_count"] = self.main_total_count
        results[name + "_main_correct_count"] = self.main_correct_count
        results[name + "_p"] = p
        results[name + "_r"] = r
        results[name + "_f"] = f
        results[name + "_f05"] = f05
        results[name + "_accuracy"] = self.correct_sum / float(self.token_count)
        results[name + "_token_count"] = self.token_count
        results[name + "_time"] = float(time.time()) - float(self.start_time)
        results[name + "_correct_sum"] = self.correct_sum

        if self.label2id is not None and self.conll_eval == True:
            conll_counts = conlleval.evaluate(self.conll_format)
            conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts)
            results[name + "_conll_accuracy"] = float(conll_counts.correct_tags) / float(conll_counts.token_counter)
            results[name + "_conll_p"] = conll_metrics_overall.prec
            results[name + "_conll_r"] = conll_metrics_overall.rec
            results[name + "_conll_f"] = conll_metrics_overall.fscore
#            for i, m in sorted(conll_metrics_by_type.items()):
#                results[name + "_conll_p_" + str(i)] = m.prec
#                results[name + "_conll_r_" + str(i)] = m.rec
#                results[name + "_conll_f_" + str(i)] = m.fscore #str(m.fscore) + " " + str(conll_counts.t_found_guessed[i])

        return results, self.conll_format
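The p, r, f and f05 values computed at the top of get_results are the standard precision, recall and F-beta scores; written out (beta = 1 for f, beta = 0.5 for f05, which weights precision more heavily than recall):

P = \frac{\text{correct}}{\text{predicted}}, \qquad
R = \frac{\text{correct}}{\text{total}}, \qquad
F_\beta = \frac{(1+\beta^2)\,P R}{\beta^2 P + R}, \qquad
F_1 = \frac{2 P R}{P + R}, \qquad
F_{0.5} = \frac{1.25\,P R}{0.25\,P + R}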
Example 5
def conll_summary(tokens, gold, pred, config):
    """Return string summarizing performance using CoNLL criteria."""
    index_to_label = {v: k for k, v in config.label_to_index.items()}

    acc = accuracy(gold, pred)
    gold = map(lambda i: index_to_label[i], as_dense(gold))
    pred = map(lambda i: index_to_label[i], as_dense(pred))

    # Format as space-separated (token, gold, pred) strings for CoNLL eval.
    if len(tokens) != len(gold) or len(gold) != len(pred):
        raise ValueError('counts do not match')
    formatted = [' '.join(t) for t in zip(tokens, gold, pred)]

    o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
    nlen = max(len(name) for name in by_type.keys())
    summaries = [
        '%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
        (100. * acc, 100. * o.fscore, 100. * o.prec, 100. * o.rec, o.tp, o.fp,
         o.fn)
    ]
    config.results_log[config.model_name_log][
        config.dataset_name_log] = o.fscore
    for name, r in sorted(by_type.items()):
        summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
                         (nlen, name, 100. * r.fscore, 100. * r.prec,
                          100. * r.rec, r.tp, r.fp, r.fn))

    return '\n'.join(summaries)
Example 6
def evaluate(args, data, model, id2label, all_ori_tokens):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data,
                            sampler=sampler,
                            batch_size=args.train_batch_size)

    logger.info("***** Running eval *****")
    # logger.info(f" Num examples = {len(data)}")
    # logger.info(f" Batch size = {args.eval_batch_size}")
    pred_labels = []
    ori_labels = []

    for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox, bbox_pos_id,
              bbox_num) in enumerate(tqdm(dataloader, desc="Evaluating")):

        input_ids = input_ids.to(args.device)
        input_mask = input_mask.to(args.device)
        segment_ids = segment_ids.to(args.device)
        label_ids = label_ids.to(args.device)
        bbox = bbox.to(args.device)
        bbox_pos_id = bbox_pos_id.to(args.device)
        bbox_num = bbox_num.to(args.device)

        with torch.no_grad():
            logits = model.predict(input_ids, segment_ids, input_mask, bbox,
                                   bbox_pos_id, bbox_num)
        # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        # logits = logits.detach().cpu().numpy()

        for l in logits:  # logits-> List[List[int]]
            pred_labels.append([id2label[idx] for idx in l])

        for l in label_ids:  # tensor
            ori_labels.append([id2label[idx.item()] for idx in l])

    eval_list = []

    for ori_tokens, oril, prel in zip(all_ori_tokens, ori_labels, pred_labels):
        for ot, ol, pl in zip(ori_tokens, oril, prel):
            if ot in ["[CLS]", "[SEP]"]:
                continue
            if len(f"{ot} {ol} {pl}\n".split(" ")) != 3:
                continue
            eval_list.append(f"{ot} {ol} {pl}\n")
        eval_list.append("\n")

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)

    # namedtuple('Metrics', 'tp fp fn prec rec fscore')
    overall, by_type = conlleval.metrics(counts)

    return overall, by_type
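A sketch of how a caller might consume the returned namedtuples, assuming the Metrics fields named in the comment above (tp, fp, fn, prec, rec, fscore); the variable names here are illustrative, not from the original code:

# Illustrative caller: log overall scores and per-entity-type F1.
overall, by_type = evaluate(args, eval_data, model, id2label, all_ori_tokens)
logger.info("precision=%.4f recall=%.4f f1=%.4f",
            overall.prec, overall.rec, overall.fscore)
for ent_type, m in sorted(by_type.items()):
    logger.info("%s: f1=%.4f (tp=%d fp=%d fn=%d)",
                ent_type, m.fscore, m.tp, m.fp, m.fn)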
Example 7
    def on_epoch_end(self, epoch, logs=None):
        ypred = self.model.predict(self.test_features)
        c, cmat = conll_eval_counts(ypred, self.test_ground_truth, self.labels)
        ceval.report(c, prefix=self.prefix)
        print_cm(cmat, ordered_label_keys(self.labels))
        o, b = ceval.metrics(c)

        # TensorBoard requires these log values to be float64 (with an item() attribute), so we create them with numpy
        logs[self.prefix + "_conll_f1"] = np.float64(o.fscore)
        logs[self.prefix + "_conll_prec"] = np.float64(o.prec)
        logs[self.prefix + "_conll_rec"] = np.float64(o.rec)
Example 8
def calculate_labeling_scores(results, report=True):
    outputs = []
    for p_b, g_b, t_b, l_b in results:
        for p_s, g_s, t_s, l_s in zip(p_b, g_b, t_b, l_b):
            p_s = p_s[:l_s]
            for p, g, t in zip(p_s, g_s, t_s):
                outputs.append('{} {} {}'.format(t, g, p))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    if report:
        conlleval.report(counts)
    return (overall.fscore * 100.0, overall.prec * 100.0, overall.rec * 100.0)
Example 9
def test_format():
    words = "Shyam lives in New York .".split()
    gold = "B-PER O O B-LOC I-LOC O".split()
    pred = "B-PER O O B-LOC O O".split()
    print("Testing inputting the wrong format. This should get an exception")
    try:
        evaluate([1, 2, 3])
    except Exception as e:
        print(e)

    pred = "B-PER O O B-LOC I-MISC O".split()
    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.4
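The expected 0.4 follows from the chunk counts: the gold sequence contains two entities (PER "Shyam" and LOC "New York"), while the prediction yields three (PER "Shyam", LOC "New" and MISC "York"), of which only the PER chunk matches exactly, so

P = \frac{1}{3}, \qquad R = \frac{1}{2}, \qquad
F_1 = \frac{2 \cdot \frac{1}{3} \cdot \frac{1}{2}}{\frac{1}{3} + \frac{1}{2}} = 0.4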
Example 10
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A list in which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) for one batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object with a write() method (e.g. a file object) to which an extra copy of the report is written.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    # preds: predicted label indices
    # golds: gold label indices
    # len: sequence lengths
    # tokens: token indices for the original words
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK)
                # if token == '':  # debug
                #     token = '<$UNK$>'
                # print(idx_token)  # debug
                # print("p: ", p, ", g: ", g, ", t: ", t, ", corresponding token:", token, "|")  # DEBUG
                outputs.append('{} {} {}'.format(
                    token, idx_label.get(g, 0), idx_label.get(p, 0)))
            outputs.append('')
    # print("OUTPUTS: ", outputs)  # DEBUG # seems like outputs is right but counts is wrong
    # Why is english-covered-test not like the other, uncovered datasets? is this causing an issue?
    counts = conlleval.evaluate(outputs)
    # print("counts: ", counts)  # DEBUG
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
Example 11
def conll_summary(sentences):
    eval_sentences = [[(t.target_str, t.prediction_str) for t in s]
                      for s in sentences]
    gold = [t.target_str for s in sentences for t in s]
    pred = [t.prediction_str for s in sentences for t in s]
    acc = accuracy(gold, pred)
    counts = conlleval.evaluate_sentences(eval_sentences)
    overall, by_type = conlleval.metrics(counts)
    #print("By type keys: ", len(by_type.keys()))
    #nlen = max(len(name) for name in by_type.keys()) if len(by_type.keys()) > 0 else 0
    nlen = max(len(name) for name in by_type.keys())

    summaries = [(
        'acc: {acc:.2%} f: {m.fscore:.2%} ' +
        '(p:{m.prec:.1%} r:{m.rec:.1%} tp:{m.tp} fp:{m.fp} fn:{m.fn})').format(
            acc=acc, m=overall)]
    for name, r in sorted(by_type.items()):
        summaries.append(
            ('{name:{nlen}} f: {m.fscore:.2%} ' +
             '(p:{m.prec:.1%} r:{m.rec:.1%} tp:{m.tp} fp:{m.fp} fn:{m.fn})'
             ).format(name=name, nlen=nlen, m=r))
    return '\n'.join(summaries)
Example 12
def conll_summary(tokens, gold, pred, config):
  """Return string summarizing performance using CoNLL criteria."""
  index_to_label = { v: k for k, v in config.label_to_index.items() }

  acc = accuracy(gold, pred)
  gold = map(lambda i: index_to_label[i], as_dense(gold))
  pred = map(lambda i: index_to_label[i], as_dense(pred))

  # Format as space-separated (token, gold, pred) strings for CoNLL eval.
  if len(tokens) != len(gold) or len(gold) != len(pred):
    raise ValueError('counts do not match')
  formatted = [' '.join(t) for t in zip(tokens, gold, pred)]

  o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
  nlen = max(len(name) for name in by_type.keys())
  summaries = ['%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
    100.*acc, 100.*o.fscore, 100.*o.prec, 100.*o.rec,  o.tp, o.fp, o.fn
  )]
  for name, r in sorted(by_type.items()):
    summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
      nlen, name, 100.*r.fscore, 100.*r.prec, 100.*r.rec, r.tp, r.fp, r.fn
    ))

  return '\n'.join(summaries)
Example 13
def conlleval_overall_results(documents):
    """Return overall conlleval results for Documents."""
    counts = conlleval_evaluate(documents)
    overall, by_type = metrics(counts)
    return overall
Example 14
def main(argv):

    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)


    print(args.no_context)

    if args.no_context:
        train_data = process_no_context(train_words, train_tags, tokenizer, seq_len)
        test_data = process_no_context(test_words, test_tags, tokenizer, seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(train_words, train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(test_words, test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids, tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids, tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer, seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer, seq_len, args.predict_position)
    
    label_list = get_labels(train_data.labels)
    tag_map = { l: i for i, l in enumerate(label_list) }
    inv_tag_map = { v: k for k, v in tag_map.items() }

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels, tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map, seq_len)


    if args.use_ner_model and (args.ner_model_dir is not None):
        ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model

        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
            )
                
        ner_model.fit(
            train_x,
            train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
            )
        if args.ner_model_dir is not None:
            label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
            save_ner_model(ner_model, tokenizer, label_list, args)

    
    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    
    results = []
    m_names = []
    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens, test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')  
        ensemble = []
        for i,pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble
            )
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        overall = conlleval.metrics(c)[0]
        results.append([overall.prec, overall.rec, overall.fscore])



    else:
        # First tag then vote
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens, test_data.sentence_numbers)
        # Accumulate probabilities, then vote
        prob_ensemble, prob_test_first = get_predictions2(probs, test_data.tokens, test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV: Documentwise CMV
            # D-CMVP: Documentwise CMV, probs summed, argmax after that
            # D-F: Documentwise First
            # D-FP: Documentwise First, probs summed
            method_names = ['D-CMV','D-CMVP','D-F','D-FP']  
        else:           
            method_names = ['CMV','CMVP','F','FP']
        for i, ensem in enumerate(ens):
            ensemble = []
            for j,pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file, method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                    output_file, test_data.words, test_data.lengths,
                    test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch Size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            overall = conlleval.metrics(c)[0]
            results.append([overall.prec, overall.rec, overall.fscore])
            m_names.append(method_names[i])

        
    if args.sentence_in_context:     
        starting_pos = np.arange(0,seq_len+1,32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(test_data.tokens, test_data.labels, seq_len-1, start_p-1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)


            pred_tags = []
            for i, pred in enumerate(preds):
                idx = line_nos[i].index(i)
                pred_tags.append([inv_tag_map[t] for t in pred[line_starts[i][idx]+1:line_starts[i][idx]+len(test_data.tokens[i])+1]])
                
            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags
            )
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            overall = conlleval.metrics(c)[0]
            results.append([overall.prec, overall.rec, overall.fscore])

    result_file = "./results/results-{}.csv".format(args.output_file) 
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(args.output_file,
                                            args.max_seq_length, 
                                            args.bert_config_file, 
                                            args.num_train_epochs, 
                                            args.learning_rate,
                                            args.batch_size,
                                            args.predict_position,
                                            args.train_data,
                                            args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n') 

    for i in results:
        print(i)
    return 0
Example 15
    def evaluate_conlleval_string(self, conlleval_string):
        counts = conlleval.evaluate(conlleval_string.split('\n'), {'delimiter': self.separator})
        full_report = conlleval.report(counts)
        overall, per_label = conlleval.metrics(counts)
        return overall, per_label, full_report
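A hypothetical call, assuming self.separator is a tab and the input string holds one token/gold/predicted triple per line (the instance name is illustrative):

# Illustrative usage of the method above (names are assumptions).
conll_string = "Shyam\tB-PER\tB-PER\nlives\tO\tO\nin\tO\tO\n"
overall, per_label, text_report = scorer.evaluate_conlleval_string(conll_string)
print(overall.fscore)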
Example 16
                        histogram_freq=1)
        ]

        model.fit(xtr,
                  ytr,
                  batch_size=batch_size,
                  epochs=nb_epoch,
                  verbose=1,
                  validation_data=(xte, yte),
                  callbacks=callbacks)

        print('loading the currently best model for final evaluation...')
        model = load_model(checkPointPath)

        print('--------------------------------------------------')
        print('Fold ', currentFold, ' performance')
        counts, cmat = conll_eval_counts(model.predict(xte), yte, labels)
        overall, byType = ceval.metrics(counts)
        ceval.report(counts)
        print_cm(cmat, ordered_label_keys(labels))
        foldScores.append(overall.fscore)
        print('\n')
        print('avg f1 fold scores so far: ', np.mean(foldScores))
        currentFold += 1

        # we clear the tensorflow session after each fold to not leak resources
        K.clear_session()

    print('f1 fold scores: ', foldScores)
    print('final avg f1 fold scores: ', np.mean(foldScores))
Example 17
    def get_results(self, name, token_labels_available=True):
        """
        Gets the statistical results both at the sentence and at the token level.
        :param name: train, dev or test (+ epoch number).
        :param token_labels_available: whether there are token annotations.
        :return: an ordered dictionary containing the collection of results.
        """
        results = OrderedDict()

        results["name"] = name
        results["cost_sum"] = self.cost_sum
        results["cost_avg"] = (self.cost_sum / float(self.count_sent)
                               if self.count_sent else 0.0)

        results["count_sent"] = self.count_sent
        results["total_correct_sent"] = self.correct_binary_sent
        results["accuracy_sent"] = (self.correct_binary_sent / float(self.count_sent)
                                    if self.count_sent else 0.0)

        # Calculate the micro and macro averages for the sentence predictions
        f_macro_sent, p_macro_sent, r_macro_sent, f05_macro_sent = 0.0, 0.0, 0.0, 0.0
        f_non_default_macro_sent, p_non_default_macro_sent, \
            r_non_default_macro_sent, f05_non_default_macro_sent = 0.0, 0.0, 0.0, 0.0

        for key in self.id2label_sent.keys():
            p, r, f, f05 = self.calculate_metrics(
                self.sentence_correct[key],
                self.sentence_predicted[key],
                self.sentence_total[key])
            label = "label=%s" % self.id2label_sent[key]
            results[label + "_predicted_sent"] = self.sentence_predicted[key]
            results[label + "_correct_sent"] = self.sentence_correct[key]
            results[label + "_total_sent"] = self.sentence_total[key]
            results[label + "_precision_sent"] = p
            results[label + "_recall_sent"] = r
            results[label + "_f-score_sent"] = f
            results[label + "_f05-score_sent"] = f05
            p_macro_sent += p
            r_macro_sent += r
            f_macro_sent += f
            f05_macro_sent += f05
            if key != 0:
                p_non_default_macro_sent += p
                r_non_default_macro_sent += r
                f_non_default_macro_sent += f
                f05_non_default_macro_sent += f05

        p_macro_sent /= len(self.id2label_sent.keys())
        r_macro_sent /= len(self.id2label_sent.keys())
        f_macro_sent /= len(self.id2label_sent.keys())
        f05_macro_sent /= len(self.id2label_sent.keys())

        p_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)
        r_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)
        f_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)
        f05_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)

        p_micro_sent, r_micro_sent, f_micro_sent, f05_micro_sent = self.calculate_metrics(
            sum(self.sentence_correct.values()),
            sum(self.sentence_predicted.values()),
            sum(self.sentence_total.values()))

        p_non_default_micro_sent, r_non_default_micro_sent, \
            f_non_default_micro_sent, f05_non_default_micro_sent = self.calculate_metrics(
                sum([value for key, value in self.sentence_correct.items() if key != 0]),
                sum([value for key, value in self.sentence_predicted.items() if key != 0]),
                sum([value for key, value in self.sentence_total.items() if key != 0]))

        results["precision_macro_sent"] = p_macro_sent
        results["recall_macro_sent"] = r_macro_sent
        results["f-score_macro_sent"] = f_macro_sent
        results["f05-score_macro_sent"] = f05_macro_sent

        results["precision_micro_sent"] = p_micro_sent
        results["recall_micro_sent"] = r_micro_sent
        results["f-score_micro_sent"] = f_micro_sent
        results["f05-score_micro_sent"] = f05_micro_sent

        results["precision_non_default_macro_sent"] = p_non_default_macro_sent
        results["recall_non_default_macro_sent"] = r_non_default_macro_sent
        results["f-score_non_default_macro_sent"] = f_non_default_macro_sent
        results["f05-score_non_default_macro_sent"] = f05_non_default_macro_sent

        results["precision_non_default_micro_sent"] = p_non_default_micro_sent
        results["recall_non_default_micro_sent"] = r_non_default_micro_sent
        results["f-score_non_default_micro_sent"] = f_non_default_micro_sent
        results["f05-score_non_default_micro_sent"] = f05_non_default_micro_sent

        if token_labels_available or "test" in name:
            results["count_tok"] = self.count_tok
            results["total_correct_tok"] = self.correct_binary_tok
            results["accuracy_tok"] = (self.correct_binary_tok / float(self.count_tok)
                                       if self.count_tok else 0.0)

            # Calculate the micro and macro averages for the token predictions.
            f_tok_macro, p_tok_macro, r_tok_macro, f05_tok_macro = 0.0, 0.0, 0.0, 0.0
            f_non_default_macro_tok, p_non_default_macro_tok, \
                r_non_default_macro_tok, f05_non_default_macro_tok = 0.0, 0.0, 0.0, 0.0

            for key in self.id2label_tok.keys():
                p, r, f, f05 = self.calculate_metrics(
                    self.token_correct[key], self.token_predicted[key], self.token_total[key])
                label = "label=%s" % self.id2label_tok[key]
                results[label + "_predicted_tok"] = self.token_predicted[key]
                results[label + "_correct_tok"] = self.token_correct[key]
                results[label + "_total_tok"] = self.token_total[key]
                results[label + "_precision_tok"] = p
                results[label + "_recall_tok"] = r
                results[label + "_f-score_tok"] = f
                results[label + "_tok_f05"] = f05
                p_tok_macro += p
                r_tok_macro += r
                f_tok_macro += f
                f05_tok_macro += f05
                if key != 0:
                    p_non_default_macro_tok += p
                    r_non_default_macro_tok += r
                    f_non_default_macro_tok += f
                    f05_non_default_macro_tok += f05

            p_tok_macro /= len(self.id2label_tok.keys())
            r_tok_macro /= len(self.id2label_tok.keys())
            f_tok_macro /= len(self.id2label_tok.keys())
            f05_tok_macro /= len(self.id2label_tok.keys())

            p_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)
            r_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)
            f_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)
            f05_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)

            p_tok_micro, r_tok_micro, f_tok_micro, f05_tok_micro = self.calculate_metrics(
                sum(self.token_correct.values()),
                sum(self.token_predicted.values()),
                sum(self.token_total.values()))

            p_non_default_micro_tok, r_non_default_micro_tok, \
                f_non_default_micro_tok, f05_non_default_micro_tok = self.calculate_metrics(
                    sum([value for key, value in self.token_correct.items() if key != 0]),
                    sum([value for key, value in self.token_predicted.items() if key != 0]),
                    sum([value for key, value in self.token_total.items() if key != 0]))

            results["precision_macro_tok"] = p_tok_macro
            results["recall_macro_tok"] = r_tok_macro
            results["f-score_macro_tok"] = f_tok_macro
            results["f05-score_macro_tok"] = f05_tok_macro

            results["precision_micro_tok"] = p_tok_micro
            results["recall_micro_tok"] = r_tok_micro
            results["f-score_micro_tok"] = f_tok_micro
            results["f05-score_micro_tok"] = f05_tok_micro

            results["precision_non_default_macro_tok"] = p_non_default_macro_tok
            results["recall_non_default_macro_tok"] = r_non_default_macro_tok
            results["f-score_non_default_macro_tok"] = f_non_default_macro_tok
            results["f05-score_non_default_macro_tok"] = f05_non_default_macro_tok

            results["precision_non_default_micro_tok"] = p_non_default_micro_tok
            results["recall_non_default_micro_tok"] = r_non_default_micro_tok
            results["f-score_non_default_micro_tok"] = f_non_default_micro_tok
            results["f05-score_non_default_micro_tok"] = f05_non_default_micro_tok

            if self.id2label_tok is not None and self.conll03_eval is True:
                conll_counts = conlleval.evaluate(self.conll_format)
                conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts)
                results["conll_accuracy"] = (float(conll_counts.correct_tags)
                                             / float(conll_counts.token_counter))
                results["conll_p"] = conll_metrics_overall.prec
                results["conll_r"] = conll_metrics_overall.rec
                results["conll_f"] = conll_metrics_overall.fscore

        results["time"] = float(time.time()) - float(self.start_time)
        return results
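The calculate_metrics helper called throughout get_results is not part of this excerpt; a minimal sketch of what it presumably computes from (correct, predicted, total) counts, mirroring the formulas used in Example 4 (this is an inferred implementation, not the original):

# Hypothetical helper (inferred, not from the original source): precision,
# recall, F1 and F0.5 from raw counts, guarding against division by zero.
def calculate_metrics(correct, predicted, total):
    p = correct / float(predicted) if predicted > 0 else 0.0
    r = correct / float(total) if total > 0 else 0.0
    f = 2.0 * p * r / (p + r) if (p + r) > 0.0 else 0.0
    f05 = (1.0 + 0.5 ** 2) * p * r / ((0.5 ** 2) * p + r) if (p + r) > 0.0 else 0.0
    return p, r, f, f05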