Example #1
    def eval(self, dataset_name, log_output=None):
        dataset = self.datasets.get(dataset_name, None)
        if dataset is None:
            return

        results = []
        logger.info('Evaluating {} ({})'.format(self.name, dataset_name))
        set_loss = 0
        for tokens, labels, chars, seq_lens, char_lens in dataset.get_dataset(
                volatile=True, gpu=self.gpu):
            preds, loss = self.model.predict(tokens, labels, seq_lens, chars,
                                             char_lens)
            set_loss += float(loss.data[0])
            for pred, gold, seq_len, ts in zip(preds, labels, seq_lens,
                                               tokens):
                l = int(seq_len.data[0])
                pred = pred.data.tolist()[:l]
                gold = gold.data.tolist()[:l]
                ts = ts.data.tolist()[:l]
                for p, g, t in zip(pred, gold, ts):
                    t = self.idx_token.get(t, 'UNK')
                    results.append('{} {} {}'.format(t, self.idx_label[g],
                                                     self.idx_label[p]))
                results.append('')
        counts = evaluate(results)
        overall, by_type = metrics(counts)
        report(counts)
        logger.info('Loss: {:.5f}'.format(set_loss))
        return SCORES(fscore=overall.fscore,
                      precision=overall.prec,
                      recall=overall.rec,
                      loss=set_loss)
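Most of the examples on this page feed conlleval the same plain-text format: one "token gold-tag predicted-tag" line per token, with a blank line between sentences. For reference, here is a minimal, self-contained sketch of that flow; it assumes the commonly used conlleval.py port in which evaluate() consumes an iterable of such lines, metrics() returns (overall, by_type) named tuples, and report() prints the CoNLL-style summary.

from conlleval import evaluate, metrics, report

# One "token gold pred" line per token; an empty string ends a sentence.
lines = [
    'John B-PER B-PER',
    'lives O O',
    'in O O',
    'Paris B-LOC O',
    '',
]
counts = evaluate(lines)            # aggregate chunk-level counts
overall, by_type = metrics(counts)  # namedtuples with prec/rec/fscore fields
report(counts)                      # print per-type precision, recall, F1
print(overall.fscore, overall.prec, overall.rec)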
Example #2
def conlleval_report(documents):
    """Return conlleval evaluation report for Documents as string."""
    # conlleval.py has a file-based API, so use StringIO
    counts = conlleval_evaluate(documents)
    report_string = StringIO()
    report(counts, out=report_string)
    return report_string.getvalue()
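Since report() writes to any object with a write() method (standard output when no out argument is given), the same capture trick works for a file on disk as well as a StringIO buffer. A small sketch, assuming the line-based conlleval.evaluate() used in the other examples:

import conlleval

lines = ['Paris B-LOC B-LOC', 'is O O', 'nice O O', '']
counts = conlleval.evaluate(lines)
with open('conlleval_report.txt', 'w') as out:
    conlleval.report(counts, out=out)  # same summary, written to a file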
Example #3
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A list in which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) for one batch.
    :param idx_token: Index-to-token dictionary.
    :param idx_label: Index-to-label dictionary.
    :param writer: A file-like object with a write() method; the report is
        also written there as extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b,
                                                     tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK_INDEX)
                outputs.append('{} {} {}'.format(token, idx_label.get(g, 0),
                                                 idx_label.get(p, 0)))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
Example #4
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length  # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    train_data = process_sentences(train_words, train_tags, tokenizer, seq_len)
    test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)

    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}

    init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

    train_y, train_weights = label_encode(train_data.combined_labels, tag_map,
                                          seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map,
                                        seq_len)

    ner_model = create_ner_model(pretrained_model, len(tag_map))
    optimizer = create_optimizer(len(train_x[0]), args)

    ner_model.compile(optimizer,
                      loss='sparse_categorical_crossentropy',
                      sample_weight_mode='temporal',
                      metrics=['sparse_categorical_accuracy'])

    ner_model.fit(train_x,
                  train_y,
                  sample_weight=train_weights,
                  epochs=args.num_train_epochs,
                  batch_size=args.batch_size)

    if args.ner_model_dir is not None:
        label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
        save_ner_model(ner_model, tokenizer, label_list, args)
        save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    pred_tags = []
    for i, pred in enumerate(preds):
        pred_tags.append(
            [inv_tag_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]])

    lines = write_result(args.output_file, test_data.words, test_data.lengths,
                         test_data.tokens, test_data.labels, pred_tags)

    c = conlleval.evaluate(lines)
    conlleval.report(c)
    return 0
Example #5
def evaluate(args, data, model, id2label, all_ori_tokens):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data,
                            sampler=sampler,
                            batch_size=args.train_batch_size)

    logger.info("***** Running eval *****")
    # logger.info(f" Num examples = {len(data)}")
    # logger.info(f" Batch size = {args.eval_batch_size}")
    pred_labels = []
    ori_labels = []

    for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox, bbox_pos_id,
              bbox_num) in enumerate(tqdm(dataloader, desc="Evaluating")):

        input_ids = input_ids.to(args.device)
        input_mask = input_mask.to(args.device)
        segment_ids = segment_ids.to(args.device)
        label_ids = label_ids.to(args.device)
        bbox = bbox.to(args.device)
        bbox_pos_id = bbox_pos_id.to(args.device)
        bbox_num = bbox_num.to(args.device)

        with torch.no_grad():
            logits = model.predict(input_ids, segment_ids, input_mask, bbox,
                                   bbox_pos_id, bbox_num)
        # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        # logits = logits.detach().cpu().numpy()

        for l in logits:  # logits-> List[List[int]]
            pred_labels.append([id2label[idx] for idx in l])

        for l in label_ids:  # tensor
            ori_labels.append([id2label[idx.item()] for idx in l])

    eval_list = []

    for ori_tokens, oril, prel in zip(all_ori_tokens, ori_labels, pred_labels):
        for ot, ol, pl in zip(ori_tokens, oril, prel):
            if ot in ["[CLS]", "[SEP]"]:
                continue
            if len(f"{ot} {ol} {pl}\n".split(" ")) != 3:
                continue
            eval_list.append(f"{ot} {ol} {pl}\n")
        eval_list.append("\n")

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)

    # namedtuple('Metrics', 'tp fp fn prec rec fscore')
    overall, by_type = conlleval.metrics(counts)

    return overall, by_type
Example #6
    def on_epoch_end(self, epoch, logs=None):
        ypred = self.model.predict(self.test_features)
        c, cmat = conll_eval_counts(ypred, self.test_ground_truth, self.labels)
        ceval.report(c, prefix=self.prefix)
        print_cm(cmat, ordered_label_keys(self.labels))
        o, b = ceval.metrics(c)

        # TensorBoard requires these log values to be float64 with an item() attribute, so we create them with NumPy
        logs[self.prefix + "_conll_f1"] = np.float64(o.fscore)
        logs[self.prefix + "_conll_prec"] = np.float64(o.prec)
        logs[self.prefix + "_conll_rec"] = np.float64(o.rec)
Example #7
def evaluate(args, task_id, data, model, id2label, all_ori_words, file_name=None):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=args.train_batch_size)
    task_id = torch.tensor(task_id, dtype=torch.long).to(args.device)

    logger.info("***** Running eval *****")
    logger.info(f" Num examples = {len(data)}")

    pred_labels = []
    ori_labels = []
    for b_i, batch in enumerate(tqdm(dataloader, desc="Evaluating")):
        batch = tuple(t.to(args.device) for t in batch)
        if args.need_charcnn:
            input_word_ids, input_mask, label_ids, label_mask, char_ids = batch
        else:
            input_word_ids, input_mask, label_ids, label_mask = batch
            char_ids = None

        with torch.no_grad():
            logits = model.predict(task_id, input_word_ids, char_ids, input_mask)

        # print(len(all_ori_words), [len(x) for x in all_ori_words])
        # print(len(logits), [len(x) for x in logits])
        # print(len(label_ids), [len(x) for x in label_ids])
        # print(len(input_mask), [sum(x) for x in input_mask])
        # print(len(label_mask), [sum(x) for x in label_mask])

        for predL, goldL, maskL in zip(logits, label_ids, label_mask):
            for p, g, mask in zip(predL, goldL, maskL):
                if mask.item() == 1:
                    pred_labels.append(id2label[p])
                    ori_labels.append(id2label[g.item()])
            pred_labels.append(None)
            ori_labels.append(None)
    ori_words = []
    for sent in all_ori_words:
        ori_words.extend(sent+[None])
    eval_list = []
    # print(len(pred_labels), len(ori_labels), len(ori_words))
    for plabel, olabel, word in zip(pred_labels, ori_labels, ori_words):
        if plabel is not None:
            eval_list.append(f"{word} {olabel} {plabel}\n")
        else:
            eval_list.append("\n")

    if file_name is not None:
        with open(file_name, "w", encoding="utf-8") as f:
            for line in eval_list:
                f.write(line)

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)
Example #8
def calculate_labeling_scores(results, report=True):
    outputs = []
    for p_b, g_b, t_b, l_b in results:
        for p_s, g_s, t_s, l_s in zip(p_b, g_b, t_b, l_b):
            p_s = p_s[:l_s]
            for p, g, t in zip(p_s, g_s, t_s):
                outputs.append('{} {} {}'.format(t, g, p))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    if report:
        conlleval.report(counts)
    return (overall.fscore * 100.0, overall.prec * 100.0, overall.rec * 100.0)
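A hypothetical toy call for the function above, only to illustrate the expected shape of results: each entry is one batch given as (predicted tags, gold tags, tokens, sequence lengths), with the tags already decoded to strings. It assumes the module's conlleval import is available.

# Toy input (hypothetical): a single batch containing one sentence.
preds   = [['B-PER', 'O', 'O']]
golds   = [['B-PER', 'O', 'B-LOC']]
tokens  = [['John', 'lives', 'here']]
lengths = [3]

f1, prec, rec = calculate_labeling_scores([(preds, golds, tokens, lengths)],
                                          report=False)
print(f1, prec, rec)  # percentages, since the function scales by 100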
Example #9
def get_output_file(all_logit, all_label, decode, out):
    decode.pop(len(decode) - 1)
    assert len(all_logit) == len(all_label)
    evalseq = []
    for i in range(len(all_logit)):
        evalseq.append("{} {} {}".format(
            i,
            decode[int(all_label[i])]
            if int(all_label[i]) in decode.keys() else "O",
            decode[int(all_logit[i])]
            if int(all_logit[i]) in decode.keys() else "O",
        ))

    count = conlleval.evaluate(evalseq)
    conlleval.report(count, out)
Example #10
def test_format():
    words = "Shyam lives in New York .".split()
    gold = "B-PER O O B-LOC I-LOC O".split()
    pred = "B-PER O O B-LOC O O".split()
    print("Testing inputting the wrong format. This should get an exception")
    try:
        evaluate([1, 2, 3])
    except Exception as e:
        print(e)

    pred = "B-PER O O B-LOC I-MISC O".split()
    print("This should be 40% F1 (I-MISC starts a new chunk, so only PER matches)")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.4
Example #11
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A list in which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) for one batch.
    :param idx_token: Index-to-token dictionary.
    :param idx_label: Index-to-label dictionary.
    :param writer: A file-like object with a write() method; the report is
        also written there as extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    # preds: predictions
    # golds: answers?
    # len: length of something
    # tokens: original words?
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK)
                # if token == '':  # debug
                #     token = '<$UNK$>'
                # print(idx_token)  # debug
                # print("p: ", p, ", g: ", g, ", t: ", t, ", corresponding token:", token, "|")  # DEBUG
                outputs.append('{} {} {}'.format(
                    token, idx_label.get(g, 0), idx_label.get(p, 0)))
            outputs.append('')
    # print("OUTPUTS: ", outputs)  # DEBUG # seems like outputs is right but counts is wrong
    # Why is english-covered-test not like the other, uncovered datasets? is this causing an issue?
    counts = conlleval.evaluate(outputs)
    # print("counts: ", counts)  # DEBUG
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
Example #12
def test_entities_at_the_end():
    words = "Shyam lives in New York".split()
    gold = "B-PER O O B-LOC I-LOC".split()
    pred = "B-PER O O B-LOC O".split()

    print("Input gold. This should be perfect.")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 1.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, pred, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0
Example #13
def compare(gold_toks, gold_tags, pred_toks, pred_tags):
    if len(gold_toks) != len(pred_toks):
        raise ValueError('sentence count mismatch: {} in gold, {} in pred'.\
                         format(len(gold_toks), len(pred_toks)))
    lines = []
    for g_toks, g_tags, p_toks, p_tags in zip(gold_toks, gold_tags, pred_toks,
                                              pred_tags):
        if g_toks != p_toks:
            raise ValueError('text mismatch: gold "{}", pred "{}"'.\
                             format(g_toks, p_toks))
        for (g_tok, g_tag, p_tag) in zip(g_toks, g_tags, p_tags):
            lines.append('{}\t{}\t{}'.format(g_tok, g_tag, p_tag))

    return conlleval.report(conlleval.evaluate(lines))
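A hypothetical invocation of compare() above, assuming sentences are given as nested token/tag lists; the gold and predicted token sequences must match exactly, otherwise a ValueError is raised.

# Toy data (hypothetical): one sentence with BIO tags.
gold_toks = [['John', 'lives', 'in', 'Paris']]
gold_tags = [['B-PER', 'O', 'O', 'B-LOC']]
pred_tags = [['B-PER', 'O', 'O', 'O']]

compare(gold_toks, gold_tags, gold_toks, pred_tags)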
Example #14
def predict_test_file(fname, input_dim, timesteps, nlabels, labels):
    print('loading data from file ', fname)
    df = pd.read_csv(fname, sep=' ', header=0)
    X = extract_features(df, timesteps, input_dim)
    y = extract_labels(df, timesteps, nlabels)

    print('X temporal reshape: ', X.shape)
    print('y temporal reshape: ', y.shape)
    print('#samples: ', len(X))
    print('#labels: ', len(y))

    # we are averaging over all models' output probabilities and then just taking the max
    m_preds = np.zeros((X.shape[0], timesteps, nlabels))
    for model in models:
        m_preds = m_preds + model.predict(X)

    m_preds = m_preds / len(models)

    # just count and report and we are done
    counts, conf_matrix = conll_eval_counts(m_preds, y, labels)
    print('file: ', fname)
    ceval.report(counts)
    print_cm(conf_matrix, ordered_label_keys(labels))
Example #15
def biobert_metrics(model: NERInferenceSession, input_path: str,
                    output_path: str):
    with open(input_path, "r") as f:
        data = f.readlines()

    total = 0
    for i in data:
        if i == "\n":
            total += 1

    print("Running over " + str(total) +
          " sentences")  #changed counter to total

    confusion_matrix: CounterT[str] = Counter()
    token_matrix: DefaultDict[str, DefaultDict[str, int]] = defaultdict(
        lambda: defaultdict(int))

    gs_labels: List[str] = []
    sequence = ""
    line_list = list()

    counter = 0

    for line in data:

        if line == "\n":
            counter += 1

            sys.stdout.write("Predicted {}/{} sentences so far.\r".format(
                counter, total))
            sys.stdout.flush()

            pred_pairs = model.predict(sequence.strip())

            tokens = sequence.strip().split()

            # The tokenization label X and special labels hold no more value
            pred_labels = [
                label[1] for label in pred_pairs if label[1] != 'X'
                and label[0] != '[CLS]' and label[0] != '[SEP]'
            ]

            cm, tm = sentence_metrics(pred_labels, gs_labels)

            confusion_matrix.update(cm)

            for gs_label in tm:
                for pred_label in tm[gs_label]:
                    token_matrix[gs_label][pred_label] += tm[gs_label][
                        pred_label]

            line_list = line_list + list(
                map(lambda token, gs, pred: token + " TK " + gs + " " + pred,
                    tokens, gs_labels, pred_labels))

            gs_labels = []
            sequence = ""
            continue

        columns = line.split("\t")
        sequence += columns[0] + " "
        gs_labels.append(columns[1].strip())

        #if counter_2 == 1000:
        #break

    conlleval_res = conlleval.report(conlleval.evaluate(line_list))
    print(conlleval_res)

    # CM
    cm_r = confusion_matrix["true_positive"] / (
        confusion_matrix["true_positive"] + confusion_matrix["false_negative"])
    cm_p = confusion_matrix["true_positive"] / (
        confusion_matrix["true_positive"] + confusion_matrix["false_positive"])
    cm_f1 = 2 * cm_r * cm_p / (cm_r + cm_p)

    # TM
    b_r = token_matrix["B"]["B"] / (token_matrix["B"]["B"] +
                                    token_matrix["B"]["I"] +
                                    token_matrix["B"]["O"])
    b_p = token_matrix["B"]["B"] / (token_matrix["B"]["B"] +
                                    token_matrix["I"]["B"] +
                                    token_matrix["O"]["B"])
    b_f1 = 2 * b_r * b_p / (b_r + b_p)

    i_r = token_matrix["I"]["I"] / (token_matrix["I"]["B"] +
                                    token_matrix["I"]["I"] +
                                    token_matrix["I"]["O"])
    i_p = token_matrix["I"]["I"] / (token_matrix["B"]["I"] +
                                    token_matrix["I"]["I"] +
                                    token_matrix["O"]["I"])
    i_f1 = 2 * i_r * i_p / (i_r + i_p)

    o_r = token_matrix["O"]["O"] / (token_matrix["O"]["B"] +
                                    token_matrix["O"]["I"] +
                                    token_matrix["O"]["O"])
    o_p = token_matrix["O"]["O"] / (token_matrix["B"]["O"] +
                                    token_matrix["I"]["O"] +
                                    token_matrix["O"]["O"])
    o_f1 = 2 * o_r * o_p / (o_r + o_p)

    with open(output_path, "a+") as out_f:
        out_f.write("\nConlleval results:\n" + conlleval_res)

        out_f.write("\nToken-Level Confusion Matrix:\n" + "True Positive:\t" +
                    str(confusion_matrix["true_positive"]) +
                    "\nTrue Negative:\t" +
                    str(confusion_matrix["true_negative"]) +
                    "\nFalse Positive:\t" +
                    str(confusion_matrix["false_positive"]) +
                    "\nFalse Negative:\t" +
                    str(confusion_matrix["false_negative"]) + "\nRecall:\t\t" +
                    str(cm_r) + "\nPrecision:\t" + str(cm_p) +
                    "\nF1-score:\t" + str(cm_f1))

        out_f.write("\n\nToken Matrix (true\predicted):\n\tB\tI\tO\n" + "B\t" +
                    str(token_matrix["B"]["B"]) + "\t" +
                    str(token_matrix["B"]["I"]) + "\t" +
                    str(token_matrix["B"]["O"]) + "\nI\t" +
                    str(token_matrix["I"]["B"]) + "\t" +
                    str(token_matrix["I"]["I"]) + "\t" +
                    str(token_matrix["I"]["O"]) + "\nO\t" +
                    str(token_matrix["O"]["B"]) + "\t" +
                    str(token_matrix["O"]["I"]) + "\t" +
                    str(token_matrix["O"]["O"]) + "\nB_Recall:\t" + str(b_r) +
                    "\nB_Precision:\t" + str(b_p) + "\nB_F1:\t\t" + str(b_f1) +
                    "\nI_Recall:\t" + str(i_r) + "\nI_Precision:\t" +
                    str(i_p) + "\nI_F1:\t\t" + str(i_f1) + "\nO_Recall:\t" +
                    str(o_r) + "\nO_Precision:\t" + str(o_p) + "\nO_F1:\t\t" +
                    str(o_f1) + "\n")

    print("Confusion matrix:")
    print({**confusion_matrix})
    print("Recall: " + str(cm_r))
    print("Precision: " + str(cm_p))
    print()

    print("Token matrix:")
    print({**token_matrix})
    print()
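For reference, the precision/recall/F1 arithmetic used in the CM and TM blocks above, worked through on small hypothetical counts:

# Hypothetical counts, only to illustrate the formulas.
tp, fp, fn = 8, 2, 4
prec = tp / (tp + fp)                 # 0.8
rec = tp / (tp + fn)                  # 8/12 ~ 0.667
f1 = 2 * prec * rec / (prec + rec)    # ~0.727
print(prec, rec, f1)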
Example #16
    words, infer_tags, unknown_tokens = ner.infer(sentence=item["sentence"],
                                                  true_tags=item["labels"])
    cm, tm, em = sentence_metrics(infer_tags, item["labels"])

    confusion_matrix.update(cm)
    entity_matrix.update(em)

    for gs_label in tm:
        for pred_label in tm[gs_label]:
            token_matrix[gs_label][pred_label] += tm[gs_label][pred_label]

    line_list = line_list + list(
        map(lambda token, gs, pred: token + " TK " + gs + " " + pred,
            item["sentence"].split(), item["labels"], infer_tags))

conlleval_res = conlleval.report(conlleval.evaluate(line_list))
print(conlleval_res)

# CM
cm_r = confusion_matrix["true_positive"] / (confusion_matrix["true_positive"] +
                                            confusion_matrix["false_negative"])
cm_p = confusion_matrix["true_positive"] / (confusion_matrix["true_positive"] +
                                            confusion_matrix["false_positive"])
cm_f1 = 2 * cm_r * cm_p / (cm_r + cm_p)

# EM
em_r = entity_matrix["true_positive"] / (entity_matrix["true_positive"] +
                                         entity_matrix["false_negative"])
em_p = entity_matrix["true_positive"] / (entity_matrix["true_positive"] +
                                         entity_matrix["false_positive"])
em_f1 = 2 * em_r * em_p / (em_r + em_p)
Example #17
def main(argv):

    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)


    print(args.no_context)

    if args.no_context:
        train_data = process_no_context(train_words, train_tags, tokenizer, seq_len)
        test_data = process_no_context(test_words, test_tags, tokenizer, seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(train_words, train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(test_words, test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids, tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids, tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer, seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer, seq_len, args.predict_position)
    
    label_list = get_labels(train_data.labels)
    tag_map = { l: i for i, l in enumerate(label_list) }
    inv_tag_map = { v: k for k, v in tag_map.items() }

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels, tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map, seq_len)


    if args.use_ner_model and (args.ner_model_dir is not None):
        ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model

        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
            )
                
        ner_model.fit(
            train_x,
            train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
            )
        if args.ner_model_dir is not None:
            label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
            save_ner_model(ner_model, tokenizer, label_list, args)

    
    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    
    results = []
    m_names = []
    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens, test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')  
        ensemble = []
        for i,pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble
            )
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        results.append([conlleval.metrics(c)[0].prec, conlleval.metrics(c)[0].rec, conlleval.metrics(c)[0].fscore])



    else:
        # First tag then vote
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens, test_data.sentence_numbers)
        # Accumulate probabilities, then vote
        prob_ensemble, prob_test_first = get_predictions2(probs, test_data.tokens, test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV: Documentwise CMV
            # D-CMVP: Documentwise CMV, probs summed, argmax after that
            # D-F: Documentwise First
            # D-FP: Same as D-F, probs summed
            method_names = ['D-CMV','D-CMVP','D-F','D-FP']  
        else:           
            method_names = ['CMV','CMVP','F','FP']
        for i, ensem in enumerate(ens):
            ensemble = []
            for j,pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file, method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                    output_file, test_data.words, test_data.lengths,
                    test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch Size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec, conlleval.metrics(c)[0].rec, conlleval.metrics(c)[0].fscore])
            m_names.extend(method_names)

        
    if args.sentence_in_context:     
        starting_pos = np.arange(0,seq_len+1,32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(test_data.tokens, test_data.labels, seq_len-1, start_p-1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)


            pred_tags = []
            for i, pred in enumerate(preds):
                idx = line_nos[i].index(i)
                pred_tags.append([inv_tag_map[t] for t in pred[line_starts[i][idx]+1:line_starts[i][idx]+len(test_data.tokens[i])+1]])
                
            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags
            )
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec, conlleval.metrics(c)[0].rec, conlleval.metrics(c)[0].fscore])

    result_file = "./results/results-{}.csv".format(args.output_file) 
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(args.output_file,
                                            args.max_seq_length, 
                                            args.bert_config_file, 
                                            args.num_train_epochs, 
                                            args.learning_rate,
                                            args.batch_size,
                                            args.predict_position,
                                            args.train_data,
                                            args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n') 

    for i in results:
        print(i)
    return 0
Example #18
    def evaluate_conlleval_string(self, conlleval_string):
        counts = conlleval.evaluate(conlleval_string.split('\n'), {'delimiter': self.separator})
        full_report = conlleval.report(counts)
        overall, per_label = conlleval.metrics(counts)
        return overall, per_label, full_report
Example #19
                        histogram_freq=1)
        ]

        model.fit(xtr,
                  ytr,
                  batch_size=batch_size,
                  epochs=nb_epoch,
                  verbose=1,
                  validation_data=(xte, yte),
                  callbacks=callbacks)

        print('loading the currently best model for final evaluation...')
        model = load_model(checkPointPath)

        print('--------------------------------------------------')
        print('Fold ', currentFold, ' performance')
        counts, cmat = conll_eval_counts(model.predict(xte), yte, labels)
        overall, byType = ceval.metrics(counts)
        ceval.report(counts)
        print_cm(cmat, ordered_label_keys(labels))
        foldScores.append(overall.fscore)
        print('\n')
        print('avg f1 fold scores so far: ', np.mean(foldScores))
        currentFold += 1

        # we clear the tensorflow session after each fold to not leak resources
        K.clear_session()

    print('f1 fold scores: ', foldScores)
    print('final avg f1 fold scores: ', np.mean(foldScores))
Example #20
    def evaluate_evaluation_string(self, connl_evaluation_string):
        counts = conlleval.evaluate(connl_evaluation_string.split('\n'),
                                    {'delimiter': self.separator})
        return conlleval.report(counts)
Example #21
    def fit(self, X, y, X_dev, y_dev, num_epoch = 10, batch_size = 32, seed = 1):
        random.seed(seed)
        trainset = zip(X, [ self._onehot(l,self.labels) for l in y ])
        devset = zip(X_dev, [ self._onehot(l,self.labels) for l in y_dev ])
        print "Target labels: {}".format(self.labels)
        
        train_split = trainset
        valid_split = devset

        print "{}/{} in training/validation set".format(len(train_split),len(valid_split))
        
        trainsp = random.sample(train_split,min(len(X)/2,200))
        trainfd = self.compiler.build_feed_dict(trainsp)
        valfd = self.compiler.build_feed_dict(valid_split)
        
        best_epoch = 0
        best_model = None
        best_score = 0
        epochs_since_best = 0
        for i in range(1,num_epoch+1):
            estart = time.time()
            batchpool = random.sample(train_split,len(train_split))
            
            minibatches = []
            for k in range(0,len(batchpool),batch_size):
                pool = batchpool[k:k+batch_size]
                minibatches.append(self.compiler.build_feed_dict(pool))
            
            self._train_minibatches(minibatches)
            self.sess.run(self.epoch_step_op)
            
            loss, yt_pred, yt_true = self.sess.run([self.y_loss, self.y_pred_idx, self.y_true_idx], trainfd)
            f1, precision, recall = self.fscore(yt_pred,yt_true)
            
            yv_pred, yv_true = self.sess.run([self.y_pred_idx, self.y_true_idx], valfd)
            vf1, vprecision, vrecall = self.fscore(yv_pred,yv_true)
            
            pred_dev = self.predict(X_dev)
            output_dev = []
            for (x,y,z) in zip(X_dev,y_dev,pred_dev):
                for token, y_true, y_pred in zip(x,y,z):
                    output_dev.append('{} {} {}'.format(token, y_true, y_pred))
                
                output_dev.append('')
            
            vfb1 = conlleval.report(conlleval.evaluate(output_dev))
            
            save_marker = ''
            if vfb1 >= best_score:
                best_model = '/tmp/model-{}-e{}-s{}.ckpt'.format(
                    type(self).__name__.lower(),i,seed)
                
                best_epoch, best_score = i, vfb1
                self.saver.save(self.sess, best_model)
                save_marker = '*'
                epochs_since_best = 0
            else:
                epochs_since_best += 1
                
            elapsed = int(time.time() - estart)
            emin, esec = elapsed / 60, elapsed % 60
            print "epoch {} loss {} fit {:.2f} val {:.2f}/{:.2f}/{:.2f} @ {:.2f} [{}m{}s] {}".format(i, 
                loss, f1, vf1, vprecision, vrecall, vfb1, emin, esec, save_marker)
            
            if epochs_since_best > 10:
                print "Stopping early from lack of improvements.."
                break
        
        if best_model is None:
            print "WARNING: NO GOOD FIT"
        
        self.saver.restore(self.sess, best_model)
        print "Fitted to model from epoch {} with score {} at {}".format(best_epoch,best_score,best_model)