Code example #1
def evaluate(model, criterion, data_loader, file_path, mode):
    """
    mode eval:
    eval on the development set and compute P/R/F1, called during training.
    mode predict:
    eval on the development / test set, then write predictions to \
        predictions.json (and a zipped copy) \
        under the args.data_path dir for later submission or evaluation.
    """
    example_all = []
    with open(file_path, "r", encoding="utf-8") as fp:
        for line in fp:
            example_all.append(json.loads(line))
    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)

    model.eval()
    loss_all = 0
    eval_steps = 0
    formatted_outputs = []
    current_idx = 0
    for batch in tqdm(data_loader, total=len(data_loader)):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and(
            (input_ids != 2))
        loss = criterion(logits, labels, mask)
        loss_all += loss.numpy().item()
        probs = F.sigmoid(logits)
        logits_batch = probs.numpy()
        seq_len_batch = seq_len.numpy()
        tok_to_orig_start_index_batch = tok_to_orig_start_index.numpy()
        tok_to_orig_end_index_batch = tok_to_orig_end_index.numpy()
        formatted_outputs.extend(
            decoding(example_all[current_idx:current_idx + len(logits)],
                     id2spo, logits_batch, seq_len_batch,
                     tok_to_orig_start_index_batch,
                     tok_to_orig_end_index_batch))
        current_idx = current_idx + len(logits)
    loss_avg = loss_all / eval_steps
    print("eval loss: %f" % (loss_avg))

    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(
            file_path, predict_zipfile_path)
        os.system('rm {} {}'.format(predict_file_path, predict_zipfile_path))
        return precision, recall, f1
    elif mode != "predict":
        raise Exception("wrong mode for eval func")
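A minimal usage sketch for code example #1 (dev_data_loader, test_data_loader and the file-path names are assumptions, not part of the original snippet): in "eval" mode the function returns precision/recall/F1, while in "predict" mode it returns nothing and only writes predictions.json under args.data_path.

# Hypothetical call sites; loader and path names are assumptions.
precision, recall, f1 = evaluate(model, criterion, dev_data_loader,
                                 dev_file_path, mode="eval")
print("P: %.4f  R: %.4f  F1: %.4f" % (precision, recall, f1))

# "predict" mode returns nothing; results are written to disk instead.
evaluate(model, criterion, test_data_loader, test_file_path, mode="predict")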
Code example #2
def evaluate(model, criterion, data_loader, test_loss, file_path, mode):
    """
    mode eval:
    eval on the development set and compute P/R/F1, called during training.
    mode predict:
    eval on the development / test set, then write predictions to \
        predictions.json (and a zipped copy) \
        under the args.data_path dir for later submission or evaluation.
    """
    probs_all = None
    seq_len_all = None
    tok_to_orig_start_index_all = None
    tok_to_orig_end_index_all = None
    loss_all = 0
    eval_steps = 0
    for batch in tqdm(data_loader):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and((input_ids != 2))
        loss = criterion((logits, labels, mask))
        loss_all += test_loss(loss).result()
        probs = logits
        if probs_all is None:
            probs_all = probs.numpy()
            seq_len_all = seq_len.numpy()
            tok_to_orig_start_index_all = tok_to_orig_start_index.numpy()
            tok_to_orig_end_index_all = tok_to_orig_end_index.numpy()
        else:
            probs_all = np.append(probs_all, probs.numpy(), axis=0)
            seq_len_all = np.append(seq_len_all, seq_len.numpy(), axis=0)
            tok_to_orig_start_index_all = np.append(
                tok_to_orig_start_index_all,
                tok_to_orig_start_index.numpy(),
                axis=0)
            tok_to_orig_end_index_all = np.append(
                tok_to_orig_end_index_all,
                tok_to_orig_end_index.numpy(),
                axis=0)
    loss_avg = loss_all / eval_steps
    print("eval loss: %f" % (loss_avg))

    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)
    formatted_outputs = decoding(file_path, id2spo, probs_all, seq_len_all,
                                 tok_to_orig_start_index_all,
                                 tok_to_orig_end_index_all)
    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(file_path,
                                                        predict_zipfile_path)
        os.system('rm {} {}'.format(predict_file_path, predict_zipfile_path))
        return precision, recall, f1
    elif mode != "predict":
        raise Exception("wrong mode for eval func")
Code example #3
    def __test__(self, epoch, text_seqs, class_list):

        assert len(text_seqs) == len(class_list)

        start_time = time.time()
        step_time = time.time()

        test_steps = len(text_seqs) // config.batch_size
        if test_steps * config.batch_size < len(text_seqs):
            test_steps += 1

        # topk_list = list()
        pred_all = np.array([])
        out_confidence_all = np.array([])

        all_loss = np.zeros(1)

        for cstep in range(test_steps):

            text_seqs_mini = text_seqs[cstep * config.batch_size:min(
                (cstep + 1) * config.batch_size, len(text_seqs))]
            class_idx_mini = class_list[cstep * config.batch_size:min(
                (cstep + 1) * config.batch_size, len(text_seqs))]

            encode_seqs_id_mini, encode_seqs_mat_mini = self.prepro_encode(
                text_seqs_mini)

            # pred_mat = np.zeros([config.batch_size, len(self.class_dict)])

            test_loss, out = self.sess.run(
                [
                    self.model.test_loss,
                    self.model.test_net.outputs,
                ],
                feed_dict={
                    self.model.encode_seqs:
                    encode_seqs_mat_mini,
                    self.model.label_logits:
                    np.array(class_idx_mini).reshape(-1, 1)
                })

            all_loss[0] += test_loss

            pred = np.array(
                [1.0 if x[0] >= self.model.threshold else 0.0 for x in out])
            out_confidence = np.array([x[0] for x in out])

            # pred = np.array([_ / np.sum(_) for _ in np.exp(out)])

            # for i in range(len(self.seen_class)):
            #     pred_mat[:, self.full_class_map2index[self.seen_class[i]]] = pred[:, i]

            # topk = self.get_pred_class_topk(pred_mat, k=1)
            # topk_list.append(topk)
            pred_all = np.concatenate((pred_all, pred), axis=0)
            out_confidence_all = np.concatenate(
                (out_confidence_all, out_confidence), axis=0)

            if cstep % config.cstep_print == 0 and cstep > 0:
                # tmp_topk = np.concatenate(topk_list, axis=0)
                # tmp_topk = self.get_one_hot_results(np.array(tmp_topk[: (cstep + 1) * config.batch_size]))
                # tmp_gt = self.get_one_hot_results(np.reshape(np.array(class_list[ : (cstep + 1) * config.batch_size]), newshape=(-1, 1)))
                # tmp_stats = utils.get_statistics(tmp_topk, tmp_gt, single_label_pred=True)
                tmp_stats = utils.get_precision_recall_f1(
                    pred_all,
                    np.array(class_list[:len(pred_all)]),
                    with_confusion_matrix=True)

                print(
                    "[Test] Epoch: [%3d][%4d/%4d] time: %.4f, loss: %s, threshold: %.4f \n %s"
                    % (epoch, cstep, test_steps, time.time() - step_time,
                       all_loss / (cstep + 1), self.model.threshold,
                       utils.dict_to_string_4_print(tmp_stats)))
                step_time = time.time()

        # prediction_topk = np.concatenate(topk_list, axis=0)
        # prediction_topk = self.get_one_hot_results(np.array(prediction_topk[: test_steps * config.batch_size]))
        # ground_truth = self.get_one_hot_results(np.reshape(np.array(class_list[: test_steps * config.batch_size]), newshape=(-1, 1)))

        stats = utils.get_precision_recall_f1(pred_all,
                                              np.array(class_list),
                                              with_confusion_matrix=True)

        print(
            "[Test Sum] Epoch: [%3d] time: %.4f, loss: %s, threshold: %.4f \n %s"
            % (epoch, time.time() - start_time, all_loss / test_steps,
               self.model.threshold, utils.dict_to_string_4_print(stats)))

        return stats, pred_all, class_list, out_confidence_all
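The per-example thresholding in code example #3 can also be written as a vectorized NumPy expression; a small self-contained sketch (the concrete outputs and the 0.5 threshold are illustrative assumptions):

import numpy as np

out = np.array([[0.91], [0.12], [0.55]])   # sigmoid outputs, shape (batch, 1)
threshold = 0.5                            # assumed value; the real one lives on self.model.threshold
pred = (out[:, 0] >= threshold).astype(np.float64)   # -> array([1., 0., 1.])
out_confidence = out[:, 0]                            # raw confidences, as in the original loop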
Code example #4
        for idx, gt in enumerate(test_class_list):
            accepted_classes = [
                seen_classes[cid] for cid, x in enumerate(pred_list)
                if x[idx] == 1.0
            ]
            prediction.append(accepted_classes)
            if len(accepted_classes) > 0:
                accepted.append(1)
            else:
                accepted.append(0)
            if gt in seen_classes:
                gt_accepted.append(1)
            else:
                gt_accepted.append(0)
        accepted_stats = utils.get_precision_recall_f1(
            np.array(accepted),
            np.array(gt_accepted),
            with_confusion_matrix=True)
        pass_to_phase2.append(accepted)

        avg_classifier_stat = dict()
        for key in stat_list[0]:
            if key != 'texts_accepted_from_class':
                avg_classifier_stat[key] = sum(
                    [stat[key] for stat in stat_list]) / len(stat_list)

        summary_dict = {
            'iteration': i,
            'seen_classes': seen_classes,
            'unseen_classes': unseen_classes,
            'test_class_list': test_class_list,
            'stat_list': stat_list,
Code example #5
def evaluate(model, criterion, data_loader, file_path, mode, logger):
    """
    mode eval:
    eval on the development set and compute P/R/F1, called during training.
    mode predict:
    eval on the development / test set, then write predictions to \
        predictions.json (and a zipped copy) \
        under the args.data_path dir for later submission or evaluation.
    """
    model.eval()
    probs_all = None
    seq_len_all = None
    tok_to_orig_start_index_all = None
    tok_to_orig_end_index_all = None
    loss_all = 0
    eval_steps = 0
    logger.info(
        "\n----------------------------------IN Evaluate func-----------------------------------\n"
    )
    for batch in tqdm(data_loader, total=len(data_loader)):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch

        if args.device == 'cuda':
            input_ids = input_ids.cuda()
            labels = labels.cuda()

        logits = model(input_ids=input_ids)
        mask = (input_ids != 0) & (input_ids != 1) & (input_ids != 2)
        loss = criterion(logits, labels, mask)
        loss_all += loss.detach().cpu().numpy().item()
        probs = torch.sigmoid(logits).cpu()
        if probs_all is None:
            probs_all = probs.numpy()
            seq_len_all = seq_len.numpy()
            tok_to_orig_start_index_all = tok_to_orig_start_index.numpy()
            tok_to_orig_end_index_all = tok_to_orig_end_index.numpy()
        else:
            probs_all = np.append(probs_all, probs.numpy(), axis=0)
            seq_len_all = np.append(seq_len_all, seq_len.numpy(), axis=0)
            tok_to_orig_start_index_all = np.append(
                tok_to_orig_start_index_all,
                tok_to_orig_start_index.numpy(),
                axis=0)
            tok_to_orig_end_index_all = np.append(
                tok_to_orig_end_index_all,
                tok_to_orig_end_index.numpy(),
                axis=0)
    loss_avg = loss_all / eval_steps
    logger.info("eval loss: %f" % (loss_avg))

    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)
    formatted_outputs = decoding(file_path, id2spo, probs_all, seq_len_all,
                                 tok_to_orig_start_index_all,
                                 tok_to_orig_end_index_all)
    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(
            file_path, predict_zipfile_path)
        os.system('rm {} {}'.format(predict_file_path, predict_zipfile_path))
        return precision, recall, f1
    elif mode != "predict":
        logger.debug("wrong mode for eval func")
        raise Exception("wrong mode for eval func")
    logger.info("Finish evaluating.")
Code example #6
File: analyze.py  Project: alwc/iSeqL
def main(args):
    train_dataset, valid_dataset, vocab, tag_vocab = load_all()
    model = bilstm_crf.BiLSTM_CRF(vocab, tag_vocab, args.embedding_dim,
                                  args.hidden_dim, args.batch_size)

    print('loading model')
    model.load_state_dict(
        torch.load(args.model_path, map_location=lambda storage, loc: storage))
    print('loaded model')

    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        valid_dataset,
        1,
        True,
        1,
    )

    correct = 0
    total = 0

    f1 = None

    bio_removed_tags = utils.remove_bio(tag_vocab.get_all())

    # iterate over epochs
    for i, (x, y) in enumerate(data_loader):
        if i > 100:
            break
        model.zero_grad()
        # print(' '.join([vocab.get_word(index) for index in x[0].data.tolist()]))
        out = model(x)

        predicted = utils.remove_bio(
            [tag_vocab.get_word(index) for index in out[0]])
        expected = utils.remove_bio(
            [tag_vocab.get_word(index) for index in y[0].data.tolist()])

        total += len(predicted)
        correct += sum(
            [predicted[i] == expected[i] for i in range(len(predicted))])

        # print(predicted)
        # print(expected)
        f1_temp = utils.compute_f1(predicted, expected, bio_removed_tags)

        if f1 is None:
            f1 = f1_temp
        else:
            f1 = utils.combine_f1(f1, f1_temp)
        # break

    print('correct: {} total: {} accuracy: {}'.format(correct, total,
                                                      correct / total))
    res = utils.get_precision_recall_f1(f1)
    scores = []
    for key in res:
        scores.append(res[key]['f1'])
        print('{} f1: {}'.format(key, res[key]['f1']))
    avg = sum(scores) / sum([score > 0 for score in scores])
    print('Average F1: {}'.format(avg))
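The final averaging step in code example #6 sums all per-tag F1 scores and divides by the number of tags with a nonzero score; a small sketch of the same arithmetic on made-up values:

scores = [0.80, 0.0, 0.65, 0.90]                      # per-tag F1 values (illustrative only)
avg = sum(scores) / sum(score > 0 for score in scores)
# zeros add nothing to the numerator, so this equals the mean over the 3 nonzero tags: ~0.7833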