Example #1
    def test_f1_score(self):
        # token-level F1: 'rock' shares 1 token with 'rock n roll'
        prediction = 'rock'
        ground_truth = 'rock n roll'
        precision = 1. * 1 / 1  # 1 matching token out of 1 predicted token
        recall = 1. * 1 / 3  # 1 matching token out of 3 ground-truth tokens
        f1 = (2 * precision * recall) / (precision + recall)
        self.assertEqual(f1_score(prediction, ground_truth), f1)
        self.assertEqual(f1_score(ground_truth, ground_truth), 1)
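For reference, the assertions above imply a token-overlap (SQuAD-style) F1. A minimal sketch of such an f1_score, assuming whitespace tokenization (the project's actual implementation may differ), could look like this:

from collections import Counter

def f1_score(prediction, ground_truth):
    # token-level F1: overlap counted via a multiset intersection
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return (2 * precision * recall) / (precision + recall)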
Example #2
def evaluate(dev_loader, model, mode='dev'):
    # set model to evaluation mode
    model.eval()

    id2label = config.id2label
    true_tags = []
    pred_tags = []
    sent_data = []
    dev_losses = 0

    with torch.no_grad():
        for idx, batch_samples in enumerate(dev_loader):
            batch_data, batch_token_starts, batch_tags, ori_data = batch_samples
            # shift tensors to GPU if available
            batch_data = batch_data.to(config.device)
            batch_token_starts = batch_token_starts.to(config.device)
            batch_tags = batch_tags.to(config.device)
            sent_data.extend(ori_data)
            batch_masks = batch_data.gt(0)  # get padding mask
            label_masks = batch_tags.gt(-1)
            # compute model output and loss
            loss = model((batch_data, batch_token_starts),
                         token_type_ids=None,
                         attention_mask=batch_masks,
                         labels=batch_tags)[0]
            dev_losses += loss.item()
            # shape: (batch_size, max_len, num_labels)
            batch_output = model((batch_data, batch_token_starts),
                                 token_type_ids=None,
                                 attention_mask=batch_masks)[0]
            if mode == 'dev':
                batch_output = model.module.crf.decode(batch_output,
                                                       mask=label_masks)
            else:
                # (batch_size, max_len - padding_label_len)
                batch_output = model.crf.decode(batch_output, mask=label_masks)
            batch_tags = batch_tags.to('cpu').numpy()

            pred_tags.extend([[id2label.get(idx) for idx in indices]
                              for indices in batch_output])
            # (batch_size, max_len - padding_label_len)
            true_tags.extend(
                [[id2label.get(idx) for idx in indices if idx > -1]
                 for indices in batch_tags])

    assert len(pred_tags) == len(true_tags)
    assert len(sent_data) == len(true_tags)

    # logging loss, f1 and report
    metrics = {}
    f1, p, r = f1_score(true_tags, pred_tags)
    metrics['f1'] = f1
    metrics['p'] = p
    metrics['r'] = r
    if mode != 'dev':
        bad_case(sent_data, pred_tags, true_tags)
        output_write(sent_data, pred_tags)
        output2res()
    metrics['loss'] = float(dev_losses) / len(dev_loader)
    return metrics
Example #3
def for_each_fold(fold, folds, data, labels, model, error_function):
    (x_train, y_train), (x_test, y_test) = partition_data(data, labels, fold, folds)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Choose the scoring function; precision is the default when error_function is None
    if error_function is None or error_function == 'precision':
        error = precision_score(y_test, y_pred)
    elif error_function == 'accuracy':
        error = accuracy_score(y_test, y_pred)
    elif error_function == 'recall':
        error = recall_score(y_test, y_pred)
    elif error_function == 'f1':
        error = f1_score(y_test, y_pred)
    else:
        raise ValueError('%s error function is not defined.' % error_function)

    return {'expected labels': y_test,
            'predicted labels': y_pred,
            'errors': [error]}
Example #4
def classification_report(y_true, y_pred):
    print('--------------------------------')
    print('Accuracy -', metrics.accuracy(y_true, y_pred))
    print('Recall -', metrics.recall(y_true, y_pred))
    print('Precision -', metrics.precision(y_true, y_pred))
    print('F1 score -', metrics.f1_score(y_true, y_pred))
    print('--------------------------------')
Example #5
def evaluate(model, data_iterator, params, mark='Eval', verbose=True):
    """Evaluate the model on `steps` batches."""
    # set model to evaluation mode
    model.eval()

    # id2tag dict
    idx2tag = {idx: tag for idx, tag in enumerate(params.tags)}

    true_tags = []
    pred_tags = []

    # a running average object for loss
    loss_avg = utils.RunningAverage()
    for input_ids, input_mask, labels in data_iterator:
        # to device
        input_ids = input_ids.to(params.device)
        input_mask = input_mask.to(params.device)
        labels = labels.to(params.device)

        batch_size, max_len = labels.size()

        # get loss
        loss = model(input_ids, attention_mask=input_mask.bool(), labels=labels)
        loss /= batch_size
        # update the average loss
        loss_avg.update(loss.item())

        # inference
        with torch.no_grad():
            batch_output = model(input_ids, attention_mask=input_mask.bool())

        # restore the real (unpadded) length of each label sequence
        real_batch_tags = []
        for i in range(batch_size):
            real_len = int(input_mask[i].sum())
            real_batch_tags.append(labels[i][:real_len].to('cpu').numpy())

        # List[int]
        pred_tags.extend([idx2tag.get(idx) for indices in batch_output for idx in indices])
        true_tags.extend([idx2tag.get(idx) for indices in real_batch_tags for idx in indices])
    # sanity check
    assert len(pred_tags) == len(true_tags), 'len(pred_tags) is not equal to len(true_tags)!'

    # logging loss, f1 and report
    metrics = {}
    f1 = f1_score(true_tags, pred_tags)
    accuracy = accuracy_score(true_tags, pred_tags)
    metrics['loss'] = loss_avg()
    metrics['f1'] = f1
    metrics['accuracy'] = accuracy
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format(mark) + metrics_str)

    # f1 classification report
    if verbose:
        report = classification_report(true_tags, pred_tags)
        logging.info(report)
    return metrics
Example #6
def evaluate(model, data_iterator, params, mark='Test', verbose=False):
    """Evaluate the model on `steps` batches."""
    # set model to evaluation mode
    model.eval()

    idx2tag = params.idx2tag

    true_tags = []
    pred_tags = []

    # a running average object for loss
    loss_avg = utils.RunningAverage()

    for _ in range(params.eval_steps):
        # fetch the next evaluation batch
        batch_data, batch_tags = next(data_iterator)
        batch_masks = batch_data.gt(0)

        loss = model(batch_data,
                     token_type_ids=None,
                     attention_mask=batch_masks,
                     labels=batch_tags)
        batch_output = model(batch_data,
                             token_type_ids=None,
                             attention_mask=batch_masks
                             )  # shape: (batch_size, max_len, num_labels)

        loss = loss[0]
        batch_output = batch_output[0]

        if params.n_gpu > 1 and params.multi_gpu:
            loss = loss.mean()
        loss_avg.update(loss.item())

        batch_output = batch_output.detach().cpu().numpy()
        batch_tags = batch_tags.to('cpu').numpy()

        pred_tags.extend([
            idx2tag.get(idx) for indices in np.argmax(batch_output, axis=2)
            for idx in indices
        ])
        true_tags.extend(
            [idx2tag.get(idx) for indices in batch_tags for idx in indices])
    assert len(pred_tags) == len(true_tags)

    # logging loss, f1 and report
    metrics = {}
    f1 = f1_score(true_tags, pred_tags)
    metrics['loss'] = loss_avg()
    metrics['f1'] = f1
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v)
                            for k, v in metrics.items())
    logging.info("- {} metrics: ".format(mark) + metrics_str)

    if verbose:
        report = classification_report(true_tags, pred_tags)
        logging.info(report)
    return metrics
Example #7
    def validate(self):
        self.train = False

        a = self.activation()
        preds = probability_to_preds(a, self.threshold)

        acc = accuracy(a, self.valid_y)
        f1 = f1_score(preds, self.valid_y)

        print(f'f1 score {f1}')
        print(f'test accuracy {acc}')
Example #8
    def validation_end(self, outputs):  # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tb_logs = {'val_loss': avg_loss, 'ppl': math.exp(avg_loss)}

        tb_logs['acc'] = torch.stack([x['val_acc'] for x in outputs]).mean()
        tb_logs['bleu'] = np.mean([x['bleu'] for x in outputs])

        total = {}
        for metric_name in ['tp', 'fp', 'fn']:
            metric_value = torch.stack([x[metric_name] for x in outputs]).sum()
            total[metric_name] = metric_value

        prec_rec_f1 = metrics.f1_score(total['tp'], total['fp'], total['fn'])
        tb_logs.update(prec_rec_f1)
        return {'avg_val_loss': avg_loss, 'log': tb_logs}
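The metrics.f1_score used in Example #8 above takes summed true-positive, false-positive and false-negative counts and returns a mapping that is merged into tb_logs. A minimal count-based sketch under that assumption (the key names and the epsilon guard are hypothetical, not taken from the project):

def f1_score(tp, fp, fn, eps=1e-8):
    # compute precision/recall/F1 from accumulated counts (eps avoids division by zero)
    tp, fp, fn = float(tp), float(fp), float(fn)
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return {'precision': precision, 'recall': recall, 'f1': f1}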
Example #9
def kfold_average_score(learner, files, dirs, k=5, min_word_len=2, min_freq=10, feature_size=160, weight='tfidf', sw=True):
    scores = [0] * k
    kfold = KFold(files, dirs, k)
    for ii in range(k):
        train_X, train_Y, test_X, test_Y = kfold.kth(ii)
        features = Features(train_X, train_Y, min_word_len, min_freq, feature_size, sw)
        x = []
        for f_name in train_X:
            x.append(features.get_x_vector(f_name, weight))

        learner.fit(x, features.y_transform(train_Y))
        x = []
        for f_name in test_X:
            x.append(features.get_x_vector(f_name, weight))
        scores[ii] = f1_score(features.y_transform(test_Y), learner.predict(x).tolist())
    return mean(scores)
Example #10
    def evaluate(self, darray, thr):

        batch_index = 0
        X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size,
                                                   batch_index)
        y_pred = None
        y_label = None
        while len(X_batch) > 0:
            num_batch = len(y_batch)
            feed_dict = {
                self.vocab_index: X_batch,
                self.props: P_batch,
                self.label: y_batch,
                self.first_level_lstm_dropout_p:
                    [1.0] * len(self.first_level_lstm_dropout),
                self.deep_dropout_p: [1.0] * len(self.deep_dropout),
                self.conv_pool_dropout_p: [1.0] * len(self.conv_pool_dropout),
                self.second_level_lstm_dropout_p:
                    [1.0] * len(self.second_level_lstm_dropout),
                self.train_phase: False
            }
            batch_out = self.sess.run(self.out, feed_dict=feed_dict)

            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch, ))
                y_label = np.reshape(y_batch, (num_batch, ))
            else:
                y_pred = np.concatenate(
                    (y_pred, np.reshape(batch_out, (num_batch, ))))
                y_label = np.concatenate(
                    (y_label, np.reshape(y_batch, (num_batch, ))))

            batch_index += 1
            X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size,
                                                       batch_index)

        pred = [1 if y_pred[i] > thr else 0 for i in range(len(y_pred))]
        accuracy = metrics.accuracy_score(y_label, pred)
        precision = metrics.precision_score(y_label, pred)
        recall = metrics.recall_score(y_label, pred)
        f1 = metrics.f1_score(y_label, pred)

        return accuracy, precision, recall, f1
Example #11
    def __predict_names(self, model):
        x = []
        y_true = []
        y_pred = []

        for training_pair in self.pairs:
            input_tensor = training_pair[0]
            output_tensor = model(input_tensor)

            x.append(input_tensor)
            y_true.append(training_pair[1])
            y_pred.append(output_tensor)

        # Convert numbers to words and remove SOS and EOS tokens
        x = [[
            self.input_lang.index2word[index.item()] for index in sent
            if index.item() not in [SOS_TOKEN, EOS_TOKEN]
        ] for sent in x]

        y_true = [[
            self.output_lang.index2word[index.item()] for index in sent
            if index.item() not in [SOS_TOKEN, EOS_TOKEN]
        ] for sent in y_true]

        y_pred = [[
            self.output_lang.index2word[index] for index in sent
            if index not in [SOS_TOKEN, EOS_TOKEN]
        ] for sent in y_pred]

        names = pd.DataFrame(
            OrderedDict([
                ('Source', [' '.join(sent) for sent in x]),
                ('True Name', [' '.join(sent) for sent in y_true]),
                ('Our Name', [' '.join(sent) for sent in y_pred]),
                ('BLEU',
                 [bleu(y_true[i], y_pred[i]) for i in range(len(y_true))]),
                ('ROUGE',
                 [rouge(y_true[i], y_pred[i]) for i in range(len(y_true))]),
                ('F1',
                 [f1_score(y_true[i], y_pred[i]) for i in range(len(y_true))])
            ]))

        return names
Example #12
    def forward(self, epoch=10000):
        for i in range(epoch):
            print(f'epoch {i}')

            a = self.activation()
            preds = probability_to_preds(a, self.threshold)

            dz = a - self.train_y
            dw = np.dot(self.train_x, dz.T) / self.no_of_samples
            db = np.sum(dz) / self.no_of_samples

            self.optimizer(dw, db, self.learning_rate)

            acc = accuracy(a, self.train_y)
            f1 = f1_score(preds, self.train_y)

            print(f'f1_score {f1}')
            print(f'train accuracy {acc}')
            print(f'train loss {self.cost(self.train_y, a)}')

        self.validate()
Example #13
def main():
    """
    Main.
    """
    # Load features
    data = sio.loadmat('data/LightenedCNN_C_lfw.mat')
    features = data['features']
    labels = data['labels_original'][0]
    label_lookup = {}
    for idx, label in enumerate(labels):
        label_lookup[idx] = int(label[0][:])
    print('Features shape: ', features.shape)

    start_time = time.time()
    clusters = aroc(features, 200, 1.1, 12)
    print('Time taken for clustering: {:.3f} seconds'.format(
        time.time() - start_time))

    _, _, _, precision, recall, score = f1_score(
        clusters, label_lookup)
    print('Clusters: {}  Precision: {:.3f}  Recall: {:.3f}  F1: {:.3f}'.format(
        len(clusters), precision, recall, score))
Example #14
def generate_classification_perf(truths, pred_probs, multiclass=False):
    """Given truths, and predicted probabilities, generate ModelPerf object"""
    pred_classes = np.round(pred_probs).astype(int)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        retval = ClassificationModelPerf(
            auroc=metrics.roc_auc_score(truths, pred_probs),
            auroc_curve=metrics.roc_curve(truths, pred_probs)
            if not multiclass else None,
            auprc=metrics.average_precision_score(truths, pred_probs),
            accuracy=metrics.accuracy_score(truths, pred_classes)
            if not multiclass else None,
            recall=metrics.recall_score(truths, pred_classes)
            if not multiclass else None,
            precision=metrics.precision_score(truths, pred_classes)
            if not multiclass else None,
            f1=metrics.f1_score(truths, pred_classes)
            if not multiclass else None,
            ce_loss=metrics.log_loss(truths, pred_probs, normalize=False) /
            np.prod(truths.shape),
        )
    return retval
Example #15
def main():
    best_val_acc = -1.0
    start_epoch = 1

    trn_ds = loaders.SatClassificationDataset(LBL_DATA_DIR, SPLIT_CSV,
                                              POSITIVE_CLASS, False, trn_tfms)
    print('Train Samples:', len(trn_ds))
    trn_dl = DataLoader(trn_ds, BATCH_SIZE, shuffle=True, num_workers=WORKERS)

    unlbl_ds = loaders.UnlabeledDataset(UNLBL_DATA_DIR, IMAGE_SIZE)
    print('Unlabeled:', len(unlbl_ds))
    unlbl_dl = DataLoader(unlbl_ds,
                          BATCH_SIZE,
                          shuffle=True,
                          num_workers=WORKERS)

    val_ds = loaders.SatClassificationDataset(LBL_DATA_DIR, SPLIT_CSV,
                                              POSITIVE_CLASS, True, val_tfms)
    print('Val Samples:', len(val_ds))
    val_dl = DataLoader(val_ds, BATCH_SIZE, shuffle=False, num_workers=WORKERS)

    model = models.Resnet(visionmodels.resnet50, 2)
    model.to(DEVICE)

    ce_loss_fn = nn.CrossEntropyLoss().to(DEVICE)
    vat_loss_fn = vat.VATLoss(IP, EPSILON, XI).to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=LR)
    lr_sched = optim.lr_scheduler.StepLR(optimizer, LR_STEP, gamma=LR_DECAY)

    trn_metrics = BookKeeping(TENSORBOARD_LOGDIR, 'trn')
    val_metrics = BookKeeping(TENSORBOARD_LOGDIR, 'val')

    if not os.path.exists(WEIGHTS_SAVE_PATH):
        os.mkdir(WEIGHTS_SAVE_PATH)

    if LOAD_CHECKPOINT is not None:
        checkpoint = torch.load(LOAD_CHECKPOINT, pickle_module=dill)
        start_epoch = checkpoint['epoch']

        model.load_state_dict(checkpoint['state_dict'])
        optimizer = checkpoint['optimizer']

        lr_sched = checkpoint['lr_scheduler']
        best_val_acc = checkpoint['best_metrics']

    for epoch in range(start_epoch, EPOCHS + 1):

        # Train
        t_pbar = tqdm(trn_dl,
                      desc=pbar_desc('train', epoch, EPOCHS, 0.0, -1.0, -1.0))
        ul_iter = iter(unlbl_dl)

        model.train()
        for (xs, ys) in t_pbar:
            try:
                xs_ul, ys_ul = next(ul_iter)
            except StopIteration:
                # Reset the iterator in case we've used
                # up all of the images
                ul_iter = iter(unlbl_dl)
                xs_ul, ys_ul = next(ul_iter)

            xs = xs.to(DEVICE)
            ys = ys.to(DEVICE)

            y_pred1 = model(xs)
            ce_loss = ce_loss_fn(y_pred1, ys)

            xs_ul = xs_ul.to(DEVICE)
            vat_loss = vat_loss_fn(xs_ul, model, logits=True)

            total_loss = ce_loss + vat_loss

            acc = metrics.accuracy(y_pred1, ys)
            f1 = metrics.f1_score(y_pred1, ys)

            trn_metrics.update(ce=ce_loss.item(),
                               vat=vat_loss.item(),
                               total=total_loss.item(),
                               f1=f1.item(),
                               accuracy=acc.item())

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            t_pbar.set_description(
                pbar_desc('train', epoch, EPOCHS, total_loss.item(), acc, f1))

        # Final update to training bar
        avg_trn_metrics = trn_metrics.get_avg_losses()
        t_pbar.set_description(
            pbar_desc('train', epoch, EPOCHS, avg_trn_metrics['total'],
                      avg_trn_metrics['accuracy'], avg_trn_metrics['f1']))
        trn_metrics.update_tensorboard(epoch)

        # Validate
        v_pbar = tqdm(val_dl,
                      desc=pbar_desc('valid', epoch, EPOCHS, 0.0, -1.0, -1.0))
        model.eval()

        for xs, ys in v_pbar:

            xs = xs.to(DEVICE)
            ys = ys.to(DEVICE)

            y_pred1 = model(xs)
            ce_loss = ce_loss_fn(y_pred1, ys)

            acc = metrics.accuracy(y_pred1, ys)
            f1 = metrics.f1_score(y_pred1, ys)

            val_metrics.update(ce=ce_loss.item(),
                               vat=0,
                               total=ce_loss.item(),
                               f1=f1.item(),
                               accuracy=acc.item())

            v_pbar.set_description(
                pbar_desc('valid', epoch, EPOCHS, ce_loss.item(), acc, f1))

        avg_val_metrics = val_metrics.get_avg_losses()
        avg_acc = avg_val_metrics['accuracy']
        if avg_acc > best_val_acc:
            best_val_acc = avg_acc
            torch.save(
                model.state_dict(),
                f'{WEIGHTS_SAVE_PATH}/{EXP_NO:02d}-{MODEL_NAME}_epoch-{epoch:04d}_acc-{avg_acc:.3f}.pth'
            )

        # Final update to validation bar
        v_pbar.set_description(
            pbar_desc('valid', epoch, EPOCHS, avg_val_metrics['total'],
                      avg_val_metrics['accuracy'], avg_val_metrics['f1']))
        val_metrics.update_tensorboard(epoch)

        # Update scheduler and save checkpoint
        lr_sched.step(epoch=epoch)
        save_checkpoint(epoch, model, best_val_acc, optimizer, lr_sched)
Example #16
def evaluate(dataloader,
             model,
             word_vocab,
             label_vocab,
             output_path,
             prefix,
             use_gpu=False):
    model.eval()
    prediction = []
    trues_list = []
    preds_list = []
    for batch in dataloader:
        batch_text, seq_length, word_perm_idx = batch['text']
        batch_label, _, _ = batch['label']
        char_inputs = batch['char']
        char_inputs = char_inputs[word_perm_idx]
        char_dim = char_inputs.size(-1)
        char_inputs = char_inputs.contiguous().view(-1, char_dim)
        if use_gpu:
            batch_text = batch_text.cuda()
            batch_label = batch_label.cuda()
            char_inputs = char_inputs.cuda()
        mask = get_mask(batch_text)
        with torch.no_grad():
            tag_seq = model(batch_text, seq_length, char_inputs, batch_label,
                            mask)

        for line_tensor, labels_tensor, predicts_tensor in zip(
                batch_text, batch_label, tag_seq):
            for word_tensor, label_tensor, predict_tensor in zip(
                    line_tensor, labels_tensor, predicts_tensor):
                if word_tensor.item() == 0:
                    break
                line = [
                    word_vocab.id_to_word(word_tensor.item()),
                    label_vocab.id_to_label(label_tensor.item()),
                    label_vocab.id_to_label(predict_tensor.item())
                ]
                trues_list.append(line[1])
                preds_list.append(line[2])
                prediction.append(' '.join(line))
            prediction.append('')

    true_entities = get_entities_bio(trues_list)
    pred_entities = get_entities_bio(preds_list)
    print(len(trues_list), len(preds_list), len(prediction))

    results = {
        "f1": f1_score(true_entities, pred_entities),
        'report': classification_report(true_entities, pred_entities)
    }

    with open(os.path.join(output_path, '%s_pred.txt' % prefix),
              'w',
              encoding='utf-8') as f:
        f.write('\n'.join(prediction))

    with open(os.path.join(output_path, '%s_score.txt' % prefix),
              "a") as writer:
        writer.write("***** Eval results {} *****\n".format(prefix))
        for key in sorted(results.keys()):
            if key == 'report_dict':
                continue
            writer.write("{} = {}\n".format(key, str(results[key])))

    return results["f1"]
Example #17
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True

    if args.model == 'res18':
        net = resnet.ResNet18(num_classes=40).cuda()
    elif args.model == 'resnext':
        net = resnext.ResNeXt(cardinality=args.cardinality,
                              depth=args.depth,
                              nlabels=40,
                              base_width=args.base_width,
                              widen_factor=args.widen_factor).cuda()
    elif args.model == 'res_cifar':
        net = resnet_cifar.resnet20(num_classes=40).cuda()

    state_dict = torch.load(f'{args.model_path}/model_200.pth')
    net.load_state_dict(state_dict)

    criterion = nn.CrossEntropyLoss().cuda()
    metric_logger = utils.Logger(os.path.join(args.save_path, 'metric.log'))
    ''' Open Set Recognition '''
    ''' validation '''
    print('')
    print('Open Set Recognition/Out of Distribution Detection-Validation')
    print('known data: CIFAR40')
    print('unknown data: new-TinyImageNet158')
    print('')

    train_loader = dataloader.train_loader(args.data_root, args.data,
                                           args.batch_size)

    in_valid_loader = dataloader.in_dist_loader(args.data_root, args.in_data,
                                                args.batch_size, 'valid')
    ood_valid_loader = dataloader.out_dist_loader(args.data_root,
                                                  'new-tinyimagenet158',
                                                  args.batch_size, 'valid')
    alpha_list = [40]
    eta_list = [5, 10, 20, 30, 40]

    for alpha in alpha_list:
        for eta in eta_list:
            args.weibull_alpha = alpha
            args.weibull_tail = eta

            in_softmax, in_openmax, in_softlogit, in_openlogit,\
                _, _, _ = test(net, train_loader, in_valid_loader)
            out_softmax, out_openmax, out_softlogit, out_openlogit,\
                _, _, _ = test(net, train_loader, ood_valid_loader)


            f1, li_f1, li_thresholds, \
            li_precision, li_recall = metrics.f1_score(1-np.array(in_openmax), 1-np.array(out_openmax),
                                                      pos_label=0)
            ood_scores = metrics.ood_metrics(1 - np.array(in_openmax),
                                             1 - np.array(out_openmax))

            if not os.path.exists(args.save_path):
                os.makedirs(args.save_path)

            metric_logger.write([
                'VAL CIFAR40-Tiny158', '\t', 'FPR@95%TPR', '\t', 'DET ERR',
                '\t', 'AUROC', '\t\t', 'AUPR-IN', '\t', 'AUPR-OUT', '\t',
                'F1 SCORE', '\t', ''
            ])
            metric_logger.write([
                '', '\t\t\t', 100 * ood_scores['FPR95'], '\t',
                100 * ood_scores['DTERR'], '\t', 100 * ood_scores['AUROC'],
                '\t', 100 * ood_scores['AUIN'], '\t',
                100 * ood_scores['AUOUT'], '\t', f1, '\t', ''
            ])

            # save to .csv
            with open(f'{args.save_path}/openmax-scores.csv', 'a',
                      newline='') as f:
                columns = [
                    "", "FPR@95%TPR", "DET ERR", "AUROC", "AUPR-IN",
                    "AUPR-OUT", "F1 SCORE", "alpha", "eta"
                ]
                writer = csv.writer(f)
                if args.weibull_alpha == 40 and args.weibull_tail == 5:
                    writer.writerow([
                        '* Open Set Recognition/Out of Distribution Detection Validation-new-TinyImageNet158'
                    ])
                    writer.writerow(columns)
                writer.writerow([
                    '', 100 * ood_scores['FPR95'], 100 * ood_scores['DTERR'],
                    100 * ood_scores['AUROC'], 100 * ood_scores['AUIN'],
                    100 * ood_scores['AUOUT'], f1, args.weibull_alpha,
                    args.weibull_tail
                ])
                # writer.writerow([''])
            f.close()
Example #18
def evaluate(model, iterator, f, ner_label, verbose=False):
    """Evaluate the model on `steps` batches."""
    # set model to evaluation mode
    model.eval()

    y_true = []
    y_pred = []
    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, input_ids, is_heads, tags, input_tags, entity_label, seqlens = batch

            _, _, y_hat = model(input_ids, input_tags, entity_label)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(input_tags.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())
    ## gets results and save
    with open("temp", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [ner_label.idx2tag[hat] for hat in y_hat]
            if len(preds[1:-1]) > 0:
                y_pred.append(preds[1:-1])
            if len(tags.split()[1:-1]) > 0:
                y_true.append(tags.split()[1:-1])
            assert len(preds) == len(words.split()) == len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write(f"{w} {t} {p}\n")
            fout.write("\n")

    assert len(y_pred) == len(y_true)

    # logging loss, f1 and report
    p, r, f1 = f1_score(y_true, y_pred)

    # metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    # logging.info("- {} metrics: ".format(mark) + metrics_str)
    #
    # if verbose:
    #     report = classification_report(true_tags, pred_tags)
    #     logging.info(report)

    final = f + ".P%.4f_R%.4f_F%.4f" %(p, r, f1)
    with open(final, 'w') as fout:
        result = open("temp", "r").read()
        fout.write(f"{result}\n")

        fout.write(f"precision={p}\n")
        fout.write(f"recall={r}\n")
        fout.write(f"f1={f1}\n")
        if verbose:
            report = classification_report(y_true, y_pred)
            print(report)

    os.remove("temp")

    print("precision=%.2f"%p)
    print("recall=%.2f"%r)
    print("f1=%.2f"%f1)
    return p, r, f1
Example #19
def evaluate(args, model, eval_dataloader, params):
    model.eval()
    # a running average object for the loss
    loss_avg = utils.RunningAverage()
    # init
    pre_result = []
    gold_result = []

    # get data
    for batch in tqdm(eval_dataloader, unit='Batch'):
        # to device
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, segment_ids, start_pos, end_pos, ne_cate = batch

        with torch.no_grad():
            # get loss
            loss = model(input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         start_positions=start_pos,
                         end_positions=end_pos)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # update the average loss
            loss_avg.update(loss.item())

            # inference
            start_logits, end_logits = model(input_ids=input_ids,
                                             token_type_ids=segment_ids,
                                             attention_mask=input_mask)

        # gold label
        start_pos = start_pos.to("cpu").numpy().tolist()
        end_pos = end_pos.to("cpu").numpy().tolist()
        input_mask = input_mask.to('cpu').numpy().tolist()
        ne_cate = ne_cate.to("cpu").numpy().tolist()

        # predict label
        start_label = start_logits.detach().cpu().numpy().tolist()
        end_label = end_logits.detach().cpu().numpy().tolist()

        # idx to label
        cate_idx2label = {
            idx: value
            for idx, value in enumerate(params.label_list)
        }

        # get bio result
        for start_p, end_p, start_g, end_g, input_mask_s, ne_cate_s in zip(
                start_label, end_label, start_pos, end_pos, input_mask,
                ne_cate):
            ne_cate_str = cate_idx2label[ne_cate_s]
            # length of the query for this entity category
            q_len = len(IO2QUERY[ne_cate_str])
            # valid (non-padding) length of the remaining sequence
            act_len = sum(input_mask_s[q_len + 2:-1])
            # get BIO labels
            pre_bio_labels = pointer2bio(start_p[q_len + 2:q_len + 2 +
                                                 act_len],
                                         end_p[q_len + 2:q_len + 2 + act_len],
                                         ne_cate=ne_cate_str)
            gold_bio_labels = pointer2bio(start_g[q_len + 2:q_len + 2 +
                                                  act_len],
                                          end_g[q_len + 2:q_len + 2 + act_len],
                                          ne_cate=ne_cate_str)
            pre_result.append(pre_bio_labels)
            gold_result.append(gold_bio_labels)

    # metrics
    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    # f1, acc
    metrics = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v)
                            for k, v in metrics.items())
    logging.info("- {} metrics: ".format('Val') + metrics_str)
    # f1 classification report
    report = classification_report(y_true=gold_result, y_pred=pre_result)
    logging.info(report)

    return metrics
Example #20
    pools = []
    for i in range(5):
        pools.append(FeaturePooling(ims[i]))
    pred_points = model_gcn(graph, pools)

    # Compute eval metrics
    _, loss_norm = chamfer_loss(pred_points[-1],
                                gt_points_list[0].squeeze(),
                                normalized=True)
    _, loss_unorm = chamfer_loss(pred_points[-1],
                                 gt_points_list[0].squeeze(),
                                 normalized=False)
    tot_loss_norm += loss_norm.item()
    tot_loss_unorm += loss_unorm.item()
    tot_f1_1 += f1_score(pred_points[-1],
                         gt_points_list[0].squeeze(),
                         threshold=tau)
    tot_f1_2 += f1_score(pred_points[-1],
                         gt_points_list[0].squeeze(),
                         threshold=2 * tau)

    # Logs
    if n % log_step == 0:
        print("Batch", n)
        print("Normalized Chamfer loss so far", tot_loss_norm / (n + 1))
        print("Unnormalized Chamfer loss so far", tot_loss_unorm / (n + 1))
        print("F1 score (tau=1e-4)", tot_f1_1 / (n + 1))
        print("F1 score (tau=2e-4)", tot_f1_2 / (n + 1))

    # Generate meshes
    if args.output is not None:
Example #21
all_ids_conc = np.concatenate(all_ids, axis=0)

# Argmax to get predicted label
arg_type = np.argmax(all_preds_conc, 1)
arg_sp = np.argmax(all_sp_conc, 1)
arg_mt = np.argmax(all_mt_conc, 1)
arg_ch = np.argmax(all_ch_conc, 1)
arg_th = np.argmax(all_th_conc, 1)
arg_y = np.argmax(all_y_conc, 1)
all_tp = np.concatenate(all_tp, axis=0)

# Calculate precision and recall for the cleavage site
prec_sp, prec_mt, prec_ch, prec_th = precision_cs(arg_sp, arg_mt, arg_ch,
                                                  arg_th, arg_type, all_tp,
                                                  arg_y)
recall_sp, recall_mt, recall_ch, recall_th = recall_cs(arg_sp, arg_mt, arg_ch,
                                                       arg_th, arg_type,
                                                       all_tp, arg_y)

# Calculate f1 score
f1_type = metrics.f1_score(all_tp, arg_type, average=None)

# Summary
print('========== Final results ==========')
print('Signal\tF1 score\tPrec. CS\tRec. CS')
print('noTP\t%.6f\t%.6f\t%.6f' % (f1_type[0], 0.0, 0.0))
print('SP\t%.6f\t%.6f\t%.6f' % (f1_type[1], prec_sp, recall_sp))
print('mTP\t%.6f\t%.6f\t%.6f' % (f1_type[2], prec_mt, recall_mt))
print('cTP\t%.6f\t%.6f\t%.6f' % (f1_type[3], prec_ch, recall_ch))
print('luTP\t%.6f\t%.6f\t%.6f' % (f1_type[4], prec_th, recall_th))
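Example #21 above relies on scikit-learn's average=None behaviour, where f1_score returns one score per class (indexed here as f1_type[0] .. f1_type[4]). A tiny self-contained illustration with made-up labels:

from sklearn.metrics import f1_score

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 2, 2, 2, 1, 0]
print(f1_score(y_true, y_pred, average=None))  # one F1 value per class label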
Example #22
    def run(self):
        self.model.eval()

        total_bleu = 0
        total_f1 = 0
        total_dist1 = 0
        total_dist2 = 0
        total_loss = 0

        print('Run eval...')
        with torch.no_grad():
            for batch_idx, feature in enumerate(self.test_iter):
                utils.feature_to_device(feature, self.device)

                out, out_lm = self.model(feature)
                print(self.vocab.itos(out[3, 0].argmax(dim=0).item()),
                      self.vocab.itos(out_lm[3, 0].argmax(dim=0).item()))
                loss, loss_lm = models.AR.loss(self.out_loss_fn, out, out_lm,
                                               feature.resp, feature.lm.y)
                print(loss, loss_lm)
                loss = loss + self.model_config.alpha * loss_lm
                total_loss += loss.item()

                # target include w1, w2...[EOS], len: max_seq_length + 1
                target = copy.deepcopy(feature.resp[1:])
                # feature will be changed
                pred, pred_padded = utils.sample_sequence(
                    feature, self.vocab, self.model, self.args)

                pred_tokens = [[self.vocab.itos(k) for k in ks] for ks in pred]
                target_tokens = [[[self.vocab.itos(k) for k in ks]]
                                 for ks in target.T.tolist()]
                print('----------------------------------')
                print(
                    'Context: ', ''.join([
                        self.vocab.itos(k)
                        for k in feature.context.T.tolist()[0]
                    ]))
                print(
                    'LM x: ', ''.join([
                        self.vocab.itos(k) for k in feature.lm.x.T.tolist()[0]
                    ]))
                print(
                    'LM y: ', ''.join([
                        self.vocab.itos(k) for k in feature.lm.y.T.tolist()[0]
                    ]))
                print(
                    'Pred: ', ''.join([
                        self.vocab.itos(k) for k in pred_padded.T.tolist()[0]
                    ]))
                print('Target: ', ''.join(target_tokens[0][0]))
                print(
                    'Pred: ', ''.join([
                        self.vocab.itos(k) for k in pred_padded.T.tolist()[-1]
                    ]))
                print('Target: ', ''.join(target_tokens[-1][0]))
                print('----------------------------------')
                bleu = metrics.bleu_score(pred_tokens, target_tokens)
                f1 = metrics.f1_score(pred_padded.T.to('cpu'),
                                      target.T.to('cpu'))
                # dist1 = metrics.distinct_score([v[:-1] for v in pred])
                dist1 = metrics.distinct_score(pred_tokens)
                dist2 = metrics.distinct_score(pred_tokens, 2)

                total_bleu += bleu
                total_f1 += f1
                total_dist1 += dist1
                total_dist2 += dist2

        l = len(self.test_iter)
        bleu = total_bleu / l
        f1 = total_f1 / l
        dist1 = total_dist1 / l
        dist2 = total_dist2 / l
        # https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch
        # see per-word perplexity:
        # https://github.com/huggingface/transfer-learning-conv-ai/blob/master/convai_evaluation.py#L161
        # https://github.com/facebookresearch/ParlAI/blob/56d46551190a7ffaedccd13534412d43bc7076e5/parlai/scripts/eval_ppl.py
        ppl = math.exp(total_loss / l)

        print(f'\tBleu: {bleu:.8f} | F1: {f1:.8f} | '
              f'Dist1: {dist1:.3f} | Dist2: {dist2:.3f} | PPL: {ppl:7.3f}')
Example #23
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            #Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append({metric: eval_function(real_arrays,
                    relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

        #Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

        #Compute the final score for IR statistics
        for result in permutation_scores_ir:
Example #24
    print('Reading Data Path')
    files, dirs = fe.get_file_name_and_path(parser.parse_args().data_dir)
    print('Splitting Train-Test Set')
    train_x, train_y, test_x, test_y = train_test_split(files, dirs, 0.25)

    learner = svm.SVC(kernel='rbf', C=1)
    features = Features(train_x, train_y, 3, 0, 160)
    x = []
    for f_name in train_x:
        x.append(features.get_x_vector(f_name, 'tfidf'))

    learner.fit(x, features.y_transform(train_y))
    x = []
    for f_name in test_x:
        x.append(features.get_x_vector(f_name, 'tfidf'))
    print('Score:', f1_score(features.y_transform(test_y), learner.predict(x).tolist()))


    # print('Test if "TFIDF" is better than "TF"')
    # print('TF:', kfold_average_score(learner, train_x, train_y, weight='tf',feature_size=2))
    # print('TFIDF:', kfold_average_score(learner, train_x, train_y, weight='tfidf',feature_size=2))
    # #
    # #
    # print('Test which min length of word is best')
    # lst = []
    # for i in range(10):
    #     lst.append(kfold_average_score(learner, files, dirs, k=5, min_word_len=i))
    #     print(i, lst[-1])
    # do_plot(1, lst)
    # #
    # print('Test which min freq of word is best')
Example #25
    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
        """
        Evaluate on the folds of a dataset split

        Parameters
        ----------
        recommender: The BaseRecommender instance
                The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
        Otherwise only the specified metric is returned.

        sampling_users:  float or sampling, optional, default = None
            If a float is passed, it is the percentage of evaluated
        users. If sampling_users is None, all users are used in the
        evaluation. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

        cv: integer or crossvalidation, optional, default = None
            If an integer is passed, it is the number of fold (default 3).
            Specific sampling objects can be passed, see
            scikits.crab.metrics.cross_validation module for the list of
            possible objects.

        at: integer, optional, default = None
            This number at is the 'at' value, as in 'precision at 5'.  For
        example this would mean precision or recall evaluated by removing
        the top 5 preferences for a user and then finding the percentage of
        those 5 items included in the top 5 recommendations for that user.
        If at is None, the top 3 elements will be considered.

        Returns
        -------
        score: dict
            a dictionary containing the average results over
            the different permutations on the split.

        permutation_scores : array, shape = [n_permutations]
            The scores obtained for each permutations.

        """
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. valid keywords \
              are %s' % (metric, evaluation_metrics.keys()))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        total_ratings = []
        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                 for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        #Defining the splits and run on the splits.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))

            #Evaluate the recommender.
            recommender_training = self._build_recommender(training_set, \
                                    recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.items():
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                                    user_id, item_id)
                        real_preferences.append(preference)
                    except:
                        # It is possible that an item exists in the test data
                        # but not in the training data, in which case an
                        # exception will be thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            #Return the error results.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                                metric: eval_function(real_preferences,
                                                 estimated_preferences,
                                recommender.model.maximum_preference_value(),
                                recommender.model.minimum_preference_value())})
                else:
                    permutation_scores_error.append(
                    {metric: eval_function(real_preferences,
                                       estimated_preferences)})
            elif metric is None:
                #Return all
                mae, nmae, rmse = evaluation_error(real_preferences,
                        estimated_preferences,
                        recommender.model.maximum_preference_value(),
                        recommender.model.minimum_preference_value())
                permutation_scores_error.append({'mae': mae, 'nmae': nmae,
                                                  'rmse': rmse})

        #IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # List some most-preferred items that would count as most
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0) for preference in preferences]

                preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
                relevant_item_ids = [item_id for item_id, preference
                                        in preferences[:at]]

                if len(relevant_item_ids) == 0:
                    continue

                #Build the training set.
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = \
                        recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [(preference, 1.0)
                                         for preference in preferences_other_user]
                    if other_user_id == user_id:
                        preferences_other_user = \
                            [pref for pref in preferences_other_user \
                                if pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = \
                                dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(preferences_other_user)

                #Evaluate the recommender
                recommender_training = self._build_recommender(training_set, \
                            recommender)

                try:
                    preferences = \
                        recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except:
                    #Excluded all prefs for the user. move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            #Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append({metric: eval_function(real_arrays,
                                          relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

        #Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

        #Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
Example #26
    def evaluate(self, recommender, metric=None, **kwargs):
        """
        Evaluates the predictor

        Parameters
        ----------
        recommender: The BaseRecommender instance
                The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
        Otherwise only the specified metric is returned.

        sampling_users:  float or sampling, optional, default = None
            If a float is passed, it is the percentage of evaluated
        users. If sampling_users is None, all users are used in the
        evaluation. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

        sampling_ratings:  float or sampling, optional, default = None
            If a float is passed, it is the percentage of evaluated
        ratings. If sampling_ratings is None, 70% will be used in the
        training set and 30% in the test set. Specific sampling objects
        can be passed, see scikits.crab.metrics.sampling module
        for the list of possible objects.

        at: integer, optional, default = None
            This number at is the 'at' value, as in 'precision at 5'.  For
        example this would mean precision or recall evaluated by removing
        the top 5 preferences for a user and then finding the percentage of
        those 5 items included in the top 5 recommendations for that user.
        If at is None, the top 3 elements will be considered.

        Returns
        -------
        Returns a dictionary containing the evaluation results:
        (NMAE, MAE, RMSE, Precision, Recall, F1-Score)

        """
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. valid keywords \
              are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                             len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                        permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict((preferences[idx]
                             for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx]
                             for idx in test_set] if preferences else []
            else:
                training_set[user_id] = dict(((preferences[idx], 1.0)
                             for idx in train_set)) if preferences else {}
                testing_set[user_id] = [(preferences[idx], 1.0)
                             for idx in test_set] if preferences else []

        #Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.iteritems():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                                user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception is
                    # thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        #Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {metric: eval_function(real_preferences,
                                          estimated_preferences,
                                recommender.model.maximum_preference_value(),
                                recommender.model.minimum_preference_value())}
            return {metric: eval_function(real_preferences,
                                          estimated_preferences)}

        #IR_Statistics
        relevant_arrays = []
        real_arrays = []

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # List the most-preferred items; these count as the relevant items
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [item_id for item_id, preference
                                    in preferences[:at]]

            if len(relevant_item_ids) == 0:
                continue

            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [(preference, 1.0)
                                     for preference in preferences_other_user]
                if other_user_id == user_id:
                    preferences_other_user = \
                        [pref for pref in preferences_other_user \
                            if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)

            #Evaluate the recommender
            recommender_training = self._build_recommender(training_set, \
                        recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except:
                # All prefs for this user were excluded; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        #Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}

        if metric is None:
            #Return all
            mae, nmae, rmse = evaluation_error(real_preferences,
                        estimated_preferences,
                        recommender.model.maximum_preference_value(),
                        recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                    'precision': p, 'recall': r, 'f1score': f}
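
The IR branch above compares, per user, the list of recommended items against the items held out as relevant. As a rough, self-contained illustration of what precision, recall and F1 over such paired lists could look like (the helper below is a sketch, not scikits.crab's implementation):

def ir_scores(recommended_lists, relevant_lists):
    # Illustrative micro-average: pool hits across users before dividing.
    hits = sum(len(set(rec) & set(rel))
               for rec, rel in zip(recommended_lists, relevant_lists))
    n_recommended = sum(len(rec) for rec in recommended_lists)
    n_relevant = sum(len(rel) for rel in relevant_lists)
    precision = hits / n_recommended if n_recommended else 0.0
    recall = hits / n_relevant if n_relevant else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1

# one user: 3 recommendations, 3 relevant items, 2 hits -> p = r = f1 = 2/3
print(ir_scores([['a', 'b', 'c']], [['a', 'c', 'd']]))
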
    # print(os.path.dirname(os.path.realpath(__file__)))
    parser = argparse.ArgumentParser("Read in data directory")
    parser.add_argument('data_dir')
    files, cls = fe.get_file_name_and_path(parser.parse_args().data_dir)

    train_X, train_Y, test_X, test_Y = train_test_split(files, cls, 0.25)

    nb = NaiveBayes()
    nb.fit(train_X, train_Y, 40)
    # lst = []
    # for i in range(1, 100):
    #     nb.fit(train_X, train_Y, i)
    #     y_pred = nb.predict_list(test_X)
    #     lst.append(f1_score(test_Y, y_pred))
    #     print(i,lst[-1])
    # do_plot(0, lst)

    with open('naive_bayes.pkl', 'wb') as save_learner:
        pickle.dump(nb, save_learner)
    # load_learner = open('naive_bayes.pkl', 'rb')
    # nb = pickle.load(load_learner)


    # incorrect = 0
    y_pred = nb.predict_list(test_X)
    error = 0
    for i in range(len(y_pred)):
        if y_pred[i] != test_Y[i]:
            error += 1
    print('Errors/Total:',error,'/',len(y_pred), 'F1: ', f1_score(test_Y, y_pred))
    # print("InCorrect:", incorrect, "Present:", incorrect / len(test_X))
Example #28
0
    def evaluate(self, recommender, metric=None, **kwargs):
        """
        Evaluates the predictor

        Parameters
        ----------
        recommender: The BaseRecommender instance
                The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
        Otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = None
            If a float is passed, it is the percentage of users to be
        evaluated. If sampling_users is None, all users are used in the
        evaluation. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

        sampling_ratings: float or sampling, optional, default = None
            If a float is passed, it is the percentage of ratings to be
        evaluated. If sampling_ratings is None, 70% will be used in the
        training set and 30% in the test set. Specific sampling objects
        can be passed, see scikits.crab.metrics.sampling module
        for the list of possible objects.

        at: integer, optional, default = None
            The 'at' value, as in 'precision at 5'. For example, this
        means precision or recall evaluated by removing the top 5
        preferences for a user and then finding the percentage of those
        5 items included in the top 5 recommendations for that user.
        If at is None, the top 3 elements are considered.

        Returns
        -------
        Returns a dictionary containing the evaluation results:
        (NMAE, MAE, RMSE, Precision, Recall, F1-Score)

        """
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                             len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                                      permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict(
                    (preferences[idx]
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx] for idx in test_set
                                        ] if preferences else []
            else:
                training_set[user_id] = dict(
                    ((preferences[idx], 1.0)
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [
                    (preferences[idx], 1.0) for idx in test_set
                ] if preferences else []

        #Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.iteritems():
            for item_id, preference in preferences:
                #Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception is
                    # thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        #Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {
                    metric:
                    eval_function(real_preferences, estimated_preferences,
                                  recommender.model.maximum_preference_value(),
                                  recommender.model.minimum_preference_value())
                }
            return {
                metric: eval_function(real_preferences, estimated_preferences)
            }

        #IR_Statistics
        relevant_arrays = []
        real_arrays = []

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # List the most-preferred items; these count as the relevant items
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [
                item_id for item_id, preference in preferences[:at]
            ]

            if len(relevant_item_ids) == 0:
                continue

            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [
                        (preference, 1.0)
                        for preference in preferences_other_user
                    ]
                if other_user_id == user_id:
                    preferences_other_user = \
                        [pref for pref in preferences_other_user \
                            if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)

            #Evaluate the recommender
            recommender_training = self._build_recommender(training_set, \
                        recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except:
                # All prefs for this user were excluded; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        #Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}

        if metric is None:
            #Return all
            mae, nmae, rmse = evaluation_error(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {
                'mae': mae,
                'nmae': nmae,
                'rmse': rmse,
                'precision': p,
                'recall': r,
                'f1score': f
            }
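
The 'nmae' branch above also passes the model's maximum and minimum preference values, which suggests the mean absolute error normalised by the rating range. A short sketch under that assumption (illustrative only, not crab's actual implementation):

def mae_nmae(real, estimated, max_pref, min_pref):
    # Assumed definition: NMAE = MAE / (max_pref - min_pref), so 0.25 on a
    # 1-5 scale corresponds to an average error of one rating point.
    mae = sum(abs(r - e) for r, e in zip(real, estimated)) / len(real)
    return mae, mae / (max_pref - min_pref)

print(mae_nmae([4.0, 2.0, 5.0], [3.0, 2.5, 5.0], 5.0, 1.0))  # (0.5, 0.125)
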
Example #29
0
    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
        """
        Evaluate on the folds of a dataset split

        Parameters
        ----------
        recommender: The BaseRecommender instance
                The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
        Otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = None
            If a float is passed, it is the percentage of users to be
        evaluated. If sampling_users is None, all users are used in the
        evaluation. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

        cv: integer or crossvalidation, optional, default = None
            If an integer is passed, it is the number of folds (default 3).
            Specific sampling objects can be passed, see
            scikits.crab.metrics.cross_validation module for the list of
            possible objects.

        at: integer, optional, default = None
            The 'at' value, as in 'precision at 5'. For example, this
        means precision or recall evaluated by removing the top 5
        preferences for a user and then finding the percentage of those
        5 items included in the top 5 recommendations for that user.
        If at is None, the top 3 elements are considered.

        Returns
        -------
        permutation_scores : dict
            The per-fold scores obtained for each permutation, keyed by
            'error' and 'ir'.

        scores: dict
            A dictionary containing the averages and standard deviations
            over the different permutations of the split.

        """
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        total_ratings = []
        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                  for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        #Defining the splits and run on the splits.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))

            #Evaluate the recommender.
            recommender_training = self._build_recommender(training_set, \
                                    recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.iteritems():
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id)
                        real_preferences.append(preference)
                    except:
                        # It is possible that an item exists in the test
                        # data but not in the training data, in which case
                        # an exception is thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            #Return the error results.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                        metric:
                        eval_function(
                            real_preferences, estimated_preferences,
                            recommender.model.maximum_preference_value(),
                            recommender.model.minimum_preference_value())
                    })
                else:
                    permutation_scores_error.append({
                        metric:
                        eval_function(real_preferences, estimated_preferences)
                    })
            elif metric is None:
                #Return all
                mae, nmae, rmse = evaluation_error(
                    real_preferences, estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
                permutation_scores_error.append({
                    'mae': mae,
                    'nmae': nmae,
                    'rmse': rmse
                })

        #IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # List the most-preferred items; these count as the relevant items
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0)
                                   for preference in preferences]

                preferences = sorted(preferences,
                                     key=lambda x: x[1],
                                     reverse=True)
                relevant_item_ids = [
                    item_id for item_id, preference in preferences[:at]
                ]

                if len(relevant_item_ids) == 0:
                    continue

                #Build the training set.
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = \
                        recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [
                            (preference, 1.0)
                            for preference in preferences_other_user
                        ]
                    if other_user_id == user_id:
                        preferences_other_user = \
                            [pref for pref in preferences_other_user \
                                if pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = \
                                dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(
                            preferences_other_user)

                #Evaluate the recommender
                recommender_training = self._build_recommender(training_set, \
                            recommender)

                try:
                    preferences = \
                        recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except:
                    # All prefs for this user were excluded; move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            #Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append(
                    {metric: eval_function(real_arrays, relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({
                    'precision': p,
                    'recall': r,
                    'f1score': f
                })

        #Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(
                final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(
                final_score_error['avg'][key])

        #Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
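
evaluate_on_split leans on check_cv to turn an integer cv into train/test index splits over the pooled (user_id, preference) ratings. A self-contained sketch of such a k-fold splitter (illustrative only; the real scikits.crab helper may shuffle or stratify differently):

def simple_kfold(n_samples, n_folds=3):
    # Yields (train_indices, test_indices); each sample lands in exactly
    # one test fold. Purely illustrative of what check_cv is expected to do.
    indices = list(range(n_samples))
    fold_size = n_samples // n_folds
    for k in range(n_folds):
        start = k * fold_size
        stop = n_samples if k == n_folds - 1 else start + fold_size
        yield indices[:start] + indices[stop:], indices[start:stop]

for train_idx, test_idx in simple_kfold(7, n_folds=3):
    print(len(train_idx), len(test_idx))  # 5 2 / 5 2 / 4 3
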
    def evaluate(self, recommender, metric=None, **kwargs):
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            # Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                           len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                                      permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict((preferences[idx]
                                              for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx]
                                        for idx in test_set] if preferences else []
            else:
                training_set[user_id] = dict(((preferences[idx], 1.0)
                                              for idx in train_set)) if preferences else {}
                testing_set[user_id] = [(preferences[idx], 1.0)
                                        for idx in test_set] if preferences else []

        # Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                                       recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.iteritems():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception is
                    # thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        # Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {metric: eval_function(real_preferences,
                                              estimated_preferences,
                                              recommender.model.maximum_preference_value(),
                                              recommender.model.minimum_preference_value())}
            return {metric: eval_function(real_preferences,
                                          estimated_preferences)}

        # IR_Statistics
        relevant_arrays = []
        real_arrays = []

        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # List the most-preferred items; these count as the relevant items
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [item_id for item_id, preference
                                 in preferences[:at]]

            if len(relevant_item_ids) == 0:
                continue

            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [(preference, 1.0)
                                              for preference in preferences_other_user]
                if other_user_id == user_id:
                    preferences_other_user = \
                        [pref for pref in preferences_other_user \
                         if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)

            # Evaluate the recommender
            recommender_training = self._build_recommender(training_set, \
                                                           recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except:
                # All prefs for this user were excluded; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        # Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}

        if metric is None:
            # Return all
            mae, nmae, rmse = evaluation_error(real_preferences,
                                               estimated_preferences,
                                               recommender.model.maximum_preference_value(),
                                               recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                    'precision': p, 'recall': r, 'f1score': f}
Example #31
0
def predict(loss_fn, model, data_set, data_loader, counting=False):
    """ Validate after training an epoch
    Note:
    """
    model.eval()

    true_positives = []
    predicted_positives = []
    possible_positives = []
    union_areas = []
    loss = []
    for bc_cnt, bc_data in enumerate(data_loader):
        if counting:
            print('%d/%d' % (bc_cnt, len(data_set) // data_loader.batch_size))
        imgs, masks, _ = bc_data
        imgs = Variable(imgs).cuda()
        masks = Variable(masks).cuda()
        # labels = Variable(labels).cuda()

        outputs = model(imgs)

        # outputs = outputs.view(-1, outputs.size()[2], outputs.size()[3])

        # print outputs.size(), masks.size()
        # if outputs.size() != masks.size():
        #     outputs = F.upsample(outputs, size=masks.size()[-2:], mode='bilinear')
        mask_loss = torch.zeros(1).cuda()
        for o in outputs:
            o = o.view(-1, o.size()[2], o.size()[3])

            mask_loss = mask_loss + float(loss_fn(o, masks))

        # mask_loss = mask_loss
        # loss = criterion(outputs, masks)

        loss.append(mask_loss)
        # loss.append(loss_fn(outputs, masks))
        # outputs = F.softmax(model(imgs), dim=1)
        # if outputs.size() != masks.size():
        #     outputs = F.upsample(outputs, size=masks.size()[-2:], mode='bilinear')
        #
        # _, outputs = torch.max(outputs, dim=1)
        output = outputs[-1]
        output = output.view(-1, output.size()[2], output.size()[3])

        output = output.cpu().data.numpy()
        # labels = labels.cpu().data.numpy()
        masks = masks.cpu().data.numpy()
        imgs = imgs.cpu().data.numpy()

        true_positive, predicted_positive, possible_positive, union_area = metrics_pred(
            output, imgs, masks)

        true_positives += true_positive
        predicted_positives += predicted_positive
        possible_positives += possible_positive
        union_areas += union_area
    precisions = precision(true_positives, predicted_positives)
    recalls = recall(true_positives, possible_positives)
    f1_scores = f1_score(recalls, precisions)
    loss = torch.tensor(loss)
    return precisions, recalls, f1_scores, loss.mean()
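
predict accumulates per-batch counts and only divides at the end, i.e. the reported precision, recall and F1 are micro-averaged over the whole loader. A sketch of helpers with that assumed behaviour (the real metrics_pred/precision/recall/f1_score functions may differ):

def micro_precision(true_positives, predicted_positives):
    # Micro-average: pool the per-image counts before dividing (assumption).
    tp, pp = sum(true_positives), sum(predicted_positives)
    return tp / pp if pp else 0.0

def micro_recall(true_positives, possible_positives):
    tp, ap = sum(true_positives), sum(possible_positives)
    return tp / ap if ap else 0.0

def micro_f1(recall_value, precision_value):
    # Mirrors the f1_score(recalls, precisions) argument order used above.
    s = recall_value + precision_value
    return 2 * recall_value * precision_value / s if s else 0.0

# two images: 30/50 and 45/60 predicted pixels correct, 40 and 50 true pixels
p = micro_precision([30, 45], [50, 60])
r = micro_recall([30, 45], [40, 50])
print(p, r, micro_f1(r, p))
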
Example #32
0
    sents = sents[idx]
    labs = labs[idx]
    loss = model.neg_log_likelihood(sents, labs, lens)
    loss.backward()
    optimizer.step()
    score, preds = model(sents, lens)
    true_labs = [
        seqid2text(labs[i, :l], ix_to_lab)
        for i, l in enumerate(lens)
    ]
    pred_labs = [
        seqid2text(preds[i, :l], ix_to_lab)
        for i, l in enumerate(lens)
    ]
    acc = accuracy_score(true_labs, pred_labs)
    f1 = f1_score(true_labs, pred_labs)
    print(
        "Epoch {}, batch {}, train loss {:.4f}, train acc {:.4f}, train f1 {:.4f} "
        .format(epoch, i, loss.item(), acc, f1))
    if ((i + 1) % 50 == 0):
        with torch.no_grad():
            model.eval()
            print("Evaluation on validation set")
            true_labels = []
            pred_labels = []
            for batch in val_data_loader:
                sents, labs, lens = batch
                sents = pad_sequence(sents,
                                     batch_first=True).to(device)
                labs = pad_sequence(labs,
                                    batch_first=True).to(device)
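
The fragment trims each padded row back to its true length (labs[i, :l]) before turning label ids into tag strings for scoring. A sketch of that decoding step (the mapping and helper below are assumptions, mirroring but not reproducing the project's seqid2text):

def seqid2text_sketch(id_row, ix_to_lab):
    # Assumed behaviour of seqid2text: map each label id in an already
    # length-trimmed row to its tag string.
    return [ix_to_lab[int(ix)] for ix in id_row]

ix_to_lab = {0: 'O', 1: 'B-PER', 2: 'I-PER'}
print(seqid2text_sketch([1, 2, 0], ix_to_lab))  # ['B-PER', 'I-PER', 'O']
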
Example #33
0
                             descriptions,
                             split_sentences=False,
                             transform_labels=False)
    model = 'sbw'

    #idf = create_decs_embeddings() # Run just in case you don't have decs_mix decs_sbw and idf json files
    with open('../embeddings/idf.json') as f:
        idf = json.load(f)

    dev_sbw_similarity = similarity(x_dev, model, idf)

    result = np.apply_along_axis(top_k_values, 1, dev_sbw_similarity, 100)
    create_json(dev['id'], result, descriptions, model)

    with open(f'../embeddings/{model}_predictions.json') as json_file:
        data = json.load(json_file)

    pred = []
    for doc in data['documents']:
        pred.append(doc['labels'])

    real = dev["decsCodes"]
    assert (len(real) == len(pred))
    tp, fn, fp, p, r, f1 = f1_score(real, pred)
    print(f'TP: {tp}')
    print(f'FN: {fn}')
    print(f'FP: {fp}')
    print(f'Precision: {p}')
    print(f'Recall: {r}')
    print(f'F1-Score: {f1}')
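
Here f1_score returns raw counts alongside the scores, and both real and pred hold one collection of DeCS codes per document. A sketch of a set-based, micro-averaged variant with that return signature (an assumption about the project-local helper, not a standard library call):

def multilabel_f1(real, pred):
    # Assumed semantics: micro-averaged counts over per-document label sets,
    # returned together with precision, recall and F1.
    tp = fn = fp = 0
    for true_labels, pred_labels in zip(real, pred):
        true_set, pred_set = set(true_labels), set(pred_labels)
        tp += len(true_set & pred_set)
        fn += len(true_set - pred_set)
        fp += len(pred_set - true_set)
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return tp, fn, fp, p, r, f1

print(multilabel_f1([['D001', 'D002']], [['D002', 'D003', 'D004']]))
# (1, 1, 2, 0.333..., 0.5, 0.4)
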
    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        total_ratings = []
        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            # Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                  for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        # Defining the splits and run on the splits.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))

            # Evaluate the recommender.
            recommender_training = self._build_recommender(training_set, \
                                                           recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.iteritems():
                for item_id, preference in preferences:
                    # Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id)
                        real_preferences.append(preference)
                    except:
                        # It is possible that an item exists in the test
                        # data but not in the training data, in which case
                        # an exception is thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            # Return the error results.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                        metric: eval_function(real_preferences,
                                              estimated_preferences,
                                              recommender.model.maximum_preference_value(),
                                              recommender.model.minimum_preference_value())})
                else:
                    permutation_scores_error.append(
                        {metric: eval_function(real_preferences,
                                               estimated_preferences)})
            elif metric is None:
                # Return all
                mae, nmae, rmse = evaluation_error(real_preferences,
                                                   estimated_preferences,
                                                   recommender.model.maximum_preference_value(),
                                                   recommender.model.minimum_preference_value())
                permutation_scores_error.append({'mae': mae, 'nmae': nmae,
                                                 'rmse': rmse})

        # IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # List the most-preferred items; these count as the relevant items
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0) for preference in preferences]

                preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
                relevant_item_ids = [item_id for item_id, preference
                                     in preferences[:at]]

                if len(relevant_item_ids) == 0:
                    continue

                # Build the training set.
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [(preference, 1.0) for preference in preferences_other_user]
                    if other_user_id == user_id:
                        preferences_other_user = [pref for pref in preferences_other_user if
                                                  pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(preferences_other_user)

                # Evaluate the recommender
                recommender_training = self._build_recommender(training_set, recommender)

                try:
                    preferences = recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except:
                    # All prefs for this user were excluded; move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            # Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append({metric: eval_function(real_arrays, relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

        # Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

        # Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
Example #35
0
    def test(self):
        self.model.eval()
        batch_loss_history = []
        n_total_words = 0
        n_sentences = 0
        f1_total = []
        for batch_i, (conversations, conversation_length,
                      sentence_length) in enumerate(
                          tqdm(self.test_data_loader, ncols=80)):
            # conversations: (batch_size) list of conversations
            #   conversation: list of sentences
            #   sentence: list of tokens
            # conversation_length: list of int
            # sentence_length: (batch_size) list of conversation list of sentence_lengths

            input_conversations = [conv[:-1] for conv in conversations]
            target_conversations = [conv[1:] for conv in conversations]

            # flatten input and target conversations
            input_sentences = [
                sent for conv in input_conversations for sent in conv
            ]
            target_sentences = [
                sent for conv in target_conversations for sent in conv
            ]
            input_sentence_length = [
                l for len_list in sentence_length for l in len_list[:-1]
            ]
            target_sentence_length = [
                l for len_list in sentence_length for l in len_list[1:]
            ]
            input_conversation_length = [l - 1 for l in conversation_length]

            with torch.no_grad():
                input_sentences = to_var(torch.LongTensor(input_sentences))
                target_sentences = to_var(torch.LongTensor(target_sentences))
                input_sentence_length = to_var(
                    torch.LongTensor(input_sentence_length))
                target_sentence_length = to_var(
                    torch.LongTensor(target_sentence_length))
                input_conversation_length = to_var(
                    torch.LongTensor(input_conversation_length))

            if batch_i == 0:
                self.generate_sentence(input_sentences, input_sentence_length,
                                       input_conversation_length,
                                       target_sentences)

            generated_sentences = self.generate_conversations_with_gold_responses(
                input_sentences, input_sentence_length,
                input_conversation_length, target_sentences)
            conv_f1 = 0
            for target_sent, output_sent in zip(target_sentences,
                                                generated_sentences):
                target_sent = self.vocab.decode(target_sent)
                output_sent = self.vocab.decode(output_sent)
                f1 = metrics.f1_score(output_sent, target_sent)
                conv_f1 += f1
            conv_f1 = conv_f1 / target_sentences.shape[0]

            sentence_logits = self.model(input_sentences,
                                         input_sentence_length,
                                         input_conversation_length,
                                         target_sentences)

            batch_loss, n_words = masked_cross_entropy(sentence_logits,
                                                       target_sentences,
                                                       target_sentence_length)

            assert not isnan(batch_loss.item())
            batch_loss_history.append(batch_loss.item())
            n_total_words += n_words.item()
            f1_total.append(conv_f1)
            n_sentences += target_sentences.shape[0]

        epoch_loss = np.sum(batch_loss_history) / n_total_words
        f1_average = np.sum(f1_total) / n_sentences

        print(f'Number of words: {n_total_words}')
        print(f'Bits per word: {epoch_loss:.3f}')
        word_perplexity = np.exp(epoch_loss)

        return word_perplexity, f1_average
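
metrics.f1_score is applied to two decoded sentences, which points to a token-overlap style F1 between the generated and the gold response. A sketch under that assumption (whitespace tokenisation and multiset overlap are choices made here, not taken from the project):

from collections import Counter

def sentence_f1(prediction, ground_truth):
    # Assumed behaviour: F1 over the multiset of whitespace-separated tokens.
    pred_tokens = prediction.split()
    gold_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

print(sentence_f1("i am fine thanks", "i am fine thank you"))  # ~0.667
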
# AUC on the test set
probs_test= lr_model.predict_proba(X_test)  
predict_test = lr_model.predict(X_test)
AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1])
print("Test Auc: %s"%(AUC2))

# Accuracy
accuracy = metrics.accuracy_score(Y_test, predict_test) 
print("Test Accuracy: %s"%(accuracy))

# Recall
recall = metrics.recall_score(Y_test, predict_test) 
print("Test Recall: %s"%(recall))

# F1 score
f1 = metrics.f1_score(Y_test, predict_test) 
print("Test F1: %s"%(f1))

# 3.4 Print the model parameters
w = lr_model.coef_
print("Parameter shape:")
print(w.shape)
print("First 10 parameters:")
print(lr_model.coef_[:, 0:10])
print("Intercept:")
print(lr_model.intercept_)
print("Sparse feature ratio: %.2f%%" % (np.mean(lr_model.coef_.ravel() == 0) * 100))
print("Values after the sigmoid transform, i.e. the predicted probability p:")
print(lr_model.predict_proba(X_test[0:5]))
Example #37
0
    def evaluating(self, model, dataset, split):
        """
          input:
            model: (object) pytorch model
            dataset: (object) dataset
            split: (str) split of dataset in ['train', 'val', 'test']
          return [overall_accuracy, precision, recall, f1-score, jaccard, kappa]
        """
        args = self.args
        oa, precision, recall, f1, jac, kappa = 0, 0, 0, 0, 0, 0
        model.eval()
        data_loader = DataLoader(dataset,
                                 args.batch_size,
                                 num_workers=4,
                                 shuffle=False)
        batch_iterator = iter(data_loader)
        steps = len(dataset) // args.batch_size

        start = time.time()
        for step in range(steps):
            x, y = next(batch_iterator)
            x = Variable(x, volatile=True)
            y = Variable(y, volatile=True)
            if args.cuda:
                x = x.cuda()
                y = y.cuda()
            # calculate pixel accuracy of generator
            gen_y = model(x)
            if self.is_multi:
                gen_y = gen_y[0]
            oa += metrics.overall_accuracy(gen_y.data, y.data)
            precision += metrics.precision(gen_y.data, y.data)
            recall += metrics.recall(gen_y.data, y.data)
            f1 += metrics.f1_score(gen_y.data, y.data)
            jac += metrics.jaccard(gen_y.data, y.data)
            kappa += metrics.kappa(gen_y.data, y.data)

        _time = time.time() - start

        if not os.path.exists(os.path.join(Logs_DIR, 'statistic')):
            os.makedirs(os.path.join(Logs_DIR, 'statistic'))

        # recording performance of the model
        nb_samples = steps * args.batch_size
        basic_info = [
            self.date, self.method, self.epoch, self.iter, nb_samples, _time
        ]
        basic_info_names = [
            'date', 'method', 'epochs', 'iters', 'nb_samples', 'time(sec)'
        ]

        perform = [
            round(idx / steps, 3)
            for idx in [oa, precision, recall, f1, jac, kappa]
        ]
        perform_names = [
            "overall_accuracy", "precision", "recall", "f1-score", "jaccard",
            "kappa"
        ]
        cur_log = pd.DataFrame([basic_info + perform],
                               columns=basic_info_names + perform_names)
        # save performance
        if os.path.exists(
                os.path.join(Logs_DIR, 'statistic', "{}.csv".format(split))):
            logs = pd.read_csv(
                os.path.join(Logs_DIR, 'statistic', "{}.csv".format(split)))
        else:
            logs = pd.DataFrame([])
        logs = logs.append(cur_log, ignore_index=True)
        logs.to_csv(os.path.join(Logs_DIR, 'statistic',
                                 "{}.csv".format(split)),
                    index=False,
                    float_format='%.3f')
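
evaluating sums each metric over the batches and divides by steps at the end, i.e. it reports a simple mean of per-batch scores. Two of those per-batch metrics sketched for binary masks (assumed definitions; the project's metrics module may threshold and reduce differently):

import numpy as np

def overall_accuracy_sketch(pred_mask, true_mask):
    # Fraction of pixels whose binary prediction matches the ground truth.
    return float((pred_mask == true_mask).mean())

def jaccard_sketch(pred_mask, true_mask):
    # Intersection over union of the positive class.
    intersection = np.logical_and(pred_mask, true_mask).sum()
    union = np.logical_or(pred_mask, true_mask).sum()
    return float(intersection / union) if union else 1.0

pred = np.array([[1, 1, 0], [0, 1, 0]])
true = np.array([[1, 0, 0], [0, 1, 1]])
print(overall_accuracy_sketch(pred, true), jaccard_sketch(pred, true))  # 0.666..., 0.5
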
Example #38
0
def evaluate(args, model, eval_dataloader, params):
    model.eval()
    # track the running average loss
    loss_avg = utils.RunningAverage()
    # init
    pre_result = []
    gold_result = []

    # get data
    for batch in tqdm(eval_dataloader, unit='Batch', ascii=True):
        # fetch the next training batch
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, start_pos, end_pos, _, _ = batch

        with torch.no_grad():
            # get loss
            loss = model(input_ids, attention_mask=input_mask,
                         start_positions=start_pos, end_positions=end_pos)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # update the average loss
            loss_avg.update(loss.item())

            # inference
            start_pre, end_pre = model(input_ids=input_ids, attention_mask=input_mask)

        # gold label
        start_pos = start_pos.to("cpu").numpy().transpose((0, 2, 1)).tolist()  # (batch_size, tag_size, seq_len)
        end_pos = end_pos.to("cpu").numpy().transpose((0, 2, 1)).tolist()
        input_mask = input_mask.to('cpu').numpy().tolist()

        # predict label
        start_label = start_pre.detach().cpu().numpy().transpose((0, 2, 1)).tolist()
        end_label = end_pre.detach().cpu().numpy().transpose((0, 2, 1)).tolist()

        # idx to label
        cate_idx2label = {idx: str(idx + 1) for idx, _ in enumerate(params.label_list)}

        # get bio result
        for start_p_s, end_p_s, start_g_s, end_g_s, input_mask_s in zip(start_label, end_label,
                                                                        start_pos, end_pos, input_mask):
            # effective (unpadded) sequence length
            act_len = sum(input_mask_s)
            for idx, (start_p, end_p, start_g, end_g) in enumerate(zip(start_p_s,
                                                                       end_p_s, start_g_s, end_g_s)):
                pre_bio_labels = pointer2bio(start_p[:act_len], end_p[:act_len],
                                             ne_cate=cate_idx2label[idx])
                gold_bio_labels = pointer2bio(start_g[:act_len], end_g[:act_len],
                                              ne_cate=cate_idx2label[idx])
                pre_result.append(pre_bio_labels)
                gold_result.append(gold_bio_labels)

    # metrics
    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    # f1, acc
    metrics = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format('Val') + metrics_str)
    # f1 classification report
    report = classification_report(y_true=gold_result, y_pred=pre_result)
    logging.info(report)

    return metrics
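
pointer2bio converts a pair of start/end indicator vectors for one entity category back into a BIO tag sequence before the span-level F1 is computed. One plausible, deliberately simple sketch of that conversion (an assumption about the helper, not its actual code):

def pointer2bio_sketch(start_labels, end_labels, ne_cate):
    # Assumed behaviour: for every start position, find the nearest end
    # position at or after it and emit B-/I- tags; everything else stays 'O'.
    tags = ['O'] * len(start_labels)
    for i, is_start in enumerate(start_labels):
        if not is_start:
            continue
        for j in range(i, len(end_labels)):
            if end_labels[j]:
                tags[i] = 'B-' + ne_cate
                for k in range(i + 1, j + 1):
                    tags[k] = 'I-' + ne_cate
                break
    return tags

print(pointer2bio_sketch([0, 1, 0, 0], [0, 0, 1, 0], ne_cate='1'))
# ['O', 'B-1', 'I-1', 'O']
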