def test_score(self):
    """Fit the hierarchical decision tree on half of the shades data and report on the other half."""
    hierarchy = hmcdatasets.load_shades_class_hierachy()
    features, labels = hmcdatasets.load_shades_data()
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.50, random_state=0)

    classifier = hmc.DecisionTreeHierarchicalClassifier(hierarchy).fit(
        X_train, y_train)
    predictions = pd.DataFrame(classifier.predict(X_test))

    # The report's return value is not inspected here; the call only has to succeed.
    metrics.classification_report(hierarchy, y_test, predictions)
def eval(dataset):
    """Score the out-of-scope detector (encoder ``E`` + head ``pos``) on `dataset`.

    Every batch is pushed through the module-level encoder ``E`` and head
    ``pos`` without gradient tracking; cross-entropy loss and the binary
    (out-of-scope vs. in-scope) metrics are then computed over the whole set.

    Args:
        dataset: Object accepted by ``DataLoader`` whose ``.dataset`` attribute
            is a 2-D array; labels are read from column ``-4`` and label ``0``
            is treated as out-of-scope.

    Returns:
        dict with keys ``detection_loss``, ``eer``,
        ``all_detection_binary_preds``, ``detection_acc``, ``all_binary_y``,
        ``oos_ind_precision``, ``oos_ind_recall``, ``oos_ind_f_score``,
        ``y_score`` and ``auc``.
    """
    dev_dataloader = DataLoader(dataset,
                                batch_size=args.predict_batch_size,
                                shuffle=False,
                                num_workers=2)
    # Kept under its own name so the criterion is never overwritten by its output.
    criterion = torch.nn.CrossEntropyLoss().to(device)

    pos.eval()
    E.eval()

    batch_logits = []
    batch_preds = []
    with torch.no_grad():
        for sample in tqdm(dev_dataloader):
            token, mask, type_ids, pos1, pos2, _pos_mask, _y = (
                t.to(device) for t in sample)
            # The encoder's pooled output is the sentence feature fed to the head.
            _, pooled_output = E(token, mask, type_ids)
            out = pos(pos1, pos2, pooled_output)
            batch_logits.append(out)
            batch_preds.append(torch.argmax(out, 1))

    all_y = LongTensor(dataset.dataset[:, -4].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos
    all_detection_preds = torch.cat(batch_preds, 0).cpu()
    all_detection_logit = torch.cat(batch_logits, 0).cpu()

    result = dict()
    # Loss over the full evaluation set, against the binary labels.
    result['detection_loss'] = criterion(all_detection_logit, all_binary_y)

    logger.info(
        metrics.classification_report(all_binary_y,
                                      all_detection_preds,
                                      target_names=['oos', 'in']))

    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
        all_detection_preds, all_binary_y)
    detection_acc = metrics.accuracy(all_detection_preds, all_binary_y)

    # Probability of class 1 is used as the ranking score.
    y_score = all_detection_logit.softmax(1)[:, 1].tolist()
    eer = metrics.cal_eer(all_binary_y, y_score)

    result['eer'] = eer
    result['all_detection_binary_preds'] = all_detection_preds
    result['detection_acc'] = detection_acc
    result['all_binary_y'] = all_binary_y
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    result['y_score'] = y_score
    result['auc'] = roc_auc_score(all_binary_y, y_score)
    return result
def test(dataset):
    """Evaluate the saved classifier on `dataset` and collect test metrics.

    Loads weights into the module-level ``model`` from
    ``config['model_save_path']``, runs it over `dataset` without gradient
    tracking and computes the report values. Also writes the labels,
    predictions and scores into the module-level ``freeze_data``.

    Args:
        dataset: Object accepted by ``DataLoader`` whose ``.dataset`` attribute
            is a 2-D array with the label in the last column (label ``0`` is
            treated as out-of-scope).

    Returns:
        dict holding the entries of ``metrics.classification_report(...,
        output_dict=True)`` plus ``eer``, ``ind_class_acc``, ``loss`` (summed
        batch loss divided by the number of batches), ``all_y``, ``all_pred``,
        ``oos_ind_precision``, ``oos_ind_recall``, ``oos_ind_f_score``,
        ``auc``, ``y_score`` and ``all_binary_y``.
    """
    load_model(model, path=config['model_save_path'], model_name='bert')

    test_dataloader = DataLoader(dataset,
                                 batch_size=args.predict_batch_size,
                                 shuffle=False,
                                 num_workers=2)
    # len() of a DataLoader is the number of batches, not samples.
    n_sample = len(test_dataloader)

    model.eval()
    criterion = torch.nn.CrossEntropyLoss().to(device)

    batch_preds = []
    batch_logits = []
    total_loss = 0
    with torch.no_grad():
        for sample in tqdm.tqdm(test_dataloader):
            token, mask, type_ids, y = (t.to(device) for t in sample)
            logit = model(token, mask, type_ids)
            batch_logits.append(logit)
            batch_preds.append(torch.argmax(logit, 1))
            total_loss += criterion(logit, y.long())

    all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos
    all_pred = torch.cat(batch_preds, 0).cpu()
    all_logit = torch.cat(batch_logits, 0).cpu()

    ind_class_acc = metrics.ind_class_accuracy(all_pred, all_y)
    report = metrics.classification_report(all_y, all_pred, output_dict=True)
    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
        all_pred, all_binary_y)

    result = dict()
    result.update(report)

    # EER is only meaningful in the binary-classification setting.
    y_score = all_logit.softmax(1)[:, 1].tolist()
    eer = metrics.cal_eer(all_binary_y, y_score)

    result['eer'] = eer
    result['ind_class_acc'] = ind_class_acc
    result['loss'] = total_loss / n_sample
    result['all_y'] = all_y.tolist()
    result['all_pred'] = all_pred.tolist()
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    result['auc'] = roc_auc_score(all_binary_y, y_score)
    result['y_score'] = y_score
    result['all_binary_y'] = all_binary_y

    freeze_data['test_all_y'] = all_y.tolist()
    freeze_data['test_all_pred'] = all_pred.tolist()
    freeze_data['test_score'] = y_score
    return result
def evaluate(val_dataset):
    """Reset the validation confusion matrix, run ``eval_step`` on every batch and return the report."""
    val_confusion_matrix.reset_states()
    for speaker, utterance, emotion in val_dataset:
        eval_step(speaker, utterance, emotion)
    report = metrics.classification_report(val_confusion_matrix)
    return report
def incremental_evaluate(sess, model, minibatch_iter, size, test=False):
    """Evaluate `model` minibatch by minibatch until the iterator reports it is finished.

    Args:
        sess: Session whose ``run`` evaluates ``model.preds`` and ``model.loss``.
        model: Object exposing ``preds`` and ``loss``.
        minibatch_iter: Iterator providing ``incremental_node_val_feed_dict``.
        size: Batch size forwarded to the iterator.
        test: Forwarded to the iterator as ``test=``.

    Returns:
        Tuple ``(mean loss, f1_scores[0], f1_scores[1], report, elapsed seconds)``.
    """
    started = time.time()
    batch_losses = []
    batch_preds = []
    batch_targets = []

    step = 0
    while True:
        feed, targets, done, _ = minibatch_iter.incremental_node_val_feed_dict(
            size, step, test=test)
        outputs = sess.run([model.preds, model.loss], feed_dict=feed)
        batch_preds.append(outputs[0])
        batch_targets.append(targets)
        batch_losses.append(outputs[1])
        step += 1
        if done:
            break

    # TODO: move into the model
    stacked_preds = np.vstack(batch_preds)
    stacked_targets = np.vstack(batch_targets)
    f1_scores = calc_f1(stacked_targets, stacked_preds)
    report = classification_report(stacked_targets, stacked_preds)

    mean_loss = np.mean(batch_losses)
    elapsed = time.time() - started
    return mean_loss, f1_scores[0], f1_scores[1], report, elapsed
def test(classifier,x_test,y_test):
    """Predict on `x_test` and print the confusion matrix, classification report and accuracy."""
    predicted = classifier.predict(x_test)

    print("Confusion Matrix for Decision tree Classofier Model given : ")
    matrix = confusion_matrix(y_test, predicted)
    print(matrix)

    print("Classification Report for given Decision tree Classisifer : ")
    report = classification_report(y_test, predicted)
    print(report)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy Score : ", accuracy)
    return None
def evaluate(model, data_iterator, params, mark='Eval', verbose=True):
    """Evaluate `model` on every batch produced by `data_iterator`.

    Args:
        model: Called once with ``labels=`` to obtain the loss and once without
            to obtain per-sequence predicted tag indices.
        data_iterator: Iterable of ``(input_ids, input_mask, labels)`` batches.
        params: Provides ``tags`` (index -> tag order) and ``device``.
        mark: Prefix used in the logged metrics line.
        verbose: When true, also log the classification report.

    Returns:
        dict with ``loss`` (running average of per-batch loss divided by batch
        size), ``f1`` and ``accuracy``.
    """
    model.eval()

    idx2tag = dict(enumerate(params.tags))
    true_tags = []
    pred_tags = []
    loss_avg = utils.RunningAverage()

    for input_ids, input_mask, labels in data_iterator:
        input_ids = input_ids.to(params.device)
        input_mask = input_mask.to(params.device)
        labels = labels.to(params.device)
        batch_size = labels.size(0)
        attention_mask = input_mask.bool()

        # Both passes run without gradient tracking: nothing is back-propagated
        # during evaluation, so there is no reason to build an autograd graph.
        with torch.no_grad():
            loss = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = loss / batch_size
            batch_output = model(input_ids, attention_mask=attention_mask)
        loss_avg.update(loss.item())

        # Trim each gold sequence to its unmasked length so it lines up with
        # the predictions (checked by the assertion below).
        real_batch_tags = [
            label_row[:int(mask_row.sum())].to('cpu').numpy()
            for label_row, mask_row in zip(labels, input_mask)
        ]

        pred_tags.extend([idx2tag.get(idx) for indices in batch_output for idx in indices])
        true_tags.extend([idx2tag.get(idx) for indices in real_batch_tags for idx in indices])

    # sanity check
    assert len(pred_tags) == len(true_tags), 'len(pred_tags) is not equal to len(true_tags)!'

    metrics = {}
    f1 = f1_score(true_tags, pred_tags)
    accuracy = accuracy_score(true_tags, pred_tags)
    metrics['loss'] = loss_avg()
    metrics['f1'] = f1
    metrics['accuracy'] = accuracy
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format(mark) + metrics_str)

    if verbose:
        report = classification_report(true_tags, pred_tags)
        logging.info(report)
    return metrics
def evaluate(model, data_iterator, params, mark='Test', verbose=False):
    """Evaluate `model` on ``params.eval_steps`` batches drawn from `data_iterator`.

    Args:
        model: Called with ``labels=`` for the loss and without for the logits;
            element ``[0]`` of each return value is used.
        data_iterator: Iterator yielding ``(batch_data, batch_tags)``.
        params: Provides ``idx2tag``, ``eval_steps``, ``n_gpu`` and ``multi_gpu``.
        mark: Prefix used in the logged metrics line.
        verbose: When true, also log the classification report.

    Returns:
        dict with ``loss`` (running average) and ``f1``.
    """
    # Function-scope import: the original block did not reference ``torch`` by name.
    import torch

    model.eval()

    idx2tag = params.idx2tag
    true_tags = []
    pred_tags = []
    loss_avg = utils.RunningAverage()

    for _ in range(params.eval_steps):
        batch_data, batch_tags = next(data_iterator)
        # Positions with id > 0 are attended to.
        batch_masks = batch_data.gt(0)

        # Evaluation never back-propagates, so skip autograd bookkeeping.
        with torch.no_grad():
            loss = model(batch_data,
                         token_type_ids=None,
                         attention_mask=batch_masks,
                         labels=batch_tags)[0]
            batch_output = model(batch_data,
                                 token_type_ids=None,
                                 attention_mask=batch_masks)[0]

        if params.n_gpu > 1 and params.multi_gpu:
            loss = loss.mean()
        loss_avg.update(loss.item())

        predicted_ids = np.argmax(batch_output.detach().cpu().numpy(), axis=2)
        gold_ids = batch_tags.to('cpu').numpy()

        pred_tags.extend(
            [idx2tag.get(idx) for indices in predicted_ids for idx in indices])
        true_tags.extend(
            [idx2tag.get(idx) for indices in gold_ids for idx in indices])

    assert len(pred_tags) == len(true_tags)

    metrics = {}
    f1 = f1_score(true_tags, pred_tags)
    metrics['loss'] = loss_avg()
    metrics['f1'] = f1
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format(mark) + metrics_str)

    if verbose:
        report = classification_report(true_tags, pred_tags)
        logging.info(report)
    return metrics
def test_ffnn():
    """Train the feed-forward network with a fixed configuration and print its test-set scores."""
    params = dict(
        n_layers=4,
        hidden_nodes=[512, 512, 512, 512],
        epochs=10,
        use_dynamic_features=True,
        use_mspec=False,
        as_mat=False,
        speaker_norm=False,
        context_length=17,
    )

    network = FFNN(params)
    trained = network.train_model()
    network.set_model(trained)

    targets, predicted = network.predict_on_test()
    print("FFNN RESULTS")
    for score in (get_f1_score, get_accuracy, classification_report):
        print(score(targets, predicted))
def test_rnn():
    """Train the RNN with a fixed configuration, print its test-set scores and save it.

    Note that ``as_mat`` is true for this network.
    """
    params = dict(
        n_layers=2,
        hidden_nodes=[32, 32],
        epochs=100,
        use_dynamic_features=True,
        use_mspec=True,
        as_mat=True,
        speaker_norm=False,
        context_length=35,
    )

    network = RNN(params)
    # NOTE(review): 'unroll' is not set in the dict above; this lookup only
    # succeeds if RNN(params) adds the key -- confirm, otherwise it raises KeyError.
    trained = network.train_model(params['unroll'])
    network.set_model(trained)

    targets, predicted = network.predict_on_test()
    print("RNN RESULTS")
    for score in (get_f1_score, get_accuracy, classification_report):
        print(score(targets, predicted))

    # NOTE(review): the file name says 64-64 while hidden_nodes is [32, 32] -- confirm.
    trained.save('rnn-64-64-context-35.h5')
def evaluate(test_dataset):
    """Accumulate a confusion matrix over `test_dataset` and return its classification report.

    Args:
        test_dataset: Iterable of ``(speaker, utterance, emotion)`` batches.
            The speaker tensor is not used by the evaluation.

    Returns:
        Whatever ``metrics.classification_report`` returns for the accumulated
        confusion matrix.
    """
    confusion_matrix = metrics.ConfusionMatrix(model_config.n_classes)

    for _speaker, utterance, emotion in test_dataset:
        emotion = tf.squeeze(emotion)  # (batch_size, dial_len)

        # Token id 0 is treated as padding.
        mask = tf.cast(tf.math.not_equal(utterance, 0), dtype=tf.float32)
        encoded = encode_utterance(utterance)
        predictions = model(encoded, False, mask)  # (batch_size, dial_len, n_classes)

        # An utterance whose mask sums to zero is entirely padding and gets weight 0.
        has_tokens = tf.math.not_equal(tf.math.reduce_sum(mask, axis=2), 0)
        sample_weight = tf.cast(has_tokens, dtype=tf.float32)

        pred_emotion = tf.math.argmax(predictions, axis=2)
        confusion_matrix(emotion, pred_emotion, sample_weight=sample_weight)

    return metrics.classification_report(confusion_matrix)
def main():
    """Train the VGG network on ``data/train`` and print evaluation metrics for ``data/test``."""
    # Environment information: GPU device name, then TensorFlow and Keras versions.
    print(tf.test.gpu_device_name())
    print(tf.VERSION)
    print(tf.keras.__version__)

    train_images, train_labels = data_prep.read_images("data/train", IMAGE_DIMS, general=True)
    print("Train Set loaded")
    test_images, test_labels = data_prep.read_images("data/test", IMAGE_DIMS, general=True)
    print("Test Set loaded")

    # The same binarizer instance is used for both splits.
    lb = LabelBinarizer()
    X_train, y_train = data_prep.binarize(train_images, train_labels, lb)
    X_test, y_test = data_prep.binarize(test_images, test_labels, lb)
    print(X_train.shape)
    print(y_train.shape)

    model = VGGNet.vgg_net(N_CLASSES, IMAGE_DIMS)

    # Train the model and time it.
    started = time.time()
    history = VGGNet.fit(model, X_train, y_train, X_test, y_test, EPOCHS, BS)
    finished = time.time()
    print("Training Time: " + timer(started, finished))

    # Metrics
    metrics.plot_evaluation(history)
    y_pred = model.predict(X_test)

    report = metrics.classification_report(y_test, y_pred, lb)
    print("Classification report : \n", report)

    matrix = metrics.confusion_matrix(y_test, y_pred)
    print("Confusion Matrix : \n", matrix)

    metrics.print_confusion_matrix(y_test, y_pred, lb)
class_to_index, subtitles, section=[0, 11]) # %% metrics.worst_samples(imgs_valid, labels_valid, scores_predict, class_to_index, top=16, names_valid=None) # %% metrics.classification_report(labels_valid, label_predict, class_to_index) # %% metrics.confusion_matrix(labels_valid, label_predict, class_to_index) # %% metrics.ROC(labels_valid, scores_predict, class_to_index, section=[0, 5, 11]) #%% metrics.ROCCompare(labels_valids, scores_predicts, class_to_index, subtitles,
def test(dataset):
    """Evaluate the saved detector on `dataset` using its discriminator output.

    Loads weights into the module-level ``model`` from
    ``config['model_save_path']``, collects the discriminator score of every
    sample and derives the binary (out-of-scope vs. in-scope) metrics. Also
    writes labels, predictions and scores into the module-level
    ``freeze_data``.

    Args:
        dataset: Object accepted by ``DataLoader`` whose ``.dataset`` attribute
            is a 2-D array with the label in the last column (label ``0`` is
            treated as out-of-scope).

    Returns:
        dict with ``detection_loss``, optionally ``all_features`` (when
        ``args.do_vis`` is set), ``ind_class_acc``, ``loss``, ``eer``,
        ``all_detection_binary_preds``, ``detection_acc``, ``all_binary_y``,
        ``all_y``, ``oos_ind_precision``, ``oos_ind_recall``,
        ``oos_ind_f_score``, ``score``, ``y_score``, ``all_pred`` and ``auc``.
    """
    load_model(model, path=config['model_save_path'], model_name='bert')

    test_dataloader = DataLoader(dataset,
                                 batch_size=args.predict_batch_size,
                                 shuffle=False,
                                 num_workers=2)
    n_sample = len(test_dataloader)

    model.eval()
    criterion = torch.nn.BCELoss().to(device)

    batch_scores = []
    all_features = []
    with torch.no_grad():
        for sample in tqdm.tqdm(test_dataloader):
            token, mask, type_ids, _y = (t.to(device) for t in sample)
            f_vector, discriminator_output, _classification_output = model(
                token, mask, type_ids, return_feature=True)
            # reshape(-1) instead of a bare squeeze(): a final batch holding a
            # single sample would otherwise collapse to a 0-dim tensor, which
            # torch.cat refuses to concatenate.
            batch_scores.append(discriminator_output.reshape(-1))
            if args.do_vis:
                all_features.append(f_vector)

    all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos
    all_detection_preds = torch.cat(batch_scores, 0).cpu()  # one score per sample
    all_detection_binary_preds = convert_to_int_by_threshold(all_detection_preds)

    result = dict()
    result['detection_loss'] = criterion(all_detection_preds, all_binary_y.float())

    logger.info(
        metrics.classification_report(all_binary_y,
                                      all_detection_binary_preds,
                                      target_names=['oos', 'in']))

    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
        all_detection_binary_preds, all_binary_y)
    detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

    y_score = all_detection_preds.tolist()
    eer = metrics.cal_eer(all_binary_y, y_score)

    if args.do_vis:
        result['all_features'] = torch.cat(all_features, 0).cpu().numpy()

    ind_class_acc = metrics.ind_class_accuracy(all_detection_binary_preds, all_y)

    # No classification loss is accumulated in this function, so this entry is
    # always zero; the key is kept for callers that read it.
    total_loss = 0

    result['ind_class_acc'] = ind_class_acc
    result['loss'] = total_loss / n_sample
    result['eer'] = eer
    result['all_detection_binary_preds'] = all_detection_binary_preds
    result['detection_acc'] = detection_acc
    result['all_binary_y'] = all_binary_y
    result['all_y'] = all_y
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    result['score'] = y_score
    result['y_score'] = y_score
    result['all_pred'] = all_detection_binary_preds
    result['auc'] = roc_auc_score(all_binary_y, y_score)

    freeze_data['test_all_y'] = all_y.tolist()
    freeze_data['test_all_pred'] = all_detection_binary_preds.tolist()
    freeze_data['test_score'] = y_score
    return result
def eval(dataset):
    """Validate the encoder ``E`` and discriminator ``D`` on `dataset`.

    When the module-level ``n_class`` is greater than 2 the discriminator's
    classification head is evaluated as well; otherwise only its detection
    output is used. Also writes labels, predictions and scores into the
    module-level ``freeze_data``.

    Args:
        dataset: Object accepted by ``DataLoader`` whose ``.dataset`` attribute
            is a 2-D array with the label in the last column (label ``0`` is
            treated as out-of-scope).

    Returns:
        dict with ``detection_loss``, ``eer``, ``all_detection_binary_preds``,
        ``detection_acc``, ``all_binary_y``, ``oos_ind_precision``,
        ``oos_ind_recall``, ``oos_ind_f_score``, ``y_score``, ``auc`` and, when
        ``n_class > 2``, ``class_loss`` and ``class_acc``.
    """
    dev_dataloader = DataLoader(dataset,
                                batch_size=args.predict_batch_size,
                                shuffle=False,
                                num_workers=2)

    detection_criterion = torch.nn.BCELoss().to(device)
    # Index 0 (out-of-scope) is ignored by the classification loss.
    classification_criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)

    G.eval()
    D.eval()
    E.eval()

    with_classifier = n_class > 2
    batch_detections = []
    batch_class_logits = []
    with torch.no_grad():
        for sample in tqdm.tqdm(dev_dataloader):
            token, mask, type_ids, _y = (t.to(device) for t in sample)
            # The encoder's pooled output is the sentence feature given to D.
            _, real_feature = E(token, mask, type_ids)

            if with_classifier:
                # Discriminator plus classifier.
                _, discriminator_output, classification_output = D(
                    real_feature, return_feature=True)
                batch_class_logits.append(classification_output)
            else:
                # Discriminator only.
                _, discriminator_output = D.detect_only(
                    real_feature, return_feature=True)
            batch_detections.append(discriminator_output)

    all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos
    all_detection_preds = torch.cat(batch_detections, 0).cpu()
    detection_scores = all_detection_preds.squeeze()
    all_detection_binary_preds = convert_to_int_by_threshold(detection_scores)

    result = dict()
    result['detection_loss'] = detection_criterion(detection_scores,
                                                   all_binary_y.float())

    if with_classifier:
        class_logits = torch.cat(batch_class_logits, 0).detach().cpu()
        class_loss = classification_criterion(class_logits, all_y)
        all_class_preds = torch.argmax(class_logits, 1)
        # Accuracy over the in-domain classes only.
        class_acc = metrics.ind_class_accuracy(all_class_preds, all_y, oos_index=0)
        logger.info(
            metrics.classification_report(all_y,
                                          all_class_preds,
                                          target_names=processor.id_to_label))

    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
        all_detection_binary_preds, all_binary_y)
    detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

    y_score = detection_scores.tolist()
    eer = metrics.cal_eer(all_binary_y, y_score)

    result['eer'] = eer
    result['all_detection_binary_preds'] = all_detection_binary_preds
    result['detection_acc'] = detection_acc
    result['all_binary_y'] = all_binary_y
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    result['y_score'] = y_score
    result['auc'] = roc_auc_score(all_binary_y, y_score)
    if with_classifier:
        result['class_loss'] = class_loss
        result['class_acc'] = class_acc

    freeze_data['valid_all_y'] = all_y
    # NOTE(review): the key is spelled 'vaild_all_pred'; left unchanged because
    # readers of freeze_data may depend on it -- confirm before renaming.
    freeze_data['vaild_all_pred'] = all_detection_binary_preds
    freeze_data['valid_score'] = y_score
    return result
def evaluate_f1_no_mask(model, dl_test, save_dir, criterion_clsf = nn.CrossEntropyLoss().to(device), criterion_tgt = nn.CrossEntropyLoss(ignore_index=PAD).to(device), verbose = False):
    """Evaluate joint sentence classification and token tagging on `dl_test`.

    Args:
        model: Called as ``model(enc, attn_mask)``; returns
            ``(logits_tgt, logits_clsf)``.
        dl_test: Sized iterable of ``(enc, tgt, cls)`` batches.
        save_dir: Prefix to which ``'idx2lbl.json'`` is appended verbatim (so it
            presumably needs a trailing separator -- confirm with callers).
        criterion_clsf: Loss for the sentence classification head.
        criterion_tgt: Loss for the token tagging head.
        verbose: When true, print the token-level classification report.

    Returns:
        Tuple ``(mean loss per batch, classification micro-F1 * 100,
        tagging micro-F1 * 100, merged tagging F1)``.
    """
    idx2lbl = load_obj(save_dir+'idx2lbl.json')

    loss_test = 0
    pred_tags = []
    true_tags = []
    pred_clss = []
    true_clss = []

    model.eval()
    with torch.no_grad():
        for enc, tgt, cls in dl_test:
            enc = enc.to(device)
            tgt = tgt.to(device)
            cls = cls.to(device)
            # Tensor.to() returns the moved tensor; its result has to be kept.
            enc_self_attn_mask = get_attn_pad_mask(enc, enc).to(device)

            logits_tgt, logits_clsf = model(enc, enc_self_attn_mask)
            # The tagging criterion expects the class dimension second.
            loss_tgt = criterion_tgt(logits_tgt.transpose(1, 2), tgt).float().mean()
            loss_clsf = criterion_clsf(logits_clsf, cls)
            loss_test += loss_clsf + loss_tgt

            # Number of positions per row whose input id equals 0.
            # assumes id 0 is padding and sits at the end of each row -- TODO confirm
            pad_counts = enc.eq(0).sum(dim=1)
            _, tgt_idx = torch.max(logits_tgt, dim=-1)
            _, cls_idx = torch.max(logits_clsf, dim=-1)

            for pre, true, pad_num in zip(tgt_idx, tgt, pad_counts):
                # Slice by explicit length: ``x[0:-0]`` is empty, so a negative
                # end index would drop every row that has no padding at all.
                seq_len = pre.size(0) - int(pad_num)
                pred_tags += pre[:seq_len].tolist()
                true_tags += true[:seq_len].tolist()

            pred_clss += cls_idx.tolist()
            true_clss += cls.tolist()

    assert len(pred_tags) == len(true_tags)
    assert len(pred_clss) == len(true_clss)

    f1_tgt = f1_score(true_tags, pred_tags, average='micro')
    f1_cls = f1_score(true_clss, pred_clss, average='micro')

    # The label map is keyed by the stringified index.
    true_lbls = [idx2lbl[str(t)] for t in true_tags]
    pred_lbls = [idx2lbl[str(p)] for p in pred_tags]
    f1_tgt_merged = f1_score_merged(true_lbls, pred_lbls)

    if verbose:
        report = classification_report(true_lbls, pred_lbls)
        print("============no_mask_slot================")
        print(report, flush=True)

    return loss_test/len(dl_test), f1_cls*100, f1_tgt*100, f1_tgt_merged
def test(dataset):
    """Test the out-of-scope detector (encoder ``E`` + head ``pos``) on `dataset`.

    No weights are loaded here (the loading calls were commented out in the
    original); ``E`` and ``pos`` are used as they currently are.

    Args:
        dataset: Object accepted by ``DataLoader`` whose ``.dataset`` attribute
            is a 2-D array; labels are read from column ``-4`` and label ``0``
            is treated as out-of-scope.

    Returns:
        dict with ``detection_loss``, ``eer``, ``all_detection_binary_preds``,
        ``detection_acc``, ``all_binary_y``, ``oos_ind_precision``,
        ``oos_ind_recall``, ``oos_ind_f_score``, ``y_score`` and ``auc``.
    """
    test_dataloader = DataLoader(dataset,
                                 batch_size=args.predict_batch_size,
                                 shuffle=False,
                                 num_workers=2)
    criterion = torch.nn.BCELoss().to(device)

    pos.eval()
    E.eval()

    batch_scores = []
    with torch.no_grad():
        for sample in tqdm(test_dataloader):
            token, mask, type_ids, pos1, pos2, _pos_mask, _y = (
                t.to(device) for t in sample)
            # The encoder's pooled output is the sentence feature fed to the head.
            _, pooled_output = E(token, mask, type_ids)
            batch_scores.append(pos(pos1, pos2, pooled_output))

    all_y = LongTensor(dataset.dataset[:, -4].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos

    # Flatten once so the loss, the thresholding and the score list all see the
    # same 1-D tensor. BCELoss needs input and target of equal shape, and the
    # target here is 1-D.
    all_detection_preds = torch.cat(batch_scores, 0).cpu().reshape(-1)
    all_detection_binary_preds = convert_to_int_by_threshold(all_detection_preds)

    result = dict()
    result['detection_loss'] = criterion(all_detection_preds, all_binary_y.float())

    logger.info(
        metrics.classification_report(all_binary_y,
                                      all_detection_binary_preds,
                                      target_names=['oos', 'in']))

    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
        all_detection_binary_preds, all_binary_y)
    detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

    y_score = all_detection_preds.tolist()
    eer = metrics.cal_eer(all_binary_y, y_score)

    result['eer'] = eer
    result['all_detection_binary_preds'] = all_detection_binary_preds
    result['detection_acc'] = detection_acc
    result['all_binary_y'] = all_binary_y
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    result['y_score'] = y_score
    result['auc'] = roc_auc_score(all_binary_y, y_score)
    return result
def evaluate(model, iterator, f, ner_label, verbose = False):
    """Run `model` over `iterator`, write per-token predictions to a file and score them.

    The output file is named ``f + ".P<precision>_R<recall>_F<f1>"`` and holds
    one ``word gold predicted`` line per token (first and last token of every
    sentence excluded), a blank line after each sentence, and the three scores
    at the end.

    Args:
        model: Called as ``model(input_ids, input_tags, entity_label)``; the
            third returned value holds the predicted tag ids.
        iterator: Iterable of batches ``(words, input_ids, is_heads, tags,
            input_tags, entity_label, seqlens)``.
        f: Path prefix of the result file.
        ner_label: Provides ``idx2tag`` for mapping ids to tag strings.
        verbose: When true, print the classification report.

    Returns:
        Tuple ``(precision, recall, f1)`` as returned by ``f1_score``.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, input_ids, is_heads, tags, input_tags, entity_label, seqlens = batch
            _, _, y_hat = model(input_ids, input_tags, entity_label)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    y_true = []
    y_pred = []
    # Token lines are collected in memory instead of a fixed "temp" file in the
    # working directory, which could clash with an existing file or a
    # concurrent run and left a handle open when read back.
    token_lines = []
    for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
        # Keep only the prediction made at the head piece of each word.
        head_ids = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
        preds = [ner_label.idx2tag[hat] for hat in head_ids]
        word_list = words.split()
        tag_list = tags.split()

        # First and last positions are dropped (presumably sentence-boundary
        # markers -- confirm against the dataset code).
        inner_preds = preds[1:-1]
        inner_tags = tag_list[1:-1]
        if len(inner_preds) > 0:
            y_pred.append(inner_preds)
        if len(inner_tags) > 0:
            y_true.append(inner_tags)

        assert len(preds) == len(word_list) == len(tag_list)

        for w, t, p in zip(word_list[1:-1], inner_tags, inner_preds):
            token_lines.append(f"{w} {t} {p}\n")
        token_lines.append("\n")

    assert len(y_pred) == len(y_true)

    p, r, f1 = f1_score(y_true, y_pred)

    final = f + ".P%.4f_R%.4f_F%.4f" %(p, r, f1)
    with open(final, 'w') as fout:
        result = "".join(token_lines)
        fout.write(f"{result}\n")
        fout.write(f"precision={p}\n")
        fout.write(f"recall={r}\n")
        fout.write(f"f1={f1}\n")

    if verbose:
        report = classification_report(y_true, y_pred)
        print(report)

    print("precision=%.2f"%p)
    print("recall=%.2f"%r)
    print("f1=%.2f"%f1)
    return p, r, f1
def evaluate(args, model, eval_dataloader, params):
    """Evaluate the pointer-style NER model and log loss, F1, accuracy and a report.

    Args:
        args: Provides ``multi_gpu``.
        model: Returns the loss when called with start/end positions, and
            ``(start_logits, end_logits)`` when called without them.
        eval_dataloader: Iterable of batches ``(input_ids, input_mask,
            segment_ids, start_pos, end_pos, ne_cate)``.
        params: Provides ``device``, ``n_gpu`` and ``label_list``.

    Returns:
        dict with ``loss`` (running average), ``f1`` and ``acc``.
    """
    model.eval()
    # Running average of the batch losses.
    loss_avg = utils.RunningAverage()

    pre_result = []
    gold_result = []

    # Category index -> label; it does not change between batches, so it is
    # built once here.
    cate_idx2label = dict(enumerate(params.label_list))

    for batch in tqdm(eval_dataloader, unit='Batch'):
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, segment_ids, start_pos, end_pos, ne_cate = batch

        with torch.no_grad():
            loss = model(input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         start_positions=start_pos,
                         end_positions=end_pos)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu.
            loss_avg.update(loss.item())

            # Inference pass.
            start_logits, end_logits = model(input_ids=input_ids,
                                             token_type_ids=segment_ids,
                                             attention_mask=input_mask)

        # Gold labels as plain Python lists.
        start_pos = start_pos.to("cpu").numpy().tolist()
        end_pos = end_pos.to("cpu").numpy().tolist()
        input_mask = input_mask.to('cpu').numpy().tolist()
        ne_cate = ne_cate.to("cpu").numpy().tolist()

        # Predicted labels as plain Python lists.
        start_label = start_logits.detach().cpu().numpy().tolist()
        end_label = end_logits.detach().cpu().numpy().tolist()

        for start_p, end_p, start_g, end_g, input_mask_s, ne_cate_s in zip(
                start_label, end_label, start_pos, end_pos, input_mask, ne_cate):
            ne_cate_str = cate_idx2label[ne_cate_s]
            # Length of the query text for this category.
            q_len = len(IO2QUERY[ne_cate_str])
            # The context is taken to start two positions after the query
            # (presumably skipping special tokens -- confirm against the
            # feature-building code).
            offset = q_len + 2
            # Number of unmasked context positions, excluding the final one.
            act_len = sum(input_mask_s[offset:-1])
            window = slice(offset, offset + act_len)

            pre_bio_labels = pointer2bio(start_p[window],
                                         end_p[window],
                                         ne_cate=ne_cate_str)
            gold_bio_labels = pointer2bio(start_g[window],
                                          end_g[window],
                                          ne_cate=ne_cate_str)
            pre_result.append(pre_bio_labels)
            gold_result.append(gold_bio_labels)

    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    metrics = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format('Val') + metrics_str)

    report = classification_report(y_true=gold_result, y_pred=pre_result)
    logging.info(report)
    return metrics
score, preds = model(sents, lens) for i, l in enumerate(lens): true_labels.append( seqid2text(labs[i, :l], ix_to_lab)) pred_labels.append( seqid2text(preds[i, :l], ix_to_lab)) f1 = f1_score(true_labels, pred_labels) if (f1 > best_f1): torch.save(model.state_dict(), "models/model-27-02-20") best_f1 = f1 print("Accuracy: {:.4f}".format( accuracy_score(true_labels, pred_labels))) print("F1 score: {:.4f}".format(f1)) print(classification_report(true_labels, pred_labels)) model.train(True) if args.do_test: with torch.no_grad(): print("Evaluation on test set") model.load_state_dict( torch.load("models/model-27-02-20", map_location=device)) model.eval() true_labels = [] pred_labels = [] word_sents = [] for batch in test_data_loader: sents, labs, lens = batch sents = pad_sequence(sents, batch_first=True).to(device) labs = pad_sequence(labs, batch_first=True).to(device) lens = torch.tensor(lens).to(device)
def test(dataset):
    """Test the module-level ``model`` as a binary out-of-scope detector on `dataset`.

    No weights are loaded here (the loading calls were commented out in the
    original); ``model`` is used as it currently is.

    Args:
        dataset: Object accepted by ``DataLoader`` whose ``.dataset`` attribute
            is a 2-D array with the label in the last column (label ``0`` is
            treated as out-of-scope).

    Returns:
        dict with ``detection_loss`` (sum of the per-batch losses),
        ``test_logit``, ``eer``, ``all_detection_preds``, ``detection_acc``,
        ``all_binary_y``, ``oos_ind_precision``, ``oos_ind_recall``,
        ``oos_ind_f_score``, ``y_score`` and ``auc``.
    """
    test_dataloader = DataLoader(dataset,
                                 batch_size=args.predict_batch_size,
                                 shuffle=False,
                                 num_workers=2)
    criterion = torch.nn.CrossEntropyLoss().to(device)

    model.eval()

    batch_preds = []
    batch_logits = []
    total_loss = 0
    with torch.no_grad():
        for sample in tqdm(test_dataloader):
            token, mask, type_ids, y = (t.to(device) for t in sample)
            logit = model(token, mask, type_ids)
            batch_logits.append(logit)
            batch_preds.append(torch.argmax(logit, 1))
            total_loss += criterion(logit, y.long())

    all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos
    all_detection_preds = torch.cat(batch_preds, 0).cpu()
    all_detection_logit = torch.cat(batch_logits, 0).cpu()

    result = dict()
    result['detection_loss'] = total_loss

    logger.info(
        metrics.classification_report(all_binary_y,
                                      all_detection_preds,
                                      target_names=['oos', 'in']))

    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
        all_detection_preds, all_binary_y)
    detection_acc = metrics.accuracy(all_detection_preds, all_binary_y)

    # Probability of class 1 is used as the ranking score.
    y_score = all_detection_logit.softmax(1)[:, 1].tolist()
    eer = metrics.cal_eer(all_binary_y, y_score)

    result['test_logit'] = all_detection_logit.tolist()
    result['eer'] = eer
    result['all_detection_preds'] = all_detection_preds
    result['detection_acc'] = detection_acc
    result['all_binary_y'] = all_binary_y
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    result['y_score'] = y_score
    result['auc'] = roc_auc_score(all_binary_y, y_score)
    return result
def predict(features, stage, dataframe_path, ch_path, hmc_path):
    """Predict labels with a previously fitted model and print evaluation reports.

    Nothing is returned; the hierarchical classification reports are produced
    by the ``metrics`` helpers.

    Parameters
    ----------
    features : str
        The specified type of features - ["bert", "tfidf"]
    stage : str
        Predict stage; which data to use - ["dev", "test"]
    dataframe_path : str
        The path to the Pandas dataframe which contains the preprocessed data
    ch_path : str
        Location of the class hierarchy file
    hmc_path : str
        Location of the pretrained HierarchicalClassifier

    Raises
    ------
    ValueError
        If `features` is not one of the supported values, or `stage` is not
        "dev"/"test" when using TF-IDF features.
    """
    if features not in ("bert", "tfidf"):
        raise ValueError(
            f"unsupported features {features!r}; expected 'bert' or 'tfidf'")

    df = pd.read_pickle(dataframe_path)
    # Columns from the sixth onward are used as the topic names.
    topics = list(df)[5:]
    df, y = prep_df_for_train(df, None)
    print(y.shape)
    print(df.info())
    df.reset_index(inplace=True, drop=True)

    if features == "bert":
        path = f"bert-multilingual/bert_{stage}.npy"
        # NOTE(review): allow_pickle=True unpickles the file's contents; only
        # load feature files from a trusted source.
        embeddings_list = np.load(path, allow_pickle=True)
        print(
            f"Total: {len(embeddings_list)}, tokens: {len(embeddings_list[0]['embeddings'][0])}, embeds: ",
            f"{len(embeddings_list[0]['embeddings'][1])}x{len(embeddings_list[0]['embeddings'][1][0])}",
        )

        embeddings = []
        tokens_list = []
        y = []
        for entry in embeddings_list:
            y.append(label_ids_to_labels(entry["label_ids"], topics))
            tokens, vectors = entry["embeddings"]
            tokens_list.append(tokens)
            embeddings.append(vectors)
            assert len(vectors) == 4
            assert len(vectors[0]) == 768
        assert len(embeddings) == len(y)
        print(f"Number of examples: {len(embeddings)}")
        # np.load returns a plain array for a .npy file, so there is nothing
        # to close here.

        print(len(embeddings), len(embeddings[0]), len(y))
        # Four 768-dimensional vectors per example, flattened to 3072 features.
        X = np.asarray(embeddings).reshape(len(embeddings), 3072)
        print(X.shape)
    else:  # features == "tfidf"
        y = df["topics"]
        # 60/20/20 train/dev/test split with fixed seeds.
        xtrain, xtest, ytrain, ytest = train_test_split(
            df["clean_text_tokenized"], y, test_size=0.2, random_state=42)
        xtrain, xdev, ytrain, ydev = train_test_split(xtrain,
                                                      ytrain,
                                                      test_size=0.25,
                                                      random_state=42)
        print(f"train shape, {xtrain.shape}")
        print(f"dev shape, {xdev.shape}")
        print(f"test shape, {xtest.shape}")

        # The vectorizer is fitted on the training split only; documents are
        # token lists, joined back into strings by the preprocessor.
        tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                           max_features=10000,
                                           preprocessor=" ".join)
        xtrain = tfidf_vectorizer.fit_transform(xtrain)
        xdev = tfidf_vectorizer.transform(xdev)
        xtest = tfidf_vectorizer.transform(xtest)
        print(xdev.shape, xtest.shape)

        if stage == "dev":
            X = pd.DataFrame(xdev.todense())
            y = ydev
        elif stage == "test":
            X = pd.DataFrame(xtest.todense())
            y = ytest
        else:
            raise ValueError(
                f"unsupported stage {stage!r}; expected 'dev' or 'test'")
        X.reset_index(inplace=True, drop=True)
        y.reset_index(inplace=True, drop=True)

    # NOTE(review): dill/pickle execute arbitrary code while loading; only use
    # model and hierarchy files from a trusted source.
    with open(hmc_path, "rb") as f:
        clf = dill.load(f)
    with open(ch_path, "rb") as f:
        ch = pickle.load(f)

    ypred = clf.predict(X)
    ypred_df = pd.DataFrame(ypred)

    metrics.classification_report(ch, y, ypred_df)
    # Second report with EXTEND_PRED switched off; the flag is set back to True
    # even if the report raises.
    metrics.EXTEND_PRED = False
    try:
        print("=" * 100)
        metrics.classification_report(ch, y, ypred_df)
    finally:
        metrics.EXTEND_PRED = True

    # Top-k reports use the same classifier and the same split selected above.
    ypred_topk = clf.predict_topk(X, 5)
    for k in (1, 3, 5):
        metrics.classification_report_topk(ch, y, ypred_topk, k)
best_weighted_f1 = 0. for epoch in range(train_config.n_epochs): start = time.time() train_loss.reset_states() # train_accuracy.reset_states() train_confusion_matrix.reset_states() for (batch, (speaker, utterance, emotion)) in enumerate(train_dataset): train_step(speaker, utterance, emotion) if batch % 20 == 0: report = metrics.classification_report(train_confusion_matrix) print( 'Epoch {} Batch {} Loss {:.4f} Micro-f1 {:.4f} Macro-f1 {:.4f}' ' Weighted-f1 {:.4f} Accuracy {:.4f}'.format( epoch + 1, batch, train_loss.result(), report[1].numpy(), report[2].numpy(), report[3].numpy(), report[4].numpy())) with np.printoptions(precision=4, suppress=True): print('Metrics of classes:\n', report[0].numpy()) if (epoch + 1) % 5 == 0: ckpt_save_path = ckpt_manager.save() print('Saving checkpoint for epoch {} at {}'.format( epoch + 1, ckpt_save_path)) report = metrics.classification_report(train_confusion_matrix) print('Epoch {} Loss {:.4f} Micro-f1 {:.4f} Macro-f1 {:.4f}'
def test(dataset):
    """Evaluate the discriminator (and, if ``n_class > 2``, the classifier).

    Loads the saved GAN weights (and the fine-tuned BERT encoder when
    ``args.fine_tune`` is set), encodes every batch of ``dataset`` with ``E``,
    scores each discriminator output against the two anchors with
    ``triplet_loss`` and collects detection / classification metrics.

    Args:
        dataset: dataset object accepted by ``DataLoader``; its ``.dataset``
            array is read for the gold labels (last column).

    Returns:
        dict with the keys ``detection_loss``, ``eer``,
        ``all_detection_binary_preds``, ``detection_acc``, ``all_binary_y``,
        ``all_y``, ``oos_ind_precision``, ``oos_ind_recall``,
        ``oos_ind_f_score``, ``score``, ``y_score``, ``auc``, ``fpr95`` and,
        depending on ``n_class`` / ``args.do_vis``, ``class_loss``,
        ``class_acc`` and ``all_features``.

    Raises:
        ValueError: if ``args.dataset`` is neither ``'smp'`` nor ``'oos-eval'``.

    Side effects:
        Writes ``test_all_y``, ``test_all_pred`` and ``test_score`` into the
        module-level ``freeze_data`` dict.
    """
    # load BERT and GAN
    load_gan_model(D, G, config['gan_save_path'])
    if args.fine_tune:
        load_model(E, path=config['bert_save_path'], model_name='bert')

    test_dataloader = DataLoader(dataset,
                                 batch_size=args.predict_batch_size,
                                 shuffle=False,
                                 num_workers=2)
    result = dict()

    # Loss functions
    detection_criterion = torch.nn.BCELoss().to(device)
    classified_loss = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)

    G.eval()
    D.eval()
    E.eval()

    all_detection_preds = []
    all_class_preds = []
    all_features = []

    # The anchors do not depend on the batch, so build them once.
    anchor_ood = (torch.zeros(args.num_outcomes, dtype=torch.float).to(device)
                  + torch.tensor(anchor0, dtype=torch.float).to(device))
    anchor_ind = (torch.zeros(args.num_outcomes, dtype=torch.float).to(device)
                  + torch.tensor(anchor1, dtype=torch.float).to(device))

    for sample in tqdm.tqdm(test_dataloader):
        sample = (i.to(device) for i in sample)
        if args.dataset == 'smp':
            token, mask, type_ids, _knowledge_tag, _y = sample
        elif args.dataset == 'oos-eval':
            token, mask, type_ids, _y = sample
        else:
            raise ValueError(
                'Unsupported dataset: {!r}'.format(args.dataset))

        # -------------------------evaluate D------------------------- #
        # Inference only: no gradients are needed for any of the models.
        with torch.no_grad():
            # BERT encodes the sentence into a feature vector
            _sequence_output, pooled_output = E(token, mask, type_ids)
            real_feature = pooled_output

            if n_class > 2:
                # More than two classes: the classifier head is evaluated in
                # addition to the discriminator.
                f_vector, discriminator_output, classification_output = D(
                    real_feature, return_feature=True)
                all_class_preds.append(classification_output)
            else:
                # Discriminator only
                f_vector, discriminator_output = D.detect_only(
                    real_feature, return_feature=True)
                # NOTE(review): the collapsed source does not show whether
                # this normalisation is limited to this branch or applies to
                # both; confirm before relying on the n_class > 2 path.
                discriminator_output = discriminator_output.log_softmax(1).exp()

            if args.do_vis:
                all_features.append(f_vector)

            # Score each sample by its relative distance to the two anchors.
            for output in discriminator_output:
                d_ood = triplet_loss(anchor_ood, output,
                                     skewness=args.positive_skew)
                d_ind = triplet_loss(anchor_ind, output,
                                     skewness=args.negative_skew)
                all_detection_preds.append(d_ood / (d_ind + d_ood))

    all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos

    # Use the anchor-distance score for OOD detection
    all_detection_preds = FloatTensor(all_detection_preds).cpu()
    all_detection_binary_preds = convert_to_int_by_threshold(
        all_detection_preds.squeeze())

    # Compute the losses
    result['detection_loss'] = detection_criterion(all_detection_preds,
                                                   all_binary_y.float())

    if n_class > 2:
        class_one_hot_preds = torch.cat(all_class_preds, 0).detach().cpu()
        class_loss = classified_loss(class_one_hot_preds, all_y)
        all_class_preds = torch.argmax(class_one_hot_preds, 1)
        # accuracy over in-domain classes only
        class_acc = metrics.ind_class_accuracy(all_class_preds, all_y,
                                               oos_index=0)
        logger.info(
            metrics.classification_report(all_y, all_class_preds,
                                          target_names=processor.id_to_label))

    # report
    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = \
        metrics.binary_recall_fscore(all_detection_binary_preds, all_binary_y)
    detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

    y_score = all_detection_preds.squeeze().tolist()
    eer = metrics.cal_eer(all_binary_y, y_score)

    result['eer'] = eer
    result['all_detection_binary_preds'] = all_detection_binary_preds
    result['detection_acc'] = detection_acc
    result['all_binary_y'] = all_binary_y
    result['all_y'] = all_y
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    # 'score' and 'y_score' hold the same list; both keys are kept for callers
    result['score'] = y_score
    result['y_score'] = y_score
    result['auc'] = roc_auc_score(all_binary_y, y_score)
    result['fpr95'] = ErrorRateAt95Recall(all_binary_y, y_score)
    if n_class > 2:
        result['class_loss'] = class_loss
        result['class_acc'] = class_acc
    if args.do_vis:
        all_features = torch.cat(all_features, 0).cpu().numpy()
        result['all_features'] = all_features

    freeze_data['test_all_y'] = all_y.tolist()
    freeze_data['test_all_pred'] = all_detection_binary_preds.tolist()
    freeze_data['test_score'] = y_score

    return result
def evaluate(dataloader,
             model,
             word_vocab,
             label_vocab,
             output_path,
             prefix,
             use_gpu=False):
    """Run the tagger over ``dataloader`` and write predictions and scores.

    For every token up to the first padding id (0) in each sentence, one
    "word gold predicted" line is recorded; sentences are separated by an
    empty line. Entity-level F1 and a classification report are computed
    over all tokens, the predictions are written to ``<prefix>_pred.txt``
    and the scores appended to ``<prefix>_score.txt`` inside ``output_path``.

    Args:
        dataloader: iterable of batches with 'text', 'label' and 'char' keys.
        model: tagger called as
            ``model(text, seq_length, chars, labels, mask)``.
        word_vocab: object providing ``id_to_word``.
        label_vocab: object providing ``id_to_label``.
        output_path: directory for the two output files.
        prefix: file-name prefix, also used in the score header.
        use_gpu: move the batch tensors to CUDA when true.

    Returns:
        The entity-level F1 score.
    """
    model.eval()
    output_lines = []
    gold_tags = []
    predicted_tags = []

    for batch in dataloader:
        batch_text, seq_length, word_perm_idx = batch['text']
        batch_label, _, _ = batch['label']
        # reorder the character inputs with the same permutation as the words
        chars = batch['char'][word_perm_idx]
        chars = chars.contiguous().view(-1, chars.size(-1))
        if use_gpu:
            batch_text = batch_text.cuda()
            batch_label = batch_label.cuda()
            chars = chars.cuda()
        mask = get_mask(batch_text)

        with torch.no_grad():
            tag_seq = model(batch_text, seq_length, chars, batch_label, mask)

        for words, golds, preds in zip(batch_text, batch_label, tag_seq):
            for word_t, gold_t, pred_t in zip(words, golds, preds):
                word_id = word_t.item()
                if word_id == 0:
                    # id 0 marks padding: the sentence ends here
                    break
                word = word_vocab.id_to_word(word_id)
                gold = label_vocab.id_to_label(gold_t.item())
                pred = label_vocab.id_to_label(pred_t.item())
                gold_tags.append(gold)
                predicted_tags.append(pred)
                output_lines.append(' '.join([word, gold, pred]))
            # blank line between sentences
            output_lines.append('')

    true_entities = get_entities_bio(gold_tags)
    pred_entities = get_entities_bio(predicted_tags)
    print(len(gold_tags), len(predicted_tags), len(output_lines))

    results = {
        "f1": f1_score(true_entities, pred_entities),
        'report': classification_report(true_entities, pred_entities)
    }

    pred_file = os.path.join(output_path, '%s_pred.txt' % prefix)
    with open(pred_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))

    score_file = os.path.join(output_path, '%s_score.txt' % prefix)
    with open(score_file, "a") as writer:
        writer.write("***** Eval results {} *****\n".format(prefix))
        for key in sorted(results.keys()):
            if key != 'report_dict':
                writer.write("{} = {}\n".format(key, str(results[key])))

    return results["f1"]
import joblib
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load dataset.csv
dataframe = pd.read_csv("csv/dataset.csv")
print(dataframe.head())

# Split into training and test data
data_x = dataframe.drop(["Label"], axis=1)
data_y = dataframe["Label"]
trained_x, test_x, trained_y, test_y = train_test_split(data_x,
                                                        data_y,
                                                        test_size=0.2,
                                                        random_state=4)

# Build the model
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(trained_x, trained_y)
joblib.dump(model, "rf_malaria_100_5")

# Predict on the held-out test data and print the classification report.
# classification_report expects (y_true, y_pred) in that order.
predictions = model.predict(test_x)
print(metrics.classification_report(test_y, predictions))
def evaluate(args, model, eval_dataloader, params):
    """Evaluate the model on ``eval_dataloader`` and log loss, F1 and accuracy.

    For each batch the model is called once with labels (to get the loss) and
    once without (to get class, start and end predictions). The pointer
    predictions are converted to BIO tags, merged into a single tag sequence
    and compared with the gold tags.

    Only index 0 of ``tags`` and ``input_mask`` is read per batch.

    Args:
        args: run options; ``args.multi_gpu`` is read.
        model: the model to evaluate.
        eval_dataloader: iterable of 9-element batches.
        params: provides ``device``, ``n_gpu`` and ``label_list``.

    Returns:
        dict with the keys 'loss', 'f1' and 'acc'.
    """
    model.eval()
    # running average of the loss
    loss_avg = utils.RunningAverage()

    pre_result, gold_result = [], []

    for batch in tqdm(eval_dataloader, unit='Batch', ascii=True):
        # move tensors to the target device, leave everything else untouched
        batch = tuple(
            item.to(params.device) if isinstance(item, torch.Tensor) else item
            for item in batch)
        (input_ids, input_mask, tags, cls_labels, random_cls_ids,
         random_start_posis, random_end_posis, _, _) = batch

        with torch.no_grad():
            # loss pass
            loss = model(input_ids,
                         attention_mask=input_mask,
                         cls_labels=cls_labels,
                         cls_ids=random_cls_ids,
                         start_positions=random_start_posis,
                         end_positions=random_end_posis)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu.
            loss_avg.update(loss.item())

            # inference pass
            cls_pre, start_pre, end_pre = model(input_ids=input_ids,
                                                attention_mask=input_mask)

        # gold labels
        tags = tags[0]
        # predicted pointers as plain lists
        start_pre = start_pre.detach().cpu().numpy().tolist()
        end_pre = end_pre.detach().cpu().numpy().tolist()

        # category index -> label name ("1", "2", ...)
        cate_idx2label = {
            idx: str(idx + 1) for idx, _ in enumerate(params.label_list)
        }

        # effective length taken from the attention mask
        act_len = sum(input_mask[0])

        # merge the per-category BIO sequences into one sequence (for metrics);
        # a position keeps the first non-'O' tag it receives
        merged_labels = ['O'] * act_len
        for start_p, end_p, cls_p in zip(start_pre, end_pre, cls_pre):
            candidate_labels = pointer2bio(start_p[:act_len],
                                           end_p[:act_len],
                                           ne_cate=cate_idx2label[cls_p])
            merged_labels = [
                current if current != 'O' else candidate
                for current, candidate in zip(merged_labels, candidate_labels)
            ]

        pre_result.append(merged_labels)
        gold_result.append(tags[:act_len])

    # metrics
    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    scores = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    formatted = ("{}: {:05.2f}".format(name, value)
                 for name, value in scores.items())
    metrics_str = "; ".join(formatted)
    logging.info("- {} metrics: ".format('Val') + metrics_str)

    # f1 classification report
    logging.info(classification_report(y_true=gold_result, y_pred=pre_result))

    return scores
def test(dataset):
    """Evaluate the stand-alone ``detector`` on ``dataset``.

    Loads the saved GAN weights (and the fine-tuned BERT encoder when
    ``args.fine_tune`` is set), encodes every batch with ``E``, feeds the
    pooled feature to ``detector`` and collects detection metrics.

    Two loss modes are supported through ``args.loss``:

    * ``'v1'``: the detector output is used directly as the score, thresholded
      with ``convert_to_int_by_threshold`` and scored with ``BCELoss``.
    * anything else: the detector output is treated as per-class logits;
      the prediction is the argmax, the loss is ``CrossEntropyLoss`` and the
      ranking score used for EER / AUC is the softmax probability of class 1
      ('in'), as in this file's ``eval`` function.

    Args:
        dataset: dataset object accepted by ``DataLoader``; its ``.dataset``
            array is read for the gold labels (last column).

    Returns:
        dict with the keys ``detection_loss``, ``eer``,
        ``all_detection_binary_preds``, ``detection_acc``, ``all_binary_y``,
        ``all_y``, ``oos_ind_precision``, ``oos_ind_recall``,
        ``oos_ind_f_score``, ``score``, ``y_score``, ``auc`` and, for
        ``n_class > 2``, ``class_loss`` / ``class_acc``.

    Side effects:
        Writes ``test_all_y``, ``test_all_pred`` and ``test_score`` into the
        module-level ``freeze_data`` dict.
    """
    # load BERT and GAN
    load_gan_model(D, G, config['gan_save_path'])
    if args.fine_tune:
        load_model(E, path=config['bert_save_path'], model_name='bert')

    test_dataloader = DataLoader(dataset,
                                 batch_size=args.predict_batch_size,
                                 shuffle=False,
                                 num_workers=2)
    result = dict()

    # Loss functions
    detection_loss = torch.nn.BCELoss().to(device)
    detection_loss_v2 = torch.nn.CrossEntropyLoss().to(device)
    classified_loss = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)

    G.eval()
    D.eval()
    E.eval()
    detector.eval()

    use_v1_loss = args.loss == 'v1'

    all_detection_preds = []
    all_class_preds = []
    all_features = []
    all_logit = []

    for sample in tqdm.tqdm(test_dataloader):
        token, mask, type_ids, _y = (i.to(device) for i in sample)

        # -------------------------evaluate D------------------------- #
        # Inference only: no gradients are needed.
        with torch.no_grad():
            # BERT encodes the sentence into a feature vector
            _sequence_output, pooled_output = E(token, mask, type_ids)
            real_feature = pooled_output

            # NOTE(review): for n_class > 2 nothing is collected in this
            # variant (the classifier branch is disabled), so the torch.cat
            # calls below fail on empty lists in that configuration.
            if n_class <= 2:
                detector_out = detector(real_feature)
                if use_v1_loss:
                    all_detection_preds.append(detector_out)
                else:
                    all_logit.append(detector_out)
                    all_detection_preds.append(torch.argmax(detector_out, 1))

    all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
    all_binary_y = (all_y != 0).long()  # label 0 is oos

    all_detection_preds = torch.cat(all_detection_preds, 0).cpu()

    # Binary predictions, loss and ranking score per loss mode
    if use_v1_loss:
        all_detection_binary_preds = convert_to_int_by_threshold(
            all_detection_preds.squeeze())
        loss = detection_loss(all_detection_preds, all_binary_y.float())
        y_score = all_detection_preds.squeeze().tolist()
    else:
        all_detection_binary_preds = all_detection_preds
        # logits are only collected in this mode
        all_logit = torch.cat(all_logit, 0).cpu()
        loss = detection_loss_v2(all_logit, all_y.long())
        # EER / AUC need a continuous score, not the argmax labels
        y_score = all_logit.softmax(1)[:, 1].tolist()
    result['detection_loss'] = loss

    if n_class > 2:
        class_one_hot_preds = torch.cat(all_class_preds, 0).detach().cpu()
        class_loss = classified_loss(class_one_hot_preds, all_y)
        all_class_preds = torch.argmax(class_one_hot_preds, 1)
        # accuracy over in-domain classes only
        class_acc = metrics.ind_class_accuracy(all_class_preds, all_y,
                                               oos_index=0)
        logger.info(
            metrics.classification_report(all_y, all_class_preds,
                                          target_names=processor.id_to_label))

    logger.info(
        metrics.classification_report(all_binary_y,
                                      all_detection_binary_preds,
                                      target_names=['oos', 'in']))

    # report
    oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = \
        metrics.binary_recall_fscore(all_detection_binary_preds, all_binary_y)
    detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

    eer = metrics.cal_eer(all_binary_y, y_score)

    result['eer'] = eer
    result['all_detection_binary_preds'] = all_detection_binary_preds
    result['detection_acc'] = detection_acc
    result['all_binary_y'] = all_binary_y
    result['all_y'] = all_y
    result['oos_ind_precision'] = oos_ind_precision
    result['oos_ind_recall'] = oos_ind_recall
    result['oos_ind_f_score'] = oos_ind_fscore
    # 'score' and 'y_score' hold the same list; both keys are kept for callers
    result['score'] = y_score
    result['y_score'] = y_score
    result['auc'] = roc_auc_score(all_binary_y, y_score)
    if n_class > 2:
        result['class_loss'] = class_loss
        result['class_acc'] = class_acc
    # Feature collection is disabled in this variant, so all_features stays
    # empty; skip the concatenation instead of failing on an empty list.
    if args.do_vis and all_features:
        all_features = torch.cat(all_features, 0).cpu().numpy()
        result['all_features'] = all_features

    freeze_data['test_all_y'] = all_y.tolist()
    freeze_data['test_all_pred'] = all_detection_binary_preds.tolist()
    freeze_data['test_score'] = y_score

    return result
def evaluate(args, model, eval_dataloader, params):
    """Evaluate the pointer model on ``eval_dataloader`` and log the metrics.

    For each batch the model is called once with the gold start/end positions
    (to get the loss) and once without (to get predictions). Gold and
    predicted pointers are both converted to BIO tag sequences, one sequence
    per (sample, category) pair, and scored with F1 and accuracy.

    Args:
        args: run options; ``args.multi_gpu`` is read.
        model: the model to evaluate.
        eval_dataloader: iterable of 6-element batches of tensors.
        params: provides ``device``, ``n_gpu`` and ``label_list``.

    Returns:
        dict with the keys 'loss', 'f1' and 'acc'.
    """

    def _tag_major(cpu_tensor):
        # swap the last two axes and convert to nested lists, giving
        # (batch_size, tag_size, seq_len) per the original comment
        return cpu_tensor.numpy().transpose((0, 2, 1)).tolist()

    model.eval()
    # running average of the loss
    loss_avg = utils.RunningAverage()

    pre_result, gold_result = [], []

    for batch in tqdm(eval_dataloader, unit='Batch', ascii=True):
        batch = tuple(item.to(params.device) for item in batch)
        input_ids, input_mask, start_pos, end_pos, _, _ = batch

        with torch.no_grad():
            # loss pass
            loss = model(input_ids,
                         attention_mask=input_mask,
                         start_positions=start_pos,
                         end_positions=end_pos)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu.
            loss_avg.update(loss.item())

            # inference pass
            start_pre, end_pre = model(input_ids=input_ids,
                                       attention_mask=input_mask)

        # gold pointers
        gold_starts = _tag_major(start_pos.to("cpu"))
        gold_ends = _tag_major(end_pos.to("cpu"))
        mask_rows = input_mask.to('cpu').numpy().tolist()

        # predicted pointers
        pred_starts = _tag_major(start_pre.detach().cpu())
        pred_ends = _tag_major(end_pre.detach().cpu())

        # category index -> label name ("1", "2", ...)
        cate_idx2label = {
            idx: str(idx + 1) for idx, _ in enumerate(params.label_list)
        }

        # one BIO sequence per sample and per category
        samples = zip(pred_starts, pred_ends, gold_starts, gold_ends,
                      mask_rows)
        for start_p_s, end_p_s, start_g_s, end_g_s, mask_row in samples:
            # effective length of this sample
            act_len = sum(mask_row)
            per_category = zip(start_p_s, end_p_s, start_g_s, end_g_s)
            for idx, (start_p, end_p, start_g, end_g) in enumerate(
                    per_category):
                category = cate_idx2label[idx]
                predicted = pointer2bio(start_p[:act_len],
                                        end_p[:act_len],
                                        ne_cate=category)
                gold = pointer2bio(start_g[:act_len],
                                   end_g[:act_len],
                                   ne_cate=category)
                pre_result.append(predicted)
                gold_result.append(gold)

    # metrics
    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    scores = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    formatted = ("{}: {:05.2f}".format(name, value)
                 for name, value in scores.items())
    metrics_str = "; ".join(formatted)
    logging.info("- {} metrics: ".format('Val') + metrics_str)

    # f1 classification report
    logging.info(classification_report(y_true=gold_result, y_pred=pre_result))

    return scores