def evaluate_results(net, test_loader, pad_id, cuda, args, epoch):
    logger.info("Evaluating test samples...")
    acc = 0
    out_labels = []
    true_labels = []
    net.eval()
    with torch.no_grad():
        for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
            x, e1_e2_start, labels, _, _, _ = data
            # Attend only to non-padding tokens.
            attention_mask = (x != pad_id).float()
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            # When args.only_evaluate == 2, score only the first 10 batches.
            if args.only_evaluate == 2 and i >= 10:
                break
            if cuda:
                x = x.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()
            classification_logits = net(x, token_type_ids=token_type_ids,
                                        attention_mask=attention_mask,
                                        e1_e2_start=e1_e2_start)
            accuracy, (o, l) = evaluate_(classification_logits, labels)
            out_labels.extend(o)
            true_labels.extend(l)
            acc += accuracy

    # Mean accuracy over the evaluated batches.
    accuracy = acc / (i + 1)
    results = {
        "accuracy": accuracy,
        "sklearn f1-macro": sklearn_f1_score(true_labels, out_labels,
                                             labels=list(range(args.num_classes)),
                                             average='macro'),
        "sklearn f1-micro": sklearn_f1_score(true_labels, out_labels,
                                             labels=list(range(args.num_classes)),
                                             average='micro'),
    }

    # Task-specific scorers and output files.
    if args.task == 'SemEval':
        logger.info("Generating additional files ...")
        semeval_files(args, true_labels, out_labels, epoch + 1)
    elif args.task == 'KBP37':
        KBP37_scorer1(args, true_labels, out_labels)
        KBP37_files(args, true_labels, out_labels, epoch + 1)
    elif args.task == 'TACRED':
        TACRED_scorer(args, true_labels, out_labels)

    logger.info("***** Eval results *****")
    for key in sorted(results.keys()):
        logger.info(" %s = %s", key, str(results[key]))
    return results
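
# A minimal, self-contained sketch (not part of the function above) showing how
# the macro- and micro-averaged F1 scores collected in `results` differ: macro
# averages per-class F1 equally, while micro aggregates counts over all classes.
# The toy labels below are invented for illustration.
from sklearn.metrics import f1_score as sklearn_f1_score

_true = [0, 0, 1, 1, 2, 2]
_pred = [0, 0, 1, 0, 2, 1]
print(sklearn_f1_score(_true, _pred, labels=[0, 1, 2], average='macro'))  # unweighted mean of per-class F1
print(sklearn_f1_score(_true, _pred, labels=[0, 1, 2], average='micro'))  # global counts over all classes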
def f1_score(predictions, ground_truth):
    '''Compute F1 scores.

    .. math::
        F_{score}^{(n)} = \\frac {2 * Precision * Recall} {Precision + Recall}

    Parameters
    ----------
    predictions, ground_truth : nilmtk.MeterGroup

    Returns
    -------
    f1_scores : pd.Series
        Each index is a meter instance int (or tuple for MeterGroups).
        Each value is the F1 score for that appliance.  If there are
        multiple chunks then the value is the weighted mean of the F1
        score for each chunk.
    '''
    # If we import sklearn at top of file then sphinx breaks.
    from sklearn.metrics import f1_score as sklearn_f1_score

    # sklearn produces lots of DeprecationWarnings with PyTables
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    f1_scores = {}
    both_sets_of_meters = iterate_through_submeters_of_two_metergroups(
        predictions, ground_truth)
    for pred_meter, ground_truth_meter in both_sets_of_meters:
        scores_for_meter = pd.DataFrame(columns=['score', 'num_samples'])
        aligned_meters = align_two_meters(
            pred_meter, ground_truth_meter, 'when_on')
        for aligned_states_chunk in aligned_meters:
            aligned_states_chunk.dropna(inplace=True)
            aligned_states_chunk = aligned_states_chunk.astype(int)
            # Positional column selection; DataFrame.icol() has been removed
            # from pandas, so use iloc instead.
            score = sklearn_f1_score(aligned_states_chunk.iloc[:, 0],
                                     aligned_states_chunk.iloc[:, 1])
            scores_for_meter = scores_for_meter.append(
                {'score': score, 'num_samples': len(aligned_states_chunk)},
                ignore_index=True)

        # Calculate weighted mean
        num_samples = scores_for_meter['num_samples'].sum()
        if num_samples > 0:
            scores_for_meter['proportion'] = (
                scores_for_meter['num_samples'] / num_samples)
            avg_score = (
                scores_for_meter['score'] * scores_for_meter['proportion']
            ).sum()
        else:
            warn("No aligned samples when calculating F1-score for prediction"
                 " meter {} and ground truth meter {}."
                 .format(pred_meter, ground_truth_meter))
            avg_score = np.NaN
        f1_scores[pred_meter.instance()] = avg_score

    return pd.Series(f1_scores)
def f1_score(predictions, ground_truth):
    '''Compute F1 scores.

    .. math::
        F_{score}^{(n)} = \\frac {2 * Precision * Recall} {Precision + Recall}

    Parameters
    ----------
    predictions, ground_truth : nilmtk.MeterGroup

    Returns
    -------
    f1_scores : pd.Series
        Each index is a meter instance int (or tuple for MeterGroups).
        Each value is the F1 score for that appliance.  If there are
        multiple chunks then the value is the weighted mean of the F1
        score for each chunk.
    '''
    # If we import sklearn at top of file then sphinx breaks.
    from sklearn.metrics import f1_score as sklearn_f1_score

    # sklearn produces lots of DeprecationWarnings with PyTables
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    f1_scores = {}
    both_sets_of_meters = iterate_through_submeters_of_two_metergroups(
        predictions, ground_truth)
    for pred_meter, ground_truth_meter in both_sets_of_meters:
        scores_for_meter = pd.DataFrame(columns=['score', 'num_samples'])
        aligned_meters = align_two_meters(pred_meter, ground_truth_meter,
                                          'when_on')
        for aligned_states_chunk in aligned_meters:
            aligned_states_chunk.dropna(inplace=True)
            aligned_states_chunk = aligned_states_chunk.astype(int)
            score = sklearn_f1_score(aligned_states_chunk.iloc[:, 0],
                                     aligned_states_chunk.iloc[:, 1])
            scores_for_meter = scores_for_meter.append(
                {
                    'score': score,
                    'num_samples': len(aligned_states_chunk)
                },
                ignore_index=True)

        # Calculate weighted mean
        num_samples = scores_for_meter['num_samples'].sum()
        if num_samples > 0:
            scores_for_meter['proportion'] = (scores_for_meter['num_samples'] /
                                              num_samples)
            avg_score = (scores_for_meter['score'] *
                         scores_for_meter['proportion']).sum()
        else:
            warn("No aligned samples when calculating F1-score for prediction"
                 " meter {} and ground truth meter {}.".format(
                     pred_meter, ground_truth_meter))
            avg_score = np.NaN
        f1_scores[pred_meter.instance()] = avg_score

    return pd.Series(f1_scores)
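
# A short sketch of the chunk-weighting step above, using made-up per-chunk
# scores: each chunk's F1 is weighted by the number of aligned samples it
# contributed, which is what the 'proportion' column implements.
import pandas as pd

_chunks = pd.DataFrame({'score': [0.90, 0.60], 'num_samples': [900, 100]})
_chunks['proportion'] = _chunks['num_samples'] / _chunks['num_samples'].sum()
weighted_f1 = (_chunks['score'] * _chunks['proportion']).sum()  # 0.9*0.9 + 0.6*0.1 = 0.87
print(weighted_f1)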
def classify_generic(classificator, data):
    start_time = data["start_time"]
    x_test = data["x_test"]
    y_test = data["y_test"]
    x_train = data["x_train"]
    y_train = data["y_train"]
    experiment_hash = data["experiment_hash"]
    name = classificator.__class__.__name__
    console.print(f"\n[yellow]Classificator: [blue]{classificator}\n")

    log('start', 'fit', start_time)
    classificator.fit(x_train, y_train)
    log('end', 'fit', start_time)

    log('start', 'predict', start_time)
    predict = classificator.predict(x_test)
    log('end', 'predict', start_time)

    log('start', 'f1_score', start_time)
    f1_score = utils.round_float(
        sklearn_f1_score(y_test, predict, labels=numpy.unique(predict),
                         average='weighted'))
    log('end', 'f1_score', start_time)

    log('start', 'accuracy', start_time)
    accuracy = utils.round_float(accuracy_score(y_test, predict))
    log('end', 'accuracy', start_time)

    log('start', 'conf_mat', start_time)
    conf_mat = confusion_matrix(y_test, predict)
    log('end', 'conf_mat', start_time)

    log('start', 'save_conf_mat', start_time)
    generate_conf_mat.save_conf_mat(experiment_hash, classificator, name,
                                    x_test, y_test)
    log('end', 'save_conf_mat', start_time)

    log('start', 'classification_report', start_time)
    creport = classification_report(y_test, predict)
    log('end', 'classification_report', start_time)

    time = utils.round_float(utils.get_time_diff(start_time))

    result = {
        'f1_score': f1_score,
        'accuracy': accuracy,
        'conf_mat': conf_mat,
        'creport': creport,
        'time': time
    }
    return result
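
# A toy illustration (independent of the project code above) of the
# `labels=numpy.unique(predict)` choice: restricting `labels` to the classes
# that were actually predicted changes the weighted F1 when some true classes
# never appear in `predict`. The arrays below are invented for illustration.
import numpy
from sklearn.metrics import f1_score as sklearn_f1_score

_y_test = [0, 0, 1, 1, 2, 2]
_predict = [0, 0, 1, 1, 1, 0]  # class 2 is never predicted
print(sklearn_f1_score(_y_test, _predict, labels=numpy.unique(_predict),
                       average='weighted'))  # scores only classes 0 and 1
print(sklearn_f1_score(_y_test, _predict, labels=[0, 1, 2],
                       average='weighted'))  # class 2 contributes an F1 of 0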
def validation_step(self, batch, batch_nb):
    input_ids, attention_mask, token_type_ids, slot_labels = batch

    outputs = self(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )

    # Compute the loss only over non-padding positions.
    active_loss = attention_mask.view(-1) == 1
    active_logits = outputs.view(-1, len(self.slot_labels_type))[active_loss]
    active_labels = slot_labels.view(-1)[active_loss]
    loss = F.cross_entropy(active_logits, active_labels)

    _, y_hat = torch.max(outputs, dim=2)
    y_hat = y_hat.detach().cpu().numpy()
    slot_label_ids = slot_labels.detach().cpu().numpy()

    # Map label ids back to tag strings, skipping ignored positions.
    slot_label_map = {i: label for i, label in enumerate(self.slot_labels_type)}
    slot_gt_labels = [[] for _ in range(slot_label_ids.shape[0])]
    slot_pred_labels = [[] for _ in range(slot_label_ids.shape[0])]
    for i in range(slot_label_ids.shape[0]):
        for j in range(slot_label_ids.shape[1]):
            if slot_label_ids[i, j] != self.ignore_index:
                slot_gt_labels[i].append(slot_label_map[slot_label_ids[i][j]])
                slot_pred_labels[i].append(slot_label_map[y_hat[i][j]])

    # Entity-level F1 (seqeval) and token-level micro F1 (sklearn).
    val_acc = torch.tensor(seqeval_f1_score(slot_gt_labels, slot_pred_labels),
                           dtype=torch.float32)
    token_val_acc = sklearn_f1_score(
        list(chain.from_iterable(slot_gt_labels)),
        list(chain.from_iterable(slot_pred_labels)),
        average="micro",
    )
    token_val_acc = torch.tensor(token_val_acc, dtype=torch.float32)

    return {
        "val_loss": loss,
        "val_acc": val_acc,
        "token_val_acc": token_val_acc
    }
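
# A hedged, toy contrast between the two scores returned above: seqeval's F1 is
# entity-level (a prediction counts only if the whole BIO span matches), while
# sklearn's micro F1 over the flattened tags is token-level. The tag sequences
# below are invented for illustration.
from itertools import chain
from seqeval.metrics import f1_score as seqeval_f1_score
from sklearn.metrics import f1_score as sklearn_f1_score

_gt = [["B-LOC", "I-LOC", "O", "B-PER"]]
_pred = [["B-LOC", "O", "O", "B-PER"]]  # the LOC span is only partially recovered
print(seqeval_f1_score(_gt, _pred))     # entity-level: the LOC entity counts as wrong
print(sklearn_f1_score(list(chain.from_iterable(_gt)),
                       list(chain.from_iterable(_pred)),
                       average="micro"))  # token-level: 3 of 4 tags match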
def f1_score(predictions, ground_truth):
    """Compute F1 scores.

    .. math::
        F_{score}^{(n)} = \\frac {2 * Precision * Recall} {Precision + Recall}

    Parameters
    ----------
    predictions, ground_truth : nilmtk.MeterGroup

    Returns
    -------
    f1_scores : pd.Series
        Each index is a meter instance int (or tuple for MeterGroups).
        Each value is the F1 score for that appliance.  If there are
        multiple chunks then the value is the weighted mean of the F1
        score for each chunk.
    """
    # If we import sklearn at top of file then sphinx breaks.
    from sklearn.metrics import f1_score as sklearn_f1_score

    # sklearn produces lots of DeprecationWarnings with PyTables
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    f1_scores = {}
    both_sets_of_meters = iterate_through_submeters_of_two_metergroups(predictions, ground_truth)
    for pred_meter, ground_truth_meter in both_sets_of_meters:
        scores_for_meter = pd.DataFrame(columns=["score", "n_samples"])
        for aligned_states_chunk in align_two_meters(pred_meter, ground_truth_meter, "when_on"):
            aligned_states_chunk.dropna(inplace=True)
            aligned_states_chunk = aligned_states_chunk.astype(int)
            # Positional column selection; DataFrame.icol() has been removed
            # from pandas, so use iloc instead.
            score = sklearn_f1_score(aligned_states_chunk.iloc[:, 0],
                                     aligned_states_chunk.iloc[:, 1])
            scores_for_meter = scores_for_meter.append(
                {"score": score, "n_samples": len(aligned_states_chunk)}, ignore_index=True
            )

        # Calculate weighted mean
        tot_samples = scores_for_meter["n_samples"].sum()
        scores_for_meter["proportion"] = scores_for_meter["n_samples"] / tot_samples
        avg_score = (scores_for_meter["score"] * scores_for_meter["proportion"]).sum()
        f1_scores[pred_meter.instance()] = avg_score

    return pd.Series(f1_scores)
def do_svm(input_trained_svm, input_test_svm, conj_train=0):
    x_train, y_train = load_svmlight_file(input_trained_svm)
    x_test, y_test = load_svmlight_file(input_test_svm)
    x_train = x_train.toarray()
    x_test = x_test.toarray()

    classificator = svm.SVC()
    classificator.fit(x_train, y_train)
    predict = classificator.predict(x_test)

    f1_score = round_float(
        sklearn_f1_score(y_test, predict, labels=np.unique(predict),
                         average='weighted'))
    accuracy = round_float(accuracy_score(y_test, predict))
    cm = confusion_matrix(y_test, predict)

    # Saving results
    title = 'svm_c' + str(conj_train)
    with open('out/reports/' + title + '_results.txt', 'w') as f:
        with redirect_stdout(f):
            print(f'Accuracy: {accuracy}')
            print(f'F1Score: {f1_score}')
            # Confusion matrix
            print('Confusion Matrix:\n', cm)
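
# A small sketch (independent of do_svm) of the svmlight-format files it
# expects: dump_svmlight_file writes a sparse "label index:value ..." text
# file, and load_svmlight_file reads it back as a sparse matrix plus labels.
# The file name below is a placeholder.
import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

_X = np.array([[0.0, 1.5], [2.0, 0.0]])
_y = np.array([0, 1])
dump_svmlight_file(_X, _y, "example_train.svm")
_X_back, _y_back = load_svmlight_file("example_train.svm")
print(_X_back.toarray(), _y_back)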
        y: np.reshape(y_test, (y_test.shape[0],))
    })

# Get predictions for the test set.
_, y_pred = sess.run([accuracy, predictions],
                     feed_dict={
                         X: test_data,
                         y: np.reshape(y_test, (y_test.shape[0],))
                     })

# Write summaries.
summary_train_acc = acc_summary.eval(
    feed_dict={
        X: train_data,
        y: np.reshape(y_train, (y_train.shape[0],))
    })
summary_test_acc = acc_summary.eval(
    feed_dict={
        X: test_data,
        y: np.reshape(y_test, (y_test.shape[0],))
    })
summary_writer_train.add_summary(summary_train_acc, epoch)
summary_writer_test.add_summary(summary_test_acc, epoch)

print("Epoch: {} Last batch accuracy: {} Test accuracy: {} F1-Score: {}"
      .format(epoch, acc_train, acc_test,
              sklearn_f1_score(y_test, y_pred, average='macro')))

saver.save(sess, LOG_DIR + "/tf_model")