import time

import numpy as np
import paddle.fluid as fluid

import utils  # project-local helpers providing accuracy / classification_report


def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase, f1=False):
    """Evaluation function."""
    test_pyreader.start()
    total_cost, total_acc, total_num_seqs = [], [], []
    y_pred, y_true = [], []
    time_begin = time.time()
    if f1:
        # collect per-example predictions so precision/recall/F1 can be computed
        while True:
            try:
                probs, labels = exe.run(program=test_program,
                                        fetch_list=fetch_list,
                                        return_numpy=True)
                y_pred.extend([np.argmax(prob) for prob in probs])
                y_true.extend([label[0] for label in labels])
            except fluid.core.EOFException:
                test_pyreader.reset()
                break
        time_end = time.time()
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)
        accuracy = utils.accuracy(y_true, y_pred)
        cls_report = utils.classification_report(y_true, y_pred)
        macro_avg = cls_report["macro avg"]
        print("[%s evaluation] accuracy: %f, macro precision: %f, recall: %f, "
              "f1: %f, elapsed time: %f s" %
              (eval_phase, accuracy, macro_avg['precision'],
               macro_avg['recall'], macro_avg['f1-score'],
               time_end - time_begin))
    else:
        # accumulate batch means weighted by batch size for dataset-level averages
        while True:
            try:
                np_loss, np_acc, np_num_seqs = exe.run(program=test_program,
                                                       fetch_list=fetch_list,
                                                       return_numpy=False)
                np_loss = np.array(np_loss)
                np_acc = np.array(np_acc)
                np_num_seqs = np.array(np_num_seqs)
                total_cost.extend(np_loss * np_num_seqs)
                total_acc.extend(np_acc * np_num_seqs)
                total_num_seqs.extend(np_num_seqs)
            except fluid.core.EOFException:
                test_pyreader.reset()
                break
        time_end = time.time()
        print("[%s evaluation] avg loss: %f, avg acc: %f, elapsed time: %f s" %
              (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs),
               np.sum(total_acc) / np.sum(total_num_seqs),
               time_end - time_begin))
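# The non-F1 branch above computes dataset-level averages by weighting each
# batch's mean loss/accuracy with its batch size. A minimal self-contained
# sketch of that reduction (plain numpy; the batch statistics below are
# made-up illustrative values, not tied to Paddle):
import numpy as np

batch_loss = np.array([0.52, 0.47, 0.61])  # mean loss of each batch
batch_acc = np.array([0.80, 0.85, 0.75])   # mean accuracy of each batch
batch_sizes = np.array([32, 32, 16])       # sequences per batch

avg_loss = np.sum(batch_loss * batch_sizes) / np.sum(batch_sizes)
avg_acc = np.sum(batch_acc * batch_sizes) / np.sum(batch_sizes)
print("avg loss: %.4f, avg acc: %.4f" % (avg_loss, avg_acc))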
# method of the validation class; relies on module-level imports of
# logging, pandas as pd, and a classification_report implementation
def using_expert_knowledge(self, real_labels_file):
    """Compare each predicted label column against expert-provided labels."""
    logging.info('Validation results using labelled data from expert')
    real_labels_data = pd.read_csv(real_labels_file, sep=';')
    # keep only methods that appear in both the expert file and our data
    data_combined = pd.merge(left=real_labels_data[['Method', 'CLevel']],
                             right=self.data[['Method'] + self.list_labels],
                             on='Method', how='inner')
    for y_pred in self.list_labels:
        report = classification_report(data_combined['CLevel'],
                                       data_combined[y_pred])
        logging.info('------- {} ------ \n {}'.format(y_pred, report))
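# A standalone sketch of the merge-then-report pattern used above, with
# sklearn's classification_report and a toy frame (column names mirror the
# method; the data is invented for illustration):
import pandas as pd
from sklearn.metrics import classification_report

expert = pd.DataFrame({'Method': ['m1', 'm2', 'm3'],
                       'CLevel': ['high', 'low', 'high']})
predicted = pd.DataFrame({'Method': ['m1', 'm2', 'm3'],
                          'label_a': ['high', 'low', 'low']})
combined = pd.merge(left=expert, right=predicted, on='Method', how='inner')
print(classification_report(combined['CLevel'], combined['label_a']))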
import os
import time

import numpy as np
from keras import optimizers
from keras.utils import plot_model
from sklearn.metrics import accuracy_score

import rna_model
import utils


def train(train_dataset, valid_dataset, validation_bool, test_dataset,
          fam_dict_path, num_column, num_trains, num_tests, test_file_path,
          args):
    # load model
    model = rna_model.DeepRfam(seq_length=args.seq_length,
                               num_c=num_column,
                               num_filters=args.num_filters,
                               filter_sizes=args.filter_sizes,
                               dropout_rate=args.keep_prob,
                               num_classes=args.num_classes,
                               num_hidden=args.num_hidden)
    print(model.summary())

    # model compile; look the optimizer class up by name instead of eval()
    model.compile(
        loss=args.loss_function,
        optimizer=getattr(optimizers, args.optimizer)(lr=args.learning_rate),
        metrics=['accuracy'])

    # start and record training history
    if validation_bool:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            validation_data=valid_dataset,
                                            use_multiprocessing=True,
                                            workers=6)
    else:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            use_multiprocessing=True,
                                            workers=6)

    # # test accuracy
    # t1 = time.time()
    # scores = model.evaluate_generator(test_dataset, steps=num_tests // args.batch_size + 1)
    # delta_t = time.time() - t1
    # print(f"Running time (Prediction):{delta_t} (s)\nAccuracy:{scores[1]}")

    # ================================= logging =================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())

    # determine log file name and `mkdir`
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    os.makedirs(f"{args.log_dir}/{log_file_name}")

    # save model to .h5 file
    model.save(f"{args.log_dir}/{log_file_name}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{args.log_dir}/{log_file_name}/model_structure.png",
               show_shapes=True)

    # save confusion matrix into .csv file
    # (single process: multiprocessing can reorder generator output)
    prediction = model.predict_generator(test_dataset)

    # get the list of true labels: each non-empty line starts with the class index
    with open(test_file_path) as f:
        label_list = []
        for line in f:
            seq_index = line.strip().split(',')[0]
            if seq_index != '':
                label_list.append(int(seq_index))

    prediction = prediction[:len(label_list)]
    prediction_1d = np.array([np.argmax(p) for p in prediction])

    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_dict_path,
                 save_dir=f"{args.log_dir}/{log_file_name}")

    print('Accuracy:', accuracy_score(label_list, prediction_1d))

    # generate the confusion matrix plot (only readable for few classes)
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_dict_path,
                      title='Confusion Matrix',
                      save_dir=f"{args.log_dir}/{log_file_name}")

    # draw and save history plot
    utils.plot_history(train_history, f"{args.log_dir}/{log_file_name}")

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_dict_path,
                                save_dir=f"{args.log_dir}/{log_file_name}",
                                std_out=True)

    # append a summary row to the shared history .csv file
    with open(f"{args.log_dir}/history.csv", 'a') as csv_file:
        print(f'{local_time},{log_file_name},{args.dataset},'
              f'{accuracy_score(label_list, prediction_1d)},'
              f'{str(args.filter_sizes).replace(",", " ")},'
              f'{args.num_filters},{args.batch_size},{args.num_epochs},'
              f'{args.keep_prob},{args.num_hidden},{args.learning_rate},'
              f'{args.loss_function},{args.optimizer},', file=csv_file)
from sklearn.linear_model import LogisticRegression  # missing from the original imports
from sklearn.model_selection import train_test_split
import pandas as pd

# utils.py and separation_mvp.py are in the repo
from utils import classification_report
from separation_mvp import SeparatedClassifier

url = "https://raw.githubusercontent.com/omarfsosa/datasets/master/fairness_synthetic_data.csv"
df = pd.read_csv(url)

X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
    df.drop(columns="y"), df["y"], df["A"], test_size=0.6, random_state=42)

# baseline (unconstrained) classifier
clf = LogisticRegression(solver="lbfgs")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, A_test))

# scores needed for the derived, group-aware thresholds
R_train = clf.predict_proba(X_train)[:, 1]
R_test = clf.predict_proba(X_test)[:, 1]

# target operating point (TPR, FPR) that the derived classifier should hit
goal_tpr, goal_fpr = 0.83591123066577, 0.2639968121139669

fair_clf = SeparatedClassifier(y_train, R_train, A_train)
fair_clf.fit(goal_fpr, goal_tpr)
for k, v in fair_clf.randomized_thresholds.items():
    print(f"Group {k}: t0={v[0]:.2f}, t1={v[1]:.2f}, p={v[2]:.2f}")

y_pred_fair = fair_clf.fair_predict(R_test, A_test)
print(classification_report(y_test, y_pred_fair, A_test))
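# SeparatedClassifier's internals live in separation_mvp.py; the printed
# (t0, t1, p) triples suggest the usual equalized-odds construction, where a
# group's decision threshold is t0 with probability p and t1 otherwise. A
# hedged sketch of how such a randomized rule could be applied (this mirrors
# the standard construction, not necessarily separation_mvp's exact API):
import numpy as np

def randomized_predict(scores, groups, thresholds, seed=0):
    """thresholds: {group: (t0, t1, p)}; returns 0/1 predictions."""
    rng = np.random.default_rng(seed)
    scores, groups = np.asarray(scores), np.asarray(groups)
    out = np.empty(scores.shape, dtype=int)
    for g, (t0, t1, p) in thresholds.items():
        mask = groups == g
        # pick t0 with probability p, t1 otherwise, independently per example
        ts = np.where(rng.random(mask.sum()) < p, t0, t1)
        out[mask] = (scores[mask] > ts).astype(int)
    return out

# toy usage with made-up thresholds
print(randomized_predict([0.2, 0.7, 0.9], ['a', 'a', 'b'],
                         {'a': (0.5, 0.8, 0.3), 'b': (0.4, 0.6, 0.5)}))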
import os
import time

import numpy as np
from keras.utils import plot_model
from sklearn.metrics import accuracy_score

import rna_model
import utils


def train(train_dataset, valid_dataset, validation_bool, test_dataset,
          label_list, fam_path, num_channels, num_trains, num_valids,
          num_tests, args):
    # load model
    model = rna_model.L5CFam(seq_length=args.seq_length,
                             num_filters=args.num_filters,
                             num_channels=num_channels,
                             filter_sizes=args.filter_sizes,
                             dropout_rate=args.keep_prob,
                             num_classes=args.num_classes,
                             num_hidden=args.num_hidden)
    print(model.summary())

    # model compile (optimizer is passed by name, so Keras uses its defaults)
    model.compile(loss=args.loss_function,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])

    # start and record training history
    if validation_bool:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            validation_data=valid_dataset,
                                            workers=6,
                                            use_multiprocessing=True)
    else:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            workers=6,
                                            use_multiprocessing=True)

    # # test accuracy
    # t1 = time.time()
    # scores = model.evaluate_generator(test_dataset, steps=num_tests // args.batch_size + 1)
    # delta_t = time.time() - t1
    # print(f"Running time (Prediction):{delta_t} (s)\nAccuracy:{scores[1]}")

    # ================================= logging =================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())

    # determine log file name and `mkdir`
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    os.makedirs(f"{args.log_dir}/{log_file_name}")

    # save model to .h5 file
    model.save(f"{args.log_dir}/{log_file_name}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{args.log_dir}/{log_file_name}/model_structure.png",
               show_shapes=True)

    # save confusion matrix into .csv file
    prediction = model.predict_generator(test_dataset,
                                         workers=6,
                                         use_multiprocessing=True)
    prediction_1d = np.array([np.argmax(p) for p in prediction])

    # true labels are passed in via `label_list` (previously reconstructed
    # from the evenly-sized class blocks of the test set)
    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_path,
                 save_dir=f"{args.log_dir}/{log_file_name}")

    print('Accuracy:', accuracy_score(label_list, prediction_1d))

    # draw and save history plot
    utils.plot_history(train_history, f"{args.log_dir}/{log_file_name}")

    # generate the confusion matrix plot (only readable for few classes)
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_path,
                      title='Confusion Matrix',
                      save_dir=f"{args.log_dir}/{log_file_name}")

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_path,
                                save_dir=f"{args.log_dir}/{log_file_name}",
                                std_out=True)

    # append a summary row to the shared history .csv file
    with open(f"{args.log_dir}/history.csv", 'a') as csv_file:
        print(f'{local_time},{log_file_name},{args.dataset},'
              f'{accuracy_score(label_list, prediction_1d)},'
              f'{str(args.filter_sizes).replace(",", " ")},'
              f'{str(args.num_filters).replace(",", "")},'
              f'{args.batch_size},{args.num_epochs},{args.keep_prob},'
              f'{str(args.num_hidden).replace(",", "")},'
              f'{args.learning_rate},{args.loss_function},{args.optimizer},',
              file=csv_file)
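# utils.cm2csv is project-local; a minimal sketch of the same idea using
# sklearn + pandas (the label names and output path are illustrative):
import pandas as pd
from sklearn.metrics import confusion_matrix

true_labels = [0, 1, 2, 1]
predicted_labels = [0, 2, 2, 1]
names = ['famA', 'famB', 'famC']  # hypothetical family names

cm = confusion_matrix(true_labels, predicted_labels)
pd.DataFrame(cm, index=names, columns=names).to_csv('confusion_matrix.csv')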