# Assumed imports (not shown in the original):
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, matthews_corrcoef as mcc


def reports(classifier, train_data, train_labels):
    # 5-fold cross-validated accuracy
    kf = KFold(n_splits=5)
    kf.get_n_splits(train_data)
    print(kf)
    scores = []
    for train_index, test_index in kf.split(train_data):
        X_train, X_test = train_data[train_index], train_data[test_index]
        y_train, y_test = train_labels[train_index], train_labels[test_index]
        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)
        scores.append(accuracy_score(predicted, y_test))
    scores = np.array(scores)
    print("Average Accuracy K Fold: ", scores.mean())

    # Single sequential hold-out split, scored with accuracy, sensitivity,
    # specificity and MCC. train_test_split_ratio is a module-level
    # constant defined elsewhere.
    train_data_len = len(train_data)
    chunksize = int(train_data_len * train_test_split_ratio)
    train_x = train_data[0:chunksize]
    train_y = train_labels[0:chunksize]
    test_x = train_data[chunksize:train_data_len]
    test_y = train_labels[chunksize:train_data_len]
    classifier.fit(train_x, train_y)
    predicted = classifier.predict(test_x)
    print("Test Data Results:")
    print("Test Accuracy: ", accuracy_score(predicted, test_y))
    X = classification_report(test_y, predicted, output_dict=True)
    print("Sensitivity: ", X['1']['recall'])
    print("Specificity: ", X['0']['recall'])
    print("MCC: ", mcc(test_y, predicted))
    print("")
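# A minimal harness for calling reports(); the data, the classifier choice,
# and the 0.8 split ratio are illustrative assumptions, not from the original.
from sklearn.ensemble import RandomForestClassifier

train_test_split_ratio = 0.8  # assumed value for the module-level constant
X_demo = np.random.rand(200, 10)
y_demo = np.random.randint(0, 2, size=200)
reports(RandomForestClassifier(n_estimators=100), X_demo, y_demo)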
def ablation(strategy):
    # Iteratively drop one feature at a time and track the out-of-bag MCC.
    model = RandomForestClassifier(n_estimators=50, oob_score=True)
    assert strategy in ["random", "important"]
    x_train = np.array(x).copy()
    y_train = np.array(y).copy()
    mccs = []
    for feat in range(len(x[0])):
        _ = model.fit(x_train, y_train)
        if strategy == "random":
            # drop a randomly chosen column
            x_train = np.delete(x_train, np.random.randint(len(x_train[0])), axis=1)
        elif strategy == "important":
            # drop the most important column
            x_train = np.delete(x_train, np.argmax(model.feature_importances_), axis=1)
        else:
            continue  # unreachable given the assert above
        mccs += [mcc(np.argmax(model.oob_decision_function_, axis=1), y)]
    return mccs
# Assumed imports (not shown in the original):
from typing import Dict

import numpy as np
from sklearn.metrics import confusion_matrix, matthews_corrcoef as mcc


def compute_metrics(preds: np.ndarray, labels: np.ndarray) -> Dict[str, float]:
    # noinspection PyUnresolvedReferences
    acc_score = (preds == labels).mean()
    mcc_score = mcc(labels, preds)
    tot_samp = preds.shape[0]
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    # report each confusion-matrix cell as a fraction of all samples
    tn, fp, fn, tp = ((tn / tot_samp).round(2), (fp / tot_samp).round(2),
                      (fn / tot_samp).round(2), (tp / tot_samp).round(2))
    return {"acc": acc_score, "mcc": mcc_score, "tn": tn, "fp": fp, "fn": fn, "tp": tp}
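# A quick check with hand-verifiable arrays (hypothetical data, not from
# the original):
preds_demo = np.array([1, 0, 1, 1])
labels_demo = np.array([1, 0, 0, 1])
print(compute_metrics(preds_demo, labels_demo))
# acc 0.75, mcc ~= 0.577, tn 0.25, fp 0.25, fn 0.0, tp 0.5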
def evaluate(pred_vals, true_vals, pred_prob):
    # `metrics` is assumed to be sklearn.metrics and `ac` an alias for
    # sklearn's accuracy_score
    precision, recall, thresholds = metrics.precision_recall_curve(
        true_vals, pred_prob)
    return [
        mcc(true_vals, pred_vals),
        metrics.f1_score(true_vals, pred_vals),
        metrics.precision_score(true_vals, pred_vals),
        ac(true_vals, pred_vals),
        metrics.roc_auc_score(true_vals, pred_prob),
        metrics.auc(recall, precision),  # area under the precision-recall curve
    ]
def calc_mcc(yval, yval_rk):
    # convert predicted per-class scores (one row per sample) to class ids
    ycat = np.zeros(yval_rk.shape[0])
    x = 0
    for val in yval_rk:
        index = np.argmax(val)
        ycat[x] = index
        x += 1
    # convert one-hot ground truth to class ids
    yreal = np.zeros(ycat.shape)
    for i in range(yval.shape[0]):
        yreal[i] = np.argmax(yval[i])
    return mcc(yreal, ycat)
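# For ndarray inputs, the two loops above reduce to argmax along axis 1;
# this vectorized form (an equivalent rewrite, not from the original)
# returns the same score.
def calc_mcc_vectorized(yval, yval_rk):
    return mcc(np.argmax(yval, axis=1), np.argmax(yval_rk, axis=1))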
# Assumes aliases such as: accuracy_score as acc, matthews_corrcoef as mcc,
# f1_score as f1, mean_squared_error as mse, average_precision_score as
# auprc (the import block is not shown in the original).
def reportStats(weight, current_iteration, X_train, y_train, X_test, y_test):
    # map -1/+1 style labels to 0/1
    y_train[y_train < 0] = 0
    y_test[y_test < 0] = 0
    # predict_all is a project-specific helper that scores every row with `weight`
    ypred_is = predict_all(X_train, weight)   # in-sample
    ypred_oos = predict_all(X_test, weight)   # out-of-sample

    np_err_handling = np.seterr(invalid='ignore')
    is_acc = acc(y_train, ypred_is)
    is_mcc = mcc(y_train, ypred_is)
    is_f1 = f1(y_train, ypred_is)
    is_mse = mse(y_train, ypred_is)
    oos_acc = acc(y_test, ypred_oos)
    oos_mcc = mcc(y_test, ypred_oos)
    oos_f1 = f1(y_test, ypred_oos)
    oos_mse = mse(y_test, ypred_oos)
    is_tn, is_fp, is_fn, is_tp = confusion_matrix(y_train, ypred_is).ravel()
    oos_tn, oos_fp, oos_fn, oos_tp = confusion_matrix(y_test, ypred_oos).ravel()
    is_auprc = auprc(y_train, ypred_is)
    oos_auprc = auprc(y_test, ypred_oos)
    np.seterr(**np_err_handling)

    print(f"Consensus {current_iteration}: IS acc {is_acc:0.5f}. IS MCC {is_mcc:0.5f}. "
          f"IS F1 {is_f1:0.5f}. IS MSE {is_mse:0.5f}. OOS acc {oos_acc:0.5f}. "
          f"OOS MCC {oos_mcc:0.5f}. OOS F1 {oos_f1:0.5f}. OOS MSE {oos_mse:0.5f}.")
    print(f"Confusion {current_iteration}: IS TP: {is_tp}, IS FP: {is_fp}, IS TN: {is_tn}, "
          f"IS FN: {is_fn}, IS AUPRC: {is_auprc:0.5f}. OOS TP: {oos_tp}, OOS FP: {oos_fp}, "
          f"OOS TN: {oos_tn}, OOS FN: {oos_fn}, OOS AUPRC: {oos_auprc:0.5f}.")
    return (is_acc, is_mcc, is_f1, is_mse, is_auprc,
            oos_acc, oos_mcc, oos_f1, oos_mse, oos_auprc)
def run_growth(x, y):
    # Grow the training set one random sample at a time, starting from one
    # example of each class, and record the out-of-bag MCC at every step.
    model = RandomForestClassifier(n_estimators=50, n_jobs=10, oob_score=True)
    _x = x.copy()
    _y = y.copy()

    # seed with one negative example
    index = np.random.choice(np.where(_y == 0)[0])
    X_training = _x[index].copy()
    Y_training = np.array([_y[index]])
    _x = np.delete(_x, index, 0)
    _y = np.delete(_y, index)

    # and one positive example
    index = np.random.choice(np.where(_y == 1)[0])
    X_training = np.vstack((X_training, _x[index]))
    Y_training = np.append(Y_training, _y[index])
    _x = np.delete(_x, index, 0)
    _y = np.delete(_y, index)

    mccs = []
    for _ in range(len(_x)):
        _ = model.fit(X_training, Y_training)
        mccs += [mcc(np.argmax(model.oob_decision_function_, axis=1), Y_training)]
        index = np.random.randint(len(_x))
        X_training = np.vstack((X_training, _x[index]))
        Y_training = np.append(Y_training, _y[index])
        _x = np.delete(_x, index, 0)
        _y = np.delete(_y, index)
    return mccs
def eval_mcc(y_true, y_prob, show=False):
    # Find the probability threshold that maximizes MCC in a single sweep:
    # sort by predicted probability, then move the cut point while updating
    # the confusion-matrix counts incrementally.
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positives
    numn = n - nump              # number of negatives
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    prev_proba = -1
    best_proba = -1
    mccs = np.zeros(n)
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = y_prob[idx[i]]
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)  # count-based MCC helper, not sklearn's
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_id = i
                best_proba = proba
        mccs[i] = new_mcc
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        score = matthews_corrcoef(y_true, y_pred)
        print(score, best_mcc)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc
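# Inside eval_mcc(), mcc(tp, tn, fp, fn) operates on confusion-matrix
# *counts*, so here `mcc` must be a count-based helper rather than the
# sklearn alias used by the other snippets. A minimal sketch of such a
# helper (assumed, not shown in the original):
def mcc(tp, tn, fp, fn):
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0.0  # convention: MCC is 0 when any marginal count is zero
    return sup / np.sqrt(inf)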
def confusion_matrix(y_true, y_pred, decoder):
    # Per-class confusion matrix, recall, precision, F-score, accuracy,
    # class frequencies, and MCC for sequence predictions, ignoring the
    # padding class (0) and the "-" (space) token.
    reverse_decoder_index = {value: key for key, value in decoder.word_index.items()}
    space_inx = decoder.word_index["-"]
    y = np.argmax(y_true, axis=-1)
    y_ = np.argmax(y_pred, axis=-1)
    mask1 = np.greater(y, 0)
    mask2 = np.not_equal(y, space_inx)
    mask = np.logical_and(mask1, mask2)
    nclass = len(reverse_decoder_index) + 1
    mat = np.zeros([nclass, nclass], dtype=int)
    ym = y[mask]
    y_m = y_[mask]
    for i in range(len(ym)):
        mat[ym[i], y_m[i]] += 1
    sum_class_true = np.sum(mat, axis=-1)
    sum_class_pred = np.sum(mat, axis=0)
    sum_all = np.sum(sum_class_true)
    acc = np.sum(np.diagonal(mat / sum_all))
    # guard against division by zero with a small epsilon
    add_epsilon = lambda x: x + 1e-10 if x == 0 else x
    div1 = sum_class_true.reshape(sum_class_true.shape[0], 1)
    div1 = np.apply_along_axis(func1d=add_epsilon, axis=1, arr=div1)
    div2 = sum_class_pred.reshape(1, sum_class_true.shape[0])
    div2 = np.apply_along_axis(func1d=add_epsilon, axis=0, arr=div2)
    recall = np.diagonal(mat / div1)
    precision = np.diagonal(mat / div2)
    add_epsilon = lambda x: np.where(x == 0.0, x + 1e-10, x)
    freq = sum_class_true / sum_all
    div3 = np.apply_along_axis(add_epsilon, axis=0, arr=recall)
    div4 = np.apply_along_axis(add_epsilon, axis=0, arr=precision)
    div5 = (1 / div3 + 1 / div4)
    f_score = 2 / div5  # harmonic mean of precision and recall
    m = mcc(ym, y_m)
    return mat, recall, precision, f_score, acc, freq, m
def matthews_correlation(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.

    Keras-backend implementation, kept in the docstring for reference:

    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos
    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)
    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + K.epsilon())
    '''
    return mcc(y_true, y_pred)
def matthews_corrcoef(preds, labels):
    """Matthews correlation coefficient.

    .. note::
        The implementation from ``sklearn.metrics`` is used to compute the score.

    Parameters
    ----------
    preds : list or numpy.ndarray
        A list of predictions from a model
    labels : list or numpy.ndarray
        A list of ground truth labels with the same number of elements as ``preds``

    Returns
    -------
    mcc_score : float
        Matthews correlation coefficient of the model
    """
    preds = _numpyfy(preds)
    labels = _numpyfy(labels)
    return mcc(preds, labels)
def evaluate(logits, labels):
    # Flatten per-image 4-class softmax probabilities, then score the
    # argmax predictions with weighted F1 and MCC.
    all_targets = []
    all_probs_0 = []
    all_probs_1 = []
    all_probs_2 = []
    all_probs_3 = []
    for i in range(len(logits)):
        probs = torch.nn.Softmax(dim=0)(logits[i]).detach().cpu().numpy()
        all_probs_0.extend(probs[0].ravel())
        all_probs_1.extend(probs[1].ravel())
        all_probs_2.extend(probs[2].ravel())
        all_probs_3.extend(probs[3].ravel())
        target = labels[i].numpy()
        all_targets.append(target.ravel())
    all_probs_np = np.stack([all_probs_0, all_probs_1, all_probs_2, all_probs_3], axis=1)
    all_preds_np = np.argmax(all_probs_np, axis=1)
    all_targets_np = np.hstack(all_targets)
    return (f1_score(all_targets_np, all_preds_np, average='weighted'),
            mcc(all_targets_np, all_preds_np))
def qualitativeValidation(self):
    ''' performs validation for qualitative models '''
    # Make a copy of the original matrices
    X = self.X.copy()
    Y = self.Y.copy()

    # Get predicted classes
    Yp = self.estimator.predict(X)
    if len(Yp) != len(Y):
        raise Exception('Length of experimental and predicted Y do not match')

    info = []

    # Get confusion matrix for predicted Y
    try:
        self.TNpred, self.FPpred, self.FNpred, self.TPpred = confusion_matrix(
            Y, Yp, labels=[0, 1]).ravel()
        self.sensitivityPred = (self.TPpred / (self.TPpred + self.FNpred))
        self.specificityPred = (self.TNpred / (self.TNpred + self.FPpred))
        self.mccp = mcc(Y, Yp)

        info.append(('TPpred', 'True positives', self.TPpred))
        info.append(('TNpred', 'True negatives', self.TNpred))
        info.append(('FPpred', 'False positives', self.FPpred))
        info.append(('FNpred', 'False negatives', self.FNpred))
        info.append(('SensitivityPred', 'Sensitivity in fitting',
                     self.sensitivityPred))
        info.append(('SpecificityPred', 'Specificity in fitting',
                     self.specificityPred))
        info.append(('MCCpred', 'Matthews Correlation Coefficient', self.mccp))
        LOG.debug('Computed class prediction for estimator instances')
    except Exception as e:
        LOG.error(f'Error computing class prediction of Yexp '
                  f'with exception {e}')
        raise e

    # Get cross-validated Y
    try:
        y_pred = cross_val_predict(self.estimator, X, Y, cv=self.cv, n_jobs=-1)
    except Exception as e:
        LOG.error(f'Cross-validation failed with exception {e}')
        raise e

    # Get confusion matrix
    try:
        self.TN, self.FP, self.FN, self.TP = confusion_matrix(
            Y, y_pred, labels=[0, 1]).ravel()
    except Exception as e:
        LOG.error(f'Failed to compute confusion matrix with exception {e}')
        raise e

    try:
        self.sensitivity = (self.TP / (self.TP + self.FN))
    except Exception as e:
        LOG.error(f'Failed to compute sensitivity with exception {e}')
        self.sensitivity = '-'

    try:
        self.specificity = (self.TN / (self.TN + self.FP))
    except Exception as e:
        LOG.error(f'Failed to compute specificity with exception {e}')
        self.specificity = '-'

    try:
        # Compute Matthews Correlation Coefficient
        self.mcc = (((self.TP * self.TN) - (self.FP * self.FN)) /
                    np.sqrt((self.TP + self.FP) * (self.TP + self.FN) *
                            (self.TN + self.FP) * (self.TN + self.FN)))
    except Exception as e:
        LOG.error(f'Failed to compute Matthews Correlation Coefficient with '
                  f'exception {e}')
        self.mcc = '-'

    info.append(('TP', 'True positives in cross-validation', self.TP))
    info.append(('TN', 'True negatives in cross-validation', self.TN))
    info.append(('FP', 'False positives in cross-validation', self.FP))
    info.append(('FN', 'False negatives in cross-validation', self.FN))
    info.append(('Sensitivity', 'Sensitivity in cross-validation',
                 self.sensitivity))
    info.append(('Specificity', 'Specificity in cross-validation',
                 self.specificity))
    info.append(('MCC', 'Matthews Correlation Coefficient in cross-validation',
                 self.mcc))
    # the original appended two entries under the same 'Y_adj' key (one with
    # Y, one with Yp); only the fitted predictions are kept here
    info.append(('Y_adj', 'Adjusted Y values', Yp))
    info.append(('Y_pred', 'Predicted Y values after cross-validation', y_pred))
    LOG.debug('Qualitative crossvalidation performed')

    results = {}
    results['quality'] = info
    results['Y_adj'] = Yp
    results['Y_pred'] = y_pred
    return True, results
def random_forest_training(X, y, stratify_array, experiment_folder_path,
                           train_test_splits=TRAIN_TEST_SPLIT_RUN,
                           cv_nsplits=CV_NSPLITS, cv_repeats=CV_REPEATS):
    """Repeated stratified CV of a random forest inside several train/test splits."""
    for train_test_split_run in range(train_test_splits):
        mcc_scores = []
        acc_scores = []

        # Create the folder for the current experiment
        train_test_run_folder_path = os.path.join(
            experiment_folder_path, '{}'.format(train_test_split_run))
        os.makedirs(train_test_run_folder_path, exist_ok=True)
        feat_rankings_folder = os.path.join(train_test_run_folder_path,
                                            'features_importance')
        os.makedirs(feat_rankings_folder, exist_ok=True)

        X_tr, X_ts, y_tr, y_ts, S_tr, S_ts = splt(
            X, y, stratify_array, test_size=0.2,
            random_state=train_test_split_run, stratify=stratify_array)
        print('Experiment {} out of {} ...'.format(train_test_split_run + 1,
                                                   train_test_splits), end=' ')

        rskf_ = rskf(n_splits=cv_nsplits, n_repeats=cv_repeats, random_state=42)
        cv_exp_number = 1
        for train_index, val_index in rskf_.split(X_tr, S_tr):
            X_train, X_val = X_tr[train_index], X_tr[val_index]
            y_train, y_val = y_tr[train_index], y_tr[val_index]
            forest = rfc(n_estimators=1000, n_jobs=-1)
            forest.fit(X_train, y_train)
            y_pred_val = forest.predict(X_val)
            mc = mcc(y_val, y_pred_val)
            ac = acc(y_val, y_pred_val)
            mcc_scores.append(mc)
            acc_scores.append(ac)
            # Save feature ranking
            np.savez(os.path.join(
                feat_rankings_folder,
                'feat_ranking_{:02d}.npz'.format(cv_exp_number)),
                ranking=forest.feature_importances_)
            rf_pickle_filepath = os.path.join(
                train_test_run_folder_path,
                'forest_{:02d}.pkl'.format(cv_exp_number))
            with open(rf_pickle_filepath, 'wb') as pickle_file:
                pickle.dump(forest, pickle_file)
            cv_exp_number += 1

        # Re-train everything from scratch on the entire training set
        forest = rfc(n_estimators=1000, n_jobs=-1)
        forest.fit(X_tr, y_tr)
        y_ts_our = forest.predict(X_ts)
        mc = mcc(y_ts, y_ts_our)
        ac = acc(y_ts, y_ts_our)
        rf_pickle_filepath = os.path.join(train_test_run_folder_path,
                                          'forest_training.pkl')
        with open(rf_pickle_filepath, 'wb') as pickle_file:
            pickle.dump(forest, pickle_file)

        # Store the logs for this experiment
        log_file_path = os.path.join(train_test_run_folder_path, 'log.csv')
        mcc_ci_min, mcc_ci_max = bootstrap_ci(np.asarray(mcc_scores))
        acc_ci_min, acc_ci_max = bootstrap_ci(np.asarray(acc_scores))
        scores = pd.DataFrame(
            {
                'ACC': np.mean(acc_scores),
                'ACC_CI_MIN': acc_ci_min,
                'ACC_CI_MAX': acc_ci_max,
                'MCC': np.mean(mcc_scores),
                'MCC_CI_MIN': mcc_ci_min,
                'MCC_CI_MAX': mcc_ci_max,
                'ACC_TEST': ac,
                'MCC_TEST': mc,
            }, index=[0])
        scores.to_csv(log_file_path, sep=',')
        print('Done')
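# bootstrap_ci() is a project helper not shown above; a plausible
# percentile-bootstrap sketch for a confidence interval of the mean
# (assumed implementation, hypothetical defaults):
def bootstrap_ci(scores, n_boot=1000, alpha=0.05, seed=42):
    rng = np.random.RandomState(seed)
    boot_means = np.array([
        rng.choice(scores, size=len(scores), replace=True).mean()
        for _ in range(n_boot)
    ])
    return (np.percentile(boot_means, 100 * alpha / 2),
            np.percentile(boot_means, 100 * (1 - alpha / 2)))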
def optimize(self, X, Y, estimator, tune_parameters):
    ''' optimizes a model using a grid search over a range of values
        for diverse parameters '''
    print("Optimizing PLS-DA algorithm")
    # numbers of latent variables to scan
    latent_variables = tune_parameters["n_components"]
    mcc_final = 0
    estimator0 = ""
    # best threshold and MCC per number of latent variables
    list_latent = []
    for n_comp in latent_variables:
        mcc0 = 0
        estimator.set_params(**{"n_components": n_comp})
        y_pred = cross_val_predict(estimator, X, Y, cv=self.cv, n_jobs=1)
        estimator1 = ""
        threshold_1 = 0
        # scan candidate cutoffs from 0.00 to 0.95 in steps of 0.05
        for threshold in range(0, 100, 5):
            threshold = threshold / 100
            y_pred2 = copy.copy(y_pred)
            y_pred2[y_pred2 < threshold] = 0
            y_pred2[y_pred2 >= threshold] = 1
            mcc1 = mcc(Y, y_pred2)
            if mcc1 >= mcc0:
                mcc0 = mcc1
                estimator1 = copy.copy(estimator)
                estimator1.set_params(**{'threshold': threshold})
                threshold_1 = threshold
        if mcc0 >= mcc_final:
            mcc_final = mcc0
            estimator0 = copy.copy(estimator1)
            self.estimator = estimator0
        list_latent.append([n_comp, threshold_1, mcc0])

    print("MCC per latent variable at best cutoff")
    for el in list_latent:
        print("Number of latent variables: %s \nBest cutoff: %s \nMCC: %s\n"
              % (el[0], el[1], el[2]))
    self.estimator = estimator0
    self.estimator.fit(X, Y)
    print(self.estimator.get_params())
def evaluate_partitions(keep_bin_edges, df_processed):
    """
    Evaluates a lightweight classifier at each candidate threshold.
    Inputs are a list of bin edges for the continuous target and the
    processed df.
    """
    # initialize the empty lists
    accs = []
    aucs = []
    mccs = []
    apcs = []
    accs_control = []
    aucs_control = []
    mccs_control = []
    apcs_control = []
    threshs = []
    bin_pct = []

    # starting data percentile
    pct = 0.0

    # binning parameters fixed - DO NOT CHANGE
    num_bins = 10
    num_trials = 10

    # sweep through all bin edges
    for bin_edge in keep_bin_edges:
        threshold = bin_edge
        # obtain the X, y matrices
        X, X_control, y = partition_data(df_processed, threshold)
        # current data percentile
        pct += 1 / num_bins
        for trial in range(num_trials):
            # get the training, testing, and control data sets
            x_train_idf, y_train, x_test_idf, y_test, x_control_idf = \
                split_transform_data(X, X_control, y)
            # fit the classifier
            clf = ComplementNB(alpha=0.1, class_prior=None, fit_prior=True,
                               norm=False)
            clf.fit(x_train_idf, y_train)
            # evaluate on test and control sets
            accs.append(clf.score(x_test_idf, y_test))
            accs_control.append(clf.score(x_control_idf, y))
            y_pred = clf.predict(x_test_idf)
            y_pred_cont = clf.predict(x_control_idf)
            mccs.append(mcc(y_test, y_pred))
            mccs_control.append(mcc(y, y_pred_cont))
            y_proba = clf.predict_proba(x_test_idf)
            y_cont_proba = clf.predict_proba(x_control_idf)
            aucs.append(roc_auc_score(y_test, y_proba[:, 1]))
            aucs_control.append(roc_auc_score(y, y_cont_proba[:, 1]))
            apcs.append(apscore(y_test, y_proba[:, 1]))
            apcs_control.append(apscore(y, y_cont_proba[:, 1]))
            threshs.append(threshold)
            bin_pct.append(pct)

    # populate a df for downstream analysis
    df_eval = pd.DataFrame()
    df_eval['data percentile'] = bin_pct   # data percentile
    df_eval['threshold'] = threshs         # bin edge
    df_eval['test accuracy'] = accs        # accuracy
    df_eval['test mcc'] = mccs             # Matthews correlation coefficient
    df_eval['test auc'] = aucs             # ROC-AUC
    df_eval['test ap'] = apcs              # average precision
    df_eval['control accuracy'] = accs_control
    df_eval['control mcc'] = mccs_control
    df_eval['control auc'] = aucs_control
    df_eval['control ap'] = apcs_control
    return df_eval
def fit(self, epoch, train_loader, verbose=True):
    X_train = []
    y_train = []
    cluster_ids_train = []
    for batch_idx, (data, y) in enumerate(train_loader):
        batch_size = data.size()[0]
        data = data.view(batch_size, -1).to(self.device)
        # Collect training data and labels for the later classifier
        X_train.append(data.cpu().numpy())
        y_train.extend(y.numpy())
        # Get the latent features
        with torch.no_grad():
            latent_X = self.autoencoder(data, latent=True)
            latent_X = latent_X.cpu().numpy()

        if self.args.clustering == "cac":
            cluster_id = self.clustering.cluster(latent_X, y, self.args.beta,
                                                 self.args.alpha)
        else:
            # [Step-1] Update the assignment results
            cluster_id = self.clustering.update_assign(latent_X, y)
            # [Step-2] Update cluster centers in batch clustering
            elem_count = np.bincount(cluster_id,
                                     minlength=self.args.n_clusters)
            for k in range(self.args.n_clusters):
                # avoid empty slicing
                if elem_count[k] == 0:
                    continue
                # update the cluster center
                self.clustering.update_cluster(latent_X[cluster_id == k], k)

        # [Step-3] Update the network parameters
        loss, rec_loss, dist_loss = self._loss(data, cluster_id)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # if verbose and (batch_idx + 1) % self.args.log_interval == 0:
        msg = 'Epoch: {:02d} | Batch: {:03d} | Loss: {:.3f} | Rec-' \
              'Loss: {:.3f} | Dist-Loss: {:.3f}'
        print(msg.format(epoch, batch_idx + 1,
                         loss.detach().cpu().numpy(), rec_loss, dist_loss))

    X_train = np.vstack(X_train)
    self.eval()
    if self.args.clustering == "cac":
        with torch.no_grad():
            latent_X_train = self.autoencoder(
                torch.FloatTensor(np.array(X_train)).to(self.args.device),
                latent=True)
            latent_X_train = latent_X_train.to(self.args.device).numpy()
        cluster_ids_train = self.clustering.update_assign(latent_X_train)
        y_train = np.array(y_train)
        X_train = latent_X_train

    print("Training Base classifier")
    classifier = self.get_classifier(self.classifier)
    classifier.fit(X_train, y_train)
    self.base_classifier.append(classifier)
    print("Base Training F1:",
          f1_score(y_train, classifier.predict(X_train).ravel()))
    print("Base Training MCC:",
          mcc(y_train, classifier.predict(X_train).ravel()))
    print("Base Training AUC:",
          roc_auc_score(y_train, classifier.predict_proba(X_train)[:, 1]))

    print("Training CAC classifiers")
    self.cluster_classifiers.append([])
    y_pred = []
    y_true = []
    y_pred_proba = []
    for j in range(self.args.n_clusters):
        cluster_indices = np.where(cluster_ids_train == j)[0]
        X_cluster = X_train[cluster_indices]
        y_cluster = y_train[cluster_indices]
        y_true.extend(y_cluster)
        classifier = self.get_classifier(self.classifier)
        if np.unique(y_cluster).shape[0] > 1:
            classifier.fit(X_cluster, y_cluster.ravel())
            print("CAC Training F1:",
                  f1_score(y_cluster, classifier.predict(X_cluster)))
            print("CAC Training MCC:",
                  mcc(y_cluster, classifier.predict(X_cluster)))
            print("CAC Training AUC:",
                  roc_auc_score(y_cluster,
                                classifier.predict_proba(X_cluster)[:, 1]))
            y_pred.extend(classifier.predict(X_cluster))
            y_pred_proba.extend(classifier.predict_proba(X_cluster)[:, 1])
        else:
            # single-class cluster: fall back to random labels
            print("Fitting random classifier, Iteration:", j)
            tmp = np.random.randint(2, size=y_cluster.shape[0])
            y_pred.extend(tmp)
            y_pred_proba.extend(tmp)
            classifier.fit(X_cluster, tmp)
        self.cluster_classifiers[-1].append(classifier)

    print("Final CAC Training F1:", f1_score(y_true, y_pred))
    print("Final CAC Training MCC:", mcc(y_true, y_pred))
    print("Final CAC Training AUC:", roc_auc_score(y_true, y_pred_proba))
def matthews_corrcoef(preds, labels):
    preds = numpyfy(preds)
    labels = numpyfy(labels)
    return mcc(preds, labels)
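# numpyfy() is not shown in the original; a plausible sketch that coerces
# lists, tuples, or torch tensors to ndarrays (assumed implementation):
import numpy as np

def numpyfy(values):
    if hasattr(values, "detach"):  # torch.Tensor
        return values.detach().cpu().numpy()
    return np.asarray(values)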
def param_search_lgb(
    params: dict,
    n_iter: int,
    X_train,
    y_train,
    cv=None,
    learner_n_jobs: int = -1,
    search_n_jobs: int = 1,
    X_test=None,
    y_test=None,
    device="cpu",
    cv_filename="cv_results.h5",
    params_filename="params.txt",
    **kwargs,
) -> dict:
    """Assisted param search for a LightGBM classifier.

    Holds max_depth fixed, iterates through num_leaves, and performs a
    random search for each (max_depth, num_leaves) combo. This is useful
    because the relationship num_leaves < 2**max_depth should hold; a
    blind random search may select pairs that violate this condition.

    The method writes cv results into an HDF5 file, indexed by
    max_depth/num_leaves keys. It also writes the best params into a
    text file.

    Parameters
    ----------
    params : dict
        Random search parameter dict. Must have {'max_depth', 'num_leaves'}.
    n_iter : int
        Number of searches.
    X_train : TYPE
    y_train : TYPE
    cv : None, optional
        Cross-validation indices if given.
    learner_n_jobs : int, optional
        Default -1, use all CPUs for learner fitting.
    search_n_jobs : int, optional
        Default 1, use only 1 CPU for the search. This is because by
        default the learner is allowed to use all CPUs already.
    X_test : None, optional
        Test/valid data.
    y_test : None, optional
        Test/valid targets.
    device : str, optional
        Default 'cpu', can be either of {'cpu', 'gpu'}.
    cv_filename : str, optional
        File to store cv results.
    params_filename : str, optional
        File to store best params.
    **kwargs
        kwargs passed to sklearn.RandomizedSearchCV.

    Returns
    -------
    dict
        Keys:
        best_params: best parameters
        best_score: best achieved score
        best_learner: fitted best model
    """
    # some params
    max_depth = params.pop("max_depth")
    num_leaves = params.pop("num_leaves")
    assert max_depth is not None
    assert len(max_depth) == 1
    assert num_leaves is not None
    assert len(num_leaves) > 0
    max_depth = max_depth[0]

    out = dict()
    best_score = None
    best_params = None
    best_learner = None
    start = time.process_time()
    for n in num_leaves:
        print(f"max_depth = {max_depth}, num_leaves = {n}...")
        learner = lgb.LGBMClassifier(
            max_depth=max_depth,
            num_leaves=n,
            # boosting_type='gbdt',
            # objective='xentropy',
            # eval_metric='binary_logloss',
            # early_stopping_rounds=100,
            # verbose_eval=200,
            device=device,
            # verbosity=lgb_verbosity,
            n_jobs=learner_n_jobs,
        )
        rs = RandomizedSearchCV(
            learner,
            params,
            cv=cv,
            n_jobs=search_n_jobs,
            n_iter=n_iter,
            return_train_score=False,
            **kwargs,
        )
        # model_selection._search.format_results() sometimes has a bug and
        # returns nothing, causing ValueError when unpacking output.
        rs.fit(X_train, y_train, verbose=-1)

        if best_score is None or best_score < rs.best_score_:
            best_score = rs.best_score_
            best_learner = rs.best_estimator_
            best_params = rs.best_params_.copy()
            best_params["max_depth"] = max_depth
            best_params["num_leaves"] = n

        key = f"max_depth_{max_depth}_num_leaves_{n}"
        # store this search object for later use
        out[key] = rs

        # save cv scores
        if cv is not None:
            pd.DataFrame(rs.cv_results_).to_hdf(cv_filename, key=key, mode="a")

        # predict test set
        if X_test is not None and y_test is not None:
            assert len(X_test) == len(y_test)
            y_pred = rs.predict(X_test)
            train_loss = rs.score(X_train, y_train)
            test_loss = rs.score(X_test, y_test)
            test_mcc = mcc(y_test, y_pred)
            msg = (f"{key}, Dev score: {train_loss:.3f}, Test score: "
                   + f"{test_loss:.3f}, Test MCC: {test_mcc:.3f}\n")
            print(msg)
            print(f"{key}, Save TSCV Best params = {best_params}")
            with open(params_filename, "a") as fp:
                fp.write(msg)
                fp.write(str(best_params))
                fp.write("\n")

    time_taken = time.process_time() - start
    print("Time taken (s): ", time_taken)

    # write final best params
    with open(params_filename, "a") as fp:
        # convert to python types for json writes
        # best_params = {k: np.asscalar(v) for k, v in best_params.items()}
        # fp.write(json.dumps(best_params))
        fp.write("Final result:\n")
        fp.write(str(best_params))
        fp.write("\n\n")

    out["best_params"] = best_params
    out["best_score"] = best_score
    out["best_learner"] = best_learner
    return out
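# A hypothetical invocation of param_search_lgb(); the search space and the
# X_train/y_train names are illustrative only, not from the original.
param_dist = {
    "max_depth": [7],
    "num_leaves": [31, 63, 127],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [200, 500],
}
result = param_search_lgb(param_dist, n_iter=5, X_train=X_train, y_train=y_train)
print(result["best_params"], result["best_score"])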
X_t_2 = enc_t.transform(X_t_1)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_t_1, y, random_state=0)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_t_2, y, random_state=0)

print('Accuracy on dataset converted from label to integer category:')

logistic_regression = LogisticRegression(random_state=0)
clf_lr = logistic_regression.fit(X_train_1, y_train_1)
acc_lr = clf_lr.score(X_test_1, y_test_1)
lr_pred = logistic_regression.predict(X_test_1)
lr_mcc = mcc(y_test_1, lr_pred)
print('logistic regression accuracy: {}'.format(acc_lr))
print('logistic regression MCC: {}'.format(lr_mcc))

naive_bayes = BernoulliNB()
clf_bnb = naive_bayes.fit(X_train_1, y_train_1)
acc_bnb = clf_bnb.score(X_test_1, y_test_1)
nb_pred = naive_bayes.predict(X_test_1)
nb_mcc = mcc(y_test_1, nb_pred)
print('naive bayes accuracy: {}'.format(acc_bnb))
print('naive bayes MCC: {}'.format(nb_mcc))

gradient_boosting = xgboost.XGBClassifier()
clf_xb = gradient_boosting.fit(X_train_1, y_train_1)
acc_xb = clf_xb.score(X_test_1, y_test_1)
gb_pred = gradient_boosting.predict(X_test_1)
def eval_model(model_type="ord", look_at_test_set=False, dropout=0):
    # Use the same dropout and number of epochs across models (initial
    # cross-validations were used to select them).
    epochs = 40
    testscore = None
    xtrain, xval, ytrain, yval, xtest, ytest = load_data(model_type)
    if model_type == "rf":
        model = RandomForestClassifier(n_estimators=1000, n_jobs=3,
                                       min_samples_split=35,
                                       min_samples_leaf=5, oob_score=True)
        if look_at_test_set:
            # fold the validation set into training when reporting test scores
            xtrain = np.vstack([xtrain, xval])
            ytrain = np.concatenate([ytrain, yval])
        model.fit(xtrain[:, 0:180], ytrain, sample_weight=xtrain[:, -1])
        trainpreds, valpreds, testpreds = (model.predict(xtrain[:, 0:180]),
                                           model.predict(xval[:, 0:180]),
                                           model.predict(xtest[:, 0:180]))
    elif model_type == "enrich":
        model = enrichment_nn(dropout=dropout, input_dim=180, l2=0.0000)
        model.trainmod(xtrain, epochs=epochs, minibatch=250, lr=0.005,
                       use_weights=True)
        trainpreds, valpreds, testpreds = (model.predict(xtrain[:, 0:180])[1],
                                           model.predict(xval[:, 0:180])[1],
                                           model.predict(xtest[:, 0:180])[1])
        trainscore = r2_score(trainpreds, ytrain)
        valscore = r2_score(valpreds, yval)
        if look_at_test_set:
            testscore = r2_score(testpreds, ytest)
    else:
        model_dict = {"ord": ord_nn, "nom": nominal_classifier,
                      "bin": bin_class_nn}
        dropout_dict = {"ord": 0.3, "nom": 0.3, "bin": 0.4}
        model = model_dict[model_type](dropout=dropout_dict[model_type],
                                       input_dim=180, l2=0.0000)
        if model_type == "bin" and look_at_test_set:
            xtrain = np.vstack([xtrain, xval])
            ytrain = np.concatenate([ytrain, yval])
        model.trainmod(xtrain, epochs=epochs, minibatch=250, lr=0.005,
                       use_weights=True)
        trainpreds, valpreds, testpreds = (model.predict(xtrain[:, 0:180])[1],
                                           model.predict(xval[:, 0:180])[1],
                                           model.predict(xtest[:, 0:180])[1])
    if model_type != "enrich":
        # MCC is symmetric, so the (preds, labels) argument order is harmless
        trainscore = mcc(trainpreds, ytrain)
        valscore = mcc(valpreds, yval)
        if look_at_test_set:
            testscore = mcc(testpreds, ytest)
    print_model_eval_results(trainscore, valscore, testscore,
                             model_description=model_type)
def optimize(self, X, Y, estimator, tune_parameters):
    ''' optimizes a model using a grid search over a range of values
        for diverse parameters '''
    LOG.info('Optimizing PLS-DA algorithm using a local '
             'implementation of gridsearch cv specially designed '
             'for PLS discriminant analysis')
    # Numbers of latent variables to scan
    latent_variables = tune_parameters["n_components"]
    # Matthews correlation coefficient of the best threshold
    mcc_final = 0
    estimator0 = ""
    # List of the best threshold and Matthews correlation
    # coefficient for each number of latent variables
    list_latent = []
    try:
        for n_comp in latent_variables:
            mcc0 = 0
            estimator.set_params(**{"n_components": n_comp})
            y_pred = cross_val_predict(estimator, X, Y, cv=self.cv, n_jobs=1)
            estimator1 = ""
            threshold_1 = 0
            # Get optimum threshold
            for threshold in range(0, 100, 5):
                threshold = threshold / 100
                y_pred2 = copy.copy(y_pred)
                y_pred2[y_pred2 < threshold] = 0
                y_pred2[y_pred2 >= threshold] = 1
                mcc1 = mcc(Y, y_pred2)
                # Update threshold value with current best value
                if mcc1 >= mcc0:
                    mcc0 = mcc1
                    estimator1 = copy.copy(estimator)
                    estimator1.set_params(**{'threshold': threshold})
                    threshold_1 = threshold
            # Assign the best current estimator to the class
            if mcc0 >= mcc_final:
                mcc_final = mcc0
                estimator0 = copy.copy(estimator1)
                self.estimator = estimator0
            list_latent.append([n_comp, threshold_1, mcc0])
    except Exception as e:
        LOG.error(f'Error optimizing PLS-DA with exception {e}')
        raise e

    LOG.debug('Number of latent variables, best cutoff, and its Matthews '
              'correlation coefficient')
    for lv in list_latent:
        LOG.debug(f'Number of latent variables: '
                  f'{lv[0]} \nBest cutoff: {lv[1]} \nMCC: {lv[2]}\n')
    self.estimator.fit(X, Y)
    LOG.info(f'Estimator best parameters: {self.estimator.get_params()}')
with open(save_dir_matFiles + 'name_loc_prob.pkl', 'wb') as fid:
    pickle.dump(name_loc_prob, fid)

test_pred = np.around(test_pred)
C_test = confusion_matrix(Y_test, test_pred)
per_class_accuracy_test = (np.diag(C_test.astype(np.float32)) /
                           np.sum(C_test.astype(np.float32), axis=1))
print(per_class_accuracy_test)
print("testing mcc score:", mcc(Y_test, test_pred))
print("testing F1 score:", f1(Y_test, test_pred))
#%%
# Train a random forest classifier.
# label: GC, data: first-, second-, or third-hop attributes depending on
# the `classifier` switch below
print("-------TRAINING CLASSIFIERS-------")
# second_hop_attri = cPickle.load(open(save_dir_matFiles + 'second_hop_attributes.pkl', 'rb'))

clf1 = RandomForestClassifier(n_jobs=int(multiprocessing.cpu_count() / 2),
                              verbose=0, class_weight='balanced',
                              n_estimators=n_estimators_rf)
label_flatten = GC.reshape((-1))

if classifier == 1:
    # use first-hop attributes only
    one_hop_attri_flatten = one_hop_attri.reshape((-1, 12))
    clf1.fit(one_hop_attri_flatten, label_flatten)
    predicted_train_label = clf1.predict(one_hop_attri_flatten)
    mcc_score = mcc(label_flatten, predicted_train_label)
elif classifier == 2:
    # use second-hop attributes only
    sec_hop_flatten = second_hop_attri.reshape((-1, 50))
    clf1.fit(sec_hop_flatten, label_flatten)
    predicted_train_label = clf1.predict(sec_hop_flatten)
    mcc_score = mcc(label_flatten, predicted_train_label)
elif classifier == 3:
    # use third-hop attributes only
    third_hop_flatten = third_hop_attri.reshape((-1, 150))
    clf1.fit(third_hop_flatten, label_flatten)
    predicted_train_label = clf1.predict(third_hop_flatten)
    mcc_score = mcc(label_flatten, predicted_train_label)
def external_validation(self):
    ''' when experimental values are available for the predicted
        compounds, run external validation '''
    ext_val_results = []

    # Ye are the y values present in the input file
    Ye = np.asarray(self.conveyor.getVal("ymatrix"))

    # For qualitative models, make sure the Y is qualitative as well
    if not self.param.getVal("quantitative"):
        qy, message = utils.qualitative_Y(Ye)
        if not qy:
            self.conveyor.setWarning(
                f'No qualitative activity suitable for external validation '
                f'"{message}". Skipping.')
            LOG.warning(
                f'No qualitative activity suitable for external validation '
                f'"{message}". Skipping.')
            return

    # There are four variants of external validation, depending on whether
    # the method is conformal or non-conformal and the model is qualitative
    # or quantitative
    if not self.param.getVal("conformal"):
        # non-conformal
        if not self.param.getVal("quantitative"):
            # non-conformal & qualitative
            Yp = np.asarray(self.conveyor.getVal("values"))
            if Ye.size == 0:
                raise ValueError("Experimental activity vector is empty")
            if Yp.size == 0:
                raise ValueError("Predicted activity vector is empty")

            # the use of labels is compulsory to inform the confusion matrix
            # that it must return a 2x2 confusion matrix. Otherwise it will
            # fail when a single class is represented (all TP, for example)
            TN, FP, FN, TP = confusion_matrix(Ye, Yp, labels=[0, 1]).ravel()

            # protect to avoid warnings in special cases (div by zero)
            MCC = mcc(Ye, Yp)
            if (TP + FN) > 0:
                sensitivity = (TP / (TP + FN))
            else:
                sensitivity = 0.0
            if (TN + FP) > 0:
                specificity = (TN / (TN + FP))
            else:
                specificity = 0.0

            ext_val_results.append(
                ('TP', 'True positives in external-validation', float(TP)))
            ext_val_results.append(
                ('TN', 'True negatives in external-validation', float(TN)))
            ext_val_results.append(
                ('FP', 'False positives in external-validation', float(FP)))
            ext_val_results.append(
                ('FN', 'False negatives in external-validation', float(FN)))
            ext_val_results.append(
                ('Sensitivity', 'Sensitivity in external-validation',
                 float(sensitivity)))
            ext_val_results.append(
                ('Specificity', 'Specificity in external-validation',
                 float(specificity)))
            ext_val_results.append(
                ('MCC', 'Matthews Correlation Coefficient in external-validation',
                 float(MCC)))
        else:
            # non-conformal & quantitative
            Yp = np.asarray(self.conveyor.getVal("values"))
            if Ye.size == 0:
                raise ValueError("Experimental activity vector is empty")
            if Yp.size == 0:
                raise ValueError("Predicted activity vector is empty")
            Ym = np.mean(Ye)
            nobj = len(Yp)
            SSY0_out = np.sum(np.square(Ym - Ye))
            SSY_out = np.sum(np.square(Ye - Yp))
            scoringP = mean_squared_error(Ye, Yp)
            SDEP = np.sqrt(SSY_out / nobj)
            if SSY0_out == 0:
                Q2 = 0.0
            else:
                Q2 = 1.00 - (SSY_out / SSY0_out)
            ext_val_results.append(('scoringP', 'Scoring P', scoringP))
            ext_val_results.append(
                ('Q2', 'Determination coefficient in cross-validation', Q2))
            ext_val_results.append(
                ('SDEP', 'Standard Deviation Error of the Predictions', SDEP))

        self.conveyor.addVal(ext_val_results, 'external-validation',
                             'external validation', 'method', 'single',
                             'External validation results')
    else:
        # conformal external validation
        if not self.param.getVal("quantitative"):
            # conformal & qualitative
            Yp = np.concatenate(
                (np.asarray(self.conveyor.getVal('c0')).reshape(-1, 1),
                 np.asarray(self.conveyor.getVal('c1')).reshape(-1, 1)),
                axis=1)
            if Ye.size == 0:
                raise ValueError("Experimental activity vector is empty")
            if Yp.size == 0:
                raise ValueError("Predicted activity vector is empty")

            c0_correct = 0
            c1_correct = 0
            not_predicted = 0
            c0_incorrect = 0
            c1_incorrect = 0
            Ye1 = []
            Yp1 = []
            for i in range(len(Ye)):
                real = float(Ye[i])
                predicted = Yp[i]
                if predicted[0] != predicted[1]:
                    # single-label prediction: score it
                    Ye1.append(real)
                    if predicted[0]:
                        Yp1.append(0)
                    else:
                        Yp1.append(1)
                    if real == 0 and predicted[0] == True:
                        c0_correct += 1
                    if real == 0 and predicted[1] == True:
                        c0_incorrect += 1
                    if real == 1 and predicted[1] == True:
                        c1_correct += 1
                    if real == 1 and predicted[0] == True:
                        c1_incorrect += 1
                else:
                    not_predicted += 1
            MCC = mcc(Ye1, Yp1)
            TN = c0_correct
            FP = c0_incorrect
            TP = c1_correct
            FN = c1_incorrect
            coverage = float((len(Yp) - not_predicted) / len(Yp))
            try:
                # Compute accuracy (% of correct predictions)
                conformal_accuracy = (float(TN + TP) /
                                      float(FP + FN + TN + TP))
            except Exception as e:
                LOG.error(f'Failed to compute conformal accuracy with '
                          f'exception {e}')
                conformal_accuracy = '-'
            if (TP + FN) > 0:
                sensitivity = (TP / (TP + FN))
            else:
                sensitivity = 0.0
            if (TN + FP) > 0:
                specificity = (TN / (TN + FP))
            else:
                specificity = 0.0

            ext_val_results.append(
                ('TP', 'True positives in external-validation', float(TP)))
            ext_val_results.append(
                ('TN', 'True negatives in external-validation', float(TN)))
            ext_val_results.append(
                ('FP', 'False positives in external-validation', float(FP)))
            ext_val_results.append(
                ('FN', 'False negatives in external-validation', float(FN)))
            ext_val_results.append(
                ('Sensitivity', 'Sensitivity in external-validation',
                 float(sensitivity)))
            ext_val_results.append(
                ('Specificity', 'Specificity in external-validation',
                 float(specificity)))
            ext_val_results.append(
                ('MCC', 'Matthews Correlation Coefficient in external-validation',
                 float(MCC)))
            ext_val_results.append(
                ('Conformal_coverage', 'Conformal coverage in external-validation',
                 float(coverage)))
            ext_val_results.append(
                ('Conformal_accuracy', 'Conformal accuracy in external-validation',
                 float(conformal_accuracy)))

            self.conveyor.addVal(ext_val_results, 'external-validation',
                                 'external validation', 'method', 'single',
                                 'External validation results')
        else:
            # conformal & quantitative
            Yp_lower = self.conveyor.getVal('lower_limit')
            Yp_upper = self.conveyor.getVal('upper_limit')
            mean_interval = np.mean(np.abs(Yp_lower) - np.abs(Yp_upper))
            # fraction of experimental values falling inside their predicted
            # interval; the original computed len(inside_interval)/len(Ye),
            # which is always 1.0
            inside_interval = (Yp_lower < Ye) & (Ye < Yp_upper)
            accuracy = np.sum(inside_interval) / len(Ye)
            conformal_accuracy = float("{0:.2f}".format(accuracy))
            conformal_mean_interval = float("{0:.2f}".format(mean_interval))
            ext_val_results.append(
                ('Conformal_mean_interval', 'Conformal mean interval',
                 conformal_mean_interval))
            ext_val_results.append(
                ('Conformal_accuracy', 'Conformal accuracy',
                 conformal_accuracy))

            self.conveyor.addVal(ext_val_results, 'external-validation',
                                 'external validation', 'method', 'single',
                                 'External validation results')
print("Testing model...") model.eval() for i in tqdm(range(nb_data_test)): out = model(th.Tensor(signals[None, i, :, :])).detach().numpy() auc_c0.add(out[None, 0, 0], target[None, i, 0]) auc_c1.add(out[None, 0, 1], target[None, i, 1]) auc_c2.add(out[None, 0, 2], target[None, i, 2]) res[i, 0] = 1 if out[0, 0] > 0.5 else -1 res[i, 1] = 1 if out[0, 1] > 0.5 else -1 res[i, 2] = 1 if out[0, 2] > 0.5 else -1 target = np.where(target == 1, 1, -1) mcc_canal_0 = mcc(target[:, 0], res[:, 0]) mcc_canal_1 = mcc(target[:, 1], res[:, 1]) mcc_canal_2 = mcc(target[:, 2], res[:, 2]) print("\nMCC") print("Canal 0 : %d" % (mcc_canal_0,)) print("Canal 1 : %d" % (mcc_canal_1,)) print("Canal 2 : %d" % (mcc_canal_2,)) print("\nROC AUC") print("Canal 0 %f" % (auc_c0.value()[0])) print("Canal 1 %f" % (auc_c1.value()[0])) print("Canal 2 %f" % (auc_c2.value()[0]))
# (fragment: these keyword arguments continue a text databunch
# construction begun outside this excerpt)
    text_cols='min_toehold_sequence',
    label_cols='Toehold Rating',
    bs=128,
    backwards=True)

# assign this data to the trained learner
learn_cr.data = data_classify_testr

# compute metrics
preds, _, _ = learn_cr.get_preds(ordered=True, with_loss=True)
roc_aucr = roc_auc_score(df_test['Toehold Rating'], preds[:, 1])
acur, mccr = nuspeak.get_metrics(learn_cr, return_metrics=True)

test_df['shufftok_class'] = test_df['shufftok_toehold'].apply(
    lambda x: pred_class(x, learn_cf))
y_test_true = test_df['Toehold Rating']
y_test_shufftok = test_df['shufftok_class']
mcc_c1 = mcc(y_test_true, y_test_shufftok)
c1_scores.append(accuracy_score(y_test_true, y_test_shufftok, normalize=True))
mccs_c1.append(mcc_c1)

test_df['shuffchar_class'] = test_df['scrambled_toehold'].apply(
    lambda x: pred_class(x, learn_cf))
y_test_shuffchar = test_df['shuffchar_class']
mcc_c2 = mcc(y_test_true, y_test_shuffchar)
c2_scores.append(accuracy_score(y_test_true, y_test_shuffchar, normalize=True))
mccs_c2.append(mcc_c2)

scores.append((acuf + acur) / 2)
aucs.append((roc_aucf + roc_aucr) / 2)
mccs.append((mccf + mccr) / 2)
# View the results.
# Usage: python results.py resultfile
import pandas as pd
import sys
from sklearn.metrics import (accuracy_score, roc_auc_score as roc, auc,
                             classification_report, balanced_accuracy_score,
                             matthews_corrcoef as mcc)

df = pd.read_csv(sys.argv[1], header=None, sep=',')

y_true = df[2]
y_pred = df[3]
y_predprob = df[4]

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))
print("MCC:", mcc(y_true, y_pred))
print("ROC_AUC:", roc(y_true, y_predprob))