# Imports these functions rely on (reconstructed from usage and hence an
# assumption; CNN, FNN, and mev are Deeprimer's local modules, while TF_TEMP,
# OBJpath, REPpath, and plot_confusion_matrix are defined elsewhere in the repo).
import os
import re
import sys
import getopt
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import interp
from scipy.interpolate import griddata
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline


def compare_all(X_tr, y_tr, X_te, y_te, n_f, dp, it, pty, n_est, tag):
    """Fit five classifiers on the same split and report per-classifier
    ROC/PR curves plus the name of the best one by auPRC."""
    fpr = [None] * 5
    tpr = [None] * 5
    precision = [None] * 5
    recall = [None] * 5
    roc_auc = [None] * 5
    auPRC = [None] * 5
    metrics = [None] * 5

    # Run CNN
    tf.reset_default_graph()
    CNN.fit_CNN(X_tr, y_tr, int(n_f), float(dp), int(it), tag, TF_TEMP)
    meta = tag + "_CNN.machine.meta"
    fpr[0], tpr[0], precision[0], recall[0], metrics[0] = \
        mev.performance_stat_nn(X_te, y_te, 'TEMP', meta)
    roc_auc[0] = metrics[0]['ROC_AUC']
    auPRC[0] = metrics[0]['PRC_AUC']

    # Run FNN
    tf.reset_default_graph()
    FNN.fit_FNN(X_tr, y_tr, int(n_f), float(dp), int(it), tag, TF_TEMP)
    meta = tag + "_FNN.machine.meta"
    fpr[1], tpr[1], precision[1], recall[1], metrics[1] = \
        mev.performance_stat_nn(X_te, y_te, 'TEMP', meta)
    roc_auc[1] = metrics[1]['ROC_AUC']
    auPRC[1] = metrics[1]['PRC_AUC']

    # Run RF
    clf = RandomForestClassifier(n_estimators=int(n_est))
    fitted = clf.fit(X_tr, y_tr)
    fpr[2], tpr[2], precision[2], recall[2], metrics[2] = \
        mev.performance_stat(X_te, y_te, fitted)
    roc_auc[2] = metrics[2]['ROC_AUC']
    auPRC[2] = metrics[2]['PRC_AUC']

    # Run SVM
    clf = svm.SVC(kernel='rbf', probability=True, C=int(pty))
    fitted = clf.fit(X_tr, y_tr)
    fpr[3], tpr[3], precision[3], recall[3], metrics[3] = \
        mev.performance_stat(X_te, y_te, fitted)
    roc_auc[3] = metrics[3]['ROC_AUC']
    auPRC[3] = metrics[3]['PRC_AUC']

    # Run penalized logistic regression (pLR)
    clf = LogisticRegression(C=int(pty), penalty='l2', tol=0.01)
    fitted = clf.fit(X_tr, y_tr)
    fpr[4], tpr[4], precision[4], recall[4], metrics[4] = \
        mev.performance_stat(X_te, y_te, fitted)
    roc_auc[4] = metrics[4]['ROC_AUC']
    auPRC[4] = metrics[4]['PRC_AUC']

    # Pick the classifier with the best area under the precision-recall curve.
    cla_nam = ['CNN', 'FNN', 'RF', 'SVM', 'pLR']
    ind = np.argmax(auPRC)
    return fpr, tpr, precision, recall, roc_auc, auPRC, cla_nam[ind]
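# A minimal usage sketch for compare_all. The file name and the wrapper
# function are hypothetical; it assumes the eval pickles hold
# [X_tr, y_tr, X_te, y_te] (as the eval tasks below unpack them) and uses the
# hyperparameter defaults suggested by the prompts in get_cv_scores below.
def demo_compare_all(eval_pickle="sample_split.eval.pickled"):
    with open(eval_pickle, 'rb') as f:
        X_tr, y_tr, X_te, y_te = pickle.load(f)
    out = compare_all(X_tr, y_tr, X_te, y_te,
                      n_f=X_tr.shape[1],  # number of input features
                      dp=0.5,             # node dropping probability
                      it=10000,           # training iterations
                      pty=1000,           # C penalty for SVM / pLR
                      n_est=1000,         # number of RF trees
                      tag="sample_split.eval")
    fpr, tpr, precision, recall, roc_auc, auPRC, best = out
    print("Best classifier by auPRC: %s" % best)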
def main(argv):
    dpath = os.environ["DeeprimerPATH"]
    try:
        opts, args = getopt.getopt(argv, "hC:w:o:p:")
    except getopt.GetoptError:
        print('Run_SVM.py -C <penalty> -w <fit/pred/eval> '
              '-o <preprocessed_object> -p <pred_input>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print("[Command]: Run_SVM.py -C <penalty> -w <fit/pred/eval> "
                  "-o <preprocessed_object> -p <pred_input>" + "\n\n" +
                  "Example:" + "\n\n" +
                  "1. Fit the model only:" + "\n\n" +
                  "Run_SVM.py -C 1000 -w fit -o <sample_pre.fit.pickled>" + "\n\n" +
                  "2. Evaluate the model:" + "\n\n" +
                  "Run_SVM.py -C 1000 -w eval -o <sample_split.eval.pickled>" + "\n\n" +
                  "3. Make a prediction:" + "\n\n" +
                  "Run_SVM.py -C 1000 -w pred -o <sample_pre.fit_SVM.machine> "
                  "-p <pred_input>" + "\n")
            sys.exit()
        elif opt == "-C":
            pty = int(arg)
        elif opt == "-w":
            task = arg
        elif opt == "-o":
            obj = arg
        elif opt == "-p":
            pred_input = pd.read_csv(arg, delim_whitespace=True)
            pred = pred_input.values

    LW = 2
    RANDOM_STATE = 42
    tag = re.sub(r'\.pickled$', '', obj).split("/")[-1]

    if task == "fit":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr = pre_obj[0]
        y_tr = pre_obj[1]
        # Oversample when one class outnumbers the other at least 2:1
        # (0.5, not 1/2, which evaluates to 0 under Python 2 integer division).
        if sum(y_tr == 1) >= 2 * sum(y_tr == 0) or \
           sum(y_tr == 1) <= 0.5 * sum(y_tr == 0):
            classifier = svm.SVC(kernel='rbf', probability=True, C=pty)
            sampler = RandomOverSampler(random_state=RANDOM_STATE)
            clf = make_pipeline(sampler, classifier)
        else:
            clf = svm.SVC(kernel='rbf', probability=True, C=pty)
        fitted = clf.fit(X_tr, y_tr)
        with open(OBJpath + "/" + tag + "_SVM.machine", 'wb') as f:
            pickle.dump(fitted, f, pickle.HIGHEST_PROTOCOL)
    elif task == "eval":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr = pre_obj[0]
        y_tr = pre_obj[1]
        X_te = pre_obj[2]
        y_te = pre_obj[3]
        if sum(y_tr == 1) >= 2 * sum(y_tr == 0) or \
           sum(y_tr == 1) <= 0.5 * sum(y_tr == 0):
            classifier = svm.SVC(kernel='rbf', probability=True, C=pty)
            sampler = RandomOverSampler(random_state=RANDOM_STATE)
            clf = make_pipeline(sampler, classifier)
        else:
            clf = svm.SVC(kernel='rbf', probability=True, C=pty)
        fitted = clf.fit(X_tr, y_tr)
        fpr, tpr, precision, recall, metrics = mev.performance_stat(
            X_te, y_te, fitted)
        mev.plot_eval(fpr, tpr, precision, recall, metrics, tag)
        with open(REPpath + "/" + tag + "_SVM.metrics", 'w') as f:
            for key, value in metrics.items():
                f.write("%s %.3f" % (key, value) + '\n')
    elif task == "pred":
        tag1 = re.sub(r'\.machine$', '', obj).split("/")[-1]
        with open(obj, 'rb') as f:
            fitted_obj = pickle.load(f)
        pred_p = fitted_obj.predict_proba(pred)
        pred_c = fitted_obj.predict(pred)
        df = pd.DataFrame({
            "Probability_1": pred_p[:, 1],
            "Probability_0": pred_p[:, 0],
            "Predicted_class": pred_c
        })
        df.to_csv(REPpath + "/" + tag1 + ".classification",
                  index=None, sep='\t')
    else:
        print("Unsupported task. Choose from fit, pred, eval" + "\n")
        sys.exit()
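# How Run_SVM.py is presumably invoked: a standard getopt entry point.
# This guard is an assumption, not shown in the original source.
if __name__ == "__main__":
    main(sys.argv[1:])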
def main(argv):
    dpath = os.environ["DeeprimerPATH"]
    try:
        opts, args = getopt.getopt(argv, "ht:w:o:p:c:")
    except getopt.GetoptError:
        print('Run_randomforest.py -t <n_estimator> -c <n_class> '
              '-w <fit/pred/eval> -o <preprocessed_object> -p <pred_input>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print("[Command]: Run_randomforest.py -t <n_estimator> "
                  "-w <fit/pred/eval> -o <preprocessed_object> -p <pred_input>" + "\n\n" +
                  "Example:" + "\n\n" +
                  "1. Fit the model only:" + "\n\n" +
                  "Run_randomforest.py -t 1000 -w fit -o <sample_pre.fit.pickled>" + "\n\n" +
                  "2. Evaluate the model:" + "\n\n" +
                  "Run_randomforest.py -t 1000 -w eval -o <sample_split.eval.pickled>" + "\n\n" +
                  "3. Make a prediction:" + "\n\n" +
                  "Run_randomforest.py -t 1000 -w pred -o <sample_pre.fit_RF.machine> "
                  "-p <pred_input>" + "\n")
            sys.exit()
        elif opt == "-t":
            n_est = int(arg)
        elif opt == "-w":
            task = arg
        elif opt == "-o":
            obj = arg
        elif opt == "-c":
            nclass = int(arg)
        elif opt == "-p":
            pred_input = pd.read_csv(arg, delim_whitespace=True)
            pred = pred_input.values

    # Default to binary classification when -c is not given.
    if 'nclass' not in locals():
        nclass = 2

    LW = 2
    RANDOM_STATE = 42
    tag = re.sub(r'\.pickled$', '', obj).split("/")[-1]

    if task == "fit":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr = pre_obj[0]
        y_tr = pre_obj[1]
        # Oversample when one class outnumbers the other at least 2:1
        # (0.5, not 1/2, which evaluates to 0 under Python 2 integer division).
        if sum(y_tr == 1) >= 2 * sum(y_tr == 0) or \
           sum(y_tr == 1) <= 0.5 * sum(y_tr == 0):
            classifier = RandomForestClassifier(n_estimators=n_est)
            sampler = RandomOverSampler(random_state=RANDOM_STATE)
            clf = make_pipeline(sampler, classifier)
        else:
            clf = RandomForestClassifier(n_estimators=n_est)
        fitted = clf.fit(X_tr, y_tr)
        if nclass == 2:
            with open(OBJpath + "/" + tag + "_RF.machine", 'wb') as f:
                pickle.dump(fitted, f, pickle.HIGHEST_PROTOCOL)
        elif nclass == 3:
            with open(OBJpath + "/" + tag + "_3C_RF.machine", 'wb') as f:
                pickle.dump(fitted, f, pickle.HIGHEST_PROTOCOL)
        else:
            print("More than 3 classes are not supported. Try regression." + "\n")
            sys.exit()
    elif task == "eval":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr = pre_obj[0]
        y_tr = pre_obj[1]
        X_te = pre_obj[2]
        y_te = pre_obj[3]
        if sum(y_tr == 1) >= 2 * sum(y_tr == 0) or \
           sum(y_tr == 1) <= 0.5 * sum(y_tr == 0):
            classifier = RandomForestClassifier(n_estimators=n_est)
            sampler = RandomOverSampler(random_state=RANDOM_STATE)
            clf = make_pipeline(sampler, classifier)
        else:
            clf = RandomForestClassifier(n_estimators=n_est)
        fitted = clf.fit(X_tr, y_tr)
        if nclass == 2:
            fpr, tpr, precision, recall, metrics = mev.performance_stat(
                X_te, y_te, fitted)
            mev.plot_eval(fpr, tpr, precision, recall, metrics, tag)
            with open(REPpath + "/" + tag + "_RF.metrics", 'w') as f:
                for key, value in metrics.items():
                    f.write("%s %.3f" % (key, value) + '\n')
        elif nclass == 3:
            pred_p = fitted.predict_proba(X_te)
            pred_c = fitted.predict(X_te)
            cnf_matrix = confusion_matrix(y_te, pred_c)
            np.set_printoptions(precision=2)
            class_names = ["0", "1", "2"]
            plot_confusion_matrix(cnf_matrix, class_names, True,
                                  'Normalized confusion matrix', tag)
            with open(REPpath + "/" + tag + "_3C_RF_eval.summary", 'w') as f:
                f.write(classification_report(y_te, pred_c,
                                              target_names=class_names))
    elif task == "pred":
        tag1 = re.sub(r'\.machine$', '', obj).split("/")[-1]
        with open(obj, 'rb') as f:
            fitted_obj = pickle.load(f)
        pred_p = fitted_obj.predict_proba(pred)
        pred_c = fitted_obj.predict(pred)
        if nclass == 2:
            df = pd.DataFrame({
                "Probability_1": pred_p[:, 1],
                "Probability_0": pred_p[:, 0],
                "Predicted_class": pred_c
            })
            df.to_csv(REPpath + "/" + tag1 + ".2classification",
                      index=None, sep='\t')
        elif nclass == 3:
            df = pd.DataFrame({
                "Probability_2": pred_p[:, 2],
                "Probability_1": pred_p[:, 1],
                "Probability_0": pred_p[:, 0],
                "Predicted_class": pred_c
            })
            df.to_csv(REPpath + "/" + tag1 + ".3classification",
                      index=None, sep='\t')
        else:
            print("More than 3 classes are not supported. Try regression.")
            sys.exit()
    else:
        print("Unsupported task. Choose from fit, pred, eval" + "\n")
        sys.exit()
def get_cv_scores(X, y, cla, k, tag):
    """Run stratified k-fold CV for one classifier and return per-fold
    ROC/PR scores plus curves averaged on fixed fpr/recall grids."""
    random_state = np.random.RandomState(42)
    cv = StratifiedKFold(n_splits=k)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    mean_precision = 0.0
    mean_recall = np.linspace(0, 1, 100)
    fpr = [None] * k
    tpr = [None] * k
    precision = [None] * k
    recall = [None] * k
    roc_auc = [None] * k
    auPRC = [None] * k
    metrics = [None] * k
    lw = 2

    # Prompt for the hyperparameters of the chosen classifier.
    if cla == "CNN" or cla == "FNN":
        n_f = raw_input("Tell the machine your number of features: ")
        dp = raw_input("Node dropping probability (put 0.5 if you don't know): ")
        it = raw_input("Number of iterations (put 10000 if you don't know): ")
    elif cla == "RF":
        n_est = raw_input("Number of estimators (put 1000 if you don't know): ")
    elif cla == "SVM" or cla == "pLR":
        pty = raw_input("Penalty (put 1000 if you don't know): ")
    else:
        print("Pick one classifier from CNN, FNN, RF, SVM, pLR")
        sys.exit()

    i = 0
    for train, test in cv.split(X, y):
        if cla == "CNN":
            tf.reset_default_graph()
            CNN.fit_CNN(X[train], y[train], int(n_f), float(dp), int(it),
                        tag, TF_TEMP)
            meta = tag + "_CNN.machine.meta"
            fpr[i], tpr[i], precision[i], recall[i], metrics[i] = \
                mev.performance_stat_nn(X[test], y[test], 'CNN', meta)
        elif cla == "FNN":
            tf.reset_default_graph()
            FNN.fit_FNN(X[train], y[train], int(n_f), float(dp), int(it),
                        tag, TF_TEMP)
            meta = tag + "_FNN.machine.meta"
            fpr[i], tpr[i], precision[i], recall[i], metrics[i] = \
                mev.performance_stat_nn(X[test], y[test], 'FNN', meta)
        else:
            # Unknown values of cla were already rejected above.
            if cla == "RF":
                clf = RandomForestClassifier(n_estimators=int(n_est))
            elif cla == "SVM":
                clf = svm.SVC(kernel='rbf', probability=True, C=int(pty))
            elif cla == "pLR":
                clf = LogisticRegression(C=int(pty), penalty='l2', tol=0.01)
            fitted = clf.fit(X[train], y[train])
            fpr[i], tpr[i], precision[i], recall[i], metrics[i] = \
                mev.performance_stat(X[test], y[test], fitted)

        # Accumulate ROC and PR curves interpolated onto the fixed grids.
        mean_tpr += interp(mean_fpr, fpr[i], tpr[i])
        mean_tpr[0] = 0.0
        roc_auc[i] = metrics[i]['ROC_AUC']
        mean_precision += griddata(recall[i], precision[i], mean_recall)
        auPRC[i] = metrics[i]['PRC_AUC']
        i += 1

    # Average the accumulated curves over the k folds.
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    mean_precision /= cv.get_n_splits(X, y)
    mean_auPRC = auc(mean_recall, mean_precision)
    scores = [fpr, tpr, precision, recall, roc_auc, auPRC]
    means = [mean_fpr, mean_tpr, mean_precision, mean_recall,
             mean_auc, mean_auPRC]
    return scores, means
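# A minimal usage sketch for get_cv_scores. The file name and wrapper are
# hypothetical; it assumes the fit pickles hold [X, y] as the fit tasks above
# unpack them. Hyperparameters are read interactively via raw_input inside
# the function itself.
def demo_cv(fit_pickle="sample_pre.fit.pickled", k=5):
    with open(fit_pickle, 'rb') as f:
        pre_obj = pickle.load(f)
    X, y = pre_obj[0], pre_obj[1]
    scores, means = get_cv_scores(X, y, cla="RF", k=k, tag="sample_pre.fit")
    mean_fpr, mean_tpr, mean_precision, mean_recall, mean_auc, mean_auPRC = means
    print("Mean ROC AUC over %d folds: %.3f" % (k, mean_auc))
    print("Mean auPRC over %d folds: %.3f" % (k, mean_auPRC))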