def compare_all(X_tr, y_tr, X_te, y_te, n_f, dp, it, pty, n_est, tag):
    """Train five classifiers on one split and compare them on the test set.

    The models are fitted and evaluated in this fixed order: CNN, FNN,
    random forest, RBF-kernel SVM and L2-penalised logistic regression.

    Parameters:
        X_tr, y_tr: training features and labels.
        X_te, y_te: test features and labels.
        n_f, dp, it: forwarded to ``CNN.fit_CNN`` / ``FNN.fit_FNN`` as
            ``int``, ``float`` and ``int`` respectively (presumably the
            number of features, dropout probability and iteration count;
            confirm against those modules).
        pty: value used (as ``int``) for ``C`` of the SVM and the logistic
            regression.
        n_est: number of trees (as ``int``) of the random forest.
        tag: prefix of the ``*.machine.meta`` file names handed to
            ``mev.performance_stat_nn``.

    Returns:
        ``(fpr, tpr, precision, recall, roc_auc, auPRC, best)`` where the
        first six items are lists with one entry per model (in the order
        given above) and ``best`` is the name of the model whose
        ``PRC_AUC`` is largest.
    """
    model_names = ['CNN', 'FNN', 'RF', 'SVM', 'pLR']
    fpr, tpr, precision, recall, roc_auc, auPRC = [], [], [], [], [], []

    def _record(stats):
        # Each evaluation yields (fpr, tpr, precision, recall, metrics).
        curve_fpr, curve_tpr, curve_precision, curve_recall, summary = stats
        fpr.append(curve_fpr)
        tpr.append(curve_tpr)
        precision.append(curve_precision)
        recall.append(curve_recall)
        roc_auc.append(summary['ROC_AUC'])
        auPRC.append(summary['PRC_AUC'])

    # Neural networks: each one is trained in a freshly reset default graph
    # and evaluated from the meta file named after ``tag``.
    tf.reset_default_graph()
    CNN.fit_CNN(X_tr, y_tr, int(n_f), float(dp), int(it), tag, TF_TEMP)
    _record(mev.performance_stat_nn(X_te, y_te, 'TEMP', tag + "_CNN.machine.meta"))

    tf.reset_default_graph()
    FNN.fit_FNN(X_tr, y_tr, int(n_f), float(dp), int(it), tag, TF_TEMP)
    _record(mev.performance_stat_nn(X_te, y_te, 'TEMP', tag + "_FNN.machine.meta"))

    # scikit-learn estimators are built lazily so that every model is
    # constructed only after the previous one has been evaluated.
    estimator_factories = (
        lambda: RandomForestClassifier(n_estimators=int(n_est)),
        lambda: svm.SVC(kernel='rbf', probability=True, C=int(pty)),
        lambda: LogisticRegression(C=int(pty), penalty='l2', tol=0.01),
    )
    for make_estimator in estimator_factories:
        model = make_estimator().fit(X_tr, y_tr)
        _record(mev.performance_stat(X_te, y_te, model))

    best = model_names[np.argmax(auPRC)]
    return fpr, tpr, precision, recall, roc_auc, auPRC, best
# Example #2
# 0
def _build_svm_classifier(y_tr, pty, random_state):
    """Return an RBF-kernel SVC, over-sampled when classes 0/1 are imbalanced.

    The labels count as imbalanced when class 1 is at least twice as frequent
    as class 0, or at most half as frequent. In that case the SVC is placed
    in a pipeline behind a ``RandomOverSampler``.
    """
    n_pos = sum(y_tr == 1)
    n_neg = sum(y_tr == 0)
    classifier = svm.SVC(kernel='rbf', probability=True, C=pty)
    # 0.5 instead of (1 / 2): with Python 2 integer division (1 / 2) == 0,
    # which silently disabled the "too few positives" half of this check.
    if n_pos >= 2 * n_neg or n_pos <= 0.5 * n_neg:
        sampler = RandomOverSampler(random_state=random_state)
        return make_pipeline(sampler, classifier)
    return classifier


def main(argv):
    """Command-line entry point of Run_SVM.py.

    Options (parsed from ``argv`` with getopt):
        -h  print the help text and exit.
        -C  penalty, converted to ``int`` and used as ``C`` of the SVC
            (required for ``fit`` and ``eval``).
        -w  task to run: ``fit``, ``eval`` or ``pred``.
        -o  path of the pickled input: the pre-processed data for
            ``fit``/``eval``, or a fitted ``*.machine`` for ``pred``.
        -p  whitespace-delimited table with the rows to classify
            (required for ``pred``).

    ``fit`` pickles the fitted model to ``OBJpath``; ``eval`` fits, plots
    the evaluation and writes the metrics to ``REPpath``; ``pred`` writes a
    tab-separated classification table to ``REPpath``.

    Exits with status 2 (after printing the usage line) when the options
    cannot be parsed or a required option is missing.

    Raises:
        KeyError: if the ``DeeprimerPATH`` environment variable is not set.
    """
    usage = ('Run_SVM.py -C <penalty> -w <fit/pred/eval> '
             '-o <preprocessed_object> -p <pred_input>')
    # Not used below; kept so that a missing environment variable still
    # fails immediately, exactly as before.
    dpath = os.environ["DeeprimerPATH"]
    try:
        opts, args = getopt.getopt(argv, "hC:w:o:p:")
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # Explicit defaults so that an omitted option is reported as a usage
    # error instead of surfacing later as a NameError.
    pty = task = obj = pred = None
    for opt, arg in opts:
        if opt == '-h':
            print("\n\n".join([
                "[Command]: " + usage,
                "Example:",
                "1. Fit the model only:",
                "Run_SVM.py -C 1000 -w fit -o <sample_pre.fit.pickled>",
                "2. Evaluate the model:",
                "Run_SVM.py -C 1000 -w eval -o <sample_split.eval.pickled>",
                "3. Make prediction:",
                "Run_SVM.py -C 1000 -w pred -o <sample_pre.fit_SVM.machine> -p <pred_input>",
            ]) + "\n")
            sys.exit()
        elif opt == "-C":
            pty = int(arg)
        elif opt == "-w":
            task = arg
        elif opt == "-o":
            obj = arg
        elif opt == "-p":
            # sep=r'\s+' is the documented equivalent of the deprecated
            # delim_whitespace=True.
            pred = pd.read_csv(arg, sep=r'\s+').values

    missing_option = (
        task is None
        or obj is None
        or (task in ("fit", "eval") and pty is None)
        or (task == "pred" and pred is None)
    )
    if missing_option:
        print(usage)
        sys.exit(2)

    RANDOM_STATE = 42
    tag = re.sub(r'\.pickled$', '', obj).split("/")[-1]

    # NOTE: pickle.load can execute arbitrary code; only pass trusted files
    # to -o.
    if task == "fit":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr, y_tr = pre_obj[0], pre_obj[1]

        fitted = _build_svm_classifier(y_tr, pty, RANDOM_STATE).fit(X_tr, y_tr)
        with open(os.path.join(OBJpath, tag + "_SVM.machine"), 'wb') as f:
            pickle.dump(fitted, f, pickle.HIGHEST_PROTOCOL)

    elif task == "eval":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr, y_tr, X_te, y_te = pre_obj[0], pre_obj[1], pre_obj[2], pre_obj[3]

        fitted = _build_svm_classifier(y_tr, pty, RANDOM_STATE).fit(X_tr, y_tr)
        fpr, tpr, precision, recall, metrics = mev.performance_stat(
            X_te, y_te, fitted)
        mev.plot_eval(fpr, tpr, precision, recall, metrics, tag)
        # Text mode: the lines written are str, which a 'wb' handle rejects
        # on Python 3.
        with open(os.path.join(REPpath, tag + "_SVM.metrics"), 'w') as f:
            for key, value in metrics.items():
                f.write("%s %.3f\n" % (key, value))

    elif task == "pred":
        tag1 = re.sub(r'\.machine$', '', obj).split("/")[-1]
        with open(obj, 'rb') as f:
            fitted_obj = pickle.load(f)
        pred_p = fitted_obj.predict_proba(pred)
        pred_c = fitted_obj.predict(pred)
        # Column names (including the "Predited_class" spelling) are kept
        # as-is because they are part of the output file format.
        df = pd.DataFrame({
            "Probability_1": pred_p[:, 1],
            "Probability_0": pred_p[:, 0],
            "Predited_class": pred_c
        })
        df.to_csv(os.path.join(REPpath, tag1 + ".classification"),
                  index=False,
                  sep='\t')

    else:
        print("Not supported task. Choose from fit, pred, eval" + "\n")
        sys.exit()
# Example #3
# 0
def _build_rf_classifier(y_tr, n_est, random_state):
    """Return a random forest, over-sampled when classes 0/1 are imbalanced.

    The labels count as imbalanced when class 1 is at least twice as frequent
    as class 0, or at most half as frequent. In that case the forest is
    placed in a pipeline behind a ``RandomOverSampler``.
    """
    n_pos = sum(y_tr == 1)
    n_neg = sum(y_tr == 0)
    classifier = RandomForestClassifier(n_estimators=n_est)
    # 0.5 instead of (1 / 2): with Python 2 integer division (1 / 2) == 0,
    # which silently disabled the "too few positives" half of this check.
    if n_pos >= 2 * n_neg or n_pos <= 0.5 * n_neg:
        sampler = RandomOverSampler(random_state=random_state)
        return make_pipeline(sampler, classifier)
    return classifier


def main(argv):
    """Command-line entry point of Run_randomforest.py.

    Options (parsed from ``argv`` with getopt):
        -h  print the help text and exit.
        -t  number of trees, converted to ``int`` (required for ``fit``
            and ``eval``).
        -c  number of classes, 2 (default) or 3.
        -w  task to run: ``fit``, ``eval`` or ``pred``.
        -o  path of the pickled input: the pre-processed data for
            ``fit``/``eval``, or a fitted ``*.machine`` for ``pred``.
        -p  whitespace-delimited table with the rows to classify
            (required for ``pred``).

    ``fit`` pickles the fitted model to ``OBJpath``; ``eval`` fits and
    writes an evaluation report to ``REPpath`` (ROC/PRC metrics for two
    classes, confusion matrix plus classification report for three);
    ``pred`` writes a tab-separated classification table to ``REPpath``.

    Exits with status 2 (after printing the usage line) when the options
    cannot be parsed or a required option is missing.

    Raises:
        KeyError: if the ``DeeprimerPATH`` environment variable is not set.
    """
    usage = ('Run_randomforest.py -t <n_estimator> -c <n_class> '
             '-w <fit/pred/eval> -o <preprocessed_object> -p <pred_input>')
    # Not used below; kept so that a missing environment variable still
    # fails immediately, exactly as before.
    dpath = os.environ["DeeprimerPATH"]
    try:
        opts, args = getopt.getopt(argv, "ht:w:o:p:c:")
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # Explicit defaults replace the former "'nclass' not in locals()" probe
    # and turn omitted options into a usage error instead of a NameError.
    n_est = task = obj = pred = None
    nclass = 2
    for opt, arg in opts:
        if opt == '-h':
            # The command line shown here now lists -c as well, which the
            # old help text left out.
            print("\n\n".join([
                "[Command]: " + usage,
                "Example:",
                "1. Fit the model only:",
                "Run_randomforest.py -t 1000 -w fit -o <sample_pre.fit.pickled>",
                "2. Evaluate the model:",
                "Run_randomforest.py -t 1000 -w eval -o <sample_split.eval.pickled>",
                "3. Make prediction:",
                "Run_randomforest.py -t 1000 -w pred -o <sample_pre.fit_RF.machine> -p <pred_input>",
            ]) + "\n")
            sys.exit()
        elif opt == "-t":
            n_est = int(arg)
        elif opt == "-w":
            task = arg
        elif opt == "-o":
            obj = arg
        elif opt == "-c":
            nclass = int(arg)
        elif opt == "-p":
            # sep=r'\s+' is the documented equivalent of the deprecated
            # delim_whitespace=True.
            pred = pd.read_csv(arg, sep=r'\s+').values

    missing_option = (
        task is None
        or obj is None
        or (task in ("fit", "eval") and n_est is None)
        or (task == "pred" and pred is None)
    )
    if missing_option:
        print(usage)
        sys.exit(2)

    if task not in ("fit", "eval", "pred"):
        print("Not supported task. Choose from fit, pred, eval" + "\n")
        sys.exit()

    # Checked once, before any model is trained: previously "fit" only
    # complained after the (expensive) training and "eval" silently did
    # nothing for an unsupported class count.
    if nclass not in (2, 3):
        print("Not support class over 3. Try Regression.")
        sys.exit()

    RANDOM_STATE = 42
    tag = re.sub(r'\.pickled$', '', obj).split("/")[-1]

    # NOTE: pickle.load can execute arbitrary code; only pass trusted files
    # to -o.
    if task == "fit":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr, y_tr = pre_obj[0], pre_obj[1]

        fitted = _build_rf_classifier(y_tr, n_est, RANDOM_STATE).fit(X_tr, y_tr)
        suffix = "_RF.machine" if nclass == 2 else "_3C_RF.machine"
        with open(os.path.join(OBJpath, tag + suffix), 'wb') as f:
            pickle.dump(fitted, f, pickle.HIGHEST_PROTOCOL)

    elif task == "eval":
        with open(obj, 'rb') as f:
            pre_obj = pickle.load(f)
        X_tr, y_tr, X_te, y_te = pre_obj[0], pre_obj[1], pre_obj[2], pre_obj[3]

        fitted = _build_rf_classifier(y_tr, n_est, RANDOM_STATE).fit(X_tr, y_tr)
        if nclass == 2:
            fpr, tpr, precision, recall, metrics = mev.performance_stat(
                X_te, y_te, fitted)
            mev.plot_eval(fpr, tpr, precision, recall, metrics, tag)
            # Text mode: the lines written are str, which a 'wb' handle
            # rejects on Python 3.
            with open(os.path.join(REPpath, tag + "_RF.metrics"), 'w') as f:
                for key, value in metrics.items():
                    f.write("%s %.3f\n" % (key, value))
        else:
            pred_c = fitted.predict(X_te)
            cnf_matrix = confusion_matrix(y_te, pred_c)
            np.set_printoptions(precision=2)
            class_names = ["0", "1", "2"]

            plot_confusion_matrix(cnf_matrix, class_names, True,
                                  'Normalized confusion matrix', tag)
            # The ".summay" spelling is kept because it is part of the
            # output file name.
            report_path = os.path.join(REPpath, tag + "_3C_RF_eval.summay")
            with open(report_path, 'w') as f:
                f.write(
                    classification_report(y_te,
                                          pred_c,
                                          target_names=class_names))

    else:  # task == "pred"
        tag1 = re.sub(r'\.machine$', '', obj).split("/")[-1]
        with open(obj, 'rb') as f:
            fitted_obj = pickle.load(f)
        pred_p = fitted_obj.predict_proba(pred)
        pred_c = fitted_obj.predict(pred)

        # Column names (including the "Predited_class" spelling) are kept
        # as-is because they are part of the output file format.
        if nclass == 2:
            df = pd.DataFrame({
                "Probability_1": pred_p[:, 1],
                "Probability_0": pred_p[:, 0],
                "Predited_class": pred_c
            })
            df.to_csv(os.path.join(REPpath, tag1 + ".2classification"),
                      index=False,
                      sep='\t')
        else:
            df = pd.DataFrame({
                "Probability_2": pred_p[:, 2],
                "Probability_1": pred_p[:, 1],
                "Probability_0": pred_p[:, 0],
                "Predited_class": pred_c
            })
            df.to_csv(os.path.join(REPpath, tag1 + ".3classification"),
                      index=False,
                      sep='\t')
def get_cv_scores(X, y, cla, k, tag):
    """Run stratified k-fold cross-validation for one classifier type.

    The hyper-parameters of the chosen classifier are asked for
    interactively on standard input before the folds are run.

    Parameters:
        X, y: feature matrix and labels; both are indexed with the index
            arrays produced by ``StratifiedKFold.split``.
        cla: classifier to evaluate, one of ``"CNN"``, ``"FNN"``, ``"RF"``,
            ``"SVM"`` or ``"pLR"``. Any other value prints a hint and
            terminates the program via ``sys.exit()``.
        k: number of folds.
        tag: prefix of the ``*.machine.meta`` file names used for the
            neural-network classifiers.

    Returns:
        ``(scores, means)`` where ``scores`` is
        ``[fpr, tpr, precision, recall, roc_auc, auPRC]`` (each a list with
        one entry per fold) and ``means`` is
        ``[mean_fpr, mean_tpr, mean_precision, mean_recall, mean_auc,
        mean_auPRC]``, the curves averaged over the folds on a 100-point
        grid together with the areas under them.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    # Validate the classifier name and read its hyper-parameters before any
    # other work. The answers are converted once here instead of on every
    # fold, so a malformed answer fails before training starts.
    if cla == "CNN" or cla == "FNN":
        n_f = int(ask("Tell the machine your number of features: "))
        dp = float(ask("Node dropping probability (put 0.5 if you don't know): "))
        it = int(ask("Number of iteractions (put 10000 if you don't know): "))
    elif cla == "RF":
        n_est = int(ask("number of estimators (put 1000 if you don't know): "))
    elif cla == "SVM" or cla == "pLR":
        pty = int(ask("Penalty (put 1000 if you don't know): "))
    else:
        print("Pick one classifier from CNN, FNN, RF, SVM, pLR")
        sys.exit()

    cv = StratifiedKFold(n_splits=k)

    # Common grids on which the per-fold curves are interpolated so they
    # can be averaged point by point.
    mean_fpr = np.linspace(0, 1, 100)
    mean_recall = np.linspace(0, 1, 100)
    mean_tpr = np.zeros(100)
    mean_precision = np.zeros(100)

    fpr = [None] * k
    tpr = [None] * k
    precision = [None] * k
    recall = [None] * k
    roc_auc = [None] * k
    auPRC = [None] * k
    metrics = [None] * k

    for i, (train, test) in enumerate(cv.split(X, y)):
        if cla == "CNN":
            tf.reset_default_graph()
            CNN.fit_CNN(X[train], y[train], n_f, dp, it, tag, TF_TEMP)
            meta = tag + "_CNN.machine.meta"
            fold_stats = mev.performance_stat_nn(X[test], y[test], 'CNN', meta)
        elif cla == "FNN":
            tf.reset_default_graph()
            FNN.fit_FNN(X[train], y[train], n_f, dp, it, tag, TF_TEMP)
            meta = tag + "_FNN.machine.meta"
            fold_stats = mev.performance_stat_nn(X[test], y[test], 'FNN', meta)
        else:
            if cla == "RF":
                clf = RandomForestClassifier(n_estimators=n_est)
            elif cla == "SVM":
                clf = svm.SVC(kernel='rbf', probability=True, C=pty)
            else:
                # Only "pLR" can reach this point; cla was validated above.
                clf = LogisticRegression(C=pty, penalty='l2', tol=0.01)
            fitted = clf.fit(X[train], y[train])
            fold_stats = mev.performance_stat(X[test], y[test], fitted)

        fpr[i], tpr[i], precision[i], recall[i], metrics[i] = fold_stats

        mean_tpr += interp(mean_fpr, fpr[i], tpr[i])
        # Pin the averaged ROC curve to the origin.
        mean_tpr[0] = 0.0
        roc_auc[i] = metrics[i]['ROC_AUC']

        mean_precision += griddata(recall[i], precision[i], mean_recall)
        auPRC[i] = metrics[i]['PRC_AUC']

    n_folds = cv.get_n_splits(X, y)

    mean_tpr /= n_folds
    # Pin the averaged ROC curve to the point (1, 1).
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    mean_precision /= n_folds
    mean_auPRC = auc(mean_recall, mean_precision)

    scores = [fpr, tpr, precision, recall, roc_auc, auPRC]
    means = [mean_fpr, mean_tpr, mean_precision, mean_recall, mean_auc, mean_auPRC]
    return scores, means