def main(options, args):
    ##########################################################################
    # Retrieve user defined options
    ##########################################################################
    if len(sys.argv[1:]) == 0:
        print "No arguments given!"
        parser.print_help()
        sys.exit(2)
    if not options.genome_file:
        print "The genome region file is missing.\nPlease provide the file and run the program again.\nAborting...\n"
        sys.exit(2)
    if not options.scalar_file:
        print "The scaler file is missing.\nPlease provide the scaler file (see documentation) and run the program again.\nAborting...\n"
        sys.exit(2)

    model_filename = os.path.abspath(options.model_file)
    genome_file = os.path.abspath(options.genome_file)
    out_folder = options.output_folder
    data_cols = options.data_columns
    scalerFile = os.path.abspath(options.scalar_file)

    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    if int(options.verbosity) > 0:
        print "Options: ", options
        print "Model file: ", model_filename

    ##########################################################################
    # Process data
    ##########################################################################
    data_cols = [int(x) for x in data_cols.split(",")]
    genomeFeatureData = np.loadtxt(genome_file,
                                   usecols=data_cols,
                                   delimiter="\t",
                                   skiprows=1)

    # Load the trained model
    myEstimator = myDataLogger.load(model_filename)

    # Apply the same scaling that was used on the training data
    genomeFeatureDataScaled, shuffled_indices = rescaleGenomeData(
        genomeFeatureData, scalerFile)

    if int(options.verbosity) > 0:
        print "Genome feature data: ", genomeFeatureData.shape
        print "Genome feature scaled data: ", genomeFeatureDataScaled.shape

    # Read the region names from the first column of the genome file
    cols = 0
    with open(genome_file, "r") as temp:
        next(temp)  # skip the header line
        a = '\n'.join(line.strip("\n") for line in temp)
        b = np.genfromtxt(StringIO(a), usecols=cols, delimiter="\t", dtype=None)
    enhancer_names_test = b[shuffled_indices]

    if int(options.verbosity) > 0:
        print "Number of regions: ", enhancer_names_test.shape

    ##########################################################################
    # Carry out genome-wide prediction
    ##########################################################################
    y_pred = myEstimator.predict(genomeFeatureDataScaled)
    y_score_test = myEstimator.predict_proba(genomeFeatureDataScaled)
    combined_test = zip(enhancer_names_test, y_pred, y_score_test[:, 0],
                        y_score_test[:, 1])

    predictionAllLociFile = out_folder + "/GenomewidePredictionAllLocus_" + options.save_file + ".txt"
    prediction_output = open(predictionAllLociFile, 'w')
    prediction_output.write(
        "Chromosome\tY_predicted_labels\tProb_Class0\tProb_class1\n")
    for i in combined_test:
        line = '\t'.join(str(x) for x in i)
        prediction_output.write(line + '\n')
    prediction_output.close()

    # Keep only the regions predicted as enhancers (class 1)
    y_pred_pos_ind = np.where(y_pred == 1)
    enhancer_names_test_pos = enhancer_names_test[y_pred_pos_ind]

    ## Create a BED-style array for the predicted enhancers
    positiveEnhancerBED = map(lambda x: x.split('_'), enhancer_names_test_pos)
    positiveEnhancerBED = np.array(positiveEnhancerBED)
    y_pred_pos = y_pred[y_pred_pos_ind]
    y_score_neg = y_score_test[y_pred_pos_ind, 0]
    y_score_neg = y_score_neg.T
    y_score_pos = y_score_test[y_pred_pos_ind, 1]
    y_score_pos = y_score_pos.T
    y_score_pos_2 = map(lambda x: round(x, 4), y_score_pos)
    combinedPositiveEnhancers = zip(positiveEnhancerBED[:, 0],
                                    positiveEnhancerBED[:, 1],
                                    positiveEnhancerBED[:, 2], y_score_pos_2)

    # Group coordinates by chromosome; scores are multiplied by 10000 so they
    # survive the integer conversion below
    refs = {}
    for line in combinedPositiveEnhancers:
        l = list(line)
        l[3] = 10000 * l[3]
        if l[0] not in refs:
            refs[l[0]] = []
        refs[l[0]].append([int(x) for x in l[1:]])

    # Merge overlapping predictions into clusters, per chromosome
    allData = []
    for ref, val in refs.items():
        sortedCoords = sortCoordinate(val)
        clusters = clusterByOverlap(sortedCoords)
        for cluster in clusters:
            info1 = []
            for i in cluster[2]:
                info1.append(float(val[i][2]) / 10000)
            allData.append(
                np.hstack(('00' + ref[3:], cluster[0], cluster[1],
                           round(np.mean(info1), 4))))

    # Encode chromosome X as 24 so the rows can be sorted numerically
    for it in range(0, len(allData)):
        allData[it][0] = allData[it][0].replace('00X', '0024')
    allData = np.array(allData, dtype=float)
    allDataSorted = sorted(allData, key=lambda t: t[0])
    allDataSorted_2 = map(lambda x: (int(x[0]), int(x[1]), int(x[2]), x[3]),
                          allDataSorted)
    allDataSorted_2 = np.array(allDataSorted_2, str)
    for it in range(0, len(allDataSorted_2)):
        allDataSorted_2[it][0] = allDataSorted_2[it][0].replace('24', 'X')
        allDataSorted_2[it][0] = 'chr' + allDataSorted_2[it][0]

    ## Write predictions to a file
    predictionPositiveOutputFile = out_folder + "/GenomewidePredictedEnhancers_" + options.save_file + ".txt"
    predictionPositiveOutput = open(predictionPositiveOutputFile, 'w')
    predictionPositiveOutput.write("Chromosome\tStart\tEnd\tID\tConfidence\n")
    idx = 1
    for row in allDataSorted_2:
        line = row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + 'E_' + str(
            idx).rjust(6, '0') + '\t' + row[3]
        idx += 1
        predictionPositiveOutput.write(line + '\n')
    predictionPositiveOutput.close()

    ## Write predictions to a BED file for genome browser upload
    predictionPositiveOutputFile_2 = out_folder + "/GenomewidePredictedEnhancers_BrowserUpload_" + options.save_file + ".bed"
    predictionPositiveOutput_2 = open(predictionPositiveOutputFile_2, 'w')
    predictionPositiveOutput_2.write(
        "browser position chr1:1-200000\ntrack name=GEP_Enhancer_Prediction description=\"GEP_Enhancer_Prediction\" color=0,60,120 useScore=1 db=hg19\n"
    )
    for row in allDataSorted_2:
        line = row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + '.' + '\t' + str(
            float(row[3]) * 1000)
        predictionPositiveOutput_2.write(line + '\n')
    predictionPositiveOutput_2.close()

    print "Finished enhancer predictions!\nPredictions saved to: %s" % (
        predictionPositiveOutputFile)
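##########################################################################
# sortCoordinate and clusterByOverlap are helpers defined elsewhere in this
# repository; their implementation is not shown in this section. The sketch
# below is a hypothetical reconstruction inferred only from how they are
# called above: sortCoordinate orders [start, end, score] triples by
# coordinate, and clusterByOverlap merges overlapping intervals, returning
# for each merged cluster its start, end, and the indices of its members.
##########################################################################

def sortCoordinate(coords):
    # Sort [start, end, score] triples by start, then end coordinate.
    # Sorting in place keeps the caller's list (`val`) aligned with the
    # returned list, so the cluster member indices are valid for both.
    coords.sort(key=lambda c: (c[0], c[1]))
    return coords


def clusterByOverlap(sortedCoords):
    # Merge intervals that overlap; each cluster is (start, end, members),
    # where members are indices into sortedCoords.
    clusters = []
    cur_start, cur_end, members = sortedCoords[0][0], sortedCoords[0][1], [0]
    for i in range(1, len(sortedCoords)):
        start, end = sortedCoords[i][0], sortedCoords[i][1]
        if start <= cur_end:  # overlaps the current cluster: extend it
            cur_end = max(cur_end, end)
            members.append(i)
        else:                 # gap: close the current cluster, open a new one
            clusters.append((cur_start, cur_end, members))
            cur_start, cur_end, members = start, end, [i]
    clusters.append((cur_start, cur_end, members))
    return clusters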
def main(options, args):
    ##########################################################################
    # Retrieve user defined options
    ##########################################################################
    if len(sys.argv[1:]) == 0:
        print "No arguments given!"
        parser.print_help()
        sys.exit(2)
    if not options.test_file:
        print "The test file is missing.\nPlease provide the file and run the program again.\nAborting...\n"
        sys.exit(2)
    if not options.model_file:
        print "The model file is missing.\nPlease provide the model file and run the program again.\nAborting...\n"
        sys.exit(2)
    if not options.scalar_file:
        print "The scaler file is missing.\nPlease provide the scaler file (see documentation) and run the program again.\nAborting...\n"
        sys.exit(2)

    out_folder = options.output_folder
    dataCols = options.data_columns
    label_col = int(options.label_column)
    model_filename = os.path.abspath(options.model_file)
    test_file = os.path.abspath(options.test_file)
    scalerFile = os.path.abspath(options.scalar_file)

    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    if int(options.verbosity) > 0:
        print "Options: ", options
        print "Model file is", model_filename, "\n"

    ##########################################################################
    # Process data
    ##########################################################################
    dataCols = [int(x) for x in dataCols.split(",")]
    featureDataTest = np.loadtxt(test_file,
                                 usecols=dataCols,
                                 delimiter="\t",
                                 skiprows=1)
    labelDataTest = np.genfromtxt(test_file,
                                  usecols=label_col,
                                  delimiter="\t",
                                  skip_header=1)
    print "No. of samples in the validation dataset: ", len(labelDataTest), "\n"

    # Load the trained model
    myEstimator = myDataLogger.load(model_filename)

    # Apply the same scaling that was used on the training data
    labelDataTestScaled, featureDataTestScaled, shuffled_indices = rescaleTestData(
        featureDataTest, labelDataTest, scalerFile)

    ##########################################################################
    # Perform prediction on the independent validation dataset
    ##########################################################################
    # Get the enhancer names from the first column
    cols = 0
    with open(test_file, "r") as temp:
        a = '\n'.join(line.strip("\n") for line in temp)
        allEnhancerNames = np.genfromtxt(StringIO(a),
                                         usecols=cols,
                                         delimiter="\t",
                                         dtype=None,
                                         skip_header=1)
    enhancerNamesTest = allEnhancerNames[shuffled_indices]

    y_pred = myEstimator.predict(featureDataTestScaled)
    y_score_test = myEstimator.predict_proba(featureDataTestScaled)
    combined_test_predictions = zip(enhancerNamesTest, labelDataTestScaled,
                                    y_pred, y_score_test[:, 0],
                                    y_score_test[:, 1])

    prediction_output = open(
        out_folder + "/Predictions_" + options.save_file + ".txt", 'w')
    prediction_output.write(
        "Enhancer_name\tY_true_labels\tY_predicted_labels\tProb_Class0\tProb_class1\n"
    )
    for i in combined_test_predictions:
        line = '\t'.join(str(x) for x in i)
        prediction_output.write(line + '\n')
    prediction_output.close()

    print "Classification report of the prediction:\n", myScores.classification_report(
        labelDataTestScaled, y_pred), "\n"
    print "Final generalization accuracy: %.6f" % myScores.accuracy_score(
        labelDataTestScaled, y_pred)
    print "Number of mislabeled samples: %d" % (labelDataTestScaled != y_pred).sum()

    if int(options.verbosity) > 0:
        # Get the feature names from the header of the test file
        infile = open(options.test_file, 'r')
        firstline = infile.readline().rstrip()
        names = firstline.split("\t")
        names_sel = []
        for i in dataCols:
            names_sel.append(names[i])
        print "Feature names: ", names_sel, "\n"
        infile.close()

        # Rank the features by their importance (descending order)
        feature_rank_descend = np.argsort(myEstimator.feature_importances_)[::-1]
        for f in xrange(len(dataCols)):
            print "%d. feature %d (%f) %s" % (
                f + 1, feature_rank_descend[f],
                myEstimator.feature_importances_[feature_rank_descend[f]],
                names_sel[feature_rank_descend[f]])

    # Plot the ROC curve
    roc_plt = plotROC(myEstimator, featureDataTestScaled, labelDataTestScaled,
                      y_pred, options.verbosity)
    roc_plt.savefig(out_folder + "/ROC-curve_" + options.save_file + ".svg",
                    bbox_inches='tight',
                    pad_inches=0.2)
    roc_plt.show()
    roc_plt.close()
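##########################################################################
# rescaleTestData is defined elsewhere in the repository. A minimal sketch
# of what it is assumed to do, based on how it is called above: load the
# scaler fitted on the training data, apply it to the validation features,
# and shuffle the rows with a fixed seed so that names, labels, and
# features stay aligned. The joblib loader and the seed value are
# assumptions.
##########################################################################

import numpy as np
from sklearn.externals import joblib  # assumed to match myDataLogger


def rescaleTestData(featureData, labelData, scalerFile):
    scaler = joblib.load(scalerFile)  # scaler fitted on the training data
    featureDataScaled = scaler.transform(featureData)
    np.random.seed(0)  # assumption: same seed as used during training
    shuffled_indices = np.random.permutation(len(featureData))
    return (labelData[shuffled_indices], featureDataScaled[shuffled_indices],
            shuffled_indices)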
def main(options, args):
    if len(sys.argv[1:]) == 0:
        print "No arguments given!"
        parser.print_help()
        sys.exit(2)
    if not options.data_file:
        print "No data file present.\nAborting..."
        sys.exit(2)

    out_folder = options.output_folder
    data_cols = options.data_columns
    label_col = int(options.label_column)

    if int(options.verbosity) > 0:
        print "Options: ", options
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    ##########################################################################
    # Process data
    ##########################################################################
    # Get column indices of the features
    data_cols = [int(x) for x in data_cols.split(",")]

    # Read features and labels
    featureData = np.loadtxt(options.data_file,
                             usecols=data_cols,
                             delimiter="\t",
                             skiprows=1)
    labelData = np.genfromtxt(options.data_file,
                              usecols=label_col,
                              delimiter="\t",
                              skip_header=1)

    # Perform the same scaling on training and testing data, and persist the
    # fitted scaler so prediction runs can reuse it
    labelDataShuffled, featureDataShuffled, shuffled_indices, scalerData = performScaling(
        featureData, labelData)
    scalerFile = options.output_folder + "/" + "Scaler_" + options.save_file + ".pkl"
    myDataLogger.dump(scalerData, scalerFile)

    ##########################################################################
    # Training and parameter tuning
    ##########################################################################
    # Sanity-check the whole dataset before training
    sanityResult = sanityCheck(featureDataShuffled, labelDataShuffled,
                               int(options.verbosity))
    if sanityResult > 0:
        print "Data is sane."
    else:
        print "Data is corrupted. Please check the input files!"
        # exit()

    # Split off a stratified test set (percent_test_size of the whole dataset)
    featureDataTrain, labelDataTrain, featureDataTest, labelDataTest = myDataShuffler(
        featureDataShuffled, labelDataShuffled, shuffled_indices,
        float(options.percent_test_size), int(options.verbosity))

    # Build a stratified, shuffled n-fold cross-validation iterator over the
    # training set to tune the parameters
    crossValidator = myDataSplitter(labelDataTrain,
                                    int(options.fold_cross_validation),
                                    test_size=float(options.percent_test_size),
                                    random_state=0)
    if int(options.verbosity) > 0:
        print "Cross validation is: ", crossValidator

    # Grid search over C and gamma (parallelized over n_jobs)
    SVM_C_min = int(options.SVM_C_min)
    SVM_C_max = int(options.SVM_C_max)
    SVM_gamma_min = int(options.SVM_gamma_min)
    SVM_gamma_max = int(options.SVM_gamma_max)
    C_range = 10.0 ** np.arange(SVM_C_min, SVM_C_max)
    gamma_range = 10.0 ** np.arange(SVM_gamma_min, SVM_gamma_max)
    param_grid = dict(gamma=gamma_range, C=C_range)

    if int(options.verbosity) > 0:
        print "Method chosen: SVM\n"
    best_C, best_gamma = findParamatersSVM(crossValidator, featureDataTrain,
                                           labelDataTrain, param_grid,
                                           int(options.n_jobs),
                                           int(options.verbosity))
    myClassifier = myClassifiers.svm(kernel='rbf',
                                     gamma=best_gamma,
                                     C=best_C,
                                     random_state=0,
                                     probability=True)
    myClassifier.fit(featureDataTrain, labelDataTrain)

    # plot_learning_curve (provided by scikit-learn) visualizes how well the
    # model performs as a function of the number of training samples, which
    # helps to diagnose high variance or high bias.
    if int(options.verbosity) > 0:
        print "See how well the model is fitting - plot the learning curve for the training and testing datasets\n"
    title = "Learning Curves (SVM, C=%.9f)" % (best_C)
    learning_plot = plotLearningCurve(myClassifier, title, featureDataTrain,
                                      labelDataTrain, crossValidator,
                                      int(options.verbosity))
    learning_curve_figure_file = options.output_folder + "/" + "Learning_Curve_" + options.save_file + ".svg"
    learning_plot.savefig(learning_curve_figure_file,
                          bbox_inches='tight',
                          pad_inches=0.2)
    learning_plot.show()
    learning_plot.close()

    ##########################################################################
    # Testing
    ##########################################################################
    # Save the model so it can be loaded later
    filename_model = options.output_folder + "/" + "Model_" + options.save_file + ".pkl"
    myDataLogger.dump(myClassifier, filename_model, compress=9)

    # Predict on the test set
    predicted_labels = myClassifier.predict(featureDataTest)

    # Plot the ROC curve
    roc_plt = plotROC(myClassifier, featureDataTest, labelDataTest,
                      predicted_labels, int(options.verbosity))
    ROC_curve_figure_file = options.output_folder + "/" + "ROC_Curve_" + options.save_file + ".svg"
    roc_plt.savefig(ROC_curve_figure_file, bbox_inches='tight', pad_inches=0.2)
    roc_plt.show()
    roc_plt.close()

    if int(options.verbosity) > 0:
        print myScores.classification_report(labelDataTest, predicted_labels)
    print "SVM: Final generalization accuracy: %.6f" % myScores.accuracy_score(
        labelDataTest, predicted_labels)
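##########################################################################
# findParamatersSVM is defined elsewhere in the repository. A minimal
# sketch, assuming it wraps scikit-learn's GridSearchCV over the (C, gamma)
# grid built above and returns the best parameter pair. The module path
# sklearn.grid_search is the pre-0.18 location used in this era of
# scikit-learn; newer releases use sklearn.model_selection.
##########################################################################

from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC


def findParamatersSVM(crossValidator, X, y, param_grid, n_jobs, verbosity):
    # Exhaustive search over the (C, gamma) grid using the supplied
    # cross-validation iterator, parallelized over n_jobs workers.
    grid = GridSearchCV(SVC(kernel='rbf'),
                        param_grid=param_grid,
                        cv=crossValidator,
                        n_jobs=n_jobs)
    grid.fit(X, y)
    if verbosity > 0:
        print "Best parameters: ", grid.best_params_
    return grid.best_params_['C'], grid.best_params_['gamma']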
def main(options, args):
    out_folder = options.output_folder
    data_cols = options.data_columns
    label_col = int(options.label_column)

    if not options.data_file:
        print "No data file present.\nAborting..."
        sys.exit(2)
    if options.verbosity > 0:
        print "Label column: ", label_col
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    ##########################################################################
    # Process data
    ##########################################################################
    # Get column indices of the features
    data_cols = [int(x) for x in data_cols.split(",")]

    # Get the feature names from the header of the data file
    infile = open(options.data_file, 'r')
    firstline = infile.readline().rstrip()
    names = firstline.split("\t")
    del names[0]  # delete the position column
    del names[0]  # delete the class column
    infile.close()

    # Read features and labels
    featureData = np.loadtxt(options.data_file,
                             usecols=data_cols,
                             delimiter="\t",
                             skiprows=1)
    labelData = np.genfromtxt(options.data_file,
                              usecols=label_col,
                              delimiter="\t",
                              skip_header=1)

    # Same scaling on both test and train data (centering and scaling)
    featureData = myScaler.fit_transform(featureData)

    # Shuffle the rows of the data
    np.random.seed(0)
    shuffled_indices = np.random.permutation(len(featureData))
    featureDataTraining = featureData[shuffled_indices]
    labelDataTraining = labelData[shuffled_indices]

    # Fit a Random Forest to rank the features by importance
    myClassifier = myClassifiers.RF(n_estimators=int(options.n_estimators),
                                    random_state=0)
    myClassifier.fit(featureDataTraining, labelDataTraining)

    # The importance of each feature is given by:
    featureImportances = myClassifier.feature_importances_
    if options.verbosity > 0:
        print "Feature importance:", featureImportances

    # Get the indices of the features sorted by importance (descending order)
    feature_rank_descend = np.argsort(featureImportances)[::-1]

    # Print the feature ranking
    if options.verbosity > 0:
        for f in xrange(len(data_cols)):
            print "%d. feature %d (%f) %s" % (
                f + 1, feature_rank_descend[f],
                featureImportances[feature_rank_descend[f]],
                names[feature_rank_descend[f]])

    # Feature importance in ascending order (for the horizontal bar plot)
    feature_rank_ascend = np.argsort(myClassifier.feature_importances_)

    ##########################################################################
    # Plotting feature importance
    ##########################################################################
    # Plot the feature importances as a horizontal bar plot
    if int(options.plot_bar_without_std) == 1:
        myPlot.barh(np.arange(len(names)),
                    myClassifier.feature_importances_[feature_rank_ascend])
        myPlot.yticks(np.arange(len(names)) + 0.25,
                      np.array(names)[feature_rank_ascend])
        _ = myPlot.xlabel('Relative importance')
        myPlot.savefig(out_folder +
                       '/withNames_RFClassifier_Feature_importance.svg',
                       bbox_inches='tight',
                       pad_inches=0.2)
        myPlot.show()
        myPlot.close()

    # Bar plot with the standard deviation across the trees of the forest
    if int(options.plot_bar_with_std) == 1:
        std = np.std(
            [tree.feature_importances_ for tree in myClassifier.estimators_],
            axis=0)
        myPlot.figure()
        myPlot.title("Feature Importances")
        myPlot.bar(xrange(len(data_cols)),
                   featureImportances[feature_rank_descend],
                   color="r",
                   yerr=std[feature_rank_descend],
                   align="center")
        myPlot.xticks(xrange(len(data_cols)), feature_rank_descend)
        myPlot.xlim([-1, len(data_cols)])
        myPlot.savefig(out_folder +
                       '/RF_classifier_tssDist_include_Feature_importance.svg',
                       transparent=True,
                       bbox_inches='tight',
                       pad_inches=0.2)
        myPlot.show()
        myPlot.close()

    # Plot the feature importances of the individual trees and of the forest
    if int(options.plot_line) == 1:
        myPlot.figure()
        myPlot.title("Feature Importances")
        for tree in myClassifier.estimators_:
            myPlot.plot(xrange(len(data_cols)),
                        tree.feature_importances_[feature_rank_descend], "r")
        myPlot.plot(xrange(len(data_cols)),
                    featureImportances[feature_rank_descend], "b")
        myPlot.show()
        myPlot.close()

    # Keep only the important features, dropping the others
    featureDataTraining = myClassifier.fit(
        featureDataTraining, labelDataTraining).transform(featureDataTraining)
    if options.verbosity > 0:
        print "The shape of the data after feature selection is", featureDataTraining.shape, "\n"
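##########################################################################
# Note: RandomForestClassifier.transform(), used in the selection step
# above, was deprecated and later removed from scikit-learn. On newer
# releases the same step can be written with SelectFromModel; this is a
# sketch of the equivalent call, not part of the original script:
##########################################################################

from sklearn.feature_selection import SelectFromModel


def selectImportantFeatures(classifier, X, y):
    # Keep only the features whose importance exceeds the default
    # (mean-importance) threshold, mirroring the old transform() behavior.
    selector = SelectFromModel(classifier)
    return selector.fit_transform(X, y)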
def main(options, args):
    ##########################################################################
    # Retrieve user defined options
    ##########################################################################
    if len(sys.argv[1:]) == 0:
        print "No arguments given!"
        parser.print_help()
        sys.exit(2)
    if not options.data_file:
        print "No data file present.\nAborting..."
        sys.exit(2)

    out_folder = options.output_folder
    data_cols = options.data_columns
    label_col = int(options.label_column)

    if int(options.verbosity) > 0:
        print "Options: ", options
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    print "Output folder is", options.output_folder, "\n"

    ##########################################################################
    # Process data
    ##########################################################################
    # Get column indices of the features
    data_cols = [int(x) for x in data_cols.split(",")]

    # Read features and labels
    featureData = np.loadtxt(options.data_file,
                             usecols=data_cols,
                             delimiter="\t",
                             skiprows=1)
    labelData = np.genfromtxt(options.data_file,
                              usecols=label_col,
                              delimiter="\t",
                              skip_header=1)

    # Perform the same scaling on training and testing data
    labelDataShuffled, featureDataShuffled, shuffled_indices, scalerData = performScaling(
        featureData, labelData)

    # Lists to store the performance measures
    scores = list()
    train_score = list()
    test_score = list()
    accuracySelectedClassifier = list()  # accuracy
    prec = list()     # precision
    rec = list()      # recall
    areaRoc = list()  # AUC
    fscore = list()   # F-measure

    # The same measures for a random classifier
    accuracyRandomClassifier = list()
    recRand = list()
    precRand = list()
    fscoreRand = list()

    # For plotting the ROC curve
    random_mean_tpr_nFold = 0.0
    random_mean_fpr_nFold = np.linspace(0, 1, 100)
    random.seed(80)

    ##########################################################################
    # Training
    ##########################################################################
    # Generate the n-fold cross-validation iterator
    skf = myDataSplitterFold(labelDataShuffled,
                             n_folds=int(options.fold_cross_validation))
    for feature_train_index, feature_test_index in skf:
        rand_list = list()
        featureDataTrain, featureDataTest = featureDataShuffled[
            feature_train_index], featureDataShuffled[feature_test_index]
        labelDataTrain, labelDataTest = labelDataShuffled[
            feature_train_index], labelDataShuffled[feature_test_index]
        if int(options.verbosity) > 0:
            print featureDataTrain, featureDataTest, "\n"

        # Choose a classification method
        if options.method == "SVM":
            C = float(10 ** options.SVM_C)
            gamma = float(10 ** options.SVM_gamma)
            myClassifier = myClassifiers.svm(kernel='rbf',
                                             probability=True,
                                             C=C,
                                             gamma=gamma)
            if int(options.verbosity) > 0:
                print "Method chosen is: SVM\n"
        else:
            n_estimators = int(options.RF_n_estimators)
            max_depth = float(options.RF_max_depth)
            myClassifier = myClassifiers.RF(n_estimators=n_estimators,
                                            max_depth=max_depth)
            if int(options.verbosity) > 0:
                print "Method chosen is: Random Forest\n"

        # Calculate scores for the selected method
        scores.append(
            myClassifier.fit(featureDataTrain,
                             labelDataTrain).score(featureDataTest,
                                                   labelDataTest))
        y_pred = myClassifier.predict(featureDataTest)
        predicted_label_score = myClassifier.predict_proba(featureDataTest)
        predicted_label_score = np.around(predicted_label_score, decimals=2)
        accuracySelectedClassifier.append(
            myScores.accuracy_score(labelDataTest, y_pred))

        # Accuracy for each fold (selected classifier + random classifier)
        for i in range(0, len(y_pred)):
            rand_list.append(random.randint(0, 1))
        y_rand = np.array(rand_list)
        accuracyRandomClassifier.append(
            myScores.accuracy_score(labelDataTest, y_rand))

        # Precision, recall, and F-score for each fold
        # (selected classifier + random classifier)
        prec.append(
            myScores.precision_score(labelDataTest, y_pred, average='micro'))
        rec.append(
            myScores.recall_score(labelDataTest, y_pred, average='micro'))
        fscore.append(
            myScores.fbeta_score(labelDataTest, y_pred, average='micro',
                                 beta=0.5))
        precRand.append(
            myScores.precision_score(labelDataTest, y_rand, average='micro'))
        recRand.append(
            myScores.recall_score(labelDataTest, y_rand, average='micro'))
        fscoreRand.append(
            myScores.fbeta_score(labelDataTest, y_rand, average='micro',
                                 beta=0.5))
        areaRoc.append(
            myScores.roc_auc_score(labelDataTest, predicted_label_score[:, 1]))

        # ROC curve for each fold
        fpr, tpr, thresholds = myScores.roc_curve(
            labelDataTest, predicted_label_score[:, 1],
            pos_label=1)  # label of the positive class
        random_mean_tpr_nFold += interpolator(random_mean_fpr_nFold, fpr, tpr)
        random_mean_tpr_nFold[0] = 0.0

        # Scores after refitting on each split
        train_score.append(
            myClassifier.fit(featureDataTrain,
                             labelDataTrain).score(featureDataTrain,
                                                   labelDataTrain))
        test_score.append(
            myClassifier.fit(featureDataTest,
                             labelDataTest).score(featureDataTest,
                                                  labelDataTest))

    random_mean_tpr_nFold /= int(options.fold_cross_validation)
    random_mean_tpr_nFold[-1] = 1.0
    random_mean_auc_nFold = myScores.auc(random_mean_fpr_nFold,
                                         random_mean_tpr_nFold)
    if int(options.verbosity) > 0:
        print "Scores are:\n", scores, "\n"

    combined_measures = zip(accuracySelectedClassifier,
                            accuracyRandomClassifier, prec, precRand, rec,
                            recRand, fscore, fscoreRand, areaRoc)
    if int(options.verbosity) > 0:
        print "######################################################"
        print "1. Length accuracy, selected classifier:", len(
            accuracySelectedClassifier)
        print "2. Length accuracy, random classifier:", len(
            accuracyRandomClassifier)
        print "3. Length precision, selected classifier:", len(prec)
        print "4. Length precision, random classifier:", len(precRand)
        print "######################################################"

    ##########################################################################
    # Output and plotting
    ##########################################################################
    nFold_result_file = options.output_folder + "/" + "measures_" + options.save_file + ".txt"
    predictedOutput = open(nFold_result_file, 'w')
    predictedOutput.write(
        "Accuracy\tAccuracy_Rand\tPrecision\tPrecision_Rand\tRecall\tRecall_Rand\tFscore\tFscore_rand\tareaRoc\n"
    )
    for i in combined_measures:
        line = '\t'.join(str(x) for x in i)
        predictedOutput.write(line + '\n')

    # Average of each measure over the folds
    mean_measures = '\t'.join(
        str(x) for x in (np.mean(accuracySelectedClassifier),
                         np.mean(accuracyRandomClassifier), np.mean(prec),
                         np.mean(precRand), np.mean(rec), np.mean(recRand),
                         np.mean(fscore), np.mean(fscoreRand),
                         np.mean(areaRoc)))
    if int(options.verbosity) > 0:
        print "All the average measures are: ", mean_measures, "\n"
    predictedOutput.write(mean_measures + '\n')

    # Variance across the cross-validation scores
    predictedOutput.write("Mean score n-fold: {0:.3f} (+/-{1:.3f})".format(
        np.mean(scores), myStats.sem(scores)) + '\n')
    predictedOutput.write("Mean train score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(train_score), myStats.sem(train_score)) + '\n')
    predictedOutput.write("Mean test score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(test_score), myStats.sem(test_score)) + '\n')
    predictedOutput.close()

    if int(options.verbosity) > 0:
        print "Area under the ROC: ", areaRoc, "mean AUC:", np.mean(areaRoc)

    # Plot the mean ROC curve over the n cross-validation folds
    myPlot.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Standard')
    myPlot.plot(random_mean_fpr_nFold,
                random_mean_tpr_nFold,
                'k--',
                label='Mean ROC (area = %0.2f)' % random_mean_auc_nFold,
                lw=2,
                color=(0.45, 0.42, 0.18))
    myPlot.xlim([-0.05, 1.05])
    myPlot.ylim([-0.05, 1.05])
    myPlot.xlabel('False Positive Rate')
    myPlot.ylabel('True Positive Rate')
    myPlot.title('ROC: %s-fold cross-validation' %
                 options.fold_cross_validation)
    myPlot.legend(loc="lower right")

    # Save the plot in SVG format
    ROC_curve_figure_file = options.output_folder + "/" + "ROC-curve_" + options.save_file + ".svg"
    myPlot.savefig(ROC_curve_figure_file, bbox_inches='tight', pad_inches=0.2)
    myPlot.show()
    myPlot.close()
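##########################################################################
# interpolator is an alias defined at the top of the script (not shown in
# this section). Based on its call signature above, it is assumed to be
# numpy.interp, which projects each fold's ROC curve onto the common
# 100-point FPR grid before averaging:
##########################################################################

import numpy as np

interpolator = np.interp  # assumption: linear interpolation of TPR values

# Example: project one fold's ROC (fpr, tpr) onto the common grid.
mean_fpr = np.linspace(0, 1, 100)
fold_fpr = np.array([0.0, 0.5, 1.0])
fold_tpr = np.array([0.0, 0.8, 1.0])
mean_tpr = interpolator(mean_fpr, fold_fpr, fold_tpr)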