Example #1
def main(options, args):

    if len(sys.argv[1:]) == 0:
        print "no argument given!"
        parser.print_help()
        sys.exit(2)
    if not options.data_file:
        print "The data file is missing\n Please provide the file and run the program again\nAborting....\n"
        sys.exit(2)
    if not options.label_column:
        print "Labels missing. Please provide column index containing label and un the program again\nAborting....\n"
        sys.exit(2)
    if not options.output_folder:
        print "output filename not given\n"
        sys.exit(2)

    y_true = np.genfromtxt(options.data_file,
                           usecols=int(options.label_column),
                           delimiter="\t",
                           skip_header=1)
    with open(options.data_file) as f:
        line = f.readline()
        nCol = len(line.split('\t'))
    print "Total no of columns in the file is:", nCol, "\n"
    nCol = nCol - 2
    print "Total no of columns in the file is:", nCol, "\n"
    for lab in range(2, nCol):
        print "lab", lab
        y_pred = np.genfromtxt(options.data_file,
                               usecols=lab,
                               delimiter="\t",
                               skip_header=1)
        print "The classification report for Column", lab, "is \n"
        print myScores.classification_report(y_true, y_pred)
        print "Accuracy: %.6f" % myScores.accuracy_score(y_true, y_pred)
        cm = confM(y_true, y_pred)
        print "Confusion matrix as \n", cm
        tn = int(cm[0, 0])
        fp = int(cm[0, 1])
        print "tn", tn
        print "fp", fp
        s = float(tn) / (tn + fp)
        print "Specificity is", s, "\n"
        print "Matthews correlation coefficient: %.6f" % mc(y_true, y_pred)
Example #2
def main(options, args):
    ##########################################################################
    # Retrieve user defined options
    ##########################################################################
    if len(sys.argv[1:]) == 0:
        print "no argument given!"
        parser.print_help()
        sys.exit(2)
    if not options.genome_file:
        print "Either genome region file is missing \n Please provide the file and run the program again\nAborting....\n"
        sys.exit(2)
    if not options.scalar_file:
        print "Scalar file is missing \n Please provide the scalar file (see document) and run the program again\nAborting....\n"
        sys.exit(2)

    model_filename = os.path.abspath(options.model_file)
    genome_file = os.path.abspath(options.genome_file)
    out_folder = options.output_folder
    data_cols = options.data_columns
    scalerFile = os.path.abspath(options.scalar_file)

    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    if int(options.verbosity) > 0:
        print "Options: ", options
        print "Model File: ", model_filename

##########################################################################
# Process data
##########################################################################

    data_cols = [int(x) for x in data_cols.split(",")]
    genomeFeatureData = np.loadtxt(genome_file,
                                   usecols=data_cols,
                                   delimiter="\t",
                                   skiprows=1)

    # Load the model file#
    myEstimator = myDataLogger.load(model_filename)

    # Perform same scaling on training and testing data
    genomeFeatureDataScaled, shuffled_indices = rescaleGenomeData(
        genomeFeatureData, scalerFile)

    if int(options.verbosity) > 0:
        print "Genome Feature Data: ", genomeFeatureData.shape
        print "Genome Feature Scaled Data: ", genomeFeatureDataScaled.shape

    cols = 0
    with open(genome_file, "r") as temp:
        next(temp)
        a = '\n'.join(line.strip("\n") for line in temp)
        b = np.genfromtxt(StringIO(a),
                          usecols=cols,
                          delimiter="\t",
                          dtype=None)
        enhancer_names_test = b[shuffled_indices]

    if int(options.verbosity) > 0:
        print "Number of Regions: ", enhancer_names_test.shape


##########################################################################
# Carryout Genomewide Prediction
##########################################################################

## Predict
    y_pred = myEstimator.predict(genomeFeatureDataScaled)
    y_score_test = myEstimator.predict_proba(genomeFeatureDataScaled)
    combined_test = zip(enhancer_names_test, y_pred, y_score_test[:, 0],
                        y_score_test[:, 1])
    predictionAllLociFile = out_folder + "/GenomewidePredictionAllLocus_" + options.save_file + ".txt"
    prediction_output = open(predictionAllLociFile, 'w')
    prediction_output.write(
        "Chromosome\tY_predicted_labels\tProb_Class0\tProb_class1\n")
    for i in combined_test:
        line = '\t'.join(str(x) for x in i)
        prediction_output.write(line + '\n')
    prediction_output.close()

    y_pred_pos_ind = np.where(y_pred == 1)
    enhancer_names_test_pos = enhancer_names_test[y_pred_pos_ind]

    ## Create a BED file for the Enhancers
    positiveEnhancerBED = map(lambda x: x.split('_'), enhancer_names_test_pos)
    positiveEnhancerBED = np.array(positiveEnhancerBED)

    y_pred_pos = y_pred[y_pred_pos_ind]
    y_score_neg = y_score_test[y_pred_pos_ind, 0]
    y_score_neg = y_score_neg.T
    y_score_pos = y_score_test[y_pred_pos_ind, 1]
    y_score_pos = y_score_pos.T
    y_score_pos_2 = map(lambda x: round(x, 4), y_score_pos)

    combinedPositiveEnhancers = zip(positiveEnhancerBED[:, 0],
                                    positiveEnhancerBED[:, 1],
                                    positiveEnhancerBED[:, 2], y_score_pos_2)

    refs = {}
    for line in combinedPositiveEnhancers:
        l = list(line)
        l[3] = 10000 * l[3]
        if not l[0] in refs.keys():
            refs[l[0]] = []
        refs[l[0]].append([int(x) for x in l[1:]])

    allData = []
    for ref, val in refs.items():
        sortedCoords = sortCoordinate(val)
        clusters = clusterByOverlap(sortedCoords)
        for cluster in clusters:
            info1 = []
            for i in cluster[2]:
                info1.append(float(val[i][2]) / 10000)
            allData.append(
                np.hstack(('00' + ref[3:], cluster[0], cluster[1],
                           round(np.mean(info1), 4))))

    for it in range(0, len(allData)):
        allData[it][0] = allData[it][0].replace('00X', '0024')

    allData = np.array(allData, dtype=float)
    allDataSorted = sorted(allData, key=lambda t: t[0])
    allDataSorted_2 = map(lambda x: (int(x[0]), int(x[1]), int(x[2]), x[3]),
                          allDataSorted)
    allDataSorted_2 = np.array(allDataSorted_2, str)

    for it in range(0, len(allDataSorted_2)):
        allDataSorted_2[it][0] = allDataSorted_2[it][0].replace('24', 'X')
        allDataSorted_2[it][0] = 'chr' + allDataSorted_2[it][0]

    ## Write Predictions to a File
    predictionPositiveOutputFile = out_folder + "/GenomewidePredictedEnhancers_" + options.save_file + ".txt"
    predictionPositiveOutput = open(predictionPositiveOutputFile, 'w')
    predictionPositiveOutput.write("Chromosome\tStart\tEnd\tID\tConfidence\n")
    idx = 1
    # colorCode = []
    for row in allDataSorted_2:
        line = row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + 'E_' + str(
            idx).rjust(6, '0') + '\t' + row[3]
        # colorCode.append(row[3]*200)
        idx += 1
        predictionPositiveOutput.write(line + '\n')
    predictionPositiveOutput.close()

    ## Write Predictions to a File for Genome Browser
    predictionPositiveOutputFile_2 = out_folder + "/GenomewidePredictedEnhancers_BrowserUpload_" + options.save_file + ".bed"
    predictionPositiveOutput_2 = open(predictionPositiveOutputFile_2, 'w')
    predictionPositiveOutput_2.write(
        "browser position chr1:1-200000\ntrack name=GEP_Enhancer_Prediction description=\"GEP_Enhancer_Prediction\" color=0,60,120 useScore=1 db=hg19\n"
    )
    for row in allDataSorted_2:
        line = row[0] + '\t' + row[1] + '\t' + row[
            2] + '\t' + '.' + '\t' + str(float(row[3]) * 1000)
        predictionPositiveOutput_2.write(line + '\n')
    predictionPositiveOutput_2.close()

    print "Finished enhancer predictions !!!\nPredictions saved to: %s" % (
        str(predictionPositiveOutputFile))
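The helpers sortCoordinate and clusterByOverlap used above are project-specific, so their exact logic is not shown here; a rough, hedged sketch of what such an overlap-merging step typically looks like (sorted [start, end] intervals merged whenever they overlap) is:

def merge_overlapping(intervals):
    # Merge [start, end] intervals that overlap; assumes numeric coordinates.
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)  # extend the current cluster
        else:
            merged.append([start, end])              # start a new cluster
    return merged

print(merge_overlapping([[100, 200], [150, 300], [500, 600]]))  # [[100, 300], [500, 600]]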
Example #3
def main(options, args):

    ##########################################################################
    # Retrieve user defined options
    ##########################################################################
    if len(sys.argv[1:]) == 0:
        print "no argument given!"
        parser.print_help()
        sys.exit(2)
    if not options.test_file:
        print "Either test file is missing \n Please provide the file and run the program again\nAborting....\n"
        sys.exit(2)
    if not options.model_file:
        print "Model file is missing \n Please provide the model file and run the program again\nAborting....\n"
        sys.exit(2)
    if not options.scalar_file:
        print "Scaler file is missing \n Please provide the scalar file (see document) and run the program again\n Aborting.......\n"
        sys.exit(2)

    out_folder = options.output_folder
    dataCols = options.data_columns
    label_col = int(options.label_column)
    model_filename = os.path.abspath(options.model_file)
    test_file = os.path.abspath(options.test_file)
    scalerFile = os.path.abspath(options.scalar_file)

    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    if int(options.verbosity) > 0:
        print "Options: ", options
        print "Model file is", model_filename, "\n"

##########################################################################
# Process data
##########################################################################
    dataCols = [int(x) for x in dataCols.split(",")]
    featureDataTest = np.loadtxt(test_file,
                                 usecols=dataCols,
                                 delimiter="\t",
                                 skiprows=1)
    labelDataTest = np.genfromtxt(test_file,
                                  usecols=label_col,
                                  delimiter="\t",
                                  skip_header=1)
    print "No. of samples in the validation dataset: ", len(
        labelDataTest), "\n"

    # Load the model file
    myEstimator = myDataLogger.load(model_filename)

    # Perform same scaling on training and testing data
    labelDataTestScaled, featureDataTestScaled, shuffled_indices = rescaleTestData(
        featureDataTest, labelDataTest, scalerFile)

    ##########################################################################
    # Perform prediction on independent validation dataset
    ##########################################################################
    # Get enhancers names:
    cols = 0
    with open(test_file, "r") as temp:
        a = '\n'.join(line.strip("\n") for line in temp)
        allEnhancerNames = np.genfromtxt(StringIO(a),
                                         usecols=cols,
                                         delimiter="\t",
                                         dtype=None,
                                         skip_header=1)
        enhancerNamesTest = allEnhancerNames[shuffled_indices]

    y_pred = myEstimator.predict(featureDataTestScaled)
    y_score_test = myEstimator.predict_proba(featureDataTestScaled)
    combined_test_predictions = zip(enhancerNamesTest, labelDataTestScaled,
                                    y_pred, y_score_test[:, 0],
                                    y_score_test[:, 1])
    prediction_output = open(
        out_folder + "/Predictions_" + options.save_file + ".txt", 'w')
    prediction_output.write(
        "Enhancer_name\tY_true_labels\tY_predicted_labels\tProb_Class0\tProb_class1\n"
    )
    for i in combined_test_predictions:
        line = '\t'.join(str(x) for x in i)
        prediction_output.write(line + '\n')
    prediction_output.close()

    print "Classification report of the prediction is", myScores.classification_report(
        labelDataTestScaled, y_pred), "\n"
    print "Random Forests: Final Generalization Accuracy: %.6f" % myScores.accuracy_score(
        labelDataTestScaled, y_pred)
    print "Number of mislabeled samples : %d" % (labelDataTestScaled !=
                                                 y_pred).sum()

    if int(options.verbosity) > 0:
        # Get names of the features from header of file
        infile = open(options.test_file, 'r')
        firstline = infile.readline().rstrip()
        names = firstline.split("\t")
        names_sel = []
        for i in dataCols:
            names_sel.append(names[i])
        print "names", names_sel, "\n"
        infile.close()

        # Get the indices of the features according to their importance
        feature_rank_descend = np.argsort(
            myEstimator.feature_importances_)[::-1]  # Descending Order
        for f in xrange(len(dataCols)):
            print "%d. feature %d (%f) %s" % (
                f + 1, feature_rank_descend[f],
                myEstimator.feature_importances_[feature_rank_descend[f]],
                names_sel[feature_rank_descend[f]])

    # Plot ROC
    roc_plt = plotROC(myEstimator, featureDataTestScaled, labelDataTestScaled,
                      y_pred, options.verbosity)
    roc_plt.savefig(out_folder + "/ROC-curve_" + options.save_file + ".svg",
                    bbox_inches='tight',
                    pad_inches=0.2)
    roc_plt.show()
    roc_plt.close()
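plotROC above is a project helper, so its internals are not shown; a rough sketch of the kind of ROC plot it presumably produces, using only scikit-learn and matplotlib with illustrative synthetic data (current scikit-learn API, not necessarily the version used in these examples), is:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_tr, y_tr)

fpr, tpr, _ = roc_curve(y_te, clf.predict_proba(X_te)[:, 1], pos_label=1)
plt.plot(fpr, tpr, label="ROC (AUC = %.2f)" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], "--", color="0.6")  # chance line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.savefig("ROC-curve_example.svg", bbox_inches="tight", pad_inches=0.2)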
Example #4
def main(options, args):
    out_folder = options.output_folder
    data_cols = options.data_columns
    label_col = int(options.label_column)
    if not options.data_file:
        print "No Data File Present \n Aborting....."
        sys.exit(2)
    if int(options.verbosity) > 0:
        print "Label Column: ", label_col
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

##########################################################################
# Process data
##########################################################################
# Get columns indices of the features
    data_cols = [int(x) for x in data_cols.split(",")]

    # Get names of the features from header of file
    infile = open(options.data_file, 'r')
    firstline = infile.readline().rstrip()
    names = firstline.split("\t")
    del names[0]  # delete column for position
    del names[0]  # delete column for class
    infile.close()

    # Read features and labels
    featureData = np.loadtxt(options.data_file,
                             usecols=data_cols,
                             delimiter="\t",
                             skiprows=1)
    labelData = np.genfromtxt(options.data_file,
                              usecols=label_col,
                              delimiter="\t",
                              skip_header=1)

    # Same scaling on both test and train data (centering the data scaling)
    featureData = myScaler.fit_transform(featureData)

    # Shuffle rows of the data
    np.random.seed(0)
    shuffled_indices = np.random.permutation(len(featureData))
    featureDataTraining = featureData[shuffled_indices]
    labelDataTraining = labelData[shuffled_indices]

    # Fit a Random Forest classifier to obtain the importance of each feature
    myClassifier = myClassifiers.RF(n_estimators=int(options.n_estimators),
                                    random_state=0)
    myClassifier.fit(featureDataTraining, labelDataTraining)

    # Importance of the feature is given by:
    featureImportances = myClassifier.feature_importances_
    if int(options.verbosity) > 0:
        print "Feature Importance:", featureImportances

    # Get the indices of the features according to their importance
    feature_rank_descend = np.argsort(
        featureImportances)[::-1]  # Descending Order

    # Print feature ranking
    if int(options.verbosity) > 0:
        for f in xrange(len(data_cols)):
            print "%d. feature %d (%f) %s" % (
                f + 1, feature_rank_descend[f],
                featureImportances[feature_rank_descend[f]],
                names[feature_rank_descend[f]])

    # Get Feature Importance from the classifier
    feature_rank_ascend = np.argsort(
        myClassifier.feature_importances_)  # Ascending Order

    ##########################################################################
    # Plotting feature importance
    ##########################################################################
    # Plot the importance of the feature as bar plot
    if int(options.plot_bar_without_std) == 1:
        myPlot.barh(np.arange(len(names)),
                    myClassifier.feature_importances_[feature_rank_ascend])
        myPlot.yticks(
            np.arange(len(names)) + 0.25,
            np.array(names)[feature_rank_ascend])
        _ = myPlot.xlabel('Relative importance')
        myPlot.savefig(out_folder +
                       '/withNames_RFClassifier_Feature_importance.svg',
                       bbox_inches='tight',
                       pad_inches=0.2)
        myPlot.show()
        myPlot.close()

    if int(options.plot_bar_with_std) == 1:
        std = np.std([
            tree.feature_importances_
            for tree in myClassifier.estimators_
        ],
                     axis=0)
        myPlot.figure()
        myPlot.title("Feature Importances")
        myPlot.bar(xrange(len(data_cols)),
                   featureImportances[feature_rank_descend],
                   color="r",
                   yerr=std[feature_rank_descend],
                   align="center")
        myPlot.xticks(xrange(len(data_cols)), feature_rank_descend)
        myPlot.xlim([-1, len(data_cols)])
        myPlot.savefig(out_folder +
                       '/RF_classifier_tssDist_include_Feature_importance.svg',
                       transparent=True,
                       bbox_inches='tight',
                       pad_inches=0.2)
        # pl.savefig(out_folder + '/RF_classifier_No_transperant_tssDist_include_Feature_importance.svg', bbox_inches='tight', pad_inches=0.2)
        myPlot.show()
        myPlot.close()

    # Plot the feature importances of the individual trees and of the forest
    if int(options.plot_line) == 1:
        myPlot.figure()
        myPlot.title("Feature Importances")

        for tree in myClassifier.estimators_:
            myPlot.plot(xrange(len(data_cols)),
                        tree.feature_importances_[feature_rank_descend],
                        "r")

        myPlot.plot(xrange(len(data_cols)),
                    featureImportances[feature_rank_descend], "b")
        myPlot.show()
        myPlot.close()

    # Keep only the features deemed important, dropping the rest
    featureDataTraining = myClassifier.fit(
        featureDataTraining, labelDataTraining).transform(featureDataTraining)
    if int(options.verbosity) > 0:
        print "The shape of the data after feature selection is", featureDataTraining.shape, "\n"
Example #5
def main(options, args):
    if len(sys.argv[1:]) == 0:
        print "no argument given!"
        parser.print_help()
        sys.exit(2)
    if not options.data_file:
        print "No Data File Present \n Aborting....."
        sys.exit(2)
    out_folder = options.output_folder
    data_cols = options.data_columns
    label_col = int(options.label_column)
    if int(options.verbosity) > 0:
        print "Options: ", options
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

##########################################################################
# Process data
##########################################################################
    # Get columns indices of the features
    data_cols = [int(x) for x in data_cols.split(",")]

    # Read features and labels
    featureData = np.loadtxt(options.data_file,
                             usecols=data_cols,
                             delimiter="\t",
                             skiprows=1)
    labelData = np.genfromtxt(options.data_file,
                              usecols=label_col,
                              delimiter="\t",
                              skip_header=1)

    #Perform same scaling on training and testing data
    labelDataShuffled, featureDataShuffled, shuffled_indices, scalerData = performScaling(featureData, labelData)

    scalerFile = options.output_folder + "/" + "Scaler_" + options.save_file + ".pkl"
    myDataLogger.dump(scalerData, scalerFile)

##########################################################################
# Training and parameter tuning
##########################################################################
    # Train and test on the whole dataset
    sanityResult = sanityCheck(featureDataShuffled, labelDataShuffled, int(options.verbosity))
    if sanityResult > 0:
        print "Data is Sane..... Hurray !!!"
    else:
        print "Data is corrupted... Please check the input files !!"
        # exit()

    # Get indices of the 2 classes separately to make a test dataset with a 20% split of the whole dataset in a stratified manner
    featureDataTrain, labelDataTrain, featureDataTest, labelDataTest = myDataShuffler(featureDataShuffled, labelDataShuffled, shuffled_indices, float(options.percent_test_size), int(options.verbosity))

    # Make a cross-validation iterator to tune the parameters by taking 20% of the remaining 80% of the training data using STRATIFIED shuffled 10-fold cross-validation
    crossValidator = myDataSplitter(labelDataTrain, int(options.fold_cross_validation), test_size=float(options.percent_test_size), random_state=0)

    #Apply the cross-validation iterator on the Training set using GridSearchCV
    if int(options.verbosity) > 0:
        print "Cross Validation is: ", crossValidator

    #Parallel processing during grid search
    SVM_C_min = int(options.SVM_C_min)
    SVM_C_max = int(options.SVM_C_max)
    SVM_gamma_min = int(options.SVM_gamma_min)
    SVM_gamma_max = int(options.SVM_gamma_max)

    C_range = 10.0 ** np.arange(SVM_C_min, SVM_C_max)
    gamma_range = 10.0 ** np.arange(SVM_gamma_min, SVM_gamma_max)
    param_grid = dict(gamma=gamma_range, C=C_range)
    if int(options.verbosity) > 0:
        print "Method Chosen: SVM\n"
    best_C, best_gamma = findParamatersSVM(crossValidator, featureDataTrain, labelDataTrain, param_grid, int(options.n_jobs), int(options.verbosity))

    myClassifier = myClassifiers.svm(kernel='rbf', gamma=best_gamma, C=best_C, random_state=0, probability=True)
    myClassifier.fit(featureDataTrain, labelDataTrain)

    # Below is a plot_learning_curve module that's provided by scikit-learn.
    # It allows us to quickly and easily visualize how well the model is
    # performing based on the number of samples we're training on. It helps to
    # understand situations such as high variance or bias.
    if int(options.verbosity) > 0:
        print "See how well the model is fitting - plot learning curve for testing and training dataset\n";
    title = "Learning Curves (Random Forests, C=%.9f)" %(best_C)
    learning_plot=plotLearningCurve(myClassifier, title, featureDataTrain, labelDataTrain, crossValidator, int(int(options.verbosity)))
    learning_curve_figure_file = options.output_folder + "/" + "Learning_Curve_" + options.save_file + ".svg"
    learning_plot.savefig(learning_curve_figure_file, bbox_inches='tight', pad_inches=0.2)
    learning_plot.show()
    learning_plot.close()

##########################################################################
# Testing
##########################################################################
    #Save the model to load afterwards
    filename_model = options.output_folder + "/" + "Model_" + options.save_file + ".pkl"
    myDataLogger.dump(myClassifier, filename_model, compress=9)

    #Predict on the test set
    predicted_labels = myClassifier.predict(featureDataTest)

    #Plot ROC Curve
    roc_plt = plotROC(myClassifier, featureDataTest, labelDataTest, predicted_labels, int(int(options.verbosity)))
    ROC_curve_figure_file = options.output_folder + "/" + "ROC_Curve_" + options.save_file + ".svg"
    roc_plt.savefig(ROC_curve_figure_file, bbox_inches='tight', pad_inches=0.2)
    roc_plt.show()
    roc_plt.close()

    if int(options.verbosity) > 0:
        print myScores.classification_report(labelDataTest, predicted_labels)
        print "Random Forests: Final Generalization Accuracy: %.6f" %myScores.accuracy_score(labelDataTest, predicted_labels)
Example #6
def main(options, args):
    ##########################################################################
    # Retrieve user defined options
    ##########################################################################
    if len(sys.argv[1:]) == 0:
        print "no argument given!"
        parser.print_help()
        sys.exit(2)
    if not options.data_file:
        print "No Data File Present \n Aborting....."
        sys.exit(2)
    out_folder = options.output_folder
    data_cols = options.data_columns
    label_col = int(options.label_column)

    if int(options.verbosity) > 0:
        print "Options: ", options
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
        print "output_folder is", options.output_folder, "\n"

##########################################################################
# Process data
##########################################################################
# Get columns indices of the features
    data_cols = [int(x) for x in data_cols.split(",")]

    # Read features and labels
    featureData = np.loadtxt(options.data_file,
                             usecols=data_cols,
                             delimiter="\t",
                             skiprows=1)
    labelData = np.genfromtxt(options.data_file,
                              usecols=label_col,
                              delimiter="\t",
                              skip_header=1)

    #Perform same scaling on training and testing data
    labelDataShuffled, featureDataShuffled, shuffled_indices, scalerData = performScaling(
        featureData, labelData)

    # Define lists to store ML measures
    scores = list()
    train_score = list()  #train_score
    test_score = list()  #test_score
    accuracySelectedCalssifier = list()  #Accuracy
    prec = list()  #Precision
    rec = list()  #Recall
    areaRoc = list()  #AUC
    fscore = list()  #F-measure

    # For random classification
    accuracyRandomClassifier = list()
    recRand = list()
    precRand = list()
    fscoreRand = list()

    # For Plotting ROC-curve
    random_mean_tpr_nFold = 0.0
    random_mean_fpr_nFold = np.linspace(0, 1, 100)
    random.seed(80)

    ##########################################################################
    # Training
    ##########################################################################
    #Generate object for n-fold CV
    skf = myDataSplitterFold(labelDataShuffled,
                             n_folds=int(options.fold_cross_validation))
    for feature_train_index, feature_test_index in skf:
        rand_list = list()
        featureDataTrain, featureDataTest = featureDataShuffled[
            feature_train_index], featureDataShuffled[feature_test_index]
        labelDataTrain, labelDataTest = labelDataShuffled[
            feature_train_index], labelDataShuffled[feature_test_index]

        if int(options.verbosity) > 0:
            print featureDataTrain, featureDataTest, "\n"

        #choose a method of your choice
        if options.method == "SVM":
            C = float(10**options.SVM_C)
            gamma = float(10**options.SVM_gamma)
            myClassifier = myClassifiers.svm(kernel='rbf',
                                             probability=True,
                                             C=C,
                                             gamma=gamma)
            if int(options.verbosity) > 0:
                print "Method chosen is: SVM\n"
        else:
            n_estimators = int(options.RF_n_estimators)
            max_depth = float(options.RF_max_depth)
            myClassifier = myClassifiers.RF(n_estimators=n_estimators,
                                            max_depth=max_depth)
            if int(options.verbosity) > 0:
                print "Method chosen is: Random Forest\n"

        # Calculate Scores for the Method Selected
        scores.append(
            myClassifier.fit(featureDataTrain,
                             labelDataTrain).score(featureDataTest,
                                                   labelDataTest))
        y_pred = myClassifier.predict(featureDataTest)
        predicted_label_score = myClassifier.predict_proba(featureDataTest)
        predicted_label_score = np.around(predicted_label_score, decimals=2)
        accuracySelectedCalssifier.append(
            myScores.accuracy_score(labelDataTest, y_pred))

        # Calculate accuracy for each cross-validation (classification + Random classification)
        for i in range(0, len(y_pred)):
            rand_list.append(random.randint(0, 1))

        y_rand = np.array(rand_list)
        accuracyRandomClassifier.append(
            myScores.accuracy_score(labelDataTest, y_rand))

        # Calculate precision, recall, fscore for each cross-validation (classification + Random classification)
        prec.append(
            myScores.precision_score(labelDataTest, y_pred, average='micro'))
        rec.append(
            myScores.recall_score(labelDataTest, y_pred, average='micro'))
        fscore.append(
            myScores.fbeta_score(labelDataTest,
                                 y_pred,
                                 average='micro',
                                 beta=0.5))
        precRand.append(
            myScores.precision_score(labelDataTest, y_rand, average='micro'))
        recRand.append(
            myScores.recall_score(labelDataTest, y_rand, average='micro'))
        fscoreRand.append(
            myScores.fbeta_score(labelDataTest,
                                 y_rand,
                                 average='micro',
                                 beta=0.5))
        areaRoc.append(
            myScores.roc_auc_score(labelDataTest, predicted_label_score[:, 1]))

        # Generate ROC curve for each cross-validation
        fpr, tpr, thresholds = myScores.roc_curve(
            labelDataTest, predicted_label_score[:, 1],
            pos_label=1)  # pos_label=1 marks the positive class
        random_mean_tpr_nFold += interpolator(random_mean_fpr_nFold, fpr, tpr)
        random_mean_tpr_nFold[0] = 0.0
        train_score.append(
            myClassifier.fit(featureDataTrain,
                             labelDataTrain).score(featureDataTrain,
                                                   labelDataTrain))
        test_score.append(
            myClassifier.fit(featureDataTest,
                             labelDataTest).score(featureDataTest,
                                                  labelDataTest))

    random_mean_tpr_nFold /= int(options.fold_cross_validation)
    random_mean_tpr_nFold[-1] = 1.0
    random_mean_auc_nFold = myScores.auc(random_mean_fpr_nFold,
                                         random_mean_tpr_nFold)

    if int(options.verbosity) > 0:
        print "scores are:\n", scores, "\n"

    combined_measures = zip(accuracySelectedCalssifier,
                            accuracyRandomClassifier, prec, precRand, rec,
                            recRand, fscore, fscoreRand, areaRoc)
    if int(options.verbosity) > 0:
        print "######################################################"
        print "1. Length Accuracy Selected Classifier", len(
            accuracySelectedCalssifier)
        print "2. Length Accuracy Random Classifier", len(
            accuracyRandomClassifier)
        print "3. Length Precision Selected Classifier", len(prec)
        print "4. Length Precision Random Classifier", len(precRand)
        print "######################################################"

##########################################################################
# Output and plotting
##########################################################################
    nFold_result_file = options.output_folder + "/" + "measures_" + options.save_file + ".txt"
    predictedOutput = open(nFold_result_file, 'w')
    predictedOutput.write(
        "Accuracy\tAccuracy_Rand\tPrecision\tPrecision_Rand\tRecall\tRecall_Rand\tFscore\tFscore_rand\tareaRoc\n"
    )
    for i in combined_measures:
        line = '\t'.join(str(x) for x in i)
        predictedOutput.write(line + '\n')
    mean_measures = str(np.mean(accuracySelectedCalssifier)) + "\t" + str(
        np.mean(accuracyRandomClassifier)) + "\t" + str(
            np.mean(prec)) + "\t" + str(np.mean(precRand)) + "\t" + str(
                np.mean(rec)) + "\t" + str(np.mean(recRand)) + "\t" + str(
                    np.mean(fscore)) + "\t" + str(
                        np.mean(fscoreRand)) + "\t" + str(np.mean(areaRoc))
    if int(options.verbosity) > 0:
        print "All the average measures are: ", mean_measures, "\n"
    predictedOutput.write(mean_measures + '\n')

    # Get variance across the cross-validation scores
    predictedOutput.write("Mean score n-fold: {0:.3f} (+/-{1:.3f})".format(
        np.mean(scores), myStats.sem(scores)) + '\n')
    predictedOutput.write("Mean train score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(train_score), myStats.sem(train_score)) + '\n')
    predictedOutput.write("Mean test score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(test_score), myStats.sem(test_score)) + '\n')
    predictedOutput.close()

    if int(options.verbosity) > 0:
        print "Area Under the ROC : ", areaRoc, "mean AUC", np.mean(areaRoc)

    # Print ROC curve for n-fold cross-validation
    myPlot.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Standard')
    myPlot.plot(random_mean_fpr_nFold,
                random_mean_tpr_nFold,
                'k--',
                label='Mean ROC (area = %0.2f)' % random_mean_auc_nFold,
                lw=2,
                color=(0.45, 0.42,
                       0.18))  #Plot mean ROC area in cross validation
    myPlot.xlim([-0.05, 1.05])
    myPlot.ylim([-0.05, 1.05])
    myPlot.xlabel('False Positive Rate')
    myPlot.ylabel('True Positive Rate')
    myPlot.title('ROC: %s-fold cross-validation' % options.fold_cross_validation)
    myPlot.legend(loc="lower right")

    # Save plot in svg format
    ROC_curve_figure_file = options.output_folder + "/" + "ROC-curve_" + options.save_file + ".svg"
    myPlot.savefig(ROC_curve_figure_file, bbox_inches='tight', pad_inches=0.2)
    myPlot.show()
    myPlot.close()
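A compact, self-contained sketch of the mean-ROC-over-folds pattern used above (interpolator is presumably np.interp); the fold count, classifier, and data are illustrative and use the current scikit-learn API:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=400, random_state=0)
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
skf = StratifiedKFold(n_splits=5)

for train_idx, test_idx in skf.split(X, y):
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X[train_idx], y[train_idx])
    prob = clf.predict_proba(X[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y[test_idx], prob, pos_label=1)
    mean_tpr += np.interp(mean_fpr, fpr, tpr)  # put each fold's TPR on a common FPR grid
    mean_tpr[0] = 0.0

mean_tpr /= skf.get_n_splits()
mean_tpr[-1] = 1.0
print("mean AUC over folds: %.3f" % auc(mean_fpr, mean_tpr))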