scores = cross_val_score(clf, X_new, y_2d, cv=5, scoring='accuracy')
    print "accuracy scores with 5 fold cross validation with reduced features for ", to_predict, scores
    print "mean of accuracy", scores.mean()


if __name__ == '__main__':
    # Script entry point (hard-coded locations): convert the quant.sf files
    # to CSV, load the training labels, then report progress and scores.
    #
    # Interactive alternative to the hard-coded locations below:
    # raw_path_string = raw_input("Enter path where data is located (Location of accession number dirs): ")
    # csv_path = raw_input("Enter path of directory to store csv files: ")
    # train_path = raw_input("Enter path of train csv file (Path upto p1_train.csv): ")
    slash = "/"
    raw_path_string = "/home/rasika/Documents/Computational Biology/Project/Data"
    csv_path = "/home/rasika/Documents/Computational Biology/Project/Result"
    train_path = "/home/rasika/Documents/Computational Biology/Project/p1_train_pop_lab.csv"

    # Build the csv files from the quant.sf files; the data directory is
    # handed over with the separator already appended.
    make_csv.make_csv_files(raw_path_string + slash, csv_path, slash)

    colnames1 = ["TPM"]

    # Training labels: first column -> (second column, third column).
    label_dict = dict()
    train_data = pd.read_csv(train_path, sep=",", header=0, dtype="unicode")
    for i, row in train_data.iterrows():
        label_dict[row[0]] = row[1], row[2]

    classifier_input = []

    # Single-argument print calls emit the same text as the print statement
    # form: items converted with str() and separated by one space.
    print("Starting reading csv files")
    print(datetime.datetime.now())
    print(" ".join(("accuracy scores with 5 fold cross validation for Population DT", str(scores))))
    print(" ".join(("mean of accuracy", str(scores.mean()))))

if __name__ == '__main__':
    # Interactive entry point: prompt for the three locations, convert the
    # quant.sf files to CSV, load the training labels and list the CSV
    # directory.
    slash = "\\"
    raw_path_string = raw_input("Enter path where data is located (Location of accession number dirs): ")
    csv_path = raw_input("Enter path of directory to store csv files: ")
    train_path = raw_input("Enter path of train csv file (Path upto p1_train.csv): ")
    # Hard-coded alternative to the prompts above:
    # raw_path_string = '/home/rasika/Documents/Computational Biology/Project/Data'
    # csv_path = '/home/rasika/Documents/Computational Biology/Project/Result'
    # train_path = '/home/rasika/Documents/Computational Biology/Project/p1_train_pop_lab.csv'

    # Build the csv files from the quant.sf files, keeping the 'Name' column
    # in addition to the columns listed in colnames1.
    colnames1 = ["TPM", "Length"]
    make_csv.make_csv_files(
        raw_path_string + slash,
        csv_path,
        slash,
        ["Name"] + colnames1,
    )

    # Training labels: first column -> (second column, third column).
    label_dict = dict()
    train_data = pd.read_csv(train_path, sep=",", header=0, dtype="unicode")
    for i, row in train_data.iterrows():
        label_dict[row[0]] = row[1], row[2]

    classifier_input = []

    print("Starting reading csv files")
    print(datetime.datetime.now())

    files = listdir(csv_path)
def main():
    """Run the whole pipeline from the command line and return its scores.

    Seven positional arguments are read from ``sys.argv``:

    1. ``model_dump_path``    -- stored in the module global of that name
    2. ``raw_path_string``    -- data directory; handed to
       ``make_csv.make_csv_files`` with ``slash`` appended
    3. ``csv_path``           -- directory of the generated csv files
       (module global)
    4. ``train_path``         -- csv file holding the training labels
       (module global)
    5. ``slash``              -- path separator (module global)
    6. ``eq_class``           -- the exact string ``'False'`` builds the
       dataframe with ``create_dataframe``; any other value uses
       ``parse_eq_classes.create_dataframe_with_eq_class``
    7. ``dataframe_csv_path`` -- forwarded to the equivalence-class builder

    Returns:
        dict: maps ``'Population'``, ``'Sequence Center'`` and
        ``'Population and Sequence Center'`` to the two-element tuple
        returned by the matching ``predict_*`` helper (unpacked in the
        original code as f1 score and accuracy).

    Raises:
        IndexError: if fewer than seven arguments were supplied. The check
            runs before any module global is overwritten.
    """
    global model_dump_path
    global slash
    global csv_path
    global train_path

    args = sys.argv
    if len(args) < 8:
        # Same exception type an out-of-range args[...] lookup would raise,
        # but with an actionable message and no half-updated globals.
        raise IndexError(
            "expected 7 arguments (model_dump_path raw_path_string csv_path "
            "train_path slash eq_class dataframe_csv_path), got %d"
            % max(len(args) - 1, 0))

    (model_dump_path, raw_path_string, csv_path, train_path, slash,
     eq_class, dataframe_csv_path) = args[1:8]

    col_names = ['TPM', 'Length']

    # make csv files from quant.sf files
    make_csv.make_csv_files(raw_path_string + slash, csv_path, slash,
                            ['Name'] + col_names)

    # Store the labels from the train file in a dictionary:
    # first column -> (second column, third column).
    # Columns are addressed by position through .iloc; integer keys on a
    # string-labelled row Series are deprecated in recent pandas releases.
    train_data = pd.read_csv(train_path, sep=',', header=0, dtype='unicode')
    label_dict = dict(zip(
        train_data.iloc[:, 0],
        zip(train_data.iloc[:, 1], train_data.iloc[:, 2])))

    print("Started reading csv files")
    print(datetime.datetime.now())

    # Read the generated csv files and assemble them into one dataframe.
    files = listdir(csv_path)
    if eq_class == 'False':
        # A copy keeps the two column-list arguments distinct objects, as
        # they were when the first one was written as a separate literal.
        df = create_dataframe(files, list(col_names), col_names, label_dict)
    else:
        df = parse_eq_classes.create_dataframe_with_eq_class(
            raw_path_string, csv_path, train_path, slash, label_dict,
            dataframe_csv_path)

    print("Read all csv files, created dataframe")
    print(datetime.datetime.now())

    # created dataframe will have following format
    # Name       TPM_1  TPM_2  TPM_3  TPM_4 ....  TPM_199324  label
    # ERR188021  value  value  value  value ....     value     TSI
    # ERR188022    .      .      .      .   ....       .       CEU
    #   .          .      .      .      .   ....       .        .
    #   .          .      .      .      .   ....       .        .
    #   .          .      .      .      .   ....       .        .

    # Run the three predictions in their original order.
    scores = {}
    for target, predict in (
            ('Population', predict_population),
            ('Sequence Center', predict_sequence_center),
            ('Population and Sequence Center',
             predict_population_seq_center)):
        f1, accuracy = predict(df)
        scores[target] = (f1, accuracy)

    return scores