Example 1
def ComputeDiffInOptimalPolicyForMdp(mdp_folder, gammas, vi_file, pi_file,
                                     adjustForGoalState, outputFile):
    diffs = {}
    for gamma in gammas:
        v_file = mdp_folder + "/" + str(gamma) + "/" + vi_file
        p_file = mdp_folder + "/" + str(gamma) + "/" + pi_file
        vi = pd.read_csv(v_file, header=None).sort_values(
            [0, 1])[[3, 4, 5, 6]].values
        pi = pd.read_csv(p_file, header=None).sort_values(
            [0, 1])[[3, 4, 5, 6]].values
        if (adjustForGoalState):
            vi = vi[0:-1, :]
            pi = pi[0:-1, :]
        #diff = np.where(vi>0,1,0)-np.where(pi>0,1,0)
        diff = np.max(np.abs(vi - pi), axis=1)
        non_zero = np.count_nonzero(diff)
        diffs[gamma] = non_zero
    lines = []
    if (outputFile):
        for key in gammas:
            line = "{0},{1}".format(key, diffs[key])
            lines.append(line)
        u.WriteTextArrayToFile(mdp_folder + "/policy_diff.csv", lines)
    return diffs
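A minimal usage sketch for the function above; the per-gamma folder layout matches the other examples here, but every concrete path and file name is hypothetical.

gammas = [0.5, 0.9, 0.99]
# Counts, per gamma, how many states get a different optimal action under
# value iteration vs. policy iteration.
diffs = ComputeDiffInOptimalPolicyForMdp(
    "Output/LargeMdpRwTraps50",  # hypothetical mdp_folder
    gammas,
    "vi_10000.policy.csv",       # hypothetical VI policy file in each gamma folder
    "pi_10000.policy.csv",       # hypothetical PI policy file in each gamma folder
    adjustForGoalState=True,     # drop the terminal/goal state row
    outputFile=True)             # also writes <mdp_folder>/policy_diff.csv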
Example 2
def ComputeFinalResults(rootfolder):
    clustering_stats = pd.read_csv(rootfolder+"/clustering.csv")
    final_output_file = rootfolder+"/best_metrics.csv"
    data = u.FilterRows(clustering_stats, lambda x : x['p'] == 2)
    dim_red = ["ica","pca","rp","mi"]
    clustering = ["kmeans","gmm"]
    lines = []
    lines.append("clustering,dim_red,k,p,ami_raw,ami_true,sc,bic")
    raw_predictions = {}
    for c in clustering:
        d = data.loc[(data['dim_red_method'] == "raw") & (data['clustering'] == c),:]
        d = d.loc[d['bic'] == np.min(d['bic']),:]
        clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo=raw.csv".format(c,d.iloc[0]['k'])
        raw_predictions[c] = np.loadtxt(clusters_file,delimiter=',')

    for dr in dim_red:
        for c in clustering:
            d = data.loc[(data['dim_red_method'] == dr) & (data['clustering'] == c),:]
            d = d.loc[d['bic'] == np.min(d['bic']),:]
            clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo={2}.csv".format(c,d.iloc[0]['k'],dr)
            predicted = np.loadtxt(clusters_file,delimiter=',')
            ami = metrics.adjusted_mutual_info_score(raw_predictions[c],predicted)
            lines.append(u.ConcatToStr(",", [
                c, dr, d.iloc[0]['k'], d.iloc[0]['p'], ami,
                d.iloc[0]['ami_true'], d.iloc[0]['sc'], d.iloc[0]['bic']
            ]))

    u.WriteTextArrayToFile(final_output_file,lines)
Example 3
def ComputeDiffInOptimalValueForMdp(mdp_folder, gammas, vi_file, pi_file,
                                    adjustForGoalState, outputFile):
    diffs = {}
    for gamma in gammas:
        v_file = mdp_folder + "/" + str(gamma) + "/" + vi_file
        p_file = mdp_folder + "/" + str(gamma) + "/" + pi_file
        vi = pd.read_csv(v_file, header=None).sort_values([0, 1])[2].values
        pi = pd.read_csv(p_file, header=None).sort_values([0, 1])[2].values
        diff = vi - pi
        diff[np.abs(diff) < 0.01] = 0
        pos_ind = np.where(diff > 0)
        neg_ind = np.where(diff < 0)
        if ((pos_ind[0].size > 0) & (neg_ind[0].size > 0)):
            diffs[gamma] = np.nan
            continue
        elif (pos_ind[0].size > 0):
            diffs[gamma] = 1
        elif (neg_ind[0].size > 0):
            diffs[gamma] = -1
        else:
            diffs[gamma] = 0
    lines = []
    if (outputFile):
        for key in gammas:
            line = "{0},{1}".format(key, diffs[key])
            lines.append(line)
        u.WriteTextArrayToFile(mdp_folder + "/value_diff.csv", lines)
    return diffs
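The sign encoding above collapses the per-state value gap into one label per gamma; a self-contained sketch of just that rule, for illustration:

import numpy as np

def classify_value_gap(vi, pi, tol=0.01):
    # nan if VI is better in some states and worse in others, +1 if VI is
    # uniformly at least as good (strictly better somewhere), -1 for the
    # mirror case, 0 if the two value functions agree within tolerance.
    diff = np.asarray(vi, dtype=float) - np.asarray(pi, dtype=float)
    diff[np.abs(diff) < tol] = 0
    has_pos, has_neg = (diff > 0).any(), (diff < 0).any()
    if has_pos and has_neg:
        return np.nan
    return 1 if has_pos else (-1 if has_neg else 0)

print(classify_value_gap([1.0, 2.0, 3.0], [0.5, 2.0, 3.0]))  # prints 1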
Example 4
def ComputePerformanceOnRealTestSet(model_info,
                                    root,
                                    outputfile,
                                    weka_jar,
                                    pos_class,
                                    compute_accuracy=False):

    models = ['ada', 'dt', 'svm', 'nnets', 'knn']
    f = []
    t = []
    for model in models:
        if ((model == 'ada') | (model == 'dt')):
            testfile = glob.glob("{0}/*.realtest.arff".format(root))[0]
            modelfile = "{0}/{1}".format(root, model_info[model])
            if (model == 'ada'):
                _outputfile = "{0}/realtest.prediction.ada.csv".format(root)
                _f, _t = ComputeWekaSavedModelPerformance(
                    testfile, modelfile, weka_jar, _outputfile,
                    ada.GetWekaCommandLineForConfig, pos_class,
                    compute_accuracy)
            else:
                _outputfile = "{0}/realtest.prediction.dt.csv".format(root)
                _f, _t = ComputeWekaSavedModelPerformance(
                    testfile, modelfile, weka_jar, _outputfile,
                    dt.GetWekaCommandLineForConfig, pos_class,
                    compute_accuracy)
        else:
            datafolder = "{0}/i-0_t-80_ts-{1}".format(root,
                                                      model_info[model][1])
            testfile = glob.glob(
                "{0}/*.realtest.preprocessed.data*".format(datafolder))[0]
            labelfile = glob.glob(
                "{0}/*.realtest.preprocessed.label*".format(datafolder))[0]
            _f, _t = ComputeSklearnSavedModelPerformance(
                testfile, labelfile, root + '/' + model_info[model][0],
                pos_class, compute_accuracy)
        f.append(_f)
        t.append(_t)
    file = root + '/' + outputfile
    lines = [u.ConcatToStr(",", models), u.ConcatToStr(",", f)]
    u.WriteTextArrayToFile(file, lines)
    lines = [u.ConcatToStr(",", models), u.ConcatToStr(",", t)]
    u.WriteTextArrayToFile(file.replace(".csv", ".time.csv"), lines)
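The model_info layout implied by the branches above: the Weka models ('ada', 'dt') map to a single saved-model path relative to root, while the sklearn models map to a pair whose second element is the train-size percentage used to locate the preprocessed real-test files. A hypothetical instance:

# Every path below is a placeholder; only the shape of the values matters.
model_info = {
    "ada":   "i-0_t-80_ts-100/ada/prune-False_iter-50/model.model",
    "dt":    "i-0_t-80_ts-100/dt/prune-True/model.model",
    "svm":   ("svm.model", 100),    # (saved sklearn model, train size %)
    "nnets": ("nnets.model", 100),
    "knn":   ("knn.model", 100),
}
ComputePerformanceOnRealTestSet(model_info, "Output/creditscreening",
                                "realtest.performance.csv", "weka.jar",
                                pos_class="yes")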
Example 5
def write_to_file(data_to_write, filepath):
    # Note: include_header and arff_format_predata_lines come from the
    # enclosing scope; this helper is a closure over them.
    file = u.PreparePath(filepath)
    data_to_write.to_csv(file,
                         index=False,
                         header=(include_header &
                                 (arff_format_predata_lines is None)))
    if (arff_format_predata_lines is not None):
        # prepend the ARFF header lines before the CSV rows
        data = []
        data.extend(arff_format_predata_lines)
        data.extend(u.ReadLinesFromFile(file))
        u.WriteTextArrayToFile(file, data)
Example 6
def ComputeDiffInVI_PI_Q(rootFolder):
    output = rootFolder + "/pi_q_comparison.csv"
    pi_file = rootFolder + "/LargeMdpRwTraps50/0.99/pi_10000.policy.csv"
    q_file = rootFolder + "/LargeMdpRwTraps50/0.99/ql_10000_alpha=1.0_po=boltzmann_p=100.0.policy.csv"
    diff = ComparePolicies(pi_file, q_file)
    lines = []
    lines.append("LargeMdp," + str(diff))
    pi_file = rootFolder + "/SmallMdpRwTraps/0.99/pi_10000.policy.csv"
    q_file = rootFolder + "/SmallMdpRwTraps/0.99/ql_1000_alpha=1.0_po=greedyepsilon_p=0.1.policy.csv"
    diff = ComparePolicies(pi_file, q_file)
    lines.append("SmallMdp," + str(diff))
    u.WriteTextArrayToFile(output, lines)
Example 7
def ComputeDiffInVI_PI_Q_1(rootFolder):
    output = rootFolder + "/pi_q_comparison.csv"
    rootFolder1 = r"C:/Users/shkhandu/OneDrive/Gatech/Courses/ML/Assignment4/OutputNew1"
    pi_file = rootFolder1 + "/LargeMdpRwTraps50/0.99/pi_10000.policy.csv"
    q_file = rootFolder + "/LargeMdpRwTraps50/0.99/ql_10000_alpha=1.0_po=boltzmann_p=100.0.policy.csv"
    diff = ComparePolicies(pi_file, q_file)
    lines = []
    lines.append("LargeMdp," + str(diff))
    pi_file = rootFolder1 + "/SmallMdpRwTraps/0.99/pi_10000.policy.csv"
    q_file = rootFolder + "/SmallMdpRwTraps/0.99/ql_5000_alpha=0.1_po=greedyepsilon_p=0.05.policy.csv"
    diff = ComparePolicies(pi_file, q_file)
    lines.append("SmallMdp," + str(diff))
    u.WriteTextArrayToFile(output, lines)
Example 8
def RunAdaBoostWithDecisionTreesToGeneratePerIterationMetrics(
        datasets_root_folder, weka_jar_path, dataset_filter, iters, inst,
        use_arff_files=True):
    """
    weightThreshold parameter: http://weka.8497.n7.nabble.com/AdaBoost-Parameters-td11830.html
    """
    file_extn = "arff" if use_arff_files else "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,file_extn))
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if(dataset_filter not in dataset_dir):
            continue
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir+"/ada",is_file=False)
        config_gen = ParameterGrid({'prune':[True,False],'iter':iters})
        for config in config_gen:
            id = GetIdForConfig(config)
            config["inst"] = inst
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict=sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root,id),is_file=False)
            params_output_file=u.PreparePath("{0}/{1}.params.txt".format(run_output_dir,id))
            model_output_file=u.PreparePath("{0}/{1}.model".format(run_output_dir,id))
            train_output_file=u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir,id))
            full_train_output_file=u.PreparePath("{0}/{1}.fulltrain.predictions.csv".format(run_output_dir,id))
            test_output_file=u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir,id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"]="last"
            config["trainpredictionoutputfile"]=train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config,False,False)
            config["modelbuildtimesecs"] = timeit.timeit(lambda: sl.RunCmdWithoutConsoleWindow(cmd),number=1)

            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config,True)
            config["modelevaltimesecs"] = timeit.timeit(lambda : sl.RunCmdWithoutConsoleWindow(cmd),number=1)
            os.remove(model_output_file)

            config.pop('random_state',None) # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k,config[k]))
            u.WriteTextArrayToFile(params_output_file,params_info)
        print("done dataset : " + dataset_dir)
Example 9
def CreateArffFileFromCsv(arff_attr_info,
                          arff_file_path,
                          data_text_array,
                          isFile=False,
                          hasHeader=True):
    arff_data = []
    arff_data.extend(arff_attr_info)
    data_text_array = u.ReadLinesFromFile(data_text_array) if (
        isFile) else data_text_array
    data_text_array = data_text_array[1:] if (isFile
                                              & hasHeader) else data_text_array
    arff_data.extend(data_text_array)
    file = u.PreparePath(arff_file_path)
    u.WriteTextArrayToFile(file, arff_data)
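A minimal sketch of driving this helper with a CSV that already exists on disk; the attribute lines and paths are hypothetical.

arff_attr_info = [
    "@relation credit",
    "@attribute age numeric",
    "@attribute label {yes,no}",
    "@data",
]
CreateArffFileFromCsv(arff_attr_info,
                      "out/credit.train.arff",  # hypothetical output path
                      "out/credit.train.csv",   # existing CSV with a header row
                      isFile=True,
                      hasHeader=True)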
Example 10
def EvaluateExperiments(datasets_root_folder,
                        params_to_keep,
                        positive_class,
                        metric_calculation_fn,
                        evaluation_output_filename="performance.csv",
                        algo_folder="dt",
                        should_eval=lambda x: True):

    headers = []
    headers.extend(params_to_keep)
    headers.extend(['istrain', 'p', 'r', 'm'])
    headers = ",".join(headers)
    evals = []
    evals.append(headers)
    for directory in u.Get_Subdirectories(datasets_root_folder):
        # each directory is a dataset directory
        dt_output_dir = "{0}/{1}".format(directory, algo_folder)
        if (os.path.isdir(dt_output_dir) == False):
            continue
        for run_output_folder in u.Get_Subdirectories(dt_output_dir):
            if (should_eval(run_output_folder) == False):
                print("ignoring : {0}".format(run_output_folder))
                continue
            # read params file
            params_file_path = glob.glob(
                "{0}/*.params.txt".format(run_output_folder))[0]
            params = sl.GetDictionary(u.ReadLinesFromFile(params_file_path))
            values = []
            for k in params_to_keep:
                if (k in params):
                    values.append(str(params[k]))
                else:
                    values.append(str(np.nan))
            p, r, f = metric_calculation_fn(
                params["trainpredictionoutputfile"], positive_class)
            train_performance_values = "{0},1,{1},{2},{3}".format(
                ",".join(values), str(p), str(r), str(f))
            evals.append(train_performance_values)
            if (os.path.isfile(params["testpredictionoutputfile"])):
                p, r, f = metric_calculation_fn(
                    params["testpredictionoutputfile"], positive_class)
                test_performance_values = "{0},0,{1},{2},{3}".format(
                    ",".join(values), str(p), str(r), str(f))
                evals.append(test_performance_values)
    u.WriteTextArrayToFile(
        u.PreparePath("{0}/{1}".format(datasets_root_folder,
                                       evaluation_output_filename)), evals)
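EvaluateExperiments treats metric_calculation_fn as a callback taking (predictions_file, positive_class) and returning three numbers. A sketch of such a callback, assuming the "actual,predicted" CSV layout the other examples write:

import pandas as pd
from sklearn import metrics

def PrecisionRecallF1(predictions_csv, positive_class):
    # Read an "actual,predicted" prediction file and score the positive class.
    df = pd.read_csv(predictions_csv)
    y, y_hat = df["actual"], df["predicted"]
    return (metrics.precision_score(y, y_hat, pos_label=positive_class),
            metrics.recall_score(y, y_hat, pos_label=positive_class),
            metrics.f1_score(y, y_hat, pos_label=positive_class))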
Example 11
def ComputeFinalResults1(rootfolder,clusters,dims):
    raw_results = GetBestRawClustering(rootfolder)
    clustering_results = pd.read_csv(rootfolder+"/clustering.csv")
    final_output_file = rootfolder+"/best_metrics.csv"
    best_raw_clustering_output = rootfolder+"/best_raw_clustering.csv"
    o = pd.concat([raw_results["kmeans"][1], raw_results["gmm"][1]])
    o.to_csv(best_raw_clustering_output)
    dim_red = ["mi","pca","ica","rp"]
    lines = []
    lines.append("clustering,dim_red,k,p,ami_raw,ami_true,sc,bic")
    raw_predictions = {}

    output = None
    for dr in dim_red:
        data = u.FilterRows(clustering_results,lambda x : x["dim_red_method"] == dr)
        dim_best_val = None
        dim_result = None
        for dim in dims:
            best = {} # {p,k,ami}
            for cluster_mthd in ["kmeans","gmm"]:
                for cluster in clusters:
                    print("{0},{1},{2},{3}".format(dr,str(dim),cluster_mthd,str(cluster)))
                    d = data.loc[(data['clustering'] == cluster_mthd)&(data['k'] == cluster) & (data['p'] == dim)]
                    row = d.head(1).copy()
                    if(cluster_mthd not in best or best[cluster_mthd]['bic'].iloc[0] > row['bic'].iloc[0]):
                        best[cluster_mthd] = row
            curr_val = (best["kmeans"]['ami_true'].iloc[0] + best["gmm"]['ami_true'].iloc[0]) / 2
            #curr_val = np.minimum(best["kmeans"]['ami_true'].iloc[0], best["gmm"]['ami_true'].iloc[0])
            if(dim_best_val is None or dim_best_val < curr_val):
                dim_best_val = curr_val
                dim_result = best.copy()
        for c in ["kmeans","gmm"]:
            ami_raw = GetAmiWithRawPredictions(rootfolder,raw_results,dr,dim_result[c].iloc[0]["p"],dim_result[c].iloc[0]["k"],c)
            lines.append("{0},{1},{2},{3},{4},{5},{6},{7}".format(c,str(dim_result[c].iloc[0]["dim_red_method"]),str(dim_result[c].iloc[0]["k"]),str(dim_result[c].iloc[0]["p"]),str(ami_raw[c]),str(dim_result[c].iloc[0]["ami_true"]),str(dim_result[c].iloc[0]["sc"]),str(dim_result[c].iloc[0]["bic"])))
        #if(output is None):
        #    output = pd.concat([dim_result["kmeans"],dim_result["gmm"]])
        #else:
        #    output = pd.concat([output,dim_result["kmeans"],dim_result["gmm"]])

    u.WriteTextArrayToFile(final_output_file,lines)
Example 12
def RunDecisionTreesWithOptimalInst(datasets_root_folder,
                                    weka_jar_path,
                                    cv_results_file,
                                    use_arff_files=True):
    file_extn = "arff" if use_arff_files else "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:

            # selects the unpruned runs' held-out-set rows from the cv results
            filter = lambda x: ((x['prune'] == False) &
                                (x[filter_name] == filter_val) &
                                (x['istrain'] == 1))
            filtered_rows = u.FilterRows(cv_results, filter)
            a = filtered_rows['m']
            if (len(a) == 0):
                print("ignoring : {0}".format(dataset_dir))
                continue
            b = np.max(filtered_rows['m'])
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs]
            best_insts = best_insts.iloc[0]['inst']
            config['inst'] = best_insts

            id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Example 13
def GenerateDatasetSplits(rootFolder,
                          dataset_folder_prefix,
                          dataset,
                          test_ratio,
                          train_ratio,
                          validation_ratio,
                          train_size_percentages,
                          class_col,
                          random_state,
                          arff_attr_info=None):
    """
    train_size_percentages is a list of intergers specifying the
    percent of train set to be taken while preparing the dataset

    test_ratio,train_ratio,validation_ratio : numbers in percentages
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    if (validation is not None):
        validation_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        validation.to_csv(validation_output_file_csv, index=False)
        test_output_file_csv = u.PreparePath("{0}/i-{1}.realtest.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
        test_output_file_arff = u.PreparePath("{0}/i-{1}.realtest.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    else:
        test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)

    # now creating the train set partitions
    for train_set_size in train_size_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_ts-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, train_set_size))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.train.csv".format(folder_path,
                                                      dataset_folder_prefix,
                                                      train_ratio,
                                                      train_set_size))
        rows_to_keep = int(len(train) * train_set_size / 100)
        train.head(rows_to_keep).to_csv(csv_output_file, index=False)
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_ts-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    train_set_size))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)

        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "train_split_percent_used={0}".format(train_set_size)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.params.txt".format(folder_path,
                                                       dataset_folder_prefix,
                                                       train_ratio,
                                                       train_set_size))
        u.WriteTextArrayToFile(params_out_file, params_info)
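A hypothetical call producing the i-*_t-*_ts-* folder names that the classifier runners above glob for; the source DataFrame and column name are placeholders.

df = pd.read_csv("creditscreening.csv")  # hypothetical source dataset
GenerateDatasetSplits(
    "Output",              # rootFolder
    0,                     # dataset_folder_prefix -> folders like i-0_t-80_T-20
    df,
    test_ratio=20, train_ratio=80, validation_ratio=0,
    train_size_percentages=[30, 50, 100],
    class_col="label",
    random_state=42)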
Example 14
def RunExperiments(X,Y,rootfolder,clusters,dims,compute_acc=None):
    datasets = {}
    datasets["raw"] = (X,Y)
    err_series = []
    decorations = {}
    decorations["pca"] = ("o","r","pca")
    decorations["ica"] = ("x","b","ica")
    decorations["rp"] = ("+","g","rp")
    decorations["mi"] = ("o","k","mi")
    flags = [True,True,True,True]
    nn_output_lines = []
    nn_output_file = rootfolder + "/nn.csv"
    if(compute_acc is not None):
        h,l = CreateOutputLineForNN(RunNeuralNetwork(X,Y,10,compute_acc,False),"raw")
        nn_output_lines.append(h)
        nn_output_lines.append(l)

    # shared across the PCA/ICA/RP branches below; initialized up front so the
    # later branches still work when flags[0] (PCA) is disabled
    recons_err_plot = u.PreparePath(rootfolder + "/plots/err.png")
    recons_err_dict = []
    ################### PCA #####################
    if(flags[0]):
        pca_results = PerformPca(X,Y,dims,0)
        pca_var_explained_plot = u.PreparePath(rootfolder + "/plots/pca/var.png")
        var_y = []
        err_y = []

        for dim in dims:
            key = "pca_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(pca_results["{0}data".format(key)]),Y)
            err_y.append(pca_results[key+"reconstruction_error"])
            var_y = pca_results[key+"explained_var_ratio"]
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"pca")
            #    #nn_output_lines.append(h)
            #    nn_output_lines.append(l)

        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="dimensions",y1_axis_name="% explained variance",filename=pca_var_explained_plot)

    ################### ICA #####################

    if(flags[1]):
        ica_kt_plot = u.PreparePath(rootfolder + "/plots/ica/kt.png")
        err_y = []
        ica_results = PerformIca(X,Y,dims,0)
        for dim in dims:
            key = "ica_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(ica_results[key+"data"]),Y)
            err_y.append(ica_results[key+"reconstruction_error"])
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"ica")
            #    nn_output_lines.append(l)

        var_y = ica_results["ica_kt_all"]
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="components",y1_axis_name="kurtosis",filename=ica_kt_plot)

    ################### RP #####################
    if(flags[2]):
        rp_runs_plot = u.PreparePath(rootfolder + "/plots/rp/runs.png")
        err_y = []
        runs = 10
        rp_results = PerformRandomProjections(X,Y,dims,runs)
        runs_series = []
        markers = u.GetColorCombinations(10)
        i=0
        for dim in dims:
            key = "rp_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(rp_results[key+"data"]),Y)
            err_y.append(rp_results[key+"reconstruction_error"])
            runs_ser = u.YSeries(rp_results[key+"reconstruction_errors_all"],xvalues=np.arange(runs)+1,points_marker = "o",line_color = markers[i]["color"],plot_legend_label="proj dims = "+str(dim))
            runs_series.append(runs_ser)
            i = i + 1
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"rp")
            #    nn_output_lines.append(l)

        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["rp"][0],line_color=decorations["rp"][1],plot_legend_label=decorations["rp"][2])
        recons_err_dict.append(ser)
        u.SaveDataPlotWithLegends(runs_series,x_axis_name="run number",y1_axis_name="reconstruction err",filename=rp_runs_plot)

        u.SaveDataPlotWithLegends(recons_err_dict,x_axis_name="dimensions",y1_axis_name="reconstruction_error",filename=recons_err_plot)

    ###################### MI Feature Selection #########################
    if(flags[3]):
        mi_results = PerformMiBasedFeatureSelection(X,Y,dims,10)
        mi_plot = u.PreparePath(rootfolder + "/plots/mi/scores.png")
        for dim in dims:
            key = "mi_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(mi_results[key+"data"]),Y)
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"mi")
            #    nn_output_lines.append(l)
        ser = u.YSeries(mi_results["scores"],xvalues = np.arange(len(mi_results["scores"])) + 1,points_marker=decorations["mi"][0],line_color=decorations["mi"][1],plot_legend_label=decorations["mi"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="feature number",y1_axis_name="mutual information", filename=mi_plot)

    ###################### CLUSTERING #########################
    clustering_output_file = rootfolder + "/clustering.csv"
    clustering_plots_output_root = u.PreparePath(rootfolder + "/plots")
    lines = []
    lines.append("clustering,dim_red_method,k,p,ami_raw,ami_true,sc,bic")
    raw_clustering_results = {}
    best_bic_raw_clustering = {}
    curr_best_bic = {}
    actual_labels = Y
    for dim in dims:
        for algo in ["raw","ica","rp","mi","pca"]:
            raw_data_plot_done = False
            key = "{0}_{1}_".format(algo,str(dim))
            if(algo == "raw"):
                key = "raw"
            dataset = datasets[key]
            for cluster in clusters:
                for mthd in ["kmeans","gmm"]:
                    raw_key = "{0}_{1}".format(str(cluster),mthd)
                    print("doing clustering for dim = {0} {1} k = {2} {3}".format(str(dim),algo,str(cluster), mthd))
                    c_key = "{0}_{1}_predicted".format(mthd,str(cluster))
                    c_key1 = "{0}_{1}_".format(mthd,str(cluster))
                    if(algo == "raw" and raw_key in raw_clustering_results):
                        results = raw_clustering_results[raw_key]
                    else:
                        #if(algo == "raw" and cluster == 2 and compute_acc):
                        #    results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        #    h,l = CreateOutputLineForNN(RunNeuralNetwork(results[c_key.replace("predicted","new_data")],dataset[1],10,compute_acc),mthd)
                        #    nn_output_lines.append(l)
                        #else:
                        results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        if(algo == "raw"):
                            raw_clustering_results[raw_key] = results
                        if(compute_acc):
                            mthd_key = mthd+algo if algo == "raw" else mthd+algo+str(cluster)+str(dim)
                            if((mthd_key not in curr_best_bic) or (curr_best_bic[mthd_key] > results[c_key1+"bic"])):
                                curr_best_bic[mthd_key] = results[c_key1+"bic"]
                                best_bic_raw_clustering[mthd_key] = (results[c_key1+"new_data"],dataset[1],results[c_key1+"metrics"]["ami"],results[c_key1+"bic"])
                                print("new best {0} {1}".format(c_key1,str(results[c_key1+"bic"])))

                    clustering_prediction_file = u.PreparePath(rootfolder + "/clustering_output/mthd={0}_k={1}_d={2}_algo={3}.csv".format(mthd,str(cluster),str(dim),algo))
                    np.savetxt(clustering_prediction_file,results[c_key])
                    bic = c_key.replace("predicted","bic")
                    bic = results[bic]
                    act = ComputeClusteringMetrics(actual_labels,results[c_key],dataset[0])
                    raw = ComputeClusteringMetrics(raw_clustering_results[raw_key][c_key],results[c_key],dataset[0])
                    line = "{0},{1},{2},{3},{4},{5},{6},{7}".format(mthd,algo,str(cluster),str(dim),str(raw["ami"]),str(act["ami"]),str(raw["sl"]),str(bic))
                    print(line)
                    plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_{3}.png".format(mthd,str(cluster),algo,str(dim))
                    #if(mthd == "gmm"):
                    #    prob_output_file = rootfolder + "/{0}_{1}_{2}_{3}.csv".format(mthd,str(cluster),algo,str(dim))
                    #    np.savetxt(prob_output_file,results[c_key.replace("predicted","prob")],delimiter=",")
                    ScatterPlotForClustering(results[c_key],actual_labels,plot_output_file)
                    if(dim == 2 and algo != "raw"):
                        if(raw_data_plot_done == False):
                            plot_output_file = clustering_plots_output_root + "/{0}_{1}_data.png".format(mthd,algo)
                            ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],np.zeros_like(actual_labels),actual_labels,plot_output_file)
                            raw_data_plot_done = True
                        plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_data.png".format(mthd,str(cluster),algo)
                        ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],results[c_key],actual_labels,plot_output_file)
                    lines.append(line)

    #if(compute_acc):
    #    keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","gmmpca":"pca","gmmica":"ica","gmmrp":"rp","gmmmi":"mi"}
    #    for key in keys_to_output.keys():
    #        if("raw" not in key):
    #            curr_best = None
    #            for cluster in clusters:
    #                datakey = key+str(cluster)
    #                if(curr_best is None or best_bic_raw_clustering[datakey][2] > curr_best):
    #                    curr_best = best_bic_raw_clustering[datakey][2]
    #                    _X = best_bic_raw_clustering[datakey][0]
    #                    _Y = best_bic_raw_clustering[datakey][1]
    #        else:
    #            _X = best_bic_raw_clustering[key][0]
    #            _Y = best_bic_raw_clustering[key][1]

    #        h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
    #        nn_output_lines.append(l)
    #    u.WriteTextArrayToFile(nn_output_file,nn_output_lines)

    if(compute_acc):
        keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","pca":"pca","ica":"ica","rp":"rp","mi":"mi"}
        for key in keys_to_output.keys():
            if("raw" not in key):
                dim_best_val = None
                dim_result = None
                for dim in dims:
                    best = {} # {x,y,p,k,bic,ami}
                    for cluster_mthd in ["kmeans","gmm"]:
                        for cluster in clusters:
                            datakey = cluster_mthd+key+str(cluster)+str(dim)
                            if(cluster_mthd not in best or best_bic_raw_clustering[datakey][2] > best[cluster_mthd][4]):
                                best[cluster_mthd] = (best_bic_raw_clustering[datakey][0],best_bic_raw_clustering[datakey][1],dim,cluster,best_bic_raw_clustering[datakey][3],best_bic_raw_clustering[datakey][2])
                    curr_val = (best["kmeans"][5] + best["gmm"][5]) / 2
                    if(dim_best_val is None or dim_best_val < curr_val):
                        dim_best_val = curr_val
                        dim_result = best

                _X = dim_result["gmm"][0]
                _Y = dim_result["gmm"][1]
            else:
                _X = best_bic_raw_clustering[key][0]
                _Y = best_bic_raw_clustering[key][1]

            h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
            nn_output_lines.append(l)
        u.WriteTextArrayToFile(nn_output_file,nn_output_lines)

    u.WriteTextArrayToFile(clustering_output_file,lines)
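A hypothetical driver for the pipeline above; X and Y are a feature matrix and label vector loaded elsewhere, and compute_acc is left as None to skip the neural-network runs.

RunExperiments(X, Y,
               "Output/dataset1",      # hypothetical rootfolder
               clusters=[2, 3, 4, 5],  # k values for kmeans/gmm
               dims=[2, 5, 10],        # target dimensionalities
               compute_acc=None)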
Example 15
def GenerateDatasetSplitsForWithNoise(rootFolder,
                                      dataset_folder_prefix,
                                      dataset,
                                      test_ratio,
                                      train_ratio,
                                      validation_ratio,
                                      noise_percentages,
                                      class_col,
                                      flip_fn,
                                      random_state,
                                      arff_attr_info=None):
    """
	train_size_percentages is a list of intergers specifying the
	percent of train set to be taken while preparing the dataset

	test_ratio,train_ratio,validation_ratio : numbers in percentages
	"""
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)

    # now creating the train set partitions
    for noise_perc in noise_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_noise-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, noise_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.train.csv".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))

        noisy_dataset = CreateNoisyDataset(train, class_col, noise_perc / 100,
                                           random_state, flip_fn)
        noisy_dataset.to_csv(csv_output_file, index=False)

        print("done noisy : " + str(noise_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_noise-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    noise_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)

        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "noise_perc={0}".format(noise_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.params.txt".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
Example 16
def GenerateDatasetSplitsForClassImbalance(rootFolder,
                                           dataset_folder_prefix,
                                           dataset,
                                           test_ratio,
                                           train_ratio,
                                           validation_ratio,
                                           imbalance_percentages,
                                           class_col,
                                           minority_label,
                                           min_minority_to_keep,
                                           random_state,
                                           arff_attr_info=None,
                                           train_set=None,
                                           test_set=None):
    """
	train_size_percentages is a list of intergers specifying the
	percent of train set to be taken while preparing the dataset

	test_ratio,train_ratio,validation_ratio : numbers in percentages
	"""
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    if ((train_set is not None) & (test_set is not None)):
        train = train_set
        test = test_set
    else:
        train, test, validation = CreateTrainTestAndValidationPartitions(
            dataset, class_col, train_ratio / 100, test_ratio / 100,
            random_state, validation_ratio / 100)
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)

    # now creating the train set partitions
    for imbalance_perc in imbalance_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_im-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, imbalance_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.train.csv".format(folder_path,
                                                      dataset_folder_prefix,
                                                      train_ratio,
                                                      imbalance_perc))
        imbalance_dataset = CreateImbalancedDataSet(train, class_col,
                                                    minority_label,
                                                    imbalance_perc / 100,
                                                    min_minority_to_keep,
                                                    random_state)
        imbalance_dataset.to_csv(csv_output_file, index=False)
        print("done imb : " + str(imbalance_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_im-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    imbalance_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)

        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "minority_label={0}".format(minority_label),
            "imbalance_perc={0}".format(imbalance_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.params.txt".format(folder_path,
                                                       dataset_folder_prefix,
                                                       train_ratio,
                                                       imbalance_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
Example 17
def RunNeuralNetClassifier(datasets_root_folder,
                           one_hot_encoding_cols=None,
                           positive_class_label=None,
                           cv_file_format=None,
                           cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/nnets", is_file=False)
        config_gen = nnconfig()
        config = config_gen.GetNextConfigAlongWithIdentifier()
        while (config is not None):
            id = config["id"]
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # no separate cv is done for early stopping.
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id)).replace("True", "False")
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            # if(os.path.isfile(cv_results_file)):
            # 	config = config_gen.GetNextConfigAlongWithIdentifier()
            # 	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            data = pd.read_csv(trainfile)
            config["testset"] = testfiles[0]
            testdata = pd.read_csv(config["testset"])
            train_len = len(data)

            cols_to_ignore = (set(one_hot_encoding_cols)
                              if one_hot_encoding_cols is not None else set())
            cols_to_ignore.add(data.columns[-1])
            cols_to_transform = [
                c for c in data.columns if c not in cols_to_ignore
            ]
            scaler = StandardScaler()
            scaler.fit(data[cols_to_transform])
            data[cols_to_transform] = scaler.transform(data[cols_to_transform])
            testdata[cols_to_transform] = scaler.transform(
                testdata[cols_to_transform])

            all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
            X_all, Y_all = PrepareDataAndLabel(all_data, positive_class_label,
                                               one_hot_encoding_cols)
            X = X_all[0:train_len, :]
            Y = Y_all[0:train_len]
            test_X = X_all[train_len:, :]
            test_Y = Y_all[train_len:]

            hidden_layers = [(10, ), (30, ), (50, ), (70, )]
            init_learning_rates = [0.1, 0.01, 0.001, 0.0001]
            alpha = [0.01, 0.1, 1, 10, 100]
            momentum = 0.9
            max_iter = 200
            early_stopping = config["earlystopping"]
            validation_fraction = 0.3
            random_state = int(params_info_dict["random_state"])
            solver = 'sgd'

            #for doing 3-fold CV
            param_grid = {
                "alpha": alpha,
                "learning_rate_init": init_learning_rates,
                "hidden_layer_sizes": hidden_layers
            }
            classifier = MLPClassifier(activation="logistic",
                                       momentum=momentum,
                                       early_stopping=early_stopping,
                                       verbose=False,
                                       validation_fraction=validation_fraction,
                                       random_state=random_state,
                                       solver="sgd",
                                       max_iter=max_iter)
            cv_file = None
            if (cv_file_format is not None):
                cv_file = cv_file_format.format(id).replace("True", "False")
            if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
                gscv = GridSearchCV(classifier,
                                    param_grid,
                                    scoring=cv_scoring,
                                    n_jobs=3)
                gscv.fit(X, Y)
                _D = pd.DataFrame(gscv.cv_results_)
                best_params = gscv.best_params_
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # gscv = GridSearchCV(classifier,param_grid,scoring='f1',n_jobs=3)
            # gscv.fit(X,Y)
            # _D = pd.DataFrame(gscv.cv_results_)
            # _D.to_csv(cv_results_file)
            classifier = MLPClassifier(
                hidden_layer_sizes=best_params["hidden_layer_sizes"],
                activation="logistic",
                momentum=momentum,
                early_stopping=early_stopping,
                verbose=True,
                validation_fraction=validation_fraction,
                random_state=random_state,
                solver="sgd",
                max_iter=max_iter,
                learning_rate_init=best_params["learning_rate_init"],
                alpha=best_params["alpha"])
            start = time.perf_counter()
            classifier.fit(X, Y)
            end = time.perf_counter()

            config['momentum'] = momentum
            config["hidden_layers"] = "10;30;50;70"
            config["alphas"] = u.ConcatToStr(";", alpha)
            config["init_learning_rates"] = u.ConcatToStr(
                ";", init_learning_rates)
            config["total_iter"] = classifier.n_iter_
            config["time_per_iter"] = (end - start) / classifier.n_iter_
            config["best_alpha"] = best_params["alpha"]
            config["best_hidden_layer_sizes"] = best_params[
                "hidden_layer_sizes"][0]
            config["best_init_learning_rate"] = best_params[
                "learning_rate_init"]
            config["loss_curve"] = u.ConcatToStr(";", classifier.loss_curve_)

            config["random_state"] = random_state
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)

            # now for test set
            config["predictionoutputfile"] = test_output_file

            u.WriteBinaryFile(model_output_file, classifier)

            #test_X,test_Y = PrepareDataAndLabel(data,positive_class_label,one_hot_encoding_cols)
            predicted_Y = classifier.predict(test_X)
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
            config = config_gen.GetNextConfigAlongWithIdentifier()
        print("done dataset : " + dataset_dir)
Example 18
def RunKNNClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     metric_fn=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/knn", is_file=False)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        train_len = len(data)

        cols_to_ignore = (set(nominal_value_columns)
                          if nominal_value_columns is not None else set())
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [
            c for c in data.columns if c not in cols_to_ignore
        ]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])

        all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:, :]
        test_Y = Y_all[train_len:]

        param_grid = {
            'weights': np.array(['uniform', 'distance']),
            'n_neighbors': np.array([5, 10, 20, 50])
        }
        classifier = KNeighborsClassifier()
        if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None

        config_gen = ParameterGrid({
            'weights': ['uniform'],
            'neighbors': [-1]
        })  # -1 denotes that we need to take the cv results
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            scaler_output_file = u.PreparePath("{0}/{1}.scaler".format(
                run_output_dir, id))
            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])

            # if(os.path.isfile(test_output_file)):
            #	config = config_gen.GetNextConfigAlongWithIdentifier()
            #	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            config["testset"] = testfiles[0]

            if (config['neighbors'] == -1):
                neighbors = best_params['n_neighbors']
                weights = best_params['weights']
                # _D.to_csv(cv_results_file)
                config['best_neighbors'] = neighbors
                config['best_weights'] = weights
            else:
                neighbors = config['neighbors']
                weights = config['weights']
            if (metric_fn is None):
                classifier = KNeighborsClassifier(n_neighbors=neighbors,
                                                  weights=weights)
            else:
                # a custom Python-level metric requires brute-force search;
                # current scikit-learn takes the callable directly as metric
                # (the old metric='pyfunc' string form was removed)
                classifier = KNeighborsClassifier(n_neighbors=neighbors,
                                                  weights=weights,
                                                  algorithm='brute',
                                                  metric=metric_fn)
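            # Leave-one-out evaluation below produces the train-side
            # predictions, presumably because plain classifier.predict(X)
            # would let each point count itself among its own neighbors and
            # overstate training accuracy.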

            loo = LeaveOneOut()
            y_actual = []
            y_predicted = []
            count = 0
            total = len(X)
            for train_idx, test_idx in loo.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                Y_train, Y_test = Y[train_idx], Y[test_idx]
                classifier.fit(X_train, Y_train)
                Y_test_predicted = classifier.predict(X_test)
                assert (len(Y_test_predicted) == 1)
                y_actual.append(Y_test[0])
                y_predicted.append(Y_test_predicted[0])
                count = count + 1
                if (count % 100 == 0):
                    print(str(count) + " " + str(total))

            start = time.perf_counter()
            classifier.fit(X, Y)
            end = time.perf_counter()
            print("fit time (secs): " + str(end - start))
            config["modelbuildtimesecs"] = end - start
            # train-side performance: written from the LOO predictions
            # gathered above rather than classifier.predict(X)
            config["trainpredictionoutputfile"] = train_output_file
            output = pd.DataFrame({
                "actual": y_actual,
                "predicted": y_predicted
            })
            output.to_csv(train_output_file, index=False)

            # now for test set
            config["predictionoutputfile"] = test_output_file

            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            u.WriteBinaryFile(model_output_file, classifier)
            u.WriteBinaryFile(scaler_output_file, scaler)
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("DONE dataset : " + dataset_dir)
Example no. 19
def RunDecisionTrees(datasets_root_folder, weka_jar_path, use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
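        # note: the else/break below processes only the first subdirectory
        # (the ts-100 dataset) and then exits the loop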
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        else:
            break
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        config_gen = ParameterGrid({
            'prune': [False],
            'inst': [2, 5, 8, 12, 15]
        })
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set
            #config["predictionoutputfile"] = test_output_file
            #config["testset"] = testfiles[0]
            #cmd = GetWekaCommandLineForConfig(config,True)
            #config["modelevaltimesecs"] = timeit.timeit(lambda : sl.RunCmdWithoutConsoleWindow(cmd),number=1)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Example no. 20
def RunSVMClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    realtestfiles = glob.glob("{0}/*.realtest.{1}".format(
        datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/svm", is_file=False)
        params_info = u.ReadLinesFromFile(paramfile)
        params_info_dict = sl.GetDictionary(params_info)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        realtestdata = pd.read_csv(realtestfiles[0])
        train_len = len(data)
        test_end = len(testdata) + train_len  # end offset of the validation slice in all_data

        cols_to_ignore = (set(nominal_value_columns)
                          if nominal_value_columns is not None else set())
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [
            c for c in data.columns if c not in cols_to_ignore
        ]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])
        realtestdata[cols_to_transform] = scaler.transform(
            realtestdata[cols_to_transform])

        all_data = pd.concat([data, testdata, realtestdata],
                             axis=0,
                             ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:test_end, :]
        test_Y = Y_all[train_len:test_end]
        realtest_X = X_all[test_end:, :]
        realtest_Y = Y_all[test_end:]
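        # persist the preprocessed hold-out (realtest) split, presumably so
        # it can be re-scored later without repeating the scaling/encoding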
        realtest_data_file = trainfile.replace(".train.",
                                               ".realtest.preprocessed.data.")
        realtest_label_file = trainfile.replace(
            ".train.", ".realtest.preprocessed.label.")
        np.savetxt(realtest_data_file, realtest_X, delimiter=',')
        np.savetxt(realtest_label_file, realtest_Y, delimiter=',')

        dataset_size = GetDataSetSize(dataset_dir)
        StoreData("train.csv", "train_label.csv", X, Y, dataset_size)
        StoreData("validation.csv", "validation_label.csv", test_X, test_Y,
                  dataset_size)
        StoreData("test.csv", "test_label.csv", realtest_X, realtest_Y,
                  dataset_size)

        param_grid = [
            {
                'C': [0.1, 1, 10, 100, 1000],
                'degree': [2, 3, 4],
                'kernel': ['poly']
            },
            {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [0.001, 0.0001],
                'kernel': ['rbf']
            },
        ]
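        # the two sub-grids above yield 15 poly and 10 rbf candidate
        # configurations (25 in total) for the grid search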
        classifier = SVC(cache_size=1500,
                         random_state=int(params_info_dict['random_state']))
        if (cv_file is None) or (not os.path.isfile(cv_file)):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None
        config_gen = [{}]  # single run; hyper-parameters come from the CV results
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))

            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # if(os.path.isfile(test_output_file)):
            # 	config = config_gen.GetNextConfigAlongWithIdentifier()
            # 	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            config["testset"] = testfiles[0]
            config["kernel"] = best_params['kernel']
            config['C'] = best_params['C']
            if (config['kernel'] == 'rbf'):
                config['gamma'] = best_params['gamma']
                classifier = SVC(config['C'],
                                 gamma=config['gamma'],
                                 kernel=config['kernel'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))
            else:
                config['degree'] = best_params['degree']
                classifier = SVC(config['C'],
                                 kernel=config['kernel'],
                                 degree=config['degree'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))

            start = time.perf_counter()
            classifier.fit(X, Y)
            end = time.perf_counter()
            print("fit time (secs): " + str(end - start))
            config["modelbuildtimesecs"] = end - start
            config['numsupportvectors'] = u.ConcatToStr(
                ';', classifier.n_support_)
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)
            u.WriteBinaryFile(model_output_file, classifier)
            # now for test set
            config["predictionoutputfile"] = test_output_file

            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)