Code Example #1
def GenerateVowelRecognitionDataSetSplits(
        rootFolder,
        id,
        train_perc,
        test_perc,
        random_state,
        train_size_percs=None,
        imbalance_percs=None,
        noise_percs=None,
        class_col_name="vowel",
        min_minority_class_samples_to_keep=500,
        validation_perc=0):

    vowelDataFile = u.PreparePath(
        "{0}/vowel-recongnition-dataset.csv".format(rootFolder))
    arff_attrs_file = u.PreparePath("{0}/vowel.txt".format(rootFolder))
    data, arff_attrs = LoadCharacterRecognitionDataset(vowelDataFile,
                                                       arff_attrs_file)

    minority_class = "v"
    flip_fn = lambda x: "c" if (x == "v") else "c"
    if (train_size_percs is not None):
        GenerateDatasetSplits(rootFolder, id, data, test_perc, train_perc,
                              validation_perc, train_size_percs,
                              class_col_name, random_state, arff_attrs)
    if (imbalance_percs is not None):
        GenerateDatasetSplitsForClassImbalance(
            rootFolder, "imb" + str(id), data, test_perc, train_perc, 0,
            imbalance_percs, class_col_name, minority_class,
            min_minority_class_samples_to_keep, random_state, arff_attrs)
    if (noise_percs is not None):
        GenerateDatasetSplitsForWithNoise(rootFolder, "noise" + str(id), data,
                                          test_perc, train_perc, 0,
                                          noise_percs, class_col_name, flip_fn,
                                          random_state, arff_attrs)
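
A minimal usage sketch for the function above; this is an assumption, not code from the project. The folder path and the percentage list are illustrative values.

# Hypothetical call; rootFolder must contain vowel-recongnition-dataset.csv
# and vowel.txt, which the function loads.
GenerateVowelRecognitionDataSetSplits(
    rootFolder="data/VowelRecognition",  # illustrative path
    id=0,
    train_perc=80,
    test_perc=20,
    random_state=0,
    train_size_percs=[20, 40, 60, 80, 100])  # illustrative split sizes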
Code Example #2
def PlotCrossValidationCurvesForSvm(
        rootfolder,
        y_axis_name="F-Measure",
        roots=[r'CreditScreeningDataset', 'LetterRecognition']):
    # root = r'C:/Users/shwet/OneDrive/Gatech/Courses/ML/DataSets/LetterRecognition'
    #root = r'C:/Users/shkhandu/OneDrive/Gatech/Courses/ML/DataSets/CreditScreeningDataset'
    for r in roots:
        root = rootfolder + "/" + r
        instance = r'i-0_t-80_T-20'
        dataset_instance_root = root + "/" + instance
        plot_output_file = u.PreparePath(
            root + r'/Plots/svm/cv.svm.{0}.png'.format(instance))
        cv_save_file = u.PreparePath(
            dataset_instance_root +
            "/svm.{0}.model_complexity_curves.csv".format(instance))
        x_axis_name = 'Train size % used'
        title = 'CV Performance'

        def parameter_getter(path):
            paramfile = "{0}/svm/cvresults/cvresults.params.txt".format(path)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            return int(params_info_dict['train_split_percent_used'])

        def cv_getter(path):
            return "{0}/svm/cvresults/cvresults.grid_search_cv_results.csv".format(
                path)

        PlotCrossValidationCurves(dataset_instance_root, plot_output_file,
                                  x_axis_name, y_axis_name, title,
                                  parameter_getter, cv_getter, cv_save_file)
Code Example #3
def Plot(rootfolder, cols_to_plot_dict):
    data = pd.read_csv(rootfolder + "/stats_agg.csv")
    sizes = data['size'].unique()
    algos = ['rhc', 'sa', 'mimic', 'ga']
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic'),
        'ga': ('g', 's', 'genetic algo'),
        'sa': ('b', '+', 'sim annealing'),
        'rhc': ('k', '*', 'rhc')
    }
    for col in cols_to_plot_dict.keys():
        y_ser = []
        for algo in algos:
            x = data[data['algo'] == algo].loc[:, 'size']
            y = data[data['algo'] == algo].loc[:, col]
            legend_label = algo_decoration[algo][2]
            marker = algo_decoration[algo][1]
            color = algo_decoration[algo][0]
            yseries = u.YSeries(y,
                                points_marker=marker,
                                line_color=color,
                                xvalues=x,
                                plot_legend_label=legend_label)
            y_ser.append(yseries)
        y_axis_name = cols_to_plot_dict[col]
        x_axis_name = 'size'
        savepath = u.PreparePath(rootfolder + "/plots/" + col + ".png")
        u.SaveDataPlotWithLegends(y_ser,
                                  filename=savepath,
                                  y1_axis_name=y_axis_name,
                                  x_axis_name=x_axis_name)
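
A hedged usage sketch: cols_to_plot_dict maps a column name in stats_agg.csv to the y-axis label for its plot. The column names below are illustrative guesses; only 'algo' and 'size' are known to exist from the code above.

# Hypothetical columns; stats_agg.csv must actually contain them.
Plot("output/flipflop", {
    "fn_value": "best fitness achieved",
    "time": "wall clock time (sec)"
})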
Code Example #4
def PlotCrossValidationCurvesForNNets(rootfolder, y_axis_name="F-Measure"):

    roots = [
        rootfolder + r'/CreditScreeningDataset',
        rootfolder + r'/LetterRecognition'
    ]
    for root in roots:
        instance = r'i-0_t-80_T-20'
        stopping = 'earlystop-False'
        dataset_instance_root = root + '/' + instance
        plot_output_file = u.PreparePath(
            root +
            r'/Plots/nnets/cv.{0}.nnets.{1}.png'.format(stopping, instance))
        cv_save_file = u.PreparePath(
            dataset_instance_root +
            "/nnets.{0}.{1}.model_complexity_curves.csv".format(
                instance, stopping))
        x_axis_name = 'Train size % used'
        parameter_name = 'train_split_percent_used'
        title = 'CV Performance'

        def parameter_getter(path):
            paramfile = "{0}/nnets/{1}/{1}.params.txt".format(path, stopping)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            return int(params_info_dict[parameter_name])

        def cv_getter(path):
            return "{0}/nnets/{1}/{1}.grid_search_cv_results.csv".format(
                path, stopping)

        PlotCrossValidationCurves(dataset_instance_root, plot_output_file,
                                  x_axis_name, y_axis_name, title,
                                  parameter_getter, cv_getter, cv_save_file)
        # the original condition tested "0.0001" twice; simplified to a
        # single, behavior-identical check
        plot_fn = lambda x: ("0.0001" in x) & (("70" in x) | ("50" in x))
        plot_output_file = root + r'/Plots/nnets/cv.small.{0}.nnets.{1}.png'.format(
            stopping, instance)
        PlotCrossValidationCurves(dataset_instance_root,
                                  plot_output_file,
                                  x_axis_name,
                                  y_axis_name,
                                  title,
                                  parameter_getter,
                                  cv_getter,
                                  should_plot=plot_fn)
Code Example #5
File: Svm.py Project: shwetabh-khanduja/ml
def StoreData(data_file_name, label_file_name, data, labels, size):
    root = u.PreparePath(
        "c:/Users/shkhandu/OneDrive/Gatech/Courses/ML/Assignment2/VowelRecognition/{0}"
        .format(str(size)),
        is_file=False)
    data_file = root + "/" + data_file_name
    label_file = root + "/" + label_file_name
    np.savetxt(data_file, data, delimiter=",")
    np.savetxt(label_file, labels, delimiter=",")
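
StoreData is called from RunSVMClassifier (Code Example #21). A sketch with synthetic stand-in arrays; note that it writes under the hardcoded OneDrive root above, in a subfolder named after size.

import numpy as np

X = np.random.rand(100, 10)       # synthetic features
Y = np.random.randint(0, 2, 100)  # synthetic labels
StoreData("train.csv", "train_label.csv", X, Y, 100)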
Code Example #6
def main():
    print("running")
    rootfolder = r"C:\Users\shkhandu\OneDrive\Gatech\Courses\ML\Assignment3\skhanduja7\data"

    output = rootfolder
    output_lr = u.PreparePath(output + "/lr")
    X, Y = ReadLetterRecognitionData(rootfolder)
    RunExperiments(X, Y, output_lr, [10, 15, 26, 35, 50], [2, 4, 8, 12, 16])
    PlotClusteringMetricsForDimsAndBic(
        output_lr, pd.read_csv(output_lr + '/clustering.csv'),
        [2, 4, 8, 12, 16], ["raw", "pca", "ica", "rp", "mi"],
        [10, 15, 26, 35, 50], metrics=["bic"])
    PlotClusteringMetrics(output_lr, pd.read_csv(output_lr + '/clustering.csv'), [])
    ComputeFinalResults1(output_lr, [10, 15, 26, 35, 50], [2, 4, 8, 12])

    output_cs = u.PreparePath(output + "/cs")
    X, Y = ReadCreditScreeningData(rootfolder)
    RunExperiments(X, Y, output_cs, [2, 5, 10, 15, 20], [2, 5, 10, 20, 30], True)
    PlotClusteringMetricsForDimsAndBic(
        output_cs, pd.read_csv(output_cs + '/clustering.csv'),
        [2, 5, 10, 20, 30], ["raw", "pca", "ica", "rp", "mi"],
        [2, 5, 10, 15, 20], metrics=["bic"])
    PlotClusteringMetrics(output_cs, pd.read_csv(output_cs + '/clustering.csv'), [])
    ComputeFinalResults1(output_cs, [2, 5, 10, 15, 20], [2, 5, 10, 30, 20])
Code Example #7
def PlotCrossValidationCurvesForKnn(rootfolder, y_axis_name="F-Measure"):
    # root = r'C:/Users/shwet/OneDrive/Gatech/Courses/ML/DataSets/LetterRecognition'
    #root = r'C:/Users/shkhandu/OneDrive/Gatech/Courses/ML/DataSets/CreditScreeningDataset'
    roots = [
        rootfolder + r'/CreditScreeningDataset',
        rootfolder + r'/LetterRecognition'
    ]
    for root in roots:
        instance = r'i-0_t-80_T-20'
        dataset_instance_root = root + "/" + instance
        plot_output_file = u.PreparePath(
            root + r'/Plots/knn/cv.knn.{0}.png'.format(instance))
        cv_save_file = u.PreparePath(
            dataset_instance_root +
            "/knn.{0}.model_complexity_curves.csv".format(instance))
        x_axis_name = 'Model complexity'
        title = 'CV Performance'

        def parameter_getter(path):
            paramfile = "{0}/knn/weights-uniform_neighbors--1/weights-uniform_neighbors--1.params.txt".format(
                path)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            return int(params_info_dict['train_split_percent_used'])

        def knn_label_maker(l):
            p = ast.literal_eval(l)
            return "n{0}w{1}".format(p['n_neighbors'], p['weights'][0])

        def cv_getter(path):
            return "{0}/knn/weights-uniform_neighbors--1/weights-uniform_neighbors--1.grid_search_cv_results.csv".format(
                path)

        PlotCrossValidationCurves2(dataset_instance_root,
                                   plot_output_file,
                                   x_axis_name,
                                   y_axis_name,
                                   title,
                                   parameter_getter,
                                   cv_getter,
                                   cv_save_file,
                                   label_maker=knn_label_maker)
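
knn_label_maker parses the 'params' column that sklearn's GridSearchCV stores in cv_results_; once saved to csv, each cell is the repr of a parameter dict. A self-contained check of that parsing:

import ast

cell = "{'n_neighbors': 5, 'weights': 'uniform'}"  # typical 'params' cell
p = ast.literal_eval(cell)
print("n{0}w{1}".format(p['n_neighbors'], p['weights'][0]))  # -> n5wu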
Code Example #8
def write_to_file(data_to_write, filepath):
    # Nested helper: include_header and arff_format_predata_lines are
    # captured from the enclosing function's scope.
    file = u.PreparePath(filepath)
    data_to_write.to_csv(file,
                         index=False,
                         header=(include_header &
                                 (arff_format_predata_lines is None)))
    if (arff_format_predata_lines is not None):
        data = []
        data.extend(arff_format_predata_lines)
        data.extend(u.ReadLinesFromFile(file))
        u.WriteTextArrayToFile(file, data)
Code Example #9
def GenerateCreditScreeningDataSetSplits(rootFolder,
                                         id,
                                         train_perc,
                                         test_perc,
                                         random_state,
                                         train_size_percs=None,
                                         imbalance_percs=None,
                                         noise_percs=None,
                                         class_col_name="A16",
                                         min_minority_class_samples_to_keep=10,
                                         train=None,
                                         test=None,
                                         validation_perc=0):

    #rootFolder=r"C:\Users\shkhandu\OneDrive\Gatech\Courses\ML\DataSets\CreditScreeningDataset"
    #id=0
    #train_perc=80
    #test_perc=20
    dataFile = u.PreparePath(
        "{0}/data_no_missing_values.csv".format(rootFolder))
    arff_attrs_file = u.PreparePath("{0}/arff_attrs.txt".format(rootFolder))
    data, arff_attrs = LoadCreditScreeningData(dataFile, arff_attrs_file)
    #random=0
    #train_size_percs = [20,30,40,50,60,70,80,90,100]
    #imbalance_percs = [90,10,20,30,40,50,70,5,100]
    minority_class = "+"
    flip_fn = lambda x: "-" if (x == "+") else "+"
    if (train_size_percs is not None):
        GenerateDatasetSplits(rootFolder, id, data, test_perc, train_perc,
                              validation_perc, train_size_percs,
                              class_col_name, random_state, arff_attrs)
    if (imbalance_percs is not None):
        GenerateDatasetSplitsForClassImbalance(
            rootFolder, "imb" + str(id), data, test_perc, train_perc, 0,
            imbalance_percs, class_col_name, minority_class,
            min_minority_class_samples_to_keep, random_state, arff_attrs)
    if (noise_percs is not None):
        GenerateDatasetSplitsForWithNoise(rootFolder, "noise" + str(id), data,
                                          test_perc, train_perc, 0,
                                          noise_percs, class_col_name, flip_fn,
                                          random_state, arff_attrs)
Code Example #10
def PlotClusteringMetrics(rootfolder, data, k, dim='raw', p=2):
    row_filter = lambda x: x['dim_red_method'] == dim and x['p'] == p
    filtered_data = u.FilterRows(data, row_filter)
    metrics = ["ami_raw", "ami_true", "sc", "bic"]
    gmm_data = filtered_data.loc[filtered_data['clustering'] == "gmm", :]
    kmeans_data = filtered_data.loc[filtered_data['clustering'] == "kmeans", :]
    d = {"kmeans": ('o', 'b', 'kmeans'), "gmm": ('x', 'r', 'gmm')}
    for metric in metrics:
        outputfile = u.PreparePath(
            rootfolder + "/plots/metrics/{0}_{1}_p={2}.png".format(metric, dim, str(p)))
        kmeans_ser = u.YSeries(kmeans_data[metric], xvalues=kmeans_data["k"],
                               points_marker=d["kmeans"][0], line_color=d["kmeans"][1],
                               plot_legend_label=d["kmeans"][2])
        gmm_ser = u.YSeries(gmm_data[metric], xvalues=gmm_data["k"],
                            points_marker=d["gmm"][0], line_color=d["gmm"][1],
                            plot_legend_label=d["gmm"][2])
        u.SaveDataPlotWithLegends([kmeans_ser, gmm_ser],
                                  x_axis_name="number of clusters",
                                  y1_axis_name=metric, filename=outputfile)
Code Example #11
def PlotClusteringMetricsForDimsAndBic(rootfolder, data, dims, dim_reds, k,
                                       metrics=["ami_raw", "ami_true", "sc", "bic"]):
    colors = {"ica": 'r', 'pca': 'b', 'rp': 'g', 'mi': 'k', 'raw': 'orange'}
    markers = {"kmeans": 'o', "gmm": 'x'}
    for _k in dims:
        for metric in metrics:
            for dim_red in dim_reds:
                ser = []
                outputfile = u.PreparePath(
                    rootfolder + "/plots/metrics/dr_{0}_p={1}_{2}.png".format(
                        metric, str(_k), dim_red))
                for mthd in ["kmeans", "gmm"]:
                    d = data.loc[(data['dim_red_method'] == dim_red) &
                                 (data['p'] == _k) &
                                 (data['clustering'] == mthd), :]
                    ser.append(u.YSeries(d[metric], xvalues=d['k'],
                                         line_color=colors[dim_red],
                                         points_marker=markers[mthd],
                                         plot_legend_label="{0}-{1}".format(dim_red, mthd)))
                u.SaveDataPlotWithLegends(ser, x_axis_name="k",
                                          y1_axis_name=metric, filename=outputfile)
Code Example #12
def CreateArffFileFromCsv(arff_attr_info,
                          arff_file_path,
                          data_text_array,
                          isFile=False,
                          hasHeader=True):
    arff_data = []
    arff_data.extend(arff_attr_info)
    if isFile:
        data_text_array = u.ReadLinesFromFile(data_text_array)
        if hasHeader:
            data_text_array = data_text_array[1:]  # drop the csv header row
    arff_data.extend(data_text_array)
    file = u.PreparePath(arff_file_path)
    u.WriteTextArrayToFile(file, arff_data)
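
For reference, a self-contained sketch of the kind of file this produces, with the u.* helpers replaced by plain file I/O; the attribute lines describe a hypothetical two-column dataset:

arff_attr_info = [
    "@RELATION demo",
    "@ATTRIBUTE f1 NUMERIC",
    "@ATTRIBUTE vowel {c,v}",
    "@DATA",
]
csv_lines = ["f1,vowel", "0.5,v", "0.1,c"]  # header row gets dropped
with open("demo.arff", "w") as f:
    f.write("\n".join(arff_attr_info + csv_lines[1:]))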
Code Example #13
def EvaluateExperiments(datasets_root_folder,
                        params_to_keep,
                        positive_class,
                        metric_calculation_fn,
                        evaluation_output_filename="performance.csv",
                        algo_folder="dt",
                        should_eval=lambda x: True):

    headers = []
    headers.extend(params_to_keep)
    headers.extend(['istrain', 'p', 'r', 'm'])
    headers = ",".join(headers)
    evals = []
    evals.append(headers)
    for directory in u.Get_Subdirectories(datasets_root_folder):
        # each directory is a dataset directory
        dt_output_dir = "{0}/{1}".format(directory, algo_folder)
        if (os.path.isdir(dt_output_dir) == False):
            continue
        for run_output_folder in u.Get_Subdirectories(dt_output_dir):
            if (should_eval(run_output_folder) == False):
                print("ignoring : {0}".format(run_output_folder))
                continue
            # read params file
            params_file_path = glob.glob(
                "{0}/*.params.txt".format(run_output_folder))[0]
            params = sl.GetDictionary(u.ReadLinesFromFile(params_file_path))
            values = []
            for k in params_to_keep:
                if (k in params):
                    values.append(str(params[k]))
                else:
                    values.append(str(np.NaN))
            p, r, f = metric_calculation_fn(
                params["trainpredictionoutputfile"], positive_class)
            train_performance_values = ",".join(values)
            train_performance_values = "{0},1,{1},{2},{3}".format(
                ",".join(values), str(p), str(r), str(f))
            evals.append(train_performance_values)
            if (os.path.isfile(params["testpredictionoutputfile"])):
                p, r, f = metric_calculation_fn(
                    params["testpredictionoutputfile"], positive_class)
                test_performance_values = ",".join(values)
                test_performance_values = "{0},0,{1},{2},{3}".format(
                    ",".join(values), str(p), str(r), str(f))
                evals.append(test_performance_values)
    u.WriteTextArrayToFile(
        u.PreparePath("{0}/{1}".format(datasets_root_folder,
                                       evaluation_output_filename)), evals)
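
The *.params.txt files consumed here are written elsewhere as one key=value pair per line (see Code Example #14). sl.GetDictionary presumably performs the inverse; a minimal equivalent, shown only to document the format:

def get_dictionary(lines):
    # "key=value" per line -> dict; split on the first '=' only, since
    # values (e.g. Windows paths) may themselves contain '='.
    return dict(line.strip().split("=", 1) for line in lines)

params = get_dictionary(["random_state=0", "train_split_percent_used=80"])
print(int(params["train_split_percent_used"]))  # -> 80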
Code Example #14
def RunAdaBoostWithDecisionTreesToGeneratePerIterationMetrics(
        datasets_root_folder,
        weka_jar_path,
        dataset_filter,
        iters,
        inst,
        use_arff_files=True):
    """
    weightThreshold parameter : http://weka.8497.n7.nabble.com/AdaBoost-Parameters-td11830.html
    """
    # the glob pattern already contains the dot, so the extension must not
    file_extn = "arff" if use_arff_files else "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (dataset_filter not in dataset_dir):
            continue
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/ada", is_file=False)
        config_gen = ParameterGrid({'prune': [True, False], 'iter': iters})
        for config in config_gen:
            id = GetIdForConfig(config)
            config["inst"] = inst
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            full_train_output_file = u.PreparePath(
                "{0}/{1}.fulltrain.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            os.remove(model_output_file)

            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Code Example #15
def PlotPerIterationCurves(rootFolder, outputfolder):
    mimic = pd.read_csv(rootFolder + "/mimic.csv")
    sa = pd.read_csv(rootFolder + "/sa.csv")
    rhc = pd.read_csv(rootFolder + "/rhc.csv")
    ga = pd.read_csv(rootFolder + "/ga.csv")
    sizes = np.array(mimic['size'].unique())
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic', mimic),
        'ga': ('g', 's', 'genetic algo', ga),
        'sa': ('b', '+', 'sim annealing', sa),
        'rhc': ('k', '*', 'rhc', rhc)
    }

    def f(data, name):
        x = data['iters']
        y = data['fn_value']
        deco = algo_decoration[name]
        return u.YSeries(y,
                         xvalues=x,
                         points_marker='.',
                         plot_legend_label=deco[2],
                         legend_marker='o',
                         line_color=deco[0])

    for size in sizes:
        size_root = u.PreparePath(outputfolder + "/itercurves_" + str(size) +
                                  ".png")
        y_ser = []
        for key in algo_decoration.keys():
            d = u.FilterRows(algo_decoration[key][3],
                             lambda x: x['size'] == size).head(10000)
            y_ser.append(f(d, key))
        u.SaveDataPlotWithLegends(y_ser,
                                  x_axis_name="iters",
                                  x=None,
                                  y1_axis_name="fn value",
                                  filename=size_root)
Code Example #16
def PlotPiViConvergenceForSmallAndLargeMdp(outputfolder, datafile, gamma):
    data = pd.read_csv(datafile)
    decorations = {1: 'g', 10: 'k', 10000: 'r'}
    pi_sweeps = [1, 10, 10000]
    # The large and small MDPs get identical plots, so loop over both
    # instead of duplicating the plotting code per MDP.
    for mdp, prefix in [('LargeMdpRwTraps50', 'large'),
                        ('SmallMdpRwTraps', 'small')]:
        ser = []
        ser1 = []
        vi_added = False
        for sweep in pi_sweeps:
            data_vi = u.FilterRows(
                data, lambda x: (x['mdp'] == mdp) &
                (x['solver'] == 'vi') & (x['gamma'] == gamma))
            data_pi = u.FilterRows(
                data, lambda x: (x['mdp'] == mdp) & (x['solver'] == 'pi') &
                (x['gamma'] == gamma) & (x['maxSweepsPerIteration'] == sweep))
            assert (len(data_vi) == 1)
            assert (len(data_pi) == 1)

            # 'cum_rewards' holds the per-iteration max Q-value change and
            # 'ran_to_completion' the total state value, both ';'-delimited.
            data_vi_qchange = np.array(
                [float(s) for s in data_vi.iloc[0]['cum_rewards'].split(';')])
            data_vi_value = np.array([
                float(s) for s in data_vi.iloc[0]['ran_to_completion'].split(';')
            ])
            data_pi_qchange = np.array(
                [float(s) for s in data_pi.iloc[0]['cum_rewards'].split(';')])
            data_pi_value = np.array([
                float(s) for s in data_pi.iloc[0]['ran_to_completion'].split(';')
            ])
            if (vi_added == False):
                ser.append(
                    u.YSeries(data_vi_qchange,
                              xvalues=np.arange(len(data_vi_qchange)) + 1,
                              line_color='b',
                              plot_legend_label='VI'))
                ser1.append(
                    u.YSeries(data_vi_value,
                              xvalues=np.arange(len(data_vi_value)) + 1,
                              line_color='b',
                              plot_legend_label='VI'))
            ser.append(
                u.YSeries(data_pi_qchange,
                          xvalues=np.arange(len(data_pi_qchange)) + 1,
                          line_color=decorations[sweep],
                          plot_legend_label='PI_' + str(sweep)))
            ser1.append(
                u.YSeries(data_pi_value,
                          xvalues=np.arange(len(data_pi_value)) + 1,
                          line_color=decorations[sweep],
                          plot_legend_label='PI_' + str(sweep)))
            vi_added = True

        outputfile = u.PreparePath(outputfolder + "/plots/" + prefix +
                                   "_qchange_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser,
                                  filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Max change in state value")
        outputfile = u.PreparePath(outputfolder + "/plots/" + prefix +
                                   "_value_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser1,
                                  filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Total value across states")
Code Example #17
def NNetAnalysis(output_root,
                 output_file_prefix,
                 metrics_file,
                 iters_to_ignore,
                 y_axis_name="F-Measure"):
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]) &
                    (x['total_iter'] > iters_to_ignore))

        def train_earlystopping_filter(x):
            return x['earlystopping'] & (x['istrain'] == 1)

        def train_no_earlystopping_filter(x):
            return (x['earlystopping'] == False) & (x['istrain'] == 1)

        def test_earlystopping_filter(x):
            return x['earlystopping'] & (x['istrain'] == 0)

        def test_no_earlystopping_filter(x):
            return (x['earlystopping'] == False) & (x['istrain'] == 0)

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'earlystopping', 'istrain'])
        x = data_agg[dataset_type].unique()

        def MissingValuesHandler(curr_values_frame, keyCol, valueCol,
                                 required_values):
            data = dict(
                zip(curr_values_frame[keyCol], curr_values_frame[valueCol]))
            y = []
            for v in required_values:
                if (v in data):
                    y.append(data[v])
                else:
                    y.append(0)
            return y

        for k, v in col_funcs.items():
            for agg in v:
                mvh = lambda df: MissingValuesHandler(df, dataset_type,
                                                      k + "_" + agg, x)
                y_train_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, train_earlystopping_filter)),
                    line_color='r',
                    points_marker='o',
                    plot_legend_label="Train_with_earlystopping")
                y_train_no_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, train_no_earlystopping_filter)),
                    line_color='r',
                    points_marker='x',
                    plot_legend_label="Train_without_earlystopping")
                y_test_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, test_earlystopping_filter)),
                    line_color='b',
                    points_marker='o',
                    plot_legend_label="Validation_with_earlystopping")
                y_no_test_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, test_no_earlystopping_filter)),
                    line_color='b',
                    points_marker='x',
                    plot_legend_label="Validation_without_earlystopping")

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    [
                        y_test_earlystopping, y_no_test_earlystopping,
                        y_train_no_earlystopping, y_train_earlystopping
                    ], x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k],
                    'Neural Nets Performance')
    return data_agg
Code Example #18
def RunExperiments(X,Y,rootfolder,clusters,dims,compute_acc=None):
    datasets = {}
    datasets["raw"] = (X,Y)
    err_series = []
    decorations = {}
    decorations["pca"] = ("o","r","pca")
    decorations["ica"] = ("x","b","ica")
    decorations["rp"] = ("+","g","rp")
    decorations["mi"] = ("o","k","mi")
    flags = [True,True,True,True]
    nn_output_lines = []
    nn_output_file = rootfolder + "/nn.csv"
    if(compute_acc is not None):
        h,l = CreateOutputLineForNN(RunNeuralNetwork(X,Y,10,compute_acc,False),"raw")
        nn_output_lines.append(h)
        nn_output_lines.append(l)

    best_bic = None
    # recons_err_plot and recons_err_dict are shared by the PCA, ICA and RP
    # sections below, so initialize them before the per-method blocks to
    # avoid a NameError when flags[0] is turned off.
    recons_err_plot = u.PreparePath(rootfolder + "/plots/err.png")
    recons_err_dict = []
    ################### PCA #####################
    if(flags[0]):
        pca_results = PerformPca(X,Y,dims,0)
        pca_var_explained_plot = u.PreparePath(rootfolder + "/plots/pca/var.png")
        var_y = []
        err_y = []

        for dim in dims:
            key = "pca_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(pca_results["{0}data".format(key)]),Y)
            err_y.append(pca_results[key+"reconstruction_error"])
            var_y = pca_results[key+"explained_var_ratio"]
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"pca")
            #    #nn_output_lines.append(h)
            #    nn_output_lines.append(l)

        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="dimensions",y1_axis_name="% explained variance",filename=pca_var_explained_plot)

    ################### ICA #####################

    if(flags[1]):
        ica_kt_plot = u.PreparePath(rootfolder + "/plots/ica/kt.png")
        err_y = []
        ica_results = PerformIca(X,Y,dims,0)
        for dim in dims:
            key = "ica_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(ica_results[key+"data"]),Y)
            err_y.append(ica_results[key+"reconstruction_error"])
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"ica")
            #    nn_output_lines.append(l)

        var_y = ica_results["ica_kt_all"]
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="components",y1_axis_name="kurtosis",filename=ica_kt_plot)

    ################### RP #####################
    if(flags[2]):
        rp_runs_plot = u.PreparePath(rootfolder + "/plots/rp/runs.png")
        err_y = []
        runs = 10
        rp_results = PerformRandomProjections(X,Y,dims,runs)
        runs_series = []
        markers = u.GetColorCombinations(10)
        i=0
        for dim in dims:
            key = "rp_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(rp_results[key+"data"]),Y)
            err_y.append(rp_results[key+"reconstruction_error"])
            runs_ser = u.YSeries(rp_results[key+"reconstruction_errors_all"],xvalues=np.arange(runs)+1,points_marker = "o",line_color = markers[i]["color"],plot_legend_label="proj dims = "+str(dim))
            runs_series.append(runs_ser)
            i = i + 1
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"rp")
            #    nn_output_lines.append(l)

        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["rp"][0],line_color=decorations["rp"][1],plot_legend_label=decorations["rp"][2])
        recons_err_dict.append(ser)
        u.SaveDataPlotWithLegends(runs_series,x_axis_name="run number",y1_axis_name="reconstruction err",filename=rp_runs_plot)

        u.SaveDataPlotWithLegends(recons_err_dict,x_axis_name="dimensions",y1_axis_name="reconstruction_error",filename=recons_err_plot)

    ###################### MI Feature Selection #########################
    if(flags[3]):
        mi_results = PerformMiBasedFeatureSelection(X,Y,dims,10)
        mi_plot = u.PreparePath(rootfolder + "/plots/mi/scores.png")
        for dim in dims:
            key = "mi_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(mi_results[key+"data"]),Y)
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"mi")
            #    nn_output_lines.append(l)
        ser = u.YSeries(mi_results["scores"],xvalues = np.arange(len(mi_results["scores"])) + 1,points_marker=decorations["mi"][0],line_color=decorations["mi"][1],plot_legend_label=decorations["mi"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="feature number",y1_axis_name="mutual information", filename=mi_plot)

    ###################### CLUSTERING #########################
    clustering_output_file = rootfolder + "/clustering.csv"
    clustering_plots_output_root = u.PreparePath(rootfolder + "/plots")
    lines = []
    lines.append("clustering,dim_red_method,k,p,ami_raw,ami_true,sc,bic")
    raw_clustering_results = {}
    best_bic_raw_clustering = {}
    curr_best_bic = {}
    actual_labels = Y
    for dim in dims:
        for algo in ["raw","ica","rp","mi","pca"]:
            raw_data_plot_done = False
            key = "{0}_{1}_".format(algo,str(dim))
            if(algo == "raw"):
                key = "raw"
            dataset = datasets[key]
            for cluster in clusters:
                for mthd in ["kmeans","gmm"]:
                    raw_key = "{0}_{1}".format(str(cluster),mthd)
                    print("doing clustering for dim = {0} {1} k = {2} {3}".format(str(dim),algo,str(cluster), mthd))
                    c_key = "{0}_{1}_predicted".format(mthd,str(cluster))
                    c_key1 = "{0}_{1}_".format(mthd,str(cluster))
                    if(algo == "raw" and raw_key in raw_clustering_results):
                        results = raw_clustering_results[raw_key]
                    else:
                        #if(algo == "raw" and cluster == 2 and compute_acc):
                        #    results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        #    h,l = CreateOutputLineForNN(RunNeuralNetwork(results[c_key.replace("predicted","new_data")],dataset[1],10,compute_acc),mthd)
                        #    nn_output_lines.append(l)
                        #else:
                        results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        if(algo == "raw"):
                           raw_clustering_results[raw_key] = results
                        if(compute_acc):
                            mthd_key = mthd+algo if algo == "raw" else mthd+algo+str(cluster)+str(dim)
                            if((mthd_key not in curr_best_bic) or (curr_best_bic[mthd_key] > results[c_key1+"bic"])):
                                curr_best_bic[mthd_key] = results[c_key1+"bic"]
                                best_bic_raw_clustering[mthd_key] = (results[c_key1+"new_data"],dataset[1],results[c_key1+"metrics"]["ami"],results[c_key1+"bic"])
                                print("new best {0} {1}".format(c_key1,str(results[c_key1+"bic"])))

                    clustering_prediction_file = u.PreparePath(rootfolder + "/clustering_output/mthd={0}_k={1}_d={2}_algo={3}.csv".format(mthd,str(cluster),str(dim),algo))
                    np.savetxt(clustering_prediction_file,results[c_key])
                    bic = c_key.replace("predicted","bic")
                    bic = results[bic]
                    act = ComputeClusteringMetrics(actual_labels,results[c_key],dataset[0])
                    raw = ComputeClusteringMetrics(raw_clustering_results[raw_key][c_key],results[c_key],dataset[0])
                    line = "{0},{1},{2},{3},{4},{5},{6},{7}".format(mthd,algo,str(cluster),str(dim),str(raw["ami"]),str(act["ami"]),str(raw["sl"]),str(bic))
                    print(line)
                    plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_{3}.png".format(mthd,str(cluster),algo,str(dim))
                    #if(mthd == "gmm"):
                    #    prob_output_file = rootfolder + "/{0}_{1}_{2}_{3}.csv".format(mthd,str(cluster),algo,str(dim))
                    #    np.savetxt(prob_output_file,results[c_key.replace("predicted","prob")],delimiter=",")
                    ScatterPlotForClustering(results[c_key],actual_labels,plot_output_file)
                    if(dim == 2 and algo != "raw"):
                        if(raw_data_plot_done == False):
                            plot_output_file = clustering_plots_output_root + "/{0}_{1}_data.png".format(mthd,algo)
                            ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],np.zeros_like(actual_labels),actual_labels,plot_output_file)
                            raw_data_plot_done = True
                        plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_data.png".format(mthd,str(cluster),algo)
                        ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],results[c_key],actual_labels,plot_output_file)
                    lines.append(line)

    #if(compute_acc):
    #    keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","gmmpca":"pca","gmmica":"ica","gmmrp":"rp","gmmmi":"mi"}
    #    for key in keys_to_output.keys():
    #        if("raw" not in key):
    #            curr_best = None
    #            for cluster in clusters:
    #                datakey = key+str(cluster)
    #                if(curr_best is None or best_bic_raw_clustering[datakey][2] > curr_best):
    #                    curr_best = best_bic_raw_clustering[datakey][2]
    #                    _X = best_bic_raw_clustering[datakey][0]
    #                    _Y = best_bic_raw_clustering[datakey][1]
    #        else:
    #            _X = best_bic_raw_clustering[key][0]
    #            _Y = best_bic_raw_clustering[key][1]

    #        h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
    #        nn_output_lines.append(l)
    #    u.WriteTextArrayToFile(nn_output_file,nn_output_lines)

    if(compute_acc):
        keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","pca":"pca","ica":"ica","rp":"rp","mi":"mi"}
        for key in keys_to_output.keys():
            if("raw" not in key):
                dim_best_val = None
                dim_result = None
                for dim in dims:
                    best = {} # {x,y,p,k,bic,ami}
                    for cluster_mthd in ["kmeans","gmm"]:
                        for cluster in clusters:
                            datakey = cluster_mthd+key+str(cluster)+str(dim)
                            if(cluster_mthd not in best or best_bic_raw_clustering[datakey][2] > best[cluster_mthd][4]):
                                best[cluster_mthd] = (best_bic_raw_clustering[datakey][0],best_bic_raw_clustering[datakey][1],dim,cluster,best_bic_raw_clustering[datakey][3],best_bic_raw_clustering[datakey][2])
                    curr_val = (best["kmeans"][5] + best["gmm"][5]) / 2
                    if(dim_best_val is None or dim_best_val < curr_val):
                        dim_best_val = curr_val
                        dim_result = best

                _X = dim_result["gmm"][0]
                _Y = dim_result["gmm"][1]
            else:
                _X = best_bic_raw_clustering[key][0]
                _Y = best_bic_raw_clustering[key][1]

            h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
            nn_output_lines.append(l)
        u.WriteTextArrayToFile(nn_output_file,nn_output_lines)

    u.WriteTextArrayToFile(clustering_output_file,lines)
Code Example #19
def AdaBoostAnalysis(output_root, output_file_prefix, metrics_file):
    data_all = pd.read_csv(metrics_file)
    dataset_types = [
        'train_split_percent_used', 'imbalance_perc', 'noise_perc'
    ]
    col_funcs = {
        'p': ['mean', 'std'],
        'r': ['mean', 'std'],
        'm': ['mean', 'std']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        dataset_types[0]: 'Train size % used',
        dataset_types[1]: 'Fraction of positives to negatives',
        dataset_types[2]: 'Noise %',
        'modelbuildtimesecs': 'Time to build AdaBoost model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]))

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'prune', 'istrain', 'iter'])
        for metric, v in col_funcs.items():
            for agg in v:
                iterations = np.sort(data_agg['iter'].unique())
                prune_vals = data_agg['prune'].unique()
                dataset_type_values = data_agg[dataset_type].unique()
                for type_val in dataset_type_values:
                    for prune_val in prune_vals:
                        metric_col = metric + "_" + agg
                        y_test = []
                        y_train = []
                        for i in iterations:
                            filtered_data = data_agg[
                                (data_agg['prune'] == prune_val)
                                & (data_agg['iter'] == i) &
                                (data_agg[dataset_type] == type_val)]
                            train_data = filtered_data[filtered_data['istrain']
                                                       == 1]
                            assert (len(train_data) == 1)
                            y_train.append(train_data[metric_col].iloc[0])

                            test_data = filtered_data[filtered_data['istrain']
                                                      == 0]
                            assert (len(test_data) == 1)
                            y_test.append(test_data[metric_col].iloc[0])
                        # now we can plot since we have test and train values for each iter
                        output_file_name = u.PreparePath(
                            "{4}/{0}.{1}.prune-{5}.{6}-{7}.{2}.{3}.png".format(
                                output_file_prefix, dataset_type, metric, agg,
                                output_root, prune_val, dataset_type,
                                type_val))
                        y_train_series = u.YSeries(y_train,
                                                   line_color='r',
                                                   plot_legend_label='train')
                        y_test_series = u.YSeries(y_test,
                                                  line_color='b',
                                                  plot_legend_label='test')
                        # ~ on a bool is bitwise (always truthy); use `not`
                        if (not os.path.isfile(output_file_name)):
                            u.SaveDataPlotWithLegends(
                                [y_train_series, y_test_series], iterations,
                                output_file_name, True, "num of iterations",
                                mapping_output_words[metric],
                                "AdaBoost Performance ({0})".format(agg))
                        print(output_file_name)
Code Example #20
def SvmAnalysis(output_root,
                output_file_prefix,
                metrics_file,
                dataset_filter_fn=None,
                y_axis_name="F-Measure"):
    def ComputeTotalSupportVectors(s):
        return np.array([int(t) for t in s.split(';')]).sum()

    data_all = pd.read_csv(metrics_file)
    data_all['numsupportvectors'] = data_all['numsupportvectors'].apply(
        ComputeTotalSupportVectors)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)',
        'numsupportvectors': 'Number of Support Vectors'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return ~np.isnan(x[dataset_type])

        def train_filter(x):
            return (x['istrain'] == 1)

        def test_filter(x):
            return (x['istrain'] == 0)

        if (dataset_filter_fn is not None):
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'istrain'])
        x = data_agg[dataset_type].unique()

        for k, v in col_funcs.items():
            for agg in v:
                y_train = u.YSeries(FilterRows(data_agg,
                                               train_filter)[k + "_" + agg],
                                    line_color='r',
                                    points_marker='o',
                                    plot_legend_label="Train")
                y_test = u.YSeries(FilterRows(data_agg,
                                              test_filter)[k + "_" + agg],
                                   line_color='b',
                                   points_marker='o',
                                   plot_legend_label='validation')
                if ((k == 'numsupportvectors') | (k == 'modelbuildtimesecs')):
                    y_series = [y_train]
                else:
                    y_series = [y_test, y_train]

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'SVM Performance')
    return data_agg
Code Example #21
File: Svm.py Project: shwetabh-khanduja/ml
def RunSVMClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    realtestfiles = glob.glob("{0}/*.realtest.{1}".format(
        datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/svm", is_file=False)
        params_info = u.ReadLinesFromFile(paramfile)
        params_info_dict = sl.GetDictionary(params_info)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        realtestdata = pd.read_csv(realtestfiles[0])
        train_len = len(data)
        test_len = len(testdata) + train_len

        cols_to_ignore = (set(nominal_value_columns)
                          if nominal_value_columns is not None else set())
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [
            c for c in data.columns if c not in cols_to_ignore
        ]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])
        realtestdata[cols_to_transform] = scaler.transform(
            realtestdata[cols_to_transform])

        all_data = pd.concat([data, testdata, realtestdata],
                             axis=0,
                             ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:test_len, :]
        test_Y = Y_all[train_len:test_len]
        realtest_X = X_all[test_len:, :]
        realtest_Y = Y_all[test_len:]
        realtest_data_file = trainfile.replace(".train.",
                                               ".realtest.preprocessed.data.")
        realtest_label_file = trainfile.replace(
            ".train.", ".realtest.preprocessed.label.")
        np.savetxt(realtest_data_file, realtest_X, delimiter=',')
        np.savetxt(realtest_label_file, realtest_Y, delimiter=',')

        dataset_size = GetDataSetSize(dataset_dir)
        StoreData("train.csv", "train_label.csv", X, Y, dataset_size)
        StoreData("validation.csv", "validation_label.csv", test_X, test_Y,
                  dataset_size)
        StoreData("test.csv", "test_label.csv", realtest_X, realtest_Y,
                  dataset_size)

        param_grid = [
            {
                'C': [0.1, 1, 10, 100, 1000],
                'degree': [2, 3, 4],
                'kernel': ['poly']
            },
            {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [0.001, 0.0001],
                'kernel': ['rbf']
            },
        ]
        classifier = SVC(cache_size=1500,
                         random_state=int(params_info_dict['random_state']))
        if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None
        config_gen = [{}]
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))

            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # if(os.path.isfile(test_output_file)):
            # 	config = config_gen.GetNextConfigAlongWithIdentifier()
            # 	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            config["testset"] = testfiles[0]
            config["kernel"] = best_params['kernel']
            config['C'] = best_params['C']
            if (config['kernel'] == 'rbf'):
                config['gamma'] = best_params['gamma']
                classifier = SVC(config['C'],
                                 gamma=config['gamma'],
                                 kernel=config['kernel'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))
            else:
                config['degree'] = best_params['degree']
                classifier = SVC(config['C'],
                                 kernel=config['kernel'],
                                 degree=config['degree'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))

            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            config['numsupportvectors'] = u.ConcatToStr(
                ';', classifier.n_support_)
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)
            u.WriteBinaryFile(model_output_file, classifier)
            # now for test set
            config["predictionoutputfile"] = test_output_file

            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Code example #22
def KnnAnalysis(output_root, output_file_prefix, metrics_file):
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelevaltimesecs': ['mean']
    }

    # Keys for the imbalance and noise variants match the params written by the
    # dataset split generators below ('imbalance_perc', 'noise_perc'); indexing
    # dataset_types beyond [0] would raise an IndexError here.
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        dataset_types[0]: 'Train size % used',
        'imbalance_perc': 'Fraction of positives to negatives',
        'noise_perc': 'Noise %',
        'modelevaltimesecs': 'Time to run Knn model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]) & (x['istrain'] == 0))

        def distance_weights_filter(x):
            return x['weights'] == 'distance'

        def uniform_weights_filter(x):
            return x['weights'] == 'uniform'

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'weights', 'neighbors'])
        x = data_agg[dataset_type].unique()
        for k, v in col_funcs.items():
            for agg in v:
                data_for_distance_based_weighting = FilterRows(
                    data_agg, distance_weights_filter)
                nneighbors = [5, 10, 20, 50]
                marker_and_color_map = {
                    5: ('g', 'o'),
                    10: ('r', '+'),
                    20: ('b', 'x'),
                    50: ('k', 'd')
                }
                y_series = []
                for n in nneighbors:
                    d = data_for_distance_based_weighting[
                        data_for_distance_based_weighting['neighbors'] == n]
                    y = u.YSeries(d[k + "_" + agg],
                                  line_color=marker_and_color_map[n][0],
                                  points_marker=marker_and_color_map[n][1],
                                  plot_legend_label="k = " + str(n))
                    y_series.append(y)
                output_file_name = u.PreparePath(
                    "{4}/{0}.{1}.weighted.{2}.{3}.png".format(
                        output_file_prefix, dataset_type, k, agg, output_root))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'K Nearest Neighbor')

                data_for_uniform_weighting = FilterRows(
                    data_agg, uniform_weights_filter)
                y_series = []
                for n in nneighbors:
                    d = data_for_uniform_weighting[
                        data_for_uniform_weighting['neighbors'] == n]
                    y = u.YSeries(d[k + "_" + agg],
                                  line_color=marker_and_color_map[n][0],
                                  points_marker=marker_and_color_map[n][1],
                                  plot_legend_label="k = " + str(n))
                    y_series.append(y)
                output_file_name = u.PreparePath(
                    "{4}/{0}.{1}.uniform.{2}.{3}.png".format(
                        output_file_prefix, dataset_type, k, agg, output_root))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'K Nearest Neighbor')
    return data_agg
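FilterRows and GetAggMetrics are repo helpers; judging from the 'm_mean'-style columns consumed above, the aggregation is a groupby whose multi-level columns are flattened with an underscore. A rough pandas equivalent, under that assumption:

import pandas as pd

df = pd.DataFrame({'istrain': [0, 0, 0, 1],
                   'weights': ['distance', 'distance', 'uniform', 'distance'],
                   'neighbors': [5, 10, 5, 5],
                   'm': [0.82, 0.85, 0.80, 0.95]})
data = df[df['istrain'] == 0]                     # FilterRows equivalent
agg = data.groupby(['weights', 'neighbors']).agg({'m': ['mean']})
agg.columns = ['_'.join(c) for c in agg.columns]  # produces 'm_mean'
print(agg.reset_index())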
Code example #23
def DecisionTreeAnalysis(output_root,
                         output_file_prefix,
                         metrics_file,
                         dataset_filter_fn=None,
                         plt_title="Decision Trees Performance",
                         y_axis_name='F-Measure'):

    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return ~np.isnan(x[dataset_type])

        def train_prune_filter(x):
            return x['prune'] & (x['istrain'] == 1)

        def train_no_prune_filter(x):
            return (x['prune'] == False) & (x['istrain'] == 1)

        def test_prune_filter(x):
            return x['prune'] & (x['istrain'] == 0)

        def test_no_prune_filter(x):
            return (x['prune'] == False) & (x['istrain'] == 0)

        if (dataset_filter_fn is not None):
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'prune', 'istrain'])
        x = data_agg[dataset_type].unique()

        for k, v in col_funcs.items():
            for agg in v:
                y_train_prune = u.YSeries(
                    FilterRows(data_agg, train_prune_filter)[k + "_" + agg],
                    line_color='r',
                    points_marker='o',
                    plot_legend_label="Train_with_pruning")
                y_train_no_prune = u.YSeries(
                    FilterRows(data_agg, train_no_prune_filter)[k + "_" + agg],
                    line_color='r',
                    points_marker='x',
                    plot_legend_label="Train_without_pruning")
                y_test_prune = u.YSeries(
                    FilterRows(data_agg, test_prune_filter)[k + "_" + agg],
                    line_color='b',
                    points_marker='o',
                    plot_legend_label="Validation_with_pruning")
                y_no_test_prune = u.YSeries(
                    FilterRows(data_agg, test_no_prune_filter)[k + "_" + agg],
                    line_color='b',
                    points_marker='x',
                    plot_legend_label="Validation_without_pruning")

                if (len(y_train_prune.values) == 0):
                    y_no_test_prune.plot_legend_label = "Validation"
                    y_train_no_prune.plot_legend_label = "Train"
                    if ((k == 'modelbuildtimesecs')):
                        y_series = [y_train_no_prune]
                    else:
                        y_series = [y_no_test_prune, y_train_no_prune]
                else:
                    if ((k == 'modelbuildtimesecs')):
                        y_series = [y_train_no_prune, y_train_prune]
                    else:
                        y_series = [
                            y_test_prune, y_no_test_prune, y_train_no_prune,
                            y_train_prune
                        ]

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], plt_title)
    return data_agg
Code example #24
def RunDecisionTreesWithOptimalInst(datasets_root_folder,
                                    weka_jar_path,
                                    cv_results_file,
                                    use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:

            filter = lambda x: (x['prune'] == False) & (x[
                filter_name] == filter_val) & (x[
                    'istrain'] == 1)  # this will output on the held out set
            filtered_rows = u.FilterRows(cv_results, filter)
            a = filtered_rows['m']
            if (len(a) == 0):
                print("ignoring : {0}".format(dataset_dir))
                continue
            b = np.max(filtered_rows['m'])
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs]
            best_insts = best_insts.iloc[0]['inst']
            config['inst'] = best_insts

            id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Code example #25
def GenerateDatasetSplits(rootFolder,
                          dataset_folder_prefix,
                          dataset,
                          test_ratio,
                          train_ratio,
                          validation_ratio,
                          train_size_percentages,
                          class_col,
                          random_state,
                          arff_attr_info=None):
    """
    train_size_percentages is a list of intergers specifying the
    percent of train set to be taken while preparing the dataset

    test_ratio,train_ratio,validation_ratio : numbers in percentages
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    if (validation is not None):
        validation_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        validation.to_csv(validation_output_file_csv, index=False)
        test_output_file_csv = u.PreparePath("{0}/i-{1}.realtest.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
        test_output_file_arff = u.PreparePath("{0}/i-{1}.realtest.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    else:
        test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)

    # now creating the train set partitions
    for train_set_size in train_size_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_ts-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, train_set_size))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.train.csv".format(folder_path,
                                                      dataset_folder_prefix,
                                                      train_ratio,
                                                      train_set_size))
        rows_to_keep = int(len(train) * train_set_size / 100)
        train.head(rows_to_keep).to_csv(csv_output_file, index=False)
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_ts-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    train_set_size))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)

        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "train_split_percent_used={0}".format(train_set_size)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.params.txt".format(folder_path,
                                                       dataset_folder_prefix,
                                                       train_ratio,
                                                       train_set_size))
        u.WriteTextArrayToFile(params_out_file, params_info)
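A hypothetical invocation (the path, DataFrame, and class column are placeholders, not from the repo): with these arguments the function writes <rootFolder>/i-0_t-80_T-20/ containing the shared test split, plus one i-0_t-80_ts-<p> subfolder with its train CSV and params file per percentage:

import pandas as pd

df = pd.read_csv("DataSets/LetterRecognition/data.csv")  # placeholder path
GenerateDatasetSplits(rootFolder="DataSets/LetterRecognition",
                      dataset_folder_prefix=0,
                      dataset=df,
                      test_ratio=20,
                      train_ratio=80,
                      validation_ratio=0,
                      train_size_percentages=[20, 50, 100],
                      class_col="class",  # placeholder column name
                      random_state=0,
                      arff_attr_info=None)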
Code example #26
def GenerateDatasetSplitsForWithNoise(rootFolder,
                                      dataset_folder_prefix,
                                      dataset,
                                      test_ratio,
                                      train_ratio,
                                      validation_ratio,
                                      noise_percentages,
                                      class_col,
                                      flip_fn,
                                      random_state,
                                      arff_attr_info=None):
    """
	train_size_percentages is a list of intergers specifying the
	percent of train set to be taken while preparing the dataset

	test_ratio,train_ratio,validation_ratio : numbers in percentages
	"""
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)

    # now creating the train set partitions
    for noise_perc in noise_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_noise-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, noise_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.train.csv".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))

        noisy_dataset = CreateNoisyDataset(train, class_col, noise_perc / 100,
                                           random_state, flip_fn)
        noisy_dataset.to_csv(csv_output_file, index=False)

        print("done noisy : " + str(noise_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_noise-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    noise_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)

        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "noise_perc={0}".format(noise_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.params.txt".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
Code example #27
def GenerateDatasetSplitsForClassImbalance(rootFolder,
                                           dataset_folder_prefix,
                                           dataset,
                                           test_ratio,
                                           train_ratio,
                                           validation_ratio,
                                           imbalance_percentages,
                                           class_col,
                                           minority_label,
                                           min_minority_to_keep,
                                           random_state,
                                           arff_attr_info=None,
                                           train_set=None,
                                           test_set=None):
    """
	train_size_percentages is a list of intergers specifying the
	percent of train set to be taken while preparing the dataset

	test_ratio,train_ratio,validation_ratio : numbers in percentages
	"""
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    if ((train_set is not None) & (test_set is not None)):
        train = train_set
        test = test_set
    else:
        train, test, validation = CreateTrainTestAndValidationPartitions(
            dataset, class_col, train_ratio / 100, test_ratio / 100,
            random_state, validation_ratio / 100)
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)

    # now creating the train set partitions
    for imbalance_perc in imbalance_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_im-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, imbalance_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.train.csv".format(folder_path,
                                                      dataset_folder_prefix,
                                                      train_ratio,
                                                      imbalance_perc))
        imbalance_dataset = CreateImbalancedDataSet(train, class_col,
                                                    minority_label,
                                                    imbalance_perc / 100,
                                                    min_minority_to_keep,
                                                    random_state)
        imbalance_dataset.to_csv(csv_output_file, index=False)
        print("done imb : " + str(imbalance_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_im-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    imbalance_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)

        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "minority_label={0}".format(minority_label),
            "imbalance_perc={0}".format(imbalance_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.params.txt".format(folder_path,
                                                       dataset_folder_prefix,
                                                       train_ratio,
                                                       imbalance_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
Code example #28
def RunDecisionTrees(datasets_root_folder, weka_jar_path, use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        # The 'inst' sweep is run only on the full train split; the first
        # subdirectory is expected to be the ts-100 one, and the loop stops
        # after it.
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        else:
            break
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        config_gen = ParameterGrid({
            'prune': [False],
            'inst': [2, 5, 8, 12, 15]
        })
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set
            #config["predictionoutputfile"] = test_output_file
            #config["testset"] = testfiles[0]
            #cmd = GetWekaCommandLineForConfig(config,True)
            #config["modelevaltimesecs"] = timeit.timeit(lambda : sl.RunCmdWithoutConsoleWindow(cmd),number=1)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Code example #29
def RunKNNClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     metric_fn=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/knn", is_file=False)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        train_len = len(data)

        cols_to_ignore = set(
            nominal_value_columns) if nominal_value_columns is not None else set()
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [
            c for c in data.columns if c not in cols_to_ignore
        ]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])

        all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:, :]
        test_Y = Y_all[train_len:]

        param_grid = {
            'weights': np.array(['uniform', 'distance']),
            'n_neighbors': np.array([5, 10, 20, 50])
        }
        classifier = KNeighborsClassifier()
        if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None

        config_gen = ParameterGrid({
            'weights': ['uniform'],
            'neighbors': [-1]
        })  # -1 denotes that we need to take the cv results
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            scalar_output_file = u.PreparePath("{0}/{1}.scaler".format(
                run_output_dir, id))
            if (cv_file is not None):
                cv_file = cv_file
            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])

            # if(os.path.isfile(test_output_file)):
            #	config = config_gen.GetNextConfigAlongWithIdentifier()
            #	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            config["testset"] = testfiles[0]

            if (config['neighbors'] == -1):
                neighbors = best_params['n_neighbors']
                weights = best_params['weights']
                # _D.to_csv(cv_results_file)
                config['best_neighbors'] = neighbors
                config['best_weights'] = weights
            else:
                neighbors = config['neighbors']
                weights = config['weights']
            if (metric_fn is None):
                classifier = KNeighborsClassifier(n_neighbors=neighbors,
                                                  weights=weights)
            else:
                # Recent scikit-learn accepts a callable directly as `metric`;
                # the old metric='pyfunc' + metric_params form no longer exists.
                classifier = KNeighborsClassifier(n_neighbors=neighbors,
                                                  weights=weights,
                                                  algorithm='brute',
                                                  metric=metric_fn)

            loo = LeaveOneOut()
            y_actual = []
            y_predicted = []
            count = 0
            total = len(X)
            for train_idx, test_idx in loo.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                Y_train, Y_test = Y[train_idx], Y[test_idx]
                classifier.fit(X_train, Y_train)
                Y_test_predicted = classifier.predict(X_test)
                assert (len(Y_test_predicted) == 1)
                y_actual.append(Y_test[0])
                y_predicted.append(Y_test_predicted[0])
                count = count + 1
                if (count % 100 == 0):
                    print("{0}/{1}".format(count, total))

            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            #train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": y_actual,
                "predicted": y_predicted
            })
            output.to_csv(train_output_file, index=False)

            # now for test set
            config["predictionoutputfile"] = test_output_file

            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            u.WriteBinaryFile(model_output_file, classifier)
            u.WriteBinaryFile(scaler_output_file, scaler)
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("DONE dataset : " + dataset_dir)
Code example #30
File: NeuralNetwork.py Project: shwetabh-khanduja/ml
def RunNeuralNetClassifier(datasets_root_folder,
                           one_hot_encoding_cols=None,
                           positive_class_label=None,
                           cv_file_format=None,
                           cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/nnets", is_file=False)
        config_gen = nnconfig()
        config = config_gen.GetNextConfigAlongWithIdentifier()
        while (config is not None):
            id = config["id"]
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # no separate cv is done for early stopping.
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id)).replace("True", "False")
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            # if(os.path.isfile(cv_results_file)):
            # 	config = config_gen.GetNextConfigAlongWithIdentifier()
            # 	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            data = pd.read_csv(trainfile)
            config["testset"] = testfiles[0]
            testdata = pd.read_csv(config["testset"])
            train_len = len(data)

            cols_to_ignore = set(
                one_hot_encoding_cols) if one_hot_encoding_cols is not None else set()
            cols_to_ignore.add(data.columns[-1])
            cols_to_transform = [
                c for c in data.columns if c not in cols_to_ignore
            ]
            scaler = StandardScaler()
            scaler.fit(data[cols_to_transform])
            data[cols_to_transform] = scaler.transform(data[cols_to_transform])
            testdata[cols_to_transform] = scaler.transform(
                testdata[cols_to_transform])

            all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
            X_all, Y_all = PrepareDataAndLabel(all_data, positive_class_label,
                                               one_hot_encoding_cols)
            X = X_all[0:train_len, :]
            Y = Y_all[0:train_len]
            test_X = X_all[train_len:, :]
            test_Y = Y_all[train_len:]

            hidden_layers = [(10, ), (30, ), (50, ), (70, )]
            init_learning_rates = [0.1, 0.01, 0.001, 0.0001]
            alpha = [0.01, 0.1, 1, 10, 100]
            momentum = 0.9
            max_iter = 200
            early_stopping = config["earlystopping"]
            validation_fraction = 0.3
            random_state = int(params_info_dict["random_state"])
            solver = 'sgd'

            # grid-search CV (3-fold in older scikit-learn; newer versions default to 5-fold)
            param_grid = {
                "alpha": alpha,
                "learning_rate_init": init_learning_rates,
                "hidden_layer_sizes": hidden_layers
            }
            classifier = MLPClassifier(activation="logistic",
                                       momentum=momentum,
                                       early_stopping=early_stopping,
                                       verbose=False,
                                       validation_fraction=validation_fraction,
                                       random_state=random_state,
                                       solver="sgd",
                                       max_iter=max_iter)
            cv_file = None
            if (cv_file_format is not None):
                cv_file = cv_file_format.format(id).replace("True", "False")
            if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
                gscv = GridSearchCV(classifier,
                                    param_grid,
                                    scoring=cv_scoring,
                                    n_jobs=3)
                gscv.fit(X, Y)
                _D = pd.DataFrame(gscv.cv_results_)
                best_params = gscv.best_params_
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # gscv = GridSearchCV(classifier,param_grid,scoring='f1',n_jobs=3)
            # gscv.fit(X,Y)
            # _D = pd.DataFrame(gscv.cv_results_)
            # _D.to_csv(cv_results_file)
            classifier = MLPClassifier(
                hidden_layer_sizes=best_params["hidden_layer_sizes"],
                activation="logistic",
                momentum=momentum,
                early_stopping=early_stopping,
                verbose=True,
                validation_fraction=validation_fraction,
                random_state=random_state,
                solver="sgd",
                max_iter=max_iter,
                learning_rate_init=best_params["learning_rate_init"],
                alpha=best_params["alpha"])
            start = time.perf_counter()
            classifier.fit(X, Y)
            end = time.perf_counter()

            config['momentum'] = momentum
            config["hidden_layers"] = "10;30;50;70"
            config["alphas"] = u.ConcatToStr(";", alpha)
            config["init_learning_rates"] = u.ConcatToStr(
                ";", init_learning_rates)
            config["total_iter"] = classifier.n_iter_
            config["time_per_iter"] = (end - start) / classifier.n_iter_
            config["best_alpha"] = best_params["alpha"]
            config["best_hidden_layer_sizes"] = best_params[
                "hidden_layer_sizes"][0]
            config["best_init_learning_rate"] = best_params[
                "learning_rate_init"]
            config["loss_curve"] = u.ConcatToStr(";", classifier.loss_curve_)

            config["random_state"] = random_state
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)

            # now for test set
            config["predictionoutputfile"] = test_output_file

            u.WriteBinaryFile(model_output_file, classifier)

            #test_X,test_Y = PrepareDataAndLabel(data,positive_class_label,one_hot_encoding_cols)
            predicted_Y = classifier.predict(test_X)
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
            config = config_gen.GetNextConfigAlongWithIdentifier()
        print("done dataset : " + dataset_dir)