Example no. 1
def LoadWineDataSet(
        root=r"C:\Users\shkhandu\OneDrive\Gatech\Courses\ML\DataSets\WineDataset"
):
    datafile = root + r"\wine.csv"
    headerfile = root + r"\schema.txt"
    data = pd.read_csv(datafile, header="infer")
    col_names = u.ReadLinesFromFile(headerfile)[0].split(',')
    data.columns = col_names
    return data
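All of the examples in this listing lean on a small utility module imported as u. Its implementation is not part of the listing; the following is only a minimal sketch, assuming the three most-used helpers behave the way the call sites suggest (read a file into a list of lines, write a list of lines back out, and create the output directory before returning the path).

import os

def ReadLinesFromFile(filepath):
    # Assumed behavior: return the file's lines with trailing newlines stripped.
    with open(filepath, "r") as f:
        return [line.rstrip("\n") for line in f]

def WriteTextArrayToFile(filepath, lines):
    # Assumed behavior: write one entry per line.
    with open(filepath, "w") as f:
        f.write("\n".join(lines))

def PreparePath(path, is_file=True):
    # Assumed behavior: make sure the target directory exists, then return the path.
    dir_path = os.path.dirname(path) if is_file else path
    if dir_path and not os.path.isdir(dir_path):
        os.makedirs(dir_path)
    return path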
Example no. 2
 def write_to_file(data_to_write, filepath):
     # include_header and arff_format_predata_lines come from the enclosing scope.
     file = u.PreparePath(filepath)
     data_to_write.to_csv(file,
                          index=False,
                          header=(include_header
                                  and (arff_format_predata_lines is None)))
     if (arff_format_predata_lines is not None):
         # Prepend the ARFF header lines to the data rows just written.
         data = []
         data.extend(arff_format_predata_lines)
         data.extend(u.ReadLinesFromFile(file))
         u.WriteTextArrayToFile(file, data)
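Example no. 2 is a nested helper: include_header and arff_format_predata_lines are not parameters but variables captured from the enclosing function. A hypothetical enclosing context (names here are illustrative, not taken from the original code) would look roughly like this:

def SaveDataFrame(data, filepath, include_header=True, arff_format_predata_lines=None):
    # Hypothetical outer function; write_to_file from Example no. 2 closes over
    # include_header and arff_format_predata_lines defined here.
    def write_to_file(data_to_write, filepath):
        ...  # body as shown in Example no. 2

    write_to_file(data, filepath)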
Example no. 3
def RunAdaBoostWithDecisionTreesToGeneratePerIterationMetrics(datasets_root_folder,
                                                              weka_jar_path,
                                                              dataset_filter,
                                                              iters,
                                                              inst,
                                                              use_arff_files=True):
    """
    weightThreshold parameter : http://weka.8497.n7.nabble.com/AdaBoost-Parameters-td11830.html
    """
    file_extn = "arff" if use_arff_files else "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (dataset_filter not in dataset_dir):
            continue
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/ada", is_file=False)
        config_gen = ParameterGrid({'prune': [True, False], 'iter': iters})
        for config in config_gen:
            id = GetIdForConfig(config)
            config["inst"] = inst
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            full_train_output_file = u.PreparePath("{0}/{1}.fulltrain.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now evaluate on the test set
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            os.remove(model_output_file)

            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Example no. 4
def CreateArffFileFromCsv(arff_attr_info,
                          arff_file_path,
                          data_text_array,
                          isFile=False,
                          hasHeader=True):
    arff_data = []
    arff_data.extend(arff_attr_info)
    data_text_array = u.ReadLinesFromFile(data_text_array) if (
        isFile) else data_text_array
    data_text_array = data_text_array[1:] if (
        isFile and hasHeader) else data_text_array
    arff_data.extend(data_text_array)
    file = u.PreparePath(arff_file_path)
    u.WriteTextArrayToFile(file, arff_data)
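A hedged usage sketch for CreateArffFileFromCsv: the ARFF attribute declarations are assumed to come from a schema file (as in the dataset loaders further below, which read them with u.ReadLinesFromFile), and the CSV is passed as a path so its header row gets dropped. The file names are illustrative only.

# Illustrative file names; the attribute file is assumed to hold the
# @RELATION/@ATTRIBUTE/@DATA lines expected at the top of an ARFF file.
arff_attrs = u.ReadLinesFromFile("wine.arff_attrs.txt")
CreateArffFileFromCsv(arff_attrs,
                      "wine.train.arff",
                      "wine.train.csv",
                      isFile=True,
                      hasHeader=True)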
Example no. 5
def EvaluateExperiments(datasets_root_folder,
                        params_to_keep,
                        positive_class,
                        metric_calculation_fn,
                        evaluation_output_filename="performance.csv",
                        algo_folder="dt",
                        should_eval=lambda x: True):

    headers = []
    headers.extend(params_to_keep)
    # columns after the kept params: istrain flag, then precision, recall and
    # the metric value returned by metric_calculation_fn
    headers.extend(['istrain', 'p', 'r', 'm'])
    headers = ",".join(headers)
    evals = []
    evals.append(headers)
    for directory in u.Get_Subdirectories(datasets_root_folder):
        # each directory is a dataset directory
        dt_output_dir = "{0}/{1}".format(directory, algo_folder)
        if (not os.path.isdir(dt_output_dir)):
            continue
        for run_output_folder in u.Get_Subdirectories(dt_output_dir):
            if (not should_eval(run_output_folder)):
                print("ignoring : {0}".format(run_output_folder))
                continue
            # read params file
            params_file_path = glob.glob(
                "{0}/*.params.txt".format(run_output_folder))[0]
            params = sl.GetDictionary(u.ReadLinesFromFile(params_file_path))
            values = []
            for k in params_to_keep:
                if (k in params):
                    values.append(str(params[k]))
                else:
                    values.append(str(np.nan))
            p, r, f = metric_calculation_fn(
                params["trainpredictionoutputfile"], positive_class)
            train_performance_values = "{0},1,{1},{2},{3}".format(
                ",".join(values), str(p), str(r), str(f))
            evals.append(train_performance_values)
            if (os.path.isfile(params["testpredictionoutputfile"])):
                p, r, f = metric_calculation_fn(
                    params["testpredictionoutputfile"], positive_class)
                test_performance_values = "{0},0,{1},{2},{3}".format(
                    ",".join(values), str(p), str(r), str(f))
                evals.append(test_performance_values)
    u.WriteTextArrayToFile(
        u.PreparePath("{0}/{1}".format(datasets_root_folder,
                                       evaluation_output_filename)), evals)
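EvaluateExperiments leaves the actual scoring to metric_calculation_fn, which is given a predictions file and the positive class and must return precision, recall and a summary metric. Below is a minimal sketch of such a function, assuming the prediction CSVs carry the 'actual' and 'predicted' columns that the classifier runners later in this listing write; the function name is hypothetical.

import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

def ComputeBinaryClassificationMetrics(prediction_file, positive_class):
    # Hypothetical metric_calculation_fn: score a predictions CSV that has
    # 'actual' and 'predicted' columns against the given positive class.
    predictions = pd.read_csv(prediction_file)
    y_true = predictions['actual'] == positive_class
    y_pred = predictions['predicted'] == positive_class
    return (precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred))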
Example no. 6
 def parameter_getter(path):
     paramfile = "{0}/knn/weights-uniform_neighbors--1/weights-uniform_neighbors--1.params.txt".format(
         path)
     params_info = u.ReadLinesFromFile(paramfile)
     params_info_dict = sl.GetDictionary(params_info)
     return int(params_info_dict['train_split_percent_used'])
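The run folder name embedded in this path ("weights-uniform_neighbors--1") suggests that GetIdForConfig, used throughout the other examples, simply joins the config's key-value pairs. A minimal assumed version, for reference only:

def GetIdForConfig(config):
    # Assumed behavior, inferred from folder names such as
    # "weights-uniform_neighbors--1": join "key-value" pairs with underscores.
    return "_".join("{0}-{1}".format(k, v) for k, v in config.items())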
Example no. 7
 def parameter_getter(path):
     paramfile = "{0}/svm/cvresults/cvresults.params.txt".format(path)
     params_info = u.ReadLinesFromFile(paramfile)
     params_info_dict = sl.GetDictionary(params_info)
     return int(params_info_dict['train_split_percent_used'])
Example no. 8
 def parameter_getter(path):
     paramfile = "{0}/nnets/{1}/{1}.params.txt".format(path, stopping)
     params_info = u.ReadLinesFromFile(paramfile)
     params_info_dict = sl.GetDictionary(params_info)
     return int(params_info_dict[parameter_name])
Example no. 9
def LoadCharacterRecognitionDataset(file, arff_attr_file=None):
    data = pd.read_csv(file)
    if (arff_attr_file is not None):
        arff_attrs = u.ReadLinesFromFile(arff_attr_file)
        return data, arff_attrs
    return data, None
Example no. 10
def LoadCreditScreeningData(file, arff_attr_file=None):
    data = pd.read_csv(file)
    if (arff_attr_file is not None):
        arff_attrs = u.ReadLinesFromFile(arff_attr_file)
        return data, arff_attrs
    return data, None
Example no. 11
def RunNeuralNetClassifier(datasets_root_folder,
                           one_hot_encoding_cols=None,
                           positive_class_label=None,
                           cv_file_format=None,
                           cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/nnets", is_file=False)
        config_gen = nnconfig()
        config = config_gen.GetNextConfigAlongWithIdentifier()
        while (config is not None):
            id = config["id"]
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # no separate cv is done for early stopping.
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id)).replace("True", "False")
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            # if(os.path.isfile(cv_results_file)):
            # 	config = config_gen.GetNextConfigAlongWithIdentifier()
            # 	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            data = pd.read_csv(trainfile)
            config["testset"] = testfiles[0]
            testdata = pd.read_csv(config["testset"])
            train_len = len(data)

            cols_to_ignore = set(
                one_hot_encoding_cols
            ) if one_hot_encoding_cols is not None else set([])
            cols_to_ignore.add(data.columns[-1])
            cols_to_transform = [
                c for c in data.columns if c not in cols_to_ignore
            ]
            scaler = StandardScaler()
            scaler.fit(data[cols_to_transform])
            data[cols_to_transform] = scaler.transform(data[cols_to_transform])
            testdata[cols_to_transform] = scaler.transform(
                testdata[cols_to_transform])

            all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
            X_all, Y_all = PrepareDataAndLabel(all_data, positive_class_label,
                                               one_hot_encoding_cols)
            X = X_all[0:train_len, :]
            Y = Y_all[0:train_len]
            test_X = X_all[train_len:, :]
            test_Y = Y_all[train_len:]

            hidden_layers = [(10, ), (30, ), (50, ), (70, )]
            init_learning_rates = [0.1, 0.01, 0.001, 0.0001]
            alpha = [0.01, 0.1, 1, 10, 100]
            momentum = 0.9
            max_iter = 200
            early_stopping = config["earlystopping"]
            validation_fraction = 0.3
            random_state = int(params_info_dict["random_state"])
            solver = 'sgd'

            #for doing 3-fold CV
            param_grid = {
                "alpha": alpha,
                "learning_rate_init": init_learning_rates,
                "hidden_layer_sizes": hidden_layers
            }
            classifier = MLPClassifier(activation="logistic",
                                       momentum=momentum,
                                       early_stopping=early_stopping,
                                       verbose=False,
                                       validation_fraction=validation_fraction,
                                       random_state=random_state,
                                       solver="sgd",
                                       max_iter=max_iter)
            cv_file = None
            if (cv_file_format is not None):
                cv_file = cv_file_format.format(id).replace("True", "False")
            if ((cv_file is None) or (not os.path.isfile(cv_file))):
                gscv = GridSearchCV(classifier,
                                    param_grid,
                                    scoring=cv_scoring,
                                    n_jobs=3)
                gscv.fit(X, Y)
                _D = pd.DataFrame(gscv.cv_results_)
                best_params = gscv.best_params_
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # gscv = GridSearchCV(classifier,param_grid,scoring='f1',n_jobs=3)
            # gscv.fit(X,Y)
            # _D = pd.DataFrame(gscv.cv_results_)
            # _D.to_csv(cv_results_file)
            classifier = MLPClassifier(
                hidden_layer_sizes=best_params["hidden_layer_sizes"],
                activation="logistic",
                momentum=momentum,
                early_stopping=early_stopping,
                verbose=True,
                validation_fraction=validation_fraction,
                random_state=random_state,
                solver="sgd",
                max_iter=max_iter,
                learning_rate_init=best_params["learning_rate_init"],
                alpha=best_params["alpha"])
            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()

            config['momentum'] = momentum
            config["hidden_layers"] = "10;30;50;70"
            config["alphas"] = u.ConcatToStr(";", alpha)
            config["init_learning_rates"] = u.ConcatToStr(
                ";", init_learning_rates)
            config["total_iter"] = classifier.n_iter_
            config["time_per_iter"] = (end - start) / classifier.n_iter_
            config["best_alpha"] = best_params["alpha"]
            config["best_hidden_layer_sizes"] = best_params[
                "hidden_layer_sizes"][0]
            config["best_init_learning_rate"] = best_params[
                "learning_rate_init"]
            config["loss_curve"] = u.ConcatToStr(";", classifier.loss_curve_)

            config["random_state"] = random_state
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)

            # now for test set
            config["predictionoutputfile"] = test_output_file

            u.WriteBinaryFile(model_output_file, classifier)

            #test_X,test_Y = PrepareDataAndLabel(data,positive_class_label,one_hot_encoding_cols)
            predicted_Y = classifier.predict(test_X)
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
            config = config_gen.GetNextConfigAlongWithIdentifier()
        print("done dataset : " + dataset_dir)
Example no. 12
def RunKNNClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     metric_fn=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/knn", is_file=False)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        train_len = len(data)

        cols_to_ignore = set(nominal_value_columns
                             ) if nominal_value_columns is not None else set(
                                 [])
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [
            c for c in data.columns if c not in cols_to_ignore
        ]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])

        all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:, :]
        test_Y = Y_all[train_len:]

        param_grid = {
            'weights': np.array(['uniform', 'distance']),
            'n_neighbors': np.array([5, 10, 20, 50])
        }
        classifier = KNeighborsClassifier()
        if ((cv_file is None) or (not os.path.isfile(cv_file))):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None

        config_gen = ParameterGrid({
            'weights': ['uniform'],
            'neighbors': [-1]
        })  # -1 denotes that we need to take the cv results
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            scaler_output_file = u.PreparePath("{0}/{1}.scaler".format(
                run_output_dir, id))
            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])

            # if(os.path.isfile(test_output_file)):
            #	config = config_gen.GetNextConfigAlongWithIdentifier()
            #	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            config["testset"] = testfiles[0]

            if (config['neighbors'] == -1):
                neighbors = best_params['n_neighbors']
                weights = best_params['weights']
                # _D.to_csv(cv_results_file)
                config['best_neighbors'] = neighbors
                config['best_weights'] = weights
            else:
                neighbors = config['neighbors']
                weights = config['weights']
            if (metric_fn is None):
                classifier = KNeighborsClassifier(n_neighbors=neighbors,
                                                  weights=weights)
            else:
                classifier = KNeighborsClassifier(
                    n_neighbors=neighbors,
                    weights=weights,
                    algorithm='brute',
                    metric='pyfunc',
                    metric_params={'func': metric_fn})

            # Leave-one-out CV: the train predictions written below come from
            # models fit on all-but-one samples rather than from the final classifier.
            loo = LeaveOneOut()
            y_actual = []
            y_predicted = []
            count = 0
            total = len(X)
            for train_idx, test_idx in loo.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                Y_train, Y_test = Y[train_idx], Y[test_idx]
                classifier.fit(X_train, Y_train)
                Y_test_predicted = classifier.predict(X_test)
                assert (len(Y_test_predicted) == 1)
                y_actual.append(Y_test[0])
                y_predicted.append(Y_test_predicted[0])
                count = count + 1
                if (count % 100 == 0):
                    print(str(count) + " " + str(total))

            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            #train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": y_actual,
                "predicted": y_predicted
            })
            output.to_csv(train_output_file, index=False)

            # now for test set
            config["predictionoutputfile"] = test_output_file

            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            u.WriteBinaryFile(model_output_file, classifier)
            u.WriteBinaryFile(scaler_output_file, scaler)
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("DONE dataset : " + dataset_dir)
Example no. 13
def RunDecisionTrees(datasets_root_folder, weka_jar_path, use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        # only the first subdirectory (the full "ts-100" split) is processed here
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        else:
            break
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        config_gen = ParameterGrid({
            'prune': [False],
            'inst': [2, 5, 8, 12, 15]
        })
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set
            #config["predictionoutputfile"] = test_output_file
            #config["testset"] = testfiles[0]
            #cmd = GetWekaCommandLineForConfig(config,True)
            #config["modelevaltimesecs"] = timeit.timeit(lambda : sl.RunCmdWithoutConsoleWindow(cmd),number=1)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Example no. 14
def RunDecisionTreesWithOptimalInst(datasets_root_folder,
                                    weka_jar_path,
                                    cv_results_file,
                                    use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:

            # select the unpruned runs for this dataset's filter value,
            # evaluated on the held out set (istrain == 1)
            filter = lambda x: ((x['prune'] == False) &
                                (x[filter_name] == filter_val) &
                                (x['istrain'] == 1))
            filtered_rows = u.FilterRows(cv_results, filter)
            a = filtered_rows['m']
            if (len(a) == 0):
                print("ignoring : {0}".format(dataset_dir))
                continue
            # keep the 'inst' value from (one of) the best-scoring runs
            b = np.max(a)
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs].iloc[0]['inst']
            config['inst'] = best_insts

            id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
Example no. 15
def RunSVMClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    realtestfiles = glob.glob("{0}/*.realtest.{1}".format(
        datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/svm", is_file=False)
        params_info = u.ReadLinesFromFile(paramfile)
        params_info_dict = sl.GetDictionary(params_info)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        realtestdata = pd.read_csv(realtestfiles[0])
        train_len = len(data)
        test_len = len(testdata) + train_len

        cols_to_ignore = set(nominal_value_columns
                             ) if nominal_value_columns is not None else set(
                                 [])
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [
            c for c in data.columns if c not in cols_to_ignore
        ]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])
        realtestdata[cols_to_transform] = scaler.transform(
            realtestdata[cols_to_transform])

        all_data = pd.concat([data, testdata, realtestdata],
                             axis=0,
                             ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:test_len, :]
        test_Y = Y_all[train_len:test_len]
        realtest_X = X_all[test_len:, :]
        realtest_Y = Y_all[test_len:]
        realtest_data_file = trainfile.replace(".train.",
                                               ".realtest.preprocessed.data.")
        realtest_label_file = trainfile.replace(
            ".train.", ".realtest.preprocessed.label.")
        np.savetxt(realtest_data_file, realtest_X, delimiter=',')
        np.savetxt(realtest_label_file, realtest_Y, delimiter=',')

        dataset_size = GetDataSetSize(dataset_dir)
        StoreData("train.csv", "train_label.csv", X, Y, dataset_size)
        StoreData("validation.csv", "validation_label.csv", test_X, test_Y,
                  dataset_size)
        StoreData("test.csv", "test_label.csv", realtest_X, realtest_Y,
                  dataset_size)

        param_grid = [
            {
                'C': [0.1, 1, 10, 100, 1000],
                'degree': [2, 3, 4],
                'kernel': ['poly']
            },
            {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [0.001, 0.0001],
                'kernel': ['rbf']
            },
        ]
        classifier = SVC(cache_size=1500,
                         random_state=int(params_info_dict['random_state']))
        if ((cv_file is None) or (not os.path.isfile(cv_file))):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None
        config_gen = [{}]
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))

            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # if(os.path.isfile(test_output_file)):
            # 	config = config_gen.GetNextConfigAlongWithIdentifier()
            # 	continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            config["testset"] = testfiles[0]
            config["kernel"] = best_params['kernel']
            config['C'] = best_params['C']
            if (config['kernel'] == 'rbf'):
                config['gamma'] = best_params['gamma']
                classifier = SVC(config['C'],
                                 gamma=config['gamma'],
                                 kernel=config['kernel'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))
            else:
                config['degree'] = best_params['degree']
                classifier = SVC(config['C'],
                                 kernel=config['kernel'],
                                 degree=config['degree'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))

            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            config['numsupportvectors'] = u.ConcatToStr(
                ';', classifier.n_support_)
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)
            u.WriteBinaryFile(model_output_file, classifier)
            # now for test set
            config["predictionoutputfile"] = test_output_file

            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)