def LoadWineDataSet(root=r"C:\Users\shkhandu\OneDrive\Gatech\Courses\ML\DataSets\WineDataset"):
    datafile = root + r"\wine.csv"
    headerfile = root + r"\schema.txt"
    data = pd.read_csv(datafile, header="infer")
    col_names = u.ReadLinesFromFile(headerfile)[0].split(',')
    data.columns = col_names
    return data
def write_to_file(data_to_write, filepath, include_header=True, arff_format_predata_lines=None):
    # include_header and arff_format_predata_lines were referenced but not defined in the
    # original; they appear to come from an enclosing scope and are exposed here as parameters.
    file = u.PreparePath(filepath)
    data_to_write.to_csv(file, index=False,
                         header=(include_header and (arff_format_predata_lines is None)))
    if arff_format_predata_lines is not None:
        # prepend the arff header lines to the csv rows that were just written
        data = []
        data.extend(arff_format_predata_lines)
        data.extend(u.ReadLinesFromFile(file))
        u.WriteTextArrayToFile(file, data)
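# A minimal usage sketch for write_to_file, assuming the signature above. The DataFrame
# contents, output paths and the ARFF header lines are hypothetical, not taken from the repo.
def _example_write_to_file():
    df = pd.DataFrame({"f1": [1.0, 2.0], "label": ["a", "b"]})
    # plain csv with a header row
    write_to_file(df, "out/data.csv")
    # arff-style output: the @relation/@attribute lines are prepended and the csv header is suppressed
    arff_lines = ["@relation demo",
                  "@attribute f1 numeric",
                  "@attribute label {a,b}",
                  "@data"]
    write_to_file(df, "out/data.arff", arff_format_predata_lines=arff_lines)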
def RunAdaBoostWithDecisionTreesToGeneratePerIterationMetrics(datasets_root_folder, weka_jar_path,
                                                              dataset_filter, iters, inst,
                                                              use_arff_files=True):
    """
    weightThreshold parameter : http://weka.8497.n7.nabble.com/AdaBoost-Parameters-td11830.html
    """
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if dataset_filter not in dataset_dir:
            continue
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/ada", is_file=False)
        config_gen = ParameterGrid({'prune': [True, False], 'iter': iters})
        for config in config_gen:
            id = GetIdForConfig(config)
            config["inst"] = inst
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            full_train_output_file = u.PreparePath("{0}/{1}.fulltrain.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for the test set
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            os.remove(model_output_file)

            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def CreateArffFileFromCsv(arff_attr_info, arff_file_path, data_text_array, isFile=False, hasHeader=True):
    arff_data = []
    arff_data.extend(arff_attr_info)
    data_text_array = u.ReadLinesFromFile(data_text_array) if isFile else data_text_array
    data_text_array = data_text_array[1:] if (isFile and hasHeader) else data_text_array
    arff_data.extend(data_text_array)
    file = u.PreparePath(arff_file_path)
    u.WriteTextArrayToFile(file, arff_data)
def EvaluateExperiments(datasets_root_folder,
                        params_to_keep,
                        positive_class,
                        metric_calculation_fn,
                        evaluation_output_filename="performance.csv",
                        algo_folder="dt",
                        should_eval=lambda x: True):
    headers = []
    headers.extend(params_to_keep)
    headers.extend(['istrain', 'p', 'r', 'm'])
    headers = ",".join(headers)
    evals = []
    evals.append(headers)
    for directory in u.Get_Subdirectories(datasets_root_folder):
        # each directory is a dataset directory
        dt_output_dir = "{0}/{1}".format(directory, algo_folder)
        if not os.path.isdir(dt_output_dir):
            continue
        for run_output_folder in u.Get_Subdirectories(dt_output_dir):
            if not should_eval(run_output_folder):
                print("ignoring : {0}".format(run_output_folder))
                continue
            # read params file
            params_file_path = glob.glob("{0}/*.params.txt".format(run_output_folder))[0]
            params = sl.GetDictionary(u.ReadLinesFromFile(params_file_path))
            values = []
            for k in params_to_keep:
                if k in params:
                    values.append(str(params[k]))
                else:
                    values.append(str(np.NaN))
            p, r, f = metric_calculation_fn(params["trainpredictionoutputfile"], positive_class)
            train_performance_values = "{0},1,{1},{2},{3}".format(
                ",".join(values), str(p), str(r), str(f))
            evals.append(train_performance_values)
            if os.path.isfile(params["testpredictionoutputfile"]):
                p, r, f = metric_calculation_fn(params["testpredictionoutputfile"], positive_class)
                test_performance_values = "{0},0,{1},{2},{3}".format(
                    ",".join(values), str(p), str(r), str(f))
                evals.append(test_performance_values)
    u.WriteTextArrayToFile(
        u.PreparePath("{0}/{1}".format(datasets_root_folder, evaluation_output_filename)),
        evals)
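# A sketch of a metric_calculation_fn that EvaluateExperiments can call. The sklearn-based
# runners below write prediction csvs with 'actual' and 'predicted' columns; the Weka-based
# runners may produce a different format, so this helper (its name and the use of
# sklearn.metrics) is an illustrative assumption, not part of the original repo.
def ComputePrecisionRecallF1(prediction_file, positive_class):
    from sklearn.metrics import precision_score, recall_score, f1_score
    predictions = pd.read_csv(prediction_file)
    y_true = predictions["actual"]
    y_pred = predictions["predicted"]
    p = precision_score(y_true, y_pred, pos_label=positive_class)
    r = recall_score(y_true, y_pred, pos_label=positive_class)
    f = f1_score(y_true, y_pred, pos_label=positive_class)
    return p, r, f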
def parameter_getter(path): paramfile = "{0}/knn/weights-uniform_neighbors--1/weights-uniform_neighbors--1.params.txt".format( path) params_info = u.ReadLinesFromFile(paramfile) params_info_dict = sl.GetDictionary(params_info) return int(params_info_dict['train_split_percent_used'])
def parameter_getter(path): paramfile = "{0}/svm/cvresults/cvresults.params.txt".format(path) params_info = u.ReadLinesFromFile(paramfile) params_info_dict = sl.GetDictionary(params_info) return int(params_info_dict['train_split_percent_used'])
def parameter_getter(path): paramfile = "{0}/nnets/{1}/{1}.params.txt".format(path, stopping) params_info = u.ReadLinesFromFile(paramfile) params_info_dict = sl.GetDictionary(params_info) return int(params_info_dict[parameter_name])
def LoadCharacterRecognitionDataset(file, arff_attr_file=None):
    data = pd.read_csv(file)
    if arff_attr_file is not None:
        arff_attrs = u.ReadLinesFromFile(arff_attr_file)
        return data, arff_attrs
    return data, None
def LoadCreditScreeningData(file, arff_attr_file=None):
    data = pd.read_csv(file)
    if arff_attr_file is not None:
        arff_attrs = u.ReadLinesFromFile(arff_attr_file)
        return data, arff_attrs
    return data, None
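# A small sketch tying the loaders to CreateArffFileFromCsv, with hypothetical paths:
# load the csv together with its arff attribute lines, then emit an arff copy of the data.
def _example_create_arff():
    data, arff_attrs = LoadCreditScreeningData(
        r"C:\datasets\credit.csv",                       # hypothetical path
        arff_attr_file=r"C:\datasets\credit.arff_attrs.txt")
    print(data.shape)
    CreateArffFileFromCsv(arff_attrs, r"C:\datasets\credit.arff",
                          data_text_array=r"C:\datasets\credit.csv",
                          isFile=True, hasHeader=True)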
def RunNeuralNetClassifier(datasets_root_folder,
                           one_hot_encoding_cols=None,
                           positive_class_label=None,
                           cv_file_format=None,
                           cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if first:
            assert "ts-100" in dataset_dir
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/nnets", is_file=False)
        config_gen = nnconfig()
        config = config_gen.GetNextConfigAlongWithIdentifier()
        while config is not None:
            id = config["id"]
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # no separate cv is done for early stopping.
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(run_output_dir, id)).replace("True", "False")
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))

            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            data = pd.read_csv(trainfile)
            config["testset"] = testfiles[0]
            testdata = pd.read_csv(config["testset"])
            train_len = len(data)

            # standardize every column except the label and any one-hot encoded columns
            cols_to_ignore = set(one_hot_encoding_cols) if one_hot_encoding_cols is not None else set([])
            cols_to_ignore.add(data.columns[-1])
            cols_to_transform = [c for c in data.columns if c not in cols_to_ignore]
            scaler = StandardScaler()
            scaler.fit(data[cols_to_transform])
            data[cols_to_transform] = scaler.transform(data[cols_to_transform])
            testdata[cols_to_transform] = scaler.transform(testdata[cols_to_transform])

            all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
            X_all, Y_all = PrepareDataAndLabel(all_data, positive_class_label, one_hot_encoding_cols)
            X = X_all[0:train_len, :]
            Y = Y_all[0:train_len]
            test_X = X_all[train_len:, :]
            test_Y = Y_all[train_len:]

            hidden_layers = [(10,), (30,), (50,), (70,)]
            init_learning_rates = [0.1, 0.01, 0.001, 0.0001]
            alpha = [0.01, 0.1, 1, 10, 100]
            momentum = 0.9
            max_iter = 200
            early_stopping = config["earlystopping"]
            validation_fraction = 0.3
            random_state = int(params_info_dict["random_state"])
            solver = 'sgd'  # for doing 3-fold CV
            param_grid = {
                "alpha": alpha,
                "learning_rate_init": init_learning_rates,
                "hidden_layer_sizes": hidden_layers
            }
            classifier = MLPClassifier(activation="logistic",
                                       momentum=momentum,
                                       early_stopping=early_stopping,
                                       verbose=False,
                                       validation_fraction=validation_fraction,
                                       random_state=random_state,
                                       solver=solver,
                                       max_iter=max_iter)
            cv_file = None
            if cv_file_format is not None:
                cv_file = cv_file_format.format(id).replace("True", "False")
            if (cv_file is None) or (os.path.isfile(cv_file) == False):
                gscv = GridSearchCV(classifier, param_grid, scoring=cv_scoring, n_jobs=3)
                gscv.fit(X, Y)
                _D = pd.DataFrame(gscv.cv_results_)
                best_params = gscv.best_params_
                _D.to_csv(cv_results_file)
            else:
                # reuse previously saved grid search results
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(
                    cv_results[cv_results['rank_test_score'] == 1].iloc[0]['params'])

            # retrain with the best parameters found by cross validation
            classifier = MLPClassifier(
                hidden_layer_sizes=best_params["hidden_layer_sizes"],
                activation="logistic",
                momentum=momentum,
                early_stopping=early_stopping,
                verbose=True,
                validation_fraction=validation_fraction,
                random_state=random_state,
                solver="sgd",
                max_iter=max_iter,
                learning_rate_init=best_params["learning_rate_init"],
                alpha=best_params["alpha"])
            start = time.clock()
            classifier.fit(X, Y)
            end = time.clock()

            config['momentum'] = momentum
            config["hidden_layers"] = "10;30;50;70"
            config["alphas"] = u.ConcatToStr(";", alpha)
            config["init_learning_rates"] = u.ConcatToStr(";", init_learning_rates)
            config["total_iter"] = classifier.n_iter_
            config["time_per_iter"] = (end - start) / classifier.n_iter_
            config["best_alpha"] = best_params["alpha"]
            config["best_hidden_layer_sizes"] = best_params["hidden_layer_sizes"][0]
            config["best_init_learning_rate"] = best_params["learning_rate_init"]
            config["loss_curve"] = u.ConcatToStr(";", classifier.loss_curve_)
            config["random_state"] = random_state
            config["modelbuildtimesecs"] = end - start

            # train set performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({"actual": Y, "predicted": train_predicted_Y})
            output.to_csv(train_output_file, index=False)

            # now for the test set
            config["predictionoutputfile"] = test_output_file
            u.WriteBinaryFile(model_output_file, classifier)
            predicted_Y = classifier.predict(test_X)
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
            config = config_gen.GetNextConfigAlongWithIdentifier()
        print("done dataset : " + dataset_dir)
def RunKNNClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     metric_fn=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if first:
            assert "ts-100" in dataset_dir
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/knn", is_file=False)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        train_len = len(data)

        # standardize every column except the label and any nominal columns
        cols_to_ignore = set(nominal_value_columns) if nominal_value_columns is not None else set([])
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [c for c in data.columns if c not in cols_to_ignore]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(testdata[cols_to_transform])

        all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label, nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:, :]
        test_Y = Y_all[train_len:]

        param_grid = {
            'weights': np.array(['uniform', 'distance']),
            'n_neighbors': np.array([5, 10, 20, 50])
        }
        classifier = KNeighborsClassifier()
        if (cv_file is None) or (os.path.isfile(cv_file) == False):
            gscv = GridSearchCV(classifier, param_grid, scoring=cv_scoring, n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None

        config_gen = ParameterGrid({
            'weights': ['uniform'],
            'neighbors': [-1]
        })  # -1 denotes that we need to take the cv results
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath("{0}/{1}.grid_search_cv_results.csv".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            scalar_output_file = u.PreparePath("{0}/{1}.scaler".format(run_output_dir, id))

            if _D is not None:
                _D.to_csv(cv_results_file)
            else:
                # reuse previously saved grid search results
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(
                    cv_results[cv_results['rank_test_score'] == 1].iloc[0]['params'])

            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]

            if config['neighbors'] == -1:
                neighbors = best_params['n_neighbors']
                weights = best_params['weights']
                config['best_neighbors'] = neighbors
                config['best_weights'] = weights
            else:
                neighbors = config['neighbors']
                weights = config['weights']

            if metric_fn is None:
                classifier = KNeighborsClassifier(neighbors, weights)
            else:
                classifier = KNeighborsClassifier(neighbors,
                                                  weights,
                                                  algorithm='brute',
                                                  metric='pyfunc',
                                                  metric_params={'func': metric_fn})

            # leave-one-out estimate of the training performance
            loo = LeaveOneOut()
            y_actual = []
            y_predicted = []
            count = 0
            total = len(X)
            for train_idx, test_idx in loo.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                Y_train, Y_test = Y[train_idx], Y[test_idx]
                classifier.fit(X_train, Y_train)
                Y_test_predicted = classifier.predict(X_test)
                assert len(Y_test_predicted) == 1
                y_actual.append(Y_test[0])
                y_predicted.append(Y_test_predicted[0])
                count = count + 1
                if count % 100 == 0:
                    print(str(count) + " " + str(total))

            start = time.clock()
            classifier.fit(X, Y)
            end = time.clock()
            print(end - start)
            config["modelbuildtimesecs"] = end - start

            # train set performance (from the leave-one-out predictions)
            config["trainpredictionoutputfile"] = train_output_file
            output = pd.DataFrame({"actual": y_actual, "predicted": y_predicted})
            output.to_csv(train_output_file, index=False)

            # now for the test set
            config["predictionoutputfile"] = test_output_file
            start = time.clock()
            predicted_Y = classifier.predict(test_X)
            end = time.clock()
            u.WriteBinaryFile(model_output_file, classifier)
            u.WriteBinaryFile(scalar_output_file, scaler)
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("DONE dataset : " + dataset_dir)
def RunDecisionTrees(datasets_root_folder, weka_jar_path, use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if first:
            assert "ts-100" in dataset_dir
            first = False
        else:
            break
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        config_gen = ParameterGrid({'prune': [False], 'inst': [2, 5, 8, 12, 15]})
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set (disabled here)
            # config["predictionoutputfile"] = test_output_file
            # config["testset"] = testfiles[0]
            # cmd = GetWekaCommandLineForConfig(config, True)
            # config["modelevaltimesecs"] = timeit.timeit(lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
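# A minimal end-to-end sketch, assuming hypothetical local paths and labels: run the
# decision tree grid through Weka, then collect precision/recall/f1 into one csv with
# EvaluateExperiments, using the illustrative ComputePrecisionRecallF1 helper defined above.
def _example_run_decision_trees():
    datasets_root = r"C:\datasets\CreditScreeningDataset"   # hypothetical path
    weka_jar = r"C:\weka\weka.jar"                          # hypothetical path
    RunDecisionTrees(datasets_root, weka_jar)
    EvaluateExperiments(datasets_root,
                        params_to_keep=["inst", "prune", "modelbuildtimesecs"],
                        positive_class="+",                 # hypothetical positive label
                        metric_calculation_fn=ComputePrecisionRecallF1,
                        evaluation_output_filename="dt.performance.csv",
                        algo_folder="dt")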
def RunDecisionTreesWithOptimalInst(datasets_root_folder, weka_jar_path, cv_results_file, use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:
            filter = lambda x: (x['prune'] == False) & (x[filter_name] == filter_val) & (
                x['istrain'] == 1)  # this will output on the held out set
            filtered_rows = u.FilterRows(cv_results, filter)
            a = filtered_rows['m']
            if len(a) == 0:
                print("ignoring : {0}".format(dataset_dir))
                continue
            b = np.max(filtered_rows['m'])
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs]
            best_insts = best_insts.iloc[0]['inst']
            config['inst'] = best_insts

            id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for the test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def RunSVMClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    realtestfiles = glob.glob("{0}/*.realtest.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if first:
            assert "ts-100" in dataset_dir
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/svm", is_file=False)
        params_info = u.ReadLinesFromFile(paramfile)
        params_info_dict = sl.GetDictionary(params_info)

        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        realtestdata = pd.read_csv(realtestfiles[0])
        train_len = len(data)
        test_len = len(testdata) + train_len

        # standardize every column except the label and any nominal columns
        cols_to_ignore = set(nominal_value_columns) if nominal_value_columns is not None else set([])
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [c for c in data.columns if c not in cols_to_ignore]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(testdata[cols_to_transform])
        realtestdata[cols_to_transform] = scaler.transform(realtestdata[cols_to_transform])

        all_data = pd.concat([data, testdata, realtestdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label, nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:test_len, :]
        test_Y = Y_all[train_len:test_len]
        realtest_X = X_all[test_len:, :]
        realtest_Y = Y_all[test_len:]

        realtest_data_file = trainfile.replace(".train.", ".realtest.preprocessed.data.")
        realtest_label_file = trainfile.replace(".train.", ".realtest.preprocessed.label.")
        np.savetxt(realtest_data_file, realtest_X, delimiter=',')
        np.savetxt(realtest_label_file, realtest_Y, delimiter=',')

        dataset_size = GetDataSetSize(dataset_dir)
        StoreData("train.csv", "train_label.csv", X, Y, dataset_size)
        StoreData("validation.csv", "validation_label.csv", test_X, test_Y, dataset_size)
        StoreData("test.csv", "test_label.csv", realtest_X, realtest_Y, dataset_size)

        param_grid = [
            {'C': [0.1, 1, 10, 100, 1000], 'degree': [2, 3, 4], 'kernel': ['poly']},
            {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
        ]
        classifier = SVC(cache_size=1500, random_state=int(params_info_dict['random_state']))
        if (cv_file is None) or (os.path.isfile(cv_file) == False):
            gscv = GridSearchCV(classifier, param_grid, scoring=cv_scoring, n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None

        config_gen = [{}]
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath("{0}/{1}.grid_search_cv_results.csv".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))

            if _D is not None:
                _D.to_csv(cv_results_file)
            else:
                # reuse previously saved grid search results
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(
                    cv_results[cv_results['rank_test_score'] == 1].iloc[0]['params'])

            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]

            config["kernel"] = best_params['kernel']
            config['C'] = best_params['C']
            if config['kernel'] == 'rbf':
                config['gamma'] = best_params['gamma']
                classifier = SVC(config['C'],
                                 gamma=config['gamma'],
                                 kernel=config['kernel'],
                                 cache_size=1500,
                                 random_state=int(params_info_dict['random_state']))
            else:
                config['degree'] = best_params['degree']
                classifier = SVC(config['C'],
                                 kernel=config['kernel'],
                                 degree=config['degree'],
                                 cache_size=1500,
                                 random_state=int(params_info_dict['random_state']))

            start = time.clock()
            classifier.fit(X, Y)
            end = time.clock()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            config['numsupportvectors'] = u.ConcatToStr(';', classifier.n_support_)

            # train set performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({"actual": Y, "predicted": train_predicted_Y})
            output.to_csv(train_output_file, index=False)
            u.WriteBinaryFile(model_output_file, classifier)

            # now for the test set
            config["predictionoutputfile"] = test_output_file
            start = time.clock()
            predicted_Y = classifier.predict(test_X)
            end = time.clock()
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)

            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)