def ComputeDiffInOptimalPolicyForMdp(mdp_folder, gammas, vi_file, pi_file, adjustForGoalState, outputFile):
    """Count, per discount factor, the states where the VI and PI policies differ.

    For every gamma in `gammas`, reads `<mdp_folder>/<gamma>/<vi_file>` and
    `<mdp_folder>/<gamma>/<pi_file>` (CSV, no header; columns 0 and 1 identify
    the state, columns 3-6 hold the per-action policy values), sorts both by
    state, and counts rows where any of the four action columns disagree.

    Args:
        mdp_folder: root folder containing one sub-folder per gamma.
        gammas: iterable of discount factors (used as sub-folder names).
        vi_file: value-iteration policy CSV file name.
        pi_file: policy-iteration policy CSV file name.
        adjustForGoalState: when truthy, drop the last (goal-state) row before
            comparing.
        outputFile: when truthy, also writes `<mdp_folder>/policy_diff.csv`.

    Returns:
        dict mapping gamma -> number of states whose policies differ.
    """
    diffs = {}
    for gamma in gammas:
        v_file = mdp_folder + "/" + str(gamma) + "/" + vi_file
        p_file = mdp_folder + "/" + str(gamma) + "/" + pi_file
        # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
        # (the sibling ComputeDiffInOptimalValueForMdp already uses .values).
        vi = pd.read_csv(v_file, header=None).sort_values([0, 1])[[3, 4, 5, 6]].values
        pi = pd.read_csv(p_file, header=None).sort_values([0, 1])[[3, 4, 5, 6]].values
        if adjustForGoalState:
            vi = vi[0:-1, :]
            pi = pi[0:-1, :]
        # per-state max absolute difference over the four action columns;
        # non-zero means the two policies disagree on that state
        diff = np.max(np.abs(vi - pi), axis=1)
        diffs[gamma] = np.count_nonzero(diff)
    if outputFile:
        lines = ["{0},{1}".format(key, diffs[key]) for key in gammas]
        u.WriteTextArrayToFile(mdp_folder + "/policy_diff.csv", lines)
    return diffs
def ComputeFinalResults(rootfolder):
    """Summarize the best (lowest-BIC) clustering runs at p=2 into best_metrics.csv.

    Reads `<rootfolder>/clustering.csv`, keeps only rows with p == 2, and for
    each (dim-reduction method, clustering algorithm) pair picks the k with
    minimal BIC. The chosen run's saved cluster assignments (from
    clustering_output/) are scored with AMI against the best raw-data
    clustering of the same algorithm.

    Output columns: clustering,dim_red,k,p,ami_raw,ami_true,sc,bic
    """
    clustering_stats = pd.read_csv(rootfolder + "/clustering.csv")
    final_output_file = rootfolder + "/best_metrics.csv"
    data = u.FilterRows(clustering_stats, lambda x: x['p'] == 2)
    dim_red = ["ica", "pca", "rp", "mi"]
    clustering = ["kmeans", "gmm"]
    lines = ["clustering,dim_red,k,p,ami_raw,ami_true,sc,bic"]
    raw_predictions = {}
    # Baseline: best raw-data clustering per algorithm (minimum BIC over k).
    for c in clustering:
        d = data.loc[(data['dim_red_method'] == "raw") & (data['clustering'] == c), :]
        d = d.loc[d['bic'] == np.min(d['bic']), :]
        clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo=raw.csv".format(c, d.iloc[0]['k'])
        raw_predictions[c] = np.loadtxt(clusters_file, delimiter=',')
    for dr in dim_red:
        for c in clustering:
            d = data.loc[(data['dim_red_method'] == dr) & (data['clustering'] == c), :]
            d = d.loc[d['bic'] == np.min(d['bic']), :]
            clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo={2}.csv".format(c, d.iloc[0]['k'], dr)
            predicted = np.loadtxt(clusters_file, delimiter=',')
            ami = metrics.adjusted_mutual_info_score(raw_predictions[c], predicted)
            # bug fix: the row previously omitted the 'p' value, leaving each
            # data row one field short of the 8-column header above.
            lines.append(u.ConcatToStr(",", [c, dr, d.iloc[0]['k'], d.iloc[0]['p'], ami, d.iloc[0]['ami_true'], d.iloc[0]['sc'], d.iloc[0]['bic']]))
    u.WriteTextArrayToFile(final_output_file, lines)
def ComputeDiffInOptimalValueForMdp(mdp_folder, gammas, vi_file, pi_file, adjustForGoalState, outputFile):
    """Compare the state values produced by value iteration and policy iteration.

    For every gamma in `gammas`, reads `<mdp_folder>/<gamma>/<vi_file>` and
    `<mdp_folder>/<gamma>/<pi_file>` (CSV, no header; columns 0 and 1 identify
    the state, column 2 is the state value), sorts both by state, and compares
    the values element-wise. Absolute differences below 0.01 are treated as
    zero (numerical-noise tolerance).

    Per-gamma result:
        1    VI values dominate somewhere and never lose,
        -1   PI values dominate,
        0    values agree everywhere (within tolerance),
        nan  mixed signs — neither method dominates.

    Args:
        mdp_folder: root folder containing one sub-folder per gamma.
        gammas: iterable of discount factors (used as sub-folder names).
        vi_file: value-iteration values CSV file name.
        pi_file: policy-iteration values CSV file name.
        adjustForGoalState: unused; kept for signature parity with
            ComputeDiffInOptimalPolicyForMdp.
        outputFile: when truthy, also writes `<mdp_folder>/value_diff.csv`.

    Returns:
        dict mapping gamma -> one of {1, -1, 0, nan}.
    """
    diffs = {}
    for gamma in gammas:
        v_file = mdp_folder + "/" + str(gamma) + "/" + vi_file
        p_file = mdp_folder + "/" + str(gamma) + "/" + pi_file
        vi = pd.read_csv(v_file, header=None).sort_values([0, 1])[2].values
        pi = pd.read_csv(p_file, header=None).sort_values([0, 1])[2].values
        # (removed dead code: the sums vi.sum()/pi.sum() and the scalar diff
        # were computed and immediately discarded in the original)
        diff = vi - pi
        diff[np.abs(diff) < 0.01] = 0  # squash tiny numerical differences
        has_pos = np.any(diff > 0)
        has_neg = np.any(diff < 0)
        if has_pos and has_neg:
            # np.nan: np.NaN alias was removed in NumPy 2.0
            diffs[gamma] = np.nan
        elif has_pos:
            diffs[gamma] = 1
        elif has_neg:
            diffs[gamma] = -1
        else:
            diffs[gamma] = 0
    if outputFile:
        lines = ["{0},{1}".format(key, diffs[key]) for key in gammas]
        u.WriteTextArrayToFile(mdp_folder + "/value_diff.csv", lines)
    return diffs
def ComputePerformanceOnRealTestSet(model_info, root, outputfile, weka_jar, pos_class, compute_accuracy=False):
    """Evaluate the saved best model of every algorithm on the held-out real test set.

    Weka-based models (ada, dt) are evaluated via ComputeWekaSavedModelPerformance
    against the *.realtest.arff file under `root`; sklearn-based models (svm,
    nnets, knn) via ComputeSklearnSavedModelPerformance against the preprocessed
    data/label files in their train-size sub-folder.

    Args:
        model_info: per-model lookup. For 'ada'/'dt' it is the model file name;
            for the sklearn models a pair (model file name, train-size token
            used in the i-0_t-80_ts-{token} folder name).
        root: experiment root folder holding models and test files.
        outputfile: CSV name (under `root`) for the metric row; a sibling
            *.time.csv receives the evaluation times.
        weka_jar: path to weka.jar for the Weka-based models.
        pos_class: positive class label passed to the metric computation.
        compute_accuracy: forwarded to the per-model evaluation helpers.
    """
    models = ['ada', 'dt', 'svm', 'nnets', 'knn']
    f = []  # metric per model, in `models` order
    t = []  # evaluation time per model, in `models` order
    for model in models:
        if model in ('ada', 'dt'):  # membership test replaces `|` on booleans
            testfile = glob.glob("{0}/*.realtest.arff".format(root))[0]
            modelfile = "{0}/{1}".format(root, model_info[model])
            if model == 'ada':
                _outputfile = "{0}/realtest.prediction.ada.csv".format(root)
                _f, _t = ComputeWekaSavedModelPerformance(
                    testfile, modelfile, weka_jar, _outputfile,
                    ada.GetWekaCommandLineForConfig, pos_class, compute_accuracy)
            else:
                _outputfile = "{0}/realtest.prediction.dt.csv".format(root)
                _f, _t = ComputeWekaSavedModelPerformance(
                    testfile, modelfile, weka_jar, _outputfile,
                    dt.GetWekaCommandLineForConfig, pos_class, compute_accuracy)
        else:
            datafolder = "{0}/i-0_t-80_ts-{1}".format(root, model_info[model][1])
            testfile = glob.glob(
                "{0}/*.realtest.preprocessed.data*".format(datafolder))[0]
            labelfile = glob.glob(
                "{0}/*.realtest.preprocessed.label*".format(datafolder))[0]
            _f, _t = ComputeSklearnSavedModelPerformance(
                testfile, labelfile, root + '/' + model_info[model][0],
                pos_class, compute_accuracy)
        f.append(_f)
        t.append(_t)
    # renamed from `file` to avoid shadowing the builtin
    output_path = root + '/' + outputfile
    lines = [u.ConcatToStr(",", models), u.ConcatToStr(",", f)]
    u.WriteTextArrayToFile(output_path, lines)
    lines = [u.ConcatToStr(",", models), u.ConcatToStr(",", t)]
    u.WriteTextArrayToFile(output_path.replace(".csv", ".time.csv"), lines)
def write_to_file(data_to_write, filepath):
    """Write a DataFrame to `filepath`, optionally prefixed with ARFF preamble lines.

    NOTE(review): this reads `include_header` and `arff_format_predata_lines`
    as free variables — it appears to be a nested helper whose enclosing scope
    is not visible in this chunk; confirm before moving or reusing it.
    """
    file = u.PreparePath(filepath)
    # The CSV header row is emitted only when requested AND no ARFF preamble is
    # in play (an ARFF data section must not contain a header row).
    data_to_write.to_csv(file, index=False, header=(include_header & (arff_format_predata_lines is None)))
    if (arff_format_predata_lines is not None):
        # Prepend the ARFF attribute/preamble lines by re-reading the CSV body
        # just written and rewriting the file with the preamble first.
        data = []
        data.extend(arff_format_predata_lines)
        data.extend(u.ReadLinesFromFile(file))
        u.WriteTextArrayToFile(file, data)
def ComputeDiffInVI_PI_Q(rootFolder):
    """Compare the PI policy against the best Q-learning policy for both MDPs.

    Writes one "<name>,<diff>" row per MDP to <rootFolder>/pi_q_comparison.csv,
    where <diff> is whatever ComparePolicies returns for the two policy files.
    """
    comparisons = [
        ("LargeMdp",
         rootFolder + "/LargeMdpRwTraps50/0.99/pi_10000.policy.csv",
         rootFolder + "/LargeMdpRwTraps50/0.99/ql_10000_alpha=1.0_po=boltzmann_p=100.0.policy.csv"),
        ("SmallMdp",
         rootFolder + "/SmallMdpRwTraps/0.99/pi_10000.policy.csv",
         rootFolder + "/SmallMdpRwTraps/0.99/ql_1000_alpha=1.0_po=greedyepsilon_p=0.1.policy.csv"),
    ]
    report_rows = [
        name + "," + str(ComparePolicies(pi_path, q_path))
        for name, pi_path, q_path in comparisons
    ]
    u.WriteTextArrayToFile(rootFolder + "/pi_q_comparison.csv", report_rows)
def ComputeDiffInVI_PI_Q_1(rootFolder, piRootFolder=r"C:/Users/shkhandu/OneDrive/Gatech/Courses/ML/Assignment4/OutputNew1"):
    """Compare PI policies from a separate output folder against Q-learning policies.

    Writes one "<name>,<diff>" row per MDP to <rootFolder>/pi_q_comparison.csv.

    Args:
        rootFolder: folder holding the Q-learning outputs; the comparison CSV
            is written here.
        piRootFolder: folder holding the PI policies. Generalized from the
            previously hard-coded machine-specific path, which remains the
            default so existing callers are unaffected; pass an explicit value
            on other machines.
    """
    output = rootFolder + "/pi_q_comparison.csv"
    lines = []
    pi_file = piRootFolder + "/LargeMdpRwTraps50/0.99/pi_10000.policy.csv"
    q_file = rootFolder + "/LargeMdpRwTraps50/0.99/ql_10000_alpha=1.0_po=boltzmann_p=100.0.policy.csv"
    lines.append("LargeMdp," + str(ComparePolicies(pi_file, q_file)))
    pi_file = piRootFolder + "/SmallMdpRwTraps/0.99/pi_10000.policy.csv"
    q_file = rootFolder + "/SmallMdpRwTraps/0.99/ql_5000_alpha=0.1_po=greedyepsilon_p=0.05.policy.csv"
    lines.append("SmallMdp," + str(ComparePolicies(pi_file, q_file)))
    u.WriteTextArrayToFile(output, lines)
def RunAdaBoostWithDecisionTreesToGeneratePerIterationMetrics(datasets_root_folder,weka_jar_path,dataset_filter,iters,inst,use_arff_files=True):
    """Run Weka AdaBoost (decision-tree base learner) over a grid of configs.

    For every dataset sub-folder matching `dataset_filter`, builds a
    ParameterGrid over prune x iters, trains each config via the Weka command
    line, evaluates it on the shared test file, and writes the merged
    params/timings to <run>/<id>.params.txt.

    #weightThreshold parameter : http://weka.8497.n7.nabble.com/AdaBoost-Parameters-td11830.html
    """
    file_extn = "arff" if use_arff_files else ".csv"
    # test file is shared across datasets; only the first match is used below
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,file_extn))
    first = True  # NOTE(review): never read afterwards — appears to be dead
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if(dataset_filter not in dataset_dir):
            continue
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir+"/ada",is_file=False)
        config_gen = ParameterGrid({'prune':[True,False],'iter':iters})
        for config in config_gen:
            id = GetIdForConfig(config)
            config["inst"] = inst
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict=sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root,id),is_file=False)
            params_output_file=u.PreparePath("{0}/{1}.params.txt".format(run_output_dir,id))
            model_output_file=u.PreparePath("{0}/{1}.model".format(run_output_dir,id))
            train_output_file=u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir,id))
            # NOTE(review): full_train_output_file is prepared but never used
            full_train_output_file=u.PreparePath("{0}/{1}.fulltrain.predictions.csv".format(run_output_dir,id))
            test_output_file=u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir,id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"]="last"
            config["trainpredictionoutputfile"]=train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config,False,False)
            # timeit with number=1 measures a single train invocation
            config["modelbuildtimesecs"] = timeit.timeit(lambda: sl.RunCmdWithoutConsoleWindow(cmd),number=1)
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            # NOTE(review): called with two args here, while the sibling
            # RunDecisionTreesWithOptimalInst passes (config, True, False), and
            # predictionoutputfile still points at the train predictions during
            # this test evaluation — confirm GetWekaCommandLineForConfig's
            # defaults make both intentional.
            cmd = GetWekaCommandLineForConfig(config,True)
            config["modelevaltimesecs"] = timeit.timeit(lambda : sl.RunCmdWithoutConsoleWindow(cmd),number=1)
            # the serialized model is only needed for evaluation; delete to save space
            os.remove(model_output_file)
            config.pop('random_state',None) # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k,config[k]))
            u.WriteTextArrayToFile(params_output_file,params_info)
        print("done dataset : " + dataset_dir)
def CreateArffFileFromCsv(arff_attr_info, arff_file_path, data_text_array, isFile=False, hasHeader=True):
    """Create an ARFF file by prepending attribute metadata to CSV data rows.

    Args:
        arff_attr_info: iterable of ARFF header/attribute lines to emit first.
        arff_file_path: destination path for the ARFF file.
        data_text_array: either a list of CSV row strings, or (when `isFile`
            is true) the path of a CSV file to read the rows from.
        isFile: treat `data_text_array` as a file path.
        hasHeader: when reading from a file, drop the first (header) row.
    """
    arff_data = []
    arff_data.extend(arff_attr_info)
    if isFile:  # `and` semantics replace the original bitwise `&` on booleans
        data_text_array = u.ReadLinesFromFile(data_text_array)
        if hasHeader:
            data_text_array = data_text_array[1:]  # skip the CSV header row
    arff_data.extend(data_text_array)
    # renamed from `file` to avoid shadowing the builtin
    out_path = u.PreparePath(arff_file_path)
    u.WriteTextArrayToFile(out_path, arff_data)
def EvaluateExperiments(datasets_root_folder, params_to_keep, positive_class, metric_calculation_fn, evaluation_output_filename="performance.csv", algo_folder="dt", should_eval=lambda x: True):
    """Aggregate per-run metrics for one algorithm into a single CSV.

    Walks <datasets_root_folder>/<dataset>/<algo_folder>/<run>/, reads each
    run's *.params.txt, computes metrics on the train predictions (istrain=1)
    and, when the file exists, the test predictions (istrain=0), and writes
    `evaluation_output_filename` under the root folder.

    Args:
        datasets_root_folder: root containing one sub-folder per dataset.
        params_to_keep: parameter names copied from each run's params file into
            the output row (missing values become NaN).
        positive_class: positive label forwarded to `metric_calculation_fn`.
        metric_calculation_fn: fn(predictions_file, positive_class) -> (p, r, m).
        evaluation_output_filename: output CSV name under the root folder.
        algo_folder: algorithm sub-folder to scan inside each dataset folder.
        should_eval: predicate on a run folder; failing runs are skipped.
    """
    headers = []
    headers.extend(params_to_keep)
    headers.extend(['istrain', 'p', 'r', 'm'])
    evals = [",".join(headers)]
    for directory in u.Get_Subdirectories(datasets_root_folder):
        # each directory is a dataset directory
        dt_output_dir = "{0}/{1}".format(directory, algo_folder)
        if not os.path.isdir(dt_output_dir):
            continue
        for run_output_folder in u.Get_Subdirectories(dt_output_dir):
            if not should_eval(run_output_folder):
                print("ignoring : {0}".format(run_output_folder))
                continue
            # read params file
            params_file_path = glob.glob(
                "{0}/*.params.txt".format(run_output_folder))[0]
            params = sl.GetDictionary(u.ReadLinesFromFile(params_file_path))
            values = []
            for k in params_to_keep:
                # np.nan: np.NaN alias was removed in NumPy 2.0
                values.append(str(params[k]) if k in params else str(np.nan))
            p, r, f = metric_calculation_fn(
                params["trainpredictionoutputfile"], positive_class)
            # (removed dead code: the row string was first assigned to the bare
            # ",".join(values) and immediately overwritten)
            evals.append("{0},1,{1},{2},{3}".format(
                ",".join(values), str(p), str(r), str(f)))
            if os.path.isfile(params["testpredictionoutputfile"]):
                p, r, f = metric_calculation_fn(
                    params["testpredictionoutputfile"], positive_class)
                evals.append("{0},0,{1},{2},{3}".format(
                    ",".join(values), str(p), str(r), str(f)))
    u.WriteTextArrayToFile(
        u.PreparePath("{0}/{1}".format(datasets_root_folder, evaluation_output_filename)), evals)
def ComputeFinalResults1(rootfolder,clusters,dims):
    """For each dim-reduction method, pick the dimensionality whose best kmeans/gmm
    runs maximize mean AMI against the true labels, and write best_metrics.csv.

    Also persists the best raw clustering rows to best_raw_clustering.csv.
    Output columns: clustering,dim_red,k,p,ami_raw,ami_true,sc,bic
    """
    raw_results = GetBestRawClustering(rootfolder)
    clustering_results = pd.read_csv(rootfolder+"/clustering.csv")
    final_output_file = rootfolder+"/best_metrics.csv"
    best_raw_clustering_output = rootfolder+"/best_raw_clustering.csv"
    # raw_results[method][1] is assumed to be a DataFrame row — TODO confirm
    # against GetBestRawClustering.
    o = pd.concat([raw_results["kmeans"][1], raw_results["gmm"][1]])
    o.to_csv(best_raw_clustering_output)
    dim_red = ["mi","pca","ica","rp"]
    lines = []
    lines.append("clustering,dim_red,k,p,ami_raw,ami_true,sc,bic")
    raw_predictions = {}  # NOTE(review): never populated or read — dead
    output = None  # NOTE(review): only referenced by the stray concat below
    for dr in dim_red:
        data = u.FilterRows(clustering_results,lambda x : x["dim_red_method"] == dr)
        dim_best_val = None
        dim_result = None
        for dim in dims:
            best = {} # {p,k,ami}
            # per (clustering method), keep the k with the lowest BIC at this dim
            for cluster_mthd in ["kmeans","gmm"]:
                for cluster in clusters:
                    print("{0},{1},{2},{3}".format(dr,str(dim),cluster_mthd,str(cluster)))
                    d = data.loc[(data['clustering'] == cluster_mthd)&(data['k'] == cluster) & (data['p'] == dim)]
                    row = d.head(1).copy()
                    if(cluster_mthd not in best or best[cluster_mthd]['bic'].iloc[0] > row['bic'].iloc[0]):
                        best[cluster_mthd] = row
            # score a dimensionality by the mean true-label AMI of its two best runs
            curr_val = (best["kmeans"]['ami_true'].iloc[0] + best["gmm"]['ami_true'].iloc[0]) / 2
            #curr_val = (best["kmeans"]['ami_true'].iloc[0] + best["gmm"]['ami_true'].iloc[0]) / 2
            #curr_val = np.minimum(best["kmeans"]['ami_true'].iloc[0], best["gmm"]['ami_true'].iloc[0])
            if(dim_best_val is None or dim_best_val < curr_val):
                dim_best_val = curr_val
                dim_result = best.copy()
        for c in ["kmeans","gmm"]:
            ami_raw = GetAmiWithRawPredictions(rootfolder,raw_results,dr,dim_result[c].iloc[0]["p"],dim_result[c].iloc[0]["k"],c)
            lines.append("{0},{1},{2},{3},{4},{5},{6},{7}".format(c,str(dim_result[c].iloc[0]["dim_red_method"]),str(dim_result[c].iloc[0]["k"]),str(dim_result[c].iloc[0]["p"]),str(ami_raw[c]),str(dim_result[c].iloc[0]["ami_true"]),str(dim_result[c].iloc[0]["sc"]),str(dim_result[c].iloc[0]["bic"])))
        #if(output is None):
        #    output = pd.concat([dim_result["kmeans"],dim_result["gmm"]])
        #else:
        #    output =
        # NOTE(review): the statement below looks like a stray continuation of
        # the commented-out block above — its result is discarded, and `output`
        # is always None here; confirm whether it should be removed or the
        # block above re-enabled.
        pd.concat([output,dim_result["kmeans"],dim_result["gmm"]])
    u.WriteTextArrayToFile(final_output_file,lines)
def RunDecisionTreesWithOptimalInst(datasets_root_folder, weka_jar_path, cv_results_file, use_arff_files=True):
    """Re-train Weka decision trees using the 'inst' value that scored best in CV.

    For each dataset, looks up the cross-validation results row (unpruned,
    matching this dataset's filter attribute, istrain=1) with the highest
    metric 'm', takes its 'inst' value, then trains and evaluates a tree for
    each prune setting, writing predictions and timing/params per run.
    """
    file_extn = "arff" if use_arff_files else ".csv"
    # shared test file; only the first match is used
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        # (filter_name, filter_val) identify this dataset's variant (e.g. noise
        # or imbalance level) in the CV results — TODO confirm GetFilterOptions.
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:
            # `&` (not `and`) because FilterRows may apply this to pandas
            # Series element-wise — confirm against u.FilterRows. Note this
            # local `filter` shadows the builtin.
            filter = lambda x: (x['prune'] == False) & (x[
                filter_name] == filter_val) & (x[
                    'istrain'] == 1)  # this will output on the held out set
            filtered_rows = u.FilterRows(cv_results, filter)
            a = filtered_rows['m']
            if (len(a) == 0):
                print("ignoring : {0}".format(dataset_dir))
                continue
            b = np.max(filtered_rows['m'])
            # isclose guards against float noise when selecting the max-metric rows
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs]
            # keep only the 'inst' value of the first best row
            best_insts = best_insts.iloc[0]['inst']
            config['inst'] = best_insts
            id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            # number=1: time a single training invocation
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            # now for test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def GenerateDatasetSplits(rootFolder, dataset_folder_prefix, dataset, test_ratio, train_ratio, validation_ratio, train_size_percentages, class_col, random_state, arff_attr_info=None):
    """Partition a dataset into test/validation/train files plus sized train subsets.

    train_size_percentages is a list of integers specifying the percent of the
    train set to be taken while preparing each training subset.
    test_ratio, train_ratio, validation_ratio: numbers in percentages.

    Layout written under <rootFolder>/i-<prefix>_t-<train>_T-<test>/:
    the test (or validation) CSV/ARFF files, and one folder per train-size
    percentage containing the truncated train CSV/ARFF and a params.txt.
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    if (validation is not None):
        # When a validation split exists it is written as the ".test" file
        # (used for tuning) and the true test split becomes ".realtest" —
        # presumably intentional; confirm against callers.
        validation_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        validation.to_csv(validation_output_file_csv, index=False)
        test_output_file_csv = u.PreparePath("{0}/i-{1}.realtest.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
        test_output_file_arff = u.PreparePath("{0}/i-{1}.realtest.arff".format(
            dataset_root, dataset_folder_prefix))
        # NOTE(review): unlike the else-branch below, this call is not guarded
        # by `arff_attr_info is not None` — passing None would fail inside
        # CreateArffFileFromCsv; confirm callers always supply attr info here.
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    else:
        test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
        if (arff_attr_info is not None):
            test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
                dataset_root, dataset_folder_prefix))
            CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                                  test_output_file_csv, True, True)
    # now creating the train set partitions
    for train_set_size in train_size_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_ts-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, train_set_size))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.train.csv".format(folder_path,
                                                      dataset_folder_prefix,
                                                      train_ratio,
                                                      train_set_size))
        # take the leading train_set_size% of the (already shuffled) train split
        rows_to_keep = int(len(train) * train_set_size / 100)
        train.head(rows_to_keep).to_csv(csv_output_file, index=False)
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_ts-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    train_set_size))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)
        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "train_split_percent_used={0}".format(train_set_size)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.params.txt".format(folder_path,
                                                       dataset_folder_prefix,
                                                       train_ratio,
                                                       train_set_size))
        u.WriteTextArrayToFile(params_out_file, params_info)
def RunExperiments(X,Y,rootfolder,clusters,dims,compute_acc=None):
    """End-to-end driver: dimensionality reduction (PCA/ICA/RP/MI), clustering
    (kmeans/gmm) over every (algorithm, k, dim) combination, plots, CSV
    summaries, and (optionally) neural-network accuracy on the derived data.

    Args:
        X, Y: feature matrix and labels.
        rootfolder: output root for plots, clustering CSVs and nn.csv.
        clusters: list of cluster counts k to try.
        dims: list of reduced dimensionalities p to try.
        compute_acc: when not None, also trains neural networks on the raw and
            best derived datasets (forwarded to RunNeuralNetwork).
    """
    datasets = {}
    datasets["raw"] = (X,Y)
    err_series = []  # NOTE(review): never used — appears dead
    # (marker, color, legend label) per dim-reduction method for plotting
    decorations = {}
    decorations["pca"] = ("o","r","pca")
    decorations["ica"] = ("x","b","ica")
    decorations["rp"] = ("+","g","rp")
    decorations["mi"] = ("o","k","mi")
    # enable/disable the PCA / ICA / RP / MI sections respectively
    flags = [True,True,True,True]
    nn_output_lines = []
    nn_output_file = rootfolder + "/nn.csv"
    if(compute_acc is not None):
        h,l = CreateOutputLineForNN(RunNeuralNetwork(X,Y,10,compute_acc,False),"raw")
        nn_output_lines.append(h)
        nn_output_lines.append(l)
    best_bic = None  # NOTE(review): never used — appears dead
    ################### PCA #####################
    if(flags[0]):
        pca_results = PerformPca(X,Y,dims,0)
        pca_var_explained_plot = u.PreparePath(rootfolder + "/plots/pca/var.png")
        # shared across the PCA/ICA/RP sections below; NOTE(review): defined
        # only when flags[0] is enabled — later sections would NameError if
        # flags[0] were turned off. Confirm flags stay all-True.
        recons_err_plot = u.PreparePath(rootfolder + "/plots/err.png")
        recons_err_dict = []
        var_y = []
        err_y = []
        for dim in dims:
            key = "pca_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(pca_results["{0}data".format(key)]),Y)
            err_y.append(pca_results[key+"reconstruction_error"])
            # overwritten every iteration: only the last dim's ratios are plotted
            var_y = pca_results[key+"explained_var_ratio"]
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"pca")
            #    #nn_output_lines.append(h)
            #    nn_output_lines.append(l)
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="dimensions",y1_axis_name="% explained variance",filename=pca_var_explained_plot)
    ################### ICA #####################
    if(flags[1]):
        ica_kt_plot = u.PreparePath(rootfolder + "/plots/ica/kt.png")
        err_y = []
        ica_results = PerformIca(X,Y,dims,0)
        for dim in dims:
            key = "ica_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(ica_results[key+"data"]),Y)
            err_y.append(ica_results[key+"reconstruction_error"])
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"ica")
            #    nn_output_lines.append(l)
        # kurtosis per component, used as the ICA quality curve
        var_y = ica_results["ica_kt_all"]
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="components",y1_axis_name="kurtosis",filename=ica_kt_plot)
    ################### RP #####################
    if(flags[2]):
        rp_runs_plot = u.PreparePath(rootfolder + "/plots/rp/runs.png")
        err_y = []
        runs = 10  # random projections are repeated; per-run errors are plotted
        rp_results = PerformRandomProjections(X,Y,dims,runs)
        runs_series = []
        markers = u.GetColorCombinations(10)
        i=0
        for dim in dims:
            key = "rp_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(rp_results[key+"data"]),Y)
            err_y.append(rp_results[key+"reconstruction_error"])
            runs_ser = u.YSeries(rp_results[key+"reconstruction_errors_all"],xvalues=np.arange(runs)+1,points_marker = "o",line_color = markers[i]["color"],plot_legend_label="proj dims = "+str(dim))
            runs_series.append(runs_ser)
            i = i + 1
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"rp")
            #    nn_output_lines.append(l)
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["rp"][0],line_color=decorations["rp"][1],plot_legend_label=decorations["rp"][2])
        recons_err_dict.append(ser)
        u.SaveDataPlotWithLegends(runs_series,x_axis_name="run number",y1_axis_name="reconstruction err",filename=rp_runs_plot)
        # combined reconstruction-error curves for pca/ica/rp
        u.SaveDataPlotWithLegends(recons_err_dict,x_axis_name="dimensions",y1_axis_name="reconstruction_error",filename=recons_err_plot)
    ###################### MI Feature Selection #########################
    if(flags[3]):
        mi_results = PerformMiBasedFeatureSelection(X,Y,dims,10)
        mi_plot = u.PreparePath(rootfolder + "/plots/mi/scores.png")
        for dim in dims:
            key = "mi_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(mi_results[key+"data"]),Y)
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"mi")
            #    nn_output_lines.append(l)
        ser = u.YSeries(mi_results["scores"],xvalues = np.arange(len(mi_results["scores"])) + 1,points_marker=decorations["mi"][0],line_color=decorations["mi"][1],plot_legend_label=decorations["mi"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="feature number",y1_axis_name="mutual information", filename=mi_plot)
    ###################### CLUSTERING #########################
    clustering_output_file = rootfolder + "/clustering.csv"
    clustering_plots_output_root = u.PreparePath(rootfolder + "/plots")
    lines = []
    lines.append("clustering,dim_red_method,k,p,ami_raw,ami_true,sc,bic")
    # cache of clustering results on the raw data, keyed "<k>_<mthd>"; "raw"
    # runs first in the algo list so the cache is filled before it is read
    # by the ami_raw computation for the reduced datasets.
    raw_clustering_results = {}
    best_bic_raw_clustering = {}  # mthd_key -> (data, labels, ami, bic)
    curr_best_bic = {}  # mthd_key -> lowest bic seen so far
    actual_labels = Y
    for dim in dims:
        for algo in ["raw","ica","rp","mi","pca"]:
            raw_data_plot_done = False
            key = "{0}_{1}_".format(algo,str(dim))
            if(algo == "raw"):
                key = "raw"
            dataset = datasets[key]
            for cluster in clusters:
                for mthd in ["kmeans","gmm"]:
                    raw_key = "{0}_{1}".format(str(cluster),mthd)
                    print("doing clustering for dim = {0} {1} k = {2} {3}".format(str(dim),algo,str(cluster), mthd))
                    c_key = "{0}_{1}_predicted".format(mthd,str(cluster))
                    c_key1 = "{0}_{1}_".format(mthd,str(cluster))
                    if(algo == "raw" and raw_key in raw_clustering_results):
                        # raw clustering is dim-independent: reuse the cached result
                        results = raw_clustering_results[raw_key]
                    else:
                        #if(algo == "raw" and cluster == 2 and compute_acc):
                        #    results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        #    h,l = CreateOutputLineForNN(RunNeuralNetwork(results[c_key.replace("predicted","new_data")],dataset[1],10,compute_acc),mthd)
                        #    nn_output_lines.append(l)
                        #else:
                        results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        if(algo == "raw"):
                            raw_clustering_results[raw_key] = results
                    if(compute_acc):
                        # raw is tracked once per method; reduced data per (method,k,dim)
                        mthd_key = mthd+algo if algo == "raw" else mthd+algo+str(cluster)+str(dim)
                        if((mthd_key not in curr_best_bic) or (curr_best_bic[mthd_key] > results[c_key1+"bic"])):
                            curr_best_bic[mthd_key] = results[c_key1+"bic"]
                            best_bic_raw_clustering[mthd_key] = (results[c_key1+"new_data"],dataset[1],results[c_key1+"metrics"]["ami"],results[c_key1+"bic"])
                            print("new best {0} {1}".format(c_key1,str(results[c_key1+"bic"])))
                    clustering_prediction_file = u.PreparePath(rootfolder + "/clustering_output/mthd={0}_k={1}_d={2}_algo={3}.csv".format(mthd,str(cluster),str(dim),algo))
                    np.savetxt(clustering_prediction_file,results[c_key])
                    bic = c_key.replace("predicted","bic")
                    bic = results[bic]
                    # ami vs true labels, and ami vs the raw-data clustering
                    act = ComputeClusteringMetrics(actual_labels,results[c_key],dataset[0])
                    raw = ComputeClusteringMetrics(raw_clustering_results[raw_key][c_key],results[c_key],dataset[0])
                    line = "{0},{1},{2},{3},{4},{5},{6},{7}".format(mthd,algo,str(cluster),str(dim),str(raw["ami"]),str(act["ami"]),str(raw["sl"]),str(bic))
                    print(line)
                    plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_{3}.png".format(mthd,str(cluster),algo,str(dim))
                    #if(mthd == "gmm"):
                    #    prob_output_file = rootfolder + "/{0}_{1}_{2}_{3}.csv".format(mthd,str(cluster),algo,str(dim))
                    #    np.savetxt(prob_output_file,results[c_key.replace("predicted","prob")],delimiter=",")
                    ScatterPlotForClustering(results[c_key],actual_labels,plot_output_file)
                    # 2-D reduced datasets also get scatter plots of the data itself
                    if(dim == 2 and algo != "raw"):
                        if(raw_data_plot_done == False):
                            plot_output_file = clustering_plots_output_root + "/{0}_{1}_data.png".format(mthd,algo)
                            ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],np.zeros_like(actual_labels),actual_labels,plot_output_file)
                            raw_data_plot_done = True
                        plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_data.png".format(mthd,str(cluster),algo)
                        ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],results[c_key],actual_labels,plot_output_file)
                    lines.append(line)
    #if(compute_acc):
    #    keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","gmmpca":"pca","gmmica":"ica","gmmrp":"rp","gmmmi":"mi"}
    #    for key in keys_to_output.keys():
    #        if("raw" not in key):
    #            curr_best = None
    #            for cluster in clusters:
    #                datakey = key+str(cluster)
    #                if(curr_best is None or best_bic_raw_clustering[datakey][2] > curr_best):
    #                    curr_best = best_bic_raw_clustering[datakey][2]
    #                    _X = best_bic_raw_clustering[datakey][0]
    #                    _Y = best_bic_raw_clustering[datakey][1]
    #        else:
    #            _X = best_bic_raw_clustering[key][0]
    #            _Y = best_bic_raw_clustering[key][1]
    #        h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
    #        nn_output_lines.append(l)
    #    u.WriteTextArrayToFile(nn_output_file,nn_output_lines)
    if(compute_acc):
        # For each dim-red method pick the (dim, k) whose best kmeans/gmm runs
        # maximize mean AMI, then train a neural net on that derived data.
        keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","pca":"pca","ica":"ica","rp":"rp","mi":"mi"}
        for key in keys_to_output.keys():
            if("raw" not in key):
                dim_best_val = None
                dim_result = None
                for dim in dims:
                    best = {} # {x,y,p,k,bic,ami}
                    for cluster_mthd in ["kmeans","gmm"]:
                        for cluster in clusters:
                            datakey = cluster_mthd+key+str(cluster)+str(dim)
                            # NOTE(review): best_bic_raw_clustering[...][2] is
                            # the AMI while best[cluster_mthd][4] is the BIC —
                            # comparing AMI against BIC looks like an index
                            # mix-up (index 5 holds the AMI); confirm intent.
                            if(cluster_mthd not in best or best_bic_raw_clustering[datakey][2] > best[cluster_mthd][4]):
                                best[cluster_mthd] = (best_bic_raw_clustering[datakey][0],best_bic_raw_clustering[datakey][1],dim,cluster,best_bic_raw_clustering[datakey][3],best_bic_raw_clustering[datakey][2])
                    curr_val = (best["kmeans"][5] + best["gmm"][5]) / 2
                    if(dim_best_val is None or dim_best_val < curr_val):
                        dim_best_val = curr_val
                        dim_result = best
                _X = dim_result["gmm"][0]
                _Y = dim_result["gmm"][1]
            else:
                _X = best_bic_raw_clustering[key][0]
                _Y = best_bic_raw_clustering[key][1]
            h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
            nn_output_lines.append(l)
        u.WriteTextArrayToFile(nn_output_file,nn_output_lines)
    u.WriteTextArrayToFile(clustering_output_file,lines)
def GenerateDatasetSplitsForWithNoise(rootFolder, dataset_folder_prefix, dataset, test_ratio, train_ratio, validation_ratio, noise_percentages, class_col, flip_fn, random_state, arff_attr_info=None):
    """Partition a dataset and emit one noisy train-set variant per noise level.

    noise_percentages is a list of integers (percent of train labels to flip
    via `flip_fn`). test_ratio, train_ratio, validation_ratio: numbers in
    percentages.

    Layout written under <rootFolder>/i-<prefix>_t-<train>_T-<test>/:
    the shared test CSV/ARFF, plus one folder per noise level containing the
    noisy train CSV/ARFF and a params.txt.
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    # NOTE(review): `validation` is never used here — presumably noise
    # experiments tune on the test split; confirm.
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    # now creating the train set partitions
    for noise_perc in noise_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_noise-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, noise_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.train.csv".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))
        noisy_dataset = CreateNoisyDataset(train, class_col, noise_perc / 100,
                                           random_state, flip_fn)
        noisy_dataset.to_csv(csv_output_file, index=False)
        print("done noisy : " + str(noise_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_noise-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    noise_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)
        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "noise_perc={0}".format(noise_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.params.txt".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
def GenerateDatasetSplitsForClassImbalance(rootFolder, dataset_folder_prefix, dataset, test_ratio, train_ratio, validation_ratio, imbalance_percentages, class_col, minority_label, min_minority_to_keep, random_state, arff_attr_info=None, train_set=None, test_set=None):
    """Write one train/test split of `dataset`, plus one class-imbalanced train-set
    variant per imbalance level.

    test_ratio, train_ratio, validation_ratio : numbers in percentages (e.g. 80 for 80%).
    imbalance_percentages : list of integer percentages; for each value a train set
        with the minority class down-sampled to that fraction (floored at
        min_minority_to_keep rows) is written to its own sub-folder.
    minority_label : label value of the minority class in `class_col`.
    arff_attr_info : when not None, an ARFF version of each CSV is also emitted.
    train_set, test_set : optional pre-made partitions; when BOTH are given the
        internal partitioning step is skipped and they are used as-is.
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    # Fix: logical `and` instead of bitwise `&` — short-circuits and reads as intended.
    if (train_set is not None) and (test_set is not None):
        train = train_set
        test = test_set
    else:
        # Ratios arrive as percentages; the partitioner expects fractions in [0, 1].
        train, test, validation = CreateTrainTestAndValidationPartitions(
            dataset, class_col, train_ratio / 100, test_ratio / 100,
            random_state, validation_ratio / 100)
    # The test split is shared by all imbalance levels and written once at the root.
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    # now creating the train set partitions, one folder per imbalance percentage
    for imbalance_perc in imbalance_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_im-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, imbalance_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.train.csv".format(
                folder_path, dataset_folder_prefix, train_ratio, imbalance_perc))
        imbalance_dataset = CreateImbalancedDataSet(
            train, class_col, minority_label, imbalance_perc / 100,
            min_minority_to_keep, random_state)
        imbalance_dataset.to_csv(csv_output_file, index=False)
        print("done imb : " + str(imbalance_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_im-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio, imbalance_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)
        # writing the parameters that describe this generated variant
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "minority_label={0}".format(minority_label),
            "imbalance_perc={0}".format(imbalance_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.params.txt".format(
                folder_path, dataset_folder_prefix, train_ratio, imbalance_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
def RunNeuralNetClassifier(datasets_root_folder, one_hot_encoding_cols=None, positive_class_label=None, cv_file_format=None, cv_scoring='f1'):
    """Train MLP classifiers over every dataset sub-folder under `datasets_root_folder`.

    For each dataset directory: standardize the numeric columns, grid-search
    (alpha, learning rate, hidden layer size) with GridSearchCV unless a cached
    CV results file exists, refit the best model, and write model, train/test
    predictions, CV results and a params file into a `nnets/<config-id>` folder.

    one_hot_encoding_cols : columns excluded from standardization and forwarded
        to PrepareDataAndLabel for encoding.
    positive_class_label : forwarded to PrepareDataAndLabel (label binarization).
    cv_file_format : optional format string for a pre-computed CV results csv;
        `{0}` is replaced by the config id. "True" is rewritten to "False" in the
        path because no separate CV is done for the early-stopping variant.
    cv_scoring : scoring string passed to GridSearchCV.

    Fixes vs. previous revision: time.clock() (removed in Python 3.8) replaced by
    time.perf_counter(); dropped an unused local and a duplicated assignment.
    """
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if first:
            # Sanity check: the first sub-folder is expected to be the full (ts-100) split.
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/nnets", is_file=False)
        config_gen = nnconfig()
        config = config_gen.GetNextConfigAlongWithIdentifier()
        while config is not None:
            id = config["id"]
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # no separate cv is done for early stopping, so both variants share the
            # "False" cv results file.
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id)).replace("True", "False")
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            data = pd.read_csv(trainfile)
            config["testset"] = testfiles[0]
            testdata = pd.read_csv(config["testset"])
            train_len = len(data)
            # Standardize everything except the one-hot columns and the label
            # (assumed to be the last column — matches config["class"] = "last").
            cols_to_ignore = set(one_hot_encoding_cols) if one_hot_encoding_cols is not None else set()
            cols_to_ignore.add(data.columns[-1])
            cols_to_transform = [c for c in data.columns if c not in cols_to_ignore]
            scaler = StandardScaler()
            scaler.fit(data[cols_to_transform])
            data[cols_to_transform] = scaler.transform(data[cols_to_transform])
            testdata[cols_to_transform] = scaler.transform(testdata[cols_to_transform])
            # Encode train+test together so both share the same one-hot columns,
            # then split back by the original train length.
            all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
            X_all, Y_all = PrepareDataAndLabel(all_data, positive_class_label,
                                               one_hot_encoding_cols)
            X = X_all[0:train_len, :]
            Y = Y_all[0:train_len]
            test_X = X_all[train_len:, :]
            test_Y = Y_all[train_len:]
            hidden_layers = [(10, ), (30, ), (50, ), (70, )]
            init_learning_rates = [0.1, 0.01, 0.001, 0.0001]
            alpha = [0.01, 0.1, 1, 10, 100]
            momentum = 0.9
            max_iter = 200
            early_stopping = config["earlystopping"]
            validation_fraction = 0.3
            random_state = int(params_info_dict["random_state"])
            # for doing 3-fold CV over alpha x learning rate x layer size
            param_grid = {
                "alpha": alpha,
                "learning_rate_init": init_learning_rates,
                "hidden_layer_sizes": hidden_layers
            }
            classifier = MLPClassifier(activation="logistic",
                                       momentum=momentum,
                                       early_stopping=early_stopping,
                                       verbose=False,
                                       validation_fraction=validation_fraction,
                                       random_state=random_state,
                                       solver="sgd",
                                       max_iter=max_iter)
            cv_file = None
            if cv_file_format is not None:
                cv_file = cv_file_format.format(id).replace("True", "False")
            if (cv_file is None) or (os.path.isfile(cv_file) == False):
                # No cached CV results: run the grid search and persist its table.
                gscv = GridSearchCV(classifier, param_grid, scoring=cv_scoring, n_jobs=3)
                gscv.fit(X, Y)
                _D = pd.DataFrame(gscv.cv_results_)
                best_params = gscv.best_params_
                _D.to_csv(cv_results_file)
            else:
                # Reuse the rank-1 parameter set from the cached CV results file.
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # Refit on the full training set with the selected hyper-parameters.
            classifier = MLPClassifier(
                hidden_layer_sizes=best_params["hidden_layer_sizes"],
                activation="logistic",
                momentum=momentum,
                early_stopping=early_stopping,
                verbose=True,
                validation_fraction=validation_fraction,
                random_state=random_state,
                solver="sgd",
                max_iter=max_iter,
                learning_rate_init=best_params["learning_rate_init"],
                alpha=best_params["alpha"])
            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()
            config['momentum'] = momentum
            config["hidden_layers"] = "10;30;50;70"
            config["alphas"] = u.ConcatToStr(";", alpha)
            config["init_learning_rates"] = u.ConcatToStr(";", init_learning_rates)
            config["total_iter"] = classifier.n_iter_
            config["time_per_iter"] = (end - start) / classifier.n_iter_
            config["best_alpha"] = best_params["alpha"]
            config["best_hidden_layer_sizes"] = best_params["hidden_layer_sizes"][0]
            config["best_init_learning_rate"] = best_params["learning_rate_init"]
            config["loss_curve"] = u.ConcatToStr(";", classifier.loss_curve_)
            config["random_state"] = random_state
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({"actual": Y, "predicted": train_predicted_Y})
            output.to_csv(train_output_file, index=False)
            # now for test set
            config["predictionoutputfile"] = test_output_file
            u.WriteBinaryFile(model_output_file, classifier)
            predicted_Y = classifier.predict(test_X)
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)
            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
            config = config_gen.GetNextConfigAlongWithIdentifier()
        print("done dataset : " + dataset_dir)
def RunKNNClassifier(datasets_root_folder, nominal_value_columns=None, positive_class_label=None, metric_fn=None, cv_file=None, cv_scoring='f1'):
    """Train a k-NN classifier over every dataset sub-folder under `datasets_root_folder`.

    Per dataset: standardize numeric columns, grid-search (weights, n_neighbors)
    unless a cached CV file is supplied, estimate train performance with
    leave-one-out CV, then fit on the full train set and write model, scaler,
    predictions and params files under `knn/<config-id>`.

    nominal_value_columns : columns excluded from standardization and forwarded
        to nnet.PrepareDataAndLabel for encoding.
    metric_fn : optional custom distance function; when given, a brute-force
        'pyfunc' metric is used instead of the default.
    cv_file : optional path to pre-computed GridSearchCV results (rank-1 row wins).
    cv_scoring : scoring string passed to GridSearchCV.

    Fixes vs. previous revision: time.clock() (removed in Python 3.8) replaced by
    time.perf_counter(); dropped a no-op `cv_file = cv_file` statement and a
    duplicated assignment.
    """
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if first:
            # Sanity check: the first sub-folder is expected to be the full (ts-100) split.
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/knn", is_file=False)
        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        train_len = len(data)
        # Standardize everything except nominal columns and the label
        # (assumed last column — matches config["class"] = "last" below).
        cols_to_ignore = set(nominal_value_columns) if nominal_value_columns is not None else set()
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [c for c in data.columns if c not in cols_to_ignore]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(testdata[cols_to_transform])
        # Encode train+test together so both share the same encoded columns.
        all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:, :]
        test_Y = Y_all[train_len:]
        param_grid = {
            'weights': np.array(['uniform', 'distance']),
            'n_neighbors': np.array([5, 10, 20, 50])
        }
        classifier = KNeighborsClassifier()
        if (cv_file is None) or (os.path.isfile(cv_file) == False):
            gscv = GridSearchCV(classifier, param_grid, scoring=cv_scoring, n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None  # load best_params from cv_file inside the config loop
        config_gen = ParameterGrid({
            'weights': ['uniform'],
            'neighbors': [-1]
        })  # -1 denotes that we need to take the cv results
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(run_output_dir, id))
            scalar_output_file = u.PreparePath("{0}/{1}.scaler".format(run_output_dir, id))
            if _D is not None:
                # Fresh grid search above: persist its results table.
                _D.to_csv(cv_results_file)
            else:
                # Reuse the rank-1 parameter set from the cached CV results file.
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            if config['neighbors'] == -1:
                # Sentinel: take the hyper-parameters selected by CV.
                neighbors = best_params['n_neighbors']
                weights = best_params['weights']
                config['best_neighbors'] = neighbors
                config['best_weights'] = weights
            else:
                neighbors = config['neighbors']
                weights = config['weights']
            if metric_fn is None:
                classifier = KNeighborsClassifier(neighbors, weights)
            else:
                classifier = KNeighborsClassifier(neighbors,
                                                  weights,
                                                  algorithm='brute',
                                                  metric='pyfunc',
                                                  metric_params={'func': metric_fn})
            # Leave-one-out CV to estimate train-set performance.
            loo = LeaveOneOut()
            y_actual = []
            y_predicted = []
            count = 0
            total = len(X)
            for train_idx, test_idx in loo.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                Y_train, Y_test = Y[train_idx], Y[test_idx]
                classifier.fit(X_train, Y_train)
                Y_test_predicted = classifier.predict(X_test)
                assert (len(Y_test_predicted) == 1)
                y_actual.append(Y_test[0])
                y_predicted.append(Y_test_predicted[0])
                count = count + 1
                if count % 100 == 0:
                    print(str(count) + " " + str(total))
            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            # for train performance (LOO predictions gathered above)
            config["trainpredictionoutputfile"] = train_output_file
            output = pd.DataFrame({"actual": y_actual, "predicted": y_predicted})
            output.to_csv(train_output_file, index=False)
            # now for test set
            config["predictionoutputfile"] = test_output_file
            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            u.WriteBinaryFile(model_output_file, classifier)
            u.WriteBinaryFile(scalar_output_file, scaler)
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("DONE dataset : " + dataset_dir)
def RunDecisionTrees(datasets_root_folder, weka_jar_path, use_arff_files=True):
    """Run Weka decision-tree training over the FIRST dataset sub-folder only.

    Iterates a small grid of (prune, min-instances-per-leaf) configs; for each
    config it builds the Weka command line via GetWekaCommandLineForConfig, runs
    it through sl.RunCmdWithoutConsoleWindow (timing the run with timeit), and
    writes an augmented `.params.txt` into `dt/<config-id>`.

    use_arff_files : selects the ".arff" vs ".csv" train/test file extension.
    NOTE(review): the `else: break` means only the first subdirectory (asserted
    to contain "ts-100") is processed — appears deliberate; confirm.
    """
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            # Only the full (ts-100) split is processed; everything after it is skipped.
            assert ("ts-100" in dataset_dir)
            first = False
        else:
            break
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        config_gen = ParameterGrid({
            'prune': [False],
            'inst': [2, 5, 8, 12, 15]
        })
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #    continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False)
            # timeit with number=1 gives wall-clock seconds for the single Weka run
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            # now for test set (currently disabled)
            #config["predictionoutputfile"] = test_output_file
            #config["testset"] = testfiles[0]
            #cmd = GetWekaCommandLineForConfig(config,True)
            #config["modelevaltimesecs"] = timeit.timeit(lambda : sl.RunCmdWithoutConsoleWindow(cmd),number=1)
            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def RunSVMClassifier(datasets_root_folder, nominal_value_columns=None, positive_class_label=None, cv_file=None, cv_scoring='f1'):
    """Train SVM classifiers over every dataset sub-folder under `datasets_root_folder`.

    Per dataset: standardize numeric columns across train / test / realtest files,
    dump the preprocessed splits to disk (np.savetxt and StoreData), grid-search
    C x kernel (poly degree or rbf gamma) unless a cached CV file is supplied,
    refit the best SVC and write model, predictions and params files under
    `svm/<config-id>`.

    nominal_value_columns : columns excluded from standardization and forwarded
        to nnet.PrepareDataAndLabel for encoding.
    cv_file : optional path to pre-computed GridSearchCV results (rank-1 row wins).
    cv_scoring : scoring string passed to GridSearchCV.

    Fixes vs. previous revision: time.clock() (removed in Python 3.8) replaced by
    time.perf_counter(); dropped a duplicated model_output_file assignment.
    """
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    realtestfiles = glob.glob("{0}/*.realtest.{1}".format(datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if first:
            # Sanity check: the first sub-folder is expected to be the full (ts-100) split.
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/svm", is_file=False)
        params_info = u.ReadLinesFromFile(paramfile)
        params_info_dict = sl.GetDictionary(params_info)
        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        realtestdata = pd.read_csv(realtestfiles[0])
        # Row offsets used to slice the concatenated matrix back into splits.
        train_len = len(data)
        test_len = len(testdata) + train_len
        # Standardize everything except nominal columns and the label
        # (assumed last column — matches config["class"] = "last" below).
        cols_to_ignore = set(nominal_value_columns) if nominal_value_columns is not None else set()
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [c for c in data.columns if c not in cols_to_ignore]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(testdata[cols_to_transform])
        realtestdata[cols_to_transform] = scaler.transform(realtestdata[cols_to_transform])
        # Encode all three splits together so they share the same encoded columns.
        all_data = pd.concat([data, testdata, realtestdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:test_len, :]
        test_Y = Y_all[train_len:test_len]
        realtest_X = X_all[test_len:, :]
        realtest_Y = Y_all[test_len:]
        # Persist the preprocessed realtest split next to the train file.
        realtest_data_file = trainfile.replace(".train.", ".realtest.preprocessed.data.")
        realtest_label_file = trainfile.replace(".train.", ".realtest.preprocessed.label.")
        np.savetxt(realtest_data_file, realtest_X, delimiter=',')
        np.savetxt(realtest_label_file, realtest_Y, delimiter=',')
        dataset_size = GetDataSetSize(dataset_dir)
        StoreData("train.csv", "train_label.csv", X, Y, dataset_size)
        StoreData("validation.csv", "validation_label.csv", test_X, test_Y, dataset_size)
        StoreData("test.csv", "test_label.csv", realtest_X, realtest_Y, dataset_size)
        # Two sub-grids: polynomial kernels (C x degree) and rbf kernels (C x gamma).
        param_grid = [
            {
                'C': [0.1, 1, 10, 100, 1000],
                'degree': [2, 3, 4],
                'kernel': ['poly']
            },
            {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [0.001, 0.0001],
                'kernel': ['rbf']
            },
        ]
        classifier = SVC(cache_size=1500,
                         random_state=int(params_info_dict['random_state']))
        if (cv_file is None) or (os.path.isfile(cv_file) == False):
            gscv = GridSearchCV(classifier, param_grid, scoring=cv_scoring, n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None  # load best_params from cv_file inside the config loop
        config_gen = [{}]
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(run_output_dir, id))
            if _D is not None:
                # Fresh grid search above: persist its results table.
                _D.to_csv(cv_results_file)
            else:
                # Reuse the rank-1 parameter set from the cached CV results file.
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            config["kernel"] = best_params['kernel']
            config['C'] = best_params['C']
            # Rebuild the SVC with the kernel-specific best hyper-parameter.
            if config['kernel'] == 'rbf':
                config['gamma'] = best_params['gamma']
                classifier = SVC(config['C'],
                                 gamma=config['gamma'],
                                 kernel=config['kernel'],
                                 cache_size=1500,
                                 random_state=int(params_info_dict['random_state']))
            else:
                config['degree'] = best_params['degree']
                classifier = SVC(config['C'],
                                 kernel=config['kernel'],
                                 degree=config['degree'],
                                 cache_size=1500,
                                 random_state=int(params_info_dict['random_state']))
            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            classifier.fit(X, Y)
            end = time.perf_counter()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            config['numsupportvectors'] = u.ConcatToStr(';', classifier.n_support_)
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({"actual": Y, "predicted": train_predicted_Y})
            output.to_csv(train_output_file, index=False)
            u.WriteBinaryFile(model_output_file, classifier)
            # now for test set
            config["predictionoutputfile"] = test_output_file
            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)