def GenerateVowelRecognitionDataSetSplits(rootFolder, id, train_perc, test_perc,
                                          random_state, train_size_percs=None,
                                          imbalance_percs=None, noise_percs=None,
                                          class_col_name="vowel",
                                          min_minority_class_samples_to_keep=500,
                                          validation_perc=0):
    vowelDataFile = u.PreparePath(
        "{0}/vowel-recongnition-dataset.csv".format(rootFolder))
    arff_attrs_file = u.PreparePath("{0}/vowel.txt".format(rootFolder))
    data, arff_attrs = LoadCharacterRecognitionDataset(vowelDataFile,
                                                       arff_attrs_file)
    minority_class = "v"
    # label-flipping function used for noise injection; it must map each class
    # to the other (the original returned "c" in both branches, which never
    # restores "v" and is a no-op for "c")
    flip_fn = lambda x: "c" if (x == "v") else "v"
    if (train_size_percs is not None):
        GenerateDatasetSplits(rootFolder, id, data, test_perc, train_perc,
                              validation_perc, train_size_percs,
                              class_col_name, random_state, arff_attrs)
    if (imbalance_percs is not None):
        GenerateDatasetSplitsForClassImbalance(
            rootFolder, "imb" + str(id), data, test_perc, train_perc, 0,
            imbalance_percs, class_col_name, minority_class,
            min_minority_class_samples_to_keep, random_state, arff_attrs)
    if (noise_percs is not None):
        GenerateDatasetSplitsForWithNoise(rootFolder, "noise" + str(id), data,
                                          test_perc, train_perc, 0,
                                          noise_percs, class_col_name,
                                          flip_fn, random_state, arff_attrs)
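# Illustrative usage of the generator above. This is a sketch, not part of the
# original pipeline: the root folder path is a placeholder and the percentage
# lists mirror the commented defaults seen in GenerateCreditScreeningDataSetSplits.
def _example_generate_vowel_splits():
    GenerateVowelRecognitionDataSetSplits(
        rootFolder="path/to/VowelRecognitionDataset",  # hypothetical location
        id=0,
        train_perc=80,
        test_perc=20,
        random_state=0,
        train_size_percs=[20, 30, 40, 50, 60, 70, 80, 90, 100],  # learning-curve splits
        imbalance_percs=None,  # skip the class-imbalance variants
        noise_percs=[10, 20, 30])  # label-noise variants (values illustrative), flipped via flip_fn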
def PlotCrossValidationCurvesForSvm(rootfolder,
                                    y_axis_name="F-Measure",
                                    roots=[r'CreditScreeningDataset',
                                           'LetterRecognition']):
    for r in roots:
        root = rootfolder + "/" + r
        instance = r'i-0_t-80_T-20'
        dataset_instance_root = root + "/" + instance
        plot_output_file = u.PreparePath(
            root + r'/Plots/svm/cv.svm.{0}.png'.format(instance))
        cv_save_file = u.PreparePath(
            dataset_instance_root +
            "/svm.{0}.model_complexity_curves.csv".format(instance))
        x_axis_name = 'Train size % used'
        title = 'CV Performance'

        def parameter_getter(path):
            paramfile = "{0}/svm/cvresults/cvresults.params.txt".format(path)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            return int(params_info_dict['train_split_percent_used'])

        def cv_getter(path):
            return "{0}/svm/cvresults/cvresults.grid_search_cv_results.csv".format(
                path)

        PlotCrossValidationCurves(dataset_instance_root, plot_output_file,
                                  x_axis_name, y_axis_name, title,
                                  parameter_getter, cv_getter, cv_save_file)
def Plot(rootfolder, cols_to_plot_dict):
    data = pd.read_csv(rootfolder + "/stats_agg.csv")
    sizes = data['size'].unique()
    algos = ['rhc', 'sa', 'mimic', 'ga']
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic'),
        'ga': ('g', 's', 'genetic algo'),
        'sa': ('b', '+', 'sim annealing'),
        'rhc': ('k', '*', 'rhc')
    }
    for col in cols_to_plot_dict.keys():
        y_ser = []
        for algo in algos:
            x = data[data['algo'] == algo].loc[:, 'size']
            y = data[data['algo'] == algo].loc[:, col]
            legend_label = algo_decoration[algo][2]
            marker = algo_decoration[algo][1]
            color = algo_decoration[algo][0]
            yseries = u.YSeries(y,
                                points_marker=marker,
                                line_color=color,
                                xvalues=x,
                                plot_legend_label=legend_label)
            y_ser.append(yseries)
        y_axis_name = cols_to_plot_dict[col]
        x_axis_name = 'size'
        savepath = u.PreparePath(rootfolder + "/plots/" + col + ".png")
        u.SaveDataPlotWithLegends(y_ser,
                                  filename=savepath,
                                  y1_axis_name=y_axis_name,
                                  x_axis_name=x_axis_name)
def PlotCrossValidationCurvesForNNets(rootfolder, y_axis_name="F-Measure"):
    roots = [
        rootfolder + r'/CreditScreeningDataset',
        rootfolder + r'/LetterRecognition'
    ]
    for root in roots:
        instance = r'i-0_t-80_T-20'
        stopping = 'earlystop-False'
        dataset_instance_root = root + '/' + instance
        plot_output_file = u.PreparePath(
            root +
            r'/Plots/nnets/cv.{0}.nnets.{1}.png'.format(stopping, instance))
        cv_save_file = u.PreparePath(
            dataset_instance_root +
            "/nnets.{0}.{1}.model_complexity_curves.csv".format(
                instance, stopping))
        x_axis_name = 'Train size % used'
        parameter_name = 'train_split_percent_used'
        title = 'CV Performance'

        def parameter_getter(path):
            paramfile = "{0}/nnets/{1}/{1}.params.txt".format(path, stopping)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            return int(params_info_dict[parameter_name])

        def cv_getter(path):
            return "{0}/nnets/{1}/{1}.grid_search_cv_results.csv".format(
                path, stopping)

        PlotCrossValidationCurves(dataset_instance_root, plot_output_file,
                                  x_axis_name, y_axis_name, title,
                                  parameter_getter, cv_getter, cv_save_file)

        # keep only the small-learning-rate configs trained on 50% or 70% of
        # the data (the original or'ed "0.0001" with itself, so a single test
        # is equivalent)
        plot_fn = lambda x: ("0.0001" in x) & (("70" in x) | ("50" in x))
        plot_output_file = root + r'/Plots/nnets/cv.small.{0}.nnets.{1}.png'.format(
            stopping, instance)
        PlotCrossValidationCurves(dataset_instance_root, plot_output_file,
                                  x_axis_name, y_axis_name, title,
                                  parameter_getter, cv_getter,
                                  should_plot=plot_fn)
def StoreData(data_file_name, label_file_name, data, labels, size):
    root = u.PreparePath(
        "c:/Users/shkhandu/OneDrive/Gatech/Courses/ML/Assignment2/VowelRecognition/{0}"
        .format(str(size)),
        is_file=False)
    data_file = root + "/" + data_file_name
    label_file = root + "/" + label_file_name
    np.savetxt(data_file, data, delimiter=",")
    np.savetxt(label_file, labels, delimiter=",")
def main():
    print("running")
    rootfolder = r"C:\Users\shkhandu\OneDrive\Gatech\Courses\ML\Assignment3\skhanduja7\data"
    output = rootfolder
    # letter-recognition dataset
    output_lr = u.PreparePath(output + "/lr")
    X, Y = ReadLetterRecognitionData(rootfolder)
    RunExperiments(X, Y, output_lr, [10, 15, 26, 35, 50], [2, 4, 8, 12, 16])
    PlotClusteringMetricsForDimsAndBic(output_lr,
                                       pd.read_csv(output_lr + '/clustering.csv'),
                                       [2, 4, 8, 12, 16],
                                       ["raw", "pca", "ica", "rp", "mi"],
                                       [10, 15, 26, 35, 50],
                                       metrics=["bic"])
    PlotClusteringMetrics(output_lr, pd.read_csv(output_lr + '/clustering.csv'), [])
    ComputeFinalResults1(output_lr, [10, 15, 26, 35, 50], [2, 4, 8, 12])
    # credit-screening dataset (the output_lr variable is reused for the "cs" dir)
    output_lr = u.PreparePath(output + "/cs")
    X, Y = ReadCreditScreeningData(rootfolder)
    RunExperiments(X, Y, output_lr, [2, 5, 10, 15, 20], [2, 5, 10, 20, 30], True)
    PlotClusteringMetricsForDimsAndBic(output_lr,
                                       pd.read_csv(output_lr + '/clustering.csv'),
                                       [2, 5, 10, 20, 30],
                                       ["raw", "pca", "ica", "rp", "mi"],
                                       [2, 5, 10, 15, 20],
                                       metrics=["bic"])
    PlotClusteringMetrics(output_lr, pd.read_csv(output_lr + '/clustering.csv'), [])
    ComputeFinalResults1(output_lr, [2, 5, 10, 15, 20], [2, 5, 10, 30, 20])
def PlotCrossValidationCurvesForKnn(rootfolder, y_axis_name="F-Measure"):
    roots = [
        rootfolder + r'/CreditScreeningDataset',
        rootfolder + r'/LetterRecognition'
    ]
    for root in roots:
        instance = r'i-0_t-80_T-20'
        dataset_instance_root = root + "/" + instance
        plot_output_file = u.PreparePath(
            root + r'/Plots/knn/cv.knn.{0}.png'.format(instance))
        cv_save_file = u.PreparePath(
            dataset_instance_root +
            "/knn.{0}.model_complexity_curves.csv".format(instance))
        x_axis_name = 'Model complexity'
        title = 'CV Performance'

        def parameter_getter(path):
            paramfile = "{0}/knn/weights-uniform_neighbors--1/weights-uniform_neighbors--1.params.txt".format(
                path)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            return int(params_info_dict['train_split_percent_used'])

        def knn_label_maker(l):
            p = ast.literal_eval(l)
            return "n{0}w{1}".format(p['n_neighbors'], p['weights'][0])

        def cv_getter(path):
            return "{0}/knn/weights-uniform_neighbors--1/weights-uniform_neighbors--1.grid_search_cv_results.csv".format(
                path)

        PlotCrossValidationCurves2(dataset_instance_root, plot_output_file,
                                   x_axis_name, y_axis_name, title,
                                   parameter_getter, cv_getter, cv_save_file,
                                   label_maker=knn_label_maker)
def write_to_file(data_to_write, filepath):
    # note: include_header and arff_format_predata_lines are free variables,
    # expected to be bound in the enclosing scope where this helper is defined
    file = u.PreparePath(filepath)
    data_to_write.to_csv(file,
                         index=False,
                         header=(include_header &
                                 (arff_format_predata_lines is None)))
    if (arff_format_predata_lines is not None):
        data = []
        data.extend(arff_format_predata_lines)
        data.extend(u.ReadLinesFromFile(file))
        u.WriteTextArrayToFile(file, data)
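# Sketch (assumptions flagged) of the enclosing scope the helper above relies
# on: a hypothetical driver binds the two free variables before the nested
# write_to_file closes over them. Names here are illustrative only.
def _example_save_partition(partition_df, out_csv, arff_header_lines=None):
    include_header = True                          # read by the nested helper
    arff_format_predata_lines = arff_header_lines  # read by the nested helper

    def write_to_file(data_to_write, filepath):
        file = u.PreparePath(filepath)
        data_to_write.to_csv(file, index=False,
                             header=(include_header &
                                     (arff_format_predata_lines is None)))
        if (arff_format_predata_lines is not None):
            data = []
            data.extend(arff_format_predata_lines)
            data.extend(u.ReadLinesFromFile(file))
            u.WriteTextArrayToFile(file, data)

    write_to_file(partition_df, out_csv)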
def GenerateCreditScreeningDataSetSplits(rootFolder, id, train_perc, test_perc,
                                         random_state, train_size_percs=None,
                                         imbalance_percs=None, noise_percs=None,
                                         class_col_name="A16",
                                         min_minority_class_samples_to_keep=10,
                                         train=None, test=None,
                                         validation_perc=0):
    # typical invocation values:
    # rootFolder = r"C:\Users\shkhandu\OneDrive\Gatech\Courses\ML\DataSets\CreditScreeningDataset"
    # id = 0, train_perc = 80, test_perc = 20, random_state = 0
    # train_size_percs = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    # imbalance_percs = [90, 10, 20, 30, 40, 50, 70, 5, 100]
    dataFile = u.PreparePath(
        "{0}/data_no_missing_values.csv".format(rootFolder))
    arff_attrs_file = u.PreparePath("{0}/arff_attrs.txt".format(rootFolder))
    data, arff_attrs = LoadCreditScreeningData(dataFile, arff_attrs_file)
    minority_class = "+"
    flip_fn = lambda x: "-" if (x == "+") else "+"
    if (train_size_percs is not None):
        GenerateDatasetSplits(rootFolder, id, data, test_perc, train_perc,
                              validation_perc, train_size_percs,
                              class_col_name, random_state, arff_attrs)
    if (imbalance_percs is not None):
        GenerateDatasetSplitsForClassImbalance(
            rootFolder, "imb" + str(id), data, test_perc, train_perc, 0,
            imbalance_percs, class_col_name, minority_class,
            min_minority_class_samples_to_keep, random_state, arff_attrs)
    if (noise_percs is not None):
        GenerateDatasetSplitsForWithNoise(rootFolder, "noise" + str(id), data,
                                          test_perc, train_perc, 0,
                                          noise_percs, class_col_name,
                                          flip_fn, random_state, arff_attrs)
def PlotClusteringMetrics(rootfolder, data, k, dim='raw', p=2):
    # k is accepted for interface compatibility but is not used here
    filter = lambda x: x['dim_red_method'] == dim and x['p'] == p
    filtered_data = u.FilterRows(data, filter)
    metrics = ["ami_raw", "ami_true", "sc", "bic"]
    gmm_data = filtered_data.loc[filtered_data['clustering'] == "gmm", :]
    kmeans_data = filtered_data.loc[filtered_data['clustering'] == "kmeans", :]
    d = {"kmeans": ('o', 'b', 'kmeans'), "gmm": ('x', 'r', 'gmm')}
    for metric in metrics:
        outputfile = u.PreparePath(
            rootfolder +
            "/plots/metrics/{0}_{1}_p={2}.png".format(metric, dim, str(p)))
        kmeans_ser = u.YSeries(kmeans_data[metric],
                               xvalues=kmeans_data["k"],
                               points_marker=d["kmeans"][0],
                               line_color=d["kmeans"][1],
                               plot_legend_label=d["kmeans"][2])
        gmm_ser = u.YSeries(gmm_data[metric],
                            xvalues=gmm_data["k"],
                            points_marker=d["gmm"][0],
                            line_color=d["gmm"][1],
                            plot_legend_label=d["gmm"][2])
        u.SaveDataPlotWithLegends([kmeans_ser, gmm_ser],
                                  x_axis_name="number of clusters",
                                  y1_axis_name=metric,
                                  filename=outputfile)
def PlotClusteringMetricsForDimsAndBic(rootfolder, data, dims, dim_reds, k,
                                       metrics=["ami_raw", "ami_true", "sc", "bic"]):
    colors = {"ica": 'r', 'pca': 'b', 'rp': 'g', 'mi': 'k', 'raw': 'orange'}
    markers = {"kmeans": 'o', "gmm": 'x'}
    for _k in dims:
        for metric in metrics:
            for dim_red in dim_reds:
                ser = []
                outputfile = u.PreparePath(
                    rootfolder + "/plots/metrics/dr_{0}_p={1}_{2}.png".format(
                        metric, str(_k), dim_red))
                d = data.loc[(data['dim_red_method'] == dim_red) &
                             (data['p'] == _k) &
                             (data['clustering'] == 'kmeans'), :]
                ser.append(
                    u.YSeries(d[metric],
                              xvalues=d['k'],
                              line_color=colors[dim_red],
                              points_marker=markers['kmeans'],
                              plot_legend_label="{0}-{1}".format(dim_red, 'kmeans')))
                d = data.loc[(data['dim_red_method'] == dim_red) &
                             (data['p'] == _k) &
                             (data['clustering'] == 'gmm'), :]
                ser.append(
                    u.YSeries(d[metric],
                              xvalues=d['k'],
                              line_color=colors[dim_red],
                              points_marker=markers['gmm'],
                              plot_legend_label="{0}-{1}".format(dim_red, 'gmm')))
                u.SaveDataPlotWithLegends(ser,
                                          x_axis_name="k",
                                          y1_axis_name=metric,
                                          filename=outputfile)
def CreateArffFileFromCsv(arff_attr_info, arff_file_path, data_text_array,
                          isFile=False, hasHeader=True):
    arff_data = []
    arff_data.extend(arff_attr_info)
    data_text_array = u.ReadLinesFromFile(data_text_array) if (
        isFile) else data_text_array
    data_text_array = data_text_array[1:] if (
        isFile & hasHeader) else data_text_array
    arff_data.extend(data_text_array)
    file = u.PreparePath(arff_file_path)
    u.WriteTextArrayToFile(file, arff_data)
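# For reference, a minimal sketch of the arff_attr_info header that
# CreateArffFileFromCsv prepends. The relation and attribute names below are
# illustrative (the "+"/"-" labels mirror the credit-screening class column);
# the @RELATION/@ATTRIBUTE/@DATA layout itself is the standard ARFF format.
EXAMPLE_ARFF_ATTRS = [
    "@RELATION credit",       # hypothetical relation name
    "@ATTRIBUTE A2 NUMERIC",  # one numeric feature
    "@ATTRIBUTE A16 {+,-}",   # nominal class attribute, listed last
    "@DATA",                  # the CSV rows follow this marker
]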
def EvaluateExperiments(datasets_root_folder,
                        params_to_keep,
                        positive_class,
                        metric_calculation_fn,
                        evaluation_output_filename="performance.csv",
                        algo_folder="dt",
                        should_eval=lambda x: True):
    headers = []
    headers.extend(params_to_keep)
    headers.extend(['istrain', 'p', 'r', 'm'])
    headers = ",".join(headers)
    evals = []
    evals.append(headers)
    for directory in u.Get_Subdirectories(datasets_root_folder):
        # each directory is a dataset directory
        dt_output_dir = "{0}/{1}".format(directory, algo_folder)
        if (os.path.isdir(dt_output_dir) == False):
            continue
        for run_output_folder in u.Get_Subdirectories(dt_output_dir):
            if (should_eval(run_output_folder) == False):
                print("ignoring : {0}".format(run_output_folder))
                continue
            # read params file
            params_file_path = glob.glob(
                "{0}/*.params.txt".format(run_output_folder))[0]
            params = sl.GetDictionary(u.ReadLinesFromFile(params_file_path))
            values = []
            for k in params_to_keep:
                if (k in params):
                    values.append(str(params[k]))
                else:
                    values.append(str(np.NaN))
            p, r, f = metric_calculation_fn(
                params["trainpredictionoutputfile"], positive_class)
            train_performance_values = "{0},1,{1},{2},{3}".format(
                ",".join(values), str(p), str(r), str(f))
            evals.append(train_performance_values)
            if (os.path.isfile(params["testpredictionoutputfile"])):
                p, r, f = metric_calculation_fn(
                    params["testpredictionoutputfile"], positive_class)
                test_performance_values = "{0},0,{1},{2},{3}".format(
                    ",".join(values), str(p), str(r), str(f))
                evals.append(test_performance_values)
    u.WriteTextArrayToFile(
        u.PreparePath("{0}/{1}".format(datasets_root_folder,
                                       evaluation_output_filename)), evals)
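# A minimal sketch of the metric_calculation_fn contract assumed by
# EvaluateExperiments above: it receives a predictions CSV path and the
# positive class, and returns (precision, recall, f-measure). The
# "actual"/"predicted" column names match the files RunSVMClassifier writes;
# sklearn is an assumed dependency here.
def example_metric_fn(prediction_file, positive_class):
    from sklearn.metrics import precision_recall_fscore_support
    preds = pd.read_csv(prediction_file)
    p, r, f, _ = precision_recall_fscore_support(preds["actual"],
                                                 preds["predicted"],
                                                 pos_label=positive_class,
                                                 average="binary")
    return p, r, f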
def RunAdaBoostWithDecisionTreesToGeneratePerIterationMetrics(
        datasets_root_folder, weka_jar_path, dataset_filter, iters, inst,
        use_arff_files=True):
    """
    weightThreshold parameter:
    http://weka.8497.n7.nabble.com/AdaBoost-Parameters-td11830.html
    """
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (dataset_filter not in dataset_dir):
            continue
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/ada", is_file=False)
        config_gen = ParameterGrid({'prune': [True, False], 'iter': iters})
        for config in config_gen:
            id = GetIdForConfig(config)
            config["inst"] = inst
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            full_train_output_file = u.PreparePath(
                "{0}/{1}.fulltrain.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            # now for the test set
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            os.remove(model_output_file)
            config.pop('random_state', None)  # already present in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def PlotPerIterationCurves(rootFolder, outputfolder):
    mimic = pd.read_csv(rootFolder + "/mimic.csv")
    sa = pd.read_csv(rootFolder + "/sa.csv")
    rhc = pd.read_csv(rootFolder + "/rhc.csv")
    ga = pd.read_csv(rootFolder + "/ga.csv")
    sizes = np.array(mimic['size'].unique())
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic', mimic),
        'ga': ('g', 's', 'genetic algo', ga),
        'sa': ('b', '+', 'sim annealing', sa),
        'rhc': ('k', '*', 'rhc', rhc)
    }

    def f(data, name):
        x = data['iters']
        y = data['fn_value']
        deco = algo_decoration[name]
        return u.YSeries(y,
                         xvalues=x,
                         points_marker='.',
                         plot_legend_label=deco[2],
                         legend_marker='o',
                         line_color=deco[0])

    for size in sizes:
        size_root = u.PreparePath(outputfolder + "/itercurves_" + str(size) +
                                  ".png")
        y_ser = []
        for key in algo_decoration.keys():
            d = u.FilterRows(algo_decoration[key][3],
                             lambda x: x['size'] == size).head(10000)
            y_ser.append(f(d, key))
        u.SaveDataPlotWithLegends(y_ser,
                                  x_axis_name="iters",
                                  x=None,
                                  y1_axis_name="fn value",
                                  filename=size_root)
def PlotPiViConvergenceForSmallAndLargeMdp(outputfolder, datafile, gamma):
    data = pd.read_csv(datafile)
    decorations = {1: 'g', 10: 'k', 10000: 'r'}
    pi_sweeps = [1, 10, 10000]

    def plot_for_mdp(mdp_name, file_prefix):
        ser = []   # max change in state value per iteration
        ser1 = []  # total value across states per iteration
        vi_added = False
        for sweep in pi_sweeps:
            data_vi = u.FilterRows(
                data, lambda x: (x['mdp'] == mdp_name) &
                (x['solver'] == 'vi') & (x['gamma'] == gamma))
            data_pi = u.FilterRows(
                data, lambda x: (x['mdp'] == mdp_name) &
                (x['solver'] == 'pi') & (x['gamma'] == gamma) &
                (x['maxSweepsPerIteration'] == sweep))
            assert (len(data_vi) == 1)
            assert (len(data_pi) == 1)
            data_vi_qchange = np.array(
                [float(s) for s in data_vi.iloc[0]['cum_rewards'].split(';')])
            data_vi_value = np.array([
                float(s)
                for s in data_vi.iloc[0]['ran_to_completion'].split(';')
            ])
            data_pi_qchange = np.array(
                [float(s) for s in data_pi.iloc[0]['cum_rewards'].split(';')])
            data_pi_value = np.array([
                float(s)
                for s in data_pi.iloc[0]['ran_to_completion'].split(';')
            ])
            if (vi_added == False):
                ser.append(
                    u.YSeries(data_vi_qchange,
                              xvalues=np.arange(len(data_vi_qchange)) + 1,
                              line_color='b',
                              plot_legend_label='VI'))
                ser1.append(
                    u.YSeries(data_vi_value,
                              xvalues=np.arange(len(data_vi_value)) + 1,
                              line_color='b',
                              plot_legend_label='VI'))
            ser.append(
                u.YSeries(data_pi_qchange,
                          xvalues=np.arange(len(data_pi_qchange)) + 1,
                          line_color=decorations[sweep],
                          plot_legend_label='PI_' + str(sweep)))
            ser1.append(
                u.YSeries(data_pi_value,
                          xvalues=np.arange(len(data_pi_value)) + 1,
                          line_color=decorations[sweep],
                          plot_legend_label='PI_' + str(sweep)))
            vi_added = True
        outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                                   "_qchange_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser,
                                  filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Max change in state value")
        outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                                   "_value_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser1,
                                  filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Total value across states")

    # the large- and small-MDP cases were near-duplicate blocks; both now go
    # through the same helper
    plot_for_mdp('LargeMdpRwTraps50', 'large')
    plot_for_mdp('SmallMdpRwTraps', 'small')
def NNetAnalysis(output_root, output_file_prefix, metrics_file,
                 iters_to_ignore, y_axis_name="F-Measure"):
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }
    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]) &
                    (x['total_iter'] > iters_to_ignore))

        def train_earlystopping_filter(x):
            return x['earlystopping'] & (x['istrain'] == 1)

        def train_no_earlystopping_filter(x):
            return (x['earlystopping'] == False) & (x['istrain'] == 1)

        def test_earlystopping_filter(x):
            return x['earlystopping'] & (x['istrain'] == 0)

        def test_no_earlystopping_filter(x):
            return (x['earlystopping'] == False) & (x['istrain'] == 0)

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'earlystopping', 'istrain'])
        x = data_agg[dataset_type].unique()

        def MissingValuesHandler(curr_values_frame, keyCol, valueCol,
                                 required_values):
            # returns valueCol aligned with required_values, filling 0 for any
            # key that has no row in curr_values_frame
            data = dict(
                zip(curr_values_frame[keyCol], curr_values_frame[valueCol]))
            y = []
            for v in required_values:
                if (v in data):
                    y.append(data[v])
                else:
                    y.append(0)
            return y

        for k, v in col_funcs.items():
            for agg in v:
                mvh = lambda df: MissingValuesHandler(df, dataset_type,
                                                      k + "_" + agg, x)
                y_train_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, train_earlystopping_filter)),
                    line_color='r',
                    points_marker='o',
                    plot_legend_label="Train_with_earlystopping")
                y_train_no_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, train_no_earlystopping_filter)),
                    line_color='r',
                    points_marker='x',
                    plot_legend_label="Train_without_earlystopping")
                y_test_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, test_earlystopping_filter)),
                    line_color='b',
                    points_marker='o',
                    plot_legend_label="Validation_with_earlystopping")
                y_no_test_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, test_no_earlystopping_filter)),
                    line_color='b',
                    points_marker='x',
                    plot_legend_label="Validation_without_earlystopping")
                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    [
                        y_test_earlystopping, y_no_test_earlystopping,
                        y_train_no_earlystopping, y_train_earlystopping
                    ], x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'Neural Nets Performance')
    return data_agg
def RunExperiments(X, Y, rootfolder, clusters, dims, compute_acc=None):
    datasets = {}
    datasets["raw"] = (X, Y)
    decorations = {}
    decorations["pca"] = ("o", "r", "pca")
    decorations["ica"] = ("x", "b", "ica")
    decorations["rp"] = ("+", "g", "rp")
    decorations["mi"] = ("o", "k", "mi")
    flags = [True, True, True, True]  # pca, ica, rp, mi
    nn_output_lines = []
    nn_output_file = rootfolder + "/nn.csv"
    if (compute_acc is not None):
        h, l = CreateOutputLineForNN(
            RunNeuralNetwork(X, Y, 10, compute_acc, False), "raw")
        nn_output_lines.append(h)
        nn_output_lines.append(l)
    # shared across the dimensionality-reduction blocks below (hoisted so a
    # disabled pca flag does not leave them undefined)
    recons_err_plot = u.PreparePath(rootfolder + "/plots/err.png")
    recons_err_dict = []

    ################### PCA #####################
    if (flags[0]):
        pca_results = PerformPca(X, Y, dims, 0)
        pca_var_explained_plot = u.PreparePath(rootfolder + "/plots/pca/var.png")
        var_y = []
        err_y = []
        for dim in dims:
            key = "pca_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(
                pca_results["{0}data".format(key)]), Y)
            err_y.append(pca_results[key + "reconstruction_error"])
            var_y = pca_results[key + "explained_var_ratio"]
        ser = u.YSeries(err_y, xvalues=dims,
                        points_marker=decorations["pca"][0],
                        line_color=decorations["pca"][1],
                        plot_legend_label=decorations["pca"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y, xvalues=np.arange(len(var_y)) + 1,
                        points_marker=decorations["pca"][0],
                        line_color=decorations["pca"][1],
                        plot_legend_label=decorations["pca"][2])
        u.SaveDataPlotWithLegends([ser], x_axis_name="dimensions",
                                  y1_axis_name="% explained variance",
                                  filename=pca_var_explained_plot)

    ################### ICA #####################
    if (flags[1]):
        ica_kt_plot = u.PreparePath(rootfolder + "/plots/ica/kt.png")
        err_y = []
        ica_results = PerformIca(X, Y, dims, 0)
        for dim in dims:
            key = "ica_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(
                ica_results[key + "data"]), Y)
            err_y.append(ica_results[key + "reconstruction_error"])
        var_y = ica_results["ica_kt_all"]
        ser = u.YSeries(err_y, xvalues=dims,
                        points_marker=decorations["ica"][0],
                        line_color=decorations["ica"][1],
                        plot_legend_label=decorations["ica"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y, xvalues=np.arange(len(var_y)) + 1,
                        points_marker=decorations["ica"][0],
                        line_color=decorations["ica"][1],
                        plot_legend_label=decorations["ica"][2])
        u.SaveDataPlotWithLegends([ser], x_axis_name="components",
                                  y1_axis_name="kurtosis",
                                  filename=ica_kt_plot)

    ################### RP #####################
    if (flags[2]):
        rp_runs_plot = u.PreparePath(rootfolder + "/plots/rp/runs.png")
        err_y = []
        runs = 10
        rp_results = PerformRandomProjections(X, Y, dims, runs)
        runs_series = []
        markers = u.GetColorCombinations(10)
        i = 0
        for dim in dims:
            key = "rp_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(
                rp_results[key + "data"]), Y)
            err_y.append(rp_results[key + "reconstruction_error"])
            runs_ser = u.YSeries(rp_results[key + "reconstruction_errors_all"],
                                 xvalues=np.arange(runs) + 1,
                                 points_marker="o",
                                 line_color=markers[i]["color"],
                                 plot_legend_label="proj dims = " + str(dim))
            runs_series.append(runs_ser)
            i = i + 1
        ser = u.YSeries(err_y, xvalues=dims,
                        points_marker=decorations["rp"][0],
                        line_color=decorations["rp"][1],
                        plot_legend_label=decorations["rp"][2])
        recons_err_dict.append(ser)
        u.SaveDataPlotWithLegends(runs_series, x_axis_name="run number",
                                  y1_axis_name="reconstruction err",
                                  filename=rp_runs_plot)
        u.SaveDataPlotWithLegends(recons_err_dict, x_axis_name="dimensions",
                                  y1_axis_name="reconstruction_error",
                                  filename=recons_err_plot)

    ###################### MI Feature Selection #########################
    if (flags[3]):
        mi_results = PerformMiBasedFeatureSelection(X, Y, dims, 10)
        mi_plot = u.PreparePath(rootfolder + "/plots/mi/scores.png")
        for dim in dims:
            key = "mi_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(
                mi_results[key + "data"]), Y)
        ser = u.YSeries(mi_results["scores"],
                        xvalues=np.arange(len(mi_results["scores"])) + 1,
                        points_marker=decorations["mi"][0],
                        line_color=decorations["mi"][1],
                        plot_legend_label=decorations["mi"][2])
        u.SaveDataPlotWithLegends([ser], x_axis_name="feature number",
                                  y1_axis_name="mutual information",
                                  filename=mi_plot)

    ###################### CLUSTERING #########################
    clustering_output_file = rootfolder + "/clustering.csv"
    clustering_plots_output_root = u.PreparePath(rootfolder + "/plots")
    lines = []
    lines.append("clustering,dim_red_method,k,p,ami_raw,ami_true,sc,bic")
    raw_clustering_results = {}
    best_bic_raw_clustering = {}
    curr_best_bic = {}
    actual_labels = Y
    for dim in dims:
        for algo in ["raw", "ica", "rp", "mi", "pca"]:
            raw_data_plot_done = False
            key = "{0}_{1}_".format(algo, str(dim))
            if (algo == "raw"):
                key = "raw"
            dataset = datasets[key]
            for cluster in clusters:
                for mthd in ["kmeans", "gmm"]:
                    raw_key = "{0}_{1}".format(str(cluster), mthd)
                    print("doing clustering for dim = {0} {1} k = {2} {3}".format(
                        str(dim), algo, str(cluster), mthd))
                    c_key = "{0}_{1}_predicted".format(mthd, str(cluster))
                    c_key1 = "{0}_{1}_".format(mthd, str(cluster))
                    if (algo == "raw" and raw_key in raw_clustering_results):
                        results = raw_clustering_results[raw_key]
                    else:
                        results = RunClustering(dataset[0], dataset[1],
                                                [cluster], 0, [mthd],
                                                dim)[mthd]
                        if (algo == "raw"):
                            raw_clustering_results[raw_key] = results
                    if (compute_acc):
                        mthd_key = mthd + algo if algo == "raw" else mthd + algo + str(cluster) + str(dim)
                        if ((mthd_key not in curr_best_bic) or
                                (curr_best_bic[mthd_key] > results[c_key1 + "bic"])):
                            curr_best_bic[mthd_key] = results[c_key1 + "bic"]
                            best_bic_raw_clustering[mthd_key] = (
                                results[c_key1 + "new_data"], dataset[1],
                                results[c_key1 + "metrics"]["ami"],
                                results[c_key1 + "bic"])
                            print("new best {0} {1}".format(
                                c_key1, str(results[c_key1 + "bic"])))
                    clustering_prediction_file = u.PreparePath(
                        rootfolder +
                        "/clustering_output/mthd={0}_k={1}_d={2}_algo={3}.csv".format(
                            mthd, str(cluster), str(dim), algo))
                    np.savetxt(clustering_prediction_file, results[c_key])
                    bic = results[c_key.replace("predicted", "bic")]
                    act = ComputeClusteringMetrics(actual_labels,
                                                   results[c_key], dataset[0])
                    raw = ComputeClusteringMetrics(
                        raw_clustering_results[raw_key][c_key],
                        results[c_key], dataset[0])
                    line = "{0},{1},{2},{3},{4},{5},{6},{7}".format(
                        mthd, algo, str(cluster), str(dim), str(raw["ami"]),
                        str(act["ami"]), str(raw["sl"]), str(bic))
                    print(line)
                    plot_output_file = clustering_plots_output_root + \
                        "/{0}_{1}_{2}_{3}.png".format(mthd, str(cluster), algo, str(dim))
                    ScatterPlotForClustering(results[c_key], actual_labels,
                                             plot_output_file)
                    if (dim == 2 and algo != "raw"):
                        if (raw_data_plot_done == False):
                            plot_output_file = clustering_plots_output_root + \
                                "/{0}_{1}_data.png".format(mthd, algo)
                            ScatterPlotForClusteringData(
                                dataset[0][:, 0], dataset[0][:, 1],
                                np.zeros_like(actual_labels), actual_labels,
                                plot_output_file)
                            raw_data_plot_done = True
                        plot_output_file = clustering_plots_output_root + \
                            "/{0}_{1}_{2}_data.png".format(mthd, str(cluster), algo)
                        ScatterPlotForClusteringData(
                            dataset[0][:, 0], dataset[0][:, 1],
                            results[c_key], actual_labels, plot_output_file)
                    lines.append(line)

    if (compute_acc):
        keys_to_output = {
            "kmeansraw": "kmeans",
            "gmmraw": "gmm",
            "pca": "pca",
            "ica": "ica",
            "rp": "rp",
            "mi": "mi"
        }
        for key in keys_to_output.keys():
            if ("raw" not in key):
                dim_best_val = None
                dim_result = None
                for dim in dims:
                    best = {}  # clustering method -> (x, y, p, k, bic, ami)
                    for cluster_mthd in ["kmeans", "gmm"]:
                        for cluster in clusters:
                            datakey = cluster_mthd + key + str(cluster) + str(dim)
                            if (cluster_mthd not in best or
                                    best_bic_raw_clustering[datakey][2] >
                                    best[cluster_mthd][4]):
                                best[cluster_mthd] = (
                                    best_bic_raw_clustering[datakey][0],
                                    best_bic_raw_clustering[datakey][1], dim,
                                    cluster,
                                    best_bic_raw_clustering[datakey][3],
                                    best_bic_raw_clustering[datakey][2])
                    curr_val = (best["kmeans"][5] + best["gmm"][5]) / 2
                    if (dim_best_val is None or dim_best_val < curr_val):
                        dim_best_val = curr_val
                        dim_result = best
                _X = dim_result["gmm"][0]
                _Y = dim_result["gmm"][1]
            else:
                _X = best_bic_raw_clustering[key][0]
                _Y = best_bic_raw_clustering[key][1]
            h, l = CreateOutputLineForNN(
                RunNeuralNetwork(_X, _Y, 10, compute_acc,
                                 scale=False if "gmmraw" == key else True),
                keys_to_output[key])
            nn_output_lines.append(l)
        u.WriteTextArrayToFile(nn_output_file, nn_output_lines)
    u.WriteTextArrayToFile(clustering_output_file, lines)
def AdaBoostAnalysis(output_root, output_file_prefix, metrics_file):
    data_all = pd.read_csv(metrics_file)
    dataset_types = [
        'train_split_percent_used', 'imbalance_perc', 'noise_perc'
    ]
    col_funcs = {
        'p': ['mean', 'std'],
        'r': ['mean', 'std'],
        'm': ['mean', 'std']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        dataset_types[0]: 'Train size % used',
        dataset_types[1]: 'Fraction of positives to negatives',
        dataset_types[2]: 'Noise %',
        'modelbuildtimesecs': 'Time to build AdaBoost model (sec)'
    }
    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]))

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'prune', 'istrain', 'iter'])
        for metric, v in col_funcs.items():
            for agg in v:
                iterations = np.sort(data_agg['iter'].unique())
                prune_vals = data_agg['prune'].unique()
                dataset_type_values = data_agg[dataset_type].unique()
                for type_val in dataset_type_values:
                    for prune_val in prune_vals:
                        metric_col = metric + "_" + agg
                        y_test = []
                        y_train = []
                        for i in iterations:
                            filtered_data = data_agg[
                                (data_agg['prune'] == prune_val) &
                                (data_agg['iter'] == i) &
                                (data_agg[dataset_type] == type_val)]
                            train_data = filtered_data[
                                filtered_data['istrain'] == 1]
                            assert (len(train_data) == 1)
                            y_train.append(train_data[metric_col].iloc[0])
                            test_data = filtered_data[
                                filtered_data['istrain'] == 0]
                            assert (len(test_data) == 1)
                            y_test.append(test_data[metric_col].iloc[0])
                        # now we can plot since we have test and train values
                        # for each iteration
                        output_file_name = u.PreparePath(
                            "{4}/{0}.{1}.prune-{5}.{6}-{7}.{2}.{3}.png".format(
                                output_file_prefix, dataset_type, metric, agg,
                                output_root, prune_val, dataset_type,
                                type_val))
                        y_train_series = u.YSeries(
                            y_train,
                            line_color='r',
                            plot_legend_label='train')
                        y_test_series = u.YSeries(
                            y_test, line_color='b', plot_legend_label='test')
                        # the original used "~os.path.isfile(...)", a bitwise
                        # not on a bool that is always truthy; "not" is intended
                        if (not os.path.isfile(output_file_name)):
                            u.SaveDataPlotWithLegends(
                                [y_train_series, y_test_series], iterations,
                                output_file_name, True, "num of iterations",
                                mapping_output_words[metric],
                                "AdaBoost Performance ({0})".format(agg))
                        print(output_file_name)
def SvmAnalysis(output_root, output_file_prefix, metrics_file,
                dataset_filter_fn=None, y_axis_name="F-Measure"):

    def ComputeTotalSupportVectors(s):
        return np.array([int(t) for t in s.split(';')]).sum()

    data_all = pd.read_csv(metrics_file)
    data_all['numsupportvectors'] = data_all['numsupportvectors'].apply(
        ComputeTotalSupportVectors)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)',
        'numsupportvectors': 'Number of Support Vectors'
    }
    for dataset_type in dataset_types:

        def filter_query(x):
            return ~np.isnan(x[dataset_type])

        def train_filter(x):
            return (x['istrain'] == 1)

        def test_filter(x):
            return (x['istrain'] == 0)

        if (dataset_filter_fn is not None):
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'istrain'])
        x = data_agg[dataset_type].unique()
        for k, v in col_funcs.items():
            for agg in v:
                y_train = u.YSeries(
                    FilterRows(data_agg, train_filter)[k + "_" + agg],
                    line_color='r',
                    points_marker='o',
                    plot_legend_label="Train")
                y_test = u.YSeries(
                    FilterRows(data_agg, test_filter)[k + "_" + agg],
                    line_color='b',
                    points_marker='o',
                    plot_legend_label='validation')
                if ((k == 'numsupportvectors') | (k == 'modelbuildtimesecs')):
                    y_series = [y_train]
                else:
                    y_series = [y_test, y_train]
                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'SVM Performance')
    return data_agg
def RunSVMClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    realtestfiles = glob.glob("{0}/*.realtest.{1}".format(
        datasets_root_folder, file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/svm", is_file=False)
        params_info = u.ReadLinesFromFile(paramfile)
        params_info_dict = sl.GetDictionary(params_info)
        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        realtestdata = pd.read_csv(realtestfiles[0])
        train_len = len(data)
        test_len = len(testdata) + train_len
        cols_to_ignore = set(
            nominal_value_columns) if nominal_value_columns is not None else set()
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [c for c in data.columns if c not in cols_to_ignore]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])
        realtestdata[cols_to_transform] = scaler.transform(
            realtestdata[cols_to_transform])
        all_data = pd.concat([data, testdata, realtestdata],
                             axis=0,
                             ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:test_len, :]
        test_Y = Y_all[train_len:test_len]
        realtest_X = X_all[test_len:, :]
        realtest_Y = Y_all[test_len:]
        realtest_data_file = trainfile.replace(".train.",
                                               ".realtest.preprocessed.data.")
        realtest_label_file = trainfile.replace(
            ".train.", ".realtest.preprocessed.label.")
        np.savetxt(realtest_data_file, realtest_X, delimiter=',')
        np.savetxt(realtest_label_file, realtest_Y, delimiter=',')
        dataset_size = GetDataSetSize(dataset_dir)
        StoreData("train.csv", "train_label.csv", X, Y, dataset_size)
        StoreData("validation.csv", "validation_label.csv", test_X, test_Y,
                  dataset_size)
        StoreData("test.csv", "test_label.csv", realtest_X, realtest_Y,
                  dataset_size)
        param_grid = [
            {
                'C': [0.1, 1, 10, 100, 1000],
                'degree': [2, 3, 4],
                'kernel': ['poly']
            },
            {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [0.001, 0.0001],
                'kernel': ['rbf']
            },
        ]
        classifier = SVC(cache_size=1500,
                         random_state=int(params_info_dict['random_state']))
        if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None
        config_gen = [{}]
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))
            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # if(os.path.isfile(test_output_file)):
            #     config = config_gen.GetNextConfigAlongWithIdentifier()
            #     continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            config["kernel"] = best_params['kernel']
            config['C'] = best_params['C']
            if (config['kernel'] == 'rbf'):
                config['gamma'] = best_params['gamma']
                classifier = SVC(config['C'],
                                 gamma=config['gamma'],
                                 kernel=config['kernel'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))
            else:
                config['degree'] = best_params['degree']
                classifier = SVC(config['C'],
                                 kernel=config['kernel'],
                                 degree=config['degree'],
                                 cache_size=1500,
                                 random_state=int(
                                     params_info_dict['random_state']))
            start = time.clock()
            classifier.fit(X, Y)
            end = time.clock()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            config['numsupportvectors'] = u.ConcatToStr(
                ';', classifier.n_support_)
            # train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)
            u.WriteBinaryFile(model_output_file, classifier)
            # now for the test set
            config["predictionoutputfile"] = test_output_file
            start = time.clock()
            predicted_Y = classifier.predict(test_X)
            end = time.clock()
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({"actual": test_Y, "predicted": predicted_Y})
            output.to_csv(test_output_file, index=False)
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def KnnAnalysis(output_root, output_file_prefix, metrics_file):
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelevaltimesecs': ['mean']
    }
    # note: the original also mapped dataset_types[1] and dataset_types[2]
    # ("Fraction of positives to negatives", "Noise %"), but dataset_types
    # has only one element here, so those entries would raise an IndexError
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        dataset_types[0]: 'Train size % used',
        'modelevaltimesecs': 'Time to run Knn model (sec)'
    }
    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]) & (x['istrain'] == 0))

        def distance_weights_filter(x):
            return x['weights'] == 'distance'

        def uniform_weights_filter(x):
            return x['weights'] == 'uniform'

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'weights', 'neighbors'])
        x = data_agg[dataset_type].unique()
        for k, v in col_funcs.items():
            for agg in v:
                data_for_distance_based_weighting = FilterRows(
                    data_agg, distance_weights_filter)
                nneighbors = [5, 10, 20, 50]
                marker_and_color_map = {
                    5: ('g', 'o'),
                    10: ('r', '+'),
                    20: ('b', 'x'),
                    50: ('k', 'd')
                }
                y_series = []
                for n in nneighbors:
                    d = data_for_distance_based_weighting[
                        data_for_distance_based_weighting['neighbors'] == n]
                    y = u.YSeries(d[k + "_" + agg],
                                  line_color=marker_and_color_map[n][0],
                                  points_marker=marker_and_color_map[n][1],
                                  plot_legend_label="k = " + str(n))
                    y_series.append(y)
                output_file_name = u.PreparePath(
                    "{4}/{0}.{1}.weighted.{2}.{3}.png".format(
                        output_file_prefix, dataset_type, k, agg,
                        output_root))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'K Nearest Neighbor')
                data_for_uniform_weighting = FilterRows(
                    data_agg, uniform_weights_filter)
                y_series = []
                for n in nneighbors:
                    d = data_for_uniform_weighting[
                        data_for_uniform_weighting['neighbors'] == n]
                    y = u.YSeries(d[k + "_" + agg],
                                  line_color=marker_and_color_map[n][0],
                                  points_marker=marker_and_color_map[n][1],
                                  plot_legend_label="k = " + str(n))
                    y_series.append(y)
                output_file_name = u.PreparePath(
                    "{4}/{0}.{1}.uniform.{2}.{3}.png".format(
                        output_file_prefix, dataset_type, k, agg,
                        output_root))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'K Nearest Neighbor')
    return data_agg
def DecisionTreeAnalysis(output_root,
                         output_file_prefix,
                         metrics_file,
                         dataset_filter_fn=None,
                         plt_title="Decision Trees Performance",
                         y_axis_name='F-Measure'):
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }
    for dataset_type in dataset_types:

        def filter_query(x):
            return ~np.isnan(x[dataset_type])

        def train_prune_filter(x):
            return x['prune'] & (x['istrain'] == 1)

        def train_no_prune_filter(x):
            return (x['prune'] == False) & (x['istrain'] == 1)

        def test_prune_filter(x):
            return x['prune'] & (x['istrain'] == 0)

        def test_no_prune_filter(x):
            return (x['prune'] == False) & (x['istrain'] == 0)

        if (dataset_filter_fn is not None):
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'prune', 'istrain'])
        x = data_agg[dataset_type].unique()
        for k, v in col_funcs.items():
            for agg in v:
                y_train_prune = u.YSeries(
                    FilterRows(data_agg, train_prune_filter)[k + "_" + agg],
                    line_color='r',
                    points_marker='o',
                    plot_legend_label="Train_with_pruning")
                y_train_no_prune = u.YSeries(
                    FilterRows(data_agg, train_no_prune_filter)[k + "_" + agg],
                    line_color='r',
                    points_marker='x',
                    plot_legend_label="Train_without_pruning")
                y_test_prune = u.YSeries(
                    FilterRows(data_agg, test_prune_filter)[k + "_" + agg],
                    line_color='b',
                    points_marker='o',
                    plot_legend_label="Validation_with_pruning")
                y_no_test_prune = u.YSeries(
                    FilterRows(data_agg, test_no_prune_filter)[k + "_" + agg],
                    line_color='b',
                    points_marker='x',
                    plot_legend_label="Validation_without_pruning")
                if (len(y_train_prune.values) == 0):
                    # no pruned runs present: plot the unpruned series only
                    y_no_test_prune.plot_legend_label = "Validation"
                    y_train_no_prune.plot_legend_label = "Train"
                    if ((k == 'modelbuildtimesecs')):
                        y_series = [y_train_no_prune]
                    else:
                        y_series = [y_no_test_prune, y_train_no_prune]
                else:
                    if ((k == 'modelbuildtimesecs')):
                        y_series = [y_train_no_prune, y_train_prune]
                    else:
                        y_series = [
                            y_test_prune, y_no_test_prune, y_train_no_prune,
                            y_train_prune
                        ]
                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], plt_title)
    return data_agg
def RunDecisionTreesWithOptimalInst(datasets_root_folder,
                                    weka_jar_path,
                                    cv_results_file,
                                    use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:
            # pick the unpruned run with the best F-measure on the held out set
            filter = lambda x: (x['prune'] == False) & (
                x[filter_name] == filter_val) & (x['istrain'] == 1)
            filtered_rows = u.FilterRows(cv_results, filter)
            a = filtered_rows['m']
            if (len(a) == 0):
                print("ignoring : {0}".format(dataset_dir))
                continue
            b = np.max(filtered_rows['m'])
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs]
            best_insts = best_insts.iloc[0]['inst']
            config['inst'] = best_insts
            id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            # now for the test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            config.pop('random_state', None)  # already present in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def GenerateDatasetSplits(rootFolder, dataset_folder_prefix, dataset,
                          test_ratio, train_ratio, validation_ratio,
                          train_size_percentages, class_col, random_state,
                          arff_attr_info=None):
    """
    train_size_percentages is a list of integers specifying the percent of the
    train set to be taken while preparing each dataset
    test_ratio, train_ratio, validation_ratio : numbers in percentages
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    if (validation is not None):
        validation_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        validation.to_csv(validation_output_file_csv, index=False)
        test_output_file_csv = u.PreparePath("{0}/i-{1}.realtest.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
        test_output_file_arff = u.PreparePath("{0}/i-{1}.realtest.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    else:
        test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
            dataset_root, dataset_folder_prefix))
        test.to_csv(test_output_file_csv, index=False)
        if (arff_attr_info is not None):
            test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
                dataset_root, dataset_folder_prefix))
            CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                                  test_output_file_csv, True, True)
    # now creating the train set partitions
    for train_set_size in train_size_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_ts-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, train_set_size))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.train.csv".format(
                folder_path, dataset_folder_prefix, train_ratio,
                train_set_size))
        rows_to_keep = int(len(train) * train_set_size / 100)
        train.head(rows_to_keep).to_csv(csv_output_file, index=False)
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_ts-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    train_set_size))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)
        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "train_split_percent_used={0}".format(train_set_size)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_ts-{3}.params.txt".format(
                folder_path, dataset_folder_prefix, train_ratio,
                train_set_size))
        u.WriteTextArrayToFile(params_out_file, params_info)
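# Illustrative output layout of GenerateDatasetSplits above, derived from its
# format strings (the argument values are hypothetical):
#   GenerateDatasetSplits(root, 0, df, 20, 80, 0, [50, 100], "A16", 0, attrs)
# produces:
#   root/i-0_t-80_T-20/i-0.test.csv                         (plus i-0.test.arff)
#   root/i-0_t-80_T-20/i-0_t-80_ts-50/i-0_t-80_ts-50.train.csv
#   root/i-0_t-80_T-20/i-0_t-80_ts-50/i-0_t-80_ts-50.params.txt
#   root/i-0_t-80_T-20/i-0_t-80_ts-100/...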
def GenerateDatasetSplitsForWithNoise(rootFolder, dataset_folder_prefix,
                                      dataset, test_ratio, train_ratio,
                                      validation_ratio, noise_percentages,
                                      class_col, flip_fn, random_state,
                                      arff_attr_info=None):
    """
    noise_percentages is a list of integers specifying the percent of train
    labels to flip (via flip_fn) while preparing each dataset
    test_ratio, train_ratio, validation_ratio : numbers in percentages
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    train, test, validation = CreateTrainTestAndValidationPartitions(
        dataset, class_col, train_ratio / 100, test_ratio / 100, random_state,
        validation_ratio / 100)
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    # now creating the train set partitions
    for noise_perc in noise_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_noise-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, noise_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.train.csv".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))
        noisy_dataset = CreateNoisyDataset(train, class_col, noise_perc / 100,
                                           random_state, flip_fn)
        noisy_dataset.to_csv(csv_output_file, index=False)
        print("done noisy : " + str(noise_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_noise-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    noise_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)
        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "noise_perc={0}".format(noise_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_noise-{3}.params.txt".format(
                folder_path, dataset_folder_prefix, train_ratio, noise_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
def GenerateDatasetSplitsForClassImbalance(rootFolder, dataset_folder_prefix,
                                           dataset, test_ratio, train_ratio,
                                           validation_ratio,
                                           imbalance_percentages, class_col,
                                           minority_label,
                                           min_minority_to_keep, random_state,
                                           arff_attr_info=None,
                                           train_set=None, test_set=None):
    """
    imbalance_percentages is a list of integers controlling the minority-class
    proportion (passed to CreateImbalancedDataSet) while preparing each dataset
    test_ratio, train_ratio, validation_ratio : numbers in percentages
    """
    dataset_root = u.PreparePath("{0}/i-{1}_t-{2}_T-{3}".format(
        rootFolder, dataset_folder_prefix, train_ratio, test_ratio))
    if ((train_set is not None) & (test_set is not None)):
        train = train_set
        test = test_set
    else:
        train, test, validation = CreateTrainTestAndValidationPartitions(
            dataset, class_col, train_ratio / 100, test_ratio / 100,
            random_state, validation_ratio / 100)
    test_output_file_csv = u.PreparePath("{0}/i-{1}.test.csv".format(
        dataset_root, dataset_folder_prefix))
    test.to_csv(test_output_file_csv, index=False)
    if (arff_attr_info is not None):
        test_output_file_arff = u.PreparePath("{0}/i-{1}.test.arff".format(
            dataset_root, dataset_folder_prefix))
        CreateArffFileFromCsv(arff_attr_info, test_output_file_arff,
                              test_output_file_csv, True, True)
    # now creating the train set partitions
    for imbalance_perc in imbalance_percentages:
        folder_path = u.PreparePath("{0}/i-{1}_t-{2}_im-{3}".format(
            dataset_root, dataset_folder_prefix, train_ratio, imbalance_perc))
        csv_output_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.train.csv".format(
                folder_path, dataset_folder_prefix, train_ratio,
                imbalance_perc))
        imbalance_dataset = CreateImbalancedDataSet(train, class_col,
                                                    minority_label,
                                                    imbalance_perc / 100,
                                                    min_minority_to_keep,
                                                    random_state)
        imbalance_dataset.to_csv(csv_output_file, index=False)
        print("done imb : " + str(imbalance_perc))
        if (arff_attr_info is not None):
            arff_output_file = u.PreparePath(
                "{0}/i-{1}_t-{2}_im-{3}.train.arff".format(
                    folder_path, dataset_folder_prefix, train_ratio,
                    imbalance_perc))
            CreateArffFileFromCsv(arff_attr_info, arff_output_file,
                                  csv_output_file, True, True)
        # writing the parameters
        params_info = [
            "dataset_instance={0}".format(dataset_folder_prefix),
            "test_split={0}".format(test_ratio),
            "train_split={0}".format(train_ratio),
            "random_state={0}".format(random_state),
            "class_col={0}".format(class_col),
            "minority_label={0}".format(minority_label),
            "imbalance_perc={0}".format(imbalance_perc)
        ]
        params_out_file = u.PreparePath(
            "{0}/i-{1}_t-{2}_im-{3}.params.txt".format(
                folder_path, dataset_folder_prefix, train_ratio,
                imbalance_perc))
        u.WriteTextArrayToFile(params_out_file, params_info)
def RunDecisionTrees(datasets_root_folder, weka_jar_path, use_arff_files=True):
    file_extn = "arff" if use_arff_files else "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    # only the first subdirectory (the full ts-100 train split) is processed;
    # the loop breaks once it has been handled
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        else:
            break
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        config_gen = ParameterGrid({
            'prune': [False],
            'inst': [2, 5, 8, 12, 15]
        })
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if (os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            # for every config there has to be a train prediction and a test
            # prediction
            cmd = GetWekaCommandLineForConfig(config, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            # test set evaluation is currently disabled:
            # config["predictionoutputfile"] = test_output_file
            # config["testset"] = testfiles[0]
            # cmd = GetWekaCommandLineForConfig(config, True)
            # config["modelevaltimesecs"] = timeit.timeit(
            #     lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            config.pop('random_state', None)  # already present in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
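
# Hedged usage sketch: RunDecisionTrees shells out to Weka through
# GetWekaCommandLineForConfig, so it only needs a dataset instance root
# (holding the ts-100 split) and the path to a local weka.jar. Both paths
# below are placeholders for a local setup.
def ExampleDecisionTreeRun():
    RunDecisionTrees("datasets/CreditScreeningDataset/i-0_t-80_T-20",
                     weka_jar_path="C:/weka/weka.jar")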
def RunKNNClassifier(datasets_root_folder,
                     nominal_value_columns=None,
                     positive_class_label=None,
                     metric_fn=None,
                     cv_file=None,
                     cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/knn", is_file=False)
        data = pd.read_csv(trainfile)
        testdata = pd.read_csv(testfiles[0])
        train_len = len(data)
        # standardize every numeric column; nominal columns and the label
        # column (assumed to be last) are left untouched
        cols_to_ignore = set(
            nominal_value_columns
        ) if nominal_value_columns is not None else set()
        cols_to_ignore.add(data.columns[-1])
        cols_to_transform = [
            c for c in data.columns if c not in cols_to_ignore
        ]
        scaler = StandardScaler()
        scaler.fit(data[cols_to_transform])
        data[cols_to_transform] = scaler.transform(data[cols_to_transform])
        testdata[cols_to_transform] = scaler.transform(
            testdata[cols_to_transform])
        all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
        X_all, Y_all = nnet.PrepareDataAndLabel(all_data, positive_class_label,
                                                nominal_value_columns)
        X = X_all[0:train_len, :]
        Y = Y_all[0:train_len]
        test_X = X_all[train_len:, :]
        test_Y = Y_all[train_len:]
        param_grid = {
            'weights': ['uniform', 'distance'],
            'n_neighbors': [5, 10, 20, 50]
        }
        classifier = KNeighborsClassifier()
        if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
            gscv = GridSearchCV(classifier,
                                param_grid,
                                scoring=cv_scoring,
                                n_jobs=3)
            gscv.fit(X, Y)
            _D = pd.DataFrame(gscv.cv_results_)
            best_params = gscv.best_params_
        else:
            _D = None
        # -1 denotes that the best hyper-parameters found by CV should be used
        config_gen = ParameterGrid({'weights': ['uniform'], 'neighbors': [-1]})
        for config in config_gen:
            id = GetIdForConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id))
            scaler_output_file = u.PreparePath("{0}/{1}.scaler".format(
                run_output_dir, id))
            if (_D is not None):
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # if (os.path.isfile(test_output_file)):
            #     continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            if (config['neighbors'] == -1):
                neighbors = best_params['n_neighbors']
                weights = best_params['weights']
                config['best_neighbors'] = neighbors
                config['best_weights'] = weights
            else:
                neighbors = config['neighbors']
                weights = config['weights']
            if (metric_fn is None):
                classifier = KNeighborsClassifier(n_neighbors=neighbors,
                                                  weights=weights)
            else:
                # a user supplied distance requires brute force search; the
                # callable is passed directly as the metric
                classifier = KNeighborsClassifier(n_neighbors=neighbors,
                                                  weights=weights,
                                                  algorithm='brute',
                                                  metric=metric_fn)
            # leave-one-out estimates serve as the train set predictions
            loo = LeaveOneOut()
            y_actual = []
            y_predicted = []
            count = 0
            total = len(X)
            for train_idx, test_idx in loo.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                Y_train, Y_test = Y[train_idx], Y[test_idx]
                classifier.fit(X_train, Y_train)
                Y_test_predicted = classifier.predict(X_test)
                assert (len(Y_test_predicted) == 1)
                y_actual.append(Y_test[0])
                y_predicted.append(Y_test_predicted[0])
                count = count + 1
                if (count % 100 == 0):
                    print(str(count) + " " + str(total))
            # wall-clock timing (time.clock() was removed in Python 3.8)
            start = time.perf_counter()
            classifier.fit(X, Y)
            end = time.perf_counter()
            print(end - start)
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            output = pd.DataFrame({
                "actual": y_actual,
                "predicted": y_predicted
            })
            output.to_csv(train_output_file, index=False)
            # now for the test set
            config["predictionoutputfile"] = test_output_file
            start = time.perf_counter()
            predicted_Y = classifier.predict(test_X)
            end = time.perf_counter()
            u.WriteBinaryFile(model_output_file, classifier)
            u.WriteBinaryFile(scaler_output_file, scaler)
            config["modelevaltimesecs"] = end - start
            output = pd.DataFrame({
                "actual": test_Y,
                "predicted": predicted_Y
            })
            output.to_csv(test_output_file, index=False)
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("DONE dataset : " + dataset_dir)
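
# RunKNNClassifier accepts an arbitrary distance through metric_fn, in which
# case the neighbor search above runs brute force with the callable as the
# metric. A sketch under that assumption; the 16-dimensional weight vector
# and the dataset path are hypothetical.
def ExampleKnnRunWithCustomMetric():
    feature_weights = np.ones(16)  # hypothetical per-feature weights

    def weighted_euclidean(a, b):
        # scikit-learn calls the metric with two 1-d feature vectors
        d = (a - b) * feature_weights
        return np.sqrt(np.dot(d, d))

    RunKNNClassifier("datasets/LetterRecognition/i-0_t-80_T-20",
                     metric_fn=weighted_euclidean)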
def RunNeuralNetClassifier(datasets_root_folder,
                           one_hot_encoding_cols=None,
                           positive_class_label=None,
                           cv_file_format=None,
                           cv_scoring='f1'):
    file_extn = "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    first = True
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        if (first):
            assert ("ts-100" in dataset_dir)
            first = False
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/nnets", is_file=False)
        config_gen = nnconfig()
        config = config_gen.GetNextConfigAlongWithIdentifier()
        while (config is not None):
            id = config["id"]
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # no separate cv is done for early stopping, so the cv results of
            # the corresponding earlystop-False run are reused
            cv_results_file = u.PreparePath(
                "{0}/{1}.grid_search_cv_results.csv".format(
                    run_output_dir, id)).replace("True", "False")
            # if (os.path.isfile(cv_results_file)):
            #     config = config_gen.GetNextConfigAlongWithIdentifier()
            #     continue
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            data = pd.read_csv(trainfile)
            config["testset"] = testfiles[0]
            testdata = pd.read_csv(config["testset"])
            train_len = len(data)
            # standardize every numeric column; one-hot encoded columns and
            # the label column (assumed to be last) are left untouched
            cols_to_ignore = set(
                one_hot_encoding_cols
            ) if one_hot_encoding_cols is not None else set()
            cols_to_ignore.add(data.columns[-1])
            cols_to_transform = [
                c for c in data.columns if c not in cols_to_ignore
            ]
            scaler = StandardScaler()
            scaler.fit(data[cols_to_transform])
            data[cols_to_transform] = scaler.transform(
                data[cols_to_transform])
            testdata[cols_to_transform] = scaler.transform(
                testdata[cols_to_transform])
            all_data = pd.concat([data, testdata], axis=0, ignore_index=True)
            X_all, Y_all = PrepareDataAndLabel(all_data, positive_class_label,
                                               one_hot_encoding_cols)
            X = X_all[0:train_len, :]
            Y = Y_all[0:train_len]
            test_X = X_all[train_len:, :]
            test_Y = Y_all[train_len:]
            hidden_layers = [(10, ), (30, ), (50, ), (70, )]
            init_learning_rates = [0.1, 0.01, 0.001, 0.0001]
            alpha = [0.01, 0.1, 1, 10, 100]
            momentum = 0.9
            max_iter = 200
            early_stopping = config["earlystopping"]
            validation_fraction = 0.3
            random_state = int(params_info_dict["random_state"])
            solver = 'sgd'
            # for doing 3-fold CV
            param_grid = {
                "alpha": alpha,
                "learning_rate_init": init_learning_rates,
                "hidden_layer_sizes": hidden_layers
            }
            classifier = MLPClassifier(
                activation="logistic",
                momentum=momentum,
                early_stopping=early_stopping,
                verbose=False,
                validation_fraction=validation_fraction,
                random_state=random_state,
                solver=solver,
                max_iter=max_iter)
            cv_file = None
            if (cv_file_format is not None):
                cv_file = cv_file_format.format(id).replace("True", "False")
            if ((cv_file is None) or (os.path.isfile(cv_file) == False)):
                gscv = GridSearchCV(classifier,
                                    param_grid,
                                    scoring=cv_scoring,
                                    n_jobs=3)
                gscv.fit(X, Y)
                _D = pd.DataFrame(gscv.cv_results_)
                best_params = gscv.best_params_
                _D.to_csv(cv_results_file)
            else:
                cv_results = pd.read_csv(cv_file)
                best_params = ast.literal_eval(cv_results[
                    cv_results['rank_test_score'] == 1].iloc[0]['params'])
            # refit on the full train set with the best CV parameters
            classifier = MLPClassifier(
                hidden_layer_sizes=best_params["hidden_layer_sizes"],
                activation="logistic",
                momentum=momentum,
                early_stopping=early_stopping,
                verbose=True,
                validation_fraction=validation_fraction,
                random_state=random_state,
                solver=solver,
                max_iter=max_iter,
                learning_rate_init=best_params["learning_rate_init"],
                alpha=best_params["alpha"])
            # wall-clock timing (time.clock() was removed in Python 3.8)
            start = time.perf_counter()
            classifier.fit(X, Y)
            end = time.perf_counter()
            config['momentum'] = momentum
            config["hidden_layers"] = "10;30;50;70"
            config["alphas"] = u.ConcatToStr(";", alpha)
            config["init_learning_rates"] = u.ConcatToStr(
                ";", init_learning_rates)
            config["total_iter"] = classifier.n_iter_
            config["time_per_iter"] = (end - start) / classifier.n_iter_
            config["best_alpha"] = best_params["alpha"]
            config["best_hidden_layer_sizes"] = best_params[
                "hidden_layer_sizes"][0]
            config["best_init_learning_rate"] = best_params[
                "learning_rate_init"]
            config["loss_curve"] = u.ConcatToStr(";", classifier.loss_curve_)
            config["random_state"] = random_state
            config["modelbuildtimesecs"] = end - start
            # for train performance
            config["trainpredictionoutputfile"] = train_output_file
            train_predicted_Y = classifier.predict(X)
            output = pd.DataFrame({
                "actual": Y,
                "predicted": train_predicted_Y
            })
            output.to_csv(train_output_file, index=False)
            # now for the test set
            config["predictionoutputfile"] = test_output_file
            u.WriteBinaryFile(model_output_file, classifier)
            predicted_Y = classifier.predict(test_X)
            output = pd.DataFrame({
                "actual": test_Y,
                "predicted": predicted_Y
            })
            output.to_csv(test_output_file, index=False)
            config.pop('random_state', None)  # already present in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
            config = config_gen.GetNextConfigAlongWithIdentifier()
        print("done dataset : " + dataset_dir)
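
# Hedged usage sketch for the runner above: nnconfig() supplies the
# earlystopping flag per config, and cv_file_format (expanded with the config
# id) lets a later run reuse grid search results a previous run saved. The
# dataset root and positive class label below are placeholders.
def ExampleNeuralNetRun():
    root = "datasets/CreditScreeningDataset/i-0_t-80_T-20"
    RunNeuralNetClassifier(
        root,
        one_hot_encoding_cols=None,
        positive_class_label="+",  # hypothetical positive class label
        cv_file_format=root + "/nnets/{0}/{0}.grid_search_cv_results.csv")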