# Imports used by the functions in this file (some may already be present
# earlier in the module). `u` is the project's local plotting/data utility
# module and `sl` its shell/Weka helper module; their import statements are
# part of the original source and are not reproduced here.
import glob
import timeit

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import ParameterGrid


def ReadNNetResultsFile10k(rootpath):
    file = rootpath + "/nnet.output.csv"
    ga_noprogressfile = rootpath + "/output_test_lr_10k-iters_30-hiddenlayers_crowding.csv"
    data = pd.read_csv(file)
    ga_noprogress = pd.read_csv(ga_noprogressfile)
    return data, ga_noprogress
    # NOTE: everything below is unreachable because of the early return above.
    # It appears to be an older code path that split the GA rows out of the
    # main results file and appended the GA run from a separate csv instead.
    ga_noprogress = u.FilterRows(data, lambda x: x['algo'] == 'ga')
    data = u.FilterRows(data, lambda x: x['algo'] != 'ga')
    ga_progress = pd.read_csv(
        rootpath + "/output_test_lr_10k-iters_30-hiddenlayers_ga.csv")
    data = pd.concat([data, ga_progress], ignore_index=True)
    return data, ga_noprogress
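# The sketch below documents the behaviour this file assumes from
# u.FilterRows, the helper used throughout: keep the rows of a DataFrame for
# which a row-wise predicate returns True. This is an inference from the call
# sites, not the actual implementation (which lives in the local module `u`).
def _filter_rows_sketch(data, predicate):
    # data.apply(predicate, axis=1) evaluates the predicate once per row and
    # yields a boolean Series; boolean indexing then keeps the matching rows.
    return data[data.apply(predicate, axis=1)]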
def PlotAdaboostPerIterationCurves(file_template, filter, plot_output_file,
                                   iters, y_axis_name='F-Measure'):
    ts = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    colors = u.GetColorCombinations()
    y = []
    for _ts in ts:
        data = pd.read_csv(file_template.format(str(_ts)))
        data = u.FilterRows(data, filter)
        data = data.set_index('iter')
        # the bare FilterRows calls here were missing the `u.` prefix used at
        # every other call site and would have raised a NameError
        train_data = u.FilterRows(data, lambda x: x['istrain'] == 1)
        test_data = u.FilterRows(data, lambda x: x['istrain'] == 0)
        train_y = []
        test_y = []
        for iter in iters:
            train_y.append(train_data.loc[iter]['m'])
            test_y.append(test_data.loc[iter]['m'])
        c = colors.pop()
        y.append(u.YSeries(train_y, points_marker='o', line_color=c['color'],
                           plot_legend_label=str(_ts) + "-train"))
        y.append(u.YSeries(test_y, points_marker='x', line_color=c['color'],
                           plot_legend_label=str(_ts) + "-validation"))
    u.SaveDataPlotWithLegends(y, iters, plot_output_file,
                              x_axis_name="num of iters/weak learners",
                              y1_axis_name=y_axis_name)
def ComputeFinalResults(rootfolder):
    clustering_stats = pd.read_csv(rootfolder + "/clustering.csv")
    final_output_file = rootfolder + "/best_metrics.csv"
    data = u.FilterRows(clustering_stats, lambda x: x['p'] == 2)
    dim_red = ["ica", "pca", "rp", "mi"]
    clustering = ["kmeans", "gmm"]
    lines = []
    lines.append("clustering,dim_red,k,p,ami_raw,ami_true,sc,bic")
    raw_predictions = {}
    for c in clustering:
        # pick the raw (no dim. reduction) clustering with the lowest bic
        d = data.loc[(data['dim_red_method'] == "raw") & (data['clustering'] == c), :]
        d = d.loc[d['bic'] == np.min(d['bic']), :]
        clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo=raw.csv".format(c, d.iloc[0]['k'])
        raw_predictions[c] = np.loadtxt(clusters_file, delimiter=',')
    for dr in dim_red:
        for c in clustering:
            d = data.loc[(data['dim_red_method'] == dr) & (data['clustering'] == c), :]
            d = d.loc[d['bic'] == np.min(d['bic']), :]
            clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo={2}.csv".format(c, d.iloc[0]['k'], dr)
            predicted = np.loadtxt(clusters_file, delimiter=',')
            ami = metrics.adjusted_mutual_info_score(raw_predictions[c], predicted)
            # p is fixed to 2 by the filter above; it is written explicitly so
            # the row matches the 8-column header (the original row had only 7
            # values).
            lines.append(u.ConcatToStr(",", [c, dr, d.iloc[0]['k'], 2, ami,
                                             d.iloc[0]['ami_true'],
                                             d.iloc[0]['sc'],
                                             d.iloc[0]['bic']]))
    u.WriteTextArrayToFile(final_output_file, lines)
def PlotAvgRewardsPerEpisode(data, totalpoints, points_to_sample, outputfile,
                             y_axis_name, key_to_plot, max_points=100000):
    """
    cr : cum_rewards
    ar : avg_rewards
    len : episode_len
    goal : reached_goal
    """
    data_to_plot = u.FilterRows(
        data,
        lambda x: (x['solver'] == 'q') & (x['gamma'] == 0.99) &
                  (x['alpha'] == 1) & (x['maxInnerVi'] == totalpoints))
    x_to_take = np.arange(totalpoints)
    x_to_take = x_to_take[x_to_take % points_to_sample == 0]
    x_to_take = x_to_take[x_to_take < max_points]
    ser = data_to_plot.apply(
        lambda x: GetQRewardSeriesToPlot(x, x_to_take, key_to_plot), axis=1)
    u.SaveDataPlotWithLegends(ser.values, filename=outputfile,
                              x_axis_name="episodes",
                              y1_axis_name=y_axis_name)
def PlotClusteringMetrics(rootfolder, data, k, dim='raw', p=2):
    filter = lambda x: x['dim_red_method'] == dim and x['p'] == p
    filtered_data = u.FilterRows(data, filter)
    metrics = ["ami_raw", "ami_true", "sc", "bic"]
    gmm_data = filtered_data.loc[filtered_data['clustering'] == "gmm", :]
    kmeans_data = filtered_data.loc[filtered_data['clustering'] == "kmeans", :]
    d = {"kmeans": ('o', 'b', 'kmeans'), "gmm": ('x', 'r', 'gmm')}
    for metric in metrics:
        outputfile = u.PreparePath(
            rootfolder + "/plots/metrics/{0}_{1}_p={2}.png".format(metric, dim, str(p)))
        kmeans_ser = u.YSeries(kmeans_data[metric], xvalues=kmeans_data["k"],
                               points_marker=d["kmeans"][0],
                               line_color=d["kmeans"][1],
                               plot_legend_label=d["kmeans"][2])
        gmm_ser = u.YSeries(gmm_data[metric], xvalues=gmm_data["k"],
                            points_marker=d["gmm"][0],
                            line_color=d["gmm"][1],
                            plot_legend_label=d["gmm"][2])
        u.SaveDataPlotWithLegends([kmeans_ser, gmm_ser],
                                  x_axis_name="number of clusters",
                                  y1_axis_name=metric, filename=outputfile)
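# For reference: u.YSeries and u.SaveDataPlotWithLegends come from the local
# utility module `u`. Judging by the call sites in this file, YSeries is a
# value object bundling y-values with styling and an optional x-axis, which
# SaveDataPlotWithLegends renders to a PNG with a legend. The dataclass below
# is only a sketch of that assumed interface, not the real implementation.
import dataclasses


@dataclasses.dataclass
class _YSeriesSketch:
    values: object                 # y-values: list, ndarray or pd.Series
    xvalues: object = None         # optional x-values; assumed to default to range(len(values))
    points_marker: str = "o"       # matplotlib-style marker for the data points
    line_color: str = "b"          # matplotlib-style colour for the line
    plot_legend_label: str = ""    # label shown in the plot legend
    legend_marker: str = "o"       # marker used in the legend entry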
def GetBestRawClustering(rootfolder):
    """Returns, for each clustering method, the lowest-bic raw (no dim.
    reduction) clustering as a (predicted_labels, stats_row) pair, keyed by
    method name."""
    clustering_stats = pd.read_csv(rootfolder + "/clustering.csv")
    data = u.FilterRows(clustering_stats, lambda x: x['p'] == 2)
    raw_predictions = {}
    for c in ["kmeans", "gmm"]:
        d = data.loc[(data['dim_red_method'] == "raw") & (data['clustering'] == c), :]
        d = d.loc[d['bic'] == np.min(d['bic']), :]
        clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo=raw.csv".format(c, d.iloc[0]['k'])
        raw_predictions[c] = (np.loadtxt(clusters_file, delimiter=','), d.copy())
    return raw_predictions
def PlotClusteringMetricsForDimsAndDimRed(rootfolder, data, dims, dim_reds, k):
    # note: the `dims` parameter is unused; the x-axis dimensions come from
    # the 'p' column of the data itself
    colors = {"ica": 'r', 'pca': 'b', 'rp': 'g', 'mi': 'k', 'raw': 'orange'}
    markers = {"kmeans": 'o', "gmm": 'x'}
    metrics = ["ami_raw", "ami_true", "sc", "bic"]
    for _k in k:
        for metric in metrics:
            ser = []
            outputfile = u.PreparePath(
                rootfolder + "/plots/metrics/dr_{0}_k={1}.png".format(metric, str(_k)))
            for dim_red in dim_reds:
                for clustering in ["kmeans", "gmm"]:
                    d = data.loc[(data['dim_red_method'] == dim_red) &
                                 (data['k'] == _k) &
                                 (data['clustering'] == clustering), :]
                    ser.append(u.YSeries(
                        d[metric], xvalues=d['p'],
                        line_color=colors[dim_red],
                        points_marker=markers[clustering],
                        plot_legend_label="{0}-{1}".format(dim_red, clustering)))
            u.SaveDataPlotWithLegends(ser, x_axis_name="dimensions",
                                      y1_axis_name=metric, filename=outputfile)
def ComputeFinalResults1(rootfolder, clusters, dims):
    raw_results = GetBestRawClustering(rootfolder)
    clustering_results = pd.read_csv(rootfolder + "/clustering.csv")
    final_output_file = rootfolder + "/best_metrics.csv"
    best_raw_clustering_output = rootfolder + "/best_raw_clustering.csv"
    o = pd.concat([raw_results["kmeans"][1], raw_results["gmm"][1]])
    o.to_csv(best_raw_clustering_output)
    dim_red = ["mi", "pca", "ica", "rp"]
    lines = []
    lines.append("clustering,dim_red,k,p,ami_raw,ami_true,sc,bic")
    output = None
    for dr in dim_red:
        data = u.FilterRows(clustering_results,
                            lambda x: x["dim_red_method"] == dr)
        dim_best_val = None
        dim_result = None
        for dim in dims:
            best = {}  # per clustering method: the row with the lowest bic
            for cluster_mthd in ["kmeans", "gmm"]:
                for cluster in clusters:
                    print("{0},{1},{2},{3}".format(dr, str(dim), cluster_mthd,
                                                   str(cluster)))
                    d = data.loc[(data['clustering'] == cluster_mthd) &
                                 (data['k'] == cluster) & (data['p'] == dim)]
                    row = d.head(1).copy()
                    if (cluster_mthd not in best or
                            best[cluster_mthd]['bic'].iloc[0] > row['bic'].iloc[0]):
                        best[cluster_mthd] = row
            # score a dimensionality by the mean ami_true of the two methods
            curr_val = (best["kmeans"]['ami_true'].iloc[0] +
                        best["gmm"]['ami_true'].iloc[0]) / 2
            # alternative tried: the minimum of the two instead of the mean
            # curr_val = np.minimum(best["kmeans"]['ami_true'].iloc[0],
            #                       best["gmm"]['ami_true'].iloc[0])
            if (dim_best_val is None or dim_best_val < curr_val):
                dim_best_val = curr_val
                dim_result = best.copy()
        for c in ["kmeans", "gmm"]:
            ami_raw = GetAmiWithRawPredictions(rootfolder, raw_results, dr,
                                               dim_result[c].iloc[0]["p"],
                                               dim_result[c].iloc[0]["k"], c)
            lines.append("{0},{1},{2},{3},{4},{5},{6},{7}".format(
                c, str(dim_result[c].iloc[0]["dim_red_method"]),
                str(dim_result[c].iloc[0]["k"]),
                str(dim_result[c].iloc[0]["p"]), str(ami_raw[c]),
                str(dim_result[c].iloc[0]["ami_true"]),
                str(dim_result[c].iloc[0]["sc"]),
                str(dim_result[c].iloc[0]["bic"])))
        # if output is None:
        #     output = pd.concat([dim_result["kmeans"], dim_result["gmm"]])
        # else:
        #     output = pd.concat([output, dim_result["kmeans"], dim_result["gmm"]])
    u.WriteTextArrayToFile(final_output_file, lines)
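# GetAmiWithRawPredictions is defined elsewhere in this project. From its use
# above (the result is indexed as ami_raw[c]), it presumably loads the cluster
# assignments for the given dim-reduction method, dimensionality p and cluster
# count k, and scores them against the best raw clustering. The sketch below
# shows that assumed behaviour only; the file-naming pattern with d={p} is an
# assumption extrapolated from the d=2 pattern used above.
def _get_ami_with_raw_predictions_sketch(rootfolder, raw_results, dr, p, k, c):
    clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d={2}_algo={3}.csv".format(c, k, p, dr)
    predicted = np.loadtxt(clusters_file, delimiter=',')
    # raw_results[c][0] holds the predicted labels of the best raw clustering
    return {c: metrics.adjusted_mutual_info_score(raw_results[c][0], predicted)}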
def PlotPerIterationCurves(rootFolder, outputfolder):
    mimic = pd.read_csv(rootFolder + "/mimic.csv")
    sa = pd.read_csv(rootFolder + "/sa.csv")
    rhc = pd.read_csv(rootFolder + "/rhc.csv")
    ga = pd.read_csv(rootFolder + "/ga.csv")
    sizes = np.array(mimic['size'].unique())
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic', mimic),
        'ga': ('g', 's', 'genetic algo', ga),
        'sa': ('b', '+', 'sim annealing', sa),
        'rhc': ('k', '*', 'rhc', rhc)
    }

    def f(data, name):
        x = data['iters']
        y = data['fn_value']
        deco = algo_decoration[name]
        return u.YSeries(y, xvalues=x, points_marker='.',
                         plot_legend_label=deco[2], legend_marker='o',
                         line_color=deco[0])

    for size in sizes:
        size_root = u.PreparePath(outputfolder + "/itercurves_" + str(size) + ".png")
        y_ser = []
        for key in algo_decoration.keys():
            d = u.FilterRows(algo_decoration[key][3],
                             lambda x: x['size'] == size).head(10000)
            y_ser.append(f(d, key))
        u.SaveDataPlotWithLegends(y_ser, x_axis_name="iters", x=None,
                                  y1_axis_name="fn value", filename=size_root)
def PlotTempVariationCurvesForSa(rootfolder, algoname, temperatures):
    """Plots iterations-to-converge vs. problem size for simulated annealing
    at different cooling temperatures, with rhc as the baseline."""
    rhcdata = u.FilterRows(
        pd.read_csv(rootfolder + "/" + algoname + "/stats_agg.csv"),
        lambda x: x['algo'] == 'rhc')
    y_ser = []
    y_ser.append(
        u.YSeries(rhcdata['converged_iters'], xvalues=rhcdata['size'],
                  points_marker="*", line_color="k", plot_legend_label="rhc"))
    data_dict = {}
    deco = {
        '0': ("r", "x"),
        '90': ("b", "o"),
        '95': ("g", "+"),
        '99': ("orange", ">")
    }
    for t in temperatures:
        path = rootfolder + "/" + algoname + "_" + t
        # CompteStats (defined elsewhere in this project) presumably
        # aggregates the raw sa.csv runs into stats_agg.csv
        CompteStats(path, ["sa.csv"])
        data_dict[t] = pd.read_csv(path + "/stats_agg.csv")
        y_ser.append(
            u.YSeries(data_dict[t]['converged_iters'],
                      xvalues=data_dict[t]['size'],
                      points_marker=deco[t][1], line_color=deco[t][0],
                      plot_legend_label="sa_" + t))
    outputfile = rootfolder + "/" + algoname + "/plots/sa_temperatures.png"
    u.SaveDataPlotWithLegends(y_ser, y1_axis_name="iterations to converge",
                              x_axis_name="size", filename=outputfile)
def RunDecisionTreesWithOptimalInst(datasets_root_folder, weka_jar_path,
                                    cv_results_file, use_arff_files=True):
    file_extn = "arff" if use_arff_files else ".csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder, file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir, file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:
            # this will output on the held out set
            filter = lambda x: ((x['prune'] == False) &
                                (x[filter_name] == filter_val) &
                                (x['istrain'] == 1))
            filtered_rows = u.FilterRows(cv_results, filter)
            a = filtered_rows['m']
            if (len(a) == 0):
                print("ignoring : {0}".format(dataset_dir))
                continue
            b = np.max(filtered_rows['m'])
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs]
            best_insts = best_insts.iloc[0]['inst']
            config['inst'] = best_insts
            id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root, id), is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(run_output_dir, id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(run_output_dir, id))
            train_output_file = u.PreparePath("{0}/{1}.train.predictions.csv".format(run_output_dir, id))
            test_output_file = u.PreparePath("{0}/{1}.test.predictions.csv".format(run_output_dir, id))
            # if(os.path.isfile(train_output_file)):
            #     continue
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config["trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file
            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            # now for test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)
            config.pop('random_state', None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
def PlotPiViConvergenceForSmallAndLargeMdp(outputfolder, datafile, gamma):
    data = pd.read_csv(datafile)
    decorations = {1: 'g', 10: 'k', 10000: 'r'}
    pi_sweeps = [1, 10, 10000]
    # The large and small MDPs are plotted identically, so the same loop runs
    # for both. Note that, judging by the variable names and axis labels, the
    # 'cum_rewards' column here stores the per-iteration max change in state
    # value and 'ran_to_completion' the total value across states, each as a
    # ';'-separated series.
    for mdp, prefix in [('LargeMdpRwTraps50', 'large'), ('SmallMdpRwTraps', 'small')]:
        ser = []
        ser1 = []
        vi_added = False
        for sweep in pi_sweeps:
            data_vi = u.FilterRows(
                data, lambda x: (x['mdp'] == mdp) & (x['solver'] == 'vi') &
                (x['gamma'] == gamma))
            data_pi = u.FilterRows(
                data, lambda x: (x['mdp'] == mdp) & (x['solver'] == 'pi') &
                (x['gamma'] == gamma) & (x['maxSweepsPerIteration'] == sweep))
            assert (len(data_vi) == 1)
            assert (len(data_pi) == 1)
            data_vi_qchange = np.array(
                [float(s) for s in data_vi.iloc[0]['cum_rewards'].split(';')])
            data_vi_value = np.array(
                [float(s) for s in data_vi.iloc[0]['ran_to_completion'].split(';')])
            data_pi_qchange = np.array(
                [float(s) for s in data_pi.iloc[0]['cum_rewards'].split(';')])
            data_pi_value = np.array(
                [float(s) for s in data_pi.iloc[0]['ran_to_completion'].split(';')])
            if (vi_added == False):
                ser.append(u.YSeries(data_vi_qchange,
                                     xvalues=np.arange(len(data_vi_qchange)) + 1,
                                     line_color='b', plot_legend_label='VI'))
                ser1.append(u.YSeries(data_vi_value,
                                      xvalues=np.arange(len(data_vi_value)) + 1,
                                      line_color='b', plot_legend_label='VI'))
            ser.append(u.YSeries(data_pi_qchange,
                                 xvalues=np.arange(len(data_pi_qchange)) + 1,
                                 line_color=decorations[sweep],
                                 plot_legend_label='PI_' + str(sweep)))
            ser1.append(u.YSeries(data_pi_value,
                                  xvalues=np.arange(len(data_pi_value)) + 1,
                                  line_color=decorations[sweep],
                                  plot_legend_label='PI_' + str(sweep)))
            vi_added = True
        outputfile = u.PreparePath(outputfolder + "/plots/" + prefix +
                                   "_qchange_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser, filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Max change in state value")
        outputfile = u.PreparePath(outputfolder + "/plots/" + prefix +
                                   "_value_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser1, filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Total value across states")