import os
import sys
import warnings

import numpy as np
import pandas as pd
import pxpy as px

# HIDE_VAL, CONFIG, Variance and KL are assumed to come from the surrounding
# project (its constants and aggregation modules).


def run_prediction(self, to_pred, obs, split, progr_pred):
    data_disc = self.setup_for_cv(split, to_pred, obs)
    # reshape to (n, T * V, D)
    to_pred = to_pred.reshape(data_disc.shape + to_pred.shape[-1:])
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('ignore')
        # predict with the model from the last EM iteration
        model = px.load_model(self.model_filename + '_{}'.format(self.emiters - 1))
        self.predict(model, data_disc, progr_pred, to_pred)
        model.graph.delete()
        model.delete()
    super().run_prediction(to_pred, obs, self.split, progr_pred)  # for final console output
    to_pred = to_pred.reshape(obs.shape)
    if self.slice_shape is not None:
        # only return the centered slice pixel series
        to_pred = to_pred[:, :, np.count_nonzero(self.slice_shape) // 2]
        to_pred = np.expand_dims(to_pred, 2)
    return to_pred
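# Shape bookkeeping for the slice handling above, as a tiny self-contained
# demo; the array sizes and the mask are made up for illustration and are not
# part of the project.
def _demo_centered_slice():
    obs_demo = np.zeros((2, 5, 9, 3))            # (n, T, V, D)
    slice_mask = np.ones(9, dtype=bool)          # slice covering 9 pixels
    center = np.count_nonzero(slice_mask) // 2   # index 4, the middle pixel
    centered = np.expand_dims(obs_demo[:, :, center], 2)
    return centered.shape                        # (2, 5, 1, 3)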
def run_training(self, data, obs, split, progr_train):
    data = self.setup_for_cv(split, data, obs)
    missing = data == HIDE_VAL
    overall_loss = []
    # EM loop: train, then re-impute the hidden values via Gibbs sampling
    # from the current model before the next round of training.
    for emiter in range(self.emiters):
        self.obj = sys.maxsize
        loss = []
        new_modelname = self.model_filename + '_{}'.format(emiter)
        if not os.path.isfile(new_modelname):
            if emiter != 0:
                # load the previous model and fill data gaps with gibbs
                data[missing] = HIDE_VAL
                with warnings.catch_warnings(record=True):
                    warnings.simplefilter('ignore')
                    prev_model = px.load_model(
                        self.model_filename + '_{}'.format(emiter - 1))
                    self.predict_gibbs(prev_model, data)
            else:
                prev_model = None
            with warnings.catch_warnings(record=True):
                warnings.simplefilter('ignore')
                model = px.train(
                    data=data,
                    iters=sys.maxsize,
                    graph=px.create_graph(self.edges),
                    mode=getattr(px.ModelType, self.mode),
                    shared_states=bool(self.shared),
                    in_model=prev_model,
                    opt_regularization_hook=self.regularize,
                    opt_progress_hook=(lambda x, em=emiter, loss=loss:
                                       self.check_progress(x, progr_train, em, loss)))
                model.save(new_modelname)
                model.graph.delete()
                model.delete()
            overall_loss.append(('EM Iter ' + str(emiter), loss))
        progr_train[self.split] = (100.0 / self.emiters) * (emiter + 1)
        self.cons.progress(progr_train, self.split)
    self.plot_convergence(overall_loss)
    super().run_training(data, obs, split, progr_train)  # for final console output
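# A minimal, self-contained sketch of the EM pattern run_training implements:
# mask missing entries, fit a model, re-impute the masked entries from the
# current fit, repeat. Plain per-column means stand in for pxpy training and
# Gibbs imputation, so this runs without the library; toy_em_impute is an
# illustrative name, not part of the project API.
def toy_em_impute(data, missing_mask, em_iters=3):
    filled = data.astype(float).copy()
    filled[missing_mask] = 0.0  # crude initial guess for the hidden cells
    for _ in range(em_iters):
        col_means = filled.mean(axis=0)  # "M-step": fit a trivial model
        # "E-step": re-impute the hidden cells from the current model
        filled[missing_mask] = np.broadcast_to(col_means, filled.shape)[missing_mask]
    return filled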
def kl():
    path = os.path.join(CONFIG.ROOT_DIR, "experiments", "COVERTYPE",
                        "100_50_1_1583660364", "models")
    models = []
    theta = []
    logpartitions = []
    suff_stats = []
    # load every stored model whose filename marks it as part of the "k0" run
    for file in os.listdir(path):
        if "k0" in file:
            models.append(px.load_model(os.path.join(path, file)))
    samples = []
    for model in models:
        m, A = model.infer()  # marginals and log-partition function
        samples.append(model.sample(num_samples=100))
        logpartitions.append(A)
        suff_stats.append(model.statistics)
        theta.append(model.weights)
    weights = np.array([model.weights for model in models]).T
    var_agg = Variance(weights, samples, -1, models[0].graph, models[0].states)
    var_agg.aggregate(None)
    agg = KL(models, 500)
    agg.aggregate(None)
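# For orientation: the simplest aggregate the Variance and KL aggregators are
# compared against elsewhere (the "weights_mean" files read in main() below)
# is a plain average of the local weight vectors. A minimal sketch, assuming
# a list of loaded pxpy models as above; average_aggregate is an illustrative
# helper, not part of the project API.
def average_aggregate(models):
    # stack the flat per-model weight vectors and average them elementwise
    return np.mean(np.stack([m.weights for m in models]), axis=0)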
def main():
    dataset = ["dota2", "covertype", "susy"]
    var_dfs_low = pd.DataFrame()
    var_dfs_high = pd.DataFrame()
    varvar_dfs_low = pd.DataFrame()
    varvar_dfs_high = pd.DataFrame()
    var_aggs_low = pd.DataFrame()
    var_aggs_high = pd.DataFrame()
    save_path = os.path.join("..", "Thesis", "kapitel", "figures")
    # one list of 15 batch slots per dataset, covariance type and aggregator
    aggregates = {
        data: {
            covtype: {agg: [[] for i in range(15)]
                      for agg in ["avg", "kl", "wa", "radon", "var"]}
            for covtype in ["none", "fish", "random", "unif"]
        }
        for data in dataset
    }
    col_names = {
        "none": "NoCov", "l2": "l2", "None": "NoReg", "unif": "UnifCov",
        "fish": "FishCov", "random": "Rand", "avg": "Average",
        "radon": "Radon", "wa": "LLWeighted", "kl": "Bootstrap",
        "var": "AccWeighted"
    }
    for data in dataset:
        root_path = os.path.join("..", "..", "Cluster", data)
        for experiment in os.listdir(root_path):
            if experiment == "old" or experiment == "plots":
                continue
            index = []
            path = os.path.join(root_path, experiment)
            covtype = experiment.split("_")[2]
            reg = experiment.split("_")[-1]
            # eps is parsed from line 15 of the experiment readme
            eps = float(pd.read_csv(os.path.join(path, "readme.md"))
                        .loc[14][0].split(":")[1])
            cv = []
            baselines = []
            aggs = ['avg', 'kl', 'var', 'wa', 'radon']
            agg_dict = {a: [[] for i in range(15)] for a in aggs}
            # baseline models trained on the full data
            for i in range(10):
                model = px.load_model(
                    os.path.join(path, 'baseline', "px_model" + str(i)))
                if i == 0:
                    index.append(model.num_instances)
                baselines.append(model.weights)
            # per CV split: aggregated weights and the local models per batch
            for h in range(10):
                cvp = os.path.join(path, str(h))
                batches = []
                for i in range(15):
                    thetas = []
                    batch = os.path.join(cvp, "batch_n" + str(i))
                    agg_dict['avg'][i].append(
                        np.load(os.path.join(batch, "weights_mean.npy")))
                    agg_dict['kl'][i].append(
                        np.load(os.path.join(batch, "weights_kl.npy")))
                    agg_dict['var'][i].append(
                        np.load(os.path.join(batch, "weights_var.npy")))
                    agg_dict['wa'][i].append(
                        np.load(os.path.join(batch, "weights_wa.npy")))
                    if covtype.lower() != "none":
                        agg_dict['radon'][i].append(
                            np.load(os.path.join(batch, "weights_radon.npy")))
                    for j in range(10):
                        dist_model = px.load_model(
                            os.path.join(batch, "dist_pxmodel " + str(j) + ".px"))
                        if h == 0 and j == 0:
                            index.append(dist_model.num_instances)
                        thetas.append(dist_model.weights)
                    batches.append(thetas)
                cv.append(batches)
            # route the results into the frames for the matching eps level
            if eps == 0.05:
                df = var_dfs_high
                var_df = varvar_dfs_high
                agg_df = var_aggs_high
            else:
                df = var_dfs_low
                var_df = varvar_dfs_low
                agg_df = var_aggs_low
            average_variance = []
            variance_variance = []
            max_variance = []
            avg_acc = {a: [] for a in aggs}
            for agg in aggs:
                if agg == 'radon' and covtype == 'none':
                    continue
                # mean variance of each aggregate's weights across CV splits
                tmp_avg_agg = []
                for j in range(15):
                    tmp_avg_agg.append(np.mean(np.var(agg_dict[agg][j], axis=0)))
                avg_acc[agg].append(tmp_avg_agg)
            for i in range(15):
                # variance of the local model weights within each split
                total_var = []
                tmp_var = [a[i] for a in cv]
                for split in tmp_var:
                    total_var.append(np.var(split, axis=0))
                average_variance.append(np.mean(total_var))
                variance_variance.append(np.var(total_var))
                max_variance.append(np.max(total_var))
            for agg in aggs:
                if agg == 'radon' and covtype == 'none':
                    continue
                agg_df[data.capitalize() + " " + col_names[covtype] + " " +
                       col_names[reg] + " " + col_names[agg]] = avg_acc[agg][0]
            col = data.capitalize() + " " + col_names[reg]
            if col not in df.columns:
                df[col] = np.append(
                    np.mean(np.var(baselines, axis=0)), average_variance)
                var_df[col] = np.append(
                    np.var(np.var(baselines, axis=0)), variance_variance)
            else:
                df[col] += np.append(
                    np.mean(np.var(baselines, axis=0)), average_variance)
                var_df[col] += np.append(
                    np.var(np.var(baselines, axis=0)), variance_variance)
            df.index = index
            var_df.index = index
            agg_df.index = index[1:]
    var_dfs_high.index.name = "Num Samples"
    var_dfs_low.index.name = "Num Samples"
    # each column accumulated over the four covariance types, so average them
    var_dfs_low = var_dfs_low / 4
    var_dfs_high = var_dfs_high / 4
    varvar_dfs_high = varvar_dfs_high / 4
    varvar_dfs_low = varvar_dfs_low / 4
    with open(os.path.join(save_path, "average_std_0.05.tex"), "w+") as texfile:
        texfile.write(np.sqrt(var_dfs_high).to_latex(
            float_format="%.2f", label="", caption="", na_rep="---"))
    with open(os.path.join(save_path, "average_std_0.1.tex"), "w+") as texfile:
        texfile.write(np.sqrt(var_dfs_low).to_latex(
            float_format="%.2f", label="", caption="", na_rep="---"))
    with open(os.path.join(save_path, "std_std_0.05.tex"), "w+") as texfile:
        texfile.write(np.sqrt(varvar_dfs_high).to_latex(
            float_format="%.2f", label="", caption="", na_rep="---"))
    with open(os.path.join(save_path, "std_std_0.1.tex"), "w+") as texfile:
        texfile.write(np.sqrt(varvar_dfs_low).to_latex(
            float_format="%.2f", label="", caption="", na_rep="---"))
    return var_dfs_high, var_dfs_low
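# The four table dumps above share one pattern; a helper like this (name and
# signature are illustrative, not part of the original script) would express
# it once:
def write_std_table(df, filename,
                    save_path=os.path.join("..", "Thesis", "kapitel", "figures")):
    # convert the variance table to standard deviations and write bare LaTeX
    with open(os.path.join(save_path, filename), "w+") as texfile:
        texfile.write(np.sqrt(df).to_latex(float_format="%.2f", label="",
                                           caption="", na_rep="---"))

# e.g. write_std_table(var_dfs_high, "average_std_0.05.tex")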
def load_results(experiment):
    baseline_stats = {}
    agg_stats = []
    graph = None
    objs = None
    test_objs = None
    for cv_split in os.listdir(experiment):
        # -------------------------------
        # Load Baseline Results
        # -------------------------------
        if cv_split == 'baseline':
            b_path = os.path.join(experiment, cv_split)
            baseline_stats['baseline_metrics'] = pd.read_csv(
                os.path.join(b_path, "baseline_metrics.csv"))
            graph_model = px.load_model(os.path.join(b_path, "px_model0"))
            graph = (graph_model.graph.edgelist, graph_model.states)
        # -------------------------------
        # Load CV Results
        # -------------------------------
        elif os.path.isdir(os.path.join(experiment, cv_split)):
            curr_results = {}
            a_path = os.path.join(experiment, cv_split)
            # Load Accuracy
            curr_results['acc'] = pd.read_csv(os.path.join(a_path, "accuracy.csv"))
            # Load F1 Score
            curr_results['f1'] = pd.read_csv(os.path.join(a_path, "f1.csv"),
                                             header=None,
                                             names=curr_results['acc'].columns)
            # -------------------------------
            # Likelihood evaluated on Baseline
            #
            # File is obj.csv
            # -------------------------------
            curr_results['obj'] = pd.read_csv(os.path.join(a_path, "obj.csv"))
            if cv_split == '0':
                objs = curr_results['obj'][0:10]
                objs.columns = objs.columns.str.strip()
            else:
                curr_results['obj'].columns = agg_stats[0]['obj'].columns
                curr_results['obj'].columns = \
                    curr_results['obj'].columns.str.strip()
                curr_results['obj']['obj'] = \
                    curr_results['obj']['obj'].apply(float)
                curr_results['obj'] = curr_results['obj'][10:]
            # -------------------------------
            # Likelihood evaluated on Test Split
            #
            # File is test_likelihood.csv
            # -------------------------------
            curr_results['test_ll'] = pd.read_csv(
                os.path.join(a_path, "test_likelihood.csv"))
            if cv_split == '0':
                test_objs = curr_results['test_ll'][0:10]
                test_objs.columns = test_objs.columns.str.strip()
            else:
                curr_results['test_ll'].columns = agg_stats[0]['test_ll'].columns
                curr_results['test_ll'].columns = \
                    curr_results['test_ll'].columns.str.strip()
                curr_results['test_ll']['test_ll'] = \
                    curr_results['test_ll']['test_ll'].apply(float)
                curr_results['test_ll'] = curr_results['test_ll'][10:]
            agg_stats.append(curr_results)
    return baseline_stats, agg_stats, objs, test_objs, graph
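# Example driver, assuming the directory layout load_results expects (a
# 'baseline' folder plus one numbered folder per CV split); the experiment
# path here is a placeholder, not a real location.
if __name__ == '__main__':
    baseline_stats, agg_stats, objs, test_objs, graph = load_results(
        os.path.join("..", "..", "Cluster", "dota2", "some_experiment"))
    print("CV splits loaded:", len(agg_stats))
    if graph is not None:
        print("graph edges:", len(graph[0]))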