Example #1
def run_prediction(self, to_pred, obs, split, progr_pred):
    import pxpy as px

    # assumes module-level imports: numpy as np, warnings
    data_disc = self.setup_for_cv(split, to_pred, obs)
    to_pred = to_pred.reshape(data_disc.shape +
                              to_pred.shape[-1:])  # to shape (n, T * V, D)
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('ignore')
        # load the model from the last EM iteration and run prediction
        model = px.load_model(self.model_filename +
                              '_{}'.format(self.emiters - 1))
        self.predict(model, data_disc, progr_pred, to_pred)
        # free the native graph and model handles
        model.graph.delete()
        model.delete()
    super().run_prediction(to_pred, obs, self.split,
                           progr_pred)  # for final console output
    to_pred = to_pred.reshape(obs.shape)
    if self.slice_shape is not None:  # only return the centered slice pixel series
        to_pred = to_pred[:, :, np.count_nonzero(self.slice_shape) // 2]
        to_pred = np.expand_dims(to_pred, 2)
    return to_pred
Example #2
def run_training(self, data, obs, split, progr_train):
    import pxpy as px

    # assumes module-level imports: os, sys, warnings, and the HIDE_VAL constant
    data = self.setup_for_cv(split, data, obs)
    missing = data == HIDE_VAL
    overall_loss = []
    for emiter in range(self.emiters):
        self.obj = sys.maxsize  # reset the objective tracker for this EM iteration
        loss = []
        new_modelname = self.model_filename + '_{}'.format(emiter)
        if not os.path.isfile(new_modelname):
            if emiter != 0:  # load the previous model and fill data gaps via Gibbs sampling
                data[missing] = HIDE_VAL
                with warnings.catch_warnings(record=True):
                    warnings.simplefilter('ignore')
                    prev_model = px.load_model(self.model_filename +
                                               '_{}'.format(emiter - 1))
                    self.predict_gibbs(prev_model, data)
            else:
                prev_model = None
            with warnings.catch_warnings(record=True):
                warnings.simplefilter('ignore')
                # default arguments bind the current emiter and loss list,
                # avoiding late binding inside the progress hook
                model = px.train(
                    data=data,
                    iters=sys.maxsize,
                    graph=px.create_graph(self.edges),
                    mode=getattr(px.ModelType, self.mode),
                    shared_states=bool(self.shared),
                    in_model=prev_model,
                    opt_regularization_hook=self.regularize,
                    opt_progress_hook=(
                        lambda x, em=emiter, loss=loss:
                        self.check_progress(x, progr_train, em, loss)))
                model.save(new_modelname)
                model.graph.delete()
                model.delete()
            overall_loss.append(('EM Iter ' + str(emiter), loss))
        progr_train[self.split] = (100.0 / self.emiters) * (emiter + 1)
        self.cons.progress(progr_train, self.split)
    self.plot_convergence(overall_loss)
    super().run_training(data, obs, split,
                         progr_train)  # for final console output
Example #3
def kl():
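    # assumes module-level imports: os, numpy as np, pxpy as px, plus
    # CONFIG, Variance and KL from the surrounding project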
    path = os.path.join(CONFIG.ROOT_DIR, "experiments", "COVERTYPE",
                        "100_50_1_1583660364", "models")

    models = []
    theta = []
    logpartitions = []
    suff_stats = []
    for file in os.listdir(path):
        if "k0" in file:
            models.append(px.load_model(os.path.join(path, file)))
    samples = []
    for model in models:
        m, A = model.infer()  # marginals and log-partition value A
        samples.append(model.sample(num_samples=100))
        logpartitions.append(A)
        suff_stats.append(model.statistics)
        theta.append(model.weights)
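    # stack the per-model weight vectors column-wise for the Variance aggregator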
    weights = np.array([model.weights for model in models]).T
    var_agg = Variance(weights, samples, -1, models[0].graph, models[0].states)
    var_agg.aggregate(None)
    agg = KL(models, 500)
    agg.aggregate(None)
Example #4
def main():
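    # assumes module-level imports: os, numpy as np, pandas as pd, pxpy as px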
    dataset = ["dota2", "covertype", "susy"]
    var_dfs_low = pd.DataFrame()
    var_dfs_high = pd.DataFrame()
    varvar_dfs_low = pd.DataFrame()
    varvar_dfs_high = pd.DataFrame()
    var_aggs_low = pd.DataFrame()
    var_aggs_high = pd.DataFrame()
    save_path = os.path.join("..", "Thesis", "kapitel", "figures")

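    # per dataset and covariance type: one list per aggregation method,
    # each with one slot per batch (15 batches)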
    aggregates = {
        data: {
            cov: {
                agg: [[] for _ in range(15)]
                for agg in ("avg", "kl", "wa", "radon", "var")
            }
            for cov in ("none", "fish", "random", "unif")
        }
        for data in dataset
    }
    for data in dataset:
        root_path = os.path.join("..", "..", "Cluster", data)  # per-dataset results directory

        col_names = {
            "none": "NoCov",
            "l2": "l2",
            "None": "NoReg",
            "unif": "UnifCov",
            "fish": "FishCov",
            "random": "Rand",
            "avg": "Average",
            "radon": "Radon",
            "wa": "LLWeighted",
            "kl": "Bootstrap",
            "var": "AccWeighted"
        }

        for experiment in os.listdir(root_path):
            if experiment == "old" or experiment == "plots":
                continue
            index = []
            path = os.path.join(root_path, experiment)
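            # the directory name encodes covariance type and regularizer;
            # eps is parsed from the experiment's readme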
            covtype = experiment.split("_")[2]
            reg = experiment.split("_")[-1]
            eps = float(
                pd.read_csv(os.path.join(
                    path, "readme.md")).loc[14][0].split(":")[1])
            cv = []
            baselines = []
            aggs = ['avg', 'kl', 'var', 'wa', 'radon']
            agg_dict = {a: [[] for i in range(15)] for a in aggs}
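            # load the 10 baseline models: record the sample count once,
            # collect every weight vector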
            for i in range(10):
                baseline_model = px.load_model(
                    os.path.join(path, 'baseline', "px_model" + str(i)))
                if i == 0:
                    index.append(baseline_model.num_instances)
                baselines.append(baseline_model.weights)
            for h in range(10):
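                # collect aggregate weights and local model weights
                # for each batch of this CV split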
                cvp = os.path.join(path, str(h))
                batches = []
                for i in range(15):
                    thetas = []
                    batch = os.path.join(cvp, "batch_n" + str(i))
                    agg_dict['avg'][i].append(
                        np.load(os.path.join(batch, "weights_mean.npy")))
                    agg_dict['kl'][i].append(
                        np.load(os.path.join(batch, "weights_kl.npy")))
                    agg_dict['var'][i].append(
                        np.load(os.path.join(batch, "weights_var.npy")))
                    agg_dict['wa'][i].append(
                        np.load(os.path.join(batch, "weights_wa.npy")))
                    if covtype.lower() != "none":
                        agg_dict['radon'][i].append(
                            np.load(os.path.join(batch, "weights_radon.npy")))
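                    # read the 10 locally trained models of this batch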
                    for j in range(10):
                        dist_model = px.load_model(
                            os.path.join(batch,
                                         "dist_pxmodel " + str(j) + ".px"))
                        if h == 0 and j == 0:
                            index.append(dist_model.num_instances)
                        thetas.append(dist_model.weights)
                    batches.append(thetas)
                cv.append(batches)
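            # route results into the high- (eps == 0.05) or low-epsilon tables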
            if eps == 0.05:
                df = var_dfs_high
                var_df = varvar_dfs_high
                agg_df = var_aggs_high
            else:
                df = var_dfs_low
                var_df = varvar_dfs_low
                agg_df = var_aggs_low
            average_variance = []
            variance_variance = []
            max_variance = []
            avg_acc = {a: [] for a in aggs}
            for agg in aggs:
                if agg == 'radon' and covtype == 'none':
                    continue
                tmp_avg_agg = []
                for j in range(15):
                    tmp_avg_agg.append(
                        np.mean(np.var(agg_dict[agg][j], axis=0)))
                avg_acc[agg].append(tmp_avg_agg)

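            # variance of the local weights across the 10 learners,
            # summarized per batch over all CV splits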
            for i in range(15):
                total_var = []
                tmp_var = [a[i] for a in cv]
                for split in tmp_var:
                    total_var.append(np.var(split, axis=0))
                average_variance.append(np.mean(total_var))
                variance_variance.append(np.var(total_var))
                max_variance.append(np.max(total_var))
            for agg in aggs:
                if agg == 'radon' and covtype == 'none':
                    continue
                agg_df[data.capitalize() + " " + col_names[covtype] + " " +
                       col_names[reg] + " " + col_names[agg]] = avg_acc[agg][0]
            col = data.capitalize() + " " + col_names[reg]
            if col not in df.columns:
                df[col] = np.append(
                    np.mean(np.var(baselines, axis=0)), average_variance)
                var_df[col] = np.append(
                    np.var(np.var(baselines, axis=0)), variance_variance)
            else:
                df[col] += np.append(
                    np.mean(np.var(baselines, axis=0)), average_variance)
                var_df[col] += np.append(
                    np.var(np.var(baselines, axis=0)), variance_variance)
            df.index = index
            var_df.index = index
            agg_df.index = index[1:]

        var_dfs_high.index.name = "Num Samples"
        var_dfs_low.index.name = "Num Samples"
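    # each column was accumulated over the four covariance types; average them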
    var_dfs_low = var_dfs_low / 4
    var_dfs_high = var_dfs_high / 4
    varvar_dfs_high = varvar_dfs_high / 4
    varvar_dfs_low = varvar_dfs_low / 4
    with open(os.path.join(save_path, "average_std_0.05.tex"),
              "w+") as texfile:
        texfile.write(
            np.sqrt(var_dfs_high).to_latex(float_format="%.2f",
                                           label="",
                                           caption="",
                                           na_rep="---"))
    with open(os.path.join(save_path, "average_std_0.1.tex"), "w+") as texfile:
        texfile.write(
            np.sqrt(var_dfs_low).to_latex(float_format="%.2f",
                                          label="",
                                          caption="",
                                          na_rep="---"))
    with open(os.path.join(save_path, "std_std_0.05.tex"), "w+") as texfile:
        texfile.write(
            np.sqrt(varvar_dfs_high).to_latex(float_format="%.2f",
                                              label="",
                                              caption="",
                                              na_rep="---"))
    with open(os.path.join(save_path, "std_std_0.1.tex"), "w+") as texfile:
        texfile.write(
            np.sqrt(varvar_dfs_low).to_latex(float_format="%.2f",
                                             label="",
                                             caption="",
                                             na_rep="---"))
    return var_dfs_high, var_dfs_low
Example #5
def load_results(experiment):
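    # assumes module-level imports: os, pandas as pd, pxpy as px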
    baseline_stats = {}
    agg_stats = []
    graph = None
    objs = None
    test_objs = None
    # sort entries so split '0' is processed first; later splits reuse its columns
    for cv_split in sorted(os.listdir(experiment)):

        # -------------------------------
        # Load Baseline Results
        # -------------------------------
        if cv_split == 'baseline':
            b_path = os.path.join(experiment, cv_split)
            baseline_stats['baseline_metrics'] = pd.read_csv(
                os.path.join(b_path, "baseline_metrics.csv"))
            graph_model = px.load_model(os.path.join(b_path, "px_model0"))
            graph = (graph_model.graph.edgelist, graph_model.states)

        # -------------------------------
        # Load CV Results
        # -------------------------------
        elif os.path.isdir(os.path.join(experiment, cv_split)):
            curr_results = {}
            a_path = os.path.join(experiment, cv_split)

            # Load Accuracy
            curr_results['acc'] = pd.read_csv(
                os.path.join(a_path, "accuracy.csv"))

            # Load F1 Score
            curr_results['f1'] = pd.read_csv(os.path.join(a_path, "f1.csv"),
                                             header=None,
                                             names=curr_results['acc'].columns)

            # -------------------------------
            # Likelihood evaluated on Baseline
            #
            # File is obj.csv
            # -------------------------------
            if cv_split == '0':
                curr_results['obj'] = pd.read_csv(
                    os.path.join(a_path, "obj.csv"))
                objs = curr_results['obj'][0:10]
                objs.columns = objs.columns.str.strip()
            else:
                curr_results['obj'] = pd.read_csv(
                    os.path.join(a_path, "obj.csv"))
                curr_results['obj'].columns = agg_stats[0]['obj'].columns

            curr_results['obj'].columns = curr_results[
                'obj'].columns.str.strip()
            curr_results['obj']['obj'] = curr_results['obj']['obj'].apply(
                float)
            curr_results['obj'] = curr_results['obj'][10:]

            # -------------------------------
            # Likelihood evaluated on Test Split
            #
            # File is test_likelihood.csv
            # -------------------------------
            if cv_split == '0':
                curr_results['test_ll'] = pd.read_csv(
                    os.path.join(a_path, "test_likelihood.csv"))
                test_objs = curr_results['test_ll'][0:10]
                test_objs.columns = test_objs.columns.str.strip()
            else:
                curr_results['test_ll'] = pd.read_csv(
                    os.path.join(a_path, "test_likelihood.csv"))
                curr_results['test_ll'].columns = agg_stats[0][
                    'test_ll'].columns

            curr_results['test_ll'].columns = curr_results[
                'test_ll'].columns.str.strip()
            curr_results['test_ll']['test_ll'] = curr_results['test_ll'][
                'test_ll'].apply(float)
            curr_results['test_ll'] = curr_results['test_ll'][10:]

            agg_stats.append(curr_results)
    return baseline_stats, agg_stats, objs, test_objs, graph