Example #1
    def generate_gibbs_states_plots(self,
                                    states_path: str,
                                    cat: str = "likelihood"):
        new_dir = Tools.get_path(states_path, f"{cat}_plots")
        if Tools.path_exists(new_dir):
            print("Plots found, skipping..")
            return

        Tools.initialise_directory(new_dir)
        with Tools.scan_directory(states_path) as outputs:
            for i, output in enumerate(outputs):
                try:
                    state_file = Tools.get_path(output.path, "state.log")
                    df = pd.read_csv(filepath_or_buffer=state_file,
                                     delim_whitespace=True,
                                     index_col="iter")
                    ax = sns.lineplot(x=df.index, y=cat, data=df)
                    ax.margins(x=0)
                    name = output.name
                    fig = ax.get_figure()
                    fig.savefig(Tools.get_path(states_path, f"{cat}_plots",
                                               f"{name}.png"),
                                dpi=300,
                                bbox_inches="tight",
                                format="png")
                    fig.clf()
                    print(f"{i}")
                except FileNotFoundError:
                    print(f"→ Skipping {output.name}")
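Example #1 assumes module-level imports such as "import pandas as pd" and "import seaborn as sns", plus a project-specific Tools path helper whose implementation is not shown. The sketch below is a hypothetical stand-in for that helper, inferred only from the calls above (thin wrappers around os and os.path); the project's real Tools class may behave differently.

import os
from contextlib import contextmanager


class Tools:
    # Hypothetical stand-in for the project's Tools helper, inferred from
    # the calls in Example #1; the real implementation may differ.

    @staticmethod
    def get_path(*parts):
        # Join path components in an OS-independent way.
        return os.path.join(*parts)

    @staticmethod
    def path_exists(path):
        return os.path.exists(path)

    @staticmethod
    def initialise_directory(path):
        # Create the directory (and any missing parents) if it is absent.
        os.makedirs(path, exist_ok=True)

    @staticmethod
    @contextmanager
    def scan_directory(path):
        # Yield an iterator of os.DirEntry objects and close it afterwards.
        with os.scandir(path) as entries:
            yield entries

With such a helper in place, the method expects each output folder under states_path to contain a state.log file: a whitespace-separated table with an "iter" column (used as the index) and a column matching cat (by default "likelihood"), which is what gets plotted.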
Example #2
    def _generate_hdps_outputs(self,
                               skip_factor: int = 1,
                               verbose: bool = False):
        st = time.perf_counter()
        ldac_path = Tools.get_path("lda_c_format_HyperFalse",
                                   "dummy_ldac_corpus.dat")
        words_nums = {}
        vocab_file = Tools.get_path("lda_c_format_HyperFalse",
                                    "dummy_ldac_corpus.dat.vocab")
        #        size = ((60 // skip_factor)
        #                * len(self.etas)
        #                * len(self.gammas)**2
        #                * len(self.alphas)**2)
        # Since we fixed the scales of Gammas
        size = ((60 // skip_factor) * len(self.etas) * len(self.gammas) *
                len(self.alphas))
        i = 0
        with Tools.scan_directory(self.training_folder) as ps_folders:
            for c, folder in enumerate(ps_folders):
                if not folder.name.startswith("problem"):
                    if verbose:
                        print(f"→ Skipping {folder.name}")
                    continue
                # Implement the skipping factor
                if c % skip_factor != 0:
                    continue

                t = time.perf_counter()
                # Fix the scale parameters for the Gamma priors
                g_r = 1
                a_r = 1
                for eta in self.etas:
                    # for g_s, g_r in product(self.gammas, repeat=2):
                    # for a_s, a_r in product(self.alphas, repeat=2):
                    # Only switch the shape parameter of Gammas
                    for g_s in self.gammas:
                        for a_s in self.alphas:
                            # Cache the number of words for later
                            if folder.name not in words_nums:
                                vocab_path = Tools.get_path(
                                    folder.path, vocab_file)
                                n_words = self._get_number_words(vocab_path)
                                words_nums.update({folder.name: n_words})

                            i = i + 1
                            percentage = f"{100 * i / size:06.02f}"
                            suff = (f"{g_s:0.2f}_{g_r:0.2f}_"
                                    f"{a_s:0.2f}_{a_r:0.2f}")
                            if verbose:
                                print(f"► Applying HDP with "
                                      f"eta={eta:0.1f} "
                                      f"gamma({g_s:0.2f}, {g_r:0.2f}) "
                                      f"alpha({a_s:0.2f}, {a_r:0.2f}) "
                                      f"on {folder.name} [{percentage}%]")

                            directory = Tools.get_path(self.out_dir,
                                                       "optimisation",
                                                       f"{eta:0.1f}__{suff}",
                                                       folder.name)

                            if Tools.path_exists(directory):
                                if verbose:
                                    print("\tcached result found at "
                                          f"{directory}")
                                continue

                            path_executable = r"{}\hdp.exe".format(
                                self.hdp_path)
                            data = Tools.get_path(folder.path, ldac_path)

                            # Prepare the output directory
                            Tools.initialise_directories(directory)

                            # Build the hdp.exe training command; "s" is
                            # assumed to be the subprocess module (e.g.
                            # imported as "import subprocess as s").
                            cmd = [
                                path_executable, "--algorithm", "train",
                                "--data", data,
                                "--directory", directory,
                                "--max_iter", str(self.iters),
                                "--sample_hyper", "no",
                                "--save_lag", "-1",
                                "--eta", str(eta),
                                "--gamma_a", str(g_s),
                                "--gamma_b", str(g_r),
                                "--alpha_a", str(a_s),
                                "--alpha_b", str(a_r)
                            ]
                            # Pass a fixed random seed only when one is set
                            if self.seed is not None:
                                cmd.extend(
                                    ["--random_seed", str(self.seed)])

                            s.run(cmd,
                                  stdout=s.DEVNULL,
                                  check=True,
                                  capture_output=False,
                                  text=True)

                if verbose:
                    print(f"--- {folder.name} done in "
                          f"{time.perf_counter() - t:0.1f} seconds ---")

        period = round(time.perf_counter() - st, 2)
        print(f"----- Vectorisation done in {period} seconds -----")
        return words_nums
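Example #2 additionally relies on a _get_number_words helper and on an object named s (presumably the subprocess module), neither of which is shown here. The following is a hypothetical sketch of the word-count helper, assuming the LDA-C ".vocab" file lists one vocabulary term per line; the project's actual helper may count differently.

    @staticmethod
    def _get_number_words(vocab_path):
        # Hypothetical sketch: an LDA-C ".vocab" file typically lists one
        # term per line, so the count is the number of non-empty lines.
        # The project's real helper may differ.
        with open(vocab_path, encoding="utf-8") as f:
            return sum(1 for line in f if line.strip())

Note that words_nums caches one count per problem folder, so the vocabulary file is read at most once per folder even though the hyper-parameter grid visits each folder many times.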