def generate_gibbs_states_plots(self, states_path: str, cat: str = "likelihood"):
    """Plot the `cat` column of every Gibbs sampler ``state.log`` under
    ``states_path`` and save one PNG per output folder.

    Parameters
    ----------
    states_path : str
        Directory whose sub-folders each contain a ``state.log`` file
        produced by an HDP/Gibbs run.
    cat : str
        Column of ``state.log`` to plot against the iteration index
        (default ``"likelihood"``).

    Notes
    -----
    An existing ``{cat}_plots`` directory is treated as a cache: the
    method prints a notice and returns without regenerating anything.
    Folders lacking a ``state.log`` are skipped with a message.
    """
    new_dir = Tools.get_path(states_path, f"{cat}_plots")
    if Tools.path_exists(new_dir):
        print("Plots found, skipping..")
        return

    Tools.initialise_directory(new_dir)
    with Tools.scan_directory(states_path) as outputs:
        for i, output in enumerate(outputs):
            try:
                state_file = Tools.get_path(output.path, "state.log")
                # state.log is whitespace-delimited with an "iter" column.
                # sep=r"\s+" is the documented replacement for the
                # deprecated delim_whitespace=True (removed in pandas 3.0).
                df = pd.read_csv(filepath_or_buffer=state_file,
                                 sep=r"\s+",
                                 index_col="iter")
                ax = sns.lineplot(x=df.index, y=cat, data=df)
                ax.margins(x=0)
                fig = ax.get_figure()
                # BUGFIX: was bbox_incehs= (typo), so the "tight" bounding
                # box was never applied to the saved figure.
                fig.savefig(Tools.get_path(new_dir, f"{output.name}.png"),
                            dpi=300,
                            bbox_inches="tight",
                            format="png")
                # Clear the figure so the next iteration draws on a clean
                # canvas instead of stacking lines on the same axes.
                fig.clf()
                print(f"{i}")
            except FileNotFoundError:
                print(f"→ Skipping {output.name}")
def _generate_hdps_outputs(self, skip_factor: int = 1, verbose: bool = False):
    """Run the external ``hdp.exe`` trainer over every ``problem*`` folder
    in ``self.training_folder`` for each hyper-parameter combination.

    Parameters
    ----------
    skip_factor : int
        Process only every ``skip_factor``-th problem folder (1 = all).
    verbose : bool
        Print per-run progress and timing information.

    Returns
    -------
    dict
        Maps each processed folder name to the number of words in its
        LDA-C vocabulary file.

    Notes
    -----
    Results are cached on disk: if the target output directory for a
    hyper-parameter combination already exists, that run is skipped.
    The Gamma scale parameters are fixed (``g_r = a_r = 1``); only the
    shape parameters and ``eta`` are swept.
    """
    st = time.perf_counter()
    ldac_path = Tools.get_path("lda_c_format_HyperFalse",
                               "dummy_ldac_corpus.dat")
    vocab_file = Tools.get_path("lda_c_format_HyperFalse",
                                "dummy_ldac_corpus.dat.vocab")
    words_nums = {}
    # Progress denominator: 60 problem folders divided by the skip factor,
    # times the swept shape parameters (scales are fixed, so no squaring).
    size = ((60 // skip_factor)
            * len(self.etas)
            * len(self.gammas)
            * len(self.alphas))
    i = 0
    with Tools.scan_directory(self.training_folder) as ps_folders:
        for c, folder in enumerate(ps_folders):
            if not folder.name.startswith("problem"):
                if verbose:
                    print(f"→ Skipping {folder.name}")
                continue
            # Implement the skipping factor
            if c % skip_factor != 0:
                continue

            t = time.perf_counter()
            # Fix the scale parameters for the Gamma priors;
            # only the shape parameters are swept below.
            g_r = 1
            a_r = 1
            for eta in self.etas:
                for g_s in self.gammas:
                    for a_s in self.alphas:
                        # Cache the number of words for later
                        if folder.name not in words_nums:
                            vocab_path = Tools.get_path(
                                folder.path, vocab_file)
                            n_words = self._get_number_words(vocab_path)
                            words_nums.update({folder.name: n_words})

                        i += 1
                        percentage = f"{100 * i / size:06.02f}"
                        suff = (f"{g_s:0.2f}_{g_r:0.2f}_"
                                f"{a_s:0.2f}_{a_r:0.2f}")
                        if verbose:
                            print(f"► Applying HDP with "
                                  f"eta={eta:0.1f} "
                                  f"gamma({g_s:0.2f}, {g_r:0.2f}) "
                                  f"alpha({a_s:0.2f}, {a_r:0.2f}) "
                                  f"on {folder.name} [{percentage}%]")

                        directory = Tools.get_path(self.out_dir,
                                                   "optimisation",
                                                   f"{eta:0.1f}__{suff}",
                                                   folder.name)
                        # Disk cache: an existing directory means this
                        # combination was already trained — skip it.
                        if Tools.path_exists(directory):
                            if verbose:
                                print("\tcached result found at "
                                      f"{directory}")
                            continue

                        path_executable = r"{}\hdp.exe".format(
                            self.hdp_path)
                        data = Tools.get_path(folder.path, ldac_path)

                        # Prepare the output directory
                        Tools.initialise_directories(directory)

                        # Build the trainer command once; the seeded and
                        # unseeded variants differ only in --random_seed
                        # (previously two duplicated s.run(...) calls).
                        cmd = [path_executable,
                               "--algorithm", "train",
                               "--data", data,
                               "--directory", directory,
                               "--max_iter", str(self.iters),
                               "--sample_hyper", "no",
                               "--save_lag", "-1",
                               "--eta", str(eta),
                               "--gamma_a", str(g_s),
                               "--gamma_b", str(g_r),
                               "--alpha_a", str(a_s),
                               "--alpha_b", str(a_r)]
                        if self.seed is not None:
                            cmd += ["--random_seed", str(self.seed)]

                        s.run(cmd,
                              stdout=s.DEVNULL,
                              check=True,
                              capture_output=False,
                              text=True)

            if verbose:
                print(f"--- {folder.name} done in "
                      f"{time.perf_counter() - t:0.1f} seconds ---")

    period = round(time.perf_counter() - st, 2)
    print(f"----- Vectorisation done in {period} seconds -----")
    return words_nums