def _vectorise_ps(self, ps: int, convert_to_proportions: bool):
    # Override the function, returning only the LSS representation
    directory_path = f"{self.corpus_path}\\problem{ps:03d}"
    pzd_fpath = (f"{directory_path}\\BTM_{self.btm_dir_suffix}"
                 f"\\k{self.t}.pz_d")
    btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                          delim_whitespace=True,
                          header=None)

    if len(self.btm.doc_index) == 0:
        doc_index = []  # We will need to build the index
        with Tools.scan_directory(directory_path) as docs:
            for doc in docs:
                if doc.is_dir():
                    continue
                doc_index.append(Tools.get_filename(doc.path))
        btm_lss.index = doc_index
    else:
        btm_lss.index = self.btm.doc_index

    if convert_to_proportions:
        tokenised_btmcorpus_filepath = (
            f"{directory_path}\\BTM_{self.btm_dir_suffix}"
            f"\\vectorised\\tokenised_btmcorpus.txt")
        with open(tokenised_btmcorpus_filepath) as c:
            tcorpus = c.readlines()
        freqs = [len(self._doc_gen_biterms(tdoc))
                 for tdoc in tcorpus]
        btm_lss = btm_lss.mul(freqs, axis="index")

    return btm_lss
def __init__(self,
             directory_path: str,
             t: int,
             alpha: float,
             beta: float,
             btm_exe_path: str = Tools.get_path("..", "BTM-master",
                                                "src", "btm.exe"),
             n_iter: int = 10000,  # To guarantee convergence
             model_dir_suffix: str = "",
             doc_inference_type: str = "sum_b"):
    self.directory_path = directory_path
    self.t = t
    self.alpha = alpha
    self.beta = beta
    self.n_iter = n_iter
    self.doc_index = []  # The index of the files read, kept for reference
    self.w = None
    self.btm_exe = btm_exe_path
    self.doc_inf_type = "sum_b"  # Fixed, due to later dependent computations
    self.output_dir = Tools.get_path(directory_path,
                                     f"BTM_{model_dir_suffix}")
    self.plain_corpus_path = Tools.get_path(self.output_dir,
                                            "btmcorpus.txt")
    self.tokenised_btmcorpus_filepath = Tools.get_path(
        self.output_dir, "vectorised", "tokenised_btmcorpus.txt")
    self.vocab_ids_path = Tools.get_path(self.output_dir,
                                         "vectorised", "voca_pt")
def _convert_corpus_to_bow(self, file_ext: str = "txt"):
    """
    Convert a directory of text files into a BoW model.

    Parameters
    ----------
    file_ext : str (optional)
        The extension of the files to read. "txt" is the default value.
        The n-gram size is taken from self.word_grams; 1 denotes word
        unigrams.

    Returns
    -------
    bow_corpus : gensim corpus
        The bag-of-words model.
    dictionary : gensim dictionary
        The id2word mapping.
    plain_documents : pd.DataFrame
        The plain documents, indexed by filename, to serve as a
        reference point.

    """
    # Read in the plain text files
    plain_documents = []
    with Tools.scan_directory(self.input_docs_path) as docs:
        for doc in docs:
            if doc.is_dir() or Tools.split_path(
                    doc.path)[1] != f".{file_ext}":
                continue
            try:
                with open(doc.path, mode="r", encoding="utf8") as f:
                    plain_documents.append(f.read())
                self.doc_index.append(Tools.get_filename(doc.path))
            except PermissionError:
                # Raised when trying to open a directory
                print("Skipped while loading files: {}".format(doc.name))
    # Collocation Detection can be applied here via gensim.models.phrases
    # Tokenise the corpus and remove too short documents
    tokenised_corpus = [
        [' '.join(tkn)
         for tkn in ngrams(word_tokenize(d.lower()), self.word_grams)]
        for d in plain_documents if len(d) > 3]

    if self.drop_uncommon:
        freq = defaultdict(int)
        for doc in tokenised_corpus:
            for word in doc:
                freq[word] += 1
        tokenised_corpus = [
            [w for w in doc if freq[w] > self.freq_th]
            for doc in tokenised_corpus]
    # Form the word ids dictionary for vectorisation
    dictionary = Dictionary(tokenised_corpus)
    corpus = [dictionary.doc2bow(t_d) for t_d in tokenised_corpus]

    return (corpus,
            dictionary,
            pd.DataFrame(data=plain_documents,
                         index=self.doc_index,
                         columns=["content"]))
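# A minimal sketch (not part of the original module) of the vectorisation
# path above: word n-grams are produced with NLTK, then mapped to ids and
# counts with a gensim Dictionary. The toy corpus and the word_grams value
# are made up for illustration; running it requires the NLTK "punkt"
# tokeniser data to be downloaded.
from gensim.corpora import Dictionary
from nltk import ngrams, word_tokenize

word_grams = 1
docs = ["The cat sat on the mat.", "The dog sat."]
tokenised = [[" ".join(t)
              for t in ngrams(word_tokenize(d.lower()), word_grams)]
             for d in docs]
dictionary = Dictionary(tokenised)
bow = [dictionary.doc2bow(doc) for doc in tokenised]
# bow is a list of (word_id, count) pairs per document, ready for LDA-C
# serialisation or any other gensim-compatible consumer.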
def _concatenate_docs_into_btmcorpus(self,
                                     remove_bgw: bool = False,
                                     drop_uncommon: bool = False,
                                     drop_punctuation: bool = False):
    # Read in the plain text files
    plain_documents = []
    with Tools.scan_directory(self.directory_path) as docs:
        for doc in docs:
            if doc.is_dir():
                continue
            try:
                with open(doc.path, mode="r", encoding="utf8") as f:
                    plain_documents.append(f.read())
                self.doc_index.append(Tools.get_filename(doc.path))
            except PermissionError:
                # Raised when trying to open a directory
                print("Skipped while loading files: {}".format(doc.name))
    # Lowercase and strip \n away
    plain_documents = [str.replace(d, "\n", "").lower()
                       for d in plain_documents]
    # It was observed that the topics are composed of a lot of stop words.
    # Following the BTM paper and this observation, we remove them
    if remove_bgw:
        # Detect the language
        lang = detect(" ".join(plain_documents))
        if lang == "en":
            lang = "english"
        elif lang == "nl":
            lang = "dutch"
        else:
            lang = "greek"

        new_documents = []
        for d in plain_documents:
            terms = [w for w in word_tokenize(text=d, language=lang)
                     if w not in set(stopwords.words(lang))]
            new_documents.append(" ".join(terms))
        plain_documents = new_documents

    if drop_punctuation:
        plain_documents = [sub(pattern=r"[^\w\s]", repl="", string=d)
                           for d in plain_documents]
    # Save it to disk
    Tools.save_list_to_text(mylist=plain_documents,
                            filepath=self.plain_corpus_path)

    return plain_documents
def load_pz_d_into_df(self, use_frequencies: bool = False):
    """
    Load the inferred p(z|d) matrix from disk into a dataframe.

    Parameters
    ----------
    use_frequencies : bool, optional
        If True, scale each document's topic proportions by its biterm
        count, yielding approximate topic frequencies. The default is
        False.

    Returns
    -------
    btm_lss : pd.DataFrame
        The documents' topic distributions, indexed by filename, or
        None when the p(z|d) file is not found on disk.

    """
    # ??? This function is not used; it should be used in
    # tester._vectorise_ps
    # Load the LSS representation into a dataframe
    pzd_fpath = f"{self.directory_path}k{self.t}.pz_d"
    try:
        btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                              delim_whitespace=True)
        if not self.doc_index:
            # We will need to build the index
            with Tools.scan_directory(self.directory_path) as docs:
                for doc in docs:
                    if doc.is_dir():
                        continue
                    self.doc_index.append(Tools.get_filename(doc.path))
        btm_lss.index = self.doc_index

        if use_frequencies:
            # The saved values are p(z|d) proportions.
            # We want to convert them to frequencies, so that we obtain
            # the frequency of terms belonging to a topic.
            # Since sum_b inference is used, we rely on the count of
            # biterms: treating each p(zi|dj) as a proportion, we scale
            # it by the number of biterms in dj.
            with open(self.tokenised_btmcorpus_filepath) as c:
                tcorpus = c.readlines()
            # How many biterms are there?
            # Analysing the C++ code, a window of 15 is used.
            # Regenerate the biterms and count them, as the statistics
            # can detect redundancies in unordered terms:
            freqs = [len(self._doc_gen_biterms(tdoc))
                     for tdoc in tcorpus]
            btm_lss = btm_lss.mul(freqs, axis="index")

        return btm_lss
    except FileNotFoundError:
        return None
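# A minimal sketch (not part of the original module) of the proportioning
# step above: each row of the p(z|d) matrix is scaled by that document's
# biterm count, turning topic proportions into approximate topic
# frequencies. The matrix values and biterm counts are made up for
# illustration only.
import pandas as pd

p_z_d = pd.DataFrame([[0.2, 0.8],     # document 0: p(z0|d0), p(z1|d0)
                      [0.5, 0.5]],    # document 1: p(z0|d1), p(z1|d1)
                     index=["doc0", "doc1"])
biterm_counts = [10, 4]               # biterms per document (hypothetical)
topic_freqs = p_z_d.mul(biterm_counts, axis="index")
# topic_freqs:
#         0    1
# doc0  2.0  8.0
# doc1  2.0  2.0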
def _get_ps_truth(self, ps: int):
    folder = "pan17_train" if train_phase else "pan17_test"
    true_labels_path = (f"..\\..\\Datasets\\{folder}\\truth"
                        r"\problem{:03d}\clustering.json"
                        ).format(ps)

    return Tools.load_true_clusters_into_vector(true_labels_path)
def _generate_lda_c_corpus(self):
    """Convert a group of files into an LDA-C corpus and store it on disk"""
    bow_corpus, id2word_map, plain_docs = self._convert_corpus_to_bow()
    # Serialise into LDA-C format and store on disk
    output_dir = Tools.get_path(
        self.input_docs_path,
        f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
        f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}")
    Tools.initialise_directory(output_dir)
    save_location = Tools.get_path(output_dir, f"{self.lda_c_fname}.dat")
    bleicorpus.BleiCorpus.serialize(fname=save_location,
                                    corpus=bow_corpus,
                                    id2word=id2word_map)
    return plain_docs, bow_corpus
def _invoke_gibbs_hdp(self):
    """Invoke Gibbs HDP posterior inference on the corpus"""
    path_executable = Tools.get_path(self.hdp_path, "hdp.exe")
    param_data = Tools.get_path(
        self.input_docs_path,
        f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
        f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}",
        f"{self.lda_c_fname}.dat")
    param_directory = Tools.get_path(self.input_docs_path,
                                     self.hdp_output_directory)
    # Prepare the output directory
    Tools.initialise_directory(param_directory)

    if self.hdp_seed is not None and self.hdp_seed > 0:
        ret = s.run([path_executable,
                     "--algorithm", "train",
                     "--data", param_data,
                     "--directory", param_directory,
                     "--max_iter", str(self.hdp_iterations),
                     "--sample_hyper",
                     "yes" if self.hdp_hyper_sampling else "no",
                     "--save_lag", "-1",
                     "--eta", str(self.hdp_eta),
                     "--random_seed", str(self.hdp_seed),
                     "--gamma_a", str(self.hdp_gamma_s),
                     "--alpha_a", str(self.hdp_alpha_s)],
                    check=True, capture_output=True, text=True)
    else:
        ret = s.run([path_executable,
                     "--algorithm", "train",
                     "--data", param_data,
                     "--directory", param_directory,
                     "--max_iter", str(self.hdp_iterations),
                     "--sample_hyper",
                     "yes" if self.hdp_hyper_sampling else "no",
                     "--save_lag", "-1",
                     "--eta", str(self.hdp_eta),
                     "--gamma_a", str(self.hdp_gamma_s),
                     "--alpha_a", str(self.hdp_alpha_s)],
                    check=True, capture_output=True, text=True)

    return ret.stdout
def _save_results(self, suffix: str, info_path: str,
                  results: List[Dict], k_values: List[List]):
    path = Tools.splice_save_problemsets_dictionaries(
        results,
        metadata_fpath=info_path,
        suffix=suffix,
        test_data=not train_phase)
    Tools.save_k_vals_as_df(k_vals=k_values,
                            suffix=suffix,
                            test_data=not train_phase,
                            cop_kmeans_frac=constraints_fraction)
    return path
def generate_gibbs_states_plots(self,
                                states_path: str,
                                cat: str = "likelihood"):
    new_dir = Tools.get_path(states_path, f"{cat}_plots")
    if Tools.path_exists(new_dir):
        print("Plots found, skipping..")
        return

    Tools.initialise_directory(new_dir)
    with Tools.scan_directory(states_path) as outputs:
        for i, output in enumerate(outputs):
            try:
                state_file = Tools.get_path(output.path, "state.log")
                df = pd.read_csv(filepath_or_buffer=state_file,
                                 delim_whitespace=True,
                                 index_col="iter")
                ax = sns.lineplot(x=df.index, y=cat, data=df)
                ax.margins(x=0)
                name = output.name
                fig = ax.get_figure()
                fig.savefig(Tools.get_path(states_path, f"{cat}_plots",
                                           f"{name}.png"),
                            dpi=300,
                            bbox_inches="tight",
                            format="png")
                fig.clf()
                print(f"{i}")
            except FileNotFoundError:
                print(f"→ Skipping {output.name}")
def _infer_btm_pz_d(self):
    """Invoke Gibbs BTM document inference on the corpus"""
    ret = s.run([self.btm_exe,
                 "inf",
                 self.doc_inf_type,
                 str(self.t),
                 self.tokenised_btmcorpus_filepath,
                 Tools.get_path(self.output_dir, "")],
                check=True, capture_output=True, text=True)

    return ret.stdout
def _estimate_btm(self):
    """Invoke Gibbs BTM posterior inference on the tokenised corpus"""
    ret = s.run([self.btm_exe,
                 "est",
                 str(self.t),
                 str(self.w),
                 str(self.alpha),
                 str(self.beta),
                 str(self.n_iter),
                 str(self.n_iter),  # Save step
                 self.tokenised_btmcorpus_filepath,
                 Tools.get_path(self.output_dir, "")],
                check=True, capture_output=True, text=True)

    return ret.stdout
def _load_lss_representation_into_df(self) -> pd.DataFrame:
    """
    Load a BoT LSS representation from disk into a returned dataframe.

    Returns
    -------
    lss_df : pd.DataFrame
        A matrix of shape (n_samples, n_features)

    Raises
    ------
    FileNotFoundError
        When the LSS representation isn't found on disk.

    """
    path = Tools.get_path(self.input_docs_path,
                          self.hdp_output_directory,
                          "mode-word-assignments.dat")
    # We don't need document tables, so we'll skip the relative column,
    # but we do need word counts under each topic, to produce some sort
    # of a bag-of-topics model (BoT)
    try:
        lss_df = pd.read_csv(filepath_or_buffer=path,
                             delim_whitespace=True)
        #                    usecols=["d", "w", "z"]).drop_duplicates()
        # Produce topic weights as counts of topic words
        lss_df = lss_df.pivot_table(values='w', columns='z', index='d',
                                    aggfunc='count', fill_value=0)
        # Index with file names for later reference
        lss_df.index = self.doc_index

        return lss_df
    except FileNotFoundError:
        print(("\nNo precomputed LSS file was found on disk via:\n{}\n"
               "> Please generate the LDA-C corpus and run HDP first...\n"
               ).format(path))
        raise
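# A minimal sketch (not part of the original module) of how the pivot above
# turns the word-to-topic assignment table (columns d, w, z) into a
# bag-of-topics matrix: each cell counts how many word tokens of document d
# were assigned to topic z. The rows below are made up for illustration only.
import pandas as pd

assignments = pd.DataFrame({"d": [0, 0, 0, 1, 1],
                            "w": [11, 42, 7, 42, 3],   # word ids
                            "z": [0, 0, 1, 1, 1]})     # topic ids
bot = assignments.pivot_table(values="w", columns="z", index="d",
                              aggfunc="count", fill_value=0)
# bot:
# z  0  1
# d
# 0  2  1
# 1  0  2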
def run_test(self, drop_uncommon=False, desired_k=None):
    problemsets_results = []
    kvals = []  # k is None, which means it will be inferred

    if train_phase:
        end = 1
    else:
        end = 120

    for ps in range(1, 1 + end):
        print(f"Clustering problem {ps:03d}..")
        # In BTM, all the corpora need to be modelled as LSS.
        # Now we proceed with clustering
        ground_truth = self._get_ps_truth(ps)
        lss_rep_docs = self._vectorise_ps(ps, convert_to_proportions=True)
        # Normalise the data as they are inherently directional
        lss_rep_docs = Tools.normalise_data(lss_rep_docs)
        # Start the clustering endeavours
        ps_res, k_trends = self._cluster_data(ps=ps,
                                              data=lss_rep_docs,
                                              ground_truth=ground_truth,
                                              desired_k=None)
        problemsets_results.append(ps_res)
        kvals.append(k_trends)
    # Save the results to disk
    print("Saving results..")
    self._save_results(suffix=f"_btm_{self.btm_dir_suffix}",
                       info_path=f"{self.corpus_path}\\info.json",
                       results=problemsets_results,
                       k_values=kvals)
    print("Done.")
def main():
    # Specify which topic model to use
    use_btm = True

    if use_btm:
        # Control Parameters ###
        train_phase = True
        t = 10  # Number of BTM topics
        ##########################
        print("\n-------------------------------------")
        print("BTM modelling and authorial clustering")
        print("-------------------------------------\n")
        if train_phase:
            r = range(1, 2)
            dpath = Tools.get_path(
                r"D:\Projects\Authorial_Clustering_Short_Texts_nPTM"
                r"\Datasets\pan17_train")
        else:
            r = range(1, 121)
            dpath = (r"D:\Projects\Authorial_Clustering_Short_Texts_nPTM"
                     r"\Datasets\pan17_test")

        for ps in r:
            # Loop over the problem sets
            ps_path = Tools.get_path(dpath, f"problem{ps:03d}")
            print(f"\nProcessing #{ps:03d}:")
            # Inferring BTM ###
            #####################
            # TODO: avoid creating r BTM objects by delegating ps_path
            btm = LssBTModeller(directory_path=ps_path,
                                t=t,
                                alpha=1.0,
                                beta=0.01,
                                model_dir_suffix="remove_stopwords_puncts")
            btm.infer_btm(remove_bg_terms=True,
                          drop_puncs=True,
                          use_biterm_freqs=False)
            print("\t→ btm inference done")
    else:
        print("Main thread started..\n")
        folders_path = (r"D:\College\DKEM\Thesis"
                        r"\AuthorshipClustering\Datasets\pan17_train")
        hdp = r"D:\College\DKEM\Thesis\AuthorshipClustering\Code\hdps\hdp"

        optimiser = LssOptimiser(train_folders_path=folders_path,
                                 hdp_path=hdp,
                                 ldac_filename="dummy_ldac_corpus.dat",
                                 hdp_seed=None,
                                 eta_range=[0.3, 0.5, 0.8, 1],
                                 gamma_range=[0.1, 0.3, 0.5],
                                 alpha_range=[0.1, 0.3, 0.5],
                                 out_dir=Tools.get_path(".", "__outputs__"),
                                 hdp_iters=1000)

        ret_eta = optimiser.smart_optimisation(tail_prcnt=0.8,
                                               skip_factor=5,
                                               plot_cat="num.tables",
                                               verbose=True)
        print(ret_eta)
        print("Done.")
def assess_hyper_sampling(self, tail_prcnt: float, verbose: bool = False):
    """
    Measure the average per-word log-likelihood after hyper-sampling
    the concentration parameters of the Dirichlet distributions.

    Caution: HDP must have been run on the data both with and without
    hyper-sampling, in order to load the two representations and
    compare them.

    Returns
    -------
    dct : dict
        A dictionary containing the per-word log-likelihood of the
        train data under the two ways of treating the concentration
        parameters: normal and hyper sampling.

    """
    path_normal = Tools.get_path(".", "hdp_lss_HyperFalse", "state.log")
    path_hyper = Tools.get_path(".", "hdp_lss_HyperTrue", "state.log")
    path_ldac = Tools.get_path(".", "lda_c_format_HyperTrue",
                               "dummy_ldac_corpus.dat.vocab")
    per_word_ll_normal = []
    per_word_ll_hyper = []

    if verbose:
        print("------Concentration Parameters Optimisation------")

    with Tools.scan_directory(self.training_folder) as dirs:
        for d in dirs:
            if d.name[0:7] != "problem":
                continue
            if verbose:
                print(f"\t► Processing {d.name}")

            normal = Tools.get_path(d.path, path_normal)
            hyper = Tools.get_path(d.path, path_hyper)
            vocab = Tools.get_path(d.path, path_ldac)

            n_words = self._get_number_words(vocab)
            df_normal = pd.read_csv(filepath_or_buffer=normal,
                                    delim_whitespace=True,
                                    index_col="iter",
                                    usecols=["iter", "likelihood"],
                                    squeeze=True)
            ll_normal = df_normal.tail(
                round(len(df_normal) * tail_prcnt)).mean()
            per_word_ll_normal.append(ll_normal / n_words)

            df_hyper = pd.read_csv(filepath_or_buffer=hyper,
                                   delim_whitespace=True,
                                   index_col="iter",
                                   usecols=["iter", "likelihood"],
                                   squeeze=True)
            ll_hyper = df_hyper.tail(
                round(len(df_hyper) * tail_prcnt)).mean()
            per_word_ll_hyper.append(ll_hyper / n_words)

    dct = {"Normal_Sampling":
           round(sum(per_word_ll_normal) / len(per_word_ll_normal), 4),
           "Hyper_Sampling":
           round(sum(per_word_ll_hyper) / len(per_word_ll_hyper), 4)}

    if verbose:
        print("-------------------------------------------------")

    pd.DataFrame(data=dct, index=[0]).to_csv(
        f"{self.out_dir}/hyper_optimisation.csv", index=False)

    return dct
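# A minimal sketch (not part of the original module) of the per-word
# log-likelihood used above: average the likelihood column over the last
# tail_prcnt of the Gibbs iterations, then divide by the vocabulary size
# read from the .vocab file. The numbers are made up for illustration only.
import pandas as pd

likelihoods = pd.Series([-9500.0, -9200.0, -9100.0, -9050.0, -9040.0])
tail_prcnt = 0.8
n_words = 1200                      # hypothetical vocabulary size
tail_mean = likelihoods.tail(round(len(likelihoods) * tail_prcnt)).mean()
per_word_ll = tail_mean / n_words   # ≈ -7.58 log-likelihood per word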
def problem_set_run(problem_set_id: int,
                    n_clusters: int,
                    seed: int,
                    configuration: str,
                    drop_uncommon: bool,
                    verbose: bool,
                    infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in a non-sparse LSS
    # space. HDP with a Gibbs sampler is used as is from:
    # https://github.com/blei-lab/hdp

    # Adjust the parameters according to the chosen configuration
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
        hdp_path=r"..\hdps\hdp",
        input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
            problem_nbr),
        ldac_filename=r"ldac_corpus",
        hdp_output_dir=r"hdp_lss",
        hdp_iters=10000,
        hdp_seed=seed,
        hdp_sample_hyper=False,
        hdp_eta=eta,
        hdp_gamma_s=gamma,
        hdp_alpha_s=alpha,
        word_grams=1,
        drop_uncommon=drop_uncommon,
        freq_threshold=1,
        verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
            infer_lss, bim=False)

        # Begin clustering attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)
        ground_truth = Tools.load_true_clusters_into_vector(
            true_labels_path)

        # Normalise the data when BIM is not used
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_spherical_k_means,
            param_init="k-means++")

        # ispk_pred, ispk_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_iterative_spherical_k_means,
        #     param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_mean_shift)

        # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al.:
        # HAC and Log-Entropy with 20k features
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
            sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
            dictionaries=[
                # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                norm_spk_evals, norm_hdbscan_evals,
                norm_ms_evals,  # norm_xm_evals,
                nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                n_optics_evals, bl_rand_evals, bl_singleton_evals,
                nhdp_evals, sota_evals, ntrue_evals
                ],
            identifiers=[  # "iSpKmeans",
                "E_SPKMeans", "E_HDBSCAN",
                "E_Mean_Shift",  # "XMeans",
                "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                "E_OPTICS", "BL_r", "BL_s",
                "S_HDP", "BL_SOTA", "Labels"],
            problem_set=problem_set_id),
            ground_truth,
            lss_rep_docs,
            plain_docs,
            clu_lss)

    except FileNotFoundError:
        print("Please run HDP on these data first.")
def run_test(self,
             configuration: str,
             drop_uncommon: bool,
             save_name_suff: str,
             infer: bool,
             desired_k: int  # If 0, the true k is used; None = estimation
             ):
    # Adjust the parameters according to the chosen configuration
    if configuration == TestApproach.config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == TestApproach.config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    problemsets_results = []
    k_vals = []
    failures = []
    # Detect whether we're dealing with the train or the test data
    r = range(1, 121) if not train_phase else range(40, 61)
    start = tpc()
    for ps in r:
        print(f"\n[{(tpc()-start)/60:06.2f}m] Problem Set ► {ps:03d} ◄")
        try:
            print(f"[{(tpc()-start)/60:06.2f}m]\tVectorising..")
            plain_docs, bow_rep_docs, lss_rep_docs = self._vectorise_ps(
                ps,
                infer_lss=infer,
                hdp_eta=eta,
                hdp_gamma_s=gamma,
                hdp_alpha_s=alpha,
                drop_uncommon_terms=drop_uncommon)
            lss_rep_docs = Tools.normalise_data(lss_rep_docs,
                                                log_e=log_entropy_w)

            # Begin clustering attempts
            print(f"[{(tpc()-start)/60:06.2f}m]\tClustering..")
            ground_truth = self._get_ps_truth(ps)
            ps_res, k_trends = self._cluster_data(
                ps, data=lss_rep_docs,
                ground_truth=ground_truth,
                desired_k=desired_k)
            problemsets_results.append(ps_res)
            k_vals.append(k_trends)
        except AttributeError as excp:
            failures.append(ps)
            print(f"> ERROR: {excp}.\n> Skipping..")

    print(f"[{(tpc()-start)/60:06.2f}m]\tDone.")
    print("» Saving Results ..")
    folder = "pan17_train" if train_phase else "pan17_test"
    path = self._save_results(
        suffix=f"{save_name_suff}_{configuration}",
        info_path=f"..\\..\\Datasets\\{folder}\\info.json",
        results=problemsets_results,
        k_values=k_vals)

    if len(failures) != 0:
        print(f"{len(failures)} problem set(s) skipped.")
        Tools.save_list_to_text(
            mylist=failures,
            filepath=r"./__outputs__/skipped.txt",
            header=f"Skipped problem sets ({len(failures)})")

    print(f"[{(tpc()-start)/60:06.2f}m] All Done.")
    return path
def _generate_hdps_outputs(self,
                           skip_factor: int = 1,
                           verbose: bool = False):
    st = time.perf_counter()

    ldac_path = Tools.get_path("lda_c_format_HyperFalse",
                               "dummy_ldac_corpus.dat")
    words_nums = {}
    vocab_file = Tools.get_path("lda_c_format_HyperFalse",
                                "dummy_ldac_corpus.dat.vocab")
    # size = ((60 // skip_factor)
    #         * len(self.etas)
    #         * len(self.gammas)**2
    #         * len(self.alphas)**2)
    # Since we fixed the scales of the Gammas:
    size = ((60 // skip_factor)
            * len(self.etas)
            * len(self.gammas)
            * len(self.alphas))
    i = 0
    with Tools.scan_directory(self.training_folder) as ps_folders:
        for c, folder in enumerate(ps_folders):
            if not folder.name[0:7] == "problem":
                if verbose:
                    print(f"→ Skipping {folder.name}")
                continue
            # Implement the skipping factor
            if c % skip_factor != 0:
                continue

            t = time.perf_counter()
            # Fix the scale parameters of the Gamma priors
            g_r = 1
            a_r = 1
            for eta in self.etas:
                # for g_s, g_r in product(self.gammas, repeat=2):
                #     for a_s, a_r in product(self.alphas, repeat=2):
                # Only vary the shape parameters of the Gammas
                for g_s in self.gammas:
                    for a_s in self.alphas:
                        # Cache the number of words for later
                        if folder.name not in words_nums:
                            vocab_path = Tools.get_path(folder.path,
                                                        vocab_file)
                            n_words = self._get_number_words(vocab_path)
                            words_nums.update({folder.name: n_words})

                        i = i + 1
                        percentage = f"{100 * i / size:06.02f}"
                        suff = (f"{g_s:0.2f}_{g_r:0.2f}_"
                                f"{a_s:0.2f}_{a_r:0.2f}")
                        if verbose:
                            print(f"► Applying HDP with "
                                  f"eta={eta:0.1f} "
                                  f"gamma({g_s:0.2f}, {g_r:0.2f}) "
                                  f"alpha({a_s:0.2f}, {a_r:0.2f}) "
                                  f"on {folder.name} [{percentage}%]")

                        directory = Tools.get_path(
                            self.out_dir, "optimisation",
                            f"{eta:0.1f}__{suff}", folder.name)

                        if Tools.path_exists(directory):
                            if verbose:
                                print("\tcached result found at "
                                      f"{directory}")
                            continue

                        path_executable = r"{}\hdp.exe".format(
                            self.hdp_path)
                        data = Tools.get_path(folder.path, ldac_path)

                        # Prepare the output directory
                        Tools.initialise_directories(directory)

                        if self.seed is not None:
                            s.run([path_executable,
                                   "--algorithm", "train",
                                   "--data", data,
                                   "--directory", directory,
                                   "--max_iter", str(self.iters),
                                   "--sample_hyper", "no",
                                   "--save_lag", "-1",
                                   "--eta", str(eta),
                                   "--gamma_a", str(g_s),
                                   "--gamma_b", str(g_r),
                                   "--alpha_a", str(a_s),
                                   "--alpha_b", str(a_r),
                                   "--random_seed", str(self.seed)],
                                  stdout=s.DEVNULL,
                                  check=True,
                                  capture_output=False,
                                  text=True)
                        else:
                            s.run([path_executable,
                                   "--algorithm", "train",
                                   "--data", data,
                                   "--directory", directory,
                                   "--max_iter", str(self.iters),
                                   "--sample_hyper", "no",
                                   "--save_lag", "-1",
                                   "--eta", str(eta),
                                   "--gamma_a", str(g_s),
                                   "--gamma_b", str(g_r),
                                   "--alpha_a", str(a_s),
                                   "--alpha_b", str(a_r)],
                                  stdout=s.DEVNULL,
                                  check=True,
                                  capture_output=False,
                                  text=True)

            if verbose:
                print(f"--- {folder.name} done in "
                      f"{time.perf_counter() - t:0.1f} seconds ---")

    period = round(time.perf_counter() - st, 2)
    print(f"----- Vectorisation done in {period} seconds -----")
    return words_nums
def smart_optimisation(self,
                       plot_cat: str = "likelihood",
                       tail_prcnt: float = 0.80,
                       skip_factor: int = 1,
                       verbose: bool = False):
    # First generate the outputs to compare
    words_counts = self._generate_hdps_outputs(skip_factor=skip_factor,
                                               verbose=verbose)

    ret = {}
    # Loop over the outputs of different etas
    master_folder = Tools.get_path(self.out_dir, "optimisation")
    log_likelihoods = []
    avg_num_topics = []
    std_num_topics = []
    pw_ll = []
    errors = []
    with Tools.scan_directory(master_folder) as perms:
        for perm in perms:
            # Generate plots
            if not Tools.is_path_dir(perm.path):
                continue

            self.generate_gibbs_states_plots(states_path=perm.path,
                                             cat=plot_cat)
            with Tools.scan_directory(perm.path) as problems:
                for problem in problems:
                    try:
                        n_words = words_counts[problem.name]
                        path_state = Tools.get_path(problem.path,
                                                    "state.log")
                        df_state = pd.read_csv(
                            filepath_or_buffer=path_state,
                            delim_whitespace=True,
                            index_col="iter",
                            usecols=["iter", "likelihood", "num.topics"])
                        ll = df_state.likelihood.tail(
                            round(len(df_state) * tail_prcnt)).mean()
                        avg_topics = df_state["num.topics"].tail(
                            round(len(df_state) * tail_prcnt)).mean()
                        std_topics = df_state["num.topics"].tail(
                            round(len(df_state) * tail_prcnt)).std()

                        log_likelihoods.append(ll)
                        pw_ll.append(ll / n_words)
                        avg_num_topics.append(avg_topics)
                        std_num_topics.append(std_topics)
                    except FileNotFoundError as e:
                        print(f"{e}")
                        errors.append(f"{e}")
                        continue
                    except KeyError:
                        # Plot folders are being queried for n_words
                        continue

            ret.update({f"{perm.name}":
                        [round(sum(log_likelihoods)
                               / len(log_likelihoods), 4),
                         round(sum(pw_ll) / len(pw_ll), 4),
                         round(sum(avg_num_topics)
                               / len(avg_num_topics), 4),
                         round(sum(std_num_topics)
                               / len(std_num_topics), 4)]
                        })

    # Save any encountered errors to disk too
    Tools.save_list_to_text(mylist=errors,
                            filepath=Tools.get_path(
                                self.out_dir, "optimisation",
                                "opt_errors.txt"))

    pd.DataFrame(data=ret,
                 index=["Log-l", "PwLL", "T-Avg", "T-Std"]
                 ).T.to_csv(Tools.get_path(self.out_dir, "optimisation",
                                           "optimisation.csv"),
                            index=True)
    return ret
def traverse_gamma_alpha(self,
                         ps: int,
                         tail_prcnt: float = 0.80,
                         verbose: bool = True):
    ldac_path = Tools.get_path("lda_c_format_HyperFalse",
                               "dummy_ldac_corpus.dat")
    dat_path = Tools.get_path(self.training_folder, f"problem{ps:03d}",
                              ldac_path)
    directory = Tools.get_path(self.out_dir, "gamma_alpha")
    path_executable = Tools.get_path(self.hdp_path, "hdp.exe")
    res = defaultdict(list)
    total_work = len(self.gammas)**2 * len(self.alphas)**2
    c = 0
    print("----------------------------------------------------")
    for g_s, g_r in product(self.gammas, repeat=2):
        for a_s, a_r in product(self.alphas, repeat=2):
            c = c + 1
            progress = 100.0 * c / total_work
            suff = f"_{g_s:0.2f}_{g_r:0.2f}_{a_s:0.2f}_{a_r:0.2f}"
            if verbose:
                print(f"► Working on "
                      f"Gamma({g_s:0.2f},{g_r:0.2f}) "
                      f"and Alpha({a_s:0.2f},{a_r:0.2f}) "
                      f"[{progress:06.2f}%]")
            s.run([path_executable,
                   "--algorithm", "train",
                   "--data", dat_path,
                   "--directory", Tools.get_path(directory,
                                                 f"{c:03d}",
                                                 f"hdp_out{suff}"),
                   "--max_iter", str(500),
                   "--sample_hyper", "no",
                   "--save_lag", "-1",
                   "--eta", "0.5",
                   "--random_seed", str(self.seed),
                   "--gamma_a", str(g_s),
                   "--gamma_b", str(g_r),
                   "--alpha_a", str(a_s),
                   "--alpha_b", str(a_r)],
                  check=True, capture_output=True, text=True)
            # Read the likelihood from the same directory HDP wrote to
            ll = pd.read_csv(
                Tools.get_path(directory, f"{c:03d}", f"hdp_out{suff}",
                               "state.log"),
                delim_whitespace=True).likelihood.tail(
                    round(tail_prcnt * 500)).mean()
            res["gamma_shape"].append(g_s)
            res["gamma_rate"].append(g_r)
            res["alpha_shape"].append(a_s)
            res["alpha_rate"].append(a_r)
            res["gamma"].append(g_s * g_r)
            res["alpha"].append(a_s * a_r)
            res["likelihood"].append(ll)
    # Save the results to disk
    df_res = pd.DataFrame(res)
    df_res.to_csv(Tools.get_path(directory, "results.csv"), index=False)
    if verbose:
        print("---------------------- Done ------------------------")

    return df_res
def _cluster_data(self, ps: int,
                  data: List[List],
                  ground_truth: List,
                  desired_k: int):
    clu_lss = Clusterer(dtm=data,
                        true_labels=ground_truth,
                        max_nbr_clusters=len(data)-1,
                        min_nbr_clusters=1,
                        min_cluster_size=2,
                        metric="cosine",
                        desired_n_clusters=desired_k)

    # Run SPKMeans 10 times to get mean performance.
    # This is also what supplies the estimated k to the Clusterer
    # TODO: decouple k estimation from the evaluation
    norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
        alg_option=Clusterer.alg_spherical_k_means,
        param_init="k-means++")

    cop_kmeans_pred, cop_kmeans_evals = clu_lss.evaluate(
        alg_option=Clusterer.alg_cop_kmeans,
        param_constraints_size=constraints_fraction,
        param_copkmeans_init="random")

    if include_older_algorithms:
        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_mean_shift)

        # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

    # SOTA - Gomez et al.: HAC and Log-Entropy with 20k features.
    # Not applicable for the training data
    if not train_phase:
        sota_pred_path_le = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                             r"\Code\clusterPAN2017-master\output_LogEnt"
                             f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_le = Tools.load_true_clusters_into_vector(
            sota_pred_path_le)
        sota_pred_le, sota_evals_le = clu_lss.eval_sota(
            sota_predicted=sota_predicted_le)

        sota_pred_path_tf = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                             r"\Code\clusterPAN2017-master\output_Tf"
                             f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_tf = Tools.load_true_clusters_into_vector(
            sota_pred_path_tf)
        sota_pred_tf, sota_evals_tf = clu_lss.eval_sota(
            sota_predicted=sota_predicted_tf)

        sota_pred_path_tfidf = (
            r"D:\College\DKEM\Thesis\AuthorshipClustering"
            r"\Code\clusterPAN2017-master\output_TfIdf"
            f"\\problem{ps:03d}\\clustering.json")
        sota_predicted_tfidf = Tools.load_true_clusters_into_vector(
            sota_pred_path_tfidf)
        sota_pred_tfidf, sota_evals_tfidf = clu_lss.eval_sota(
            sota_predicted=sota_predicted_tfidf)
    else:
        # Build placeholders only, as SOTA isn't required for training
        # sota_pred_le = [0] * len(data)
        # sota_pred_tf = [0] * len(data)
        # sota_pred_tfidf = [0] * len(data)
        placebo_ret = {}
        placebo_ret.update({"nmi": None,
                            "ami": None,
                            "ari": None,
                            "fms": None,
                            "v_measure": None,
                            "bcubed_precision": None,
                            "bcubed_recall": None,
                            "bcubed_fscore": None,
                            "Silhouette": None,
                            "Calinski_harabasz": None,
                            "Davies_Bouldin": None
                            # Here go the unsupervised indices
                            })
        sota_evals_le = placebo_ret
        sota_evals_tf = placebo_ret
        sota_evals_tfidf = placebo_ret

    # Control whether k is estimated or the true k is replicated:
    if desired_k != 0:
        k_trend = clu_lss.cand_k
        k_trend.append(1 + max(clu_lss.true_labels))
    else:
        k_trend = [1 + max(clu_lss.true_labels)
                   ] * (nbr_competing_methods + 1)

    result = Tools.form_problemset_result_dictionary(
        dictionaries=[
            # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
            norm_spk_evals, norm_hdbscan_evals,
            norm_ms_evals,  # norm_xm_evals,
            nhac_complete_evals, nhac_s_evals, nhac_a_evals,
            n_optics_evals, cop_kmeans_evals,
            bl_rand_evals, bl_singleton_evals, nhdp_evals,
            sota_evals_tf, sota_evals_tfidf, sota_evals_le,
            ntrue_evals
            ],
        identifiers=[  # "iSpKmeans",
            "E_SPKMeans", "E_HDBSCAN",
            "E_Mean_Shift",  # "XMeans",
            "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
            "E_OPTICS", "E_COP_KMeans",
            "BL_r", "BL_s", "S_HDP",
            "BL_SOTA_tf", "BL_SOTA_tfidf", "BL_SOTA_le",
            "Labels"],
        problem_set=ps)

    return result, k_trend
            # as it seems. However, the seeds would be consistent across
            # runs and yield comparable results for our experiments
            # (comparing different runs of HDP on a problem set)
            seed=max(33, 70*(ps == 41)) + (3 * (ps in problematics)),
            infer_lss=False,
            verbose=False,
            configuration=config_neutral,
            drop_uncommon=True)
        problemsets_results.append(ps_result)
        ks = clu.cand_k.copy()
        ks.append(1 + max(clu.true_labels))
        k_vals.append(ks)

    my_suffix = "_training_neutral_common"
    info_json = r"..\..\Datasets\pan17_train\info.json"
    Tools.splice_save_problemsets_dictionaries(problemsets_results,
                                               metadata_fpath=info_json,
                                               suffix=my_suffix)
    Tools.save_k_vals_as_df(k_vals=k_vals, suffix=my_suffix)

    print("==================== SPARSE ====================")
    problemsets_results = []
    k_vals = []
    for ps in range(1, 61):
        print(f"Executing on problem set ► {ps:03d} ◄ ..")
        ps_result, l, lss, plain, clu = problem_set_run(
            problem_set_id=ps,
            n_clusters=None,
            # Empirically specify a random seed that's compatible with
            # hyper sampling and certain problem sets, due to a bug in HDP
            # as it seems. However, the seeds would be consistent across
            # runs and yield comparable results for our experiments