def _concatenate_docs_into_btmcorpus(self,
                                     remove_bgw: bool = False,
                                     drop_uncommon: bool = False,
                                     drop_punctuation: bool = False):
    # Presumed module-level imports (assumption; not shown in this excerpt):
    #   from re import sub
    #   from langdetect import detect
    #   from nltk.corpus import stopwords
    #   from nltk.tokenize import word_tokenize
    # NOTE: drop_uncommon is accepted for interface parity but is not used
    # in this routine.
    # Read in the plain text files
    plain_documents = []
    with Tools.scan_directory(self.directory_path) as docs:
        for doc in docs:
            if doc.is_dir():
                continue
            try:
                # A with-block guarantees the file handle is closed even on
                # error; the original try/finally could call close() on an
                # unbound name when open() itself raised.
                with open(doc.path, mode="r", encoding="utf8") as f:
                    plain_documents.append(f.read())
                self.doc_index.append(Tools.get_filename(doc.path))
            except PermissionError:
                # Raised when trying to open a directory
                print(f"Skipped while loading files: {doc.name}")
    # Lowercase the documents and strip newlines away
    plain_documents = [d.replace("\n", "").lower() for d in plain_documents]
    # It was observed that the topics are composed of many stop words.
    # Following the BTM paper and this observation, we remove them.
    if remove_bgw:
        # Detect the language and map the ISO code to the NLTK corpus name
        lang = detect(" ".join(plain_documents))
        if lang == "en":
            lang = "english"
        elif lang == "nl":
            lang = "dutch"
        else:
            lang = "greek"
        # Build the stop-word set once instead of once per token
        stops = set(stopwords.words(lang))
        new_documents = []
        for d in plain_documents:
            terms = [w for w in word_tokenize(text=d, language=lang)
                     if w not in stops]
            new_documents.append(" ".join(terms))
        plain_documents = new_documents
    if drop_punctuation:
        plain_documents = [sub(pattern=r"[^\w\s]", repl="", string=d)
                           for d in plain_documents]
    # Persist the concatenated corpus to disk
    Tools.save_list_to_text(mylist=plain_documents,
                            filepath=self.plain_corpus_path)
    return plain_documents
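
# --- Hedged usage sketch (not part of the original pipeline) ----------------
# A minimal, self-contained illustration of the language-aware stop-word
# removal performed above. It assumes langdetect is installed and that the
# NLTK "punkt" and "stopwords" data have been downloaded; the function name
# and the sample documents are hypothetical.
def _demo_stopword_removal():
    from langdetect import detect
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    docs = ["the cat sat on the mat", "this is a short example document"]
    iso = detect(" ".join(docs))  # ISO 639-1 code, e.g. "en"
    # Map the detected code to an NLTK corpus name, defaulting to Greek
    # exactly as the method above does.
    lang = {"en": "english", "nl": "dutch"}.get(iso, "greek")
    stops = set(stopwords.words(lang))
    # Rejoin the surviving tokens so each document stays a single string
    return [" ".join(w for w in word_tokenize(d, language=lang)
                     if w not in stops)
            for d in docs]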
def run_test(self,
             configuration: str,
             drop_uncommon: bool,
             save_name_suff: str,
             infer: bool,
             desired_k: int  # If 0, the true k is used; None triggers estimation
             ):
    # Presumed module-level names (assumption; not shown in this excerpt):
    #   from time import perf_counter as tpc
    #   train_phase and log_entropy_w are module-level flags.
    # Adjust the HDP hyperparameters according to the chosen configuration
    if configuration == TestApproach.config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == TestApproach.config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    problemsets_results = []
    k_vals = []
    failures = []
    # Select the problem-set range for the test or the train data
    r = range(1, 121) if not train_phase else range(40, 61)
    start = tpc()
    for ps in r:
        print(f"\n[{(tpc()-start)/60:06.2f}m] Problem Set ► {ps:03d} ◄")
        try:
            print(f"[{(tpc()-start)/60:06.2f}m]\tVectorising..")
            plain_docs, bow_rep_docs, lss_rep_docs = self._vectorise_ps(
                ps,
                infer_lss=infer,
                hdp_eta=eta,
                hdp_gamma_s=gamma,
                hdp_alpha_s=alpha,
                drop_uncommon_terms=drop_uncommon)
            lss_rep_docs = Tools.normalise_data(lss_rep_docs,
                                                log_e=log_entropy_w)
            # Begin the clustering attempts
            print(f"[{(tpc()-start)/60:06.2f}m]\tClustering..")
            ground_truth = self._get_ps_truth(ps)
            ps_res, k_trends = self._cluster_data(ps,
                                                  data=lss_rep_docs,
                                                  ground_truth=ground_truth,
                                                  desired_k=desired_k)
            problemsets_results.append(ps_res)
            k_vals.append(k_trends)
        except AttributeError as excp:
            failures.append(ps)
            print(f"> ERROR: {excp}.\n> Skipping..")
    print(f"[{(tpc()-start)/60:06.2f}m]\tDone.")
    print("» Saving Results ..")
    folder = "pan17_train" if train_phase else "pan17_test"
    path = self._save_results(
        suffix=f"{save_name_suff}_{configuration}",
        info_path=f"..\\..\\Datasets\\{folder}\\info.json",
        results=problemsets_results,
        k_values=k_vals)

    if len(failures) != 0:
        # Report the count of skipped problem sets; the original divided by
        # len(lss_rep_docs), which measured nothing meaningful and could
        # raise NameError when the very first problem set failed.
        print(f"{len(failures)} problem set(s) skipped.")
        Tools.save_list_to_text(
            mylist=failures,
            filepath=r"./__outputs__/skipped.txt",
            header=f"Skipped PS train 12% ({len(failures)})")
    print(f"[{(tpc()-start)/60:06.2f}m] All Done.")
    return path
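
# --- Hedged usage sketch (not part of the original pipeline) ----------------
# The sparse/dense/neutral hyperparameter presets from run_test restated as a
# lookup table, which makes the three regimes easier to scan and extend. The
# string keys stand in for the real TestApproach constants; the function name
# and the "neutral" fallback label are hypothetical.
def _demo_hdp_presets(configuration: str):
    presets = {
        # configuration:  (eta, gamma, alpha)
        "config_sparse": (0.3, 0.1, 0.1),  # sparser topic structure
        "config_dense":  (0.8, 1.5, 1.5),  # denser topic structure
    }
    # Any other configuration falls back to the neutral preset, mirroring
    # the else-branch above.
    return presets.get(configuration, (0.5, 1.0, 1.0))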
def smart_optimisation(self,
                       plot_cat: str = "likelihood",
                       tail_prcnt: float = 0.80,
                       skip_factor: int = 1,
                       verbose: bool = False):
    # First generate the outputs to compare
    words_counts = self._generate_hdps_outputs(skip_factor=skip_factor,
                                               verbose=verbose)

    ret = {}
    errors = []
    # Loop over the outputs of the different eta permutations
    master_folder = Tools.get_path(self.out_dir, "optimisation")
    with Tools.scan_directory(master_folder) as perms:
        for perm in perms:
            if not Tools.is_path_dir(perm.path):
                continue
            # Generate the Gibbs-state plots for this permutation
            self.generate_gibbs_states_plots(states_path=perm.path,
                                             cat=plot_cat)
            # Reset the accumulators for each permutation so that its
            # averages cover its own problems only; previously they were
            # initialised once and leaked across permutations.
            log_likelihoods = []
            pw_ll = []
            avg_num_topics = []
            std_num_topics = []
            with Tools.scan_directory(perm.path) as problems:
                for problem in problems:
                    try:
                        n_words = words_counts[problem.name]
                        path_state = Tools.get_path(problem.path, "state.log")
                        df_state = pd.read_csv(
                            filepath_or_buffer=path_state,
                            sep=r"\s+",  # delim_whitespace is deprecated
                            index_col="iter",
                            usecols=["iter", "likelihood", "num.topics"])
                        # Average over the tail of the Gibbs chain, i.e.
                        # the last tail_prcnt of the iterations
                        tail_n = round(len(df_state) * tail_prcnt)
                        ll = df_state.likelihood.tail(tail_n).mean()
                        avg_topics = df_state["num.topics"].tail(tail_n).mean()
                        std_topics = df_state["num.topics"].tail(tail_n).std()

                        log_likelihoods.append(ll)
                        pw_ll.append(ll / n_words)
                        avg_num_topics.append(avg_topics)
                        std_num_topics.append(std_topics)
                    except FileNotFoundError as e:
                        print(f"{e}")
                        errors.append(f"{e}")
                        continue
                    except KeyError:
                        # Plot folders have no entry in words_counts
                        continue
            # Skip permutations that yielded no readable state logs to
            # avoid a ZeroDivisionError below
            if not log_likelihoods:
                continue
            ret.update({
                f"{perm.name}": [
                    round(sum(log_likelihoods) / len(log_likelihoods), 4),
                    round(sum(pw_ll) / len(pw_ll), 4),
                    round(sum(avg_num_topics) / len(avg_num_topics), 4),
                    round(sum(std_num_topics) / len(std_num_topics), 4)
                ]
            })
    # Save any encountered errors to disk as well
    Tools.save_list_to_text(mylist=errors,
                            filepath=Tools.get_path(self.out_dir,
                                                    "optimisation",
                                                    "opt_errors.txt"))

    pd.DataFrame(data=ret,
                 index=["Log-l", "PwLL", "T-Avg", "T-Std"]
                 ).T.to_csv(Tools.get_path(self.out_dir,
                                           "optimisation",
                                           "optimisation.csv"),
                            index=True)
    return ret
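
# --- Hedged usage sketch (not part of the original pipeline) ----------------
# Illustrates the tail-averaging applied to each state.log above: only the
# last tail_prcnt of the Gibbs iterations contribute to the reported
# likelihood and topic-count statistics, discarding the burn-in. The
# in-memory frame stands in for a real state.log; the function name and the
# synthetic values are hypothetical.
def _demo_tail_statistics(tail_prcnt: float = 0.80):
    import pandas as pd

    df_state = pd.DataFrame({
        "iter": range(10),
        "likelihood": [-900, -850, -820, -805, -801,
                       -800, -799, -801, -800, -798],
        "num.topics": [2, 3, 4, 5, 5, 6, 6, 5, 6, 6],
    }).set_index("iter")
    tail_n = round(len(df_state) * tail_prcnt)  # 8 of the 10 iterations
    return (df_state["likelihood"].tail(tail_n).mean(),
            df_state["num.topics"].tail(tail_n).mean(),
            df_state["num.topics"].tail(tail_n).std())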