def run_test(self, drop_uncommon=False, desired_k=None):
    problemsets_results = []
    kvals = []
    # desired_k is None by default, which means k will be inferred
    if train_phase:
        end = 1
    else:
        end = 120

    for ps in range(1, 1+end):
        print(f"Clustering problem {ps:03d}..")
        # In BTM, all the corpora need to be modelled as LSS
        # Now we proceed with clustering
        ground_truth = self._get_ps_truth(ps)
        lss_rep_docs = self._vectorise_ps(ps, convert_to_proportions=True)
        # Normalise the data as they are inherently directional
        lss_rep_docs = Tools.normalise_data(lss_rep_docs)
        # Start the clustering endeavours
        ps_res, k_trends = self._cluster_data(ps=ps,
                                              data=lss_rep_docs,
                                              ground_truth=ground_truth,
                                              desired_k=desired_k)
        problemsets_results.append(ps_res)
        kvals.append(k_trends)

    # Save the results to disk:
    print("Saving results..")
    self._save_results(
        suffix=f"_btm_{self.btm_dir_suffix}",
        info_path=f"{self.corpus_path}\\info.json",
        results=problemsets_results,
        k_values=kvals)
    print("Done.")
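
# Note on the normalisation step above: "directional" data are compared by
# angle rather than magnitude, so rows are typically scaled to unit L2 norm
# before cosine-based clustering. A minimal sketch of that idea, using
# scikit-learn (which may or may not be what Tools.normalise_data does
# internally), would be:
#
#     from sklearn.preprocessing import normalize
#     unit_rows = normalize(lss_rep_docs, norm="l2", axis=1)
#
# The project's Tools.normalise_data may also apply log-entropy weighting,
# as the second test driver below suggests via its log_e argument.
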
def run_test(self,
             configuration: str,
             drop_uncommon: bool,
             save_name_suff: str,
             infer: bool,
             desired_k: int  # 0 -> use the true k; None -> estimate k
             ):
    # Adjust the HDP hyperparameters according to the chosen configuration
    if configuration == TestApproach.config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == TestApproach.config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    problemsets_results = []
    k_vals = []
    failures = []
    # Select the problem-set range depending on the train or test phase
    r = range(1, 121) if not train_phase else range(40, 61)
    start = tpc()
    for ps in r:
        print(f"\n[{(tpc()-start)/60:06.2f}m] Problem Set ► {ps:03d} ◄")
        try:
            print(f"[{(tpc()-start)/60:06.2f}m]\tVectorising..")
            plain_docs, bow_rep_docs, lss_rep_docs = self._vectorise_ps(
                ps,
                infer_lss=infer,
                hdp_eta=eta,
                hdp_gamma_s=gamma,
                hdp_alpha_s=alpha,
                drop_uncommon_terms=drop_uncommon)
            lss_rep_docs = Tools.normalise_data(lss_rep_docs,
                                                log_e=log_entropy_w)
            # Begin the clustering attempts
            print(f"[{(tpc()-start)/60:06.2f}m]\tClustering..")
            ground_truth = self._get_ps_truth(ps)
            ps_res, k_trends = self._cluster_data(
                ps,
                data=lss_rep_docs,
                ground_truth=ground_truth,
                desired_k=desired_k)
            problemsets_results.append(ps_res)
            k_vals.append(k_trends)
        except AttributeError as excp:
            failures.append(ps)
            print(f"> ERROR: {excp}.\n> Skipping..")

    print(f"[{(tpc()-start)/60:06.2f}m]\tDone.")
    print("» Saving Results ..")
    folder = "pan17_train" if train_phase else "pan17_test"
    path = self._save_results(
        suffix=f"{save_name_suff}_{configuration}",
        info_path=f"..\\..\\Datasets\\{folder}\\info.json",
        results=problemsets_results,
        k_values=k_vals)

    if len(failures) != 0:
        print(f"{len(failures)} problem set(s) skipped.")
        Tools.save_list_to_text(
            mylist=failures,
            filepath=r"./__outputs__/skipped.txt",
            header=f"Skipped PS train 12% ({len(failures)})")

    print(f"[{(tpc()-start)/60:06.2f}m] All Done.")
    return path
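
# Hedged usage sketch (illustrative only): TestApproach is referenced above
# through its class attributes config_sparse/config_dense, so the method is
# presumably called on a TestApproach instance. The constructor arguments are
# not shown in this file and are left elided here.
#
#     approach = TestApproach(...)
#     out_path = approach.run_test(configuration=TestApproach.config_dense,
#                                  drop_uncommon=True,
#                                  save_name_suff="_hdp",
#                                  infer=True,
#                                  desired_k=None)  # None -> estimate k
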
def problem_set_run(problem_set_id: int, n_clusters: int, seed: int,
                    configuration: str, drop_uncommon: bool,
                    verbose: bool, infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in a non-sparse LSS space.
    # HDP with a Gibbs sampler is used as-is from:
    # https://github.com/blei-lab/hdp

    # Adjust the HDP hyperparameters according to the chosen configuration
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
        hdp_path=r"..\hdps\hdp",
        input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
            problem_nbr),
        ldac_filename=r"ldac_corpus",
        hdp_output_dir=r"hdp_lss",
        hdp_iters=10000,
        hdp_seed=seed,
        hdp_sample_hyper=False,
        hdp_eta=eta,
        hdp_gamma_s=gamma,
        hdp_alpha_s=alpha,
        word_grams=1,
        drop_uncommon=drop_uncommon,
        freq_threshold=1,
        verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
            infer_lss, bim=False)

        # Begin the clustering attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)
        ground_truth = Tools.load_true_clusters_into_vector(true_labels_path)

        # Normalise the data (only when BIM is not in use)
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_spherical_k_means,
            param_init="k-means++")

        # ispk_pred, ispk_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_iterative_spherical_k_means,
        #     param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_mean_shift)

        # norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
        #     alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_hac,
            param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
            alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al.: HAC and log-entropy with 20k features
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
            sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
            dictionaries=[
                # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                norm_spk_evals, norm_hdbscan_evals,
                norm_ms_evals,  # norm_xm_evals,
                nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                n_optics_evals, bl_rand_evals, bl_singleton_evals,
                nhdp_evals, sota_evals, ntrue_evals
                ],
            identifiers=[  # "iSpKmeans",
                "E_SPKMeans", "E_HDBSCAN",
                "E_Mean_Shift",  # "XMeans",
                "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                "E_OPTICS", "BL_r", "BL_s",
                "S_HDP", "BL_SOTA", "Labels"],
            problem_set=problem_set_id),
            ground_truth,
            lss_rep_docs,
            plain_docs,
            clu_lss)

    except FileNotFoundError:
        print("Please run HDP on these data first.")
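
# Hedged usage sketch: a minimal driver for problem_set_run. The argument
# values below (problem set 1, the seed, n_clusters) are illustrative
# assumptions, not taken from this file; n_clusters is forwarded to the
# Clusterer's desired_n_clusters, whose exact semantics are defined there.
# config_dense is the module-level constant the function already checks
# against. On a missing HDP output the function prints a hint and returns
# None, so the result is checked before unpacking.
if __name__ == "__main__":
    run_output = problem_set_run(problem_set_id=1,
                                 n_clusters=None,
                                 seed=13712,
                                 configuration=config_dense,
                                 drop_uncommon=True,
                                 verbose=False,
                                 infer_lss=True)
    if run_output is not None:
        results_dict, truth, lss_docs, plain_docs, clusterer = run_output
        print(results_dict)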