Code Example #1
    def run_test(self,
                 drop_uncommon=False,
                 desired_k=None):

        problemsets_results = []
        kvals = []

        # desired_k=None means the number of clusters k will be inferred
        # train_phase (presumably a module-level flag) limits the run to a
        # single problem set while training; otherwise all 120 are processed
        if train_phase:
            end = 1
        else:
            end = 120

        for ps in range(1, 1+end):
            print(f"Clustering problem {ps:03d}..")
            # In BTM, all the corpora need to be modelled as LSS
            # Now we proceed with clustering
            ground_truth = self._get_ps_truth(ps)
            lss_rep_docs = self._vectorise_ps(ps, convert_to_proportions=True)
            # Normalise the data as they are inherently directional
            lss_rep_docs = Tools.normalise_data(lss_rep_docs)
            # Start the clustering endeavours
            ps_res, k_trends = self._cluster_data(ps=ps,
                                                  data=lss_rep_docs,
                                                  ground_truth=ground_truth,
                                                  desired_k=desired_k)
            problemsets_results.append(ps_res)
            kvals.append(k_trends)
        # Save the results to disk:
        print("Saving results..")
        self._save_results(
            suffix=f"_btm_{self.btm_dir_suffix}",
            info_path=f"{self.corpus_path}\\info.json",
            results=problemsets_results,
            k_values=kvals)
        print("Done.")
Code Example #2
    def run_test(self,
                 configuration: str,
                 drop_uncommon: bool,
                 save_name_suff: str,
                 infer: bool,
                 desired_k: int  # 0 uses the true k; None triggers estimation
                 ):

        # Adjust the parameters according to the preference
        if configuration == TestApproach.config_sparse:
            eta = 0.3
            gamma = 0.1
            alpha = 0.1
        elif configuration == TestApproach.config_dense:
            eta = 0.8
            gamma = 1.5
            alpha = 1.5
        else:
            eta = 0.5
            gamma = 1.0
            alpha = 1.0

        problemsets_results = []
        k_vals = []
        failures = []
        # Detect if we're dealing with the train or test data
        r = range(1, 121) if not train_phase else range(40, 61)
        start = tpc()  # tpc is presumably time.perf_counter, imported elsewhere
        for ps in r:
            print(f"\n[{(tpc()-start)/60:06.2f}m] Problem Set ► {ps:03d} ◄")
            try:
                print(f"[{(tpc()-start)/60:06.2f}m]\tVectorising..")
                plain_docs, bow_rep_docs, lss_rep_docs = self._vectorise_ps(
                        ps,
                        infer_lss=infer,
                        hdp_eta=eta,
                        hdp_gamma_s=gamma,
                        hdp_alpha_s=alpha,
                        drop_uncommon_terms=drop_uncommon)
                lss_rep_docs = Tools.normalise_data(lss_rep_docs,
                                                    log_e=log_entropy_w)

                # Begin Clustering Attempts
                print(f"[{(tpc()-start)/60:06.2f}m]\tClustering..")
                ground_truth = self._get_ps_truth(ps)
                ps_res, k_trends = self._cluster_data(
                    ps, data=lss_rep_docs,
                    ground_truth=ground_truth,
                    desired_k=desired_k)
                problemsets_results.append(ps_res)
                k_vals.append(k_trends)
            except AttributeError as excp:
                failures.append(ps)
                print(f"> ERROR: {excp}.\n> Skipping..")
            print(f"[{(tpc()-start)/60:06.2f}m]\tDone.")

        print("» Saving Results ..")
        folder = "pan17_train" if train_phase else "pan17_test"
        path = self._save_results(
                suffix=f"{save_name_suff}_{configuration}",
                info_path=f"..\\..\\Datasets\\{folder}\\info.json",
                results=problemsets_results,
                k_values=k_vals)
        if failures:
            print(f"{len(failures)} problem set(s) skipped.")
            Tools.save_list_to_text(
                mylist=failures,
                filepath=r"./__outputs__/skipped.txt",
                header=f"Skipped PS train 12% ({len(failures)})")

        print(f"[{(tpc()-start)/60:06.2f}m] All Done.")
        return path
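
Code Example #2 passes log_e=log_entropy_w to Tools.normalise_data, i.e. it can apply log-entropy weighting before normalisation. The sketch below illustrates the standard log-entropy scheme on a raw term-frequency matrix; the function name log_entropy_weight is hypothetical, and whether Tools.normalise_data implements exactly this variant is an assumption.

import numpy as np

def log_entropy_weight(tf):
    # A minimal sketch of log-entropy term weighting: a local log weight
    # multiplied by a global entropy weight. tf is assumed to be a
    # documents-by-terms matrix of raw counts with at least two documents.
    tf = np.asarray(tf, dtype=float)
    n_docs = tf.shape[0]
    gf = tf.sum(axis=0)  # global frequency of each term
    p = np.divide(tf, gf, out=np.zeros_like(tf), where=gf > 0)
    with np.errstate(divide="ignore", invalid="ignore"):
        plogp = np.where(p > 0, p * np.log(p), 0.0)
    global_w = 1.0 + plogp.sum(axis=0) / np.log(n_docs)  # term entropy weight
    return np.log1p(tf) * global_w  # local weight x global weight
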
Code Example #3
def problem_set_run(problem_set_id: int,
                    n_clusters: int,
                    seed: int,
                    configuration: str,
                    drop_uncommon: bool,
                    verbose: bool,
                    infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in LSS non-sparse space
    # HDP with Gibbs sampler is being used as is from:
    #   https://github.com/blei-lab/hdp

    # Adjust the parameters according to the preference
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
            hdp_path=r"..\hdps\hdp",
            input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
                    problem_nbr),
            ldac_filename=r"ldac_corpus",
            hdp_output_dir=r"hdp_lss",
            hdp_iters=10000,
            hdp_seed=seed,
            hdp_sample_hyper=False,
            hdp_eta=eta,
            hdp_gamma_s=gamma,
            hdp_alpha_s=alpha,
            word_grams=1,
            drop_uncommon=drop_uncommon,
            freq_threshold=1,
            verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
                infer_lss,
                bim=False)

        # Begin Clustering Attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)

        ground_truth = Tools.load_true_clusters_into_vector(true_labels_path)

        # Normalise the data when BIM is not used
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_spherical_k_means,
                param_init="k-means++")

#        ispk_pred, ispk_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_iterative_spherical_k_means,
#                param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_mean_shift)

#        norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al. HAC and Log-Entropy with 20k features
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
                sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
                dictionaries=[
                        # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                        norm_spk_evals, norm_hdbscan_evals,
                        norm_ms_evals,  # norm_xm_evals,
                        nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                        n_optics_evals, bl_rand_evals, bl_singleton_evals,
                        nhdp_evals, sota_evals, ntrue_evals
                        ],
                identifiers=[  # "iSpKmeans",
                             "E_SPKMeans", "E_HDBSCAN",
                             "E_Mean_Shift",  # "XMeans",
                             "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                             "E_OPTICS", "BL_r", "BL_s",
                             "S_HDP", "BL_SOTA", "Labels"],
                problem_set=problem_set_id),
                ground_truth,
                lss_rep_docs,
                plain_docs,
                clu_lss)

    except FileNotFoundError:
        print("Please run HDP on these data first.")