def _vectorise_ps(self,
                      ps: int,
                      convert_to_proportions: bool):
        # Override the function, returning only the LSS representation
        directory_path = Tools.get_path(self.corpus_path, f"problem{ps:03d}")
        pzd_fpath = Tools.get_path(directory_path,
                                   f"BTM_{self.btm_dir_suffix}",
                                   f"k{self.t}.pz_d")

        btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                              delim_whitespace=True,
                              header=None)

        if len(self.btm.doc_index) == 0:
            doc_index = []
            # We will need to build the index
            with Tools.scan_directory(directory_path) as docs:
                for doc in docs:
                    if doc.is_dir():
                        continue
                    doc_index.append(Tools.get_filename(doc.path))
            btm_lss.index = doc_index
        else:
            btm_lss.index = self.btm.doc_index

        if convert_to_proportions:
            tokenised_btmcorpus_filepath = Tools.get_path(
                directory_path, f"BTM_{self.btm_dir_suffix}",
                "vectorised", "tokenised_btmcorpus.txt")
            with open(tokenised_btmcorpus_filepath,
                      encoding="utf8") as c:
                tcorpus = c.readlines()
                freqs = [len(self._doc_gen_biterms(tdoc))
                         for tdoc in tcorpus]
                btm_lss = btm_lss.mul(freqs, axis="index")

        return btm_lss
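A minimal, self-contained sketch of the row-scaling step above: each row of the .pz_d file holds p(z|d) for one document, and multiplying a row by that document's biterm count turns topic proportions into expected biterm frequencies per topic (toy values, not real BTM output):

import pandas as pd

pz_d = pd.DataFrame([[0.7, 0.3], [0.2, 0.8]])  # p(z|d) for 2 docs x 2 topics
biterm_counts = [10, 5]                        # biterms found in each document
freqs = pz_d.mul(biterm_counts, axis="index")  # row sums now equal the counts
print(freqs)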
Example No. 2
    def __init__(
            self,
            directory_path: str,
            t: int,
            alpha: float,
            beta: float,
            btm_exe_path: str = Tools.get_path("..", "BTM-master", "src",
                                               "btm.exe"),
            n_iter: int = 10000,  # To guarantee convergence
            model_dir_suffix: str = "",
            doc_inference_type: str = "sum_b"):
        self.directory_path = directory_path
        self.t = t
        self.alpha = alpha
        self.beta = beta
        self.n_iter = n_iter
        self.doc_index = []  # the index of the files read for reference
        self.w = None
        self.btm_exe = btm_exe_path
        self.doc_inf_type = "sum_b"  # Hard-coded because later computations depend on it

        self.output_dir = Tools.get_path(directory_path,
                                         f"BTM_{model_dir_suffix}")
        self.plain_corpus_path = Tools.get_path(self.output_dir,
                                                "btmcorpus.txt")
        self.tokenised_btmcorpus_filepath = Tools.get_path(
            self.output_dir, "vectorised", "tokenised_btmcorpus.txt")
        self.vocab_ids_path = Tools.get_path(self.output_dir, "vectorised",
                                             "voca_pt")
Example No. 3
    def _convert_corpus_to_bow(self, file_ext: str = "txt"):
        """
        Convert a directory of text files into a BoW model.

        Parameters
        ----------
        word_grams : int (optional)
            The number of words to combine as features. 1 is the default value,
            and it denotes the usage of word unigrams.

        Returns
        -------
        bow_corpus : gnesim corpus
            The bag-of-words model.

        dictionary : gensim dictionary
            The id2word mapping.

        plain_documents : list
            The list of plain documents, to serve as a reference point.
        """
        # Read in the plain text files
        plain_documents = []
        with Tools.scan_directory(self.input_docs_path) as docs:
            for doc in docs:
                if doc.is_dir() or Tools.split_path(
                        doc.path)[1] != f".{file_ext}":
                    continue
                try:
                    with open(doc.path, mode="r", encoding="utf8") as f:
                        plain_documents.append(f.read())
                    self.doc_index.append(Tools.get_filename(doc.path))
                except PermissionError:
                    # Raised when trying to open a directory
                    print("Skipped while loading files: {}".format(doc.name))
        # Collocation Detection can be applied here via gensim.models.phrases
        # Tokenise the corpus and drop too-short documents (< 4 characters)
        tokenised_corpus = [[
            ' '.join(tkn)
            for tkn in ngrams(word_tokenize(d.lower()), self.word_grams)
        ] for d in plain_documents if len(d) > 3]

        if self.drop_uncommon:
            freq = defaultdict(int)
            for doc in tokenised_corpus:
                for word in doc:
                    freq[word] += 1
            tokenised_corpus = [[w for w in doc if freq[w] > self.freq_th]
                                for doc in tokenised_corpus]
        # Form the word ids dictionary for vectorisation
        dictionary = Dictionary(tokenised_corpus)
        corpus = [dictionary.doc2bow(t_d) for t_d in tokenised_corpus]

        return (corpus, dictionary,
                pd.DataFrame(data=plain_documents,
                             index=self.doc_index,
                             columns=["content"]))
Example No. 4
    def _concatenate_docs_into_btmcorpus(self,
                                         remove_bgw: bool = False,
                                         drop_uncommon: bool = False,
                                         drop_punctuation: bool = False):
        # Read in the plain text files
        plain_documents = []
        with Tools.scan_directory(self.directory_path) as docs:
            for doc in docs:
                if doc.is_dir():
                    continue
                try:
                    with open(doc.path, mode="r", encoding="utf8") as f:
                        plain_documents.append(f.read())
                    self.doc_index.append(Tools.get_filename(doc.path))
                except PermissionError:
                    # Raised when trying to open a directory
                    print("Skipped while loading files: {}".format(doc.name))
        # lowercase and strip \n away
        plain_documents = [
            str.replace(d, "\n", "").lower() for d in plain_documents
        ]
        # It was observed that the topics are composed largely of stop words;
        # following the BTM paper and this observation, we remove them
        if remove_bgw:
            # Detect the language
            lang = detect(" ".join(plain_documents))
            if lang == "en":
                lang = "english"
            elif lang == "nl":
                lang = "dutch"
            else:
                lang = "greek"

            stops = set(stopwords.words(lang))  # Build the set once, upfront
            new_documents = []
            for d in plain_documents:
                terms = [
                    w for w in word_tokenize(text=d, language=lang)
                    if w not in stops
                ]
                new_documents.append(" ".join(terms))
            plain_documents = new_documents

        if drop_punctuation:
            plain_documents = [
                sub(pattern=r"[^\w\s]", repl="", string=d)
                for d in plain_documents
            ]
        # save it to disk
        Tools.save_list_to_text(mylist=plain_documents,
                                filepath=self.plain_corpus_path)
        return plain_documents
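The background-word filter above, in isolation (assuming nltk with the stopwords corpus downloaded via nltk.download("stopwords")):

from nltk import word_tokenize
from nltk.corpus import stopwords

stops = set(stopwords.words("english"))
cleaned = " ".join(w for w in word_tokenize("this is a test") if w not in stops)
print(cleaned)  # "test"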
Example No. 5
    def load_pz_d_into_df(self, use_frequencies: bool = False):
        """


        Parameters
        ----------
        use_frequencies : bool, optional
            DESCRIPTION. The default is False.

        Returns
        -------
        btm_lss : TYPE
            DESCRIPTION.

        """
        # NOTE: currently unused; it should be used in tester._vectorise_ps
        # Load the LSS representation into a dataframe
        pzd_fpath = Tools.get_path(self.directory_path, f"k{self.t}.pz_d")
        try:
            btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                                  delim_whitespace=True,
                                  header=None)

            if not self.doc_index:
                # We will need to build the index
                with Tools.scan_directory(self.directory_path) as docs:
                    for doc in docs:
                        if doc.is_dir():
                            continue
                        self.doc_index.append(Tools.get_filename(doc.path))
            btm_lss.index = self.doc_index

            if use_frequencies:
                # The saved documents are in p(z|d) values
                # We want to proportion them to frequencies so that we have the
                # frequency of terms belonging to a topic
                # Since sum_b is used, we will use the count of biterms
                # Treating each p(zi|dj) as a proportion, we will count biterms
                with open(self.tokenised_btmcorpus_filepath,
                          encoding="utf8") as c:
                    tcorpus = c.readlines()
                # How many biterms are there?
                # Analysing the C++ code, a window of 15 is used
                # Regenerate the biterms and count them, as the statistics
                # can detect redundancies in unordered terms:
                freqs = [len(self._doc_gen_biterms(tdoc)) for tdoc in tcorpus]
                btm_lss = btm_lss.mul(freqs, axis="index")

            return btm_lss
        except FileNotFoundError:
            return None
    def _get_ps_truth(self, ps: int):
        folder = "pan17_train" if train_phase else "pan17_test"

        true_labels_path = (f"..\\..\\Datasets\\{folder}\\truth"
                            r"\problem{:03d}\clustering.json"
                            ).format(ps)
        return Tools.load_true_clusters_into_vector(true_labels_path)
Example No. 7
    def _generate_lda_c_corpus(self):
        """ Convert a group of files LDA_C corpus and store it on disk"""
        bow_corpus, id2word_map, plain_docs = self._convert_corpus_to_bow()
        # Sterialise into LDA_C and store on disk
        output_dir = Tools.get_path(
            self.input_docs_path,
            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
            f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}")

        Tools.initialise_directory(output_dir)
        save_location = Tools.get_path(output_dir, f"{self.lda_c_fname}.dat")

        bleicorpus.BleiCorpus.serialize(fname=save_location,
                                        corpus=bow_corpus,
                                        id2word=id2word_map)
        return plain_docs, bow_corpus
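A standalone sketch of the serialisation step (gensim's bleicorpus module; the toy corpus and output name are placeholders):

from gensim.corpora import Dictionary, bleicorpus

docs = [["topic", "model"], ["topic", "words"]]
d = Dictionary(docs)
bow = [d.doc2bow(doc) for doc in docs]
bleicorpus.BleiCorpus.serialize(fname="corpus.dat", corpus=bow, id2word=d)
# Writes corpus.dat in LDA-C format plus a corpus.dat.vocab vocabulary file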
Example No. 8
    def _invoke_gibbs_hdp(self):
        """Invoke Gibbs hdp posterior inference on the corpus"""
        path_executable = Tools.get_path(self.hdp_path, "hdp.exe")

        param_data = Tools.get_path(
            self.input_docs_path,
            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
            f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}",
            f"{self.lda_c_fname}.dat")

        param_directory = Tools.get_path(self.input_docs_path,
                                         self.hdp_output_directory)

        # Prepare the output directory
        Tools.initialise_directory(param_directory)

        hdp_args = [
            path_executable, "--algorithm", "train",
            "--data", param_data,
            "--directory", param_directory,
            "--max_iter", str(self.hdp_iterations),
            "--sample_hyper", "yes" if self.hdp_hyper_sampling else "no",
            "--save_lag", "-1",
            "--eta", str(self.hdp_eta),
            "--gamma_a", str(self.hdp_gamma_s),
            "--alpha_a", str(self.hdp_alpha_s)
        ]
        # Only pass a seed to the executable when one was supplied
        if self.hdp_seed is not None and self.hdp_seed > 0:
            hdp_args.extend(["--random_seed", str(self.hdp_seed)])

        ret = s.run(hdp_args,
                    check=True,
                    capture_output=True,
                    text=True)

        return ret.stdout
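For reference, s here is presumably the subprocess module, aliased at import time ("import subprocess as s"); the invocation pattern in isolation:

import subprocess as s
import sys

ret = s.run([sys.executable, "--version"], check=True,
            capture_output=True, text=True)
print(ret.stdout)  # e.g. "Python 3.9.13"; check=True raises on non-zero exit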
    def _save_results(self,
                      suffix: str,
                      info_path: str,
                      results: List[Dict],
                      k_values: List[List]):

        path = Tools.splice_save_problemsets_dictionaries(
                results,
                metadata_fpath=info_path,
                suffix=suffix,
                test_data=not train_phase)

        Tools.save_k_vals_as_df(k_vals=k_values, suffix=suffix,
                                test_data=not train_phase,
                                cop_kmeans_frac=constraints_fraction)

        return path
Example No. 10
    def generate_gibbs_states_plots(self,
                                    states_path: str,
                                    cat: str = "likelihood"):
        new_dir = Tools.get_path(states_path, f"{cat}_plots")
        if Tools.path_exists(new_dir):
            print("Plots found, skipping..")
            return

        Tools.initialise_directory(new_dir)
        with Tools.scan_directory(states_path) as outputs:
            for i, output in enumerate(outputs):
                try:
                    state_file = Tools.get_path(output.path, "state.log")
                    df = pd.read_csv(filepath_or_buffer=state_file,
                                     delim_whitespace=True,
                                     index_col="iter")
                    ax = sns.lineplot(x=df.index, y=cat, data=df)
                    ax.margins(x=0)
                    name = output.name
                    fig = ax.get_figure()
                    fig.savefig(Tools.get_path(new_dir, f"{name}.png"),
                                dpi=300,
                                bbox_inches="tight",
                                format="png")
                    fig.clf()
                    print(f"{i}")
                except FileNotFoundError:
                    print(f"→ Skipping {output.name}")
Example No. 11
    def _infer_btm_pz_d(self):
        """Invoke Gibbs BTM docs inference on the corpus"""

        ret = s.run([
            self.btm_exe, "inf", self.doc_inf_type,
            str(self.t), self.tokenised_btmcorpus_filepath,
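            # Joining with "" yields the output dir with a trailing separator,
            # presumably the model-directory form the BTM binary expects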
            Tools.get_path(self.output_dir, "")
        ],
                    check=True,
                    capture_output=True,
                    text=True)
        return ret.stdout
Example No. 12
    def _estimate_btm(self):
        """Invoke Gibbs BTM posterior inference on the tokenised corpus"""

        ret = s.run(
            [
                self.btm_exe,
                "est",
                str(self.t),
                str(self.w),
                str(self.alpha),
                str(self.beta),
                str(self.n_iter),
                str(self.n_iter),  # Save Step
                self.tokenised_btmcorpus_filepath,
                Tools.get_path(self.output_dir, "")
            ],
            check=True,
            capture_output=True,
            text=True)
        return ret.stdout
Example No. 13
    def _load_lss_representation_into_df(self) -> pd.DataFrame:
        """
        Load a BoT LSS representation from disk to a returned dataframe.

        Returns
        -------
        lss_df : pd.DataFrame
            A matrix of shape (n_samples, n_features)

        Raises
        ------
        FileNotFoundError
            When the LSS representation isn't found on disk.

        """

        path = Tools.get_path(self.input_docs_path, self.hdp_output_directory,
                              "mode-word-assignments.dat")
        # We don't need the document-table column, so we skip it, but we do
        # need the word counts under each topic, to produce some sort
        # of a bag-of-topics model (BoT)
        try:
            lss_df = pd.read_csv(filepath_or_buffer=path,
                                 delim_whitespace=True)
            # usecols=["d", "w", "z"]).drop_duplicates()
            # Produce topic weights as counts of topic words
            lss_df = lss_df.pivot_table(values='w',
                                        columns='z',
                                        index='d',
                                        aggfunc='count',
                                        fill_value=0)
            # Index with file names for later reference
            lss_df.index = self.doc_index

            return lss_df
        except FileNotFoundError:
            print(("\nNo LSS precomputed file was found on disk via:\n{}\n"
                   "> Please generate LDA-C corpus and run HDP first...\n"
                   ).format(path))
            raise
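A standalone illustration of the pivot above, counting how many word assignments fall under each topic z for every document d (toy rows, not the real HDP output format):

import pandas as pd

df = pd.DataFrame({"d": [0, 0, 0, 1, 1],
                   "z": [2, 2, 5, 5, 5],
                   "w": [10, 11, 12, 10, 13]})
bot = df.pivot_table(values="w", columns="z", index="d",
                     aggfunc="count", fill_value=0)
print(bot)  # doc 0 -> {topic 2: 2, topic 5: 1}; doc 1 -> {topic 2: 0, topic 5: 2}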
    def run_test(self,
                 drop_uncommon=False,
                 desired_k=None):

        problemsets_results = []
        kvals = []

        # When desired_k is None, k will be inferred
        if train_phase:
            end = 1
        else:
            end = 120

        for ps in range(1, 1+end):
            print(f"Clustering problem {ps:03d}..")
            # In BTM, all the corpora need to be modelled as LSS
            # Now we proceed with clustering
            ground_truth = self._get_ps_truth(ps)
            lss_rep_docs = self._vectorise_ps(ps, convert_to_proportions=True)
            # Normalise the data as they are inherently directional
            lss_rep_docs = Tools.normalise_data(lss_rep_docs)
            # Start the clustering endeavours
            ps_res, k_trends = self._cluster_data(ps=ps,
                                                  data=lss_rep_docs,
                                                  ground_truth=ground_truth,
                                                  desired_k=desired_k)
            problemsets_results.append(ps_res)
            kvals.append(k_trends)
        # Save the results to disk:
        print("Saving results..")
        self._save_results(
            suffix=f"_btm_{self.btm_dir_suffix}",
            info_path=f"{self.corpus_path}\\info.json",
            results=problemsets_results,
            k_values=kvals)
        print("Done.")
Example No. 15
def main():
    # Specify which topic model to use
    use_btm = True

    if use_btm:
        #   Control Parameters ###
        train_phase = True
        t = 10  # number of btm topics
        ##########################

        print("\n-------------------------------------")
        print("BTM modelling and authorial clustering")
        print("-------------------------------------\n")

        if train_phase:
            r = range(1, 2)
            dpath = Tools.get_path(
                r"D:\Projects\Authorial_Clustering_Short_Texts_nPTM"
                r"\Datasets\pan17_train")
        else:
            r = range(1, 121)
            dpath = (r"D:\Projects\Authorial_Clustering_Short_Texts_nPTM"
                     r"\Datasets\pan17_test")

        for ps in r:
            # Loop over the problemsets
            ps_path = Tools.get_path(dpath, f"problem{ps:03d}")
            print(f"\nProcessing #{ps:03d}:")
            #   Inferring BTM ###
            #####################
            # TODO: avoid creating r BTM objects by delegating ps_path
            btm = LssBTModeller(directory_path=ps_path,
                                t=t,
                                alpha=1.0,
                                beta=0.01,
                                model_dir_suffix="remove_stopwords_puncts")
            btm.infer_btm(remove_bg_terms=True,
                          drop_puncs=True,
                          use_biterm_freqs=False)
            print("\t→ btm inference done")
    else:

        print("Main thread started..\n")
        folders_path = (r"D:\College\DKEM\Thesis"
                        r"\AuthorshipClustering\Datasets\pan17_train")
        hdp = r"D:\College\DKEM\Thesis\AuthorshipClustering\Code\hdps\hdp"

        optimiser = LssOptimiser(train_folders_path=folders_path,
                                 hdp_path=hdp,
                                 ldac_filename="dummy_ldac_corpus.dat",
                                 hdp_seed=None,
                                 eta_range=[0.3, 0.5, 0.8, 1],
                                 gamma_range=[0.1, 0.3, 0.5],
                                 alpha_range=[0.1, 0.3, 0.5],
                                 out_dir=Tools.get_path(".", "__outputs__"),
                                 hdp_iters=1000)

        ret_eta = optimiser.smart_optimisation(tail_prcnt=0.8,
                                               skip_factor=5,
                                               plot_cat="num.tables",
                                               verbose=True)
        print(ret_eta)
        print("Done.")
Example No. 16
    def assess_hyper_sampling(self, tail_prcnt: float, verbose: bool = False):
        """
        A function to measure the average per word log-likelihood after
        hyper-sampling the concentration parameters of the Dirichlet
        distributions.
        Caution: the hdp must have been run on the data with hyper sampling and
        without it, in order to load the two representations and compare.

        Returns
        -------
        dct: dict
            A dictionary containing the per word log-likelihood of the train
            data with the two methods pertaining to sampling the concentration
            parameters: normal and hyper.

        """
        path_normal = Tools.get_path(".", "hdp_lss_HyperFalse", "state.log")
        path_hyper = Tools.get_path(".", "hdp_lss_HyperTrue", "state.log")
        path_ldac = Tools.get_path(".", "lda_c_format_HyperTrue",
                                   "dummy_ldac_corpus.dat.vocab")
        per_word_ll_normal = []
        per_word_ll_hyper = []

        if verbose:
            print("------Concentration Parameters Optimisation------")

        with Tools.scan_directory(self.training_folder) as dirs:
            for d in dirs:
                if d.name[0:7] != "problem":
                    continue

                if verbose:
                    print(f"\t► Processing {d.name}")

                normal = Tools.get_path(d.path, path_normal)
                hyper = Tools.get_path(d.path, path_hyper)
                vocab = Tools.get_path(d.path, path_ldac)

                n_words = self._get_number_words(vocab)
                df_normal = pd.read_csv(filepath_or_buffer=normal,
                                        delim_whitespace=True,
                                        index_col="iter",
                                        usecols=["iter", "likelihood"],
                                        squeeze=True)
                ll_normal = df_normal.tail(round(len(df_normal) *
                                                 tail_prcnt)).mean()
                per_word_ll_normal.append(ll_normal / n_words)

                df_hyper = pd.read_csv(filepath_or_buffer=hyper,
                                       delim_whitespace=True,
                                       index_col="iter",
                                       usecols=["iter", "likelihood"],
                                       squeeze=True)
                ll_hyper = df_hyper.tail(round(len(df_hyper) *
                                               tail_prcnt)).mean()
                per_word_ll_hyper.append(ll_hyper / n_words)

        dct = {
            "Normal_Sampling":
            round(sum(per_word_ll_normal) / len(per_word_ll_normal), 4),
            "Hyper_Sampling":
            round(sum(per_word_ll_hyper) / len(per_word_ll_hyper), 4)
        }

        if verbose:
            print("-------------------------------------------------")

        pd.DataFrame(data=dct, index=[0]).to_csv(
            f"{self.out_dir}/hyper_optimisation.csv", index=False)
        return dct
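The tail-averaging used above, in isolation: with tail_prcnt=0.5 only the last half of the Gibbs iterations contribute to the likelihood estimate (toy series):

import pandas as pd

ll = pd.Series([-900.0, -850.0, -820.0, -815.0])  # likelihood per iteration
tail_mean = ll.tail(round(len(ll) * 0.5)).mean()  # mean of the last 2 values
print(tail_mean)  # -817.5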
def problem_set_run(problem_set_id: int,
                    n_clusters: int,
                    seed: int,
                    configuration: str,
                    drop_uncommon: bool,
                    verbose: bool,
                    infer_lss: bool = False):
    problem_nbr = f"{problem_set_id:03d}"
    # Define an LSS modeller to represent documents in LSS non-sparse space
    # HDP with Gibbs sampler is being used as is from:
    #   https://github.com/blei-lab/hdp

    # Adjust the parameters according to the preference
    if configuration == config_sparse:
        eta = 0.3
        gamma = 0.1
        alpha = 0.1
    elif configuration == config_dense:
        eta = 0.8
        gamma = 1.5
        alpha = 1.5
    else:
        eta = 0.5
        gamma = 1.0
        alpha = 1.0

    Modeller = LssHdpModeller(
            hdp_path=r"..\hdps\hdp",
            input_docs_path=r"..\..\Datasets\pan17_train\problem{}".format(
                    problem_nbr),
            ldac_filename=r"ldac_corpus",
            hdp_output_dir=r"hdp_lss",
            hdp_iters=10000,
            hdp_seed=seed,
            hdp_sample_hyper=False,
            hdp_eta=eta,
            hdp_gamma_s=gamma,
            hdp_alpha_s=alpha,
            word_grams=1,
            drop_uncommon=drop_uncommon,
            freq_threshold=1,
            verbose=verbose)

    # Infer the BoW and LSS representations of the documents
    try:
        # Load, project and visualise the data
        plain_docs, bow_rep_docs, lss_rep_docs = Modeller.get_corpus_lss(
                infer_lss,
                bim=False)

        # Begin Clustering Attempts
        true_labels_path = (r"..\..\Datasets\pan17_train\truth"
                            r"\problem{}\clustering.json"
                            ).format(problem_nbr)

        ground_truth = Tools.load_true_clusters_into_vector(true_labels_path)

        # Normalise the data if BIM is not used!
        clu_lss = Clusterer(dtm=Tools.normalise_data(data=lss_rep_docs),
                            true_labels=ground_truth,
                            max_nbr_clusters=len(lss_rep_docs)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=n_clusters)

        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_spherical_k_means,
                param_init="k-means++")

#        ispk_pred, ispk_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_iterative_spherical_k_means,
#                param_init="k-means++")

        norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_h_dbscan)

        norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_mean_shift)

#        norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
#                alg_option=Clusterer.alg_x_means)

        nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="complete")

        nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="single")

        nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_hac,
                param_linkage="average")

        n_optics_pred, n_optics_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al.: HAC and Log-Entropy with 20k features
        sota_pred_path = (r"D:\College\DKEM\Thesis\AuthorshipClustering\Code"
                          r"\clusterPAN2017-master\train_out_LogEnt"
                          f"\\problem{problem_nbr}\\clustering.json")
        sota_predicted = Tools.load_true_clusters_into_vector(sota_pred_path)
        sota_pred, sota_evals = clu_lss.eval_sota(
                sota_predicted=sota_predicted)

        # Return the results:
        return (Tools.form_problemset_result_dictionary(
                dictionaries=[
                        # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                        norm_spk_evals, norm_hdbscan_evals,
                        norm_ms_evals,  # norm_xm_evals,
                        nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                        n_optics_evals, bl_rand_evals, bl_singleton_evals,
                        nhdp_evals, sota_evals, ntrue_evals
                        ],
                identifiers=[  # "iSpKmeans",
                             "E_SPKMeans", "E_HDBSCAN",
                             "E_Mean_Shift",  # "XMeans",
                             "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                             "E_OPTICS", "BL_r", "BL_s",
                             "S_HDP", "BL_SOTA", "Labels"],
                problem_set=problem_set_id),
                ground_truth,
                lss_rep_docs,
                plain_docs,
                clu_lss)

    except FileNotFoundError:
        print("Please run HDP on these data first.")
    def run_test(self,
                 configuration: str,
                 drop_uncommon: bool,
                 save_name_suff: str,
                 infer: bool,
                 desired_k: int  # If 0, true k will be used, None = estimation
                 ):

        # Adjust the parameters according to the preference
        if configuration == TestApproach.config_sparse:
            eta = 0.3
            gamma = 0.1
            alpha = 0.1
        elif configuration == TestApproach.config_dense:
            eta = 0.8
            gamma = 1.5
            alpha = 1.5
        else:
            eta = 0.5
            gamma = 1.0
            alpha = 1.0

        problemsets_results = []
        k_vals = []
        failures = []
        # Detect if we're dealing with the train or test data
        r = range(1, 121) if not train_phase else range(40, 61)
        start = tpc()
        for ps in r:
            print(f"\n[{(tpc()-start)/60:06.2f}m] Problem Set ► {ps:03d} ◄")
            try:
                print(f"[{(tpc()-start)/60:06.2f}m]\tVectorising..")
                plain_docs, bow_rep_docs, lss_rep_docs = self._vectorise_ps(
                        ps,
                        infer_lss=infer,
                        hdp_eta=eta,
                        hdp_gamma_s=gamma,
                        hdp_alpha_s=alpha,
                        drop_uncommon_terms=drop_uncommon)
                lss_rep_docs = Tools.normalise_data(lss_rep_docs,
                                                    log_e=log_entropy_w)

                # Begin Clustering Attempts
                print(f"[{(tpc()-start)/60:06.2f}m]\tClustering..")
                ground_truth = self._get_ps_truth(ps)
                ps_res, k_trends = self._cluster_data(
                    ps, data=lss_rep_docs,
                    ground_truth=ground_truth,
                    desired_k=desired_k)
                problemsets_results.append(ps_res)
                k_vals.append(k_trends)
            except AttributeError as excp:
                failures.append(ps)
                print(f"> ERROR: {excp}.\n> Skipping..")
            print(f"[{(tpc()-start)/60:06.2f}m]\tDone.")

        print("» Saving Results ..")
        folder = "pan17_train" if train_phase else "pan17_test"
        path = self._save_results(
                suffix=f"{save_name_suff}_{configuration}",
                info_path=f"..\\..\\Datasets\\{folder}\\info.json",
                results=problemsets_results,
                k_values=k_vals)
        if failures:
            print(f"{len(failures)} problem(s) skipped.")
            Tools.save_list_to_text(
                mylist=failures,
                filepath=r"./__outputs__/skipped.txt",
                header=f"Skipped PS train 12% ({len(failures)})")

        print(f"[{(tpc()-start)/60:06.2f}m] All Done.")
        return path
Example No. 19
    def _generate_hdps_outputs(self,
                               skip_factor: int = 1,
                               verbose: bool = False):
        st = time.perf_counter()
        ldac_path = Tools.get_path("lda_c_format_HyperFalse",
                                   "dummy_ldac_corpus.dat")
        words_nums = {}
        vocab_file = Tools.get_path("lda_c_format_HyperFalse",
                                    "dummy_ldac_corpus.dat.vocab")
        #        size = ((60 // skip_factor)
        #                * len(self.etas)
        #                * len(self.gammas)**2
        #                * len(self.alphas)**2)
        # Since we fixed the scales of Gammas
        size = ((60 // skip_factor) * len(self.etas) * len(self.gammas) *
                len(self.alphas))
        i = 0
        with Tools.scan_directory(self.training_folder) as ps_folders:
            for c, folder in enumerate(ps_folders):
                if folder.name[0:7] != "problem":
                    if verbose:
                        print(f"→ Skipping {folder.name}")
                    continue
                # Implement the skipping factor
                if c % skip_factor != 0:
                    continue

                t = time.perf_counter()
                # Fix the scale parameters for the Gamma priors
                g_r = 1
                a_r = 1
                for eta in self.etas:
                    # for g_s, g_r in product(self.gammas, repeat=2):
                    # for a_s, a_r in product(self.alphas, repeat=2):
                    # Only switch the shape parameter of Gammas
                    for g_s in self.gammas:
                        for a_s in self.alphas:
                            # Cache the number of words for later
                            if folder.name not in words_nums:
                                vocab_path = Tools.get_path(
                                    folder.path, vocab_file)
                                n_words = self._get_number_words(vocab_path)
                                words_nums.update({folder.name: n_words})

                            i = i + 1
                            percentage = f"{100 * i / size:06.02f}"
                            suff = (f"{g_s:0.2f}_{g_r:0.2f}_"
                                    f"{a_s:0.2f}_{a_r:0.2f}")
                            if verbose:
                                print(f"► Applying HDP with "
                                      f"eta={eta:0.1f} "
                                      f"gamma({g_s:0.2f}, {g_r:0.2f}) "
                                      f"alpha({a_s:0.2f}, {a_r:0.2f}) "
                                      f"on {folder.name} [{percentage}%]")

                            directory = Tools.get_path(self.out_dir,
                                                       "optimisation",
                                                       f"{eta:0.1f}__{suff}",
                                                       folder.name)

                            if Tools.path_exists(directory):
                                if verbose:
                                    print("\tcached result found at "
                                          f"{directory}")
                                continue

                            path_executable = r"{}\hdp.exe".format(
                                self.hdp_path)
                            data = Tools.get_path(folder.path, ldac_path)

                            # Prepare the output directory
                            Tools.initialise_directories(directory)

                            hdp_args = [
                                path_executable, "--algorithm", "train",
                                "--data", data, "--directory", directory,
                                "--max_iter", str(self.iters),
                                "--sample_hyper", "no",
                                "--save_lag", "-1",
                                "--eta", str(eta),
                                "--gamma_a", str(g_s),
                                "--gamma_b", str(g_r),
                                "--alpha_a", str(a_s),
                                "--alpha_b", str(a_r)
                            ]
                            if self.seed is not None:
                                hdp_args.extend(
                                    ["--random_seed", str(self.seed)])

                            s.run(hdp_args,
                                  stdout=s.DEVNULL,
                                  check=True,
                                  capture_output=False,
                                  text=True)

                if verbose:
                    print(f"--- {folder.name} done in "
                          f"{time.perf_counter() - t:0.1f} seconds ---")

        period = round(time.perf_counter() - st, 2)
        print(f"----- Vectorisation done in {period} seconds -----")
        return words_nums
Example No. 20
    def smart_optimisation(self,
                           plot_cat: str = "likelihood",
                           tail_prcnt: float = 0.80,
                           skip_factor: int = 1,
                           verbose: bool = False):
        # First generate the outputs to compare:
        words_counts = self._generate_hdps_outputs(skip_factor=skip_factor,
                                                   verbose=verbose)

        ret = {}
        # Loop over the outputs of different etas
        master_folder = Tools.get_path(self.out_dir, "optimisation")
        errors = []
        with Tools.scan_directory(master_folder) as perms:
            for perm in perms:
                # generate plots
                if not Tools.is_path_dir(perm.path):
                    continue
                # Reset the accumulators so each permutation's averages
                # reflect that permutation alone
                log_likelihoods = []
                avg_num_topics = []
                std_num_topics = []
                pw_ll = []

                self.generate_gibbs_states_plots(states_path=perm.path,
                                                 cat=plot_cat)
                with Tools.scan_directory(perm.path) as problems:
                    for problem in problems:
                        try:
                            n_words = words_counts[problem.name]
                            path_state = Tools.get_path(
                                problem.path, "state.log")
                            df_state = pd.read_csv(
                                filepath_or_buffer=path_state,
                                delim_whitespace=True,
                                index_col="iter",
                                usecols=["iter", "likelihood", "num.topics"])
                            ll = df_state.likelihood.tail(
                                round(len(df_state) * tail_prcnt)).mean()
                            avg_topics = df_state["num.topics"].tail(
                                round(len(df_state) * tail_prcnt)).mean()
                            std_topics = df_state["num.topics"].tail(
                                round(len(df_state) * tail_prcnt)).std()

                            log_likelihoods.append(ll)
                            pw_ll.append(ll / n_words)
                            avg_num_topics.append(avg_topics)
                            std_num_topics.append(std_topics)
                        except FileNotFoundError as e:
                            print(f"{e}")
                            errors.append(f"{e}")
                            continue
                        except KeyError:
                            # Plots folders are being queried for n_words
                            continue
                ret.update({
                    f"{perm.name}": [
                        round(sum(log_likelihoods) / len(log_likelihoods), 4),
                        round(sum(pw_ll) / len(pw_ll), 4),
                        round(sum(avg_num_topics) / len(avg_num_topics), 4),
                        round(sum(std_num_topics) / len(std_num_topics), 4)
                    ]
                })
        # Save any encountered errors to disk too
        Tools.save_list_to_text(mylist=errors,
                                filepath=Tools.get_path(
                                    self.out_dir, "optimisation",
                                    "opt_errors.txt"))

        pd.DataFrame(data=ret,
                     index=["Log-l", "PwLL", "T-Avg", "T-Std"]).T.to_csv(
            Tools.get_path(self.out_dir, "optimisation", "optimisation.csv"),
            index=True)

        return ret
Example No. 21
    def traverse_gamma_alpha(self,
                             ps: int,
                             tail_prcnt: float = 0.80,
                             verbose: bool = True):
        ldac_path = Tools.get_path("lda_c_format_HyperFalse",
                                   "dummy_ldac_corpus.dat")
        dat_path = Tools.get_path(self.training_folder, f"problem{ps:03d}",
                                  ldac_path)
        directory = Tools.get_path(self.out_dir, "gamma_alpha")
        path_executable = Tools.get_path(self.hdp_path, "hdp.exe")

        res = defaultdict(list)
        total_work = len(self.gammas)**2 * len(self.alphas)**2
        c = 0
        print("----------------------------------------------------")
        for g_s, g_r in product(self.gammas, repeat=2):
            for a_s, a_r in product(self.alphas, repeat=2):
                c = c + 1
                progress = 100.0 * c / total_work
                suff = f"_{g_s:0.2f}_{g_r:0.2f}_{a_s:0.2f}_{a_r:0.2f}"
                if verbose:
                    print(f"► Working on "
                          f"Gamma({g_s:0.2f},{g_r:0.2f}) "
                          f"and Alpha({a_s:0.2f},{a_r:0.2f}) "
                          f"[{progress:06.2f}%]")
                run_dir = Tools.get_path(directory, f"{c:03d}",
                                         f"hdp_out{suff}")
                s.run([
                    path_executable, "--algorithm", "train", "--data",
                    dat_path, "--directory", run_dir, "--max_iter",
                    str(500), "--sample_hyper", "no", "--save_lag", "-1",
                    "--eta", "0.5", "--random_seed",
                    str(self.seed), "--gamma_a",
                    str(g_s), "--gamma_b",
                    str(g_r), "--alpha_a",
                    str(a_s), "--alpha_b",
                    str(a_r)
                ],
                      check=True,
                      capture_output=True,
                      text=True)
                # Read the likelihood from the directory the run just wrote
                ll = pd.read_csv(Tools.get_path(run_dir, "state.log"),
                                 delim_whitespace=True).likelihood.tail(
                                     round(tail_prcnt * 500)).mean()
                res["gamma_shape"].append(g_s)
                res["gamma_rate"].append(g_r)
                res["alpha_shape"].append(a_s)
                res["alpha_rate"].append(a_r)
                res["gamma"].append(g_s * g_r)
                res["alpha"].append(a_s * a_r)
                res["likelihood"].append(ll)
        # Save the results to disk
        df_res = pd.DataFrame(res)
        df_res.to_csv(Tools.get_path(directory, "results.csv"), index=False)
        if verbose:
            print("---------------------- Done ------------------------")
        return df_res
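For reference, product(values, repeat=2) pairs every shape with every rate, which is why total_work above is the square of each grid's length:

from itertools import product

gammas = [0.1, 0.5]
print(list(product(gammas, repeat=2)))
# [(0.1, 0.1), (0.1, 0.5), (0.5, 0.1), (0.5, 0.5)]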
    def _cluster_data(self, ps: int,
                      data: List[List],
                      ground_truth: List,
                      desired_k: int):
        clu_lss = Clusterer(dtm=data,
                            true_labels=ground_truth,
                            max_nbr_clusters=len(data)-1,
                            min_nbr_clusters=1,
                            min_cluster_size=2,
                            metric="cosine",
                            desired_n_clusters=desired_k)

        # Run SPKMeans 10 times to get the mean performance
        # This also supplies the estimated k for the Clusterer
        # TODO: decouple k estimations from the evaluation
        norm_spk_pred, norm_spk_evals = clu_lss.evaluate(
                alg_option=Clusterer.alg_spherical_k_means,
                param_init="k-means++")

        cop_kmeans_pred, cop_kmeans_evals = clu_lss.evaluate(
            alg_option=Clusterer.alg_cop_kmeans,
            param_constraints_size=constraints_fraction,
            param_copkmeans_init="random")

        if include_older_algorithms:
            norm_hdbscan_pred, norm_hdbscan_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_h_dbscan)

            norm_ms_pred, norm_ms_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_mean_shift)

    #        norm_xm_pred, norm_xm_evals = clu_lss.evaluate(
    #                alg_option=Clusterer.alg_x_means)

            nhac_complete_pred, nhac_complete_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="complete")

            nhac_s_pred, nhac_s_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="single")

            nhac_a_pred, nhac_a_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_hac,
                    param_linkage="average")

            n_optics_pred, n_optics_evals = clu_lss.evaluate(
                    alg_option=Clusterer.alg_optics)

        # Baselines
        bl_rand_pred, bl_rand_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_random)
        bl_singleton_pred, bl_singleton_evals = clu_lss.evaluate(
                alg_option=Clusterer.bl_singleton)

        nhdp_pred, nhdp_evals = clu_lss.eval_cluster_hdp()
        ntrue_pred, ntrue_evals = clu_lss.eval_true_clustering()

        # SOTA - Gomez et al.: HAC and Log-Entropy with 20k features
        # Not Applicable for Training data
        if not train_phase:
            sota_pred_path_le = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                                 r"\Code\clusterPAN2017-master\output_LogEnt"
                                 f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_le = Tools.load_true_clusters_into_vector(
                    sota_pred_path_le)
            sota_pred_le, sota_evals_le = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_le)

            sota_pred_path_tf = (r"D:\College\DKEM\Thesis\AuthorshipClustering"
                                 r"\Code\clusterPAN2017-master\output_Tf"
                                 f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_tf = Tools.load_true_clusters_into_vector(
                    sota_pred_path_tf)
            sota_pred_tf, sota_evals_tf = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_tf)

            sota_pred_path_tfidf = (
                r"D:\College\DKEM\Thesis\AuthorshipClustering"
                r"\Code\clusterPAN2017-master\output_TfIdf"
                f"\\problem{ps:03d}\\clustering.json")
            sota_predicted_tfidf = Tools.load_true_clusters_into_vector(
                    sota_pred_path_tfidf)
            sota_pred_tfidf, sota_evals_tfidf = clu_lss.eval_sota(
                    sota_predicted=sota_predicted_tfidf)
        else:
            # Build some placeholders only, as SOTA isn't required for training
            # sota_pred_le = [0] * len(data)
            # sota_pred_tf = [0] * len(data)
            # sota_pred_tfidf = [0] * len(data)
            placebo_ret = {}
            placebo_ret.update({"nmi": None,
                                "ami": None,
                                "ari": None,
                                "fms": None,
                                "v_measure": None,
                                "bcubed_precision": None,
                                "bcubed_recall": None,
                                "bcubed_fscore": None,
                                "Silhouette": None,
                                "Calinski_harabasz": None,
                                "Davies_Bouldin": None
                                # Here goes the unsupervised indices
                                })
            sota_evals_le = placebo_ret
            sota_evals_tf = placebo_ret
            sota_evals_tfidf = placebo_ret

        # Control whether k is estimated or the true k is replicated:
        if desired_k != 0:
            k_trend = clu_lss.cand_k
            k_trend.append(1 + max(clu_lss.true_labels))
        else:
            k_trend = [1 + max(clu_lss.true_labels)
                       ] * (nbr_competing_methods + 1)

        result = Tools.form_problemset_result_dictionary(
                dictionaries=[
                        # ispk_evals, norm_spk_evals, norm_hdbscan_evals,
                        norm_spk_evals, norm_hdbscan_evals,
                        norm_ms_evals,  # norm_xm_evals,
                        nhac_complete_evals, nhac_s_evals, nhac_a_evals,
                        n_optics_evals, cop_kmeans_evals,
                        bl_rand_evals, bl_singleton_evals,
                        nhdp_evals,
                        sota_evals_tf, sota_evals_tfidf, sota_evals_le,
                        ntrue_evals
                        ],
                identifiers=[  # "iSpKmeans",
                             "E_SPKMeans", "E_HDBSCAN",
                             "E_Mean_Shift",  # "XMeans",
                             "E_HAC_C", "E_HAC_Single", "E_HAC_Average",
                             "E_OPTICS", "E_COP_KMeans",
                             "BL_r", "BL_s", "S_HDP",
                             "BL_SOTA_tf", "BL_SOTA_tfidf", "BL_SOTA_le",
                             "Labels"],
                problem_set=ps)

        return result, k_trend
            # as it seems. However, the seeds would be consistent across
            # runs and yield comparable results for our experiments
            # (comparing different runs of HDP on a problem set)
            seed=max(33, 70*(ps == 41)) + (3 * (ps in problematics)),
            infer_lss=False,
            verbose=False,
            configuration=config_neutral,
            drop_uncommon=True)
        problemsets_results.append(ps_result)
        ks = clu.cand_k.copy()
        ks.append(1+max(clu.true_labels))
        k_vals.append(ks)
    my_suffix = "_training_neutral_common"
    info_json = r"..\..\Datasets\pan17_train\info.json"
    Tools.splice_save_problemsets_dictionaries(problemsets_results,
                                               metadata_fpath=info_json,
                                               suffix=my_suffix)
    Tools.save_k_vals_as_df(k_vals=k_vals, suffix=my_suffix)

    print("==================== SPARSE ====================")
    problemsets_results = []
    k_vals = []
    for ps in range(1, 61):
        print(f"Executing on problem set ► {ps:03d} ◄ ..")
        ps_result, l, lss, plain, clu = problem_set_run(
            problem_set_id=ps,
            n_clusters=None,
            # Empirically specify a random seed that's compatible with
            # hyper sampling and certain problem sets due to a bug in HDP
            # as it seems. However, the seeds would be consistent across
            # runs and yield comparable results for our experiments