Example #1

    # Module-level imports assumed by this snippet:
    #   from re import sub
    #   from langdetect import detect
    #   from nltk import word_tokenize
    #   from nltk.corpus import stopwords
    def _concatenate_docs_into_btmcorpus(self,
                                         remove_bgw: bool = False,
                                         drop_uncommon: bool = False,
                                         drop_punctuation: bool = False):
        # Read in the plain text files
        plain_documents = []
        with Tools.scan_directory(self.directory_path) as docs:
            for doc in docs:
                if doc.is_dir():
                    continue
                try:
                    with open(doc.path, mode="r", encoding="utf8") as f:
                        plain_documents.append(f.read())
                    self.doc_index.append(Tools.get_filename(doc.path))
                except PermissionError:
                    # Raised when an entry cannot be opened for reading
                    print(f"Skipped while loading files: {doc.name}")
        # Lowercase the documents and strip newline characters
        plain_documents = [
            d.replace("\n", "").lower() for d in plain_documents
        ]
        # The inferred topics were observed to contain many stop words;
        # following the BTM paper and this observation, we remove them
        if remove_bgw:
            # Detect the language
            lang = detect(" ".join(plain_documents))
            if lang == "en":
                lang = "english"
            elif lang == "nl":
                lang = "dutch"
            else:
                lang = "greek"

            new_documents = []
            # Build the stop-word set once rather than on every token
            stop_words = set(stopwords.words(lang))
            for d in plain_documents:
                terms = [
                    w for w in word_tokenize(text=d, language=lang)
                    if w not in stop_words
                ]
                new_documents.append(" ".join(terms))
            plain_documents = new_documents

        if drop_punctuation:
            plain_documents = [
                sub(pattern=r"[^\w\s]", repl="", string=d)
                for d in plain_documents
            ]
        # save it to disk
        Tools.save_list_to_text(mylist=plain_documents,
                                filepath=self.plain_corpus_path)
        return plain_documents
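
The method above mixes langdetect language detection, NLTK tokenisation and
stop-word filtering, and regex punctuation stripping. Below is a minimal
standalone sketch of that preprocessing chain; the name clean_documents and
the English fallback (the class itself falls back to Greek) are illustrative
assumptions, and the NLTK "punkt" and "stopwords" resources must be
downloaded once beforehand.

from re import sub

from langdetect import detect
from nltk import word_tokenize
from nltk.corpus import stopwords

# langdetect returns ISO 639-1 codes; NLTK expects language names
LANG_NAMES = {"en": "english", "nl": "dutch", "el": "greek"}

def clean_documents(docs):
    # Detect the dominant language over the whole collection
    lang = LANG_NAMES.get(detect(" ".join(docs)), "english")
    stop_words = set(stopwords.words(lang))
    cleaned = []
    for d in docs:
        terms = [w for w in word_tokenize(d.replace("\n", "").lower(),
                                          language=lang)
                 if w not in stop_words]
        # Strip punctuation after tokenisation, as in the method above
        cleaned.append(sub(r"[^\w\s]", "", " ".join(terms)).strip())
    return cleaned

# One-time setup, then e.g. clean_documents(["The cat sat.\n"]) -> ['cat sat']
#   import nltk; nltk.download("punkt"); nltk.download("stopwords")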
Example #2

    # Module-level names assumed by this snippet (not shown in the source):
    #   from time import perf_counter as tpc
    #   train_phase and log_entropy_w are module-level flags
    def run_test(self,
                 configuration: str,
                 drop_uncommon: bool,
                 save_name_suff: str,
                 infer: bool,
                 desired_k: int  # 0 → use the true k; None → estimate it
                 ):

        # Adjust the parameters according to the preference
        if configuration == TestApproach.config_sparse:
            eta = 0.3
            gamma = 0.1
            alpha = 0.1
        elif configuration == TestApproach.config_dense:
            eta = 0.8
            gamma = 1.5
            alpha = 1.5
        else:
            eta = 0.5
            gamma = 1.0
            alpha = 1.0

        problemsets_results = []
        k_vals = []
        failures = []
        # Problem sets 1-120 form the test data; 40-60 are used for training
        r = range(1, 121) if not train_phase else range(40, 61)
        start = tpc()
        for ps in r:
            print(f"\n[{(tpc()-start)/60:06.2f}m] Problem Set ► {ps:03d} ◄")
            try:
                print(f"[{(tpc()-start)/60:06.2f}m]\tVectorising..")
                plain_docs, bow_rep_docs, lss_rep_docs = self._vectorise_ps(
                        ps,
                        infer_lss=infer,
                        hdp_eta=eta,
                        hdp_gamma_s=gamma,
                        hdp_alpha_s=alpha,
                        drop_uncommon_terms=drop_uncommon)
                lss_rep_docs = Tools.normalise_data(lss_rep_docs,
                                                    log_e=log_entropy_w)

                # Begin Clustering Attempts
                print(f"[{(tpc()-start)/60:06.2f}m]\tClustering..")
                ground_truth = self._get_ps_truth(ps)
                ps_res, k_trends = self._cluster_data(
                    ps, data=lss_rep_docs,
                    ground_truth=ground_truth,
                    desired_k=desired_k)
                problemsets_results.append(ps_res)
                k_vals.append(k_trends)
            except AttributeError as excp:
                failures.append(ps)
                print(f"> ERROR: {excp}.\n> Skipping..")
            print(f"[{(tpc()-start)/60:06.2f}m]\tDone.")

        print("» Saving Results ..")
        folder = "pan17_train" if train_phase else "pan17_test"
        path = self._save_results(
                suffix=f"{save_name_suff}_{configuration}",
                info_path=f"..\\..\\Datasets\\{folder}\\info.json",
                results=problemsets_results,
                k_values=k_vals)
        if failures:
            print(f"{len(failures)} problem set(s) skipped.")
            Tools.save_list_to_text(
                mylist=failures,
                filepath=r"./__outputs__/skipped.txt",
                header=f"Skipped PS train 12% ({len(failures)})")

        print(f"[{(tpc()-start)/60:06.2f}m] All Done.")
        return path
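
run_test normalises the LSS vectors through Tools.normalise_data with a
log-entropy flag. Tools is project-internal and not shown here; what follows
is a minimal NumPy sketch of one common log-entropy weighting formulation,
for illustration only (the project's actual normalisation may differ).

import numpy as np

def log_entropy_weight(counts: np.ndarray) -> np.ndarray:
    """Log-entropy weighting of a documents x terms count matrix."""
    n_docs = max(counts.shape[0], 2)
    # Local weight: damped term frequency
    local = np.log2(1.0 + counts)
    # p[i, j]: share of term j's corpus frequency found in document i
    p = counts / np.maximum(counts.sum(axis=0), 1e-12)
    plogp = np.where(p > 0, p * np.log2(np.where(p > 0, p, 1.0)), 0.0)
    # Global weight: 1 + normalised (negative) entropy, in [0, 1]
    global_w = 1.0 + plogp.sum(axis=0) / np.log2(n_docs)
    return local * global_w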
Example #3

    # Module-level import assumed by this snippet:
    #   import pandas as pd
    def smart_optimisation(self,
                           plot_cat: str = "likelihood",
                           tail_prcnt: float = 0.80,
                           skip_factor: int = 1,
                           verbose: bool = False):
        # First generate the outputs to compare:
        words_counts = self._generate_hdps_outputs(skip_factor=skip_factor,
                                                   verbose=verbose)

        ret = {}
        # Loop over the outputs of different etas
        master_folder = Tools.get_path(self.out_dir, "optimisation")
        errors = []
        with Tools.scan_directory(master_folder) as perms:
            for perm in perms:
                # Only permutation folders are processed; skip stray files
                if not Tools.is_path_dir(perm.path):
                    continue
                # Reset accumulators so each eta permutation is independent
                log_likelihoods = []
                avg_num_topics = []
                std_num_topics = []
                pw_ll = []

                # Generate plots for this permutation's Gibbs states
                self.generate_gibbs_states_plots(states_path=perm.path,
                                                 cat=plot_cat)
                with Tools.scan_directory(perm.path) as problems:
                    for problem in problems:
                        try:
                            n_words = words_counts[problem.name]
                            path_state = Tools.get_path(
                                problem.path, "state.log")
                            df_state = pd.read_csv(
                                filepath_or_buffer=path_state,
                                sep=r"\s+",
                                index_col="iter",
                                usecols=["iter", "likelihood", "num.topics"])
                            # Summarise only the tail of the Gibbs trace,
                            # i.e. the (mostly) converged iterations
                            tail = df_state.tail(
                                round(len(df_state) * tail_prcnt))
                            ll = tail["likelihood"].mean()
                            avg_topics = tail["num.topics"].mean()
                            std_topics = tail["num.topics"].std()

                            log_likelihoods.append(ll)
                            pw_ll.append(ll / n_words)
                            avg_num_topics.append(avg_topics)
                            std_num_topics.append(std_topics)
                        except FileNotFoundError as e:
                            print(f"{e}")
                            errors.append(f"{e}")
                            continue
                        except KeyError:
                            # Plot folders have no entry in words_counts; skip
                            continue
                if not log_likelihoods:
                    # Nothing was parsed for this permutation
                    continue
                ret.update({
                    perm.name: [
                        round(sum(log_likelihoods) / len(log_likelihoods), 4),
                        round(sum(pw_ll) / len(pw_ll), 4),
                        round(sum(avg_num_topics) / len(avg_num_topics), 4),
                        round(sum(std_num_topics) / len(std_num_topics), 4)
                    ]
                })
        # Save any encountered errors to disk too
        Tools.save_list_to_text(mylist=errors,
                                filepath=Tools.get_path(
                                    self.out_dir, "optimisation",
                                    "opt_errors.txt"))

        # Persist the per-permutation summary (rows: permutations)
        pd.DataFrame(data=ret,
                     index=["Log-l", "PwLL", "T-Avg", "T-Std"]).T.to_csv(
            Tools.get_path(self.out_dir, "optimisation", "optimisation.csv"),
            index=True)

        return ret
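
The tail-averaging of the Gibbs state log is the core of the comparison above
and is easy to isolate. A sketch, assuming the same whitespace-delimited
state.log layout ("iter", "likelihood" and "num.topics" columns) that the
method reads; the helper name gibbs_tail_stats is hypothetical.

import pandas as pd

def gibbs_tail_stats(state_path: str, tail_prcnt: float = 0.80) -> dict:
    """Summarise the last tail_prcnt share of a Gibbs sampler trace."""
    df = pd.read_csv(state_path, sep=r"\s+", index_col="iter",
                     usecols=["iter", "likelihood", "num.topics"])
    tail = df.tail(round(len(df) * tail_prcnt))
    return {"log_likelihood": tail["likelihood"].mean(),
            "topics_avg": tail["num.topics"].mean(),
            "topics_std": tail["num.topics"].std()}

# Per-word likelihood then follows by dividing "log_likelihood" by the
# problem's word count, as smart_optimisation does with words_counts.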