def _vectorise_ps(self,
                      ps: int,
                      convert_to_proportions: bool):
        # Override the function, returning only the LSS representation
        directory_path = f"{self.corpus_path}\\problem{ps:03d}"
        pzd_fpath = (f"{directory_path}\\BTM_{self.btm_dir_suffix}"
                     f"\\k{self.t}.pz_d")

        btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                              delim_whitespace=True,
                              header=None)

        if len(self.btm.doc_index) == 0:
            doc_index = []
            # We will need to build the index
            with Tools.scan_directory(directory_path) as docs:
                for doc in docs:
                    if doc.is_dir():
                        continue
                    doc_index.append(Tools.get_filename(doc.path))
            btm_lss.index = doc_index
        else:
            btm_lss.index = self.btm.doc_index

        if convert_to_proportions:
            tokenised_btmcorpus_filepath = (
                f"{directory_path}\\BTM_{self.btm_dir_suffix}"
                f"\\vectorised\\tokenised_btmcorpus.txt")
            with open(tokenised_btmcorpus_filepath) as c:
                tcorpus = c.readlines()
                freqs = [len(self._doc_gen_biterms(tdoc))
                         for tdoc in tcorpus]
                btm_lss = btm_lss.mul(freqs, axis="index")

        return btm_lss
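
The convert_to_proportions branch above rescales each document's topic distribution p(z|d) by the number of biterms in that document. A minimal, self-contained sketch of that final step; the matrix values and biterm counts below are invented purely for illustration:

import pandas as pd

# Toy p(z|d) matrix: two documents, three topics (illustrative values)
pz_d = pd.DataFrame([[0.5, 0.3, 0.2],
                     [0.1, 0.6, 0.3]],
                    index=["doc_a", "doc_b"])

# Hypothetical biterm counts per document
biterm_counts = [20, 10]

# Row-wise multiplication: each topic proportion becomes an approximate
# count of biterms assigned to that topic in the document
scaled = pz_d.mul(biterm_counts, axis="index")
print(scaled)
# doc_a: 10.0  6.0  4.0
# doc_b:  1.0  6.0  3.0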
Example #2
    def _convert_corpus_to_bow(self, file_ext: str = "txt"):
        """
        Convert a directory of text files into a BoW model.

        Parameters
        ----------
        file_ext : str (optional)
            The extension of the text files to read. The default is "txt".
            The number of words to combine as features is taken from
            self.word_grams; 1 (the default) denotes word unigrams.

        Returns
        -------
        bow_corpus : gensim corpus
            The bag-of-words model.

        dictionary : gensim dictionary
            The id2word mapping.

        plain_documents : pd.DataFrame
            The plain documents, indexed by filename, to serve as a
            reference point.
        """
        # Read in the plain text files
        plain_documents = []
        with Tools.scan_directory(self.input_docs_path) as docs:
            for doc in docs:
                if doc.is_dir() or Tools.split_path(
                        doc.path)[1] != f".{file_ext}":
                    continue
                try:
                    # Use a context manager so the file handle is closed
                    # even if reading fails
                    with open(doc.path, mode="r", encoding="utf8") as f:
                        plain_documents.append(f.read())
                    self.doc_index.append(Tools.get_filename(doc.path))
                except PermissionError:
                    # Raised when trying to open a directory
                    print("Skipped while loading files: {}".format(doc.name))
        # Collocation Detection can be applied here via gensim.models.phrases
        # Tokenise corpus and remove too short documents
        tokenised_corpus = [[
            ' '.join(tkn)
            for tkn in ngrams(word_tokenize(d.lower()), self.word_grams)
        ] for d in plain_documents if len(d) > 3]

        if self.drop_uncommon:
            freq = defaultdict(int)
            for doc in tokenised_corpus:
                for word in doc:
                    freq[word] += 1
            tokenised_corpus = [[w for w in doc if freq[w] > self.freq_th]
                                for doc in tokenised_corpus]
        # Form the word ids dictionary for vectorisation
        dictionary = Dictionary(tokenised_corpus)
        corpus = [dictionary.doc2bow(t_d) for t_d in tokenised_corpus]

        return (corpus, dictionary,
                pd.DataFrame(data=plain_documents,
                             index=self.doc_index,
                             columns=["content"]))
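
For reference, a minimal standalone sketch of the same BoW pipeline run on a toy corpus. The word_grams and freq_th values and the sample sentences are placeholders, and NLTK's "punkt" tokenizer data is assumed to be installed:

from collections import defaultdict

from gensim.corpora import Dictionary
from nltk import word_tokenize          # requires the 'punkt' tokenizer data
from nltk.util import ngrams

word_grams = 1        # unigrams, as in the default described above
freq_th = 1           # hypothetical frequency threshold

plain_documents = ["The cat sat on the mat.",
                   "The dog sat on the log.",
                   "Cats and dogs."]

# Tokenise into word n-grams, mirroring the list comprehension above
tokenised_corpus = [[" ".join(tkn)
                     for tkn in ngrams(word_tokenize(d.lower()), word_grams)]
                    for d in plain_documents if len(d) > 3]

# Optionally drop uncommon tokens, as in the drop_uncommon branch
freq = defaultdict(int)
for doc in tokenised_corpus:
    for word in doc:
        freq[word] += 1
tokenised_corpus = [[w for w in doc if freq[w] > freq_th]
                    for doc in tokenised_corpus]

# Build the id2word mapping and the bag-of-words corpus
dictionary = Dictionary(tokenised_corpus)
bow_corpus = [dictionary.doc2bow(t_d) for t_d in tokenised_corpus]
print(bow_corpus[0])   # list of (token id, count) pairs for the first document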
Example #3
    def _concatenate_docs_into_btmcorpus(self,
                                         remove_bgw: bool = False,
                                         drop_uncommon: bool = False,
                                         drop_punctuation: bool = False):
        # Read in the plain text files
        plain_documents = []
        with Tools.scan_directory(self.directory_path) as docs:
            for doc in docs:
                if doc.is_dir():
                    continue
                try:
                    # Use a context manager: it closes the file even on
                    # failure, and avoids the unbound-variable error that a
                    # finally: f.close() would raise when open() itself fails
                    with open(doc.path, mode="r", encoding="utf8") as f:
                        plain_documents.append(f.read())
                    self.doc_index.append(Tools.get_filename(doc.path))
                except PermissionError:
                    # Raised when trying to open a directory
                    print("Skipped while loading files: {}".format(doc.name))
        # lowercase and strip \n away
        plain_documents = [
            str.replace(d, "\n", "").lower() for d in plain_documents
        ]
        # it was observed that the topics are composed of a lot of stop words
        # Following the BTM paper and the observation, we remove these
        if remove_bgw:
            # Detect the language
            lang = detect(" ".join(plain_documents))
            if lang == "en":
                lang = "english"
            elif lang == "nl":
                lang = "dutch"
            else:
                lang = "greek"

            new_documents = []
            for d in plain_documents:
                terms = [
                    w for w in word_tokenize(text=d, language=lang)
                    if w not in set(stopwords.words(lang))
                ]
                new_documents.append(" ".join(terms))
            plain_documents = new_documents

        if drop_punctuation:
            plain_documents = [
                sub(pattern=r"[^\w\s]", repl="", string=d)
                for d in plain_documents
            ]
        # save it to disk
        Tools.save_list_to_text(mylist=plain_documents,
                                filepath=self.plain_corpus_path)
        return plain_documents
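
The stop-word and punctuation handling above can be exercised on its own. A minimal sketch, under the assumption that detect comes from langdetect and that NLTK's "stopwords" corpus and "punkt" tokenizer data are installed; the sample document is invented:

from re import sub

from langdetect import detect           # assumed source of detect() above
from nltk import word_tokenize
from nltk.corpus import stopwords       # requires the 'stopwords' corpus data

plain_documents = ["The topics were dominated by stop words,\nso we remove them."]

# Lowercase and strip newlines
plain_documents = [d.replace("\n", " ").lower() for d in plain_documents]

# Detect the language once over the whole corpus, then map to NLTK's names
lang = detect(" ".join(plain_documents))
lang = {"en": "english", "nl": "dutch"}.get(lang, "greek")

# Remove stop words
stops = set(stopwords.words(lang))
plain_documents = [" ".join(w for w in word_tokenize(d, language=lang)
                            if w not in stops)
                   for d in plain_documents]

# Drop punctuation
plain_documents = [sub(r"[^\w\s]", "", d) for d in plain_documents]
print(plain_documents)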
Example #4
    def load_pz_d_into_df(self, use_frequencies: bool = False):
        """


        Parameters
        ----------
        use_frequencies : bool, optional
            DESCRIPTION. The default is False.

        Returns
        -------
        btm_lss : TYPE
            DESCRIPTION.

        """
        # ??? This function is not used; it should be used in tester._vectorise_ps
        # Load the lss into df
        pzd_fpath = f"{self.directory_path}k{self.t}.pz_d"
        try:
            # BTM's .pz_d output has no header row
            btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                                  delim_whitespace=True,
                                  header=None)

            if not self.doc_index:
                # We will need to build the index
                with Tools.scan_directory(self.directory_path) as docs:
                    for doc in docs:
                        if doc.is_dir():
                            continue
                        self.doc_index.append(Tools.get_filename(doc.path))
            btm_lss.index = self.doc_index

            if use_frequencies:
                # The saved values are p(z|d) probabilities.
                # We want to scale them into frequencies so that we obtain
                # the frequency of terms belonging to each topic.
                # Since sum_b is used, we rely on the count of biterms:
                # treating each p(zi|dj) as a proportion, we count the
                # biterms per document and multiply
                with open(self.tokenised_btmcorpus_filepath) as c:
                    tcorpus = c.readlines()
                # How many biterms are there?
                # Analysing the C++ code, a window of 15 is used.
                # Regenerate the biterms and count them, since the statistics
                # can detect redundancies in unordered terms:
                freqs = [len(self._doc_gen_biterms(tdoc)) for tdoc in tcorpus]
                btm_lss = btm_lss.mul(freqs, axis="index")

            return btm_lss
        except FileNotFoundError:
            return None
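
_doc_gen_biterms is called in both frequency-scaling branches but is not shown here. Based on the comment about the C++ code using a window of 15, a hypothetical sketch of what such a helper might look like; the name, signature, and window size are assumptions, not the author's implementation:

def _doc_gen_biterms(tdoc: str, window: int = 15):
    """Hypothetical helper: generate unordered term pairs (biterms) that
    co-occur within a sliding window over a whitespace-tokenised document."""
    terms = tdoc.split()
    biterms = []
    for i, wi in enumerate(terms):
        # Pair the current term with every later term inside the window
        for wj in terms[i + 1:i + window]:
            biterms.append((wi, wj))
    return biterms

# Usage: count the biterms of one tokenised document, as the freqs list does
print(len(_doc_gen_biterms("topic models need many short documents")))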