def _vectorise_ps(self, ps: int, convert_to_proportions: bool):
    # Override the parent function, returning only the LSS representation
    directory_path = f"{self.corpus_path}\\problem{ps:03d}"
    pzd_fpath = (f"{directory_path}\\BTM_{self.btm_dir_suffix}"
                 f"\\k{self.t}.pz_d")
    btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                          delim_whitespace=True,
                          header=None)

    if len(self.btm.doc_index) == 0:
        # We will need to build the index from the problem directory
        doc_index = []
        with Tools.scan_directory(directory_path) as docs:
            for doc in docs:
                if doc.is_dir():
                    continue
                doc_index.append(Tools.get_filename(doc.path))
        btm_lss.index = doc_index
    else:
        btm_lss.index = self.btm.doc_index

    if convert_to_proportions:
        # Scale each document's p(z|d) proportions by its biterm count,
        # yielding expected biterm frequencies per topic
        tokenised_btmcorpus_filepath = (
            f"{directory_path}\\BTM_{self.btm_dir_suffix}"
            f"\\vectorised\\tokenised_btmcorpus.txt")
        with open(tokenised_btmcorpus_filepath) as c:
            tcorpus = c.readlines()
        freqs = [len(self._doc_gen_biterms(tdoc)) for tdoc in tcorpus]
        btm_lss = btm_lss.mul(freqs, axis="index")

    return btm_lss
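# Illustrative example (not part of the class): how the proportions-to-
# frequencies scaling above works. Each row of a toy p(z|d) matrix is
# multiplied by that document's biterm count via mul(..., axis="index"):
#
#   >>> import pandas as pd
#   >>> pzd = pd.DataFrame([[0.2, 0.8], [0.5, 0.5]], index=["d1", "d2"])
#   >>> pzd.mul([10, 4], axis="index")
#         0    1
#   d1  2.0  8.0
#   d2  2.0  2.0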
def _convert_corpus_to_bow(self, file_ext: str = "txt"):
    """
    Convert a directory of text files into a BoW model.

    Parameters
    ----------
    file_ext : str (optional)
        The extension of the files to load. The default is "txt".

    Returns
    -------
    corpus : gensim corpus
        The bag-of-words model.
    dictionary : gensim Dictionary
        The id2word mapping.
    plain_documents : pd.DataFrame
        The plain documents indexed by filename, to serve as a
        reference point.

    Notes
    -----
    The number of words combined as features is controlled by
    self.word_grams; 1 denotes the usage of word unigrams.

    """
    # Read in the plain text files
    plain_documents = []
    with Tools.scan_directory(self.input_docs_path) as docs:
        for doc in docs:
            if doc.is_dir() or Tools.split_path(
                    doc.path)[1] != f".{file_ext}":
                continue
            try:
                with open(doc.path, mode="r", encoding="utf8") as f:
                    plain_documents.append(f.read())
                self.doc_index.append(Tools.get_filename(doc.path))
            except PermissionError:
                # Raised when trying to open a directory
                print(f"Skipped while loading files: {doc.name}")

    # Collocation detection could be applied here via gensim.models.phrases

    # Tokenise the corpus into word n-grams, skipping documents that
    # are too short
    tokenised_corpus = [[
        ' '.join(tkn)
        for tkn in ngrams(word_tokenize(d.lower()), self.word_grams)
    ] for d in plain_documents if len(d) > 3]

    # Optionally drop tokens rarer than the frequency threshold
    if self.drop_uncommon:
        freq = defaultdict(int)
        for doc in tokenised_corpus:
            for word in doc:
                freq[word] += 1
        tokenised_corpus = [[w for w in doc if freq[w] > self.freq_th]
                            for doc in tokenised_corpus]

    # Form the word-id dictionary for vectorisation
    dictionary = Dictionary(tokenised_corpus)
    corpus = [dictionary.doc2bow(t_d) for t_d in tokenised_corpus]

    return (corpus, dictionary,
            pd.DataFrame(data=plain_documents, index=self.doc_index,
                         columns=["content"]))
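# Illustrative example (standalone, assuming gensim is installed): what
# Dictionary and doc2bow produce for a toy tokenised corpus. New tokens
# are id'd in alphabetical order within each document, and doc2bow
# returns (token_id, count) pairs:
#
#   >>> from gensim.corpora import Dictionary
#   >>> docs = [["cat", "sat"], ["cat", "ran"]]
#   >>> d = Dictionary(docs)
#   >>> [d.doc2bow(t) for t in docs]
#   [[(0, 1), (1, 1)], [(0, 1), (2, 1)]]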
def _concatenate_docs_into_btmcorpus(self,
                                     remove_bgw: bool = False,
                                     drop_uncommon: bool = False,
                                     drop_punctuation: bool = False):
    # NOTE: drop_uncommon is currently unused in this routine
    # Read in the plain text files
    plain_documents = []
    with Tools.scan_directory(self.directory_path) as docs:
        for doc in docs:
            if doc.is_dir():
                continue
            try:
                with open(doc.path, mode="r", encoding="utf8") as f:
                    plain_documents.append(f.read())
                self.doc_index.append(Tools.get_filename(doc.path))
            except PermissionError:
                # Raised when trying to open a directory
                print(f"Skipped while loading files: {doc.name}")

    # Lowercase the documents and strip newlines away
    plain_documents = [
        str.replace(d, "\n", "").lower() for d in plain_documents
    ]

    # It was observed that the topics are composed largely of stop words;
    # following the BTM paper and this observation, we remove them
    if remove_bgw:
        # Detect the language and map it to an NLTK stop-word list
        lang = detect(" ".join(plain_documents))
        if lang == "en":
            lang = "english"
        elif lang == "nl":
            lang = "dutch"
        else:
            lang = "greek"

        stops = set(stopwords.words(lang))
        new_documents = []
        for d in plain_documents:
            terms = [
                w for w in word_tokenize(text=d, language=lang)
                if w not in stops
            ]
            new_documents.append(" ".join(terms))
        plain_documents = new_documents

    if drop_punctuation:
        plain_documents = [
            sub(pattern=r"[^\w\s]", repl="", string=d)
            for d in plain_documents
        ]

    # Save the corpus to disk
    Tools.save_list_to_text(mylist=plain_documents,
                            filepath=self.plain_corpus_path)

    return plain_documents
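# Illustrative example (standalone): the punctuation-stripping regex used
# above removes anything that is neither a word character nor whitespace:
#
#   >>> from re import sub
#   >>> sub(pattern=r"[^\w\s]", repl="", string="Hello, world! (ok?)")
#   'Hello world ok'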
def load_pz_d_into_df(self, use_frequencies: bool = False):
    """
    Load the p(z|d) matrix produced by BTM into a DataFrame.

    Parameters
    ----------
    use_frequencies : bool, optional
        If True, scale each document's topic proportions by its biterm
        count, converting the p(z|d) values into expected frequencies.
        The default is False.

    Returns
    -------
    btm_lss : pd.DataFrame
        The topic-document matrix indexed by filename, or None if the
        p(z|d) file is missing.

    """
    # TODO: this function is not used; it should be used in
    # tester._vectorise_ps
    # Load the LSS representation into a DataFrame
    pzd_fpath = f"{self.directory_path}k{self.t}.pz_d"
    try:
        # header=None matches _vectorise_ps; the .pz_d file carries no
        # header row
        btm_lss = pd.read_csv(filepath_or_buffer=pzd_fpath,
                              delim_whitespace=True,
                              header=None)
        if not self.doc_index:
            # We will need to build the index
            with Tools.scan_directory(self.directory_path) as docs:
                for doc in docs:
                    if doc.is_dir():
                        continue
                    self.doc_index.append(Tools.get_filename(doc.path))
        btm_lss.index = self.doc_index

        if use_frequencies:
            # The saved values are p(z|d) proportions. To recover the
            # frequency of terms belonging to a topic, scale each
            # document's proportions by its biterm count. Analysing the
            # C++ code shows a window of 15 is used, so we regenerate
            # the biterms and count them, which also detects
            # redundancies in unordered terms.
            with open(self.tokenised_btmcorpus_filepath) as c:
                tcorpus = c.readlines()
            freqs = [len(self._doc_gen_biterms(tdoc))
                     for tdoc in tcorpus]
            btm_lss = btm_lss.mul(freqs, axis="index")

        return btm_lss
    except FileNotFoundError:
        return None
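# Usage sketch (hypothetical wiring; the constructor and its arguments
# below are assumptions, not part of this module):
#
#   >>> model = BTMModel(directory_path="models/", t=10)  # hypothetical
#   >>> lss = model.load_pz_d_into_df(use_frequencies=True)
#   >>> lss.idxmax(axis=1) if lss is not None else None  # dominant topic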