def test_preprocessing():
    """Run the cleaning + PLSA pipeline end-to-end against the bundled test data.

    Reads raw texts from ``appData/plsa/test/extracted``, writes the cleaned
    bag-of-words output to ``appData/plsa/test/cleaned``, then trains PLSA
    on the cleaned corpus.
    """
    # Test data lives two directories above the CWD, under appData/plsa/test.
    root_folder = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/plsa/test/'

    # Both pipeline stages split their work into 10 file parts.
    pclean.file_parts_number = 10
    pplsa.file_parts_number = 10

    pclean.file_dict = root_folder + 'dict/test_dict'
    pclean.source_texts = root_folder + 'extracted/*.txt'
    pclean.output_dir = root_folder + 'cleaned/'

    # Clean the data and turn it into a bag-of-words model.
    pclean.pre_pro()

    # Train using PLSA on the cleaned output (strip the trailing slash).
    # The original assigned pplsa.folder twice; the duplicate is removed.
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.main()
def generate_topics(self):
    """Clean the uploaded texts and run a fixed 2-topic PLSA training pass."""
    start = time.time()

    # Both pipeline stages split their work into 10 file parts.
    pplsa.file_parts_number = 10
    pclean.file_parts_number = 10

    # Per-run paths derived from this run's unique folder name.
    pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
    pclean.source_texts = self.source_texts + self.unique_folder_naming + '*.txt'
    pclean.output_dir = self.output_dir + self.unique_folder_naming
    os.mkdir(pclean.output_dir)

    # Clean the data and turn it into a bag-of-words model.
    pclean.pre_pro()

    # Configure and train PLSA on the cleaned output (trailing slash stripped).
    pplsa.topic_divider = 0
    pplsa.num_topics = 2
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
    pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
    pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
    pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'

    # Remember this run's folders so they can be deleted later.
    self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
    self.output_dir_stream = pclean.output_dir
    self.file_dict_stream = pclean.file_dict

    os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
    pplsa.main()

    finished = time.time()
    print('Total training time took:', round((finished - start) / 60, 4))
def generate_topics_json(self):
    """Clean JSON-extracted texts and run a configurable PLSA training pass.

    Progress is written to a ``status.txt`` file under this run's parameters
    folder so callers can poll the pipeline state. Hyperparameters
    (``topic_divider``, ``num_topics``, ``max_iter``, ``beta``) come from
    attributes set on ``self`` before this method is called.
    """
    start_time_1 = time.time()

    # Both pipeline stages split their work into 10 file parts.
    pplsa.file_parts_number = 10
    pclean.file_parts_number = 10

    # Per-run paths derived from this run's unique folder name.
    pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
    pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
    pclean.output_dir = self.output_dir + self.unique_folder_naming
    os.mkdir(pclean.output_dir)

    # Status file path was rebuilt inline at every write; hoisted to a local.
    status_path = self.plsa_parameters_path + self.unique_folder_naming + 'status.txt'

    # Clean the data and turn it into a bag-of-words model.
    with open(status_path, 'w') as f:
        f.write('Preprocessing started.')
    pclean.pre_pro()
    with open(status_path, 'w') as f:
        f.write('Preprocessing finished. Topic analysis started.')

    # Configure and train PLSA with the caller-supplied hyperparameters.
    pplsa.topic_divider = self.topic_divider
    pplsa.num_topics = self.num_topics
    pplsa.maxiter2 = self.max_iter
    pplsa.beta = self.beta
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
    # NOTE(review): 'matirx' spelling kept as-is — other code may read this path.
    pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
    pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
    pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'

    # Remember this run's folders so they can be deleted later.
    self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
    self.output_dir_stream = pclean.output_dir
    self.file_dict_stream = pclean.file_dict

    # Was a bare `except:` — an already-existing folder is the only expected
    # failure here; anything else should propagate.
    try:
        os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
    except FileExistsError:
        print(
            '-----------------------Folder exists-------------------------'
        )

    pplsa.main()
    end_time_1 = time.time()
    elapsed_minutes = round((end_time_1 - start_time_1) / 60, 4)
    print('Total training time took:', elapsed_minutes)

    # Final status records the elapsed training time in minutes.
    with open(status_path, 'w') as f:
        f.write('Topic analysis finished.\n')
        f.write(str(elapsed_minutes))