Ejemplo n.º 1
0
def test_preprocessing():
    """Run the cleaning step and PLSA training against the test data set.

    All paths live under ``appData/plsa/test/`` in the directory two levels
    above the current working directory. ``pclean`` and ``pplsa`` are
    configured by assigning their attributes before their entry points are
    called, so this function leaves those attributes modified.
    """
    # abspath('') resolves to the current working directory; parents[1] is
    # its grandparent.
    root_folder = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/plsa/test/'

    pclean.file_parts_number = 10
    pplsa.file_parts_number = 10
    pclean.file_dict = root_folder + 'dict/test_dict'
    pclean.source_texts = root_folder + 'extracted/*.txt'
    pclean.output_dir = root_folder + 'cleaned/'

    # Cleanse the data and turn it into a bag-of-words model.
    pclean.pre_pro()

    # Train using PLSA. The slice drops the trailing '/' of the output
    # directory before it is handed over as the input folder.
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.main()
Ejemplo n.º 2
0
    def generate_topics(self):
        """Clean the extracted ``*.txt`` files and train PLSA with two topics.

        Configures ``pclean`` and ``pplsa`` through their attributes, creates
        the cleaned-output folder and the PLSA parameters folder for this run
        (``os.mkdir`` raises if either already exists), runs both stages and
        prints the elapsed time in minutes.

        Side effects on ``self``: ``PLSA_PARAMETERS_PATH``,
        ``output_dir_stream`` and ``file_dict_stream`` are set to the paths
        used by this run.
        """
        started_at = time.time()

        pplsa.file_parts_number = pclean.file_parts_number = 10

        # NOTE(review): the [:-1] slices presumably strip a trailing path
        # separator from unique_folder_naming -- confirm against the caller.
        run_name = self.unique_folder_naming
        pclean.file_dict = self.file_dict + run_name[:-1] + '_dict'
        pclean.source_texts = self.source_texts + run_name + '*.txt'
        cleaned_dir = self.output_dir + run_name
        pclean.output_dir = cleaned_dir

        os.mkdir(cleaned_dir)

        # Cleanse the data and turn it into a bag-of-words model.
        pclean.pre_pro()

        # Train using PLSA.
        pplsa.topic_divider = 0
        pplsa.num_topics = 2
        pplsa.folder = pclean.output_dir[:-1]
        pplsa.dict_path = pclean.file_dict
        params_dir = self.plsa_parameters_path + self.unique_folder_naming
        pplsa.PLSA_PARAMETERS_PATH = params_dir
        pplsa.PATH = params_dir + 'topic-by-doc-matirx'
        pplsa.PATH_word_by_topic_conditional = params_dir + 'word_by_topic_conditional'
        pplsa.logL_pic = params_dir + 'logL.png'

        # Remember the paths created for this run (folder paths to delete).
        self.PLSA_PARAMETERS_PATH = params_dir
        self.output_dir_stream = pclean.output_dir
        self.file_dict_stream = pclean.file_dict

        os.mkdir(params_dir)

        pplsa.main()

        elapsed_minutes = (time.time() - started_at) / 60
        print('Total training time took:', round(elapsed_minutes, 4))
Ejemplo n.º 3
0
    def generate_topics_json(self):
        """Clean ``extracted.json`` and train PLSA, reporting progress to disk.

        Configures ``pclean`` and ``pplsa`` through their attributes, runs the
        preprocessing and training stages, and overwrites ``status.txt``
        (located at ``plsa_parameters_path + unique_folder_naming``) before
        preprocessing, after preprocessing and after training. The final
        status also contains the elapsed time in minutes.

        Training settings are taken from ``self.topic_divider``,
        ``self.num_topics``, ``self.max_iter`` and ``self.beta``.

        Side effects on ``self``: ``PLSA_PARAMETERS_PATH``,
        ``output_dir_stream`` and ``file_dict_stream`` are set to the paths
        used by this run.

        Raises:
            FileExistsError: if the cleaned-output folder already exists.
            OSError: if the parameters folder cannot be created for any
                reason other than already existing, or if ``status.txt``
                cannot be written.
        """
        start_time = time.time()

        pplsa.file_parts_number = 10
        pclean.file_parts_number = 10
        # NOTE(review): the [:-1] slices presumably strip a trailing path
        # separator from unique_folder_naming -- confirm against the caller.
        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
        pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
        pclean.output_dir = self.output_dir + self.unique_folder_naming

        os.mkdir(pclean.output_dir)

        # Built once and reused for every status update below.
        status_path = (self.plsa_parameters_path + self.unique_folder_naming +
                       'status.txt')

        # Cleanse the data and turn it into a bag-of-words model.
        with open(status_path, 'w') as f:
            f.write('Preprocessing started.')

        pclean.pre_pro()

        with open(status_path, 'w') as f:
            f.write('Preprocessing finished. Topic analysis started.')

        # Train using PLSA.
        pplsa.topic_divider = self.topic_divider
        pplsa.num_topics = self.num_topics
        pplsa.maxiter2 = self.max_iter
        pplsa.beta = self.beta
        pplsa.folder = pclean.output_dir[:-1]
        pplsa.dict_path = pclean.file_dict
        pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
        pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
        pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
        pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'

        # Remember the paths created for this run (folder paths to delete).
        self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
        self.output_dir_stream = pclean.output_dir
        self.file_dict_stream = pclean.file_dict

        # status.txt was already written under this same path prefix, so the
        # folder is expected to exist by now. Only that case is tolerated;
        # any other failure (permissions, missing parent, Ctrl-C) must not be
        # reported as "Folder exists".
        try:
            os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
        except FileExistsError:
            print(
                '-----------------------Folder exists-------------------------'
            )

        pplsa.main()

        elapsed_minutes = round((time.time() - start_time) / 60, 4)

        print('Total training time took:', elapsed_minutes)

        with open(status_path, 'w') as f:
            f.write('Topic analysis finished.\n')
            f.write(str(elapsed_minutes))