Example no. 1
import os
from timeit import default_timer as timer  # timer() is used to time the full run

# Metadata, Stopper, NormalizationTokenization, Stemmer, HtmlFilter and the
# output-folder constants used below come from other modules of the original
# project and are assumed to be importable here.


class Preprocessor:
    def __init__(self, rootPath="", inputFolder=""):
        self.metadata = Metadata()

        self.stopper = Stopper()
        stopwords_folder = os.path.join(rootPath, "stopwords")
        print("Preprocessor root path: ", rootPath)
        self.stopper.load_stopwords(stopwords_folder)

        self.normalizer_tokenizer = NormalizationTokenization()
        self.stemmer = Stemmer()

        self.p1_path = ""
        self.p2_path = ""
        self.p3_path = ""

        self.rootPath = rootPath
        self.inputFolder = inputFolder

    def prepare_output_folders(self):
        # outputFolder and the practice_*_output_folder names are
        # module-level constants defined elsewhere in the project.
        self.p1_path = os.path.join(self.rootPath, outputFolder,
                                    practice_1_output_folder)
        self.p2_path = os.path.join(self.rootPath, outputFolder,
                                    practice_2_output_folder)
        self.p3_path = os.path.join(self.rootPath, outputFolder,
                                    practice_3_output_folder)
        # makedirs(..., exist_ok=True) replaces the exists()/makedirs()
        # pairs and avoids the race between the check and the creation.
        for path in (self.p1_path, self.p2_path, self.p3_path):
            os.makedirs(path, exist_ok=True)
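    # Resulting tree (the actual folder names depend on the module-level
    # constants):
    #   <rootPath>/<outputFolder>/<practice_1_output_folder>/
    #   <rootPath>/<outputFolder>/<practice_2_output_folder>/
    #   <rootPath>/<outputFolder>/<practice_3_output_folder>/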

    def preprocess_text(self, text):
        '''
        Preprocess the given text and return the list of processed tokens:
        normalized and tokenized, stop words removed, stemmed.
        '''
        token_list = self.normalizer_tokenizer.process_text(text)
        tokens_without_stopwords = self.stopper.remove_stopwords(token_list)
        tokens_stems_only = self.stemmer.get_stems(tokens_without_stopwords)
        return tokens_stems_only
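    # Hypothetical walk-through of preprocess_text (the exact output depends
    # on the loaded stop-word lists and on the Stemmer implementation; this
    # assumes English stop words and a Porter-style stemmer):
    #   preprocess_text("The cats are running")
    #     normalize/tokenize -> ['the', 'cats', 'are', 'running']
    #     remove stop words  -> ['cats', 'running']
    #     stem               -> ['cat', 'run']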

    def preprocess(self, generate_metadata=False, generate_output_files=False):
        '''
        Run the full preprocessing pipeline over every file in the input
        folder, optionally writing per-practice output files and metadata.
        '''
        self.prepare_output_folders()

        start_time = timer()
        inputPath = os.path.join(self.rootPath, self.inputFolder)
        for file in os.listdir(inputPath):
            fileName = os.fsdecode(file)

            ### <Practice 1>
            htmlFilter = HtmlFilter(inputPath, fileName)
            text = htmlFilter.filter_html()

            token_list = self.normalizer_tokenizer.process_text(text)

            txtFileName = fileName.replace('.html', '.txt')
            if generate_output_files:
                full_path = os.path.join(self.p1_path, txtFileName)
                self.write_string_list_to_file(full_path, token_list)
            ### </Practice 1>

            ### <Practice 2>
            tokens_without_stopwords = self.stopper.remove_stopwords(
                token_list)
            if generate_output_files:
                full_path = os.path.join(self.p2_path, txtFileName)
                self.write_string_list_to_file(full_path,
                                               tokens_without_stopwords)
            ### </Practice 2>

            ### <Practice 3>
            tokens_stems_only = self.stemmer.get_stems(
                tokens_without_stopwords)
            # Write the stemmed output only when requested, consistent with
            # practices 1 and 2.
            if generate_output_files:
                full_path = os.path.join(self.p3_path, txtFileName)
                self.write_string_list_to_file(full_path, tokens_stems_only)
            ### </Practice 3>

            if generate_metadata:
                self.metadata.practice1_metadata(token_list)
                self.metadata.num_of_files += 1
                self.metadata.practice2_metadata(tokens_without_stopwords)
                self.metadata.practice3_metadata(tokens_stems_only)

        if generate_metadata:
            self.metadata.final_metadata()

            num_files = self.metadata.num_of_files
            self.metadata.avg_tokens_per_file = (
                self.metadata.num_of_tokens / num_files)
            self.metadata.avg_num_of_words_per_file = (
                self.metadata.num_of_words_after_removing_stopwords
                / num_files)
            self.metadata.avg_words_stemming = (
                self.metadata.num_of_words_after_stemming / num_files)

            end_time = timer()
            self.metadata.time_needed = str(end_time - start_time)

            self.metadata.print_metadata()

    def write_string_list_to_file(self, path, string_list):
        # 'w' is enough here: the file is only written, never read back.
        with open(path, 'w') as file:
            file.write('\n'.join(string_list))
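
A minimal usage sketch, assuming the layout implied by the listing (a
"stopwords" folder under the project root and HTML documents in the input
folder); the path and the folder name "corpus" are illustrative, not taken
from the original project:

if __name__ == '__main__':
    # Hypothetical driver: instantiate with the project root and the input
    # folder, then run the whole pipeline with both flags enabled.
    preprocessor = Preprocessor(rootPath='/path/to/project',
                                inputFolder='corpus')
    preprocessor.preprocess(generate_metadata=True,
                            generate_output_files=True)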