class Preprocessor:
    """Run a configurable text clean-up pipeline over every file of a corpus.

    Each enabled stage is applied, in this fixed order, to the lower-cased
    text of a file: noise removal, contraction expansion, stop-word removal,
    stemming. The results are written to a ``processed`` directory that is
    created relative to the current working directory.
    """

    def __init__(self, corpus=None, cxp=True, swr=True, nr=True, stem=True):
        """Configure the pipeline stages and, optionally, the corpus folder.

        Args:
            corpus: Folder holding the corpus files. Any non-``None`` value
                is converted with ``Path(str(corpus))``; ``None`` leaves the
                path unset so it can be supplied later via ``set_path``.
            cxp: Enable contraction expansion.
            swr: Enable stop-word removal.
            nr: Enable noise removal.
            stem: Enable stemming.
        """
        self.corpus_path = Path(str(corpus)) if corpus is not None else None

        self.contraction_expansion_flag = bool(cxp)
        self.stop_word_flag = bool(swr)
        self.noise_removal_flag = bool(nr)
        self.stemmer_flag = bool(stem)

        # A stage's helper object is only constructed when that stage is
        # enabled, so disabled stages have no corresponding attribute.
        if self.contraction_expansion_flag:
            self.contraction_expander = ContractionExpander()
        if self.stop_word_flag:
            self.stop_word_remover = StopWordRemover()
        if self.noise_removal_flag:
            self.noise_remover = NoiseRemover()
        if self.stemmer_flag:
            self.stemmer = Stemmer()

    def process_corpus(self):
        """Process every file in the corpus folder into ``processed/``.

        Each regular file directly inside ``corpus_path`` is read, lower-cased,
        passed through the enabled stages and written to ``processed/<name>``
        under the same file name. Entries that are not regular files (for
        example sub-directories) are skipped.

        Raises:
            RuntimeError: If no corpus path has been set.
        """
        if self.corpus_path is None:
            raise RuntimeError('Please set the path to the corpus first')

        processed_path = Path("processed")
        # exist_ok avoids the check-then-create race of a manual exists() test.
        processed_path.mkdir(exist_ok=True)

        for file in self.corpus_path.iterdir():
            if not file.is_file():
                # Only plain files can be read as text; skip anything else.
                continue

            with open(file, 'r') as f:
                content = f.read().lower()

            if self.noise_removal_flag:
                content = self.noise_remover.remove_noise(content)
            if self.contraction_expansion_flag:
                content = self.contraction_expander.expand_text(content)
            if self.stop_word_flag:
                content = self.stop_word_remover.remove_stop_words(content)
            if self.stemmer_flag:
                content = self.stemmer.stem_text(content)

            # Join with pathlib so the output lands inside the directory on
            # every platform, not only where "\\" is the path separator.
            with open(processed_path / file.name, 'w') as new_file:
                new_file.write(content)

    def set_path(self, corpus):
        """Set the folder that holds the corpus.

        Args:
            corpus: The corpus folder, given as a ``str`` or a ``Path``.

        Raises:
            ValueError: If ``corpus`` is ``None``.
            TypeError: If ``corpus`` is neither a ``str`` nor a ``Path``.
        """
        if corpus is None:
            raise ValueError('Corpus was None, please give a directory for the corpus')
        if isinstance(corpus, str):
            self.corpus_path = Path(corpus)
        elif isinstance(corpus, Path):
            self.corpus_path = corpus
        else:
            raise TypeError('Invalid type for corpus, please give a Path or a string')