def clean_text(self, text):
    """
    Lower-case, tokenize and clean a piece of text.

    The text is lower-cased and split with ``str.split()``; every resulting
    token is passed through the pipeline returned by
    ``self.construct_pipeline(TermPipeline())``. Tokens for which the
    pipeline yields a falsy result are discarded.

    :param text: a string of text, to be cleaned.
    :return: a list of cleaned terms; when ``text`` is falsy an empty
             string is returned instead of a list.
    """
    # Guard clause: nothing to clean. Note this path yields a str, not a list.
    if not text:
        return ''

    tokens = text.lower().split()

    # A fresh pipeline is created and configured on every call.
    pipeline = self.construct_pipeline(TermPipeline())

    kept = []
    for token in tokens:
        outcome = pipeline.process(token)
        if outcome:
            kept.append(outcome)
    return kept
def setUp(self):
    """Create one instance of each term processor and a pipeline using them."""
    self.logger = logging.getLogger("TestTermPipeline")

    # Individual processors, kept on the instance so tests can reach them.
    self.ltp = LengthTermProcessor()
    self.tp = TermProcessor()
    self.stp = StopwordTermProcessor(stopwordfile='stopwords_test.txt')
    self.ptp = PunctuationTermProcessor()
    self.atp = AlphaTermProcessor()
    self.sctp = SpecialCharProcessor()

    # Processors are registered in this exact order:
    # special-char, base, length, punctuation, stopword, alpha.
    self.pipeline = TermPipeline()
    registration_order = (
        self.sctp,
        self.tp,
        self.ltp,
        self.ptp,
        self.stp,
        self.atp,
    )
    for processor in registration_order:
        self.pipeline.add_processor(processor)
def clean_text(self, text):
    """
    Normalize a string into a list of cleaned terms.

    :param text: a string of text, to be cleaned.
    :return: a list of terms (i.e. tokenized) that survived the pipeline;
             an empty string is returned when ``text`` is falsy.
    """
    # Falsy input short-circuits with an empty string (not an empty list).
    if not text:
        return ""

    lowered = text.lower()
    words = lowered.split()

    # Build a new TermPipeline and let construct_pipeline configure it;
    # whatever construct_pipeline returns is what processes the words.
    cleaner = self.construct_pipeline(TermPipeline())

    # Process each word in order and keep only the truthy results.
    processed = (cleaner.process(word) for word in words)
    return [result for result in processed if result]
class TestTermPipeline(unittest.TestCase):
    """Tests for TermPipeline and the term processors it chains together."""

    def setUp(self):
        """Create one of each processor and a pipeline that uses all of them."""
        self.logger = logging.getLogger("TestTermPipeline")
        self.ltp = LengthTermProcessor()
        self.tp = TermProcessor()
        self.stp = StopwordTermProcessor(stopwordfile='stopwords_test.txt')
        self.ptp = PunctuationTermProcessor()
        self.atp = AlphaTermProcessor()
        self.sctp = SpecialCharProcessor()

        # Registration order: special-char, base, length, punctuation,
        # stopword, alpha.
        self.pipeline = TermPipeline()
        self.pipeline.add_processor(self.sctp)
        self.pipeline.add_processor(self.tp)
        self.pipeline.add_processor(self.ltp)
        self.pipeline.add_processor(self.ptp)
        self.pipeline.add_processor(self.stp)
        self.pipeline.add_processor(self.atp)

    def test_read_stopfile(self):
        """The stopword processor's stoplist must equal the expected terms."""
        expected = [
            'accessibility', 'information', 'site', 'skip', 'main', 'content',
            'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am',
            'an', 'and', 'any', 'are', "arent", 'as', 'at', 'be', 'because',
            'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
            "cant", 'cannot', 'could', "couldnt", 'did', "didnt", 'do', 'does',
            "doesnt", 'doing', "dont", 'down', 'during', 'each', 'few', 'for',
            'from', 'further', 'had', "hadnt", 'has', "hasnt", 'have',
            "havent", 'having', 'he', "hed", "hell", "hes", 'her', 'here',
            "heres", 'hers', 'herself', 'him', 'himself', 'his', 'how', "hows",
            'i', "id", "ill", "im", "ive", 'if', 'in', 'into', 'is', "isnt",
            'it', "its", 'its', 'itself', "lets", 'me', 'more', 'most',
            "mustnt", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on',
            'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves',
            'out', 'over', 'own', 'same', "shant", 'she', "shed", "shell",
            "shes", 'should', "shouldnt", 'so', 'some', 'such', 'than', 'that',
            "thats", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
            'there', "theres", 'these', 'they', "theyd", "theyll", "theyre",
            "theyve", 'this', 'those', 'through', 'to', 'too', 'under',
            'until', 'up', 'us', 'very', 'was', "wasnt", 'we', "wed", "well",
            "were", "weve", 'were', "werent", 'what', "whats", 'when', "whens",
            'where', "wheres", 'which', 'while', 'who', "whos", 'whom', 'why',
            "whys", 'with', "wont", 'would', "wouldnt", 'you', "youd",
            "youll", "youre", "youve", 'your', 'yours', 'yourself',
            'yourselves',
        ]
        # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
        self.assertEqual(expected, self.stp.stoplist)

    def test_process(self):
        """
        Run whitespace-split terms through the full pipeline one at a time.

        Exercises removal of punctuation, numbers, special characters and
        "can't" from the stop file. Other cases are covered in
        test_query_gen, where the clean method that uses the pipeline's
        process method is tested.
        """
        terms = 'hello WORlD. my name is Python111!!! ü can\'t'
        terms = terms.split()

        # Keep only the terms for which the pipeline returns a truthy value.
        result = []
        for term in terms:
            clean = self.pipeline.process(term)
            if clean:
                result.append(clean)

        expected = ['hello', 'world', 'name', 'python']
        self.assertEqual(expected, result)

        # TODO: add a second case checking that "cant" (no apostrophe) is
        # removed just like "can't".

    def test_processors_config_order(self):
        """Placeholder: not implemented yet."""
        # TODO: this is to see the impact of adding processors in
        # different orders.
        pass