def clean_text(self, text):
    """Normalize a string of text into a list of cleaned terms.

    The text is lower-cased, split on whitespace, and each term is run
    through the term-processing pipeline built by ``construct_pipeline``.
    Terms the pipeline rejects (falsy results) are dropped.

    :param text: a string of text, to be cleaned.
    :return: a list of terms (i.e. tokenized); empty list for empty/None input.
    """
    # Bug fix: the falsy branch previously returned '' (a str), which
    # contradicts the documented list return type; return [] instead.
    if not text:
        return []

    cleaner_pipeline = self.construct_pipeline(TermPipeline())

    cleaned = []
    for term in text.lower().split():
        clean_result = cleaner_pipeline.process(term)
        if clean_result:
            cleaned.append(clean_result)
    return cleaned
Example #2
0
    def setUp(self):
        """Create every term processor and assemble them into a pipeline.

        Processors are registered in a fixed order (special chars, base,
        length, punctuation, stopwords, alpha), since the pipeline applies
        them in registration order.
        """
        self.logger = logging.getLogger("TestTermPipeline")

        self.ltp = LengthTermProcessor()
        self.tp = TermProcessor()
        self.stp = StopwordTermProcessor(stopwordfile='stopwords_test.txt')
        self.ptp = PunctuationTermProcessor()
        self.atp = AlphaTermProcessor()
        self.sctp = SpecialCharProcessor()

        self.pipeline = TermPipeline()
        # Registration order matters: same order as the original setup.
        for processor in (self.sctp, self.tp, self.ltp,
                          self.ptp, self.stp, self.atp):
            self.pipeline.add_processor(processor)
    def clean_text(self, text):
        """Normalize a string of text into a list of cleaned terms.

        The text is lower-cased, split on whitespace, and each term is run
        through the term-processing pipeline built by ``construct_pipeline``.
        Terms the pipeline rejects (falsy results) are dropped.

        :param text: a string of text, to be cleaned.
        :return: a list of terms (i.e. tokenized); empty list for empty/None input.
        """
        # Bug fix: the falsy branch previously returned "" (a str), which
        # contradicts the documented list return type; return [] instead.
        if not text:
            return []

        cleaner_pipeline = self.construct_pipeline(TermPipeline())

        cleaned = []
        for term in text.lower().split():
            clean_result = cleaner_pipeline.process(term)
            if clean_result:
                cleaned.append(clean_result)
        return cleaned
Example #4
0
class TestTermPipeline(unittest.TestCase):
    """Unit tests for TermPipeline: stopword loading and term processing."""

    def setUp(self):
        """Create every term processor and assemble them into a pipeline."""
        self.logger = logging.getLogger("TestTermPipeline")

        self.ltp = LengthTermProcessor()
        self.tp = TermProcessor()
        self.stp = StopwordTermProcessor(stopwordfile='stopwords_test.txt')
        self.ptp = PunctuationTermProcessor()
        self.atp = AlphaTermProcessor()
        self.sctp = SpecialCharProcessor()

        self.pipeline = TermPipeline()
        self.pipeline.add_processor(self.sctp)
        self.pipeline.add_processor(self.tp)
        self.pipeline.add_processor(self.ltp)
        self.pipeline.add_processor(self.ptp)
        self.pipeline.add_processor(self.stp)
        self.pipeline.add_processor(self.atp)

    def test_read_stopfile(self):
        """StopwordTermProcessor loads the expected stopword list from file."""
        expected = ['accessibility', 'information', 'site', 'skip',
                    'main', 'content', 'a', 'about', 'above', 'after',
                    'again', 'against', 'all', 'am', 'an', 'and', 'any',
                    'are', "arent", 'as', 'at', 'be', 'because', 'been',
                    'before', 'being', 'below', 'between', 'both', 'but',
                    'by', "cant", 'cannot', 'could', "couldnt", 'did',
                    "didnt", 'do', 'does', "doesnt", 'doing', "dont",
                    'down', 'during', 'each', 'few', 'for', 'from',
                    'further', 'had', "hadnt", 'has', "hasnt", 'have',
                    "havent", 'having', 'he', "hed", "hell", "hes",
                    'her', 'here', "heres", 'hers', 'herself', 'him',
                    'himself', 'his', 'how', "hows", 'i', "id", "ill",
                    "im", "ive", 'if', 'in', 'into', 'is', "isnt",
                    'it', "its", 'its', 'itself', "lets", 'me', 'more',
                    'most', "mustnt", 'my', 'myself', 'no', 'nor', 'not',
                    'of', 'off', 'on', 'once', 'only', 'or', 'other',
                    'ought', 'our', 'ours', 'ourselves', 'out', 'over',
                    'own', 'same', "shant", 'she', "shed", "shell",
                    "shes", 'should', "shouldnt", 'so', 'some', 'such',
                    'than', 'that', "thats", 'the', 'their', 'theirs',
                    'them', 'themselves', 'then', 'there', "theres",
                    'these', 'they', "theyd", "theyll", "theyre",
                    "theyve", 'this', 'those', 'through', 'to', 'too',
                    'under', 'until', 'up', 'us', 'very', 'was', "wasnt",
                    'we', "wed", "well", "were", "weve", 'were',
                    "werent", 'what', "whats", 'when', "whens", 'where',
                    "wheres", 'which', 'while', 'who', "whos", 'whom',
                    'why', "whys", 'with', "wont", 'would', "wouldnt",
                    'you', "youd", "youll", "youre", "youve", 'your',
                    'yours', 'yourself', 'yourselves']
        # assertEqual: assertEquals is a deprecated alias, removed in Py 3.12.
        self.assertEqual(expected, self.stp.stoplist)

    def test_process(self):
        """Pipeline strips punctuation, digits, special chars and stopwords.

        Removal of "can't" relies on the stopword file (stored as "cant");
        other cleaning cases are covered in test_query_gen, which exercises
        the clean method built on this pipeline.
        """
        terms = 'hello WORlD. my name  is Python111!!! ü can\'t'
        terms = terms.split()
        result = []
        for term in terms:
            clean = self.pipeline.process(term)
            if clean:
                result.append(clean)
        expected = ['hello', 'world', 'name', 'python']
        self.assertEqual(expected, result)

    def test_processors_config_order(self):
        # TODO: measure the impact of adding processors in different orders.
        pass