def test_extractor(self):
        """Check the top nouns and noun phrases extracted from the Alice review.

        Reads ``alicereview.txt``, runs it through the registered
        ``ITermExtractor`` utility and sorts both returned mappings by their
        value, highest first.  The ten leading simple terms and the leading
        noun phrases must match the expected ``(term, value)`` pairs exactly.
        """
        text = readData("alicereview.txt")

        extractor = getUtility(ITermExtractor)
        simple_terms, np_terms = extractor.extract(text)

        # NOTE: sorted() is stable, so terms sharing the same value keep the
        # order in which items() yields them; the expected lists below rely on
        # that order for the tied entries.
        important_terms = sorted(simple_terms.items(), key=itemgetter(1), reverse=True)[:10]

        # assertEqual (instead of the deprecated failUnless alias) reports the
        # differing elements when the comparison fails.
        self.assertEqual(
            important_terms,
            [
                ("alice", 80),
                ("queen", 19),
                ("rabbit", 15),
                ("hatter", 13),
                ("door", 13),
                ("cat", 13),
                ("chapter", 12),
                ("king", 12),
                ("turtle", 11),
                ("duchess", 11),
            ],
        )

        important_np_terms = sorted(np_terms.items(), key=itemgetter(1), reverse=True)[:10]

        # Up to ten phrases are taken, but only five are expected.
        self.assertEqual(
            important_np_terms,
            [
                ("white rabbit", 8),
                ("mock turtle", 8),
                ("cheshire cat", 5),
                ("march hare", 4),
                ("mad hatter", 3),
            ],
        )
    def test_extractor(self):
        """Check that the expected terms are among the top five extracted.

        Reads ``alicereview.txt``, runs it through the registered
        ``ITermExtractor`` utility with ``locale="en"`` and sorts both
        returned mappings by their value, highest first.  Only membership in
        the first five entries is checked, not their relative order.
        """
        text = readData('alicereview.txt')
        extractor = getUtility(ITermExtractor)
        simple_terms, np_terms = extractor.extract(text, locale="en")

        ranked_nouns = sorted(
            simple_terms.items(),
            key=itemgetter(1),
            reverse=True)[:5]
        top_5_nouns = [term for term, _ in ranked_nouns]
        # assertIn (instead of the deprecated failUnless alias) names the
        # missing term and the list it was looked up in on failure.
        for word in ['alice', 'rabbit', 'hatter', 'door', 'cat']:
            self.assertIn(word, top_5_nouns)

        ranked_nps = sorted(
            np_terms.items(),
            key=itemgetter(1),
            reverse=True)[:5]
        top_5_nps = [term for term, _ in ranked_nps]
        for phrase in ['white rabbit', 'mock turtle', 'mad hatter',
                       'march hare']:
            self.assertIn(phrase, top_5_nps)
 def test_indexer(self):
     """Creates an IATContentType and tests the catalog index.

     A 'Document' with id 'test' is created in ``self.folder`` using the
     Alice review as its text.  The catalog is then searched with
     ``noun_terms="alice"`` and the first result's ``noun_terms`` and
     ``noun_phrase_terms`` values are compared with the expected terms.
     """
     text = readData('alicereview.txt')
     self.folder.invokeFactory('Document', 'test',
                               text=text,
                               subject="A Subject")
     catalog = getToolByName(self.folder, 'portal_catalog')
     cr = catalog.searchResults(noun_terms="alice")
     first_hit = cr[0]
     # assertEqual (instead of the deprecated failUnless alias) shows both
     # lists when they differ.
     self.assertEqual(
         first_hit['noun_terms'][:5],
         ['alice', 'rabbit', 'door', 'cat', 'hatter'])
     # The slice allows up to five entries but four are expected, so this
     # only passes when exactly these four phrases are stored.
     self.assertEqual(
         first_hit['noun_phrase_terms'][:5],
         ['mock turtle', 'white rabbit', 'march hare', 'mad hatter'])
Example #4
0
 def test_indexer(self):
     """Creates an IATContentType and tests the catalog index.

     The Alice review is stored as the text of a new 'Document' (id 'test')
     in ``self.folder``; ``portal_catalog`` is queried with
     ``noun_terms="alice"`` and the ``noun_terms`` / ``noun_phrase_terms``
     values of the first result are checked.
     """
     text = readData('alicereview.txt')
     self.folder.invokeFactory('Document',
                               'test',
                               text=text,
                               subject="A Subject")
     catalog = getToolByName(self.folder, 'portal_catalog')
     cr = catalog.searchResults(noun_terms="alice")
     record = cr[0]
     # Use assertEqual rather than the deprecated failUnless alias so that a
     # failure reports the actual and expected lists.
     self.assertEqual(record['noun_terms'][:5],
                      ['alice', 'rabbit', 'door', 'cat', 'hatter'])
     # Up to five phrases are sliced but only four are expected; the
     # comparison therefore requires exactly these four stored phrases.
     self.assertEqual(
         record['noun_phrase_terms'][:5],
         ['mock turtle', 'white rabbit', 'march hare', 'mad hatter'])
    def test_npstorage(self):
        """Check the ranked terms kept by the noun phrase storage.

        Adds the Alice in Wonderland review to the registered
        ``INounPhraseStorage`` utility under the key 'alice' and verifies
        ``rankedNouns``, ``rankedNPs``, ``getRankedTerms``, ``getNounTerms``
        and ``getNPTerms`` for that key.
        """
        text = readData('alicereview.txt')
        storage = getUtility(INounPhraseStorage)
        storage.addDocument('alice', text)

        # The same (phrase, rank) pairs are expected from rankedNPs and from
        # getRankedTerms, so they are written down once.
        expected_nps = [
            ('white rabbit', 0), ('mock turtle', 0), ('cheshire cat', 2),
            ('march hare', 3), ('mad hatter', 4)]

        # assertEqual (instead of the deprecated failUnless alias) shows where
        # the long lists differ when a comparison fails.
        self.assertEqual(
            storage.rankedNouns['alice'],
            [('alice', 0), ('queen', 1), ('rabbit', 2), ('hatter', 3),
             ('door', 3), ('cat', 3), ('chapter', 6), ('king', 6),
             ('turtle', 8), ('duchess', 8), ('hare', 10), ('table', 11),
             ('white', 11), ('house', 13), ('mock', 13), ('time', 13),
             ('day', 16), ('story', 16), ('mushroom', 16), ('baby', 16),
             ('march', 20), ('caterpillar', 20), ('cheshire', 20),
             ('sister', 20), ('size', 20), ('pool', 20), ('verdict', 26),
             ('jury', 26), ('witness', 26), ('nothing', 26), ('side', 26),
             ('gryphon', 26), ('head', 26), ('tea', 26), ('cook', 26),
             ('mouse', 26), ('mad', 26), ('month', 37), ('everything', 37),
             ('passage', 37), ('croquet', 37), ('hall', 37), ('watch', 37),
             ('fan', 37), ('procession', 37), ('dormouse', 37), ('re', 37),
             ('everyone', 37), ('dream', 37), ('tree', 37), ('game', 37),
             ('window', 37), ('way', 37), ('part', 37), ('evidence', 37),
             ('executioner', 37), ('doesn', 37), ('footman', 37)])

        self.assertEqual(storage.rankedNPs['alice'], expected_nps)

        # NOTE(review): the second argument is 5, yet six nouns are expected;
        # presumably entries tied at the cutoff are all returned -- confirm
        # against the storage implementation.
        self.assertEqual(
            storage.getRankedTerms('alice', 5),
            ([('alice', 0), ('queen', 1), ('rabbit', 2), ('hatter', 3),
              ('door', 3), ('cat', 3)],
             expected_nps))

        self.assertEqual(
            storage.getNounTerms('alice', 5),
            ['alice', 'queen', 'rabbit', 'hatter', 'door', 'cat'])
        self.assertEqual(
            storage.getNPTerms('alice', 5),
            ['white rabbit', 'mock turtle', 'cheshire cat', 'march hare',
             'mad hatter'])
Example #6
0
    def test_extractor(self):
        """Check that the expected terms are among the top five extracted.

        The Alice in Wonderland review is read from ``alicereview.txt`` and
        passed to the registered ``ITermExtractor`` utility with
        ``locale="en"``.  Both returned mappings are sorted by their value,
        highest first, and the expected nouns and noun phrases must appear
        within the first five entries (in any order).
        """
        text = readData('alicereview.txt')
        extractor = getUtility(ITermExtractor)
        simple_terms, np_terms = extractor.extract(text, locale="en")

        noun_pairs = sorted(simple_terms.items(),
                            key=itemgetter(1),
                            reverse=True)[:5]
        top_5_nouns = [term for term, _ in noun_pairs]
        # assertIn replaces the deprecated failUnless alias and reports which
        # term was missing from which list.
        for word in ['alice', 'rabbit', 'hatter', 'door', 'cat']:
            self.assertIn(word, top_5_nouns)

        np_pairs = sorted(np_terms.items(), key=itemgetter(1),
                          reverse=True)[:5]
        top_5_nps = [term for term, _ in np_pairs]
        for phrase in ['white rabbit', 'mock turtle', 'mad hatter',
                       'march hare']:
            self.assertIn(phrase, top_5_nps)