Esempio n. 1
0
 def test_most_encountered_words_longer_text_blob(self):
     """
     Test FileIndexer.most_encountered_words with a longer text blob
     """
     indexer = FileIndexer(['tests/a_dark_brown_dog'])
     indexer.start_indexing()
     self.assertEqual(indexer.most_encountered_words(5), [
         ('the', 212), ('a', 88), ('and', 83), ('he', 80),
         ('his', 69)
     ])
Esempio n. 2
0
class IndexerTestCase(unittest.TestCase):
    """
    Tests for nuke_filewalker.FileIndexer
    """
    def setUp(self):
        self.indexer = FileIndexer(['tests/testfile1.txt', 'tests/testfile2.txt'])

    def test_tokenize(self):
        """
        FileIndexer.tokenize should tokenize a blob of text into words.
        """
        with open('tests/testfile1.txt') as fp:
            blob = fp.read()
            self.assertEqual(self.indexer.tokenize(blob), [
                'i', 'am', 'beginning', 'to', 'feel', 'like', 'a', 'rap',
                'god', 'rap', 'god', 'all', 'my', 'people', 'from', 'the',
                'front', 'to', 'the', 'back', 'nod', 'back', 'nod', 'now',
                'who', 'thinks', 'their', 'arms', 'are', 'long', 'enough',
                'to', 'slap', 'box', 'slap', 'box', 'they', 'said', 'i',
                'rap', 'like', 'a', 'robot', 'so', 'call', 'me', 'rap', 'bot'
            ])

    def test_start_indexing(self):
        """
        FileIndexer.start_indexing should index multiple files and store all
        the words in a list.
        """
        self.indexer.start_indexing()
        self.assertEqual(self.indexer.words, [
            'i', 'am', 'beginning', 'to', 'feel', 'like', 'a', 'rap',
            'god', 'rap', 'god', 'all', 'my', 'people', 'from', 'the',
            'front', 'to', 'the', 'back', 'nod', 'back', 'nod', 'now',
            'who', 'thinks', 'their', 'arms', 'are', 'long', 'enough',
            'to', 'slap', 'box', 'slap', 'box', 'they', 'said', 'i',
            'rap', 'like', 'a', 'robot', 'so', 'call', 'me', 'rap', 'bot',
            'i', 'am', 'a', 'space', 'bound', 'rocket', 'ship', 'and',
            'your', 'heart', 'is', 'the', 'moon', 'and', 'i', 'am', 'aiming',
            'right', 'at', 'you', 'right', 'at', 'you', 'two', 'hundred',
            'fifty', 'thousand', 'miles', 'on', 'a', 'clear', 'night', 'in',
            'june', 'and', 'i', 'am', 'aiming', 'right', 'at', 'you',
            'right', 'at', 'you', 'right', 'at', 'you'
        ])

    def test_most_encountered_words_default_count(self):
        """
        FileIndexer.most_encountered_words called without a parameter
        should return the top 10 most common words across all the files
        being indexed.
        """
        self.indexer.start_indexing()
        self.assertEqual(self.indexer.most_encountered_words(), [
            ('right', 5), ('you', 5), ('i', 5), ('at', 5), ('am', 4),
            ('rap', 4), ('a', 4), ('and', 3), ('to', 3), ('the', 3),
        ])

    def test_most_encountered_words_new_count(self):
        """
        FileIndexer.most_encountered_words should return the specified
        number of most common words when called with a parameter for
        count.
        """
        self.indexer.start_indexing()
        self.assertEqual(self.indexer.most_encountered_words(5), [
            ('right', 5), ('you', 5), ('i', 5), ('at', 5), ('am', 4)
        ])

    def test_most_encountered_words_longer_text_blob(self):
        """
        Test FileIndexer.most_encountered_words with a longer text blob
        """
        indexer = FileIndexer(['tests/a_dark_brown_dog'])
        indexer.start_indexing()
        self.assertEqual(indexer.most_encountered_words(5), [
            ('the', 212), ('a', 88), ('and', 83), ('he', 80),
            ('his', 69)
        ])