コード例 #1
0
ファイル: dictogram_test.py プロジェクト: imthaghost/tweetGen
 def test_types(self):
     """Check that ``types`` reports the number of distinct words."""
     expected_types = 5
     counts = Dictogram(self.fish_words)
     # The fixture and the histogram must agree on the distinct-word count
     assert len(set(self.fish_words)) == expected_types
     assert counts.types == expected_types
     # Re-adding words that are already present must leave that count alone
     for token in self.fish_words:
         counts.add_count(token)
     assert counts.types == expected_types
コード例 #2
0
ファイル: dictogram_test.py プロジェクト: imthaghost/tweetGen
 def test_tokens(self):
     """Check that ``tokens`` reports the total number of words counted."""
     expected_tokens = 8
     counts = Dictogram(self.fish_words)
     # The fixture length and the histogram total must agree
     assert len(self.fish_words) == expected_tokens
     assert counts.tokens == expected_tokens
     # Counting every word a second time should exactly double the total
     for token in self.fish_words:
         counts.add_count(token)
     assert counts.tokens == expected_tokens * 2
コード例 #3
0
ファイル: dictogram_test.py プロジェクト: imthaghost/tweetGen
 def test_entries(self):
     """Check the histogram's entries, both directly and via ``items()``."""
     expected_size = 5
     histogram = Dictogram(self.fish_words)
     # Compare the histogram itself against the fish_dict fixture
     assert len(histogram) == expected_size
     self.assertCountEqual(histogram, self.fish_dict)  # order-insensitive
     # Compare the (word, count) entries against the fish_list fixture
     pairs = histogram.items()
     assert len(pairs) == expected_size
     self.assertCountEqual(pairs, self.fish_list)  # order-insensitive
コード例 #4
0
    def make_markov(self, corpus):
        """Populate this chain with nth-order transitions read from corpus.

        The first ``self.order`` words only fill ``self.queue``. For each
        later word, a tuple built from the queue is used as the state key and
        the word is counted in that state's Dictogram. A state whose first
        element matches the capitalised-word pattern is additionally counted
        under the 'START' key.

        Args:
            corpus: Iterable of words, consumed in order.
                (Presumably strings, since the first word of each state is
                passed to ``match`` -- confirm against callers.)

        Returns:
            ``self``, after it has been updated in place.
        """
        for i, word in enumerate(corpus):
            if i < self.order:
                # Still prepopulating the queue; there is no full state yet
                self.queue.enqueue(word)
                continue

            # Snapshot the queue before advancing it: this is the state that
            # precedes ``word`` and becomes the key in the markov dict
            state = tuple(self.queue)

            # Advance the queue
            self.queue.dequeue()
            self.queue.enqueue(word)

            if match(r'(([A-Z])\w*)', state[0]) is not None:
                # The constructor only creates 'START' when it is handed a
                # corpus, so create it on demand here instead of raising
                # KeyError when this method is called on an empty chain.
                if 'START' not in self:
                    self['START'] = Dictogram()
                self['START'].add_count(state)

            if state not in self:
                self[state] = Dictogram()

            # The state is guaranteed to exist now; record the transition
            self[state].add_count(word)

        return self
コード例 #5
0
ファイル: dictogram_test.py プロジェクト: imthaghost/tweetGen
 def test_contains(self):
     """Check ``in`` membership for words that are present and absent."""
     counts = Dictogram(self.fish_words)
     # Every word the histogram was built from must be reported as present
     assert all(token in counts for token in self.fish_words)
     # Words that were never added must be reported as absent
     absent = ('fishy', 'food')
     assert not any(token in counts for token in absent)
コード例 #6
0
    def __init__(self, corpus=None, order=2):
        """Set up an empty chain and, when a corpus is given, build it.

        Args:
            corpus: Optional words handed to ``make_markov``; when None the
                chain is left empty and no 'START' entry is created.
            order: Stored as ``self.order`` and passed to ``Queue``.
        """
        super(NarkovChain, self).__init__()
        self.order = order
        self.queue = Queue(order)
        self.sentence = []

        # Nothing to build without a corpus
        if corpus is None:
            return

        # 'START' must exist before make_markov records sentence openers
        self['START'] = Dictogram()
        self.make_markov(corpus)
コード例 #7
0
ファイル: dictogram_test.py プロジェクト: imthaghost/tweetGen
 def test_frequency(self):
     """Check ``frequency`` for every fixture word and for an unseen word."""
     counts = Dictogram(self.fish_words)
     # (word, expected frequency); 'food' is never added, so it must be 0
     expectations = (
         ('one', 1),
         ('two', 1),
         ('red', 1),
         ('blue', 1),
         ('fish', 4),
         ('food', 0),
     )
     for token, expected in expectations:
         assert counts.frequency(token) == expected
コード例 #8
0
ファイル: dictogram_test.py プロジェクト: imthaghost/tweetGen
 def test_add_count(self):
     """Check that ``add_count`` updates frequencies, types and tokens."""
     counts = Dictogram(self.fish_words)
     # Extra occurrences to add on top of the fixture, as (word, amount)
     additions = (('two', 2), ('blue', 3), ('fish', 4), ('food', 5))
     for token, amount in additions:
         counts.add_count(token, amount)
     # Frequencies expected once the additions have been applied
     expectations = (
         ('one', 1),
         ('two', 3),
         ('red', 1),
         ('blue', 4),
         ('fish', 8),
         ('food', 5),
     )
     for token, expected in expectations:
         assert counts.frequency(token) == expected
     # 'food' is the only new word, so there is one more distinct type
     assert counts.types == 6
     # Original 8 tokens plus the 2 + 3 + 4 + 5 added above
     assert counts.tokens == 8 + 14