Exemple #1
0
 def testShave(self):
     # Unigram counts read back after counting the corpus file and then
     # calling shave(4).
     # NOTE(review): presumably shave(4) discards n-grams seen fewer than
     # 4 times ('c' keeps 4, 'd' reads back as 0) -- confirm in NgramCounter.
     counter = NgramCounter(1)
     counter.count(self.corpusfile)
     counter.shave(4)

     expected_counts = (
         ('a', 15),
         ('b', 10),
         ('c', 4),
         ('d', 0),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 3),
     )
     for ngram, expected in expected_counts:
         self.assertEqual(counter.get_count(ngram), expected)
Exemple #2
0
 def testCount2(self):
     # Bigram counts obtained by counting the corpus file; bigram keys are
     # the two tokens joined by a single space.
     counter = NgramCounter(2)
     counter.count(self.corpusfile)

     expected_counts = (
         ('a b', 7),
         ('b a', 4),
         (START_SENT_SYMBOL + ' a', 3),
         ('b ' + END_SENT_SYMBOL, 3),
     )
     for bigram, expected in expected_counts:
         self.assertEqual(counter.get_count(bigram), expected)
Exemple #3
0
    def testVocab(self):
        # Unigram counts when the counter is built with a word list that
        # only contains 'a', 'b' and 'c'.
        # NOTE(review): presumably tokens outside the word list (here 'd')
        # are counted under UNKSTAMP instead -- confirm in NgramCounter.
        vocabulary = WordsList()
        for word in ("a", "b", "c"):
            vocabulary.add(word)

        counter = NgramCounter(1, vocabulary)
        counter.count(self.corpusfile)

        expected_counts = (
            ('a', 15),
            ('b', 10),
            ('c', 4),
            ('d', 0),
            (UNKSTAMP, 3),
            (START_SENT_SYMBOL, 0),
            (END_SENT_SYMBOL, 3),
        )
        for ngram, expected in expected_counts:
            self.assertEqual(counter.get_count(ngram), expected)
Exemple #4
0
 def testCount1(self):
     # Unigram counts for the corpus file, first counted once and then
     # passed twice to a fresh counter; the second set of expectations is
     # exactly double the first.
     single_pass = (
         ('a', 15),
         ('b', 10),
         ('c', 4),
         ('d', 3),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 3),
     )

     counter = NgramCounter()  # default is unigram
     counter.count(self.corpusfile)
     for ngram, expected in single_pass:
         self.assertEqual(counter.get_count(ngram), expected)

     # Same corpus given twice to a new, explicitly unigram, counter.
     counter = NgramCounter(1)
     counter.count(self.corpusfile, self.corpusfile)
     for ngram, expected in single_pass:
         self.assertEqual(counter.get_count(ngram), 2 * expected)
Exemple #5
0
 def testAppendSentence2(self):
     # Bigram counts after appending sent1 alone, then after also appending
     # sent2 and sent3.
     counter = NgramCounter(2)  # bigram
     counter.append_sentence(self.sent1)

     after_first_sentence = (
         ('a b', 3),
         ('b a', 2),
         ('a c', 1),
         ('a d', 0),
         (START_SENT_SYMBOL + ' a', 1),
         (START_SENT_SYMBOL + ' b', 0),
         ('a ' + END_SENT_SYMBOL, 0),
         ('b ' + END_SENT_SYMBOL, 1),
     )
     for bigram, expected in after_first_sentence:
         self.assertEqual(counter.get_count(bigram), expected)

     for sentence in (self.sent2, self.sent3):
         counter.append_sentence(sentence)

     after_all_sentences = (
         ('a b', 7),
         ('b a', 4),
         (START_SENT_SYMBOL + ' a', 3),
         ('b ' + END_SENT_SYMBOL, 3),
     )
     for bigram, expected in after_all_sentences:
         self.assertEqual(counter.get_count(bigram), expected)
Exemple #6
0
 def testAppendSentence1(self):
     # Unigram counts after appending sent1 alone (including the value
     # reported by get_ncount()), then after also appending sent2 and sent3.
     counter = NgramCounter()  # default is unigram
     counter.append_sentence(self.sent1)

     after_first_sentence = (
         ('a', 6),
         ('b', 4),
         ('c', 1),
         ('d', 0),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 1),
     )
     for ngram, expected in after_first_sentence:
         self.assertEqual(counter.get_count(ngram), expected)
     # 12 equals the sum of the per-token counts checked just above.
     self.assertEqual(counter.get_ncount(), 12)

     for sentence in (self.sent2, self.sent3):
         counter.append_sentence(sentence)

     after_all_sentences = (
         ('a', 15),
         ('b', 10),
         ('c', 4),
         ('d', 3),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 3),
     )
     for ngram, expected in after_all_sentences:
         self.assertEqual(counter.get_count(ngram), expected)