Example #1
0
 def testCount2(self):
     # Count with an order-2 counter and spot-check a few bigrams, including
     # the ones that touch the sentence start/end markers.
     counter = NgramCounter(2)
     counter.count(self.corpusfile)

     expected_counts = (
         ('a b', 7),
         ('b a', 4),
         (START_SENT_SYMBOL + ' a', 3),
         ('b ' + END_SENT_SYMBOL, 3),
     )
     for bigram, frequency in expected_counts:
         self.assertEqual(counter.get_count(bigram), frequency)
Example #2
0
 def testShave(self):
     # Count unigrams, apply shave(4), then verify which counts remain.
     counter = NgramCounter(1)
     counter.count(self.corpusfile)
     counter.shave(4)

     # NOTE(review): 'd' is expected to read back as 0 after shaving while
     # END_SENT_SYMBOL still reads 3 -- presumably shave() treats the
     # sentence marker specially; confirm against the implementation.
     expected_counts = (
         ('a', 15),
         ('b', 10),
         ('c', 4),
         ('d', 0),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 3),
     )
     for token, frequency in expected_counts:
         self.assertEqual(counter.get_count(token), frequency)
Example #3
0
    def testVocab(self):
        # Restrict the counter to a vocabulary containing only a, b and c.
        vocabulary = WordsList()
        for word in ("a", "b", "c"):
            vocabulary.add(word)

        counter = NgramCounter(1, vocabulary)
        counter.count(self.corpusfile)

        # 'd' is not in the vocabulary: its own count is expected to be 0 and
        # UNKSTAMP is expected to read 3 (presumably the occurrences of 'd'
        # are tallied under UNKSTAMP -- verify against NgramCounter).
        expected_counts = (
            ('a', 15),
            ('b', 10),
            ('c', 4),
            ('d', 0),
            (UNKSTAMP, 3),
            (START_SENT_SYMBOL, 0),
            (END_SENT_SYMBOL, 3),
        )
        for token, frequency in expected_counts:
            self.assertEqual(counter.get_count(token), frequency)
Example #4
0
 def testCount1(self):
     # Phase 1: a counter built with no arguments (default is unigram),
     # fed the corpus once.
     single_pass = NgramCounter()
     single_pass.count(self.corpusfile)
     once = (
         ('a', 15),
         ('b', 10),
         ('c', 4),
         ('d', 3),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 3),
     )
     for token, frequency in once:
         self.assertEqual(single_pass.get_count(token), frequency)

     # Phase 2: an explicit order-1 counter fed the same corpus twice in one
     # call; every expected count below is exactly double the one above.
     double_pass = NgramCounter(1)
     double_pass.count(self.corpusfile, self.corpusfile)
     twice = (
         ('a', 30),
         ('b', 20),
         ('c', 8),
         ('d', 6),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 6),
     )
     for token, frequency in twice:
         self.assertEqual(double_pass.get_count(token), frequency)