def testCount2(self):
    # Bigram counting: build an order-2 counter over the fixture corpus
    # and check a handful of pairs, including ones that touch the
    # sentence-boundary symbols.
    counter = NgramCounter(2)
    counter.count(self.corpusfile)

    # (bigram, expected count) pairs, checked in order; the first
    # mismatch fails the test.
    expected_counts = (
        ('a b', 7),
        ('b a', 4),
        (START_SENT_SYMBOL+' a', 3),
        ('b '+END_SENT_SYMBOL, 3),
    )
    for bigram, expected in expected_counts:
        self.assertEqual(counter.get_count(bigram), expected)
def testShave(self):
    # Unigram counts after calling shave(4) on a counter filled from the
    # fixture corpus.
    # NOTE(review): presumably shave(4) discards n-grams seen fewer than
    # 4 times ('c' at 4 survives, 'd' reads 0) -- confirm against
    # NgramCounter.shave.
    counter = NgramCounter(1)
    counter.count(self.corpusfile)
    counter.shave(4)

    def expect(ngram, expected):
        self.assertEqual(counter.get_count(ngram), expected)

    expect('a', 15)
    expect('b', 10)
    expect('c', 4)
    expect('d', 0)
    expect(START_SENT_SYMBOL, 0)
    # NOTE(review): the end-of-sentence symbol is still expected at 3
    # after shaving -- looks exempt from the threshold; verify.
    expect(END_SENT_SYMBOL, 3)
def testVocab(self):
    # Unigram counting with an explicit vocabulary of "a", "b" and "c"
    # handed to the counter as its second argument.
    vocab = WordsList()
    for word in ("a", "b", "c"):
        vocab.add(word)

    counter = NgramCounter(1,vocab)
    counter.count(self.corpusfile)

    # 'd' is not in the vocabulary: it is expected to read 0 while
    # UNKSTAMP reads 3.
    # NOTE(review): presumably out-of-vocabulary words are folded into
    # UNKSTAMP -- confirm against NgramCounter.
    expected_counts = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 0),
        (UNKSTAMP, 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )
    for ngram, expected in expected_counts:
        self.assertEqual(counter.get_count(ngram), expected)
def testCount1(self):
    # Unigram counting, first over the fixture corpus once and then with
    # the same corpus passed twice to a single count() call; every
    # expected value in the second pass is double the first.
    unigrams = ('a', 'b', 'c', 'd', START_SENT_SYMBOL, END_SENT_SYMBOL)

    def check(counter, expected_counts):
        # Compare each unigram's count with its expected value, in order.
        for ngram, expected in zip(unigrams, expected_counts):
            self.assertEqual(counter.get_count(ngram), expected)

    counter = NgramCounter()  # default is unigram
    counter.count(self.corpusfile)
    check(counter, (15, 10, 4, 3, 0, 3))

    counter = NgramCounter(1)
    counter.count(self.corpusfile, self.corpusfile)
    check(counter, (30, 20, 8, 6, 0, 6))