def testCount2(self):
    # Bigram counts gathered from the corpus file fixture, including the
    # bigrams that touch the sentence start/end markers.
    counter = NgramCounter(2)
    counter.count(self.corpusfile)

    expected_counts = (
        ('a b', 7),
        ('b a', 4),
        (START_SENT_SYMBOL+' a', 3),
        ('b '+END_SENT_SYMBOL, 3),
    )
    for ngram, expected in expected_counts:
        self.assertEqual(counter.get_count(ngram), expected)
def testShave(self):
    # Unigram counts after calling shave(4) on a counter filled from the
    # corpus file fixture.
    counter = NgramCounter(1)
    counter.count(self.corpusfile)
    counter.shave(4)

    # 'd' is expected to report 0 after shaving while 'c' (count 4) stays;
    # presumably shave(4) drops entries seen fewer than 4 times -- confirm
    # against NgramCounter.shave.
    expected_counts = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 0),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )
    for ngram, expected in expected_counts:
        self.assertEqual(counter.get_count(ngram), expected)
def testVocab(self):
    # Unigram counts when the counter is constructed with a WordsList
    # holding only "a", "b" and "c".
    vocabulary = WordsList()
    for word in ("a", "b", "c"):
        vocabulary.add(word)

    counter = NgramCounter(1,vocabulary)
    counter.count(self.corpusfile)

    # 'd' is not in the word list: it is expected to report 0 while
    # UNKSTAMP reports 3 (presumably out-of-vocabulary words are counted
    # under UNKSTAMP -- confirm against NgramCounter).
    expected_counts = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 0),
        (UNKSTAMP, 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )
    for ngram, expected in expected_counts:
        self.assertEqual(counter.get_count(ngram), expected)
def testCount1(self):
    # Unigram counts from the corpus file fixture: first counted once with
    # a default-constructed counter, then with the same file passed twice
    # to a single count() call on an explicit order-1 counter.

    def check(counter, expected_counts):
        # Assert each (ngram, expected) pair in order.
        for ngram, expected in expected_counts:
            self.assertEqual(counter.get_count(ngram), expected)

    single_pass = NgramCounter()  # default is unigram
    single_pass.count(self.corpusfile)
    check(single_pass, [
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    ])

    # Same file given twice: every expected count doubles.
    double_pass = NgramCounter(1)
    double_pass.count(self.corpusfile, self.corpusfile)
    check(double_pass, [
        ('a', 30),
        ('b', 20),
        ('c', 8),
        ('d', 6),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 6),
    ])
def testAppendSentence1(self):
    # Unigram counts built up sentence by sentence with append_sentence(),
    # checked after the first sentence and again after all three.
    counter = NgramCounter()  # default is unigram

    def check(expected_counts):
        # Assert each (ngram, expected) pair in order.
        for ngram, expected in expected_counts:
            self.assertEqual(counter.get_count(ngram), expected)

    counter.append_sentence(self.sent1)
    check([
        ('a', 6),
        ('b', 4),
        ('c', 1),
        ('d', 0),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 1),
    ])
    self.assertEqual(counter.get_ncount(), 12)

    for sentence in (self.sent2, self.sent3):
        counter.append_sentence(sentence)
    check([
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    ])
def testAppendSentence2(self):
    # Bigram counts built up sentence by sentence with append_sentence(),
    # checked after the first sentence and again after all three.
    counter = NgramCounter(2)  # bigram

    def check(expected_counts):
        # Assert each (ngram, expected) pair in order.
        for ngram, expected in expected_counts:
            self.assertEqual(counter.get_count(ngram), expected)

    counter.append_sentence(self.sent1)
    check([
        ('a b', 3),
        ('b a', 2),
        ('a c', 1),
        ('a d', 0),
        (START_SENT_SYMBOL+' a', 1),
        (START_SENT_SYMBOL+' b', 0),
        ('a '+END_SENT_SYMBOL, 0),
        ('b '+END_SENT_SYMBOL, 1),
    ])

    for sentence in (self.sent2, self.sent3):
        counter.append_sentence(sentence)
    check([
        ('a b', 7),
        ('b a', 4),
        (START_SENT_SYMBOL+' a', 3),
        ('b '+END_SENT_SYMBOL, 3),
    ])