def testCount2(self):
    """Check the order-2 counts obtained by counting the corpus file.

    Covers two word-to-word pairs plus one pair starting with the
    sentence-start symbol and one pair ending with the sentence-end symbol.
    """
    counter = sppasNgramCounter(2)
    counter.count(self.corpusfile)

    # (n-gram, expected count) pairs, checked in this order.
    expectations = (
        ('a b', 7),
        ('b a', 4),
        (START_SENT_SYMBOL + ' a', 3),
        ('b ' + END_SENT_SYMBOL, 3),
    )
    for ngram, nb_expected in expectations:
        self.assertEqual(counter.get_count(ngram), nb_expected)
def testCount2(self):
    """Verify counts of a counter of order 2 fed with the corpus file.

    The checked entries are 'a b', 'b a', the pair made of the
    sentence-start symbol followed by 'a', and the pair made of 'b'
    followed by the sentence-end symbol.
    """
    bigrams = sppasNgramCounter(2)
    bigrams.count(self.corpusfile)

    # Keys involving the sentence boundary symbols.
    start_then_a = START_SENT_SYMBOL + ' a'
    b_then_end = 'b ' + END_SENT_SYMBOL

    for key, wanted in (('a b', 7),
                        ('b a', 4),
                        (start_then_a, 3),
                        (b_then_end, 3)):
        self.assertEqual(bigrams.get_count(key), wanted)
def testCount1(self):
    """Check unigram counts with the corpus counted once, then twice.

    The first counter uses the default order and reads the corpus file
    once. The second one is created with order 1 and is given the same
    corpus file twice, so every expected count is doubled.
    """
    # Expected counts for a single pass over the corpus file.
    single_pass = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )

    unigrams = sppasNgramCounter()  # default is unigram
    unigrams.count(self.corpusfile)
    for token, nb in single_pass:
        self.assertEqual(unigrams.get_count(token), nb)

    # Same corpus given twice: 30, 20, 8, 6, 0 and 6 are expected.
    doubled = sppasNgramCounter(1)
    doubled.count(self.corpusfile, self.corpusfile)
    for token, nb in single_pass:
        self.assertEqual(doubled.get_count(token), 2 * nb)
def testShave(self):
    """Check unigram counts after calling shave(4) on the counted corpus.

    After shaving, 'a', 'b' and 'c' keep their counts (15, 10 and 4),
    'd' is expected to report 0, and the sentence-end symbol is still
    expected to report 3.
    """
    counter = sppasNgramCounter(1)
    counter.count(self.corpusfile)
    counter.shave(4)

    after_shave = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 0),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )
    for token, nb_expected in after_shave:
        self.assertEqual(counter.get_count(token), nb_expected)
def testVocab(self):
    """Check unigram counts when the counter is given a vocabulary.

    The vocabulary holds 'a', 'b' and 'c' only. 'd' is expected to report
    0 while ``symbols.unk`` is expected to report 3.
    """
    vocab = sppasVocabulary()
    for entry in ("a", "b", "c"):
        vocab.add(entry)

    counter = sppasNgramCounter(1, vocab)
    counter.count(self.corpusfile)

    expectations = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 0),
        (symbols.unk, 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )
    for token, nb_expected in expectations:
        self.assertEqual(counter.get_count(token), nb_expected)
def testVocab(self):
    """Verify unigram counts of a counter restricted to a vocabulary.

    Only 'a', 'b' and 'c' are added to the vocabulary. The test expects
    0 for 'd' and 3 for ``unk_stamp``.
    """
    known_words = sppasVocabulary()
    for word in ("a", "b", "c"):
        known_words.add(word)

    unigrams = sppasNgramCounter(1, known_words)
    unigrams.count(self.corpusfile)

    # In-vocabulary entries keep their counts.
    for word, wanted in (('a', 15), ('b', 10), ('c', 4)):
        self.assertEqual(unigrams.get_count(word), wanted)

    # Out-of-vocabulary entry, unknown stamp, then sentence boundaries.
    for key, wanted in (('d', 0),
                        (unk_stamp, 3),
                        (START_SENT_SYMBOL, 0),
                        (END_SENT_SYMBOL, 3)):
        self.assertEqual(unigrams.get_count(key), wanted)
def testAppendSentence2(self):
    """Check bigram counts while sentences are appended one by one.

    Counts are checked after appending the first sentence alone, then
    again after the second and third sentences have been appended.
    """
    counter = sppasNgramCounter(2)  # bigram

    # Stage 1: only the first sentence.
    counter.append_sentence(self.sent1)
    first_sentence = (
        ('a b', 3),
        ('b a', 2),
        ('a c', 1),
        ('a d', 0),
        (START_SENT_SYMBOL + ' a', 1),
        (START_SENT_SYMBOL + ' b', 0),
        ('a ' + END_SENT_SYMBOL, 0),
        ('b ' + END_SENT_SYMBOL, 1),
    )
    for ngram, nb_expected in first_sentence:
        self.assertEqual(counter.get_count(ngram), nb_expected)

    # Stage 2: the three sentences, counts are cumulative.
    for sentence in (self.sent2, self.sent3):
        counter.append_sentence(sentence)
    all_sentences = (
        ('a b', 7),
        ('b a', 4),
        (START_SENT_SYMBOL + ' a', 3),
        ('b ' + END_SENT_SYMBOL, 3),
    )
    for ngram, nb_expected in all_sentences:
        self.assertEqual(counter.get_count(ngram), nb_expected)
def testAppendSentence2(self):
    """Verify bigram counts after one, then three appended sentences.

    The first group of assertions runs once ``self.sent1`` is appended;
    the second group runs after ``self.sent2`` and ``self.sent3`` have
    been appended to the same counter.
    """
    bigrams = sppasNgramCounter(2)  # bigram

    # Keys built from the sentence boundary symbols.
    start_a = START_SENT_SYMBOL + ' a'
    start_b = START_SENT_SYMBOL + ' b'
    a_end = 'a ' + END_SENT_SYMBOL
    b_end = 'b ' + END_SENT_SYMBOL

    bigrams.append_sentence(self.sent1)
    for key, wanted in (('a b', 3), ('b a', 2), ('a c', 1), ('a d', 0),
                        (start_a, 1), (start_b, 0),
                        (a_end, 0), (b_end, 1)):
        self.assertEqual(bigrams.get_count(key), wanted)

    bigrams.append_sentence(self.sent2)
    bigrams.append_sentence(self.sent3)
    for key, wanted in (('a b', 7), ('b a', 4),
                        (start_a, 3), (b_end, 3)):
        self.assertEqual(bigrams.get_count(key), wanted)
def testAppendSentence1(self):
    """Check unigram counts while sentences are appended one by one.

    After the first sentence, the individual counts and the value of
    ``get_ncount()`` (12) are checked. The individual counts are checked
    again once the second and third sentences have been appended.
    """
    counter = sppasNgramCounter()  # default is unigram

    # Stage 1: only the first sentence.
    counter.append_sentence(self.sent1)
    first_sentence = (
        ('a', 6),
        ('b', 4),
        ('c', 1),
        ('d', 0),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 1),
    )
    for token, nb_expected in first_sentence:
        self.assertEqual(counter.get_count(token), nb_expected)
    # 12 matches the sum of the counts above: 6 + 4 + 1 + 0 + 0 + 1.
    self.assertEqual(counter.get_ncount(), 12)

    # Stage 2: the three sentences, counts are cumulative.
    for sentence in (self.sent2, self.sent3):
        counter.append_sentence(sentence)
    all_sentences = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )
    for token, nb_expected in all_sentences:
        self.assertEqual(counter.get_count(token), nb_expected)
def testInit(self):
    """Check that invalid n-gram orders are rejected at construction.

    Each order is checked in its own ``assertRaises`` context. When both
    constructor calls share a single context, the first one raises and
    leaves the block, so the second call is never executed and the upper
    bound is never actually tested.

    Raises:
        AssertionError: if creating a counter with one of the orders does
            not raise NgramOrderValueError.
    """
    # NOTE(review): assumes 100 exceeds the maximum supported order and
    # must be rejected like 0 -- confirm against the counter's limits.
    for invalid_order in (0, 100):
        with self.assertRaises(NgramOrderValueError):
            sppasNgramCounter(invalid_order)