class BLEUTestCases(unittest.TestCase):
    """Test BLEU functions.

    abydos.distance.BLEU
    """

    cmp = BLEU()
    cmp_skip_saps = BLEU(
        tokenizers=[QSkipgrams(), SAPSTokenizer()], weights=[0.33, 0.67]
    )

    def test_bleu_sim(self):
        """Test abydos.distance.BLEU.sim."""
        # Base cases: any empty operand scores 0; identity scores 1.
        for src, tar, expected in (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        ):
            self.assertEqual(self.cmp.sim(src, tar), expected)

        # Non-trivial pairs, checked to 7 decimal places.
        for src, tar, expected in (
            ('Nigel', 'Niall', 0.6223329773),
            ('Niall', 'Nigel', 0.6223329773),
            ('Colin', 'Coiln', 0.7071067812),
            ('Coiln', 'Colin', 0.7071067812),
            ('ATCAACGAGT', 'AACGATTAG', 0.5119598032),
        ):
            self.assertAlmostEqual(self.cmp.sim(src, tar), expected)

        # Weighted multi-tokenizer variant.
        self.assertAlmostEqual(
            self.cmp_skip_saps.sim('Nigel', 'Niall'), 0.7828303104
        )

    def test_bleu_dist(self):
        """Test abydos.distance.BLEU.dist."""
        # Base cases: dist is the complement of sim.
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 1.0),
        ):
            self.assertEqual(self.cmp.dist(src, tar), expected)

        for src, tar, expected in (
            ('Nigel', 'Niall', 0.3776670227),
            ('Niall', 'Nigel', 0.3776670227),
            ('Colin', 'Coiln', 0.2928932188),
            ('Coiln', 'Colin', 0.2928932188),
            ('ATCAACGAGT', 'AACGATTAG', 0.4880401968),
        ):
            self.assertAlmostEqual(self.cmp.dist(src, tar), expected)
def test_pearson_chi_squared_sim_score(self):
    """Test abydos.distance.PearsonChiSquared.sim_score."""
    # Base cases with the default alphabet.
    for src, tar, score in (
        ('', '', 784.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 784.0),
        ('abcd', 'efgh', 0.032298410951138765),
    ):
        self.assertEqual(self.cmp.sim_score(src, tar), score)

    for src, tar, score in (
        ('Nigel', 'Niall', 192.9885210909),
        ('Niall', 'Nigel', 192.9885210909),
        ('Colin', 'Coiln', 192.9885210909),
        ('Coiln', 'Colin', 192.9885210909),
        ('ATCAACGAGT', 'AACGATTAG', 344.5438630111),
    ):
        self.assertAlmostEqual(self.cmp.sim_score(src, tar), score)

    # Tests with alphabet=0 (no d factor)
    for src, tar, score in (
        ('', '', 0.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 4.0),
        ('abcd', 'efgh', 10.0),
    ):
        self.assertEqual(self.cmp_no_d.sim_score(src, tar), score)

    for src, tar, score in (
        ('Nigel', 'Niall', 2.25),
        ('Niall', 'Nigel', 2.25),
        ('Colin', 'Coiln', 2.25),
        ('Coiln', 'Colin', 2.25),
        ('ATCAACGAGT', 'AACGATTAG', 1.5272727273),
    ):
        self.assertAlmostEqual(self.cmp_no_d.sim_score(src, tar), score)

    # Degenerate alphabet/tokenizer combination still returns 0.
    ssk_cmp = PearsonChiSquared(
        alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK')
    )
    self.assertEqual(ssk_cmp.sim_score('a', 'eh'), 0.0)
def test_koppen_i_corr(self):
    """Test abydos.distance.KoppenI.corr."""
    # Base cases with the default alphabet.
    for src, tar, corr in (
        ('', '', 1.0),
        ('a', '', -0.0012771392081735637),
        ('', 'a', -0.0012771392081735637),
        ('abc', '', -0.002557544757033164),
        ('', 'abc', -0.002557544757033164),
        ('abc', 'abc', 1.0),
        ('abcd', 'efgh', -0.006418485237483896),
    ):
        self.assertEqual(self.cmp.corr(src, tar), corr)

    for src, tar, corr in (
        ('Nigel', 'Niall', 0.4942159383),
        ('Niall', 'Nigel', 0.4942159383),
        ('Colin', 'Coiln', 0.4942159383),
        ('Coiln', 'Colin', 0.4942159383),
        ('ATCAACGAGT', 'AACGATTAG', 0.6591251885),
    ):
        self.assertAlmostEqual(self.cmp.corr(src, tar), corr)

    # Tests with alphabet=0 (no d factor)
    for src, tar, corr in (
        ('', '', 1.0),
        ('a', '', -1.0),
        ('', 'a', -1.0),
        ('abc', '', -1.0),
        ('', 'abc', -1.0),
        ('abc', 'abc', 1.0),
        ('abcd', 'efgh', -1.0),
    ):
        self.assertEqual(self.cmp_no_d.corr(src, tar), corr)

    for src, tar, corr in (
        ('Nigel', 'Niall', -1.0),
        ('Niall', 'Nigel', -1.0),
        ('Colin', 'Coiln', -1.0),
        ('Coiln', 'Colin', -1.0),
        ('ATCAACGAGT', 'AACGATTAG', -1.0),
    ):
        self.assertAlmostEqual(self.cmp_no_d.corr(src, tar), corr)

    # Degenerate alphabet/tokenizer combination.
    ssk_cmp = KoppenI(
        alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK')
    )
    self.assertEqual(ssk_cmp.corr('eh', 'a'), 0.0)
def test_scott_pi_corr(self):
    """Test abydos.distance.ScottPi.corr."""
    # Base cases with the default alphabet.
    for src, tar, corr in (
        ('', '', 1.0),
        ('a', '', -0.0012771392081137526),
        ('', 'a', -0.0012771392081137526),
        ('abc', '', -0.0025575447570442954),
        ('', 'abc', -0.0025575447570442954),
        ('abc', 'abc', 1.0),
        ('abcd', 'efgh', -0.006418485237489689),
    ):
        self.assertEqual(self.cmp.corr(src, tar), corr)

    for src, tar, corr in (
        ('Nigel', 'Niall', 0.4961439589),
        ('Niall', 'Nigel', 0.4961439589),
        ('Colin', 'Coiln', 0.4961439589),
        ('Coiln', 'Colin', 0.4961439589),
        ('ATCAACGAGT', 'AACGATTAG', 0.6621417798),
    ):
        self.assertAlmostEqual(self.cmp.corr(src, tar), corr)

    # Tests with alphabet=0 (no d factor)
    for src, tar, corr in (
        ('', '', 1.0),
        ('a', '', -1.0),
        ('', 'a', -1.0),
        ('abc', '', -1.0),
        ('', 'abc', -1.0),
        ('abc', 'abc', 1.0),
        ('abcd', 'efgh', -1.0),
    ):
        self.assertEqual(self.cmp_no_d.corr(src, tar), corr)

    for src, tar, corr in (
        ('Nigel', 'Niall', -0.5),
        ('Niall', 'Nigel', -0.5),
        ('Colin', 'Coiln', -0.5),
        ('Coiln', 'Colin', -0.5),
        ('ATCAACGAGT', 'AACGATTAG', -0.3333333333),
    ):
        self.assertAlmostEqual(self.cmp_no_d.corr(src, tar), corr)

    # Degenerate alphabet/tokenizer combination.
    ssk_cmp = ScottPi(
        alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK')
    )
    self.assertEqual(ssk_cmp.corr('eh', 'a'), 0.0)
def test_digby_corr(self):
    """Test abydos.distance.Digby.corr."""
    # Base cases with the default alphabet.
    for src, tar, corr in (
        ('', '', 1.0),
        ('a', '', -1.0),
        ('', 'a', -1.0),
        ('abc', '', -1.0),
        ('', 'abc', -1.0),
        ('abc', 'abc', 1.0),
        ('abcd', 'efgh', -1.0),
    ):
        self.assertEqual(self.cmp.corr(src, tar), corr)

    for src, tar, corr in (
        ('Nigel', 'Niall', 0.9694362533),
        ('Niall', 'Nigel', 0.9694362533),
        ('Colin', 'Coiln', 0.9694362533),
        ('Coiln', 'Colin', 0.9694362533),
        ('ATCAACGAGT', 'AACGATTAG', 0.9797093576),
    ):
        self.assertAlmostEqual(self.cmp.corr(src, tar), corr)

    # Tests with alphabet=0 (no d factor)
    for src, tar, corr in (
        ('', '', 1.0),
        ('a', '', -1.0),
        ('', 'a', -1.0),
        ('abc', '', -1.0),
        ('', 'abc', -1.0),
        ('abc', 'abc', 1.0),
        ('abcd', 'efgh', -1.0),
    ):
        self.assertEqual(self.cmp_no_d.corr(src, tar), corr)

    for src, tar, corr in (
        ('Nigel', 'Niall', -1.0),
        ('Niall', 'Nigel', -1.0),
        ('Colin', 'Coiln', -1.0),
        ('Coiln', 'Colin', -1.0),
        ('ATCAACGAGT', 'AACGATTAG', -1.0),
    ):
        self.assertAlmostEqual(self.cmp_no_d.corr(src, tar), corr)

    # Degenerate alphabet/tokenizer combination.
    ssk_cmp = Digby(
        alphabet=0, tokenizer=QSkipgrams(qval=2, scaler='SSK')
    )
    self.assertEqual(ssk_cmp.corr('a', 'eh'), 0.0)
def test_baulieu_xi_dist(self):
    """Test abydos.distance.BaulieuXI.dist."""
    # Base cases with the default alphabet.
    for src, tar, dist in (
        ('', '', 0.0),
        ('a', '', 0.002551020408163265),
        ('', 'a', 0.002551020408163265),
        ('abc', '', 0.00510204081632653),
        ('', 'abc', 0.00510204081632653),
        ('abc', 'abc', 0.0),
        ('abcd', 'efgh', 0.012755102040816327),
    ):
        self.assertEqual(self.cmp.dist(src, tar), dist)

    for src, tar, dist in (
        ('Nigel', 'Niall', 0.0076824584),
        ('Niall', 'Nigel', 0.0076824584),
        ('Colin', 'Coiln', 0.0076824584),
        ('Coiln', 'Colin', 0.0076824584),
        ('ATCAACGAGT', 'AACGATTAG', 0.009009009),
    ):
        self.assertAlmostEqual(self.cmp.dist(src, tar), dist)

    # Degenerate alphabet/tokenizer combination.
    ssk_cmp = BaulieuXI(
        alphabet=None, tokenizer=QSkipgrams(qval=2, scaler='SSK')
    )
    self.assertEqual(ssk_cmp.dist('a', 'eh'), 0.0)
def test_corpus(self):
    """Test abydos.corpus.Corpus."""
    # base cases: no text or whitespace-only text gives an empty corpus
    self.assertEqual(Corpus().corpus, [])
    for blank in ('', ' ', '\n', ' \n', ' \n '):
        self.assertEqual(Corpus(blank).corpus, [])

    # one document/one sentence
    self.assertEqual(Corpus('a').corpus, [[['a']]])
    self.assertEqual(Corpus('ab ab').corpus, [[['ab', 'ab']]])
    self.assertEqual(
        Corpus('abc def ghi').corpus, [[['abc', 'def', 'ghi']]]
    )

    # multiple documents (one sentence each)
    two_docs = [[['abc']], [['def', 'ghi']]]
    self.assertEqual(Corpus('abc\n\ndef ghi').corpus, two_docs)
    self.assertEqual(Corpus('abc\n\ndef ghi\n\n').corpus, two_docs)
    self.assertEqual(Corpus('\n\nabc\r\n\ndef ghi\n\n').corpus, two_docs)

    # one document (multiple sentences each)
    one_doc = [[['abc'], ['def', 'ghi']]]
    self.assertEqual(Corpus('abc\n def ghi').corpus, one_doc)
    self.assertEqual(Corpus('abc\n def ghi\n').corpus, one_doc)
    self.assertEqual(Corpus('\nabc\n def ghi\n').corpus, one_doc)

    # multiple documents (multiple sentences each)
    self.assertEqual(
        Corpus('abc\n abc def\n\n\ndef ghi\n jkl\n').corpus,
        [[['abc'], ['abc', 'def']], [['def', 'ghi'], ['jkl']]],
    )

    # sentence(s) with ignorables
    self.assertEqual(
        Corpus('abc\nd-ef ghi\n', filter_chars='.-').corpus,
        [[['abc'], ['def', 'ghi']]],
    )
    self.assertEqual(
        Corpus('abc\n\n\nd-ef ghi\n\n\n', filter_chars='.-').corpus,
        [[['abc']], [['def', 'ghi']]],
    )
    self.assertEqual(
        Corpus(
            '\n\nabc\r\n\ndef ghi.\n\na b c d e f g.\n\n\n',
            filter_chars='.-',
        ).corpus,
        [
            [['abc']],
            [['def', 'ghi']],
            [['a', 'b', 'c', 'd', 'e', 'f', 'g']],
        ],
    )

    # sentences with stopword removal
    self.assertEqual(
        Corpus(
            'The quick brown fox jumped over the lazy dog',
            stop_words=('The', 'the'),
        ).corpus,
        [[['quick', 'brown', 'fox', 'jumped', 'over', 'lazy', 'dog']]],
    )
    self.assertEqual(
        Corpus('a ab abc def', stop_words=('A', 'a')).corpus,
        [[['ab', 'abc', 'def']]],
    )

    # alternate document divider
    self.assertEqual(
        Corpus(
            'The quick brown@ fox jumped over@the lazy dog',
            doc_split='@',
        ).corpus,
        [
            [['The', 'quick', 'brown']],
            [['fox', 'jumped', 'over']],
            [['the', 'lazy', 'dog']],
        ],
    )

    # alternate sentence divider
    self.assertEqual(
        Corpus(
            'The quick brown$ fox jumped over$the lazy dog',
            sent_split='$',
        ).corpus,
        [
            [
                ['The', 'quick', 'brown'],
                ['fox', 'jumped', 'over'],
                ['the', 'lazy', 'dog'],
            ]
        ],
    )

    # both dividers at once
    self.assertEqual(
        Corpus(
            'The quick brown$ fox jumped over@the lazy dog',
            doc_split='@',
            sent_split='$',
        ).corpus,
        [
            [['The', 'quick', 'brown'], ['fox', 'jumped', 'over']],
            [['the', 'lazy', 'dog']],
        ],
    )

    # sentence-start marker as divider, end marker as stopword
    self.assertEqual(
        Corpus(
            '<BOS> The quick brown <EOS>'
            '<BOS> fox jumped over the lazy dog <EOS>',
            sent_split='<BOS>',
            stop_words=['<EOS>'],
        ).corpus,
        [
            [
                ['The', 'quick', 'brown'],
                ['fox', 'jumped', 'over', 'the', 'lazy', 'dog'],
            ]
        ],
    )

    # alternate word tokenizer
    self.assertEqual(
        Corpus(
            'quick', word_tokenizer=QSkipgrams(qval=3, start_stop='')
        ).corpus,
        [
            [
                [
                    'qui', 'quc', 'quk', 'qic', 'qik',
                    'qck', 'uic', 'uik', 'uck', 'ick',
                ]
            ]
        ],
    )
def test_token_distance(self):
    """Test abydos.distance._TokenDistance members.

    Exercises the shared token-distance machinery (tokenization,
    alphabet handling, set operations on token Counters) through the
    public Jaccard/SokalMichener/AverageLinkage subclasses and, below,
    directly through private _TokenDistance accessors.
    """
    # Alternate alphabet specifications: int, str, tokenizer+str, None.
    self.assertAlmostEqual(
        Jaccard(intersection_type='soft', alphabet=24).sim(
            'ATCAACGAGT', 'AACGATTAG'
        ),
        0.68,
    )
    self.assertAlmostEqual(
        Jaccard(qval=1, alphabet='CGAT').sim('ATCAACGAGT', 'AACGATTAG'),
        0.9,
    )
    self.assertAlmostEqual(
        Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT').sim(
            'ATCAACGAGT', 'AACGATTAG'
        ),
        0.6372795969773299,
    )
    self.assertAlmostEqual(
        Jaccard(alphabet=None).sim('synonym', 'antonym'),
        0.3333333333333333,
    )
    self.assertAlmostEqual(
        Jaccard(tokenizer=QSkipgrams(qval=3)).sim('synonym', 'antonym'),
        0.34146341463414637,
    )

    # Pre-tokenized Counter inputs are accepted directly.
    src_ctr = Counter({'a': 5, 'b': 2, 'c': 10})
    tar_ctr = Counter({'a': 2, 'c': 1, 'd': 3, 'e': 12})
    self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

    # Each supported normalizer; unknown name falls back to base case.
    self.assertAlmostEqual(
        SokalMichener(normalizer='proportional').sim('synonym', 'antonym'),
        0.984777917351113,
    )
    self.assertAlmostEqual(
        SokalMichener(normalizer='log').sim('synonym', 'antonym'),
        1.2385752469545532,
    )
    self.assertAlmostEqual(
        SokalMichener(normalizer='exp', alphabet=0).sim(
            'synonym', 'antonym'
        ),
        3.221246147982545e18,
    )
    self.assertAlmostEqual(
        SokalMichener(normalizer='laplace').sim('synonym', 'antonym'),
        0.98856416772554,
    )
    self.assertAlmostEqual(
        SokalMichener(normalizer='inverse').sim('synonym', 'antonym'),
        197.95790155440417,
    )
    self.assertAlmostEqual(
        SokalMichener(normalizer='complement').sim('synonym', 'antonym'),
        1.0204081632653061,
    )
    self.assertAlmostEqual(
        SokalMichener(normalizer='base case').sim('synonym', 'antonym'),
        0.9897959183673469,
    )
    self.assertAlmostEqual(
        SokalMichener().sim('synonym', 'antonym'), 0.9897959183673469
    )

    # Private accessors after an explicit _tokenize call.
    sm = SokalMichener()
    sm._tokenize('synonym', 'antonym')  # noqa: SF01
    self.assertEqual(
        sm._get_tokens(),  # noqa: SF01
        (
            Counter(
                {
                    '$s': 1, 'sy': 1, 'yn': 1, 'no': 1,
                    'on': 1, 'ny': 1, 'ym': 1, 'm#': 1,
                }
            ),
            Counter(
                {
                    '$a': 1, 'an': 1, 'nt': 1, 'to': 1,
                    'on': 1, 'ny': 1, 'ym': 1, 'm#': 1,
                }
            ),
        ),
    )
    self.assertEqual(sm._src_card(), 8)  # noqa: SF01
    self.assertEqual(sm._tar_card(), 8)  # noqa: SF01
    self.assertEqual(
        sm._symmetric_difference(),  # noqa: SF01
        Counter(
            {
                '$s': 1, 'sy': 1, 'yn': 1, 'no': 1,
                '$a': 1, 'an': 1, 'nt': 1, 'to': 1,
            }
        ),
    )
    self.assertEqual(sm._symmetric_difference_card(), 8)  # noqa: SF01
    self.assertEqual(sm._total_complement_card(), 772)  # noqa: SF01
    self.assertEqual(sm._population_card(), 788)  # noqa: SF01
    self.assertEqual(
        sm._union(),  # noqa: SF01
        Counter(
            {
                '$s': 1, 'sy': 1, 'yn': 1, 'no': 1,
                'on': 1, 'ny': 1, 'ym': 1, 'm#': 1,
                '$a': 1, 'an': 1, 'nt': 1, 'to': 1,
            }
        ),
    )
    self.assertEqual(sm._union_card(), 12)  # noqa: SF01
    # _difference is signed: src-only +1, shared 0, tar-only -1.
    self.assertEqual(
        sm._difference(),  # noqa: SF01
        Counter(
            {
                '$s': 1, 'sy': 1, 'yn': 1, 'no': 1,
                'on': 0, 'ny': 0, 'ym': 0, 'm#': 0,
                '$a': -1, 'an': -1, 'nt': -1, 'to': -1,
            }
        ),
    )
    self.assertEqual(
        sm._intersection(),  # noqa: SF01
        Counter({'on': 1, 'ny': 1, 'ym': 1, 'm#': 1}),
    )
    self.assertEqual(
        sm._get_confusion_table(),  # noqa: SF01
        ConfusionTable(tp=4, tn=772, fp=4, fn=4),
    )

    # Counter-valued alphabet (per-symbol capacities).
    sm = SokalMichener(
        alphabet=Counter({'C': 20, 'G': 20, 'A': 20, 'T': 20}), qval=1
    )
    sm._tokenize('ATCAACGAGT', 'AACGATTAG')  # noqa: SF01
    self.assertEqual(sm._total_complement_card(), 61)  # noqa: SF01

    # Linkage-based intersection with internal assignment problem.
    jac = Jaccard(
        intersection_type='linkage', internal_assignment_problem=True
    )
    self.assertAlmostEqual(jac.sim('abandonned', 'abandoned'), 1.0)
    self.assertAlmostEqual(
        jac.sim('abundacies', 'abundances'), 0.6296296296296297
    )

    # Some additional constructors needed to complete test coverage
    self.assertAlmostEqual(
        Jaccard(alphabet=None, qval=range(2, 4)).sim('abc', 'abcd'),
        0.42857142857142855,
    )
    self.assertAlmostEqual(
        AverageLinkage(qval=range(2, 4)).sim('abc', 'abcd'),
        0.22558922558922556,
    )
    self.assertAlmostEqual(
        Jaccard(alphabet='abcdefghijklmnop', qval=range(2, 4)).sim(
            'abc', 'abcd'
        ),
        0.42857142857142855,
    )
    self.assertAlmostEqual(
        Jaccard(
            alphabet='abcdefghijklmnop', tokenizer=WhitespaceTokenizer()
        ).sim('abc', 'abcd'),
        0.0,
    )
    self.assertAlmostEqual(
        Jaccard(alphabet=list('abcdefghijklmnop')).sim('abc', 'abcd'), 0.5
    )
    self.assertAlmostEqual(
        Jaccard(tokenizer=CharacterTokenizer()).sim('abc', 'abcd'), 0.75
    )
class UnigramCorpusTestCases(unittest.TestCase):
    """Test abydos.corpus.UnigramCorpus."""

    # Corpus imported once from the Google-ngram-style fixture file.
    simple_corpus = UnigramCorpus()
    simple_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # Same fixture imported twice; counts should accumulate.
    double_corpus = UnigramCorpus()
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # Excerpt of the 2015 State of the Union address, used as a
    # free-text corpus source.
    # NOTE(review): line-continuation positions inside this literal were
    # reconstructed from a whitespace-mangled source — confirm the exact
    # spacing against upstream before relying on token counts.
    sotu2015_sample = "Mr. Speaker, Mr. Vice President, Members of Congress,\
 my fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\
 years that dawned with terror touching our shores; that unfolded with a\
 new generation fighting two long and costly wars; that saw a vicious\
 recession spread across our nation and the world.\n It has been, and still\
 is, a hard time for many.\n\nBut tonight, we turn the page.\n Tonight,\
 after a breakthrough year for America, our economy is growing and creating\
 jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\
 than it was before the financial crisis.\n More of our kids are graduating\
 than ever before.\n More of our people are insured than ever before.\n And\
 we are as free from the grip of foreign oil as we've been in almost 30\
 years.\n\nTonight, for the first time since 9/11, our combat mission in\
 Afghanistan is over.\n Six years ago, nearly 180,000 American troops\
 served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\
 we salute the courage and sacrifice of every man and woman in this 9/11\
 Generation who has served to keep us safe.\n We are humbled and grateful\
 for your service.\n\nAmerica, for all that we have endured; for all the\
 grit and hard work required to come back; for all the tasks that lie\
 ahead, know this: The shadow of crisis has passed, and the State of the\
 Union is strong.\n\nAt this moment -- with a growing economy, shrinking\
 deficits, bustling industry, booming energy production -- we have risen\
 from recession freer to write our own future than any other nation on\
 Earth.\n It's now up to us to choose who we want to be over the next 15\
 years and for decades to come.\n\nWill we accept an economy where only a\
 few of us do spectacularly well?\n Or will we commit ourselves to an\
 economy that generates rising incomes and chances for everyone who makes\
 the effort?\n\nWill we approach the world fearful and reactive, dragged\
 into costly conflicts that strain our military and set back our\
 standing?\n Or will we lead wisely, using all elements of our power to\
 defeat new threats and protect our planet?\n\nWill we allow ourselves to\
 be sorted into factions and turned against one another?\n Or will we\
 recapture the sense of common purpose that has always propelled America\
 forward?\n\nIn two weeks, I will send this Congress a budget filled with\
 ideas that are practical, not partisan.\n And in the months ahead, I'll\
 crisscross the country making a case for those ideas.\n So tonight, I want\
 to focus less on a checklist of proposals, and focus more on the values at\
 stake in the choices before us."

    sotu2015_corpus = UnigramCorpus(sotu2015_sample)
    # Corpus that Soundex-encodes each word before counting.
    sdx_corpus = UnigramCorpus(word_transform=Soundex().encode)
    # Corpus tokenized into 3-skipgrams (no start/stop symbols).
    qsg_corpus = UnigramCorpus(
        word_tokenizer=QSkipgrams(qval=3, start_stop='')
    )
    # Fixture with part-of-speech-tagged ngrams (tags joined by '_').
    pos_corpus = UnigramCorpus()
    pos_corpus.gng_importer(_corpus_file('simple-ngrams-pos.txt'))

    def test_unigram_corpus_init(self):
        """Test abydos.corpus.UnigramCorpus.__init__."""
        self.assertIsInstance(UnigramCorpus(), UnigramCorpus)
        self.assertIsInstance(self.sotu2015_corpus, UnigramCorpus)

    def test_unigram_corpus_gng_importer(self):
        """Test abydos.corpus.UnigramCorpus.gng_importer."""
        self.assertIsInstance(self.simple_corpus, UnigramCorpus)
        self.assertIsInstance(self.simple_corpus.corpus, defaultdict)

        # skip tests of UnigramCorpus on Python < 3.6 (lack ordered dict)
        if sys.version_info < (3, 6):
            return

        # Soundex-transformed corpus: keys are Soundex codes and the
        # expected list relies on 3.6+ insertion-ordered dicts.
        self.sdx_corpus.gng_importer('tests/corpora/simple-ngrams.txt')
        self.assertEqual(
            list(self.sdx_corpus.corpus.items()),
            [
                ('T000', (20, 20)),
                ('Q200', (2, 2)),
                ('B650', (3, 3)),
                ('F200', (1, 1)),
                ('J513', (4, 4)),
                ('O160', (6, 6)),
                ('L200', (1, 1)),
                ('D200', (5, 5)),
                ('T220', (2, 2)),
                ('Q216', (1, 1)),
                ('B651', (1, 1)),
                ('F251', (1, 1)),
                ('O163', (3, 3)),
                ('T420', (2, 2)),
                ('L232', (1, 1)),
            ],
        )

        # Skipgram-tokenized corpus; sampled every other item.
        self.qsg_corpus.gng_importer('tests/corpora/simple-ngrams.txt')
        self.assertEqual(
            list(self.qsg_corpus.corpus.items())[:30:2],
            [
                ('the', (27, 27)),
                ('quc', (5, 5)),
                ('qic', (5, 5)),
                ('qck', (5, 5)),
                ('uik', (5, 5)),
                ('ick', (5, 5)),
                ('brw', (5, 5)),
                ('bow', (5, 5)),
                ('bwn', (5, 5)),
                ('ron', (5, 5)),
                ('own', (5, 5)),
                ('jum', (5, 5)),
                ('jue', (6, 5)),
                ('jmp', (5, 5)),
                ('jmd', (5, 5)),
            ],
        )

        # POS tags ('word_TAG') must be stripped during import.
        for term, _ in self.pos_corpus.corpus.items():
            self.assertTrue('_' not in term)

    def test_unigram_corpus_save_load_corpus(self):
        """Test abydos.corpus.UnigramCorpus.save_corpus & .load_corpus."""
        # Round-trip through a temp file; only checks the file is
        # non-empty and the reload succeeds.
        handle, path = tempfile.mkstemp('.dat')
        self.sotu2015_corpus.save_corpus(path)
        self.sotu2015_corpus.load_corpus(path)
        statinfo = os.stat(path)
        self.assertGreater(statinfo.st_size, 0)
        os.close(handle)
        os.remove(path)

    def test_unigram_corpus_idf(self):
        """Test abydos.corpus.UnigramCorpus.idf."""
        # string-style tests
        self.assertAlmostEqual(self.simple_corpus.idf('the'), 0.69314718056)
        self.assertAlmostEqual(self.simple_corpus.idf('quick'), 2.3978952728)
        # Unknown terms get infinite IDF.
        self.assertAlmostEqual(
            self.simple_corpus.idf('trolley'), float('inf')
        )
def test_qskipgrams(self):
    """Test abydos.tokenizer.QSkipgrams.

    Covers default start/stop symbols ('$' prefix, '#' suffix), qval
    variants (scalar, tuple, including 0 and 1), and the SSK, length,
    length-log, length-exp scalers. Gold Counter values are regression
    fixtures.
    """
    # Degenerate inputs.
    self.assertEqual(sorted(QSkipgrams().tokenize('').get_list()), [])
    self.assertEqual(
        sorted(QSkipgrams(start_stop='').tokenize('a').get_list()), []
    )
    self.assertEqual(
        sorted(QSkipgrams().tokenize('a').get_list()), ['$#', '$a', 'a#']
    )
    self.assertEqual(
        sorted(QSkipgrams().tokenize('ab').get_list()),
        sorted(['$a', '$b', '$#', 'ab', 'a#', 'b#']),
    )
    self.assertEqual(
        sorted(QSkipgrams().tokenize('NELSON').get_list()),
        sorted(
            [
                '$N', '$E', '$L', '$S', '$O', '$N', '$#',
                'NE', 'NL', 'NS', 'NO', 'NN', 'N#',
                'EL', 'ES', 'EO', 'EN', 'E#',
                'LS', 'LO', 'LN', 'L#',
                'SO', 'SN', 'S#',
                'ON', 'O#',
                'N#',
            ]
        ),
    )
    self.assertEqual(
        sorted(QSkipgrams().tokenize('NEILSEN').get_list()),
        sorted(
            [
                '$N', '$E', '$I', '$L', '$S', '$E', '$N', '$#',
                'NE', 'NI', 'NL', 'NS', 'NE', 'NN', 'N#',
                'EI', 'EL', 'ES', 'EE', 'EN', 'E#',
                'IL', 'IS', 'IE', 'IN', 'I#',
                'LS', 'LE', 'LN', 'L#',
                'SE', 'SN', 'S#',
                'EN', 'E#',
                'N#',
            ]
        ),
    )
    self.assertEqual(
        sorted(QSkipgrams(qval=1).tokenize('NEILSEN').get_list()),
        sorted(['N', 'E', 'I', 'L', 'S', 'E', 'N']),
    )

    # SSK scaler: geometric decay by skip distance.
    test_counter = (
        QSkipgrams(qval=(2,), scaler='SSK').tokenize('NEILSEN').get_counter()
    )
    gold_counter = Counter(
        {
            '$N': 1.2404672100000003, '$E': 1.2072969000000002,
            '$I': 0.6561, '$L': 0.5904900000000001,
            '$S': 0.531441, '$#': 0.3874204890000001,
            'NE': 1.341441, 'NI': 0.7290000000000001,
            'NL': 0.6561, 'NS': 0.5904900000000001,
            'NN': 0.4782969000000001, 'N#': 1.2404672100000003,
            'EI': 0.81, 'EL': 0.7290000000000001,
            'ES': 0.6561, 'EE': 0.5904900000000001,
            'EN': 1.341441, 'E#': 1.2072969000000002,
            'IL': 0.81, 'IS': 0.7290000000000001,
            'IE': 0.6561, 'IN': 0.5904900000000001,
            'I#': 0.531441,
            'LS': 0.81, 'LE': 0.7290000000000001,
            'LN': 0.6561, 'L#': 0.5904900000000001,
            'SE': 0.81, 'SN': 0.7290000000000001,
            'S#': 0.6561,
        }
    )
    for key in gold_counter.keys():
        self.assertAlmostEqual(gold_counter[key], test_counter[key])

    # Tuple qval with SSK, including qval values of 1 and 0 (0 is a
    # no-op; 1 yields unigram counts at the end of the gold Counter).
    test_counter = (
        QSkipgrams(qval=(4, 6, 5, 1, 0), scaler='SSK')
        .tokenize('NIALL')
        .get_counter()
    )
    gold_counter = Counter(
        {
            # qval=4 tokens
            '$$$N': 0.531441, '$$$I': 0.4782969000000001,
            '$$$A': 0.4304672100000001, '$$$L': 0.7360989291000002,
            '$$$#': 0.8504267154039002,
            '$$NI': 1.4880348000000003, '$$NA': 1.3392313200000003,
            '$$NL': 2.2900855572000007, '$$N#': 2.645772003478801,
            '$$IA': 1.3392313200000003, '$$IL': 2.2900855572000007,
            '$$I#': 2.645772003478801, '$$AL': 2.2900855572000007,
            '$$A#': 2.645772003478801, '$$LL': 1.0847773692000002,
            '$$L#': 5.291544006957601, '$$##': 2.460275073345601,
            '$NIA': 1.4402051100000002, '$NIL': 2.462750738100001,
            '$NI#': 2.845254813264901, '$NAL': 2.462750738100001,
            '$NA#': 2.845254813264901, '$NLL': 1.1665661391000004,
            '$NL#': 5.690509626529802, '$N##': 2.645772003478801,
            '$IAL': 2.462750738100001, '$IA#': 2.845254813264901,
            '$ILL': 1.1665661391000004, '$IL#': 5.690509626529802,
            '$I##': 2.645772003478801, '$ALL': 1.1665661391000004,
            '$AL#': 5.690509626529802, '$A##': 2.645772003478801,
            '$LL#': 2.845254813264901, '$L##': 5.291544006957601,
            '$###': 0.8504267154039002,
            'NIAL': 1.0097379000000002, 'NIA#': 1.1665661391000002,
            'NILL': 0.4782969000000001, 'NIL#': 2.3331322782000004,
            'NI##': 1.0847773692000002, 'NALL': 0.4782969000000001,
            'NAL#': 2.3331322782000004, 'NA##': 1.0847773692000002,
            'NLL#': 1.1665661391000002, 'NL##': 2.1695547384000005,
            'N###': 0.3486784401000001,
            'IALL': 0.531441, 'IAL#': 2.5923691980000005,
            'IA##': 1.2053081880000003, 'ILL#': 1.2961845990000003,
            'IL##': 2.4106163760000006, 'I###': 0.3874204890000001,
            'ALL#': 1.4402051100000004, 'AL##': 2.6784626400000007,
            'A###': 0.4304672100000001,
            'LL##': 1.4880348000000003, 'L###': 1.0097379000000002,
            # qval=6 tokens
            '$$$$$N': 0.3486784401000001, '$$$$$I': 0.31381059609000006,
            '$$$$$A': 0.2824295364810001, '$$$$$L': 0.48295450738251017,
            '$$$$$#': 0.8431447750407974,
            '$$$$NI': 1.6039208244600003, '$$$$NA': 1.4435287420140006,
            '$$$$NL': 2.468434148843941, '$$$$N#': 4.309406627986299,
            '$$$$IA': 1.4435287420140006, '$$$$IL': 2.468434148843941,
            '$$$$I#': 4.309406627986299, '$$$$AL': 2.468434148843941,
            '$$$$A#': 4.309406627986299, '$$$$LL': 1.1692582810313406,
            '$$$$L#': 8.618813255972597, '$$$$##': 7.715070145397851,
            '$$$NIA': 2.984687447256001, '$$$NIL': 5.103815534807762,
            '$$$NI#': 8.910270709073119, '$$$NAL': 5.103815534807762,
            '$$$NA#': 8.910270709073119, '$$$NLL': 2.417596832277361,
            '$$$NL#': 17.82054141814625, '$$$N##': 15.951932474542438,
            '$$$IAL': 5.103815534807762, '$$$IA#': 8.910270709073119,
            '$$$ILL': 2.417596832277361, '$$$IL#': 17.82054141814625,
            '$$$I##': 15.951932474542438, '$$$ALL': 2.417596832277361,
            '$$$AL#': 17.82054141814625, '$$$A##': 15.951932474542438,
            '$$$LL#': 8.910270709073119, '$$$L##': 31.903864949084834,
            '$$$###': 15.08638445665049,
            '$$NIAL': 5.396635688803742, '$$NIA#': 9.42147782919388,
            '$$NILL': 2.556301115749141, '$$NIL#': 18.84295565838777,
            '$$NI##': 16.867139400002937, '$$NALL': 2.556301115749141,
            '$$NAL#': 18.84295565838777, '$$NA##': 16.867139400002937,
            '$$NLL#': 9.42147782919388, '$$NL##': 33.73427880000585,
            '$$N###': 15.951932474542435,
            '$$IALL': 2.556301115749141, '$$IAL#': 18.84295565838777,
            '$$IA##': 16.867139400002937, '$$ILL#': 9.42147782919388,
            '$$IL##': 33.73427880000585, '$$I###': 15.951932474542435,
            '$$ALL#': 9.42147782919388, '$$AL##': 33.73427880000585,
            '$$A###': 15.951932474542435, '$$LL##': 16.867139400002937,
            '$$L###': 31.903864949084824, '$$####': 7.715070145397851,
            '$NIALL': 1.4278730800535104, '$NIAL#': 10.525109490228838,
            '$NIA##': 9.421477829193876, '$NILL#': 5.262554745114417,
            '$NIL##': 18.842955658387766, '$NI###': 8.910270709073117,
            '$NALL#': 5.262554745114417, '$NAL##': 18.842955658387766,
            '$NA###': 8.910270709073117, '$NLL##': 9.421477829193876,
            '$NL###': 17.820541418146256, '$N####': 4.309406627986299,
            '$IALL#': 5.262554745114417, '$IAL##': 18.842955658387766,
            '$IA###': 8.910270709073117, '$ILL##': 9.421477829193876,
            '$IL###': 17.820541418146256, '$I####': 4.309406627986299,
            '$ALL##': 9.421477829193876, '$AL###': 17.820541418146256,
            '$A####': 4.309406627986299, '$LL###': 8.910270709073117,
            '$L####': 8.618813255972595, '$#####': 0.8431447750407974,
            'NIALL#': 1.4278730800535104, 'NIAL##': 5.112602231498281,
            'NIA###': 2.417596832277361, 'NILL##': 2.556301115749141,
            'NIL###': 4.835193664554721, 'NI####': 1.1692582810313406,
            'NALL##': 2.556301115749141, 'NAL###': 4.835193664554721,
            'NA####': 1.1692582810313406, 'NLL###': 2.417596832277361,
            'NL####': 2.338516562062681, 'N#####': 0.2287679245496101,
            'IALL##': 2.8403345730546006, 'IAL###': 5.3724374050608015,
            'IA####': 1.2991758678126004, 'ILL###': 2.6862187025304003,
            'IL####': 2.5983517356252004, 'I#####': 0.2541865828329001,
            'ALL###': 2.984687447256001, 'AL####': 2.887057484028001,
            'A#####': 0.2824295364810001, 'LL####': 1.6039208244600003,
            'L#####': 0.6624890361900002,
            # qval=5 tokens
            '$$$$N': 0.4304672100000001, '$$$$I': 0.3874204890000001,
            '$$$$A': 0.3486784401000001, '$$$$L': 0.5962401325710002,
            '$$$$#': 0.8741476583623434,
            '$$$NI': 1.5927286770000002, '$$$NA': 1.4334558093000005,
            '$$$NL': 2.4512094339030006, '$$$N#': 3.59371815104519,
            '$$$IA': 1.4334558093000005, '$$$IL': 2.4512094339030006,
            '$$$I#': 3.59371815104519, '$$$AL': 2.4512094339030006,
            '$$$A#': 3.59371815104519, '$$$LL': 1.1610992055330005,
            '$$$L#': 7.187436302090378, '$$$##': 4.91876456439945,
            '$$NIA': 2.2513435083000006, '$$NIL': 3.849797399193001,
            '$$NI#': 5.644187966956859, '$$NAL': 3.849797399193001,
            '$$NA#': 5.644187966956859, '$$NLL': 1.8235882417230007,
            '$$NL#': 11.28837593391372, '$$N##': 7.725266868411147,
            '$$IAL': 3.849797399193001, '$$IA#': 5.644187966956859,
            '$$ILL': 1.8235882417230007, '$$IL#': 11.28837593391372,
            '$$I##': 7.725266868411147, '$$ALL': 1.8235882417230007,
            '$$AL#': 11.28837593391372, '$$A##': 7.725266868411147,
            '$$LL#': 5.644187966956859, '$$L##': 15.4505337368223,
            '$$###': 4.918764564399449,
            '$NIAL': 2.812715796861001, '$NIA#': 4.123722629777913,
            '$NILL': 1.3323390616710005, '$NIL#': 8.247445259555828,
            '$NI##': 5.644187966956858, '$NALL': 1.3323390616710005,
            '$NAL#': 8.247445259555828, '$NA##': 5.644187966956858,
            '$NLL#': 4.123722629777913, '$NL##': 11.288375933913724,
            '$N###': 3.593718151045189,
            '$IALL': 1.3323390616710005, '$IAL#': 8.247445259555828,
            '$IA##': 5.644187966956858, '$ILL#': 4.123722629777913,
            '$IL##': 11.288375933913724, '$I###': 3.593718151045189,
            '$ALL#': 4.123722629777913, '$AL##': 11.288375933913724,
            '$A###': 3.593718151045189, '$LL##': 5.644187966956858,
            '$L###': 7.187436302090377, '$####': 0.8741476583623434,
            'NIALL': 0.4304672100000001, 'NIAL#': 2.664678123342001,
            'NIA##': 1.8235882417230007, 'NILL#': 1.3323390616710005,
            'NIL##': 3.6471764834460014, 'NI###': 1.1610992055330005,
            'NALL#': 1.3323390616710005, 'NAL##': 3.6471764834460014,
            'NA###': 1.1610992055330005, 'NLL##': 1.8235882417230007,
            'NL###': 2.322198411066001, 'N####': 0.2824295364810001,
            'IALL#': 1.4803767351900001, 'IAL##': 4.0524183149400015,
            'IA###': 1.2901102283700003, 'ILL##': 2.0262091574700007,
            'IL###': 2.5802204567400007, 'I####': 0.31381059609000006,
            'ALL##': 2.2513435083000006, 'AL###': 2.8669116186000005,
            'A####': 0.3486784401000001, 'LL###': 1.5927286770000004,
            'L####': 0.8178876990000001,
            # qval=1 tokens
            'N': 1.0, 'I': 1.0, 'A': 1.0, 'L': 2.0,
        }
    )
    for key in gold_counter.keys():
        self.assertAlmostEqual(gold_counter[key], test_counter[key])

    # 'length' scaler: integer weights, so exact equality is safe here.
    self.assertEqual(
        QSkipgrams(qval=(2, 3), scaler='length')
        .tokenize('NIALL')
        .get_counter(),
        Counter(
            {
                '$N': 2, '$I': 2, '$A': 2, '$L': 4, '$#': 2,
                'NI': 2, 'NA': 2, 'NL': 4, 'N#': 2,
                'IA': 2, 'IL': 4, 'I#': 2,
                'AL': 4, 'A#': 2, 'LL': 2, 'L#': 4,
                '$$N': 3, '$$I': 3, '$$A': 3, '$$L': 6, '$$#': 6,
                '$NI': 6, '$NA': 6, '$NL': 12, '$N#': 12,
                '$IA': 6, '$IL': 12, '$I#': 12,
                '$AL': 12, '$A#': 12, '$LL': 6, '$L#': 24, '$##': 6,
                'NIA': 3, 'NIL': 6, 'NI#': 6, 'NAL': 6, 'NA#': 6,
                'NLL': 3, 'NL#': 12, 'N##': 3,
                'IAL': 6, 'IA#': 6, 'ILL': 3, 'IL#': 12, 'I##': 3,
                'ALL': 3, 'AL#': 12, 'A##': 3, 'LL#': 6, 'L##': 6,
            }
        ),
    )

    # 'length-log' scaler: log1p of the length weight.
    test_counter = (
        QSkipgrams(qval=(2, 3), scaler='length-log')
        .tokenize('NIALL')
        .get_counter()
    )
    gold_counter = Counter(
        {
            '$N': 1.0986122886681096, '$I': 1.0986122886681096,
            '$A': 1.0986122886681096, '$L': 2.197224577336219,
            '$#': 1.0986122886681096,
            'NI': 1.0986122886681096, 'NA': 1.0986122886681096,
            'NL': 2.197224577336219, 'N#': 1.0986122886681096,
            'IA': 1.0986122886681096, 'IL': 2.197224577336219,
            'I#': 1.0986122886681096,
            'AL': 2.197224577336219, 'A#': 1.0986122886681096,
            'LL': 1.0986122886681096, 'L#': 2.197224577336219,
            '$$N': 1.3862943611198906, '$$I': 1.3862943611198906,
            '$$A': 1.3862943611198906, '$$L': 2.772588722239781,
            '$$#': 2.772588722239781,
            '$NI': 2.772588722239781, '$NA': 2.772588722239781,
            '$NL': 5.545177444479562, '$N#': 5.545177444479562,
            '$IA': 2.772588722239781, '$IL': 5.545177444479562,
            '$I#': 5.545177444479562,
            '$AL': 5.545177444479562, '$A#': 5.545177444479562,
            '$LL': 2.772588722239781, '$L#': 11.090354888959125,
            '$##': 2.772588722239781,
            'NIA': 1.3862943611198906, 'NIL': 2.772588722239781,
            'NI#': 2.772588722239781, 'NAL': 2.772588722239781,
            'NA#': 2.772588722239781, 'NLL': 1.3862943611198906,
            'NL#': 5.545177444479562, 'N##': 1.3862943611198906,
            'IAL': 2.772588722239781, 'IA#': 2.772588722239781,
            'ILL': 1.3862943611198906, 'IL#': 5.545177444479562,
            'I##': 1.3862943611198906,
            'ALL': 1.3862943611198906, 'AL#': 5.545177444479562,
            'A##': 1.3862943611198906, 'LL#': 2.772588722239781,
            'L##': 2.772588722239781,
        }
    )
    for key in gold_counter.keys():
        self.assertAlmostEqual(gold_counter[key], test_counter[key])

    # 'length-exp' scaler: exp of the length weight.
    test_counter = (
        QSkipgrams(qval=(2, 3), scaler='length-exp')
        .tokenize('NIALL')
        .get_counter()
    )
    gold_counter = Counter(
        {
            '$N': 7.38905609893065, '$I': 7.38905609893065,
            '$A': 7.38905609893065, '$L': 14.7781121978613,
            '$#': 7.38905609893065,
            'NI': 7.38905609893065, 'NA': 7.38905609893065,
            'NL': 14.7781121978613, 'N#': 7.38905609893065,
            'IA': 7.38905609893065, 'IL': 14.7781121978613,
            'I#': 7.38905609893065,
            'AL': 14.7781121978613, 'A#': 7.38905609893065,
            'LL': 7.38905609893065, 'L#': 14.7781121978613,
            '$$N': 20.085536923187668, '$$I': 20.085536923187668,
            '$$A': 20.085536923187668, '$$L': 40.171073846375336,
            '$$#': 40.171073846375336,
            '$NI': 40.171073846375336, '$NA': 40.171073846375336,
            '$NL': 80.34214769275067, '$N#': 80.34214769275067,
            '$IA': 40.171073846375336, '$IL': 80.34214769275067,
            '$I#': 80.34214769275067,
            '$AL': 80.34214769275067, '$A#': 80.34214769275067,
            '$LL': 40.171073846375336, '$L#': 160.68429538550137,
            '$##': 40.171073846375336,
            'NIA': 20.085536923187668, 'NIL': 40.171073846375336,
            'NI#': 40.171073846375336, 'NAL': 40.171073846375336,
            'NA#': 40.171073846375336, 'NLL': 20.085536923187668,
            'NL#': 80.34214769275067, 'N##': 20.085536923187668,
            'IAL': 40.171073846375336, 'IA#': 40.171073846375336,
            'ILL': 20.085536923187668, 'IL#': 80.34214769275067,
            'I##': 20.085536923187668,
            'ALL': 20.085536923187668, 'AL#': 80.34214769275067,
            'A##': 20.085536923187668, 'LL#': 40.171073846375336,
            'L##': 40.171073846375336,
        }
    )
    for key in gold_counter.keys():
        self.assertAlmostEqual(gold_counter[key], test_counter[key])
def test__tokenizer(self):
    """Test abydos.tokenizer._Tokenizer.

    Exercises the base tokenizer (which keeps the whole input as a single
    token), QGrams set/list/repr access and Counter arithmetic, and
    QSkipgrams under several scalers ('SSK', a callable, and 'entropy').
    """
    # The base _Tokenizer emits the entire input string as one token.
    for text in ('', 'a', 'NELSON', 'NEILSEN'):
        self.assertEqual(
            _Tokenizer().tokenize(text).get_counter(), Counter({text: 1})
        )
    self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1)
    self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1)

    sentence = 'Good to be home for a night'
    self.assertEqual(
        _Tokenizer().tokenize(sentence).get_counter(),
        Counter({sentence: 1}),
    )

    nelson = QGrams().tokenize('NELSON')
    neilsen = QGrams().tokenize('NEILSEN')

    self.assertEqual(
        nelson.get_set(), {'$N', 'EL', 'LS', 'N#', 'NE', 'ON', 'SO'}
    )
    self.assertEqual(
        nelson.get_list(), ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
    )
    # dict insertion order (and hence the repr) is only guaranteed on 3.6+
    if sys.version_info >= (3, 6):
        self.assertEqual(
            repr(nelson),
            "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, "
            "'ON': 1, 'N#': 1})",
        )

    # Counter arithmetic: intersection, sum, and difference of q-grams.
    self.assertEqual(
        nelson & neilsen, Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1})
    )
    self.assertEqual(
        nelson + neilsen,
        Counter(
            {
                '$N': 2, 'NE': 2, 'EL': 1, 'LS': 2, 'SO': 1, 'ON': 1,
                'N#': 2, 'EI': 1, 'IL': 1, 'SE': 1, 'EN': 1,
            }
        ),
    )
    self.assertEqual(
        nelson - neilsen, Counter({'EL': 1, 'SO': 1, 'ON': 1})
    )

    # 'set' scaler caps every token's count at 1, so only the 8 distinct
    # bigrams ($N, NE, EL, LS, SO, ON, NN, N#) contribute to the count.
    doubled = QGrams(scaler='set').tokenize('NELSONNELSON')
    self.assertEqual(doubled.count(), 8)

    ssk_tokens = QSkipgrams(scaler='SSK').tokenize('NELSON')
    self.assertAlmostEqual(ssk_tokens.count(), 18.66784401)

    # A callable scaler (here math.log1p) is applied to each raw count.
    log_tokens = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON')
    expected = Counter(
        {
            '$$N': 1.0986122886681096, '$$E': 0.6931471805599453,
            '$$L': 0.6931471805599453, '$$S': 0.6931471805599453,
            '$$O': 0.6931471805599453, '$$#': 1.0986122886681096,
            '$NE': 1.0986122886681096, '$NL': 1.0986122886681096,
            '$NS': 1.0986122886681096, '$NO': 1.0986122886681096,
            '$NN': 1.0986122886681096, '$N#': 2.1972245773362196,
            '$EL': 1.0986122886681096, '$ES': 1.0986122886681096,
            '$EO': 1.0986122886681096, '$EN': 1.0986122886681096,
            '$E#': 1.6094379124341003, '$LS': 1.0986122886681096,
            '$LO': 1.0986122886681096, '$LN': 1.0986122886681096,
            '$L#': 1.6094379124341003, '$SO': 1.0986122886681096,
            '$SN': 1.0986122886681096, '$S#': 1.6094379124341003,
            '$ON': 1.0986122886681096, '$O#': 1.6094379124341003,
            '$##': 1.0986122886681096, 'NEL': 0.6931471805599453,
            'NES': 0.6931471805599453, 'NEO': 0.6931471805599453,
            'NEN': 0.6931471805599453, 'NE#': 1.0986122886681096,
            'NLS': 0.6931471805599453, 'NLO': 0.6931471805599453,
            'NLN': 0.6931471805599453, 'NL#': 1.0986122886681096,
            'NSO': 0.6931471805599453, 'NSN': 0.6931471805599453,
            'NS#': 1.0986122886681096, 'NON': 0.6931471805599453,
            'NO#': 1.0986122886681096, 'NN#': 1.0986122886681096,
            'N##': 1.0986122886681096, 'ELS': 0.6931471805599453,
            'ELO': 0.6931471805599453, 'ELN': 0.6931471805599453,
            'EL#': 1.0986122886681096, 'ESO': 0.6931471805599453,
            'ESN': 0.6931471805599453, 'ES#': 1.0986122886681096,
            'EON': 0.6931471805599453, 'EO#': 1.0986122886681096,
            'EN#': 1.0986122886681096, 'E##': 0.6931471805599453,
            'LSO': 0.6931471805599453, 'LSN': 0.6931471805599453,
            'LS#': 1.0986122886681096, 'LON': 0.6931471805599453,
            'LO#': 1.0986122886681096, 'LN#': 1.0986122886681096,
            'L##': 0.6931471805599453, 'SON': 0.6931471805599453,
            'SO#': 1.0986122886681096, 'SN#': 1.0986122886681096,
            'S##': 0.6931471805599453, 'ON#': 1.0986122886681096,
            'O##': 0.6931471805599453,
        }
    )
    for skipgram, value in log_tokens.get_counter().items():
        self.assertAlmostEqual(value, expected[skipgram])

    entropy_tokens = QSkipgrams(scaler='entropy').tokenize('NELSON')
    self.assertAlmostEqual(entropy_tokens.count(), 4.6644977792)
WhitespaceTokenizer, WordpunctTokenizer, ) from nltk import TweetTokenizer from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char algorithms = { 'corvcluster': COrVClusterTokenizer().tokenize, 'cvcluster': CVClusterTokenizer().tokenize, 'character': CharacterTokenizer().tokenize, 'legalipy': LegaliPyTokenizer().tokenize, 'nltk': NLTKTokenizer(nltk_tokenizer=TweetTokenizer()).tokenize, 'qgrams': QGrams().tokenize, 'qskipgrams': QSkipgrams().tokenize, 'regexp': RegexpTokenizer().tokenize, 'saps': SAPSTokenizer().tokenize, 'sonoripy': SonoriPyTokenizer().tokenize, 'vccluster': VCClusterTokenizer().tokenize, 'whitespace': WhitespaceTokenizer().tokenize, 'wordpunct': WordpunctTokenizer().tokenize, } class BigListOfNaughtyStringsTestCases(unittest.TestCase): """Test each tokenizer against the BLNS set. Here, we test each algorithm against each string, but we only care that it does not result in an exception.
def test__tokenizer(self): """Test abydos.tokenizer._Tokenizer.""" self.assertEqual( _Tokenizer().tokenize('').get_counter(), Counter({'': 1}) ) self.assertEqual( _Tokenizer().tokenize('a').get_counter(), Counter({'a': 1}) ) self.assertEqual( _Tokenizer().tokenize('NELSON').get_counter(), Counter({'NELSON': 1}), ) self.assertEqual( _Tokenizer().tokenize('NEILSEN').get_counter(), Counter({'NEILSEN': 1}), ) self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1) self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1) tweet = 'Good to be home for a night' self.assertEqual( _Tokenizer().tokenize(tweet).get_counter(), Counter({'Good to be home for a night': 1}), ) nelson = QGrams().tokenize('NELSON') neilsen = QGrams().tokenize('NEILSEN') self.assertEqual( nelson.get_set(), {'$N', 'EL', 'LS', 'N#', 'NE', 'ON', 'SO'} ) self.assertEqual( nelson.get_list(), ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#'] ) if sys.version_info >= (3, 6): self.assertEqual( repr(nelson), "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, 'ON': 1, \ 'N#': 1})", ) self.assertEqual( nelson & neilsen, Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1}) ) self.assertEqual( nelson + neilsen, Counter( { '$N': 2, 'NE': 2, 'EL': 1, 'LS': 2, 'SO': 1, 'ON': 1, 'N#': 2, 'EI': 1, 'IL': 1, 'SE': 1, 'EN': 1, } ), ) self.assertEqual( nelson - neilsen, Counter({'EL': 1, 'SO': 1, 'ON': 1}) ) nelsonnelson = QGrams(scaler='set').tokenize('NELSONNELSON') self.assertEqual(nelsonnelson.count(), 8) nelson_ssk = QSkipgrams(scaler='SSK').tokenize('NELSON') self.assertAlmostEqual(nelson_ssk.count(), 18.66784401) nelson_log = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON') gold_standard = Counter( { '$$N': 1.0986122886681096, '$$E': 0.6931471805599453, '$$L': 0.6931471805599453, '$$S': 0.6931471805599453, '$$O': 0.6931471805599453, '$$#': 1.0986122886681096, '$NE': 1.0986122886681096, '$NL': 1.0986122886681096, '$NS': 1.0986122886681096, '$NO': 1.0986122886681096, '$NN': 1.0986122886681096, '$N#': 
2.1972245773362196, '$EL': 1.0986122886681096, '$ES': 1.0986122886681096, '$EO': 1.0986122886681096, '$EN': 1.0986122886681096, '$E#': 1.6094379124341003, '$LS': 1.0986122886681096, '$LO': 1.0986122886681096, '$LN': 1.0986122886681096, '$L#': 1.6094379124341003, '$SO': 1.0986122886681096, '$SN': 1.0986122886681096, '$S#': 1.6094379124341003, '$ON': 1.0986122886681096, '$O#': 1.6094379124341003, '$##': 1.0986122886681096, 'NEL': 0.6931471805599453, 'NES': 0.6931471805599453, 'NEO': 0.6931471805599453, 'NEN': 0.6931471805599453, 'NE#': 1.0986122886681096, 'NLS': 0.6931471805599453, 'NLO': 0.6931471805599453, 'NLN': 0.6931471805599453, 'NL#': 1.0986122886681096, 'NSO': 0.6931471805599453, 'NSN': 0.6931471805599453, 'NS#': 1.0986122886681096, 'NON': 0.6931471805599453, 'NO#': 1.0986122886681096, 'NN#': 1.0986122886681096, 'N##': 1.0986122886681096, 'ELS': 0.6931471805599453, 'ELO': 0.6931471805599453, 'ELN': 0.6931471805599453, 'EL#': 1.0986122886681096, 'ESO': 0.6931471805599453, 'ESN': 0.6931471805599453, 'ES#': 1.0986122886681096, 'EON': 0.6931471805599453, 'EO#': 1.0986122886681096, 'EN#': 1.0986122886681096, 'E##': 0.6931471805599453, 'LSO': 0.6931471805599453, 'LSN': 0.6931471805599453, 'LS#': 1.0986122886681096, 'LON': 0.6931471805599453, 'LO#': 1.0986122886681096, 'LN#': 1.0986122886681096, 'L##': 0.6931471805599453, 'SON': 0.6931471805599453, 'SO#': 1.0986122886681096, 'SN#': 1.0986122886681096, 'S##': 0.6931471805599453, 'ON#': 1.0986122886681096, 'O##': 0.6931471805599453, } ) test_counter = nelson_log.get_counter() for key in test_counter: self.assertAlmostEqual(test_counter[key], gold_standard[key])