def test_whitespace_tokenizer(self):
    """Test abydos.tokenizer.WhitespaceTokenizer.

    Both the produced and the expected token lists are sorted before
    comparison, so only token content and multiplicity are checked, not
    the order in which tokens are emitted.
    """
    self.assertEqual(
        sorted(WhitespaceTokenizer().tokenize('').get_list()), []
    )
    self.assertEqual(
        sorted(WhitespaceTokenizer().tokenize('a').get_list()), ['a']
    )

    self.assertEqual(
        sorted(WhitespaceTokenizer().tokenize('NELSON').get_list()),
        sorted(['NELSON']),
    )
    self.assertEqual(
        sorted(WhitespaceTokenizer().tokenize('NEILSEN').get_list()),
        sorted(['NEILSEN']),
    )

    # Implicit literal concatenation instead of a backslash continuation
    # inside the string: the continuation is sensitive to whatever
    # whitespace follows the backslash and can silently inject a stray
    # '\' into the token 'the'.
    tweet = (
        'Good to be home for a night. Even better to see the'
        ' @chicagobulls start the season off right! #SeeRed'
    )
    self.assertEqual(
        sorted(WhitespaceTokenizer().tokenize(tweet).get_list()),
        sorted(
            [
                'Good',
                'to',
                'be',
                'home',
                'for',
                'a',
                'night.',
                'Even',
                'better',
                'to',
                'see',
                'the',
                '@chicagobulls',
                'start',
                'the',
                'season',
                'off',
                'right!',
                '#SeeRed',
            ]
        ),
    )
def test_soft_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (soft).

    Uses the ``cmp_j_soft`` fixture of the enclosing test case for the
    string-pair checks and freshly built soft Jaccard instances with a
    whitespace tokenizer for the phrase checks.
    """
    # Base cases: compared for exact equality
    exact_cases = (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    )
    for src, tar, expected in exact_cases:
        self.assertEqual(self.cmp_j_soft.sim(src, tar), expected)

    # Remaining pairs: compared with assertAlmostEqual's default tolerance
    approx_cases = (
        ('abcd', 'efgh', 0.11111111),
        ('Nigel', 'Niall', 0.5),
        ('Niall', 'Nigel', 0.5),
        ('Colin', 'Coiln', 0.6),
        ('Coiln', 'Colin', 0.6),
        ('ATCAACGAGT', 'AACGATTAG', 0.68),
    )
    for src, tar, expected in approx_cases:
        self.assertAlmostEqual(self.cmp_j_soft.sim(src, tar), expected)

    # Whitespace-tokenized phrases, checked in both argument orders;
    # a new measure object is built for each comparison.
    phrase_pairs = (
        ('junior system analyst', 'systems analyst'),
        ('systems analyst', 'junior system analyst'),
    )
    for src, tar in phrase_pairs:
        ws_soft = Jaccard(
            intersection_type='soft', tokenizer=WhitespaceTokenizer()
        )
        self.assertAlmostEqual(ws_soft.sim(src, tar), 0.6190476190476191)

    # Supplying a JaroWinkler metric is expected to raise TypeError,
    # either at construction or when sim is called.
    with self.assertRaises(TypeError):
        Jaccard(
            intersection_type='soft',
            metric=JaroWinkler(),
            tokenizer=WhitespaceTokenizer(),
        ).sim('junior system analyst', 'systems analyst')
class QGramTestCases(unittest.TestCase):
    """Test QGram functions.

    abydos.distance.QGram
    """

    cmp = QGram()
    cmp_q1 = QGram(qval=1)
    cmp_ws = QGram(tokenizer=WhitespaceTokenizer())

    def test_q_gram_dist(self):
        """Test abydos.distance.QGram.dist."""
        # Base cases (exact comparison)
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.8571428571),
            ('Niall', 'Nigel', 0.8571428571),
            ('Colin', 'Coiln', 0.8571428571),
            ('Coiln', 'Colin', 0.8571428571),
            ('ATCAACGAGT', 'AACGATTAG', 0.4545454545),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.dist(src, tar), expected)

    def test_q_gram_sim(self):
        """Test abydos.distance.QGram.sim."""
        # Base cases (exact comparison)
        exact_cases = (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.sim(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.1428571429),
            ('Niall', 'Nigel', 0.1428571429),
            ('Colin', 'Coiln', 0.1428571429),
            ('Coiln', 'Colin', 0.1428571429),
            ('ATCAACGAGT', 'AACGATTAG', 0.5454545455),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.sim(src, tar), expected)

    def test_q_gram_dist_abs(self):
        """Test abydos.distance.QGram.dist_abs."""
        # Base cases (exact comparison)
        exact_cases = (
            ('', '', 0),
            ('a', '', 0),
            ('', 'a', 0),
            ('abc', '', 2),
            ('', 'abc', 2),
            ('abc', 'abc', 0),
            ('abcd', 'efgh', 6),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.dist_abs(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 6),
            ('Niall', 'Nigel', 6),
            ('Colin', 'Coiln', 6),
            ('Coiln', 'Colin', 6),
            ('ATCAACGAGT', 'AACGATTAG', 5),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.dist_abs(src, tar), expected)

        # Example from paper
        self.assertEqual(self.cmp.dist_abs('01000', '001111'), 5)

        # Coverage: unigram and whitespace-tokenized variants
        self.assertEqual(self.cmp_q1.dist_abs('01000', '001111'), 5)
        self.assertEqual(
            self.cmp_ws.dist_abs('a a b b c', 'a b b b c d'), 3
        )
class ChebyshevTestCases(unittest.TestCase):
    """Test Chebyshev functions.

    abydos.distance.Chebyshev
    """

    cmp = Chebyshev()
    cmp_q2 = Chebyshev(tokenizer=QGrams(2))
    cmp_ws = Chebyshev(tokenizer=WhitespaceTokenizer())

    def test_chebyshev_dist_abs(self):
        """Test abydos.distance.Chebyshev.dist_abs."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))

        for src, tar, expected in edge_cases:
            self.assertEqual(self.cmp.dist_abs(src, tar), expected)
        self.assertEqual(self.cmp.dist_abs('nelson', 'neilsen'), 1)

        for src, tar, expected in edge_cases:
            self.assertEqual(self.cmp_q2.dist_abs(src, tar), expected)
        self.assertAlmostEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 1)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist_abs(qgram_counts(src), qgram_counts(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                qgram_counts('nelson'), qgram_counts('neilsen')
            ),
            1,
        )

        # non-q-gram tests
        ws_cases = (('', '', 0), ('the quick', '', 1), ('', 'the quick', 1))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 1)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 1)

    def test_chebyshev_dist(self):
        """Test abydos.distance.Chebyshev.dist."""
        # Calling dist without arguments must raise NotImplementedError.
        with self.assertRaises(NotImplementedError):
            self.cmp.dist()

    def test_chebyshev_sim(self):
        """Test abydos.distance.Chebyshev.sim."""
        # Calling sim without arguments must raise NotImplementedError.
        with self.assertRaises(NotImplementedError):
            self.cmp.sim()
class MinkowskiTestCases(unittest.TestCase):
    """Test Minkowski functions.

    abydos.distance.Minkowski
    """

    cmp = Minkowski()
    cmp_q2 = Minkowski(tokenizer=QGrams(2))
    cmp_q1p0 = Minkowski(pval=0, tokenizer=QGrams(1))
    cmp_ws = Minkowski(tokenizer=WhitespaceTokenizer())

    def test_minkowski_dist_abs(self):
        """Test abydos.distance.Minkowski.dist_abs."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        name_cases = (
            ('', '', 0),
            ('nelson', '', 7),
            ('', 'neilsen', 8),
            ('nelson', 'neilsen', 7),
        )
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in name_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)

        # supplied q-gram tests
        for src, tar, expected in name_cases:
            self.assertEqual(
                self.cmp.dist_abs(qgram_counts(src), qgram_counts(tar)),
                expected,
            )

        # non-q-gram tests
        ws_cases = (
            ('', '', 0),
            ('the quick', '', 2),
            ('', 'the quick', 2),
            (NONQ_FROM, NONQ_TO, 8),
            (NONQ_TO, NONQ_FROM, 8),
        )
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)

        # test l_0 "norm"
        l0_cases = (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 2),
            ('ab', 'b', 1),
            ('aab', 'b', 1),
        )
        for src, tar, expected in l0_cases:
            self.assertEqual(self.cmp_q1p0.dist_abs(src, tar), expected)

        l0_normalized_cases = (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 1),
            ('ab', 'b', 1 / 2),
            ('aab', 'b', 1 / 2),
            ('aaab', 'b', 1 / 2),
            ('aaab', 'ab', 1 / 2),
        )
        for src, tar, expected in l0_normalized_cases:
            self.assertEqual(
                self.cmp_q1p0.dist_abs(src, tar, normalized=True), expected
            )

        # test with alphabet
        sized_alphabet = Minkowski(tokenizer=QGrams(1), alphabet=26)
        self.assertEqual(sized_alphabet.dist_abs('ab', 'b'), 1)

        sized_alphabet = Minkowski(tokenizer=QGrams(1), alphabet=26)
        self.assertEqual(
            sized_alphabet.dist_abs('ab', 'b', normalized=True), 1 / 26
        )

        lettered_alphabet = Minkowski(
            tokenizer=QGrams(1), alphabet='abcdefghijklmnopqrstuvwxyz'
        )
        self.assertEqual(
            lettered_alphabet.dist_abs('ab', 'b', normalized=True), 1 / 26
        )

        infinite_p = Minkowski(pval=float('inf'))
        self.assertEqual(infinite_p.dist_abs('nelsonian', 'neilsen'), 1.0)

        # Test wrapper
        self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7)

    def test_minkowski_sim(self):
        """Test abydos.distance.Minkowski.sim."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.sim(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.sim(qgram_counts('nelson'), qgram_counts('neilsen')),
            8 / 15,
        )

        # non-q-gram tests
        ws_cases = (('', '', 1), ('the quick', '', 0), ('', 'the quick', 0))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.sim(src, tar), expected)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(sim_minkowski('nelson', 'neilsen'), 8 / 15)

    def test_minkowski_dist(self):
        """Test abydos.distance.Minkowski.dist."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.dist(qgram_counts('nelson'), qgram_counts('neilsen')),
            7 / 15,
        )

        # non-q-gram tests
        ws_cases = (('', '', 0), ('the quick', '', 1), ('', 'the quick', 1))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist(src, tar), expected)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(dist_minkowski('nelson', 'neilsen'), 7 / 15)
class EuclideanTestCases(unittest.TestCase):
    """Test Euclidean functions.

    abydos.distance.Euclidean
    """

    cmp = Euclidean()
    cmp_q2 = Euclidean(tokenizer=QGrams(2))
    cmp_ws = Euclidean(tokenizer=WhitespaceTokenizer())

    def test_euclidean_dist_abs(self):
        """Test abydos.distance.Euclidean.dist_abs."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (
            ('', '', 0),
            ('nelson', '', 7**0.5),
            ('', 'neilsen', 8**0.5),
        )
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)
            self.assertAlmostEqual(
                measure.dist_abs('nelson', 'neilsen'), 7**0.5
            )

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist_abs(qgram_counts(src), qgram_counts(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                qgram_counts('nelson'), qgram_counts('neilsen')
            ),
            7**0.5,
        )

        # non-q-gram tests
        ws_cases = (
            ('', '', 0),
            ('the quick', '', 2**0.5),
            ('', 'the quick', 2**0.5),
        )
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)
        self.assertAlmostEqual(
            self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 8**0.5
        )
        self.assertAlmostEqual(
            self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 8**0.5
        )

        # Test wrapper
        self.assertAlmostEqual(euclidean('nelson', 'neilsen'), 7**0.5)

    def test_euclidean_sim(self):
        """Test abydos.distance.Euclidean.sim."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        names_sim = 1 - 7**0.5 / 23**0.5
        phrases_sim = 1 - 8**0.5 / 24**0.5

        edge_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), names_sim
            )

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.sim(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.sim(qgram_counts('nelson'), qgram_counts('neilsen')),
            names_sim,
        )

        # non-q-gram tests
        ws_cases = (('', '', 1), ('the quick', '', 0), ('', 'the quick', 0))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.sim(src, tar), expected)
        self.assertAlmostEqual(
            self.cmp_ws.sim(NONQ_FROM, NONQ_TO), phrases_sim
        )
        self.assertAlmostEqual(
            self.cmp_ws.sim(NONQ_TO, NONQ_FROM), phrases_sim
        )

        # Test wrapper
        self.assertAlmostEqual(sim_euclidean('nelson', 'neilsen'), names_sim)

    def test_euclidean_dist(self):
        """Test abydos.distance.Euclidean.dist."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        names_dist = 7**0.5 / 23**0.5
        phrases_dist = 8**0.5 / 24**0.5

        edge_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), names_dist
            )

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.dist(qgram_counts('nelson'), qgram_counts('neilsen')),
            names_dist,
        )

        # non-q-gram tests
        ws_cases = (('', '', 0), ('the quick', '', 1), ('', 'the quick', 1))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist(src, tar), expected)
        self.assertAlmostEqual(
            self.cmp_ws.dist(NONQ_FROM, NONQ_TO), phrases_dist
        )
        self.assertAlmostEqual(
            self.cmp_ws.dist(NONQ_TO, NONQ_FROM), phrases_dist
        )

        # Test wrapper
        self.assertAlmostEqual(
            dist_euclidean('nelson', 'neilsen'), names_dist
        )
class JaccardTestCases(unittest.TestCase):
    """Test Jaccard functions.

    abydos.distance.Jaccard
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def test_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 4 / 11)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.sim(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.sim(qgram_counts('nelson'), qgram_counts('neilsen')),
            4 / 11,
        )

        # non-q-gram tests
        ws_cases = (('', '', 1), ('the quick', '', 0), ('', 'the quick', 0))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.sim(src, tar), expected)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 3)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 3)

        # Test wrapper
        self.assertAlmostEqual(sim_jaccard('nelson', 'neilsen'), 4 / 11)

    def test_jaccard_dist(self):
        """Test abydos.distance.Jaccard.dist."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 7 / 11)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.dist(qgram_counts('nelson'), qgram_counts('neilsen')),
            7 / 11,
        )

        # non-q-gram tests
        ws_cases = (('', '', 0), ('the quick', '', 1), ('', 'the quick', 1))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist(src, tar), expected)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 2 / 3)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 2 / 3)

        # Test wrapper
        self.assertAlmostEqual(dist_jaccard('nelson', 'neilsen'), 7 / 11)
class TanimotoTestCases(unittest.TestCase):
    """Test Tanimoto functions.

    abydos.distance.Jaccard.tanimoto_coeff
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def test_jaccard_tanimoto_coeff(self):
        """Test abydos.distance.Jaccard.tanimoto_coeff."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        neg_inf = float('-inf')
        edge_cases = (
            ('', '', 0),
            ('nelson', '', neg_inf),
            ('', 'neilsen', neg_inf),
        )
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.tanimoto_coeff(src, tar), expected)
            self.assertAlmostEqual(
                measure.tanimoto_coeff('nelson', 'neilsen'), log2(4 / 11)
            )

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.tanimoto_coeff(
                    qgram_counts(src), qgram_counts(tar)
                ),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.tanimoto_coeff(
                qgram_counts('nelson'), qgram_counts('neilsen')
            ),
            log2(4 / 11),
        )

        # non-q-gram tests
        ws_cases = (
            ('', '', 0),
            ('the quick', '', neg_inf),
            ('', 'the quick', neg_inf),
        )
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.tanimoto_coeff(src, tar), expected)
        self.assertAlmostEqual(
            self.cmp_ws.tanimoto_coeff(NONQ_FROM, NONQ_TO), log2(1 / 3)
        )
        self.assertAlmostEqual(
            self.cmp_ws.tanimoto_coeff(NONQ_TO, NONQ_FROM), log2(1 / 3)
        )

        # Test wrapper
        self.assertAlmostEqual(tanimoto('nelson', 'neilsen'), log2(4 / 11))
class DiceTestCases(unittest.TestCase):
    """Test Dice functions.

    abydos.distance.Dice
    """

    cmp = Dice()
    cmp_q2 = Dice(tokenizer=QGrams(2))
    cmp_ws = Dice(tokenizer=WhitespaceTokenizer())

    def test_dice_sim(self):
        """Test abydos.distance.Dice.sim."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.sim(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.sim(qgram_counts('nelson'), qgram_counts('neilsen')),
            8 / 15,
        )

        # non-q-gram tests
        ws_cases = (('', '', 1), ('the quick', '', 0), ('', 'the quick', 0))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.sim(src, tar), expected)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2)

    def test_dice_dist(self):
        """Test abydos.distance.Dice.dist."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        edge_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.dist(qgram_counts('nelson'), qgram_counts('neilsen')),
            7 / 15,
        )

        # non-q-gram tests
        ws_cases = (('', '', 0), ('the quick', '', 1), ('', 'the quick', 1))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist(src, tar), expected)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2)
def test_token_distance(self):
    """Test abydos.distance._TokenDistance members.

    Exercises constructor options (alphabet, qval, tokenizer, normalizer,
    intersection type) through Jaccard and SokalMichener, then checks the
    private token-set helpers of a SokalMichener instance directly.
    """
    dna_src, dna_tar = 'ATCAACGAGT', 'AACGATTAG'

    soft_jaccard = Jaccard(intersection_type='soft', alphabet=24)
    self.assertAlmostEqual(soft_jaccard.sim(dna_src, dna_tar), 0.68)

    unigram_jaccard = Jaccard(qval=1, alphabet='CGAT')
    self.assertAlmostEqual(unigram_jaccard.sim(dna_src, dna_tar), 0.9)

    skipgram_jaccard = Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT')
    self.assertAlmostEqual(
        skipgram_jaccard.sim(dna_src, dna_tar), 0.6372795969773299
    )

    self.assertAlmostEqual(
        Jaccard(alphabet=None).sim('synonym', 'antonym'),
        0.3333333333333333,
    )
    self.assertAlmostEqual(
        Jaccard(tokenizer=QSkipgrams(qval=3)).sim('synonym', 'antonym'),
        0.34146341463414637,
    )

    # Pre-built counters passed in place of strings
    src_ctr = Counter({'a': 5, 'b': 2, 'c': 10})
    tar_ctr = Counter({'a': 2, 'c': 1, 'd': 3, 'e': 12})
    self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

    # One SokalMichener instance per normalizer setting, in fixed order
    normalizer_cases = (
        ({'normalizer': 'proportional'}, 0.984777917351113),
        ({'normalizer': 'log'}, 1.2385752469545532),
        ({'normalizer': 'exp', 'alphabet': 0}, 3.221246147982545e18),
        ({'normalizer': 'laplace'}, 0.98856416772554),
        ({'normalizer': 'inverse'}, 197.95790155440417),
        ({'normalizer': 'complement'}, 1.0204081632653061),
        ({'normalizer': 'base case'}, 0.9897959183673469),
        ({}, 0.9897959183673469),
    )
    for options, expected in normalizer_cases:
        self.assertAlmostEqual(
            SokalMichener(**options).sim('synonym', 'antonym'), expected
        )

    # Bigrams of 'synonym' / 'antonym' as listed in the expected values:
    # four only in the source, four shared, four only in the target.
    src_only = ['$s', 'sy', 'yn', 'no']
    shared = ['on', 'ny', 'ym', 'm#']
    tar_only = ['$a', 'an', 'nt', 'to']

    sm = SokalMichener()
    sm._tokenize('synonym', 'antonym')  # noqa: SF01

    expected_tokens = (
        Counter(dict.fromkeys(src_only + shared, 1)),
        Counter(dict.fromkeys(tar_only + shared, 1)),
    )
    self.assertEqual(sm._get_tokens(), expected_tokens)  # noqa: SF01
    self.assertEqual(sm._src_card(), 8)  # noqa: SF01
    self.assertEqual(sm._tar_card(), 8)  # noqa: SF01

    self.assertEqual(
        sm._symmetric_difference(),  # noqa: SF01
        Counter(dict.fromkeys(src_only + tar_only, 1)),
    )
    self.assertEqual(sm._symmetric_difference_card(), 8)  # noqa: SF01
    self.assertEqual(sm._total_complement_card(), 772)  # noqa: SF01
    self.assertEqual(sm._population_card(), 788)  # noqa: SF01

    self.assertEqual(
        sm._union(),  # noqa: SF01
        Counter(dict.fromkeys(src_only + shared + tar_only, 1)),
    )
    self.assertEqual(sm._union_card(), 12)  # noqa: SF01

    # The expected difference keeps explicit 0 and -1 entries.
    expected_difference = {}
    expected_difference.update(dict.fromkeys(src_only, 1))
    expected_difference.update(dict.fromkeys(shared, 0))
    expected_difference.update(dict.fromkeys(tar_only, -1))
    self.assertEqual(
        sm._difference(),  # noqa: SF01
        Counter(expected_difference),
    )

    self.assertEqual(
        sm._intersection(),  # noqa: SF01
        Counter(dict.fromkeys(shared, 1)),
    )
    self.assertEqual(
        sm._get_confusion_table(),  # noqa: SF01
        ConfusionTable(tp=4, tn=772, fp=4, fn=4),
    )

    # Alphabet supplied as a Counter
    sm = SokalMichener(
        alphabet=Counter({'C': 20, 'G': 20, 'A': 20, 'T': 20}), qval=1
    )
    sm._tokenize(dna_src, dna_tar)  # noqa: SF01
    self.assertEqual(sm._total_complement_card(), 61)  # noqa: SF01

    jac = Jaccard(
        intersection_type='linkage', internal_assignment_problem=True
    )
    self.assertAlmostEqual(jac.sim('abandonned', 'abandoned'), 1.0)
    self.assertAlmostEqual(
        jac.sim('abundacies', 'abundances'), 0.6296296296296297
    )

    # Some additional constructors needed to complete test coverage
    ranged_jaccard = Jaccard(alphabet=None, qval=range(2, 4))
    self.assertAlmostEqual(
        ranged_jaccard.sim('abc', 'abcd'), 0.42857142857142855
    )

    ranged_linkage = AverageLinkage(qval=range(2, 4))
    self.assertAlmostEqual(
        ranged_linkage.sim('abc', 'abcd'), 0.22558922558922556
    )

    ranged_alpha_jaccard = Jaccard(
        alphabet='abcdefghijklmnop', qval=range(2, 4)
    )
    self.assertAlmostEqual(
        ranged_alpha_jaccard.sim('abc', 'abcd'), 0.42857142857142855
    )

    ws_alpha_jaccard = Jaccard(
        alphabet='abcdefghijklmnop', tokenizer=WhitespaceTokenizer()
    )
    self.assertAlmostEqual(ws_alpha_jaccard.sim('abc', 'abcd'), 0.0)

    list_alpha_jaccard = Jaccard(alphabet=list('abcdefghijklmnop'))
    self.assertAlmostEqual(list_alpha_jaccard.sim('abc', 'abcd'), 0.5)

    char_jaccard = Jaccard(tokenizer=CharacterTokenizer())
    self.assertAlmostEqual(char_jaccard.sim('abc', 'abcd'), 0.75)
class CosineSimilarityTestCases(unittest.TestCase):
    """Test cosine similarity functions.

    abydos.distance.Cosine
    """

    cmp = Cosine()
    cmp_q2 = Cosine(tokenizer=QGrams(2))
    cmp_ws = Cosine(tokenizer=WhitespaceTokenizer())

    def test_cosine_sim(self):
        """Test abydos.distance.Cosine.sim."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        names_sim = 4 / math.sqrt(7 * 8)
        phrases_sim = 4 / math.sqrt(9 * 7)

        edge_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), names_sim
            )

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.sim(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.sim(qgram_counts('nelson'), qgram_counts('neilsen')),
            names_sim,
        )

        # non-q-gram tests
        ws_cases = (('', '', 1), ('the quick', '', 0), ('', 'the quick', 0))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.sim(src, tar), expected)
        self.assertAlmostEqual(
            self.cmp_ws.sim(NONQ_FROM, NONQ_TO), phrases_sim
        )
        self.assertAlmostEqual(
            self.cmp_ws.sim(NONQ_TO, NONQ_FROM), phrases_sim
        )

        self.assertEqual(self.cmp_q2.sim('eh', 'a'), 0.0)

        # Test wrapper
        self.assertAlmostEqual(sim_cosine('nelson', 'neilsen'), names_sim)

    def test_cosine_dist(self):
        """Test abydos.distance.Cosine.dist."""

        def qgram_counts(text):
            """Return the counter from a default QGrams tokenization."""
            return QGrams().tokenize(text).get_counter()

        names_dist = 1 - (4 / math.sqrt(7 * 8))
        phrases_dist = 1 - 4 / math.sqrt(9 * 7)

        edge_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), names_dist
            )

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist(qgram_counts(src), qgram_counts(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.dist(qgram_counts('nelson'), qgram_counts('neilsen')),
            names_dist,
        )

        # non-q-gram tests
        ws_cases = (('', '', 0), ('the quick', '', 1), ('', 'the quick', 1))
        for src, tar, expected in ws_cases:
            self.assertEqual(self.cmp_ws.dist(src, tar), expected)
        self.assertAlmostEqual(
            self.cmp_ws.dist(NONQ_FROM, NONQ_TO), phrases_dist
        )
        self.assertAlmostEqual(
            self.cmp_ws.dist(NONQ_TO, NONQ_FROM), phrases_dist
        )

        # Test wrapper
        self.assertAlmostEqual(dist_cosine('nelson', 'neilsen'), names_dist)
class ManhattanTestCases(unittest.TestCase):
    """Test Manhattan functions.

    abydos.distance.Manhattan
    """

    cmp = Manhattan()
    cmp_q2 = Manhattan(tokenizer=QGrams(2))
    cmp_ws = Manhattan(tokenizer=WhitespaceTokenizer())

    @staticmethod
    def _qgram_counts(text):
        """Tokenize text with a fresh default QGrams; return its counter."""
        return QGrams().tokenize(text).get_counter()

    def test_manhattan_dist_abs(self):
        """Test abydos.distance.Manhattan.dist_abs."""
        # The default instance and the QGrams(2) instance are held to the
        # same expected values.
        for metric in (self.cmp, self.cmp_q2):
            self.assertEqual(metric.dist_abs('', ''), 0)
            self.assertEqual(metric.dist_abs('nelson', ''), 7)
            self.assertEqual(metric.dist_abs('', 'neilsen'), 8)
            self.assertAlmostEqual(metric.dist_abs('nelson', 'neilsen'), 7)

        # supplied q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('nelson', '', 7),
            ('', 'neilsen', 8),
        ):
            self.assertEqual(
                self.cmp.dist_abs(
                    self._qgram_counts(src), self._qgram_counts(tar)
                ),
                want,
            )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                self._qgram_counts('nelson'), self._qgram_counts('neilsen')
            ),
            7,
        )

        # non-q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('the quick', '', 2),
            ('', 'the quick', 2),
        ):
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), want)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist_abs(src, tar), 8)

        # Test wrapper
        self.assertAlmostEqual(manhattan('nelson', 'neilsen'), 7)

    def test_manhattan_sim(self):
        """Test abydos.distance.Manhattan.sim."""
        names_expected = 8 / 15
        phrases_expected = 1 / 2

        for metric in (self.cmp, self.cmp_q2):
            self.assertEqual(metric.sim('', ''), 1)
            self.assertEqual(metric.sim('nelson', ''), 0)
            self.assertEqual(metric.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(
                metric.sim('nelson', 'neilsen'), names_expected
            )

        # supplied q-gram tests
        for src, tar, want in (
            ('', '', 1),
            ('nelson', '', 0),
            ('', 'neilsen', 0),
        ):
            self.assertEqual(
                self.cmp.sim(
                    self._qgram_counts(src), self._qgram_counts(tar)
                ),
                want,
            )
        self.assertAlmostEqual(
            self.cmp.sim(
                self._qgram_counts('nelson'), self._qgram_counts('neilsen')
            ),
            names_expected,
        )

        # non-q-gram tests
        for src, tar, want in (
            ('', '', 1),
            ('the quick', '', 0),
            ('', 'the quick', 0),
        ):
            self.assertEqual(self.cmp_ws.sim(src, tar), want)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(
                self.cmp_ws.sim(src, tar), phrases_expected
            )

        # Test wrapper
        self.assertAlmostEqual(
            sim_manhattan('nelson', 'neilsen'), names_expected
        )

    def test_manhattan_dist(self):
        """Test abydos.distance.Manhattan.dist."""
        names_expected = 7 / 15
        phrases_expected = 1 / 2

        for metric in (self.cmp, self.cmp_q2):
            self.assertEqual(metric.dist('', ''), 0)
            self.assertEqual(metric.dist('nelson', ''), 1)
            self.assertEqual(metric.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(
                metric.dist('nelson', 'neilsen'), names_expected
            )

        # supplied q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        ):
            self.assertEqual(
                self.cmp.dist(
                    self._qgram_counts(src), self._qgram_counts(tar)
                ),
                want,
            )
        self.assertAlmostEqual(
            self.cmp.dist(
                self._qgram_counts('nelson'), self._qgram_counts('neilsen')
            ),
            names_expected,
        )

        # non-q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('the quick', '', 1),
            ('', 'the quick', 1),
        ):
            self.assertEqual(self.cmp_ws.dist(src, tar), want)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(
                self.cmp_ws.dist(src, tar), phrases_expected
            )

        # Test wrapper
        self.assertAlmostEqual(
            dist_manhattan('nelson', 'neilsen'), names_expected
        )
class OverlapTestCases(unittest.TestCase):
    """Test overlap functions.

    abydos.distance.Overlap
    """

    cmp = Overlap()
    cmp_q2 = Overlap(tokenizer=QGrams(2))
    cmp_ws = Overlap(tokenizer=WhitespaceTokenizer())

    @staticmethod
    def _qgram_counts(text):
        """Tokenize text with a fresh default QGrams; return its counter."""
        return QGrams().tokenize(text).get_counter()

    def test_overlap_sim(self):
        """Test abydos.distance.Overlap.sim."""
        expected = 4 / 7

        # The default instance and the QGrams(2) instance are held to the
        # same expected values.
        for metric in (self.cmp, self.cmp_q2):
            self.assertEqual(metric.sim('', ''), 1)
            self.assertEqual(metric.sim('nelson', ''), 0)
            self.assertEqual(metric.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(metric.sim('nelson', 'neilsen'), expected)

        # supplied q-gram tests
        for src, tar, want in (
            ('', '', 1),
            ('nelson', '', 0),
            ('', 'neilsen', 0),
        ):
            self.assertEqual(
                self.cmp.sim(
                    self._qgram_counts(src), self._qgram_counts(tar)
                ),
                want,
            )
        self.assertAlmostEqual(
            self.cmp.sim(
                self._qgram_counts('nelson'), self._qgram_counts('neilsen')
            ),
            expected,
        )

        # non-q-gram tests
        for src, tar, want in (
            ('', '', 1),
            ('the quick', '', 0),
            ('', 'the quick', 0),
        ):
            self.assertEqual(self.cmp_ws.sim(src, tar), want)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.sim(src, tar), expected)

    def test_overlap_dist(self):
        """Test abydos.distance.Overlap.dist."""
        expected = 3 / 7

        for metric in (self.cmp, self.cmp_q2):
            self.assertEqual(metric.dist('', ''), 0)
            self.assertEqual(metric.dist('nelson', ''), 1)
            self.assertEqual(metric.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(metric.dist('nelson', 'neilsen'), expected)

        # supplied q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        ):
            self.assertEqual(
                self.cmp.dist(
                    self._qgram_counts(src), self._qgram_counts(tar)
                ),
                want,
            )
        self.assertAlmostEqual(
            self.cmp.dist(
                self._qgram_counts('nelson'), self._qgram_counts('neilsen')
            ),
            expected,
        )

        # non-q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('the quick', '', 1),
            ('', 'the quick', 1),
        ):
            self.assertEqual(self.cmp_ws.dist(src, tar), want)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist(src, tar), expected)
from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

# Maps a short tokenizer label to the bound ``tokenize`` method of a single
# tokenizer instance. Every tokenizer is constructed with its default
# arguments, except the NLTK wrapper, which is given a ``TweetTokenizer``.
algorithms = {
    'corvcluster': COrVClusterTokenizer().tokenize,
    'cvcluster': CVClusterTokenizer().tokenize,
    'character': CharacterTokenizer().tokenize,
    'legalipy': LegaliPyTokenizer().tokenize,
    'nltk': NLTKTokenizer(nltk_tokenizer=TweetTokenizer()).tokenize,
    'qgrams': QGrams().tokenize,
    'qskipgrams': QSkipgrams().tokenize,
    'regexp': RegexpTokenizer().tokenize,
    'saps': SAPSTokenizer().tokenize,
    'sonoripy': SonoriPyTokenizer().tokenize,
    'vccluster': VCClusterTokenizer().tokenize,
    'whitespace': WhitespaceTokenizer().tokenize,
    'wordpunct': WordpunctTokenizer().tokenize,
}


class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each tokenizer against the BLNS set.

    Here, we test each algorithm against each string, but we only care that it
    does not result in an exception.

    While not actually a fuzz test, this does serve the purpose of looking for
    errors resulting from unanticipated input.
    """

    # NOTE(review): the method name starts with ``fuzz_`` rather than
    # ``test``, so unittest's default loader (prefix ``test``) will not
    # collect it -- presumably it is selected explicitly by the runner;
    # confirm against the project's test configuration.
    def fuzz_test_blns(self):
        """Test each tokenizer against the BLNS set."""
class TverskyIndexTestCases(unittest.TestCase):
    """Test Tversky functions.

    abydos.distance.Tversky
    """

    cmp = Tversky()
    cmp_q2 = Tversky(tokenizer=QGrams(2))
    cmp_ws = Tversky(tokenizer=WhitespaceTokenizer())

    @staticmethod
    def _qgram_counts(text):
        """Tokenize text with a fresh default QGrams; return its counter."""
        return QGrams().tokenize(text).get_counter()

    def test_tversky_sim(self):
        """Test abydos.distance.Tversky.sim."""
        names_expected = 4 / 11
        phrases_expected = 1 / 3

        # The default instance and the QGrams(2) instance are held to the
        # same expected values.
        for metric in (self.cmp, self.cmp_q2):
            self.assertEqual(metric.sim('', ''), 1)
            self.assertEqual(metric.sim('nelson', ''), 0)
            self.assertEqual(metric.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(
                metric.sim('nelson', 'neilsen'), names_expected
            )

        # test valid alpha & beta
        # The instance is built while the arguments are evaluated, so only
        # the ``sim`` call itself runs under assertRaises.
        for alpha, beta in ((-1.0, -1.0), (-1.0, 0.0), (0.0, -1.0)):
            self.assertRaises(
                ValueError,
                Tversky(alpha=alpha, beta=beta).sim,
                'abcd',
                'dcba',
            )

        # test empty QGrams
        long_grams = Tversky(tokenizer=QGrams(7, start_stop=''))
        self.assertAlmostEqual(long_grams.sim('nelson', 'neilsen'), 0.0)

        # test unequal alpha & beta
        for alpha, beta, want in (
            (2.0, 1.0, 3 / 11),
            (1.0, 2.0, 3 / 10),
            (2.0, 2.0, 3 / 13),
        ):
            weighted = Tversky(alpha=alpha, beta=beta, tokenizer=QGrams(2))
            self.assertAlmostEqual(weighted.sim('niall', 'neal'), want)

        # test bias parameter
        for alpha, beta, want in (
            (1.0, 1.0, 7 / 11),
            (2.0, 1.0, 7 / 9),
            (1.0, 2.0, 7 / 15),
            (2.0, 2.0, 7 / 11),
        ):
            biased = Tversky(
                alpha=alpha, beta=beta, bias=0.5, tokenizer=QGrams(2)
            )
            self.assertAlmostEqual(biased.sim('niall', 'neal'), want)

        # supplied q-gram tests
        for src, tar, want in (
            ('', '', 1),
            ('nelson', '', 0),
            ('', 'neilsen', 0),
        ):
            self.assertEqual(
                self.cmp.sim(
                    self._qgram_counts(src), self._qgram_counts(tar)
                ),
                want,
            )
        self.assertAlmostEqual(
            self.cmp.sim(
                self._qgram_counts('nelson'), self._qgram_counts('neilsen')
            ),
            names_expected,
        )

        # non-q-gram tests
        for src, tar, want in (
            ('', '', 1),
            ('the quick', '', 0),
            ('', 'the quick', 0),
        ):
            self.assertEqual(self.cmp_ws.sim(src, tar), want)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(
                self.cmp_ws.sim(src, tar), phrases_expected
            )

        # Test wrapper
        self.assertAlmostEqual(
            sim_tversky('nelson', 'neilsen'), names_expected
        )

    def test_tversky_dist(self):
        """Test abydos.distance.Tversky.dist."""
        names_expected = 7 / 11
        phrases_expected = 2 / 3

        for metric in (self.cmp, self.cmp_q2):
            self.assertEqual(metric.dist('', ''), 0)
            self.assertEqual(metric.dist('nelson', ''), 1)
            self.assertEqual(metric.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(
                metric.dist('nelson', 'neilsen'), names_expected
            )

        # test valid alpha & beta
        # The instance is built while the arguments are evaluated, so only
        # the ``dist`` call itself runs under assertRaises.
        for alpha, beta in ((-1.0, -1.0), (-1.0, 0.0), (0.0, -1.0)):
            self.assertRaises(
                ValueError,
                Tversky(alpha=alpha, beta=beta).dist,
                'abcd',
                'dcba',
            )

        # test empty QGrams
        long_grams = Tversky(tokenizer=QGrams(7, start_stop=''))
        self.assertAlmostEqual(long_grams.dist('nelson', 'neilsen'), 1.0)

        # test unequal alpha & beta
        for alpha, beta, want in (
            (2.0, 1.0, 8 / 11),
            (1.0, 2.0, 7 / 10),
            (2.0, 2.0, 10 / 13),
        ):
            weighted = Tversky(alpha=alpha, beta=beta, tokenizer=QGrams(2))
            self.assertAlmostEqual(weighted.dist('niall', 'neal'), want)

        # test bias parameter
        for alpha, beta, want in (
            (1.0, 1.0, 4 / 11),
            (2.0, 1.0, 2 / 9),
            (1.0, 2.0, 8 / 15),
            (2.0, 2.0, 4 / 11),
        ):
            biased = Tversky(
                alpha=alpha, beta=beta, bias=0.5, tokenizer=QGrams(2)
            )
            self.assertAlmostEqual(biased.dist('niall', 'neal'), want)

        # supplied q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        ):
            self.assertEqual(
                self.cmp.dist(
                    self._qgram_counts(src), self._qgram_counts(tar)
                ),
                want,
            )
        self.assertAlmostEqual(
            self.cmp.dist(
                self._qgram_counts('nelson'), self._qgram_counts('neilsen')
            ),
            names_expected,
        )

        # non-q-gram tests
        for src, tar, want in (
            ('', '', 0),
            ('the quick', '', 1),
            ('', 'the quick', 1),
        ):
            self.assertEqual(self.cmp_ws.dist(src, tar), want)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(
                self.cmp_ws.dist(src, tar), phrases_expected
            )

        # Test wrapper
        self.assertAlmostEqual(
            dist_tversky('nelson', 'neilsen'), names_expected
        )