def test_fuzzy_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (fuzzy).

    Runs the comparator stored on the test case as ``cmp_j_fuzzy`` over a
    table of string pairs, then checks one extra pair with a comparator
    constructed with only ``intersection_type='fuzzy'``.
    """
    # Base cases: results are compared with exact equality.
    exact_cases = (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    )
    for src, tar, expected in exact_cases:
        self.assertEqual(self.cmp_j_fuzzy.sim(src, tar), expected)

    # Remaining pairs are compared approximately.
    approx_cases = (
        ('abcd', 'efgh', 0.1111111111111111),
        ('Nigel', 'Niall', 0.6),
        ('Niall', 'Nigel', 0.6),
        ('Colin', 'Coiln', 1.0),
        ('Coiln', 'Colin', 1.0),
        ('ATCAACGAGT', 'AACGATTAG', 0.9565217391304348),
    )
    for src, tar, expected in approx_cases:
        self.assertAlmostEqual(self.cmp_j_fuzzy.sim(src, tar), expected)

    # Comparator built inline, leaving every other option at its default.
    default_fuzzy = Jaccard(intersection_type='fuzzy')
    self.assertAlmostEqual(
        default_fuzzy.sim('synonym', 'antonym'), 0.3333333333333333
    )
def test_soft_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (soft).

    Runs the comparator stored on the test case as ``cmp_j_soft`` over a
    table of string pairs, then checks one extra pair with a soft
    comparator built inline with a ``JaroWinkler()`` metric.
    """
    # Base cases: results are compared with exact equality.
    exact_cases = (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    )
    for src, tar, expected in exact_cases:
        self.assertEqual(self.cmp_j_soft.sim(src, tar), expected)

    # Remaining pairs are compared approximately.
    approx_cases = (
        ('abcd', 'efgh', 0.11111111),
        ('Nigel', 'Niall', 0.5),
        ('Niall', 'Nigel', 0.5),
        ('Colin', 'Coiln', 0.6),
        ('Coiln', 'Colin', 0.6),
        ('ATCAACGAGT', 'AACGATTAG', 0.68),
    )
    for src, tar, expected in approx_cases:
        self.assertAlmostEqual(self.cmp_j_soft.sim(src, tar), expected)

    # Soft intersection with an explicitly supplied metric.
    jw_soft = Jaccard(intersection_type='soft', metric=JaroWinkler())
    self.assertAlmostEqual(jw_soft.sim('synonym', 'antonym'), 0.777777777777)
def test_linkage_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (group linkage).

    Runs the comparator stored on the test case as ``cmp_j_linkage`` over
    a table of string pairs, then checks one extra pair with a linkage
    comparator built inline (``JaroWinkler()`` metric, threshold 0.2).
    """
    # Base cases: results are compared with exact equality.
    exact_cases = (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    )
    for src, tar, expected in exact_cases:
        self.assertEqual(self.cmp_j_linkage.sim(src, tar), expected)

    # Remaining pairs are compared approximately.
    approx_cases = (
        ('abcd', 'efgh', 0.1111111111111111),
        ('Nigel', 'Niall', 0.5),
        ('Niall', 'Nigel', 0.5),
        ('Colin', 'Coiln', 0.6),
        ('Coiln', 'Colin', 0.6),
        ('ATCAACGAGT', 'AACGATTAG', 0.68),
    )
    for src, tar, expected in approx_cases:
        self.assertAlmostEqual(self.cmp_j_linkage.sim(src, tar), expected)

    # Linkage intersection with an explicit metric and threshold.
    jw_linkage = Jaccard(
        intersection_type='linkage', metric=JaroWinkler(), threshold=0.2
    )
    self.assertAlmostEqual(jw_linkage.sim('synonym', 'antonym'), 0.6)
class MongeElkanTestCases(unittest.TestCase):
    """Test Monge-Elkan functions.

    abydos.distance.MongeElkan

    Integer-valued expectations (0 and 1) are checked with exact equality.
    Every fractional expectation is checked with ``assertAlmostEqual`` so
    the tests do not depend on the last bit of a floating-point result.
    """

    # Comparators shared by both tests.
    cmp = MongeElkan()
    cmp_sym = MongeElkan(symmetric=True)
    cmp_jac = MongeElkan(sim_func=Jaccard())
    cmp_jac_sim = MongeElkan(sim_func=Jaccard().sim)

    def test_monge_elkan_sim(self):
        """Test abydos.distance.MongeElkan.sim."""
        # Integer-valued results: exact comparison is safe.
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('', 'a'), 0)
        self.assertEqual(self.cmp.sim('a', 'a'), 1)

        # Fractional results: compare approximately rather than bit-for-bit.
        fractional_cases = (
            (self.cmp, 'Niall', 'Neal', 3 / 4),
            (self.cmp, 'Niall', 'Njall', 5 / 6),
            (self.cmp, 'Niall', 'Niel', 3 / 4),
            (self.cmp, 'Niall', 'Nigel', 3 / 4),
            (self.cmp_sym, 'Niall', 'Neal', 31 / 40),
            (self.cmp_sym, 'Niall', 'Njall', 5 / 6),
            (self.cmp_sym, 'Niall', 'Niel', 31 / 40),
            (self.cmp_sym, 'Niall', 'Nigel', 17 / 24),
            # sim_func given as an object and as a bound method.
            (self.cmp_jac, 'Njall', 'Neil', 29 / 60),
            (self.cmp_jac_sim, 'Njall', 'Neil', 29 / 60),
        )
        for comparator, src, tar, expected in fractional_cases:
            self.assertAlmostEqual(comparator.sim(src, tar), expected)

    def test_monge_elkan_dist(self):
        """Test abydos.distance.MongeElkan.dist."""
        # Integer-valued results: exact comparison is safe.
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('', 'a'), 1)

        # Fractional results: compare approximately rather than bit-for-bit.
        fractional_cases = (
            (self.cmp, 'Niall', 'Neal', 1 / 4),
            (self.cmp, 'Niall', 'Njall', 1 / 6),
            (self.cmp, 'Niall', 'Niel', 1 / 4),
            (self.cmp, 'Niall', 'Nigel', 1 / 4),
            (self.cmp_sym, 'Niall', 'Neal', 9 / 40),
            (self.cmp_sym, 'Niall', 'Njall', 1 / 6),
            (self.cmp_sym, 'Niall', 'Niel', 9 / 40),
            (self.cmp_sym, 'Niall', 'Nigel', 7 / 24),
        )
        for comparator, src, tar, expected in fractional_cases:
            self.assertAlmostEqual(comparator.dist(src, tar), expected)
class TanimotoTestCases(unittest.TestCase):
    """Test Tanimoto functions.

    abydos.distance.Jaccard.tanimoto_coeff
    """

    cmp = Jaccard()

    def test_jaccard_tanimoto_coeff(self):
        """Test abydos.distance.Jaccard.tanimoto_coeff.

        The same four (src, tar) pairs are run through three calling
        conventions (plain strings, strings plus a third argument of 2,
        and pre-built ``QGrams`` objects), followed by the calls that pass
        0 as third argument and the module-level ``tanimoto`` wrapper.
        """
        neg_inf = float('-inf')
        log2_4_11 = math.log(4 / 11, 2)
        log2_1_3 = math.log(1 / 3, 2)

        # Each builder turns a (src, tar) pair into the positional
        # arguments for one calling convention.
        arg_builders = (
            lambda src, tar: (src, tar),
            lambda src, tar: (src, tar, 2),
            # supplied q-gram tests
            lambda src, tar: (QGrams(src), QGrams(tar)),
        )
        for build in arg_builders:
            self.assertEqual(self.cmp.tanimoto_coeff(*build('', '')), 0)
            self.assertEqual(
                self.cmp.tanimoto_coeff(*build('nelson', '')), neg_inf
            )
            self.assertEqual(
                self.cmp.tanimoto_coeff(*build('', 'neilsen')), neg_inf
            )
            self.assertAlmostEqual(
                self.cmp.tanimoto_coeff(*build('nelson', 'neilsen')),
                log2_4_11,
            )

        # non-q-gram tests (third argument 0)
        self.assertEqual(self.cmp.tanimoto_coeff('', '', 0), 0)
        for src, tar in (('the quick', ''), ('', 'the quick')):
            self.assertEqual(self.cmp.tanimoto_coeff(src, tar, 0), neg_inf)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(
                self.cmp.tanimoto_coeff(src, tar, 0), log2_1_3
            )

        # Test wrapper
        self.assertAlmostEqual(tanimoto('nelson', 'neilsen'), log2_4_11)
def test_soft_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (soft).

    Runs the comparator stored on the test case as ``cmp_j_soft`` over a
    table of string pairs, then checks whitespace-tokenized phrases in
    both argument orders, and finally that combining a ``JaroWinkler()``
    metric with a ``WhitespaceTokenizer()`` raises ``TypeError``.
    """
    # Base cases: results are compared with exact equality.
    exact_cases = (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    )
    for src, tar, expected in exact_cases:
        self.assertEqual(self.cmp_j_soft.sim(src, tar), expected)

    # Remaining pairs are compared approximately.
    approx_cases = (
        ('abcd', 'efgh', 0.11111111),
        ('Nigel', 'Niall', 0.5),
        ('Niall', 'Nigel', 0.5),
        ('Colin', 'Coiln', 0.6),
        ('Coiln', 'Colin', 0.6),
        ('ATCAACGAGT', 'AACGATTAG', 0.68),
    )
    for src, tar, expected in approx_cases:
        self.assertAlmostEqual(self.cmp_j_soft.sim(src, tar), expected)

    # Word-level soft intersection; same expectation in either order.
    phrase_pairs = (
        ('junior system analyst', 'systems analyst'),
        ('systems analyst', 'junior system analyst'),
    )
    for src, tar in phrase_pairs:
        # A fresh comparator is built for each direction.
        ws_soft = Jaccard(
            intersection_type='soft', tokenizer=WhitespaceTokenizer()
        )
        self.assertAlmostEqual(ws_soft.sim(src, tar), 0.6190476190476191)

    # Construction and the call both stay inside the context manager, so
    # a TypeError raised by either one satisfies the assertion.
    with self.assertRaises(TypeError):
        Jaccard(
            intersection_type='soft',
            metric=JaroWinkler(),
            tokenizer=WhitespaceTokenizer(),
        ).sim('junior system analyst', 'systems analyst')
class JaccardTestCases(unittest.TestCase):
    """Test Jaccard functions.

    abydos.distance.Jaccard
    """

    cmp = Jaccard()

    def _check_measure(
        self, measure, wrapper, both_empty, one_empty, nelson, nonq
    ):
        """Run the shared case list against one measure.

        Args:
            measure: bound comparator method (``self.cmp.sim`` or
                ``self.cmp.dist``).
            wrapper: module-level function checked last.
            both_empty: exact expectation when both inputs are empty.
            one_empty: exact expectation when exactly one input is empty.
            nelson: approximate expectation for 'nelson' vs 'neilsen'.
            nonq: approximate expectation for NONQ_FROM vs NONQ_TO (either
                order) with a third argument of 0.
        """
        # Each builder turns a (src, tar) pair into the positional
        # arguments for one calling convention.
        arg_builders = (
            lambda src, tar: (src, tar),
            lambda src, tar: (src, tar, 2),
            # supplied q-gram tests
            lambda src, tar: (QGrams(src), QGrams(tar)),
        )
        for build in arg_builders:
            self.assertEqual(measure(*build('', '')), both_empty)
            self.assertEqual(measure(*build('nelson', '')), one_empty)
            self.assertEqual(measure(*build('', 'neilsen')), one_empty)
            self.assertAlmostEqual(
                measure(*build('nelson', 'neilsen')), nelson
            )

        # non-q-gram tests (third argument 0)
        self.assertEqual(measure('', '', 0), both_empty)
        self.assertEqual(measure('the quick', '', 0), one_empty)
        self.assertEqual(measure('', 'the quick', 0), one_empty)
        self.assertAlmostEqual(measure(NONQ_FROM, NONQ_TO, 0), nonq)
        self.assertAlmostEqual(measure(NONQ_TO, NONQ_FROM, 0), nonq)

        # Test wrapper
        self.assertAlmostEqual(wrapper('nelson', 'neilsen'), nelson)

    def test_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim."""
        self._check_measure(self.cmp.sim, sim_jaccard, 1, 0, 4 / 11, 1 / 3)

    def test_jaccard_dist(self):
        """Test abydos.distance.Jaccard.dist."""
        self._check_measure(self.cmp.dist, dist_jaccard, 0, 1, 7 / 11, 2 / 3)
class MetaLevenshteinTestCases(unittest.TestCase):
    """Test MetaLevenshtein functions.

    abydos.distance.MetaLevenshtein

    Results that are exactly 0.0, 1.0 or 3.0 are compared with
    ``assertEqual``. Every other expectation is a rounded float and is
    compared with ``assertAlmostEqual``.
    """

    # Comparators shared by the tests below.
    cmp = MetaLevenshtein()
    cmp_jac1 = MetaLevenshtein(metric=Jaccard(qval=1))

    def _check(self, measure, exact_cases, approx_cases):
        """Assert ``measure(src, tar)`` for each (src, tar, expected) row.

        Rows in ``exact_cases`` use exact equality; rows in
        ``approx_cases`` use ``assertAlmostEqual``.
        """
        for src, tar, expected in exact_cases:
            self.assertEqual(measure(src, tar), expected)
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(measure(src, tar), expected)

    def test_meta_levenshtein_dist(self):
        """Test abydos.distance.MetaLevenshtein.dist."""
        # Base cases
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
        )
        approx_cases = (
            # Non-trivial float: approximate, not bit-exact.
            ('abcd', 'efgh', 0.8463953614713058),
            ('Nigel', 'Niall', 0.3077801314),
            ('Niall', 'Nigel', 0.3077801314),
            ('Colin', 'Coiln', 0.3077801314),
            ('Coiln', 'Colin', 0.3077801314),
            ('ATCAACGAGT', 'AACGATTAG', 0.2931752664),
        )
        self._check(self.cmp.dist, exact_cases, approx_cases)

    def test_meta_levenshtein_sim(self):
        """Test abydos.distance.MetaLevenshtein.sim."""
        # Base cases
        exact_cases = (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        )
        approx_cases = (
            # Non-trivial float: approximate, not bit-exact.
            ('abcd', 'efgh', 0.15360463852869422),
            ('Nigel', 'Niall', 0.6922198686),
            ('Niall', 'Nigel', 0.6922198686),
            ('Colin', 'Coiln', 0.6922198686),
            ('Coiln', 'Colin', 0.6922198686),
            ('ATCAACGAGT', 'AACGATTAG', 0.7068247336),
        )
        self._check(self.cmp.sim, exact_cases, approx_cases)

        # Same pairs with the comparator built on Jaccard(qval=1).
        jaccard_cases = (
            ('Nigel', 'Niall', 0.569107816),
            ('Niall', 'Nigel', 0.569107816),
            ('Colin', 'Coiln', 0.753775895),
            ('Coiln', 'Colin', 0.753775895),
            ('ATCAACGAGT', 'AACGATTAG', 0.5746789477),
        )
        self._check(self.cmp_jac1.sim, (), jaccard_cases)

    def test_meta_levenshtein_dist_abs(self):
        """Test abydos.distance.MetaLevenshtein.dist_abs."""
        # Base cases
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 3.0),
            ('', 'abc', 3.0),
            ('abc', 'abc', 0.0),
        )
        approx_cases = (
            # Non-trivial float: approximate, not bit-exact.
            ('abcd', 'efgh', 3.385581445885223),
            ('Nigel', 'Niall', 1.5389006572),
            ('Niall', 'Nigel', 1.5389006572),
            ('Colin', 'Coiln', 1.5389006572),
            ('Coiln', 'Colin', 1.5389006572),
            ('ATCAACGAGT', 'AACGATTAG', 2.9317526638),
        )
        self._check(self.cmp.dist_abs, exact_cases, approx_cases)

    def test_meta_levenshtein_corpus(self):
        """Test abydos.distance.MetaLevenshtein with corpus."""
        q3_corpus = UnigramCorpus(word_tokenizer=QGrams(qval=3))
        # NOTE(review): presumably fetches the 'en_qgram' data package when
        # it is not already present -- confirm whether this needs network.
        download_package('en_qgram', silent=True)
        q3_corpus.load_corpus(
            os.path.join(package_path('en_qgram'), 'q3_en.dat')
        )
        cmp_q3 = MetaLevenshtein(tokenizer=QGrams(qval=3), corpus=q3_corpus)

        # (measure, expectation for Nigel/Niall, expectation for Colin/Coiln);
        # each pair is checked in both argument orders.
        corpus_cases = (
            (cmp_q3.dist_abs, 7.378939370, 8.0),
            (cmp_q3.dist, 0.527067098, 0.571428571),
            (cmp_q3.sim, 0.472932902, 0.428571429),
        )
        for measure, nigel_niall, colin_coiln in corpus_cases:
            self.assertAlmostEqual(measure('Nigel', 'Niall'), nigel_niall)
            self.assertAlmostEqual(measure('Niall', 'Nigel'), nigel_niall)
            self.assertAlmostEqual(measure('Colin', 'Coiln'), colin_coiln)
            self.assertAlmostEqual(measure('Coiln', 'Colin'), colin_coiln)
class JaccardTestCases(unittest.TestCase):
    """Test Jaccard functions.

    abydos.distance.Jaccard
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def _check_measure(
        self, method_name, wrapper, both_empty, one_empty, nelson, nonq
    ):
        """Run the shared case list against one comparator method.

        Args:
            method_name: name of the comparator method ('sim' or 'dist').
            wrapper: module-level function checked last.
            both_empty: exact expectation when both inputs are empty.
            one_empty: exact expectation when exactly one input is empty.
            nelson: approximate expectation for 'nelson' vs 'neilsen'.
            nonq: approximate expectation for NONQ_FROM vs NONQ_TO (either
                order) using the whitespace-tokenizing comparator.
        """

        def as_is(word):
            return word

        def as_counter(word):
            # supplied q-gram tests: pass pre-tokenized counters
            return QGrams().tokenize(word).get_counter()

        # (comparator, input preparation) for each calling convention.
        variants = (
            (self.cmp, as_is),
            (self.cmp_q2, as_is),
            (self.cmp, as_counter),
        )
        for comparator, prepare in variants:
            measure = getattr(comparator, method_name)
            self.assertEqual(measure(prepare(''), prepare('')), both_empty)
            self.assertEqual(
                measure(prepare('nelson'), prepare('')), one_empty
            )
            self.assertEqual(
                measure(prepare(''), prepare('neilsen')), one_empty
            )
            self.assertAlmostEqual(
                measure(prepare('nelson'), prepare('neilsen')), nelson
            )

        # non-q-gram tests
        ws_measure = getattr(self.cmp_ws, method_name)
        self.assertEqual(ws_measure('', ''), both_empty)
        self.assertEqual(ws_measure('the quick', ''), one_empty)
        self.assertEqual(ws_measure('', 'the quick'), one_empty)
        self.assertAlmostEqual(ws_measure(NONQ_FROM, NONQ_TO), nonq)
        self.assertAlmostEqual(ws_measure(NONQ_TO, NONQ_FROM), nonq)

        # Test wrapper
        self.assertAlmostEqual(wrapper('nelson', 'neilsen'), nelson)

    def test_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim."""
        self._check_measure('sim', sim_jaccard, 1, 0, 4 / 11, 1 / 3)

    def test_jaccard_dist(self):
        """Test abydos.distance.Jaccard.dist."""
        self._check_measure('dist', dist_jaccard, 0, 1, 7 / 11, 2 / 3)
class TanimotoTestCases(unittest.TestCase):
    """Test Tanimoto functions.

    abydos.distance.Jaccard.tanimoto_coeff
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def test_jaccard_tanimoto_coeff(self):
        """Test abydos.distance.Jaccard.tanimoto_coeff.

        The same four (src, tar) pairs are run through ``cmp`` with
        strings, ``cmp_q2`` with strings, and ``cmp`` with pre-tokenized
        counters, followed by the whitespace-tokenizer cases and the
        module-level ``tanimoto`` wrapper.
        """
        neg_inf = float('-inf')
        log2_4_11 = log2(4 / 11)
        log2_1_3 = log2(1 / 3)

        def as_is(word):
            return word

        def as_counter(word):
            # supplied q-gram tests: pass pre-tokenized counters
            return QGrams().tokenize(word).get_counter()

        # (comparator, input preparation) for each calling convention.
        variants = (
            (self.cmp, as_is),
            (self.cmp_q2, as_is),
            (self.cmp, as_counter),
        )
        for comparator, prepare in variants:
            self.assertEqual(
                comparator.tanimoto_coeff(prepare(''), prepare('')), 0
            )
            self.assertEqual(
                comparator.tanimoto_coeff(prepare('nelson'), prepare('')),
                neg_inf,
            )
            self.assertEqual(
                comparator.tanimoto_coeff(prepare(''), prepare('neilsen')),
                neg_inf,
            )
            self.assertAlmostEqual(
                comparator.tanimoto_coeff(
                    prepare('nelson'), prepare('neilsen')
                ),
                log2_4_11,
            )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.tanimoto_coeff('', ''), 0)
        for src, tar in (('the quick', ''), ('', 'the quick')):
            self.assertEqual(self.cmp_ws.tanimoto_coeff(src, tar), neg_inf)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(
                self.cmp_ws.tanimoto_coeff(src, tar), log2_1_3
            )

        # Test wrapper
        self.assertAlmostEqual(tanimoto('nelson', 'neilsen'), log2_4_11)
class TokenDistanceTestCases(unittest.TestCase):
    """Test _TokenDistance functions.

    abydos.distance._TokenDistance

    The Jaccard comparators below differ only in their intersection
    options. Results that are exactly 0.0 or 1.0 are compared with
    ``assertEqual``; every other expectation is a float and is compared
    with ``assertAlmostEqual``.
    """

    cmp_j_crisp = Jaccard(intersection_type='crisp')
    cmp_j_soft = Jaccard(intersection_type='soft')
    cmp_j_fuzzy = Jaccard(
        intersection_type='fuzzy', metric=DamerauLevenshtein(), threshold=0.4
    )
    cmp_j_linkage = Jaccard(intersection_type='linkage')
    cmp_j_linkage_int = Jaccard(
        intersection_type='linkage', internal_assignment_problem=True
    )

    # Base cases shared by every intersection type: (src, tar, expected).
    _BASE_CASES = (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    )

    def _check_sims(self, comparator, exact_cases, approx_cases):
        """Assert ``comparator.sim(src, tar)`` for each table row.

        Rows in ``exact_cases`` use exact equality; rows in
        ``approx_cases`` use ``assertAlmostEqual``.
        """
        for src, tar, expected in exact_cases:
            self.assertEqual(comparator.sim(src, tar), expected)
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(comparator.sim(src, tar), expected)

    def test_crisp_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (crisp)."""
        # The 'abcd'/'efgh' expectation is exactly 0.0 here, so it stays
        # an exact comparison.
        exact_cases = self._BASE_CASES + (('abcd', 'efgh', 0.0),)
        approx_cases = (
            ('Nigel', 'Niall', 0.3333333333),
            ('Niall', 'Nigel', 0.3333333333),
            ('Colin', 'Coiln', 0.3333333333),
            ('Coiln', 'Colin', 0.3333333333),
            ('ATCAACGAGT', 'AACGATTAG', 0.5),
        )
        self._check_sims(self.cmp_j_crisp, exact_cases, approx_cases)

    def test_soft_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (soft)."""
        approx_cases = (
            ('abcd', 'efgh', 0.11111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        )
        self._check_sims(self.cmp_j_soft, self._BASE_CASES, approx_cases)

        # Soft intersection with an explicitly supplied metric.
        jw_soft = Jaccard(intersection_type='soft', metric=JaroWinkler())
        self.assertAlmostEqual(
            jw_soft.sim('synonym', 'antonym'), 0.777777777777
        )

    def test_fuzzy_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (fuzzy)."""
        approx_cases = (
            ('abcd', 'efgh', 0.1111111111111111),
            ('Nigel', 'Niall', 0.6),
            ('Niall', 'Nigel', 0.6),
            ('Colin', 'Coiln', 1.0),
            ('Coiln', 'Colin', 1.0),
            ('ATCAACGAGT', 'AACGATTAG', 0.9565217391304348),
        )
        self._check_sims(self.cmp_j_fuzzy, self._BASE_CASES, approx_cases)

        # Fuzzy intersection with every other option left at its default.
        default_fuzzy = Jaccard(intersection_type='fuzzy')
        self.assertAlmostEqual(
            default_fuzzy.sim('synonym', 'antonym'), 0.3333333333333333
        )

    def test_linkage_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (group linkage)."""
        approx_cases = (
            ('abcd', 'efgh', 0.1111111111111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        )
        self._check_sims(self.cmp_j_linkage, self._BASE_CASES, approx_cases)

        # Same pairs with internal_assignment_problem=True. The
        # 'abcd'/'efgh' value is a non-trivial float, so it is compared
        # approximately like the matching cmp_j_linkage case above.
        internal_cases = (
            ('abcd', 'efgh', 0.1111111111111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.6),
            ('Colin', 'Coiln', 0.5625),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.75),
        )
        self._check_sims(
            self.cmp_j_linkage_int, self._BASE_CASES, internal_cases
        )

        # Linkage intersection with an explicit metric and threshold.
        jw_linkage = Jaccard(
            intersection_type='linkage', metric=JaroWinkler(), threshold=0.2
        )
        self.assertAlmostEqual(jw_linkage.sim('synonym', 'antonym'), 0.6)

    def test_token_distance(self):
        """Test abydos.distance._TokenDistance members.

        Covers constructor option combinations, Counter inputs, the
        SokalMichener normalizers, and the private set/cardinality
        accessors after tokenizing 'synonym' and 'antonym'.
        """
        dna_src, dna_tar = 'ATCAACGAGT', 'AACGATTAG'

        # Alphabet given as an int, as a string, and alongside a tokenizer.
        observed = Jaccard(intersection_type='soft', alphabet=24).sim(
            dna_src, dna_tar
        )
        self.assertAlmostEqual(observed, 0.68)
        observed = Jaccard(qval=1, alphabet='CGAT').sim(dna_src, dna_tar)
        self.assertAlmostEqual(observed, 0.9)
        observed = Jaccard(
            tokenizer=QSkipgrams(qval=3), alphabet='CGAT'
        ).sim(dna_src, dna_tar)
        self.assertAlmostEqual(observed, 0.6372795969773299)
        observed = Jaccard(alphabet=None).sim('synonym', 'antonym')
        self.assertAlmostEqual(observed, 0.3333333333333333)
        observed = Jaccard(tokenizer=QSkipgrams(qval=3)).sim(
            'synonym', 'antonym'
        )
        self.assertAlmostEqual(observed, 0.34146341463414637)

        # Counters passed directly in place of strings.
        src_ctr = Counter(a=5, b=2, c=10)
        tar_ctr = Counter(a=2, c=1, d=3, e=12)
        self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

        # One row per SokalMichener constructor option set; the empty
        # dict is the default constructor.
        normalizer_cases = (
            ({'normalizer': 'proportional'}, 0.984777917351113),
            ({'normalizer': 'log'}, 1.2385752469545532),
            ({'normalizer': 'exp', 'alphabet': 0}, 3.221246147982545e18),
            ({'normalizer': 'laplace'}, 0.98856416772554),
            ({'normalizer': 'inverse'}, 197.95790155440417),
            ({'normalizer': 'complement'}, 1.0204081632653061),
            ({'normalizer': 'base case'}, 0.9897959183673469),
            ({}, 0.9897959183673469),
        )
        for options, expected in normalizer_cases:
            self.assertAlmostEqual(
                SokalMichener(**options).sim('synonym', 'antonym'), expected
            )

        # Expected tokens, grouped by which word they occur in. Each
        # token is unique, so Counter(tuple) gives every one a count of 1.
        src_only = ('$s', 'sy', 'yn', 'no')
        shared = ('on', 'ny', 'ym', 'm#')
        tar_only = ('$a', 'an', 'nt', 'to')

        sm = SokalMichener()
        sm._tokenize('synonym', 'antonym')  # noqa: SF01

        # Private accessors, checked in this order with exact equality.
        member_expectations = (
            (
                '_get_tokens',
                (Counter(src_only + shared), Counter(tar_only + shared)),
            ),
            ('_src_card', 8),
            ('_tar_card', 8),
            ('_symmetric_difference', Counter(src_only + tar_only)),
            ('_symmetric_difference_card', 8),
            ('_total_complement_card', 772),
            ('_population_card', 788),
            ('_union', Counter(src_only + shared + tar_only)),
            ('_union_card', 12),
            (
                '_difference',
                # Zero and negative counts are part of the expected value.
                Counter(
                    {
                        **dict.fromkeys(src_only, 1),
                        **dict.fromkeys(shared, 0),
                        **dict.fromkeys(tar_only, -1),
                    }
                ),
            ),
            ('_intersection', Counter(shared)),
            (
                '_get_confusion_table',
                ConfusionTable(tp=4, tn=772, fp=4, fn=4),
            ),
        )
        for member, expected in member_expectations:
            self.assertEqual(getattr(sm, member)(), expected)

        # Alphabet supplied as a Counter.
        sm = SokalMichener(alphabet=Counter(dict.fromkeys('CGAT', 20)), qval=1)
        sm._tokenize(dna_src, dna_tar)  # noqa: SF01
        self.assertEqual(sm._total_complement_card(), 61)  # noqa: SF01

        jac = Jaccard(
            intersection_type='linkage', internal_assignment_problem=True
        )
        self.assertAlmostEqual(jac.sim('abandonned', 'abandoned'), 1.0)
        self.assertAlmostEqual(
            jac.sim('abundacies', 'abundances'), 0.6296296296296297
        )

        # Some additional constructors needed to complete test coverage.
        # Factories keep each construction immediately before its own call.
        constructor_cases = (
            (
                lambda: Jaccard(alphabet=None, qval=range(2, 4)),
                0.42857142857142855,
            ),
            (
                lambda: AverageLinkage(qval=range(2, 4)),
                0.22558922558922556,
            ),
            (
                lambda: Jaccard(
                    alphabet='abcdefghijklmnop', qval=range(2, 4)
                ),
                0.42857142857142855,
            ),
            (
                lambda: Jaccard(
                    alphabet='abcdefghijklmnop',
                    tokenizer=WhitespaceTokenizer(),
                ),
                0.0,
            ),
            (lambda: Jaccard(alphabet=list('abcdefghijklmnop')), 0.5),
            (lambda: Jaccard(tokenizer=CharacterTokenizer()), 0.75),
        )
        for make_comparator, expected in constructor_cases:
            self.assertAlmostEqual(
                make_comparator().sim('abc', 'abcd'), expected
            )
def test_token_distance(self):
    """Test abydos.distance._TokenDistance members.

    Covers constructor option combinations, Counter inputs, the
    SokalMichener normalizers, and the private set/cardinality accessors
    after tokenizing 'synonym' and 'antonym'.
    """
    dna_src, dna_tar = 'ATCAACGAGT', 'AACGATTAG'

    # Alphabet given as an int, as a string, and alongside a tokenizer.
    observed = Jaccard(intersection_type='soft', alphabet=24).sim(
        dna_src, dna_tar
    )
    self.assertAlmostEqual(observed, 0.68)
    observed = Jaccard(qval=1, alphabet='CGAT').sim(dna_src, dna_tar)
    self.assertAlmostEqual(observed, 0.9)
    observed = Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT').sim(
        dna_src, dna_tar
    )
    self.assertAlmostEqual(observed, 0.6372795969773299)
    observed = Jaccard(alphabet=None).sim('synonym', 'antonym')
    self.assertAlmostEqual(observed, 0.3333333333333333)
    observed = Jaccard(tokenizer=QSkipgrams(qval=3)).sim(
        'synonym', 'antonym'
    )
    self.assertAlmostEqual(observed, 0.34146341463414637)

    # Counters passed directly in place of strings.
    src_ctr = Counter(a=5, b=2, c=10)
    tar_ctr = Counter(a=2, c=1, d=3, e=12)
    self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

    # One row per SokalMichener constructor option set; the empty dict is
    # the default constructor.
    normalizer_cases = (
        ({'normalizer': 'proportional'}, 0.984777917351113),
        ({'normalizer': 'log'}, 1.2385752469545532),
        ({'normalizer': 'exp', 'alphabet': 0}, 3.221246147982545e18),
        ({'normalizer': 'laplace'}, 0.98856416772554),
        ({'normalizer': 'inverse'}, 197.95790155440417),
        ({'normalizer': 'complement'}, 1.0204081632653061),
        ({'normalizer': 'base case'}, 0.9897959183673469),
        ({}, 0.9897959183673469),
    )
    for options, expected in normalizer_cases:
        self.assertAlmostEqual(
            SokalMichener(**options).sim('synonym', 'antonym'), expected
        )

    # Expected tokens, grouped by which word they occur in. Each token is
    # unique, so Counter(tuple) gives every one a count of 1.
    src_only = ('$s', 'sy', 'yn', 'no')
    shared = ('on', 'ny', 'ym', 'm#')
    tar_only = ('$a', 'an', 'nt', 'to')

    sm = SokalMichener()
    sm._tokenize('synonym', 'antonym')  # noqa: SF01

    # Private accessors, checked in this order with exact equality.
    member_expectations = (
        (
            '_get_tokens',
            (Counter(src_only + shared), Counter(tar_only + shared)),
        ),
        ('_src_card', 8),
        ('_tar_card', 8),
        ('_symmetric_difference', Counter(src_only + tar_only)),
        ('_symmetric_difference_card', 8),
        ('_total_complement_card', 772),
        ('_population_card', 788),
        ('_union', Counter(src_only + shared + tar_only)),
        ('_union_card', 12),
        (
            '_difference',
            # Zero and negative counts are part of the expected value.
            Counter(
                {
                    **dict.fromkeys(src_only, 1),
                    **dict.fromkeys(shared, 0),
                    **dict.fromkeys(tar_only, -1),
                }
            ),
        ),
        ('_intersection', Counter(shared)),
        ('_get_confusion_table', ConfusionTable(tp=4, tn=772, fp=4, fn=4)),
    )
    for member, expected in member_expectations:
        self.assertEqual(getattr(sm, member)(), expected)

    # Alphabet supplied as a Counter.
    sm = SokalMichener(alphabet=Counter(dict.fromkeys('CGAT', 20)), qval=1)
    sm._tokenize(dna_src, dna_tar)  # noqa: SF01
    self.assertEqual(sm._total_complement_card(), 61)  # noqa: SF01

    jac = Jaccard(
        intersection_type='linkage', internal_assignment_problem=True
    )
    self.assertAlmostEqual(jac.sim('abandonned', 'abandoned'), 1.0)
    self.assertAlmostEqual(
        jac.sim('abundacies', 'abundances'), 0.6296296296296297
    )

    # Some additional constructors needed to complete test coverage.
    # Factories keep each construction immediately before its own call.
    constructor_cases = (
        (
            lambda: Jaccard(alphabet=None, qval=range(2, 4)),
            0.42857142857142855,
        ),
        (lambda: AverageLinkage(qval=range(2, 4)), 0.22558922558922556),
        (
            lambda: Jaccard(alphabet='abcdefghijklmnop', qval=range(2, 4)),
            0.42857142857142855,
        ),
        (
            lambda: Jaccard(
                alphabet='abcdefghijklmnop', tokenizer=WhitespaceTokenizer()
            ),
            0.0,
        ),
        (lambda: Jaccard(alphabet=list('abcdefghijklmnop')), 0.5),
        (lambda: Jaccard(tokenizer=CharacterTokenizer()), 0.75),
    )
    for make_comparator, expected in constructor_cases:
        self.assertAlmostEqual(make_comparator().sim('abc', 'abcd'), expected)
def test_pairwise_similarity_statistics(self):
    """Test abydos.stats.pairwise_similarity_statistics.

    Each scenario calls ``pairwise_similarity_statistics`` and compares the
    returned (max, min, mean, std) 4-tuple against reference values with
    ``assertAlmostEqual``. The final section checks that invalid arguments
    raise ``ValueError``.
    """

    def verify(expected, *args, **kwargs):
        # Unpack into exactly four names so a result of any other length
        # still fails, as a direct tuple assignment would.
        high, low, average, spread = pairwise_similarity_statistics(
            *args, **kwargs
        )
        for observed, wanted in zip((high, low, average, spread), expected):
            self.assertAlmostEqual(observed, wanted)

    # Default arguments
    verify(
        (1.0, 0.11764705882352944, 0.4188369879201684, 0.2265099631340623),
        NIALL,
        NIALL,
    )
    verify(
        (
            0.8333333333333334,
            0.11764705882352944,
            0.30474877450980387,
            0.1842666797571549,
        ),
        NIALL,
        ('Kneal',),
    )

    # Symmetric comparison
    verify(
        (1.0, 0.11764705882352944, 0.4188369879201679, 0.22650996313406255),
        NIALL,
        NIALL,
        symmetric=True,
    )
    verify(
        (
            0.8333333333333334,
            0.11764705882352944,
            0.304748774509804,
            0.18426667975715486,
        ),
        NIALL,
        ('Kneal',),
        symmetric=True,
    )

    # Plain strings passed in place of collections
    verify(
        (0.6666666666666667, 0.0, 0.08499999999999999, 0.16132265804901677),
        'The quick brown fox',
        'jumped over the lazy dog.',
    )
    verify(
        (0.16666666666666663, 0.16666666666666663, 0.16666666666666663, 0.0),
        'The',
        'jumped',
    )

    # Explicitly supplied metric callables
    verify(
        (1.0, 0.0, 0.23226906681010506, 0.24747101181262784),
        NIALL,
        NIALL,
        metric=Jaccard().sim,
    )
    verify(
        (1.0, 0.0, 0.3352660334967324, 0.18394505847524578),
        NIALL,
        NIALL,
        metric=JaroWinkler().dist,
    )

    # Harmonic mean as the averaging function
    verify(
        (1.0, 0.11764705882352944, 0.30718771249150056, 0.25253182790044676),
        NIALL,
        NIALL,
        mean_func=hmean,
    )

    # Invalid arguments must raise ValueError
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(NIALL, NIALL, mean_func='mean')
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(NIALL, NIALL, metric='Levenshtein')
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(5, NIALL)
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(NIALL, 5)
def __init__(self):
    """Create the helper objects this instance keeps as attributes.

    Attributes set:
        stop_words: list built from ``stopwords.words('english')``.
        lemmatizer: a ``WordNetLemmatizer`` instance.
        jaccard_sim_obj: a ``Jaccard`` instance built with default
            arguments.
        elmo: the ``hub.Module`` created from the ELMo URL below with
            ``trainable=True``.

    NOTE(review): ``stopwords``/``WordNetLemmatizer`` are presumably NLTK
    and ``hub`` presumably TensorFlow Hub -- confirm against the file's
    imports.
    """
    english_words = stopwords.words('english')
    self.stop_words = list(english_words)

    self.lemmatizer = WordNetLemmatizer()
    self.jaccard_sim_obj = Jaccard()

    elmo_location = "https://tfhub.dev/google/elmo/2"
    self.elmo = hub.Module(elmo_location, trainable=True)