Exemple #1
0
    def test_fuzzy_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (fuzzy).

        Trivial inputs are compared exactly against ``self.cmp_j_fuzzy``;
        every other expectation is compared approximately.
        """
        fuzzy_sim = self.cmp_j_fuzzy.sim

        # Base cases (exact comparison)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(fuzzy_sim(src, tar), expected)

        # Remaining cases (approximate comparison)
        for src, tar, expected in (
            ('abcd', 'efgh', 0.1111111111111111),
            ('Nigel', 'Niall', 0.6),
            ('Niall', 'Nigel', 0.6),
            ('Colin', 'Coiln', 1.0),
            ('Coiln', 'Colin', 1.0),
            ('ATCAACGAGT', 'AACGATTAG', 0.9565217391304348),
        ):
            self.assertAlmostEqual(fuzzy_sim(src, tar), expected)

        # A fuzzy Jaccard built with no other arguments
        plain_fuzzy = Jaccard(intersection_type='fuzzy')
        self.assertAlmostEqual(
            plain_fuzzy.sim('synonym', 'antonym'), 0.3333333333333333
        )
Exemple #2
0
    def test_soft_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (soft).

        Trivial inputs are compared exactly against ``self.cmp_j_soft``;
        every other expectation is compared approximately.
        """
        soft_sim = self.cmp_j_soft.sim

        # Base cases (exact comparison)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(soft_sim(src, tar), expected)

        # Remaining cases (approximate comparison)
        for src, tar, expected in (
            ('abcd', 'efgh', 0.11111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        ):
            self.assertAlmostEqual(soft_sim(src, tar), expected)

        # A soft Jaccard constructed with a JaroWinkler metric
        jw_soft = Jaccard(intersection_type='soft', metric=JaroWinkler())
        self.assertAlmostEqual(
            jw_soft.sim('synonym', 'antonym'), 0.777777777777
        )
Exemple #3
0
    def test_linkage_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (group linkage).

        Trivial inputs are compared exactly against ``self.cmp_j_linkage``;
        every other expectation is compared approximately.
        """
        linkage_sim = self.cmp_j_linkage.sim

        # Base cases (exact comparison)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(linkage_sim(src, tar), expected)

        # Remaining cases (approximate comparison)
        for src, tar, expected in (
            ('abcd', 'efgh', 0.1111111111111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        ):
            self.assertAlmostEqual(linkage_sim(src, tar), expected)

        # A linkage Jaccard with a JaroWinkler metric and a 0.2 threshold
        jw_linkage = Jaccard(
            intersection_type='linkage',
            metric=JaroWinkler(),
            threshold=0.2,
        )
        self.assertAlmostEqual(jw_linkage.sim('synonym', 'antonym'), 0.6)
Exemple #4
0
class MongeElkanTestCases(unittest.TestCase):
    """Test Monge-Elkan functions.

    abydos.distance.MongeElkan
    """

    # Measures shared by the tests below.
    cmp = MongeElkan()
    cmp_sym = MongeElkan(symmetric=True)
    # The same Jaccard measure supplied two ways: as an object and as its
    # bound ``sim`` method.
    cmp_jac = MongeElkan(sim_func=Jaccard())
    cmp_jac_sim = MongeElkan(sim_func=Jaccard().sim)

    def test_monge_elkan_sim(self):
        """Test abydos.distance.MongeElkan.sim.

        Trivial inputs are compared exactly.  Fractional expectations are
        compared approximately so the test does not hinge on bit-exact
        floating-point rounding, matching test_monge_elkan_dist.
        """
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('', 'a'), 0)
        self.assertEqual(self.cmp.sim('a', 'a'), 1)

        self.assertAlmostEqual(self.cmp.sim('Niall', 'Neal'), 3 / 4)
        self.assertAlmostEqual(self.cmp.sim('Niall', 'Njall'), 5 / 6)
        self.assertAlmostEqual(self.cmp.sim('Niall', 'Niel'), 3 / 4)
        self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 3 / 4)

        # symmetric=True variant
        self.assertAlmostEqual(self.cmp_sym.sim('Niall', 'Neal'), 31 / 40)
        self.assertAlmostEqual(self.cmp_sym.sim('Niall', 'Njall'), 5 / 6)
        self.assertAlmostEqual(self.cmp_sym.sim('Niall', 'Niel'), 31 / 40)
        self.assertAlmostEqual(self.cmp_sym.sim('Niall', 'Nigel'), 17 / 24)

        # sim_func given as a Jaccard object and as Jaccard().sim
        self.assertAlmostEqual(self.cmp_jac.sim('Njall', 'Neil'), 29 / 60)
        self.assertAlmostEqual(self.cmp_jac_sim.sim('Njall', 'Neil'), 29 / 60)

    def test_monge_elkan_dist(self):
        """Test abydos.distance.MongeElkan.dist.

        Trivial inputs are compared exactly; fractional expectations are
        compared approximately.
        """
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('', 'a'), 1)

        self.assertAlmostEqual(self.cmp.dist('Niall', 'Neal'), 1 / 4)
        self.assertAlmostEqual(self.cmp.dist('Niall', 'Njall'), 1 / 6)
        self.assertAlmostEqual(self.cmp.dist('Niall', 'Niel'), 1 / 4)
        self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 1 / 4)

        # symmetric=True variant
        self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Neal'), 9 / 40)
        self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Njall'), 1 / 6)
        self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Niel'), 9 / 40)
        self.assertAlmostEqual(self.cmp_sym.dist('Niall', 'Nigel'), 7 / 24)
Exemple #5
0
class TanimotoTestCases(unittest.TestCase):
    """Test Tanimoto functions.

    abydos.distance.Jaccard.tanimoto_coeff
    """

    cmp = Jaccard()

    def test_jaccard_tanimoto_coeff(self):
        """Test abydos.distance.Jaccard.tanimoto_coeff.

        Empty-versus-empty is expected to give 0, empty-versus-non-empty
        negative infinity, and the remaining pairs a base-2 logarithm that
        is compared approximately.
        """
        coeff = self.cmp.tanimoto_coeff
        neg_inf = float('-inf')

        # no third argument first, then an explicit third argument of 2
        for extra in ((), (2,)):
            self.assertEqual(coeff('', '', *extra), 0)
            self.assertEqual(coeff('nelson', '', *extra), neg_inf)
            self.assertEqual(coeff('', 'neilsen', *extra), neg_inf)
            self.assertAlmostEqual(
                coeff('nelson', 'neilsen', *extra), math.log(4 / 11, 2)
            )

        # supplied q-gram tests
        self.assertEqual(coeff(QGrams(''), QGrams('')), 0)
        self.assertEqual(coeff(QGrams('nelson'), QGrams('')), neg_inf)
        self.assertEqual(coeff(QGrams(''), QGrams('neilsen')), neg_inf)
        self.assertAlmostEqual(
            coeff(QGrams('nelson'), QGrams('neilsen')), math.log(4 / 11, 2)
        )

        # non-q-gram tests
        self.assertEqual(coeff('', '', 0), 0)
        self.assertEqual(coeff('the quick', '', 0), neg_inf)
        self.assertEqual(coeff('', 'the quick', 0), neg_inf)
        self.assertAlmostEqual(
            coeff(NONQ_FROM, NONQ_TO, 0), math.log(1 / 3, 2)
        )
        self.assertAlmostEqual(
            coeff(NONQ_TO, NONQ_FROM, 0), math.log(1 / 3, 2)
        )

        # Test wrapper
        self.assertAlmostEqual(
            tanimoto('nelson', 'neilsen'), math.log(4 / 11, 2)
        )
Exemple #6
0
    def test_soft_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (soft).

        Covers ``self.cmp_j_soft``, a whitespace-tokenized soft Jaccard in
        both argument orders, and a configuration expected to raise
        TypeError.
        """
        soft_sim = self.cmp_j_soft.sim

        # Base cases (exact comparison)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(soft_sim(src, tar), expected)

        # Remaining cases (approximate comparison)
        for src, tar, expected in (
            ('abcd', 'efgh', 0.11111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        ):
            self.assertAlmostEqual(soft_sim(src, tar), expected)

        # Whitespace tokens: same expected value for either argument order;
        # a fresh measure is built for each call.
        for src, tar in (
            ('junior system analyst', 'systems analyst'),
            ('systems analyst', 'junior system analyst'),
        ):
            ws_soft = Jaccard(
                intersection_type='soft', tokenizer=WhitespaceTokenizer()
            )
            self.assertAlmostEqual(ws_soft.sim(src, tar), 0.6190476190476191)

        # Construction and the call both stay inside the context manager,
        # so a TypeError from either one satisfies the assertion.
        with self.assertRaises(TypeError):
            Jaccard(
                intersection_type='soft',
                metric=JaroWinkler(),
                tokenizer=WhitespaceTokenizer(),
            ).sim('junior system analyst', 'systems analyst')
Exemple #7
0
class JaccardTestCases(unittest.TestCase):
    """Test Jaccard functions.

    abydos.distance.Jaccard
    """

    cmp = Jaccard()

    def test_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim.

        Exercises string inputs (with and without a third argument),
        pre-built QGrams inputs, the third-argument-0 form, and the
        sim_jaccard wrapper.
        """
        sim = self.cmp.sim

        # no third argument first, then an explicit third argument of 2
        for extra in ((), (2,)):
            self.assertEqual(sim('', '', *extra), 1)
            self.assertEqual(sim('nelson', '', *extra), 0)
            self.assertEqual(sim('', 'neilsen', *extra), 0)
            self.assertAlmostEqual(sim('nelson', 'neilsen', *extra), 4 / 11)

        # supplied q-gram tests
        self.assertEqual(sim(QGrams(''), QGrams('')), 1)
        self.assertEqual(sim(QGrams('nelson'), QGrams('')), 0)
        self.assertEqual(sim(QGrams(''), QGrams('neilsen')), 0)
        self.assertAlmostEqual(
            sim(QGrams('nelson'), QGrams('neilsen')), 4 / 11
        )

        # non-q-gram tests
        self.assertEqual(sim('', '', 0), 1)
        self.assertEqual(sim('the quick', '', 0), 0)
        self.assertEqual(sim('', 'the quick', 0), 0)
        self.assertAlmostEqual(sim(NONQ_FROM, NONQ_TO, 0), 1 / 3)
        self.assertAlmostEqual(sim(NONQ_TO, NONQ_FROM, 0), 1 / 3)

        # Test wrapper
        self.assertAlmostEqual(sim_jaccard('nelson', 'neilsen'), 4 / 11)

    def test_jaccard_dist(self):
        """Test abydos.distance.Jaccard.dist.

        Mirrors test_jaccard_sim with the complementary expected values.
        """
        dist = self.cmp.dist

        # no third argument first, then an explicit third argument of 2
        for extra in ((), (2,)):
            self.assertEqual(dist('', '', *extra), 0)
            self.assertEqual(dist('nelson', '', *extra), 1)
            self.assertEqual(dist('', 'neilsen', *extra), 1)
            self.assertAlmostEqual(dist('nelson', 'neilsen', *extra), 7 / 11)

        # supplied q-gram tests
        self.assertEqual(dist(QGrams(''), QGrams('')), 0)
        self.assertEqual(dist(QGrams('nelson'), QGrams('')), 1)
        self.assertEqual(dist(QGrams(''), QGrams('neilsen')), 1)
        self.assertAlmostEqual(
            dist(QGrams('nelson'), QGrams('neilsen')), 7 / 11
        )

        # non-q-gram tests
        self.assertEqual(dist('', '', 0), 0)
        self.assertEqual(dist('the quick', '', 0), 1)
        self.assertEqual(dist('', 'the quick', 0), 1)
        self.assertAlmostEqual(dist(NONQ_FROM, NONQ_TO, 0), 2 / 3)
        self.assertAlmostEqual(dist(NONQ_TO, NONQ_FROM, 0), 2 / 3)

        # Test wrapper
        self.assertAlmostEqual(dist_jaccard('nelson', 'neilsen'), 7 / 11)
class MetaLevenshteinTestCases(unittest.TestCase):
    """Test MetaLevenshtein functions.

    abydos.distance.MetaLevenshtein
    """

    # Default measure, and one constructed with metric=Jaccard(qval=1).
    cmp = MetaLevenshtein()
    cmp_jac1 = MetaLevenshtein(metric=Jaccard(qval=1))

    def test_meta_levenshtein_dist(self):
        """Test abydos.distance.MetaLevenshtein.dist."""
        # Base cases
        self.assertEqual(self.cmp.dist('', ''), 0.0)
        self.assertEqual(self.cmp.dist('a', ''), 1.0)
        self.assertEqual(self.cmp.dist('', 'a'), 1.0)
        self.assertEqual(self.cmp.dist('abc', ''), 1.0)
        self.assertEqual(self.cmp.dist('', 'abc'), 1.0)
        self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0)
        # Not a round value: compare approximately rather than bit-for-bit.
        self.assertAlmostEqual(
            self.cmp.dist('abcd', 'efgh'), 0.8463953614713058
        )

        self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3077801314)
        self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3077801314)
        self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.3077801314)
        self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.3077801314)
        self.assertAlmostEqual(
            self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.2931752664
        )

    def test_meta_levenshtein_sim(self):
        """Test abydos.distance.MetaLevenshtein.sim."""
        # Base cases
        self.assertEqual(self.cmp.sim('', ''), 1.0)
        self.assertEqual(self.cmp.sim('a', ''), 0.0)
        self.assertEqual(self.cmp.sim('', 'a'), 0.0)
        self.assertEqual(self.cmp.sim('abc', ''), 0.0)
        self.assertEqual(self.cmp.sim('', 'abc'), 0.0)
        self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0)
        # Not a round value: compare approximately rather than bit-for-bit.
        self.assertAlmostEqual(
            self.cmp.sim('abcd', 'efgh'), 0.15360463852869422
        )

        self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6922198686)
        self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6922198686)
        self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.6922198686)
        self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.6922198686)
        self.assertAlmostEqual(
            self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.7068247336
        )

        # Measure constructed with metric=Jaccard(qval=1)
        self.assertAlmostEqual(
            self.cmp_jac1.sim('Nigel', 'Niall'), 0.569107816
        )
        self.assertAlmostEqual(
            self.cmp_jac1.sim('Niall', 'Nigel'), 0.569107816
        )
        self.assertAlmostEqual(
            self.cmp_jac1.sim('Colin', 'Coiln'), 0.753775895
        )
        self.assertAlmostEqual(
            self.cmp_jac1.sim('Coiln', 'Colin'), 0.753775895
        )
        self.assertAlmostEqual(
            self.cmp_jac1.sim('ATCAACGAGT', 'AACGATTAG'), 0.5746789477
        )

    def test_meta_levenshtein_dist_abs(self):
        """Test abydos.distance.MetaLevenshtein.dist_abs."""
        # Base cases
        self.assertEqual(self.cmp.dist_abs('', ''), 0.0)
        self.assertEqual(self.cmp.dist_abs('a', ''), 1.0)
        self.assertEqual(self.cmp.dist_abs('', 'a'), 1.0)
        self.assertEqual(self.cmp.dist_abs('abc', ''), 3.0)
        self.assertEqual(self.cmp.dist_abs('', 'abc'), 3.0)
        self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0.0)
        # Not a round value: compare approximately rather than bit-for-bit.
        self.assertAlmostEqual(
            self.cmp.dist_abs('abcd', 'efgh'), 3.385581445885223
        )

        self.assertAlmostEqual(
            self.cmp.dist_abs('Nigel', 'Niall'), 1.5389006572
        )
        self.assertAlmostEqual(
            self.cmp.dist_abs('Niall', 'Nigel'), 1.5389006572
        )
        self.assertAlmostEqual(
            self.cmp.dist_abs('Colin', 'Coiln'), 1.5389006572
        )
        self.assertAlmostEqual(
            self.cmp.dist_abs('Coiln', 'Colin'), 1.5389006572
        )
        self.assertAlmostEqual(
            self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2.9317526638
        )

    def test_meta_levenshtein_corpus(self):
        """Test abydos.distance.MetaLevenshtein with corpus.

        Fetches the 'en_qgram' data package via download_package and loads
        its 'q3_en.dat' file into a UnigramCorpus tokenized with
        QGrams(qval=3), then checks dist_abs, dist and sim on that measure.
        """
        q3_corpus = UnigramCorpus(word_tokenizer=QGrams(qval=3))
        download_package('en_qgram', silent=True)
        q3_corpus.load_corpus(
            os.path.join(package_path('en_qgram'), 'q3_en.dat')
        )
        cmp_q3 = MetaLevenshtein(tokenizer=QGrams(qval=3), corpus=q3_corpus)

        self.assertAlmostEqual(cmp_q3.dist_abs('Nigel', 'Niall'), 7.378939370)
        self.assertAlmostEqual(cmp_q3.dist_abs('Niall', 'Nigel'), 7.378939370)
        self.assertAlmostEqual(cmp_q3.dist_abs('Colin', 'Coiln'), 8.0)
        self.assertAlmostEqual(cmp_q3.dist_abs('Coiln', 'Colin'), 8.0)

        self.assertAlmostEqual(cmp_q3.dist('Nigel', 'Niall'), 0.527067098)
        self.assertAlmostEqual(cmp_q3.dist('Niall', 'Nigel'), 0.527067098)
        self.assertAlmostEqual(cmp_q3.dist('Colin', 'Coiln'), 0.571428571)
        self.assertAlmostEqual(cmp_q3.dist('Coiln', 'Colin'), 0.571428571)

        self.assertAlmostEqual(cmp_q3.sim('Nigel', 'Niall'), 0.472932902)
        self.assertAlmostEqual(cmp_q3.sim('Niall', 'Nigel'), 0.472932902)
        self.assertAlmostEqual(cmp_q3.sim('Colin', 'Coiln'), 0.428571429)
        self.assertAlmostEqual(cmp_q3.sim('Coiln', 'Colin'), 0.428571429)
class JaccardTestCases(unittest.TestCase):
    """Test Jaccard functions.

    abydos.distance.Jaccard
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def test_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim.

        Exercises string inputs on the default and QGrams(2) measures,
        pre-tokenized counter inputs, the whitespace-tokenizer measure, and
        the sim_jaccard wrapper.
        """

        def counter_of(text):
            """Return the counter QGrams() yields after tokenizing text."""
            return QGrams().tokenize(text).get_counter()

        # default measure first, then the QGrams(2) measure
        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.sim('', ''), 1)
            self.assertEqual(measure.sim('nelson', ''), 0)
            self.assertEqual(measure.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 4 / 11)

        # supplied q-gram tests
        sim = self.cmp.sim
        self.assertEqual(sim(counter_of(''), counter_of('')), 1)
        self.assertEqual(sim(counter_of('nelson'), counter_of('')), 0)
        self.assertEqual(sim(counter_of(''), counter_of('neilsen')), 0)
        self.assertAlmostEqual(
            sim(counter_of('nelson'), counter_of('neilsen')), 4 / 11
        )

        # non-q-gram tests
        ws_sim = self.cmp_ws.sim
        self.assertEqual(ws_sim('', ''), 1)
        self.assertEqual(ws_sim('the quick', ''), 0)
        self.assertEqual(ws_sim('', 'the quick'), 0)
        self.assertAlmostEqual(ws_sim(NONQ_FROM, NONQ_TO), 1 / 3)
        self.assertAlmostEqual(ws_sim(NONQ_TO, NONQ_FROM), 1 / 3)

        # Test wrapper
        self.assertAlmostEqual(sim_jaccard('nelson', 'neilsen'), 4 / 11)

    def test_jaccard_dist(self):
        """Test abydos.distance.Jaccard.dist.

        Mirrors test_jaccard_sim with the complementary expected values.
        """

        def counter_of(text):
            """Return the counter QGrams() yields after tokenizing text."""
            return QGrams().tokenize(text).get_counter()

        # default measure first, then the QGrams(2) measure
        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.dist('', ''), 0)
            self.assertEqual(measure.dist('nelson', ''), 1)
            self.assertEqual(measure.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 7 / 11)

        # supplied q-gram tests
        dist = self.cmp.dist
        self.assertEqual(dist(counter_of(''), counter_of('')), 0)
        self.assertEqual(dist(counter_of('nelson'), counter_of('')), 1)
        self.assertEqual(dist(counter_of(''), counter_of('neilsen')), 1)
        self.assertAlmostEqual(
            dist(counter_of('nelson'), counter_of('neilsen')), 7 / 11
        )

        # non-q-gram tests
        ws_dist = self.cmp_ws.dist
        self.assertEqual(ws_dist('', ''), 0)
        self.assertEqual(ws_dist('the quick', ''), 1)
        self.assertEqual(ws_dist('', 'the quick'), 1)
        self.assertAlmostEqual(ws_dist(NONQ_FROM, NONQ_TO), 2 / 3)
        self.assertAlmostEqual(ws_dist(NONQ_TO, NONQ_FROM), 2 / 3)

        # Test wrapper
        self.assertAlmostEqual(dist_jaccard('nelson', 'neilsen'), 7 / 11)
class TanimotoTestCases(unittest.TestCase):
    """Test Tanimoto functions.

    abydos.distance.Jaccard.tanimoto_coeff
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def test_jaccard_tanimoto_coeff(self):
        """Test abydos.distance.Jaccard.tanimoto_coeff.

        Empty-versus-empty is expected to give 0, empty-versus-non-empty
        negative infinity, and the remaining pairs a base-2 logarithm that
        is compared approximately.
        """
        neg_inf = float('-inf')

        def counter_of(text):
            """Return the counter QGrams() yields after tokenizing text."""
            return QGrams().tokenize(text).get_counter()

        # default measure first, then the QGrams(2) measure
        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.tanimoto_coeff('', ''), 0)
            self.assertEqual(measure.tanimoto_coeff('nelson', ''), neg_inf)
            self.assertEqual(measure.tanimoto_coeff('', 'neilsen'), neg_inf)
            self.assertAlmostEqual(
                measure.tanimoto_coeff('nelson', 'neilsen'), log2(4 / 11)
            )

        # supplied q-gram tests
        coeff = self.cmp.tanimoto_coeff
        self.assertEqual(coeff(counter_of(''), counter_of('')), 0)
        self.assertEqual(coeff(counter_of('nelson'), counter_of('')), neg_inf)
        self.assertEqual(
            coeff(counter_of(''), counter_of('neilsen')), neg_inf
        )
        self.assertAlmostEqual(
            coeff(counter_of('nelson'), counter_of('neilsen')), log2(4 / 11)
        )

        # non-q-gram tests
        ws_coeff = self.cmp_ws.tanimoto_coeff
        self.assertEqual(ws_coeff('', ''), 0)
        self.assertEqual(ws_coeff('the quick', ''), neg_inf)
        self.assertEqual(ws_coeff('', 'the quick'), neg_inf)
        self.assertAlmostEqual(ws_coeff(NONQ_FROM, NONQ_TO), log2(1 / 3))
        self.assertAlmostEqual(ws_coeff(NONQ_TO, NONQ_FROM), log2(1 / 3))

        # Test wrapper
        self.assertAlmostEqual(tanimoto('nelson', 'neilsen'), log2(4 / 11))
Exemple #11
0
class TokenDistanceTestCases(unittest.TestCase):
    """Test _TokenDistance functions.

    abydos.distance._TokenDistance
    """

    # Jaccard measures shared by the tests, one per intersection_type value.
    cmp_j_crisp = Jaccard(intersection_type='crisp')
    cmp_j_soft = Jaccard(intersection_type='soft')
    # The fuzzy measure is built with a DamerauLevenshtein metric and a
    # threshold of 0.4.
    cmp_j_fuzzy = Jaccard(
        intersection_type='fuzzy', metric=DamerauLevenshtein(), threshold=0.4
    )
    cmp_j_linkage = Jaccard(intersection_type='linkage')
    # Linkage measure built with internal_assignment_problem=True.
    cmp_j_linkage_int = Jaccard(
        intersection_type='linkage', internal_assignment_problem=True
    )

    def test_crisp_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (crisp).

        Trivial and fully disjoint inputs are compared exactly against
        ``self.cmp_j_crisp``; the remaining pairs approximately.
        """
        crisp_sim = self.cmp_j_crisp.sim

        # Base cases (exact comparison)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        ):
            self.assertEqual(crisp_sim(src, tar), expected)

        # Remaining cases (approximate comparison)
        for src, tar, expected in (
            ('Nigel', 'Niall', 0.3333333333),
            ('Niall', 'Nigel', 0.3333333333),
            ('Colin', 'Coiln', 0.3333333333),
            ('Coiln', 'Colin', 0.3333333333),
            ('ATCAACGAGT', 'AACGATTAG', 0.5),
        ):
            self.assertAlmostEqual(crisp_sim(src, tar), expected)

    def test_soft_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (soft).

        Trivial inputs are compared exactly against ``self.cmp_j_soft``;
        every other expectation is compared approximately.
        """
        soft_sim = self.cmp_j_soft.sim

        # Base cases (exact comparison)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(soft_sim(src, tar), expected)

        # Remaining cases (approximate comparison)
        for src, tar, expected in (
            ('abcd', 'efgh', 0.11111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        ):
            self.assertAlmostEqual(soft_sim(src, tar), expected)

        # A soft Jaccard constructed with a JaroWinkler metric
        jw_soft = Jaccard(intersection_type='soft', metric=JaroWinkler())
        self.assertAlmostEqual(
            jw_soft.sim('synonym', 'antonym'), 0.777777777777
        )

    def test_fuzzy_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (fuzzy).

        Trivial inputs are compared exactly against ``self.cmp_j_fuzzy``;
        every other expectation is compared approximately.
        """
        fuzzy_sim = self.cmp_j_fuzzy.sim

        # Base cases (exact comparison)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(fuzzy_sim(src, tar), expected)

        # Remaining cases (approximate comparison)
        for src, tar, expected in (
            ('abcd', 'efgh', 0.1111111111111111),
            ('Nigel', 'Niall', 0.6),
            ('Niall', 'Nigel', 0.6),
            ('Colin', 'Coiln', 1.0),
            ('Coiln', 'Colin', 1.0),
            ('ATCAACGAGT', 'AACGATTAG', 0.9565217391304348),
        ):
            self.assertAlmostEqual(fuzzy_sim(src, tar), expected)

        # A fuzzy Jaccard built with no other arguments
        plain_fuzzy = Jaccard(intersection_type='fuzzy')
        self.assertAlmostEqual(
            plain_fuzzy.sim('synonym', 'antonym'), 0.3333333333333333
        )

    def test_linkage_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (group linkage).

        Runs the same input pairs through ``self.cmp_j_linkage`` and
        ``self.cmp_j_linkage_int``, then checks a linkage measure built with
        a JaroWinkler metric and a 0.2 threshold.  Trivial inputs are
        compared exactly; every non-trivial value is compared approximately.
        """
        # Base cases (cmp_j_linkage)
        self.assertEqual(self.cmp_j_linkage.sim('', ''), 1.0)
        self.assertEqual(self.cmp_j_linkage.sim('a', ''), 0.0)
        self.assertEqual(self.cmp_j_linkage.sim('', 'a'), 0.0)
        self.assertEqual(self.cmp_j_linkage.sim('abc', ''), 0.0)
        self.assertEqual(self.cmp_j_linkage.sim('', 'abc'), 0.0)
        self.assertEqual(self.cmp_j_linkage.sim('abc', 'abc'), 1.0)
        self.assertAlmostEqual(
            self.cmp_j_linkage.sim('abcd', 'efgh'), 0.1111111111111111
        )

        self.assertAlmostEqual(self.cmp_j_linkage.sim('Nigel', 'Niall'), 0.5)
        self.assertAlmostEqual(self.cmp_j_linkage.sim('Niall', 'Nigel'), 0.5)
        self.assertAlmostEqual(self.cmp_j_linkage.sim('Colin', 'Coiln'), 0.6)
        self.assertAlmostEqual(self.cmp_j_linkage.sim('Coiln', 'Colin'), 0.6)
        self.assertAlmostEqual(
            self.cmp_j_linkage.sim('ATCAACGAGT', 'AACGATTAG'), 0.68
        )

        # Base cases (cmp_j_linkage_int)
        self.assertEqual(self.cmp_j_linkage_int.sim('', ''), 1.0)
        self.assertEqual(self.cmp_j_linkage_int.sim('a', ''), 0.0)
        self.assertEqual(self.cmp_j_linkage_int.sim('', 'a'), 0.0)
        self.assertEqual(self.cmp_j_linkage_int.sim('abc', ''), 0.0)
        self.assertEqual(self.cmp_j_linkage_int.sim('', 'abc'), 0.0)
        self.assertEqual(self.cmp_j_linkage_int.sim('abc', 'abc'), 1.0)
        # Approximate, like the identical expectation for cmp_j_linkage
        # above: 0.111... is not exactly representable as a float.
        self.assertAlmostEqual(
            self.cmp_j_linkage_int.sim('abcd', 'efgh'), 0.1111111111111111
        )

        # The expected values here differ with argument order.
        self.assertAlmostEqual(
            self.cmp_j_linkage_int.sim('Nigel', 'Niall'), 0.5
        )
        self.assertAlmostEqual(
            self.cmp_j_linkage_int.sim('Niall', 'Nigel'), 0.6
        )
        self.assertAlmostEqual(
            self.cmp_j_linkage_int.sim('Colin', 'Coiln'), 0.5625
        )
        self.assertAlmostEqual(
            self.cmp_j_linkage_int.sim('Coiln', 'Colin'), 0.6
        )
        self.assertAlmostEqual(
            self.cmp_j_linkage_int.sim('ATCAACGAGT', 'AACGATTAG'), 0.75
        )

        # Linkage measure with a JaroWinkler metric and a 0.2 threshold
        self.assertAlmostEqual(
            Jaccard(
                intersection_type='linkage',
                metric=JaroWinkler(),
                threshold=0.2,
            ).sim('synonym', 'antonym'),
            0.6,
        )

    def test_token_distance(self):
        """Test abydos.distance._TokenDistance members.

        Drives the token-distance code through Jaccard, SokalMichener and
        AverageLinkage: constructor variants (alphabet, qval, tokenizer,
        normalizer), Counter inputs, and the private set-operation helpers.
        """
        almost_equal = self.assertAlmostEqual
        dna_src, dna_tar = 'ATCAACGAGT', 'AACGATTAG'
        word_src, word_tar = 'synonym', 'antonym'

        # Jaccard built with different alphabet/tokenizer arguments
        measure = Jaccard(intersection_type='soft', alphabet=24)
        almost_equal(measure.sim(dna_src, dna_tar), 0.68)

        measure = Jaccard(qval=1, alphabet='CGAT')
        almost_equal(measure.sim(dna_src, dna_tar), 0.9)

        measure = Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT')
        almost_equal(measure.sim(dna_src, dna_tar), 0.6372795969773299)

        measure = Jaccard(alphabet=None)
        almost_equal(measure.sim(word_src, word_tar), 0.3333333333333333)

        measure = Jaccard(tokenizer=QSkipgrams(qval=3))
        almost_equal(measure.sim(word_src, word_tar), 0.34146341463414637)

        # Counter objects passed to sim() in place of strings
        src_counts = Counter(a=5, b=2, c=10)
        tar_counts = Counter(a=2, c=1, d=3, e=12)
        almost_equal(Jaccard().sim(src_counts, tar_counts), 0.09375)

        # SokalMichener constructor arguments paired with the expected
        # similarity; the empty dict is the default construction.
        normalizer_cases = (
            ({'normalizer': 'proportional'}, 0.984777917351113),
            ({'normalizer': 'log'}, 1.2385752469545532),
            ({'normalizer': 'exp', 'alphabet': 0}, 3.221246147982545e18),
            ({'normalizer': 'laplace'}, 0.98856416772554),
            ({'normalizer': 'inverse'}, 197.95790155440417),
            ({'normalizer': 'complement'}, 1.0204081632653061),
            ({'normalizer': 'base case'}, 0.9897959183673469),
            ({}, 0.9897959183673469),
        )
        for ctor_kwargs, expected in normalizer_cases:
            measure = SokalMichener(**ctor_kwargs)
            almost_equal(measure.sim(word_src, word_tar), expected)

        # Expected token groups for 'synonym' vs. 'antonym'; the Counters
        # below are assembled from these three tuples.
        src_only = ('$s', 'sy', 'yn', 'no')
        tar_only = ('$a', 'an', 'nt', 'to')
        shared = ('on', 'ny', 'ym', 'm#')

        sokal = SokalMichener()
        sokal._tokenize(word_src, word_tar)  # noqa: SF01

        self.assertEqual(
            sokal._get_tokens(),  # noqa: SF01
            (Counter(src_only + shared), Counter(tar_only + shared)),
        )
        self.assertEqual(sokal._src_card(), 8)  # noqa: SF01
        self.assertEqual(sokal._tar_card(), 8)  # noqa: SF01
        self.assertEqual(
            sokal._symmetric_difference(),  # noqa: SF01
            Counter(src_only + tar_only),
        )
        self.assertEqual(sokal._symmetric_difference_card(), 8)  # noqa: SF01
        self.assertEqual(sokal._total_complement_card(), 772)  # noqa: SF01
        self.assertEqual(sokal._population_card(), 788)  # noqa: SF01
        self.assertEqual(
            sokal._union(),  # noqa: SF01
            Counter(src_only + shared + tar_only),
        )
        self.assertEqual(sokal._union_card(), 12)  # noqa: SF01

        # Expected _difference value: +1 for source-only tokens, 0 for
        # shared tokens, -1 for target-only tokens.
        expected_difference = dict.fromkeys(src_only, 1)
        expected_difference.update(dict.fromkeys(shared, 0))
        expected_difference.update(dict.fromkeys(tar_only, -1))
        self.assertEqual(
            sokal._difference(),  # noqa: SF01
            Counter(expected_difference),
        )
        self.assertEqual(sokal._intersection(), Counter(shared))  # noqa: SF01
        self.assertEqual(
            sokal._get_confusion_table(),  # noqa: SF01
            ConfusionTable(tp=4, tn=772, fp=4, fn=4),
        )

        # Alphabet supplied as a Counter
        sokal = SokalMichener(alphabet=Counter(dict.fromkeys('CGAT', 20)), qval=1)
        sokal._tokenize(dna_src, dna_tar)  # noqa: SF01
        self.assertEqual(sokal._total_complement_card(), 61)  # noqa: SF01

        # Linkage intersection with internal_assignment_problem enabled
        linkage_jaccard = Jaccard(
            intersection_type='linkage', internal_assignment_problem=True
        )
        for src, tar, expected in (
            ('abandonned', 'abandoned', 1.0),
            ('abundacies', 'abundances', 0.6296296296296297),
        ):
            almost_equal(linkage_jaccard.sim(src, tar), expected)

        # Some additional constructors needed to complete test coverage
        short, longer = 'abc', 'abcd'
        letters = 'abcdefghijklmnop'

        measure = Jaccard(alphabet=None, qval=range(2, 4))
        almost_equal(measure.sim(short, longer), 0.42857142857142855)

        measure = AverageLinkage(qval=range(2, 4))
        almost_equal(measure.sim(short, longer), 0.22558922558922556)

        measure = Jaccard(alphabet=letters, qval=range(2, 4))
        almost_equal(measure.sim(short, longer), 0.42857142857142855)

        measure = Jaccard(alphabet=letters, tokenizer=WhitespaceTokenizer())
        almost_equal(measure.sim(short, longer), 0.0)

        measure = Jaccard(alphabet=list(letters))
        almost_equal(measure.sim(short, longer), 0.5)

        measure = Jaccard(tokenizer=CharacterTokenizer())
        almost_equal(measure.sim(short, longer), 0.75)
Exemple #12
0
    def test_token_distance(self):
        """Test abydos.distance._TokenDistance members.

        Drives the token-distance code through Jaccard, SokalMichener and
        AverageLinkage: constructor variants (alphabet, qval, tokenizer,
        normalizer), Counter inputs, and the private set-operation helpers.
        """
        almost_equal = self.assertAlmostEqual
        dna_src, dna_tar = 'ATCAACGAGT', 'AACGATTAG'
        word_src, word_tar = 'synonym', 'antonym'

        # Jaccard built with different alphabet/tokenizer arguments
        measure = Jaccard(intersection_type='soft', alphabet=24)
        almost_equal(measure.sim(dna_src, dna_tar), 0.68)

        measure = Jaccard(qval=1, alphabet='CGAT')
        almost_equal(measure.sim(dna_src, dna_tar), 0.9)

        measure = Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT')
        almost_equal(measure.sim(dna_src, dna_tar), 0.6372795969773299)

        measure = Jaccard(alphabet=None)
        almost_equal(measure.sim(word_src, word_tar), 0.3333333333333333)

        measure = Jaccard(tokenizer=QSkipgrams(qval=3))
        almost_equal(measure.sim(word_src, word_tar), 0.34146341463414637)

        # Counter objects passed to sim() in place of strings
        src_counts = Counter(a=5, b=2, c=10)
        tar_counts = Counter(a=2, c=1, d=3, e=12)
        almost_equal(Jaccard().sim(src_counts, tar_counts), 0.09375)

        # SokalMichener constructor arguments paired with the expected
        # similarity; the empty dict is the default construction.
        normalizer_cases = (
            ({'normalizer': 'proportional'}, 0.984777917351113),
            ({'normalizer': 'log'}, 1.2385752469545532),
            ({'normalizer': 'exp', 'alphabet': 0}, 3.221246147982545e18),
            ({'normalizer': 'laplace'}, 0.98856416772554),
            ({'normalizer': 'inverse'}, 197.95790155440417),
            ({'normalizer': 'complement'}, 1.0204081632653061),
            ({'normalizer': 'base case'}, 0.9897959183673469),
            ({}, 0.9897959183673469),
        )
        for ctor_kwargs, expected in normalizer_cases:
            measure = SokalMichener(**ctor_kwargs)
            almost_equal(measure.sim(word_src, word_tar), expected)

        # Expected token groups for 'synonym' vs. 'antonym'; the Counters
        # below are assembled from these three tuples.
        src_only = ('$s', 'sy', 'yn', 'no')
        tar_only = ('$a', 'an', 'nt', 'to')
        shared = ('on', 'ny', 'ym', 'm#')

        sokal = SokalMichener()
        sokal._tokenize(word_src, word_tar)  # noqa: SF01

        self.assertEqual(
            sokal._get_tokens(),  # noqa: SF01
            (Counter(src_only + shared), Counter(tar_only + shared)),
        )
        self.assertEqual(sokal._src_card(), 8)  # noqa: SF01
        self.assertEqual(sokal._tar_card(), 8)  # noqa: SF01
        self.assertEqual(
            sokal._symmetric_difference(),  # noqa: SF01
            Counter(src_only + tar_only),
        )
        self.assertEqual(sokal._symmetric_difference_card(), 8)  # noqa: SF01
        self.assertEqual(sokal._total_complement_card(), 772)  # noqa: SF01
        self.assertEqual(sokal._population_card(), 788)  # noqa: SF01
        self.assertEqual(
            sokal._union(),  # noqa: SF01
            Counter(src_only + shared + tar_only),
        )
        self.assertEqual(sokal._union_card(), 12)  # noqa: SF01

        # Expected _difference value: +1 for source-only tokens, 0 for
        # shared tokens, -1 for target-only tokens.
        expected_difference = dict.fromkeys(src_only, 1)
        expected_difference.update(dict.fromkeys(shared, 0))
        expected_difference.update(dict.fromkeys(tar_only, -1))
        self.assertEqual(
            sokal._difference(),  # noqa: SF01
            Counter(expected_difference),
        )
        self.assertEqual(sokal._intersection(), Counter(shared))  # noqa: SF01
        self.assertEqual(
            sokal._get_confusion_table(),  # noqa: SF01
            ConfusionTable(tp=4, tn=772, fp=4, fn=4),
        )

        # Alphabet supplied as a Counter
        sokal = SokalMichener(alphabet=Counter(dict.fromkeys('CGAT', 20)), qval=1)
        sokal._tokenize(dna_src, dna_tar)  # noqa: SF01
        self.assertEqual(sokal._total_complement_card(), 61)  # noqa: SF01

        # Linkage intersection with internal_assignment_problem enabled
        linkage_jaccard = Jaccard(
            intersection_type='linkage', internal_assignment_problem=True
        )
        for src, tar, expected in (
            ('abandonned', 'abandoned', 1.0),
            ('abundacies', 'abundances', 0.6296296296296297),
        ):
            almost_equal(linkage_jaccard.sim(src, tar), expected)

        # Some additional constructors needed to complete test coverage
        short, longer = 'abc', 'abcd'
        letters = 'abcdefghijklmnop'

        measure = Jaccard(alphabet=None, qval=range(2, 4))
        almost_equal(measure.sim(short, longer), 0.42857142857142855)

        measure = AverageLinkage(qval=range(2, 4))
        almost_equal(measure.sim(short, longer), 0.22558922558922556)

        measure = Jaccard(alphabet=letters, qval=range(2, 4))
        almost_equal(measure.sim(short, longer), 0.42857142857142855)

        measure = Jaccard(alphabet=letters, tokenizer=WhitespaceTokenizer())
        almost_equal(measure.sim(short, longer), 0.0)

        measure = Jaccard(alphabet=list(letters))
        almost_equal(measure.sim(short, longer), 0.5)

        measure = Jaccard(tokenizer=CharacterTokenizer())
        almost_equal(measure.sim(short, longer), 0.75)
Exemple #13
0
    def test_pairwise_similarity_statistics(self):
        """Test abydos.stats.pairwise_similarity_statistics.

        Each call's result is unpacked as (max, min, mean, std) and the four
        values are compared with known figures; the final section checks that
        invalid arguments raise ValueError.
        """

        def assert_stats(result, exp_max, exp_min, exp_mean, exp_std):
            """Unpack a 4-item result and compare it item by item."""
            got_max, got_min, got_mean, got_std = result
            self.assertAlmostEqual(got_max, exp_max)
            self.assertAlmostEqual(got_min, exp_min)
            self.assertAlmostEqual(got_mean, exp_mean)
            self.assertAlmostEqual(got_std, exp_std)

        # Default arguments
        assert_stats(
            pairwise_similarity_statistics(NIALL, NIALL),
            1.0,
            0.11764705882352944,
            0.4188369879201684,
            0.2265099631340623,
        )
        assert_stats(
            pairwise_similarity_statistics(NIALL, ('Kneal',)),
            0.8333333333333334,
            0.11764705882352944,
            0.30474877450980387,
            0.1842666797571549,
        )

        # Test symmetric
        assert_stats(
            pairwise_similarity_statistics(NIALL, NIALL, symmetric=True),
            1.0,
            0.11764705882352944,
            0.4188369879201679,
            0.22650996313406255,
        )
        assert_stats(
            pairwise_similarity_statistics(NIALL, ('Kneal',), symmetric=True),
            0.8333333333333334,
            0.11764705882352944,
            0.304748774509804,
            0.18426667975715486,
        )

        # Test with splittable strings
        assert_stats(
            pairwise_similarity_statistics(
                'The quick brown fox', 'jumped over the lazy dog.'
            ),
            0.6666666666666667,
            0.0,
            0.08499999999999999,
            0.16132265804901677,
        )
        assert_stats(
            pairwise_similarity_statistics('The', 'jumped'),
            0.16666666666666663,
            0.16666666666666663,
            0.16666666666666663,
            0.0,
        )

        # Test with a set metric
        assert_stats(
            pairwise_similarity_statistics(
                NIALL, NIALL, metric=Jaccard().sim
            ),
            1.0,
            0.0,
            0.23226906681010506,
            0.24747101181262784,
        )
        assert_stats(
            pairwise_similarity_statistics(
                NIALL, NIALL, metric=JaroWinkler().dist
            ),
            1.0,
            0.0,
            0.3352660334967324,
            0.18394505847524578,
        )

        # Test using hmean
        assert_stats(
            pairwise_similarity_statistics(NIALL, NIALL, mean_func=hmean),
            1.0,
            0.11764705882352944,
            0.30718771249150056,
            0.25253182790044676,
        )

        # Test exceptions
        with self.assertRaises(ValueError):
            pairwise_similarity_statistics(NIALL, NIALL, mean_func='mean')
        with self.assertRaises(ValueError):
            pairwise_similarity_statistics(NIALL, NIALL, metric='Levenshtein')
        with self.assertRaises(ValueError):
            pairwise_similarity_statistics(5, NIALL)
        with self.assertRaises(ValueError):
            pairwise_similarity_statistics(NIALL, 5)
 def __init__(self):
     self.stop_words = list(stopwords.words('english'))
     self.lemmatizer = WordNetLemmatizer()
     self.jaccard_sim_obj = Jaccard()
     self.elmo = hub.Module("https://tfhub.dev/google/elmo/2",
                            trainable=True)