Ejemplo n.º 1
0
    def test_tanimoto_coeff(self):
        """Exercise sim_tanimoto on strings, QGrams objects, and qval 0.

        Each group checks three degenerate pairs (both inputs empty, only
        the second empty, only the first empty) with exact equality, then a
        non-trivial pair with approximate equality.
        """
        # (src, tar, expected) for the degenerate inputs.
        edge_cases = (
            ('', '', 1),
            ('nelson', '', 0),
            ('', 'neilsen', 0),
        )

        # No explicit third argument.
        for src, tar, expected in edge_cases:
            self.assertEqual(sim_tanimoto(src, tar), expected)
        self.assertAlmostEqual(sim_tanimoto('nelson', 'neilsen'), 4/11)

        # Third positional argument 2: same expectations as above.
        for src, tar, expected in edge_cases:
            self.assertEqual(sim_tanimoto(src, tar, 2), expected)
        self.assertAlmostEqual(sim_tanimoto('nelson', 'neilsen', 2), 4/11)

        # Supplied q-grams: pre-built QGrams objects instead of strings.
        for src, tar, expected in edge_cases:
            self.assertEqual(sim_tanimoto(QGrams(src), QGrams(tar)), expected)
        self.assertAlmostEqual(
            sim_tanimoto(QGrams('nelson'), QGrams('neilsen')), 4/11)

        # Non-q-gram tests: third positional argument 0.
        for src, tar, expected in (
                ('', '', 1),
                ('the quick', '', 0),
                ('', 'the quick', 0),
        ):
            self.assertEqual(sim_tanimoto(src, tar, 0), expected)
        # NONQ_FROM / NONQ_TO are defined outside this method; the expected
        # value is the same for both argument orders.
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(sim_tanimoto(src, tar, 0), 1/3)
Ejemplo n.º 2
0
    def test_dist_overlap(self):
        """Exercise dist_overlap on strings, QGrams objects, and qval 0.

        Each group checks three degenerate pairs (both inputs empty, only
        the second empty, only the first empty) with exact equality, then a
        non-trivial pair with approximate equality.
        """
        # (src, tar, expected) for the degenerate inputs.
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        )

        # No explicit third argument.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_overlap(src, tar), expected)
        self.assertAlmostEqual(dist_overlap('nelson', 'neilsen'), 3/7)

        # Third positional argument 2: same expectations as above.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_overlap(src, tar, 2), expected)
        self.assertAlmostEqual(dist_overlap('nelson', 'neilsen', 2), 3/7)

        # Supplied q-grams: pre-built QGrams objects instead of strings.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_overlap(QGrams(src), QGrams(tar)), expected)
        self.assertAlmostEqual(
            dist_overlap(QGrams('nelson'), QGrams('neilsen')), 3/7)

        # Non-q-gram tests: third positional argument 0.
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', 1),
                ('', 'the quick', 1),
        ):
            self.assertEqual(dist_overlap(src, tar, 0), expected)
        # NONQ_FROM / NONQ_TO are defined outside this method; the expected
        # value is the same for both argument orders.
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(dist_overlap(src, tar, 0), 3/7)
Ejemplo n.º 3
0
    def test_dist_jaccard(self):
        """Exercise dist_jaccard on strings, QGrams objects, and qval 0.

        Each group checks three degenerate pairs (both inputs empty, only
        the second empty, only the first empty) with exact equality, then a
        non-trivial pair with approximate equality.
        """
        # (src, tar, expected) for the degenerate inputs.
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        )

        # No explicit third argument.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_jaccard(src, tar), expected)
        self.assertAlmostEqual(dist_jaccard('nelson', 'neilsen'), 7/11)

        # Third positional argument 2: same expectations as above.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_jaccard(src, tar, 2), expected)
        self.assertAlmostEqual(dist_jaccard('nelson', 'neilsen', 2), 7/11)

        # Supplied q-grams: pre-built QGrams objects instead of strings.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_jaccard(QGrams(src), QGrams(tar)), expected)
        self.assertAlmostEqual(
            dist_jaccard(QGrams('nelson'), QGrams('neilsen')), 7/11)

        # Non-q-gram tests: third positional argument 0.
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', 1),
                ('', 'the quick', 1),
        ):
            self.assertEqual(dist_jaccard(src, tar, 0), expected)
        # NONQ_FROM / NONQ_TO are defined outside this method; the expected
        # value is the same for both argument orders.
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(dist_jaccard(src, tar, 0), 2/3)
Ejemplo n.º 4
0
    def test_sim_dice(self):
        """Exercise sim_dice on strings, QGrams objects, and qval 0.

        Each group checks three degenerate pairs (both inputs empty, only
        the second empty, only the first empty) with exact equality, then a
        non-trivial pair with approximate equality.
        """
        # (src, tar, expected) for the degenerate inputs.
        edge_cases = (
            ('', '', 1),
            ('nelson', '', 0),
            ('', 'neilsen', 0),
        )

        # No explicit third argument.
        for src, tar, expected in edge_cases:
            self.assertEqual(sim_dice(src, tar), expected)
        self.assertAlmostEqual(sim_dice('nelson', 'neilsen'), 8/15)

        # Third positional argument 2: same expectations as above.
        for src, tar, expected in edge_cases:
            self.assertEqual(sim_dice(src, tar, 2), expected)
        self.assertAlmostEqual(sim_dice('nelson', 'neilsen', 2), 8/15)

        # Supplied q-grams: pre-built QGrams objects instead of strings.
        for src, tar, expected in edge_cases:
            self.assertEqual(sim_dice(QGrams(src), QGrams(tar)), expected)
        self.assertAlmostEqual(
            sim_dice(QGrams('nelson'), QGrams('neilsen')), 8/15)

        # Non-q-gram tests: third positional argument 0.
        for src, tar, expected in (
                ('', '', 1),
                ('the quick', '', 0),
                ('', 'the quick', 0),
        ):
            self.assertEqual(sim_dice(src, tar, 0), expected)
        # NONQ_FROM / NONQ_TO are defined outside this method; the expected
        # value is the same for both argument orders.
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(sim_dice(src, tar, 0), 1/2)
Ejemplo n.º 5
0
    def test_dist_tversky(self):
        """Exercise dist_tversky across its positional arguments.

        Covers an omitted and an explicit third argument, rejection of
        negative alpha/beta values, a q-gram size of 7 (distance 1.0),
        unequal alpha/beta weights, the bias argument, pre-built QGrams
        inputs, and the "non-q-gram" mode (third argument 0).
        """
        # (src, tar, expected) for the degenerate inputs.
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        )

        # No explicit third argument.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_tversky(src, tar), expected)
        self.assertAlmostEqual(dist_tversky('nelson', 'neilsen'), 7/11)

        # Third positional argument 2: same expectations as above.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_tversky(src, tar, 2), expected)
        self.assertAlmostEqual(dist_tversky('nelson', 'neilsen', 2), 7/11)

        # Valid alpha & beta: any negative weight must raise ValueError.
        for alpha, beta in ((-1, -1), (-1, 0), (0, -1)):
            self.assertRaises(ValueError, dist_tversky,
                              'abcd', 'dcba', 2, alpha, beta)

        # Empty QGrams: third argument 7 gives a distance of 1.0.
        self.assertAlmostEqual(dist_tversky('nelson', 'neilsen', 7), 1.0)

        # Unequal alpha & beta: (alpha, beta, expected distance).
        for alpha, beta, expected in (
                (2, 1, 8/11),
                (1, 2, 7/10),
                (2, 2, 10/13),
        ):
            self.assertAlmostEqual(
                dist_tversky('niall', 'neal', 2, alpha, beta), expected)

        # Bias parameter: sixth positional argument fixed at 0.5.
        for alpha, beta, expected in (
                (1, 1, 4/11),
                (2, 1, 2/9),
                (1, 2, 8/15),
                (2, 2, 4/11),
        ):
            self.assertAlmostEqual(
                dist_tversky('niall', 'neal', 2, alpha, beta, 0.5), expected)

        # Supplied q-grams: pre-built QGrams objects instead of strings.
        for src, tar, expected in edge_cases:
            self.assertEqual(dist_tversky(QGrams(src), QGrams(tar)), expected)
        self.assertAlmostEqual(
            dist_tversky(QGrams('nelson'), QGrams('neilsen')), 7/11)

        # Non-q-gram tests: third positional argument 0.
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', 1),
                ('', 'the quick', 1),
        ):
            self.assertEqual(dist_tversky(src, tar, 0), expected)
        # NONQ_FROM / NONQ_TO are defined outside this method; the expected
        # value is the same for both argument orders.
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(dist_tversky(src, tar, 0), 2/3)
Ejemplo n.º 6
0
    def test_minkowski(self):
        """Exercise minkowski on strings, QGrams objects, and extra options.

        Covers an omitted and an explicit third argument, pre-built QGrams
        inputs, the "non-q-gram" mode (third argument 0), the l_0 "norm"
        (fourth argument 0) with and without a fifth argument of True, and
        the alphabet keyword given as a size and as a string.
        """
        # (src, tar, expected) for the degenerate inputs.
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 7),
            ('', 'neilsen', 8),
        )

        # No explicit third argument.
        for src, tar, expected in edge_cases:
            self.assertEqual(minkowski(src, tar), expected)
        self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7)

        # Third positional argument 2: same expectations as above.
        for src, tar, expected in edge_cases:
            self.assertEqual(minkowski(src, tar, 2), expected)
        self.assertAlmostEqual(minkowski('nelson', 'neilsen', 2), 7)

        # Supplied q-grams: pre-built QGrams objects instead of strings.
        for src, tar, expected in edge_cases:
            self.assertEqual(minkowski(QGrams(src), QGrams(tar)), expected)
        self.assertAlmostEqual(
            minkowski(QGrams('nelson'), QGrams('neilsen')), 7)

        # Non-q-gram tests: third positional argument 0.
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', 2),
                ('', 'the quick', 2),
        ):
            self.assertEqual(minkowski(src, tar, 0), expected)
        # NONQ_FROM / NONQ_TO are defined outside this method; the expected
        # value is the same for both argument orders.
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(minkowski(src, tar, 0), 8)

        # l_0 "norm": fourth positional argument 0.
        for src, tar, expected in (
                ('', '', 0),
                ('a', '', 1),
                ('a', 'b', 2),
                ('ab', 'b', 1),
                ('aab', 'b', 1),
        ):
            self.assertEqual(minkowski(src, tar, 1, 0), expected)
        # Same norm with a fifth positional argument of True (presumably
        # the `normalized` flag passed by keyword further down -- confirm
        # against the minkowski signature).
        for src, tar, expected in (
                ('', '', 0),
                ('a', '', 1),
                ('a', 'b', 1),
                ('ab', 'b', 1/2),
                ('aab', 'b', 1/2),
                ('aaab', 'b', 1/2),
                ('aaab', 'ab', 1/2),
        ):
            self.assertEqual(minkowski(src, tar, 1, 0, True), expected)

        # Alphabet given explicitly: un-normalized first, then normalized
        # with the alphabet as a size and as the 26 lowercase letters.
        self.assertEqual(minkowski('ab', 'b', 1, alphabet=26), 1)
        for alphabet in (26, 'abcdefghijklmnopqrstuvwxyz'):
            self.assertEqual(
                minkowski('ab', 'b', 1, normalized=True, alphabet=alphabet),
                1/26)
Ejemplo n.º 7
0
    def test_qgram_intersections(self):
        """Check the `&` intersection of two QGrams objects.

        The first group builds both operands with the default arguments;
        the second passes start_stop='' to both, and the '$N' / 'N#'
        q-grams no longer appear in the expected intersections.
        """
        def common(src, tar, **options):
            """Return the sorted intersection of the two terms' q-grams."""
            return sorted(QGrams(src, **options) & QGrams(tar, **options))

        # Default construction.
        self.assertEqual(common('NELSON', ''), [])
        self.assertEqual(common('', 'NEILSEN'), [])
        self.assertEqual(common('NELSON', 'NEILSEN'),
                         sorted(['$N', 'NE', 'LS', 'N#']))
        self.assertEqual(common('NELSON', 'NOSLEN'), sorted(['$N', 'N#']))
        self.assertEqual(common('NAIL', 'LIAN'), [])

        # Both operands built with start_stop=''.
        self.assertEqual(common('NELSON', 'NEILSEN', start_stop=''),
                         sorted(['NE', 'LS']))
        self.assertEqual(common('NELSON', 'NOSLEN', start_stop=''), [])
        self.assertEqual(common('NAIL', 'LIAN', start_stop=''), [])
Ejemplo n.º 8
0
    def test_qgrams(self):
        """Test abydos.tokenizer.qgram.QGrams.

        Checks the q-grams produced for inputs that yield nothing, for a
        single q-gram size, with start_stop='' given, for several q-gram
        sizes at once, and for several skip values at once.
        """
        # Inputs / q-gram sizes that yield no q-grams at all.
        self.assertEqual(sorted(QGrams('').elements()), [])
        self.assertEqual(sorted(QGrams('a', 2).elements()), [])
        self.assertEqual(sorted(QGrams('NELSON', 0).elements()), [])
        self.assertEqual(sorted(QGrams('NELSON', -1).elements()), [])

        # Second positional argument 3 gives trigrams, padded with two '$'
        # / '#' symbols; 7 gives nothing for this 6-letter term.
        self.assertEqual(
            sorted(QGrams('NELSON', 3).elements()),
            sorted(['$$N', '$NE', 'NEL', 'ELS', 'LSO', 'SON', 'ON#', 'N##']))
        self.assertEqual(sorted(QGrams('NELSON', 7).elements()), [])

        # http://www.sound-ex.com/alternative_qgram.htm
        self.assertEqual(sorted(QGrams('NELSON').elements()),
                         sorted(['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']))
        self.assertEqual(
            sorted(QGrams('NEILSEN').elements()),
            sorted(['$N', 'NE', 'EI', 'IL', 'LS', 'SE', 'EN', 'N#']))
        # With start_stop='' the '$'/'#' padded q-grams are absent.
        self.assertEqual(sorted(QGrams('NELSON', start_stop='').elements()),
                         sorted(['NE', 'EL', 'LS', 'SO', 'ON']))
        self.assertEqual(sorted(QGrams('NEILSEN', start_stop='').elements()),
                         sorted(['NE', 'EI', 'IL', 'LS', 'SE', 'EN']))

        # Several q-gram sizes at once: 1 and 2 in either order, and
        # range(3), whose extra 0 contributes nothing (see the qval 0
        # check above).
        sizes_1_and_2 = sorted([
            '$N', 'E', 'EL', 'L', 'LS', 'N', 'N', 'N#', 'NE', 'O', 'ON',
            'S', 'SO'
        ])
        for qval in ((1, 2), (2, 1), range(3)):
            self.assertEqual(
                sorted(QGrams('NELSON', qval=qval).elements()),
                sizes_1_and_2)
        self.assertEqual(QGrams('NELSON', qval=(1, 2)).count(), 13)

        # Several skip values at once: 0, 1 and 2 in descending order, in
        # ascending order, and as a range. Checking both orders mirrors the
        # qval checks above; asserting one order twice would leave the
        # other order's contents unchecked.
        skips_0_to_2 = sorted([
            '$E', '$L', '$N', 'EL', 'EO', 'ES', 'LN', 'LO', 'LS', 'N', 'N',
            'N#', 'NE', 'NL', 'NS', 'O', 'O#', 'ON', 'S#', 'SN', 'SO'
        ])
        for skip in ((2, 1, 0), (0, 1, 2), range(3)):
            self.assertEqual(
                sorted(QGrams('NELSON', skip=skip).elements()),
                skips_0_to_2)
        self.assertEqual(QGrams('NELSON', skip=(0, 1, 2)).count(), 21)