def test_dist_jaro_winkler(self):
        """Check JaroWinkler.dist for the Jaro and Jaro-Winkler comparators."""
        jaro_dist = self.jaro.dist
        winkler_dist = self.jaro_winkler.dist

        # Empty and identical inputs: both comparators give the same exact
        # distance.
        for src, tar, expected in (
            ('', '', 0),
            ('MARTHA', '', 1),
            ('', 'MARHTA', 1),
            ('MARTHA', 'MARTHA', 0),
        ):
            self.assertEqual(jaro_dist(src, tar), expected)
            self.assertEqual(winkler_dist(src, tar), expected)

        # Reference pairs from
        # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
        for src, tar, jaro_expected, winkler_expected in (
            ('MARTHA', 'MARHTA', 0.05555555, 0.03888888),
            ('DWAYNE', 'DUANE', 0.17777777, 0.16),
            ('DIXON', 'DICKSONX', 0.23333333, 0.18666666),
        ):
            self.assertAlmostEqual(jaro_dist(src, tar), jaro_expected)
            self.assertAlmostEqual(winkler_dist(src, tar), winkler_expected)

        # These settings must make dist() raise ValueError.  The comparator
        # is built outside assertRaises so only the dist() call is checked.
        for bad_settings in (
            {'boost_threshold': 2},
            {'boost_threshold': -1},
            {'scaling_factor': 0.3},
            {'scaling_factor': -1},
        ):
            comparator = JaroWinkler(**bad_settings)
            self.assertRaises(ValueError, comparator.dist, 'abcd', 'dcba')

        self.assertAlmostEqual(winkler_dist('ABCD', 'EFGH'), 1.0)
    def test_sim_jaro_winkler(self):
        """Check JaroWinkler.sim, its long_strings option and the wrapper."""
        jaro_sim = self.jaro.sim
        winkler_sim = self.jaro_winkler.sim

        # Empty and identical inputs: both comparators give the same exact
        # similarity.
        for src, tar, expected in (
            ('', '', 1),
            ('MARTHA', '', 0),
            ('', 'MARHTA', 0),
            ('MARTHA', 'MARTHA', 1),
        ):
            self.assertEqual(jaro_sim(src, tar), expected)
            self.assertEqual(winkler_sim(src, tar), expected)

        # Reference pairs from
        # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
        for src, tar, jaro_expected, winkler_expected in (
            ('MARTHA', 'MARHTA', 0.94444444, 0.96111111),
            ('DWAYNE', 'DUANE', 0.82222222, 0.84),
            ('DIXON', 'DICKSONX', 0.76666666, 0.81333333),
        ):
            self.assertAlmostEqual(jaro_sim(src, tar), jaro_expected)
            self.assertAlmostEqual(winkler_sim(src, tar), winkler_expected)

        # These settings must make sim() raise ValueError.  The comparator
        # is built outside assertRaises so only the sim() call is checked.
        for bad_settings in (
            {'boost_threshold': 2},
            {'boost_threshold': -1},
            {'scaling_factor': 0.3},
            {'scaling_factor': -1},
        ):
            comparator = JaroWinkler(**bad_settings)
            self.assertRaises(ValueError, comparator.sim, 'abcd', 'dcba')

        self.assertAlmostEqual(winkler_sim('ABCD', 'EFGH'), 0.0)

        # long_strings = True (applies only to Jaro-Winkler, not Jaro)
        long_default = JaroWinkler(long_strings=True)
        self.assertEqual(
            long_default.sim('ABCD', 'EFGH'), jaro_sim('ABCD', 'EFGH')
        )
        long_jaro = JaroWinkler(mode='jaro', long_strings=True)
        self.assertEqual(
            long_jaro.sim('DIXON', 'DICKSONX'), jaro_sim('DIXON', 'DICKSONX')
        )
        for src, tar, expected in (
            ('DIXON', 'DICKSONX', 0.83030303),
            ('MARTHA', 'MARHTA', 0.97083333),
        ):
            long_winkler = JaroWinkler(mode='winkler', long_strings=True)
            self.assertAlmostEqual(long_winkler.sim(src, tar), expected)

        # The sim_jaro_winkler wrapper is expected to give the same scores
        # as the class for each mode.
        for mode_name, expected in (
            ('jaro', 0.76666666),
            ('winkler', 0.81333333),
        ):
            self.assertAlmostEqual(
                sim_jaro_winkler('DIXON', 'DICKSONX', mode=mode_name),
                expected,
            )
Example #3
0
    def test_soft_jaccard_sim(self):
        """Check Jaccard.sim with the soft intersection type."""
        soft_sim = self.cmp_j_soft.sim

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(soft_sim(src, tar), expected)

        # Remaining pairs: compared approximately.
        for src, tar, expected in (
            ('abcd', 'efgh', 0.11111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        ):
            self.assertAlmostEqual(soft_sim(src, tar), expected)

        # Soft intersection driven by an explicit JaroWinkler metric.
        jw_soft = Jaccard(intersection_type='soft', metric=JaroWinkler())
        self.assertAlmostEqual(
            jw_soft.sim('synonym', 'antonym'), 0.777777777777
        )
Example #4
0
    def test_linkage_jaccard_sim(self):
        """Check Jaccard.sim with the group-linkage intersection type."""
        linkage_sim = self.cmp_j_linkage.sim

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(linkage_sim(src, tar), expected)

        # Remaining pairs: compared approximately.
        for src, tar, expected in (
            ('abcd', 'efgh', 0.1111111111111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        ):
            self.assertAlmostEqual(linkage_sim(src, tar), expected)

        # Group linkage with an explicit JaroWinkler metric and threshold.
        jw_metric = JaroWinkler()
        jw_linkage = Jaccard(
            intersection_type='linkage', metric=jw_metric, threshold=0.2
        )
        self.assertAlmostEqual(jw_linkage.sim('synonym', 'antonym'), 0.6)
Example #5
0
    def test_soft_jaccard_sim(self):
        """Check Jaccard.sim with the soft intersection type."""
        soft_sim = self.cmp_j_soft.sim

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
        ):
            self.assertEqual(soft_sim(src, tar), expected)

        # Remaining pairs: compared approximately.
        for src, tar, expected in (
            ('abcd', 'efgh', 0.11111111),
            ('Nigel', 'Niall', 0.5),
            ('Niall', 'Nigel', 0.5),
            ('Colin', 'Coiln', 0.6),
            ('Coiln', 'Colin', 0.6),
            ('ATCAACGAGT', 'AACGATTAG', 0.68),
        ):
            self.assertAlmostEqual(soft_sim(src, tar), expected)

        # Whitespace-tokenized phrases: the same score is expected in both
        # argument orders.
        for src, tar in (
            ('junior system analyst', 'systems analyst'),
            ('systems analyst', 'junior system analyst'),
        ):
            word_soft = Jaccard(
                intersection_type='soft', tokenizer=WhitespaceTokenizer()
            )
            self.assertAlmostEqual(
                word_soft.sim(src, tar), 0.6190476190476191
            )

        # Adding the JaroWinkler metric to the whitespace tokenizer is
        # expected to raise TypeError.  Construction and the sim() call are
        # both inside the checked callable.
        def jw_whitespace_sim():
            comparator = Jaccard(
                intersection_type='soft',
                metric=JaroWinkler(),
                tokenizer=WhitespaceTokenizer(),
            )
            return comparator.sim('junior system analyst', 'systems analyst')

        self.assertRaises(TypeError, jw_whitespace_sim)
Example #6
0
    def __init__(self, model='latin'):
        """Create the feature extractors and load the pre-trained models.

        Args:
            model: Name of the model to load.  It selects the directory
                ``models/<model>`` next to this file; when that directory
                does not exist it is created by unpacking
                ``models/<model>.tar.gz``.
        """
        self.model = model
        self.impH = input_helpers.InputHelper()
        self.ST = syllable_tokenizer.SyllableTokenizer()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # String Distance algorithms; algo_names below lists one label per
        # entry, in the same order.
        self.algos = [
            IterativeSubString(),
            BISIM(),
            DiscountedLevenshtein(),
            Prefix(),
            LCSstr(),
            MLIPNS(),
            Strcmp95(),
            MRA(),
            Editex(),
            SAPS(),
            FlexMetric(),
            JaroWinkler(mode='Jaro'),
            HigueraMico(),
            Sift4(),
            Eudex(),
            ALINE(),
            Covington(),
            PhoneticEditDistance()
        ]
        self.algo_names = [
            'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
            'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
            'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
            'covington', 'phoneticeditdistance'
        ]

        # Extract the model tarball into its directory if it does not exist.
        models_root = os.path.join(os.path.dirname(__file__), "models")
        model_dir = os.path.join(models_root, self.model)
        if not os.path.exists(model_dir):
            archive_path = os.path.join(models_root, self.model + ".tar.gz")
            # Open the archive before creating the directory: a missing or
            # unreadable tarball must not leave an empty model_dir behind,
            # which would make later runs skip extraction.  The context
            # manager closes the archive even if extraction fails.
            with tarfile.open(archive_path, "r:gz") as tar:
                os.makedirs(model_dir)
                tar.extractall(model_dir)

        # String Distance Pipeline (Level 0/Base Model)
        # NOTE: joblib.load unpickles the file, so the model files must come
        # from a trusted source.
        self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

        # Character Embedding Network (Level 0/Base Model)
        self.vocab = preprocess.VocabularyProcessor(
            max_document_length=15,
            min_frequency=0).restore(os.path.join(model_dir, 'vocab'))

        siamese_model = os.path.join(model_dir, 'siamese')

        graph = tf.Graph()
        with graph.as_default() as graph:
            self.sess = tf.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    '{}.meta'.format(siamese_model))
                self.sess.run(tf.global_variables_initializer())
                saver.restore(self.sess, siamese_model)
            # Get the placeholders and output tensors from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]

            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]

        # Logreg (Level 1/Meta Model)
        self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}

        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
Example #7
0
# Featurizer: module-level encoder and comparator instances.
pshp_soundex_first = PSHPSoundexFirst()
pe = Ainsworth()
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

# Comparators in a fixed order; algo_names below has one label per entry,
# in the same order.
algos = [
    iss,
    bisim,
    dlev,
    prefix,
    lcs,
    mlipns,
    strcmp95,
    mra,
    editex,
    saps,
    flexmetric,
    jaro,
    higuera_mico,
    sift4,
    eudex,
    aline,
    phonetic_edit,
]

algo_names = [
    'iterativesubstring',
    'bisim',
    'discountedlevenshtein',
    'prefix',
    'lcsstr',
    'mlipns',
    'strcmp95',
    'mra',
    'editex',
    'saps',
    'flexmetric',
    'jaro',
    'higueramico',
    'sift4',
    'eudex',
    'aline',
    'phoneticeditdistance',
]

def sum_ipa(name_a, name_b):
    """Encode both names with ``pe`` and convert each result to features.

    NOTE(review): only the two statements below are visible in this excerpt.
    Neither ``feat1`` nor ``feat2`` is used or returned here, so the rest of
    the function is presumably missing -- confirm against the full source.
    """
    # pe is the module-level Ainsworth() encoder; ipa_to_features is defined
    # outside this excerpt.
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
class CompleteLinkageTestCases(unittest.TestCase):
    """Exercise the CompleteLinkage comparator.

    abydos.distance.CompleteLinkage
    """

    cmp = CompleteLinkage()
    cmp_q4 = CompleteLinkage(tokenizer=QGrams(qval=4, start_stop=''))
    cmp_q4_jw = CompleteLinkage(
        tokenizer=QGrams(qval=4, start_stop=''), metric=JaroWinkler()
    )

    def test_complete_linkage_dist(self):
        """Check abydos.distance.CompleteLinkage.dist."""
        dist = self.cmp.dist

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 1.0),
        ):
            self.assertEqual(dist(src, tar), expected)

        # Name and sequence pairs: compared approximately.
        for src, tar in (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        ):
            self.assertAlmostEqual(dist(src, tar), 1.0)

        # 4-gram tokenization, without and with the JaroWinkler metric.
        self.assertEqual(self.cmp_q4.dist('AAAT', 'AATT'), 0.25)
        self.assertAlmostEqual(
            self.cmp_q4_jw.dist('AAAT', 'AATT'), 0.133333333333
        )

    def test_complete_linkage_sim(self):
        """Check abydos.distance.CompleteLinkage.sim."""
        sim = self.cmp.sim

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 0.0),
        ):
            self.assertEqual(sim(src, tar), expected)

        # Name and sequence pairs: compared approximately.
        for src, tar in (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        ):
            self.assertAlmostEqual(sim(src, tar), 0.0)

    def test_complete_linkage_dist_abs(self):
        """Check abydos.distance.CompleteLinkage.dist_abs."""
        dist_abs = self.cmp.dist_abs

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', float('-inf')),
            ('a', '', float('-inf')),
            ('', 'a', float('-inf')),
            ('abc', '', float('-inf')),
            ('', 'abc', float('-inf')),
            ('abc', 'abc', 2),
            ('abcd', 'efgh', 2),
        ):
            self.assertEqual(dist_abs(src, tar), expected)

        # Name and sequence pairs: compared approximately.
        for src, tar in (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        ):
            self.assertAlmostEqual(dist_abs(src, tar), 2)
class JaroWinklerTestCases(unittest.TestCase):
    """Exercise the Jaro and Jaro-Winkler modes of JaroWinkler.

    abydos.distance.JaroWinkler
    """

    jaro = JaroWinkler(mode='jaro')
    jaro_winkler = JaroWinkler(mode='winkler')

    def test_sim_jaro_winkler(self):
        """Check JaroWinkler.sim and its long_strings option."""
        jaro_sim = self.jaro.sim
        winkler_sim = self.jaro_winkler.sim

        # Empty and identical inputs: both comparators give the same exact
        # similarity.
        for src, tar, expected in (
            ('', '', 1),
            ('MARTHA', '', 0),
            ('', 'MARHTA', 0),
            ('MARTHA', 'MARTHA', 1),
        ):
            self.assertEqual(jaro_sim(src, tar), expected)
            self.assertEqual(winkler_sim(src, tar), expected)

        # Reference pairs from
        # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
        for src, tar, jaro_expected, winkler_expected in (
            ('MARTHA', 'MARHTA', 0.94444444, 0.96111111),
            ('DWAYNE', 'DUANE', 0.82222222, 0.84),
            ('DIXON', 'DICKSONX', 0.76666666, 0.81333333),
        ):
            self.assertAlmostEqual(jaro_sim(src, tar), jaro_expected)
            self.assertAlmostEqual(winkler_sim(src, tar), winkler_expected)

        # These settings must make sim() raise ValueError.  The comparator
        # is built outside assertRaises so only the sim() call is checked.
        for bad_settings in (
            {'boost_threshold': 2},
            {'boost_threshold': -1},
            {'scaling_factor': 0.3},
            {'scaling_factor': -1},
        ):
            comparator = JaroWinkler(**bad_settings)
            self.assertRaises(ValueError, comparator.sim, 'abcd', 'dcba')

        self.assertAlmostEqual(winkler_sim('ABCD', 'EFGH'), 0.0)

        # long_strings = True (applies only to Jaro-Winkler, not Jaro)
        long_default = JaroWinkler(long_strings=True)
        self.assertEqual(
            long_default.sim('ABCD', 'EFGH'), jaro_sim('ABCD', 'EFGH')
        )
        long_jaro = JaroWinkler(mode='jaro', long_strings=True)
        self.assertEqual(
            long_jaro.sim('DIXON', 'DICKSONX'), jaro_sim('DIXON', 'DICKSONX')
        )
        for src, tar, expected in (
            ('DIXON', 'DICKSONX', 0.83030303),
            ('MARTHA', 'MARHTA', 0.97083333),
        ):
            long_winkler = JaroWinkler(mode='winkler', long_strings=True)
            self.assertAlmostEqual(long_winkler.sim(src, tar), expected)

    def test_dist_jaro_winkler(self):
        """Check JaroWinkler.dist for both comparators."""
        jaro_dist = self.jaro.dist
        winkler_dist = self.jaro_winkler.dist

        # Empty and identical inputs: both comparators give the same exact
        # distance.
        for src, tar, expected in (
            ('', '', 0),
            ('MARTHA', '', 1),
            ('', 'MARHTA', 1),
            ('MARTHA', 'MARTHA', 0),
        ):
            self.assertEqual(jaro_dist(src, tar), expected)
            self.assertEqual(winkler_dist(src, tar), expected)

        # Reference pairs from
        # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
        for src, tar, jaro_expected, winkler_expected in (
            ('MARTHA', 'MARHTA', 0.05555555, 0.03888888),
            ('DWAYNE', 'DUANE', 0.17777777, 0.16),
            ('DIXON', 'DICKSONX', 0.23333333, 0.18666666),
        ):
            self.assertAlmostEqual(jaro_dist(src, tar), jaro_expected)
            self.assertAlmostEqual(winkler_dist(src, tar), winkler_expected)

        # These settings must make dist() raise ValueError.  The comparator
        # is built outside assertRaises so only the dist() call is checked.
        for bad_settings in (
            {'boost_threshold': 2},
            {'boost_threshold': -1},
            {'scaling_factor': 0.3},
            {'scaling_factor': -1},
        ):
            comparator = JaroWinkler(**bad_settings)
            self.assertRaises(ValueError, comparator.dist, 'abcd', 'dcba')

        self.assertAlmostEqual(winkler_dist('ABCD', 'EFGH'), 1.0)
Example #10
0
    def __init__(self, model='latin', prefilter=True, allow_alt_surname=True,
                 allow_initials=True, allow_missing_components=True):
        """Store the matching options and load the pre-trained models.

        Args:
            model: Model name, stored on the instance before
                validate_parameters() is called.
            prefilter: When true, a letter-to-code ``refined_soundex`` table
                is also created on the instance.
            allow_alt_surname: Stored as given.
            allow_initials: Stored as given.
            allow_missing_components: Stored as given.
        """
        # user-provided parameters
        self.model = model
        self.allow_alt_surname = allow_alt_surname
        self.allow_initials = allow_initials
        self.allow_missing_components = allow_missing_components
        self.prefilter = prefilter
        if self.prefilter:
            # Letters that share a code are grouped; the code is the
            # 1-based position of the group.
            letter_groups = (
                'bp', 'fv', 'cks', 'gj', 'qxz', 'dt', 'l', 'mn', 'r',
            )
            self.refined_soundex = {
                letter: code
                for code, letters in enumerate(letter_groups, 1)
                for letter in letters
            }

        # verify user-supplied class arguments; the call returns the model
        # directory used below
        model_dir = self.validate_parameters()

        self.impH = input_helpers.InputHelper()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # Soundex Lastname Algorithm
        self.pshp_soundex_last = PSHPSoundexLast()

        # String Distance algorithms; algo_names below lists one label per
        # entry, in the same order.
        self.algos = [
            IterativeSubString(),
            BISIM(),
            DiscountedLevenshtein(),
            Prefix(),
            LCSstr(),
            MLIPNS(),
            Strcmp95(),
            MRA(),
            Editex(),
            SAPS(),
            FlexMetric(),
            JaroWinkler(mode='Jaro'),
            HigueraMico(),
            Sift4(),
            Eudex(),
            ALINE(),
            CovingtonGuard(),
            PhoneticEditDistance(),
        ]
        self.algo_names = [
            'iterativesubstring',
            'bisim',
            'discountedlevenshtein',
            'prefix',
            'lcsstr',
            'mlipns',
            'strcmp95',
            'mra',
            'editex',
            'saps',
            'flexmetric',
            'jaro',
            'higueramico',
            'sift4',
            'eudex',
            'aline',
            'covington',
            'phoneticeditdistance',
        ]

        # String Distance Pipeline (Level 0/Base Model)
        base_path = os.path.join(model_dir, 'base.pkl')
        self.baseModel = joblib.load(base_path)

        # Character Embedding Network (Level 0/Base Model)
        vocab_path = os.path.join(model_dir, 'vocab')
        self.vocab = preprocess.VocabularyProcessor(
            max_document_length=15, min_frequency=0
        ).restore(vocab_path)

        siamese_model = os.path.join(model_dir, 'siamese')

        # Use the 1.x API directly when the version string starts with '1',
        # and the compat.v1 namespace otherwise.
        tf_v1 = tf if tf.__version__[0] == '1' else tf.compat.v1

        # start tensorflow session
        graph = tf.Graph()
        with graph.as_default() as graph:
            self.sess = tf_v1.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf_v1.train.import_meta_graph(
                    '{}.meta'.format(siamese_model)
                )
                self.sess.run(tf_v1.global_variables_initializer())
                saver.restore(self.sess, siamese_model)

            def first_output(op_name):
                # First output tensor of the named graph operation.
                return graph.get_operation_by_name(op_name).outputs[0]

            # Get the placeholders and output tensors from the graph by name
            self.input_x1 = first_output('input_x1')
            self.input_x2 = first_output('input_x2')

            self.dropout_keep_prob = first_output('dropout_keep_prob')
            self.prediction = first_output('output/distance')
            self.sim = first_output('accuracy/temp_sim')

        # Logreg (Level 1/Meta Model)
        meta_path = os.path.join(model_dir, 'meta.pkl')
        self.metaModel = joblib.load(meta_path)

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}
        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
        # user scores (mapping dict from name pair tuple to similarity)
        self.user_scores = {}
Example #11
0
class SingleLinkageTestCases(unittest.TestCase):
    """Exercise the SingleLinkage comparator.

    abydos.distance.SingleLinkage
    """

    cmp = SingleLinkage()
    cmp_jw = SingleLinkage(metric=JaroWinkler())

    def test_single_linkage_dist(self):
        """Check abydos.distance.SingleLinkage.dist."""
        dist = self.cmp.dist

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 0.5),
        ):
            self.assertEqual(dist(src, tar), expected)

        # Name and sequence pairs: compared approximately.
        for src, tar in (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        ):
            self.assertAlmostEqual(dist(src, tar), 0.0)

    def test_single_linkage_sim(self):
        """Check abydos.distance.SingleLinkage.sim."""
        sim = self.cmp.sim

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.5),
        ):
            self.assertEqual(sim(src, tar), expected)

        # Name and sequence pairs: compared approximately.
        for src, tar in (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        ):
            self.assertAlmostEqual(sim(src, tar), 1.0)

    def test_single_linkage_dist_abs(self):
        """Check abydos.distance.SingleLinkage.dist_abs."""
        dist_abs = self.cmp.dist_abs

        # Base cases: compared for exact equality.
        for src, tar, expected in (
            ('', '', float('inf')),
            ('a', '', float('inf')),
            ('', 'a', float('inf')),
            ('abc', '', float('inf')),
            ('', 'abc', float('inf')),
            ('abc', 'abc', 0),
            ('abcd', 'efgh', 1),
        ):
            self.assertEqual(dist_abs(src, tar), expected)

        # Name and sequence pairs: compared approximately.
        for src, tar in (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        ):
            self.assertAlmostEqual(dist_abs(src, tar), 0)

        # Same comparator backed by the JaroWinkler metric.
        self.assertAlmostEqual(self.cmp_jw.dist_abs('abcd', 'dj'), 1 / 3)
Example #12
0
    def test_pairwise_similarity_statistics(self):
        """Check the 4-tuples from pairwise_similarity_statistics.

        Each result is unpacked as (max, min, mean, std) and compared
        approximately with the expected values.
        """

        def expect_stats(expected, *args, **kwargs):
            # Unpack into exactly four names so that a result of any other
            # length fails, as a direct tuple assignment would.
            pw_max, pw_min, pw_mean, pw_std = pairwise_similarity_statistics(
                *args, **kwargs
            )
            observed = (pw_max, pw_min, pw_mean, pw_std)
            for got, wanted in zip(observed, expected):
                self.assertAlmostEqual(got, wanted)

        # Default settings
        expect_stats(
            (1.0, 0.11764705882352944,
             0.4188369879201684, 0.2265099631340623),
            NIALL, NIALL,
        )
        expect_stats(
            (0.8333333333333334, 0.11764705882352944,
             0.30474877450980387, 0.1842666797571549),
            NIALL, ('Kneal', ),
        )

        # Test symmetric
        expect_stats(
            (1.0, 0.11764705882352944,
             0.4188369879201679, 0.22650996313406255),
            NIALL, NIALL, symmetric=True,
        )
        expect_stats(
            (0.8333333333333334, 0.11764705882352944,
             0.304748774509804, 0.18426667975715486),
            NIALL, ('Kneal', ), symmetric=True,
        )

        # Test with splittable strings
        expect_stats(
            (0.6666666666666667, 0.0,
             0.08499999999999999, 0.16132265804901677),
            'The quick brown fox', 'jumped over the lazy dog.',
        )
        expect_stats(
            (0.16666666666666663, 0.16666666666666663,
             0.16666666666666663, 0.0),
            'The', 'jumped',
        )

        # Test with a set metric
        expect_stats(
            (1.0, 0.0, 0.23226906681010506, 0.24747101181262784),
            NIALL, NIALL, metric=Jaccard().sim,
        )
        expect_stats(
            (1.0, 0.0, 0.3352660334967324, 0.18394505847524578),
            NIALL, NIALL, metric=JaroWinkler().dist,
        )

        # Test using hmean
        expect_stats(
            (1.0, 0.11764705882352944,
             0.30718771249150056, 0.25253182790044676),
            NIALL, NIALL, mean_func=hmean,
        )

        # Test exceptions: invalid keyword values, then invalid collections
        for bad_kwargs in ({'mean_func': 'mean'}, {'metric': 'Levenshtein'}):
            self.assertRaises(
                ValueError,
                pairwise_similarity_statistics,
                NIALL,
                NIALL,
                **bad_kwargs
            )
        for bad_args in ((5, NIALL), (NIALL, 5)):
            self.assertRaises(
                ValueError, pairwise_similarity_statistics, *bad_args
            )