Esempio n. 1
0
    def test_sift4_dist_abs(self):
        """Test abydos.distance.Sift4.dist_abs.

        The expectations are stored as (src, tar, expected) rows and checked
        in order, so the first mismatch aborts the test.
        """
        measure = self.cmp.dist_abs

        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        default_rows = (
            # one or both operands empty
            ('', '', 0),
            ('a', '', 1),
            ('', 'a', 1),
            ('abc', '', 3),
            ('', 'abc', 3),
            # identical operands
            ('a', 'a', 0),
            ('abc', 'abc', 0),
            # second operand is longer
            ('a', 'ab', 1),
            ('ac', 'abc', 1),
            ('abcdefg', 'xabxcdxxefxgx', 7),
            # first operand is longer
            ('ab', 'b', 1),
            ('ab', 'a', 1),
            ('abc', 'ac', 1),
            ('xabxcdxxefxgx', 'abcdefg', 7),
            # equal lengths, differing characters
            ('a', 'b', 1),
            ('ab', 'ac', 1),
            ('ac', 'bc', 1),
            ('abc', 'axc', 1),
            ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6),
            # word pairs
            ('example', 'samples', 2),
            ('sturgeon', 'urgently', 3),
            ('levenshtein', 'frankenstein', 6),
            ('distance', 'difference', 5),
        )
        for src, tar, expected in default_rows:
            self.assertEqual(measure(src, tar), expected)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        first = 'This is the first string'
        another = 'And this is another string'
        self.assertEqual(Sift4(5).dist_abs(first, another), 11)

        lorem = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
        garbled = (
            'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
            ' elit.'
        )
        self.assertEqual(Sift4(10).dist_abs(lorem, garbled), 12)

        # cases with max_distance: every pair is expected to yield 5
        capped_pairs = (
            ('example', 'samples'),
            ('sturgeon', 'urgently'),
            ('levenshtein', 'frankenstein'),
            ('distance', 'difference'),
        )
        for src, tar in capped_pairs:
            self.assertEqual(self.cmp55.dist_abs(src, tar), 5)

        # Test wrapper
        self.assertEqual(sift4_common('xabxcdxxefxgx', 'abcdefg'), 7)
Esempio n. 2
0
class Sift4TestCases(unittest.TestCase):
    """Test Sift4 functions.

    abydos.distance.Sift4

    Each test keeps its expectations in a table of rows and walks the table
    in order, so the first mismatch aborts that test.
    """

    # Shared fixtures: a default instance, and one built as Sift4(5, 5) that
    # the tests below use for their "max_distance" cases.
    cmp = Sift4()
    cmp55 = Sift4(5, 5)

    def test_sift4_dist_abs(self):
        """Test abydos.distance.Sift4.dist_abs."""
        measure = self.cmp.dist_abs

        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        default_rows = (
            # one or both operands empty
            ('', '', 0),
            ('a', '', 1),
            ('', 'a', 1),
            ('abc', '', 3),
            ('', 'abc', 3),
            # identical operands
            ('a', 'a', 0),
            ('abc', 'abc', 0),
            # second operand is longer
            ('a', 'ab', 1),
            ('ac', 'abc', 1),
            ('abcdefg', 'xabxcdxxefxgx', 7),
            # first operand is longer
            ('ab', 'b', 1),
            ('ab', 'a', 1),
            ('abc', 'ac', 1),
            ('xabxcdxxefxgx', 'abcdefg', 7),
            # equal lengths, differing characters
            ('a', 'b', 1),
            ('ab', 'ac', 1),
            ('ac', 'bc', 1),
            ('abc', 'axc', 1),
            ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6),
            # word pairs
            ('example', 'samples', 2),
            ('sturgeon', 'urgently', 3),
            ('levenshtein', 'frankenstein', 6),
            ('distance', 'difference', 5),
        )
        for src, tar, expected in default_rows:
            self.assertEqual(measure(src, tar), expected)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        first = 'This is the first string'
        another = 'And this is another string'
        self.assertEqual(Sift4(5).dist_abs(first, another), 11)

        lorem = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
        garbled = (
            'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
            ' elit.'
        )
        self.assertEqual(Sift4(10).dist_abs(lorem, garbled), 12)

        # cases with max_distance: every pair is expected to yield 5
        capped_pairs = (
            ('example', 'samples'),
            ('sturgeon', 'urgently'),
            ('levenshtein', 'frankenstein'),
            ('distance', 'difference'),
        )
        for src, tar in capped_pairs:
            self.assertEqual(self.cmp55.dist_abs(src, tar), 5)

        # Test wrapper
        self.assertEqual(sift4_common('xabxcdxxefxgx', 'abcdefg'), 7)

    def test_sift4_dist(self):
        """Test abydos.distance.Sift4.dist."""
        # Each row names the assertion to apply, because some expectations
        # are compared exactly and others only approximately.
        exact = self.assertEqual
        close = self.assertAlmostEqual
        measure = self.cmp.dist

        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        default_rows = (
            (exact, '', '', 0),
            (exact, 'a', '', 1),
            (exact, '', 'a', 1),
            (exact, 'abc', '', 1),
            (exact, '', 'abc', 1),
            (exact, 'a', 'a', 0),
            (exact, 'abc', 'abc', 0),
            (exact, 'a', 'ab', 0.5),
            (exact, 'ac', 'abc', 1 / 3),
            (close, 'abcdefg', 'xabxcdxxefxgx', 0.538461538),
            (exact, 'ab', 'b', 0.5),
            (exact, 'ab', 'a', 0.5),
            (exact, 'abc', 'ac', 1 / 3),
            (close, 'xabxcdxxefxgx', 'abcdefg', 0.538461538),
            (exact, 'a', 'b', 1),
            (exact, 'ab', 'ac', 0.5),
            (exact, 'ac', 'bc', 0.5),
            (exact, 'abc', 'axc', 1 / 3),
            (close, 'xabxcdxxefxgx', '1ab2cd34ef5g6', 0.461538461),
            (close, 'example', 'samples', 0.285714285),
            (close, 'sturgeon', 'urgently', 0.375),
            (close, 'levenshtein', 'frankenstein', 0.5),
            (close, 'distance', 'difference', 0.5),
        )
        for check, src, tar, expected in default_rows:
            check(measure(src, tar), expected)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        first = 'This is the first string'
        another = 'And this is another string'
        close(Sift4(5).dist(first, another), 0.423076923)

        lorem = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
        garbled = (
            'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
            ' elit.'
        )
        close(Sift4(10).dist(lorem, garbled), 0.193548387)

        # cases with max_distance
        capped_rows = (
            ('example', 'samples', 0.714285714),
            ('sturgeon', 'urgently', 0.625),
            ('levenshtein', 'frankenstein', 0.416666666),
            ('distance', 'difference', 0.5),
        )
        for src, tar, expected in capped_rows:
            close(self.cmp55.dist(src, tar), expected)

        # Test wrapper
        close(dist_sift4('xabxcdxxefxgx', 'abcdefg'), 0.538461538)

    def test_sift4_sim(self):
        """Test abydos.distance.Sift4.sim."""
        # Each row names the assertion to apply, because some expectations
        # are compared exactly and others only approximately.
        exact = self.assertEqual
        close = self.assertAlmostEqual
        measure = self.cmp.sim

        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        default_rows = (
            (exact, '', '', 1),
            (exact, 'a', '', 0),
            (exact, '', 'a', 0),
            (exact, 'abc', '', 0),
            (exact, '', 'abc', 0),
            (exact, 'a', 'a', 1),
            (exact, 'abc', 'abc', 1),
            (exact, 'a', 'ab', 0.5),
            (close, 'ac', 'abc', 2 / 3),
            (close, 'abcdefg', 'xabxcdxxefxgx', 0.461538461),
            (exact, 'ab', 'b', 0.5),
            (exact, 'ab', 'a', 0.5),
            (close, 'abc', 'ac', 2 / 3),
            (close, 'xabxcdxxefxgx', 'abcdefg', 0.461538461),
            (exact, 'a', 'b', 0),
            (exact, 'ab', 'ac', 0.5),
            (exact, 'ac', 'bc', 0.5),
            (close, 'abc', 'axc', 2 / 3),
            (close, 'xabxcdxxefxgx', '1ab2cd34ef5g6', 0.538461538),
            (close, 'example', 'samples', 0.714285714),
            (close, 'sturgeon', 'urgently', 0.625),
            (close, 'levenshtein', 'frankenstein', 0.5),
            (close, 'distance', 'difference', 0.5),
        )
        for check, src, tar, expected in default_rows:
            check(measure(src, tar), expected)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        first = 'This is the first string'
        another = 'And this is another string'
        close(Sift4(5).sim(first, another), 0.576923077)

        lorem = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
        garbled = (
            'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
            ' elit.'
        )
        close(Sift4(10).sim(lorem, garbled), 0.806451613)

        # cases with max_distance
        capped_rows = (
            ('example', 'samples', 0.285714286),
            ('sturgeon', 'urgently', 0.375),
            ('levenshtein', 'frankenstein', 0.583333333),
            ('distance', 'difference', 0.5),
        )
        for src, tar, expected in capped_rows:
            close(self.cmp55.sim(src, tar), expected)

        # Test wrapper
        close(sim_sift4('xabxcdxxefxgx', 'abcdefg'), 0.461538461)
Esempio n. 3
0
# Instances that are created here but are not part of the `algos` list.
pshp_soundex_first = PSHPSoundexFirst()
pe = Ainsworth()

# One shared instance per string-distance measure.
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

# Every label is written next to its measure and the pairs are then unzipped
# into two lists, so algo_names[i] always labels algos[i].
algo_names, algos = (
    list(column)
    for column in zip(
        ('iterativesubstring', iss),
        ('bisim', bisim),
        ('discountedlevenshtein', dlev),
        ('prefix', prefix),
        ('lcsstr', lcs),
        ('mlipns', mlipns),
        ('strcmp95', strcmp95),
        ('mra', mra),
        ('editex', editex),
        ('saps', saps),
        ('flexmetric', flexmetric),
        ('jaro', jaro),
        ('higueramico', higuera_mico),
        ('sift4', sift4),
        ('eudex', eudex),
        ('aline', aline),
        ('phoneticeditdistance', phonetic_edit),
    )
)

def sum_ipa(name_a, name_b):
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    if len(feat1) <= 1:
        score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2))/1
Esempio n. 4
0
    def __init__(self, model='latin'):
        """Load every component needed to score name pairs.

        On first use the archive ``models/<model>.tar.gz`` located next to
        this file is unpacked into ``models/<model>``. The base model, the
        vocabulary, the siamese network checkpoint and the meta model are
        then loaded from that directory.

        Args:
            model: Name of the model directory / archive under ``models``.
        """
        self.model = model
        self.impH = input_helpers.InputHelper()
        self.ST = syllable_tokenizer.SyllableTokenizer()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # String Distance algorithms; algo_names[i] labels algos[i], so the
        # two lists must be kept in the same order.
        self.algos = [
            IterativeSubString(),
            BISIM(),
            DiscountedLevenshtein(),
            Prefix(),
            LCSstr(),
            MLIPNS(),
            Strcmp95(),
            MRA(),
            Editex(),
            SAPS(),
            FlexMetric(),
            JaroWinkler(mode='Jaro'),
            HigueraMico(),
            Sift4(),
            Eudex(),
            ALINE(),
            Covington(),
            PhoneticEditDistance()
        ]
        self.algo_names = [
            'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
            'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
            'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
            'covington', 'phoneticeditdistance'
        ]

        # extract model tarball into directory if it doesn't exist
        model_dir = os.path.join(os.path.dirname(__file__), "models",
                                 self.model)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
            archive_path = os.path.join(os.path.dirname(__file__), "models",
                                        self.model + ".tar.gz")
            # The context manager closes the archive even when extraction
            # raises, so the file handle is never leaked.
            with tarfile.open(archive_path, "r:gz") as tar:
                tar.extractall(model_dir)

        # String Distance Pipeline (Level 0/Base Model)
        self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

        # Character Embedding Network (Level 0/Base Model)
        self.vocab = preprocess.VocabularyProcessor(
            max_document_length=15,
            min_frequency=0).restore(os.path.join(model_dir, 'vocab'))

        siamese_model = os.path.join(model_dir, 'siamese')

        graph = tf.Graph()
        with graph.as_default() as graph:
            self.sess = tf.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    '{}.meta'.format(siamese_model))
                self.sess.run(tf.global_variables_initializer())
                saver.restore(self.sess, siamese_model)
            # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]

            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]

        # Logreg (Level 1/Meta Model)
        self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}

        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
Esempio n. 5
0
    def __init__(self, model='latin', prefilter=True, allow_alt_surname=True, allow_initials=True,
                 allow_missing_components=True):
        """Store the user options and load every model component.

        Args:
            model: Model name; stored on the instance before
                ``validate_parameters`` is called.
            prefilter: Stored on the instance; when truthy, the
                ``refined_soundex`` letter-to-code table is also created.
            allow_alt_surname: Stored on the instance as given.
            allow_initials: Stored on the instance as given.
            allow_missing_components: Stored on the instance as given.
        """
        # user-provided parameters
        self.model = model
        self.allow_alt_surname = allow_alt_surname
        self.allow_initials = allow_initials
        self.allow_missing_components = allow_missing_components
        self.prefilter = prefilter
        if self.prefilter:
            # Letters that share a code are listed together; the group at
            # position i (1-based) maps each of its letters to code i.
            letter_groups = ('bp', 'fv', 'cks', 'gj', 'qxz', 'dt', 'l', 'mn', 'r')
            self.refined_soundex = {
                letter: code
                for code, letters in enumerate(letter_groups, start=1)
                for letter in letters
            }

        # verify user-supplied class arguments; the returned value is used
        # below as the directory holding the model files
        model_dir = self.validate_parameters()

        self.impH = input_helpers.InputHelper()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # Soundex Lastname Algorithm
        self.pshp_soundex_last = PSHPSoundexLast()

        # String Distance algorithms (algo_names[i] labels algos[i])
        self.algos = [
            IterativeSubString(),
            BISIM(),
            DiscountedLevenshtein(),
            Prefix(),
            LCSstr(),
            MLIPNS(),
            Strcmp95(),
            MRA(),
            Editex(),
            SAPS(),
            FlexMetric(),
            JaroWinkler(mode='Jaro'),
            HigueraMico(),
            Sift4(),
            Eudex(),
            ALINE(),
            CovingtonGuard(),
            PhoneticEditDistance(),
        ]
        self.algo_names = [
            'iterativesubstring',
            'bisim',
            'discountedlevenshtein',
            'prefix',
            'lcsstr',
            'mlipns',
            'strcmp95',
            'mra',
            'editex',
            'saps',
            'flexmetric',
            'jaro',
            'higueramico',
            'sift4',
            'eudex',
            'aline',
            'covington',
            'phoneticeditdistance',
        ]

        # String Distance Pipeline (Level 0/Base Model)
        base_path = os.path.join(model_dir, 'base.pkl')
        self.baseModel = joblib.load(base_path)

        # Character Embedding Network (Level 0/Base Model)
        vocab_path = os.path.join(model_dir, 'vocab')
        processor = preprocess.VocabularyProcessor(max_document_length=15, min_frequency=0)
        self.vocab = processor.restore(vocab_path)

        siamese_model = os.path.join(model_dir, 'siamese')

        # Pick the namespace that provides the session-style API: the top
        # level module when the version string starts with '1', otherwise
        # tf.compat.v1.
        tf_v1 = tf if tf.__version__[0] == '1' else tf.compat.v1

        # start tensorflow session
        graph = tf.Graph()
        with graph.as_default() as graph:
            self.sess = tf_v1.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf_v1.train.import_meta_graph('{}.meta'.format(siamese_model))
                self.sess.run(tf_v1.global_variables_initializer())
                saver.restore(self.sess, siamese_model)

            def first_output(op_name):
                """Return the first output of the graph operation `op_name`."""
                return graph.get_operation_by_name(op_name).outputs[0]

            # Get the placeholders from the graph by name
            self.input_x1 = first_output('input_x1')
            self.input_x2 = first_output('input_x2')

            self.dropout_keep_prob = first_output('dropout_keep_prob')
            self.prediction = first_output('output/distance')
            self.sim = first_output('accuracy/temp_sim')

        # Logreg (Level 1/Meta Model)
        meta_path = os.path.join(model_dir, 'meta.pkl')
        self.metaModel = joblib.load(meta_path)

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}
        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
        # user scores (mapping dict from name pair tuple to similarity)
        self.user_scores = {}