def test_sift4_dist_abs(self):
    """Test abydos.distance.Sift4.dist_abs.

    Each case compares the absolute Sift4 distance of a string pair with
    a fixed expected value using exact equality.
    """
    # (src, tar, expected) triples copied from Lukas Benedix's post at
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    blog_cases = (
        ('', '', 0),
        ('a', '', 1),
        ('', 'a', 1),
        ('abc', '', 3),
        ('', 'abc', 3),
        ('a', 'a', 0),
        ('abc', 'abc', 0),
        ('a', 'ab', 1),
        ('ac', 'abc', 1),
        ('abcdefg', 'xabxcdxxefxgx', 7),
        ('ab', 'b', 1),
        ('ab', 'a', 1),
        ('abc', 'ac', 1),
        ('xabxcdxxefxgx', 'abcdefg', 7),
        ('a', 'b', 1),
        ('ab', 'ac', 1),
        ('ac', 'bc', 1),
        ('abc', 'axc', 1),
        ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6),
        ('example', 'samples', 2),
        ('sturgeon', 'urgently', 3),
        ('levenshtein', 'frankenstein', 6),
        ('distance', 'difference', 5),
    )
    for src, tar, expected in blog_cases:
        self.assertEqual(self.cmp.dist_abs(src, tar), expected)

    # Tests copied from
    # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
    sentence_src = 'This is the first string'
    sentence_tar = 'And this is another string'
    self.assertEqual(Sift4(5).dist_abs(sentence_src, sentence_tar), 11)

    lorem_src = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
    lorem_tar = (
        'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
        + ' elit.'
    )
    self.assertEqual(Sift4(10).dist_abs(lorem_src, lorem_tar), 12)

    # cases with max_distance: every pair is expected to report 5
    capped_pairs = (
        ('example', 'samples'),
        ('sturgeon', 'urgently'),
        ('levenshtein', 'frankenstein'),
        ('distance', 'difference'),
    )
    for src, tar in capped_pairs:
        self.assertEqual(self.cmp55.dist_abs(src, tar), 5)

    # Test wrapper
    self.assertEqual(sift4_common('xabxcdxxefxgx', 'abcdefg'), 7)
class Sift4TestCases(unittest.TestCase):
    """Test Sift4 functions.

    abydos.distance.Sift4
    """

    # Shared comparators: one built with default arguments, one built as
    # Sift4(5, 5) for the "max_distance" cases below.
    cmp = Sift4()
    cmp55 = Sift4(5, 5)

    def test_sift4_dist_abs(self):
        """Test abydos.distance.Sift4.dist_abs."""
        # (src, tar, expected) triples copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        blog_cases = (
            ('', '', 0),
            ('a', '', 1),
            ('', 'a', 1),
            ('abc', '', 3),
            ('', 'abc', 3),
            ('a', 'a', 0),
            ('abc', 'abc', 0),
            ('a', 'ab', 1),
            ('ac', 'abc', 1),
            ('abcdefg', 'xabxcdxxefxgx', 7),
            ('ab', 'b', 1),
            ('ab', 'a', 1),
            ('abc', 'ac', 1),
            ('xabxcdxxefxgx', 'abcdefg', 7),
            ('a', 'b', 1),
            ('ab', 'ac', 1),
            ('ac', 'bc', 1),
            ('abc', 'axc', 1),
            ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6),
            ('example', 'samples', 2),
            ('sturgeon', 'urgently', 3),
            ('levenshtein', 'frankenstein', 6),
            ('distance', 'difference', 5),
        )
        for src, tar, expected in blog_cases:
            self.assertEqual(self.cmp.dist_abs(src, tar), expected)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        sentence_src = 'This is the first string'
        sentence_tar = 'And this is another string'
        self.assertEqual(Sift4(5).dist_abs(sentence_src, sentence_tar), 11)

        lorem_src = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
        lorem_tar = (
            'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
            + ' elit.'
        )
        self.assertEqual(Sift4(10).dist_abs(lorem_src, lorem_tar), 12)

        # cases with max_distance: every pair is expected to report 5
        capped_pairs = (
            ('example', 'samples'),
            ('sturgeon', 'urgently'),
            ('levenshtein', 'frankenstein'),
            ('distance', 'difference'),
        )
        for src, tar in capped_pairs:
            self.assertEqual(self.cmp55.dist_abs(src, tar), 5)

        # Test wrapper
        self.assertEqual(sift4_common('xabxcdxxefxgx', 'abcdefg'), 7)

    def test_sift4_dist(self):
        """Test abydos.distance.Sift4.dist."""
        # Each row names the assertion to apply, so exact and approximate
        # comparisons stay in their original order.
        exact, approx = self.assertEqual, self.assertAlmostEqual

        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        blog_cases = (
            (exact, '', '', 0),
            (exact, 'a', '', 1),
            (exact, '', 'a', 1),
            (exact, 'abc', '', 1),
            (exact, '', 'abc', 1),
            (exact, 'a', 'a', 0),
            (exact, 'abc', 'abc', 0),
            (exact, 'a', 'ab', 0.5),
            (exact, 'ac', 'abc', 1 / 3),
            (approx, 'abcdefg', 'xabxcdxxefxgx', 0.538461538),
            (exact, 'ab', 'b', 0.5),
            (exact, 'ab', 'a', 0.5),
            (exact, 'abc', 'ac', 1 / 3),
            (approx, 'xabxcdxxefxgx', 'abcdefg', 0.538461538),
            (exact, 'a', 'b', 1),
            (exact, 'ab', 'ac', 0.5),
            (exact, 'ac', 'bc', 0.5),
            (exact, 'abc', 'axc', 1 / 3),
            (approx, 'xabxcdxxefxgx', '1ab2cd34ef5g6', 0.461538461),
            (approx, 'example', 'samples', 0.285714285),
            (approx, 'sturgeon', 'urgently', 0.375),
            (approx, 'levenshtein', 'frankenstein', 0.5),
            (approx, 'distance', 'difference', 0.5),
        )
        for check, src, tar, expected in blog_cases:
            check(self.cmp.dist(src, tar), expected)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        sentence_src = 'This is the first string'
        sentence_tar = 'And this is another string'
        approx(Sift4(5).dist(sentence_src, sentence_tar), 0.423076923)

        lorem_src = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
        lorem_tar = (
            'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
            + ' elit.'
        )
        approx(Sift4(10).dist(lorem_src, lorem_tar), 0.193548387)

        # cases with max_distance
        capped_cases = (
            ('example', 'samples', 0.714285714),
            ('sturgeon', 'urgently', 0.625),
            ('levenshtein', 'frankenstein', 0.416666666),
            ('distance', 'difference', 0.5),
        )
        for src, tar, expected in capped_cases:
            approx(self.cmp55.dist(src, tar), expected)

        # Test wrapper
        approx(dist_sift4('xabxcdxxefxgx', 'abcdefg'), 0.538461538)

    def test_sift4_sim(self):
        """Test abydos.distance.Sift4.sim."""
        # Each row names the assertion to apply, so exact and approximate
        # comparisons stay in their original order.
        exact, approx = self.assertEqual, self.assertAlmostEqual

        # tests copied from Lukas Benedix's post at
        # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
        blog_cases = (
            (exact, '', '', 1),
            (exact, 'a', '', 0),
            (exact, '', 'a', 0),
            (exact, 'abc', '', 0),
            (exact, '', 'abc', 0),
            (exact, 'a', 'a', 1),
            (exact, 'abc', 'abc', 1),
            (exact, 'a', 'ab', 0.5),
            (approx, 'ac', 'abc', 2 / 3),
            (approx, 'abcdefg', 'xabxcdxxefxgx', 0.461538461),
            (exact, 'ab', 'b', 0.5),
            (exact, 'ab', 'a', 0.5),
            (approx, 'abc', 'ac', 2 / 3),
            (approx, 'xabxcdxxefxgx', 'abcdefg', 0.461538461),
            (exact, 'a', 'b', 0),
            (exact, 'ab', 'ac', 0.5),
            (exact, 'ac', 'bc', 0.5),
            (approx, 'abc', 'axc', 2 / 3),
            (approx, 'xabxcdxxefxgx', '1ab2cd34ef5g6', 0.538461538),
            (approx, 'example', 'samples', 0.714285714),
            (approx, 'sturgeon', 'urgently', 0.625),
            (approx, 'levenshtein', 'frankenstein', 0.5),
            (approx, 'distance', 'difference', 0.5),
        )
        for check, src, tar, expected in blog_cases:
            check(self.cmp.sim(src, tar), expected)

        # Tests copied from
        # https://github.com/tdebatty/java-string-similarity/blob/master/src/test/java/info/debatty/java/stringsimilarity/experimental/Sift4Test.java
        sentence_src = 'This is the first string'
        sentence_tar = 'And this is another string'
        approx(Sift4(5).sim(sentence_src, sentence_tar), 0.576923077)

        lorem_src = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
        lorem_tar = (
            'Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing'
            + ' elit.'
        )
        approx(Sift4(10).sim(lorem_src, lorem_tar), 0.806451613)

        # cases with max_distance
        capped_cases = (
            ('example', 'samples', 0.285714286),
            ('sturgeon', 'urgently', 0.375),
            ('levenshtein', 'frankenstein', 0.583333333),
            ('distance', 'difference', 0.5),
        )
        for src, tar, expected in capped_cases:
            approx(self.cmp55.sim(src, tar), expected)

        # Test wrapper
        approx(sim_sift4('xabxcdxxefxgx', 'abcdefg'), 0.461538461)
# Module-level singletons: each algorithm object is constructed once here and
# referenced by the code below.

# First-name Soundex encoder instance.
pshp_soundex_first = PSHPSoundexFirst()
# Encoder whose encode() output is passed to ipa_to_features() in sum_ipa.
pe = Ainsworth()

# One instance per string distance/similarity algorithm.
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

# algos and algo_names are parallel lists of 17 entries each: the label at
# index i in algo_names belongs to the instance at index i in algos.
algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps,
         flexmetric, jaro, higuera_mico, sift4, eudex, aline, phonetic_edit]
algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
              'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
              'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
              'phoneticeditdistance']


def sum_ipa(name_a, name_b):
    # Encode both names with ``pe`` and convert each encoding to a feature
    # sequence, then sum cmp_features() over the position-wise pairs.
    # NOTE(review): this definition appears to be cut off in this view (no
    # else branch or return statement is visible) -- confirm against the
    # full file before relying on it.
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    # With at most one feature entry for name_a the sum is divided by 1,
    # i.e. left unscaled. zip() stops at the shorter of the two sequences.
    if len(feat1) <= 1:
        score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2))/1
def __init__(self, model='latin'):
    """Load the pre-trained artifacts for ``model`` and set up empty caches.

    The model files are read from ``models/<model>`` next to this source
    file. If that directory does not exist yet, it is created and
    ``models/<model>.tar.gz`` is extracted into it.

    Args:
        model: Name of the model directory / tarball under ``models``.
            Defaults to ``'latin'``.
    """
    self.model = model
    self.impH = input_helpers.InputHelper()
    self.ST = syllable_tokenizer.SyllableTokenizer()
    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # String Distance algorithms; algo_names holds the label for the
    # instance at the same index in algos.
    self.algos = [
        IterativeSubString(), BISIM(), DiscountedLevenshtein(), Prefix(),
        LCSstr(), MLIPNS(), Strcmp95(), MRA(), Editex(), SAPS(),
        FlexMetric(), JaroWinkler(mode='Jaro'), HigueraMico(), Sift4(),
        Eudex(), ALINE(), Covington(), PhoneticEditDistance()
    ]
    self.algo_names = [
        'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
        'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
        'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
        'covington', 'phoneticeditdistance'
    ]

    # extract model tarball into directory if it doesn't exist
    model_dir = os.path.join(os.path.dirname(__file__), "models", self.model)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        archive_path = os.path.join(
            os.path.dirname(__file__), "models", self.model + ".tar.gz")
        # Context manager so the archive handle is closed even when
        # extraction raises (the previous open/extract/close sequence
        # leaked the handle on error).
        with tarfile.open(archive_path, "r:gz") as tar:
            tar.extractall(model_dir)

    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

    # Character Embedding Network (Level 0/Base Model)
    self.vocab = preprocess.VocabularyProcessor(
        max_document_length=15,
        min_frequency=0).restore(os.path.join(model_dir, 'vocab'))

    siamese_model = os.path.join(model_dir, 'siamese')

    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                '{}.meta'.format(siamese_model))
            self.sess.run(tf.global_variables_initializer())
            saver.restore(self.sess, siamese_model)
            # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]

            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]

    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
def __init__(self, model='latin', prefilter=True, allow_alt_surname=True, allow_initials=True,
             allow_missing_components=True):
    """Store the user options and load the model artifacts.

    Args:
        model: Stored as ``self.model``; defaults to ``'latin'``.
        prefilter: Stored as ``self.prefilter``. When truthy, the
            ``self.refined_soundex`` letter-to-code table is also set.
        allow_alt_surname: Stored as ``self.allow_alt_surname``.
        allow_initials: Stored as ``self.allow_initials``.
        allow_missing_components: Stored as
            ``self.allow_missing_components``.

    The model directory is whatever ``self.validate_parameters()`` returns;
    the base model, vocabulary, siamese network and meta model are loaded
    from files inside it.
    """
    # user-provided parameters
    self.model = model
    self.allow_alt_surname = allow_alt_surname
    self.allow_initials = allow_initials
    self.allow_missing_components = allow_missing_components
    self.prefilter = prefilter
    if self.prefilter:
        # letter -> code table, one code group per row
        self.refined_soundex = {
            'b': 1, 'p': 1,
            'f': 2, 'v': 2,
            'c': 3, 'k': 3, 's': 3,
            'g': 4, 'j': 4,
            'q': 5, 'x': 5, 'z': 5,
            'd': 6, 't': 6,
            'l': 7,
            'm': 8, 'n': 8,
            'r': 9,
        }

    # verify user-supplied class arguments
    model_dir = self.validate_parameters()

    self.impH = input_helpers.InputHelper()
    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # Soundex Lastname Algorithm
    self.pshp_soundex_last = PSHPSoundexLast()

    # String Distance algorithms; algo_names holds the label for the
    # instance at the same index in algos.
    self.algos = [
        IterativeSubString(),
        BISIM(),
        DiscountedLevenshtein(),
        Prefix(),
        LCSstr(),
        MLIPNS(),
        Strcmp95(),
        MRA(),
        Editex(),
        SAPS(),
        FlexMetric(),
        JaroWinkler(mode='Jaro'),
        HigueraMico(),
        Sift4(),
        Eudex(),
        ALINE(),
        CovingtonGuard(),
        PhoneticEditDistance(),
    ]
    self.algo_names = [
        'iterativesubstring',
        'bisim',
        'discountedlevenshtein',
        'prefix',
        'lcsstr',
        'mlipns',
        'strcmp95',
        'mra',
        'editex',
        'saps',
        'flexmetric',
        'jaro',
        'higueramico',
        'sift4',
        'eudex',
        'aline',
        'covington',
        'phoneticeditdistance',
    ]

    # String Distance Pipeline (Level 0/Base Model)
    base_model_path = os.path.join(model_dir, 'base.pkl')
    self.baseModel = joblib.load(base_model_path)

    # Character Embedding Network (Level 0/Base Model)
    vocab_processor = preprocess.VocabularyProcessor(max_document_length=15, min_frequency=0)
    self.vocab = vocab_processor.restore(os.path.join(model_dir, 'vocab'))

    siamese_model = os.path.join(model_dir, 'siamese')

    # TensorFlow 1.x exposes the session/saver API at the top level; for
    # any other version the same names are taken from tf.compat.v1.
    tf_api = tf if tf.__version__[0] == '1' else tf.compat.v1

    # start tensorflow session
    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf_api.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf_api.train.import_meta_graph('{}.meta'.format(siamese_model))
            self.sess.run(tf_api.global_variables_initializer())
            saver.restore(self.sess, siamese_model)

            def first_output(op_name):
                # First output tensor of the named graph operation.
                return graph.get_operation_by_name(op_name).outputs[0]

            # Get the placeholders from the graph by name
            self.input_x1 = first_output('input_x1')
            self.input_x2 = first_output('input_x2')
            self.dropout_keep_prob = first_output('dropout_keep_prob')
            self.prediction = first_output('output/distance')
            self.sim = first_output('accuracy/temp_sim')

    # Logreg (Level 1/Meta Model)
    meta_model_path = os.path.join(model_dir, 'meta.pkl')
    self.metaModel = joblib.load(meta_model_path)

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
    # user scores (mapping dict from name pair tuple to similarity)
    self.user_scores = {}