class EudexTestCases(unittest.TestCase):
    """Unit tests for the Eudex distance measure.

    Covers abydos.distance.Eudex
    """

    cmp = Eudex()

    def test_eudex_dist_abs(self):
        """Check abydos.distance.Eudex.dist_abs."""
        # Pairs for which every weighting must report exactly zero.
        # A new Eudex (and a new weights list) is built for each assertion
        # rather than being shared between assertions.
        for src, tar in (('', ''), ('Niall', 'Niall')):
            self.assertEqual(self.cmp.dist_abs(src, tar), 0)
            self.assertEqual(Eudex(None).dist_abs(src, tar), 0)
            self.assertEqual(Eudex('fibonacci').dist_abs(src, tar), 0)
            self.assertEqual(Eudex([10, 1, 1, 1]).dist_abs(src, tar), 0)
            self.assertEqual(Eudex(_yield_1).dist_abs(src, tar), 0)
            self.assertEqual(self.cmp.dist_abs(src, tar, normalized=True), 0)

        # Pairs with a non-zero distance. Each row holds the target word,
        # the expected values for (default weights, None, 'fibonacci',
        # [10, 1, 1, 1], _yield_1), and the expected normalized value.
        differing = (
            ('Neil', (2, 1, 2, 1, 1), 0.00098039),
            ('Colin', (524, 10, 146, 42, 10), 0.25686274),
        )
        for tar, expected, expected_normalized in differing:
            by_default, unweighted, fibonacci, listed, generated = expected
            self.assertEqual(self.cmp.dist_abs('Niall', tar), by_default)
            self.assertEqual(Eudex(None).dist_abs('Niall', tar), unweighted)
            self.assertEqual(
                Eudex('fibonacci').dist_abs('Niall', tar), fibonacci
            )
            self.assertEqual(
                Eudex([10, 1, 1, 1]).dist_abs('Niall', tar), listed
            )
            self.assertEqual(Eudex(_yield_1).dist_abs('Niall', tar), generated)
            self.assertAlmostEqual(
                self.cmp.dist_abs('Niall', tar, normalized=True),
                expected_normalized,
            )

        # Module-level wrapper function
        self.assertEqual(eudex_hamming('Niall', 'Neil', 'fibonacci'), 2)

    def test_eudex_dist(self):
        """Check abydos.distance.Eudex.dist."""
        # Pairs whose distance must be exactly zero
        for src, tar in (('', ''), ('Niall', 'Niall')):
            self.assertEqual(self.cmp.dist(src, tar), 0)
            self.assertEqual(Eudex(None).dist(src, tar), 0)
            self.assertEqual(Eudex('fibonacci').dist(src, tar), 0)

        # Target word followed by the expected distance under the default
        # weights, None, and 'fibonacci'
        differing = (
            ('Neil', 0.00098039, 0.11111111, 0.00287356),
            ('Colin', 0.25686275, 0.16666667, 0.20977011),
        )
        for tar, by_default, unweighted, fibonacci in differing:
            self.assertAlmostEqual(self.cmp.dist('Niall', tar), by_default)
            self.assertAlmostEqual(Eudex(None).dist('Niall', tar), unweighted)
            self.assertAlmostEqual(
                Eudex('fibonacci').dist('Niall', tar), fibonacci
            )

        # An unrecognised weights name must raise ValueError; construction
        # stays inside the context manager together with the call.
        with self.assertRaises(ValueError):
            Eudex('veryLarge').dist_abs('Niall', 'Colin')

        # Module-level wrapper function
        self.assertAlmostEqual(
            dist_eudex('Niall', 'Neil', 'fibonacci'), 0.00287356
        )

    def test_eudex_sim(self):
        """Check abydos.distance.Eudex.sim."""
        # Pairs whose similarity must be exactly one
        for src, tar in (('', ''), ('Niall', 'Niall')):
            self.assertEqual(self.cmp.sim(src, tar), 1)
            self.assertEqual(Eudex(None).sim(src, tar), 1)
            self.assertEqual(Eudex('fibonacci').sim(src, tar), 1)

        # Target word followed by the expected similarity under the default
        # weights, None, and 'fibonacci'
        differing = (
            ('Neil', 0.99901961, 0.88888889, 0.99712644),
            ('Colin', 0.74313725, 0.83333333, 0.79022989),
        )
        for tar, by_default, unweighted, fibonacci in differing:
            self.assertAlmostEqual(self.cmp.sim('Niall', tar), by_default)
            self.assertAlmostEqual(Eudex(None).sim('Niall', tar), unweighted)
            self.assertAlmostEqual(
                Eudex('fibonacci').sim('Niall', tar), fibonacci
            )

        # Module-level wrapper function
        self.assertAlmostEqual(
            sim_eudex('Niall', 'Neil', 'fibonacci'), 0.99712644
        )
def __init__(self, model='latin', prefilter=True, allow_alt_surname=True,
             allow_initials=True, allow_missing_components=True):
    """Store the options, build the encoders and load the saved models.

    Each argument is stored unchanged on the instance under the same
    name. When ``prefilter`` is truthy a letter-to-code table is also
    stored as ``self.refined_soundex``. The directory returned by
    ``self.validate_parameters()`` is then used to load ``base.pkl``,
    ``vocab``, the ``siamese`` checkpoint and ``meta.pkl``.

    Args:
        model: stored as ``self.model`` (default ``'latin'``).
        prefilter: stored as ``self.prefilter``; also controls whether
            ``self.refined_soundex`` is created.
        allow_alt_surname: stored as ``self.allow_alt_surname``.
        allow_initials: stored as ``self.allow_initials``.
        allow_missing_components: stored as
            ``self.allow_missing_components``.
    """
    # User-provided parameters; validate_parameters() runs after these
    # are set.
    self.model = model
    self.prefilter = prefilter
    self.allow_alt_surname = allow_alt_surname
    self.allow_initials = allow_initials
    self.allow_missing_components = allow_missing_components

    if self.prefilter:
        # Letters sharing a code are grouped; the position of the group
        # (counting from 1) is the code assigned to each of its letters.
        code_groups = ('bp', 'fv', 'cks', 'gj', 'qxz', 'dt', 'l', 'mn', 'r')
        self.refined_soundex = {
            letter: code
            for code, letters in enumerate(code_groups, start=1)
            for letter in letters
        }

    # Verify the user-supplied arguments; the return value is used below
    # as the directory holding the saved model files.
    artifacts_dir = self.validate_parameters()

    self.impH = input_helpers.InputHelper()

    # Phonetic encoder
    self.pe = Ainsworth()

    # Soundex first-name and last-name algorithms
    self.pshp_soundex_first = PSHPSoundexFirst()
    self.pshp_soundex_last = PSHPSoundexLast()

    # String distance algorithms, each paired with the name it is listed
    # under; the two attributes below keep the same relative order.
    named_algos = [
        ('iterativesubstring', IterativeSubString()),
        ('bisim', BISIM()),
        ('discountedlevenshtein', DiscountedLevenshtein()),
        ('prefix', Prefix()),
        ('lcsstr', LCSstr()),
        ('mlipns', MLIPNS()),
        ('strcmp95', Strcmp95()),
        ('mra', MRA()),
        ('editex', Editex()),
        ('saps', SAPS()),
        ('flexmetric', FlexMetric()),
        ('jaro', JaroWinkler(mode='Jaro')),
        ('higueramico', HigueraMico()),
        ('sift4', Sift4()),
        ('eudex', Eudex()),
        ('aline', ALINE()),
        ('covington', CovingtonGuard()),
        ('phoneticeditdistance', PhoneticEditDistance()),
    ]
    self.algos = [algo for _, algo in named_algos]
    self.algo_names = [label for label, _ in named_algos]

    # String distance pipeline (Level 0 / base model)
    self.baseModel = joblib.load(os.path.join(artifacts_dir, 'base.pkl'))

    # Character embedding network (Level 0 / base model)
    vocab_processor = preprocess.VocabularyProcessor(
        max_document_length=15, min_frequency=0)
    self.vocab = vocab_processor.restore(os.path.join(artifacts_dir, 'vocab'))

    siamese_ckpt = os.path.join(artifacts_dir, 'siamese')

    # When the first character of the TensorFlow version string is '1'
    # the top-level namespace is used, otherwise tf.compat.v1.
    tf_api = tf if tf.__version__[0] == '1' else tf.compat.v1

    # Start the TensorFlow session inside a dedicated graph
    graph = tf.Graph()
    with graph.as_default() as active_graph:
        self.sess = tf_api.Session()
        with self.sess.as_default():
            # Load the saved meta graph, initialise, then restore variables
            saver = tf_api.train.import_meta_graph(
                '{}.meta'.format(siamese_ckpt))
            self.sess.run(tf_api.global_variables_initializer())
            saver.restore(self.sess, siamese_ckpt)

        def first_output(op_name):
            # First output of the graph operation with the given name
            return active_graph.get_operation_by_name(op_name).outputs[0]

        # Look up the needed operations in the graph by name
        self.input_x1 = first_output('input_x1')
        self.input_x2 = first_output('input_x2')
        self.dropout_keep_prob = first_output('dropout_keep_prob')
        self.prediction = first_output('output/distance')
        self.sim = first_output('accuracy/temp_sim')

    # Logreg (Level 1 / meta model)
    self.metaModel = joblib.load(os.path.join(artifacts_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
    # user scores (mapping dict from name pair tuple to similarity)
    self.user_scores = {}