def test_dist_jaro_winkler(self):
    """Test abydos.distance.JaroWinkler.dist.

    Runs the Jaro and Jaro-Winkler measures over the same string pairs:
    first pairs with an exact expected distance, then pairs compared
    approximately, and finally parameter settings for which ``dist`` is
    expected to raise ``ValueError``.
    """
    jaro, winkler = self.jaro, self.jaro_winkler

    # Pairs for which both measures must give exactly the same value.
    exact_cases = (
        ('', '', 0),
        ('MARTHA', '', 1),
        ('', 'MARHTA', 1),
        ('MARTHA', 'MARTHA', 0),
    )
    for src, tar, expected in exact_cases:
        self.assertEqual(jaro.dist(src, tar), expected)
        self.assertEqual(winkler.dist(src, tar), expected)

    # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
    approx_cases = (
        ('MARTHA', 'MARHTA', 0.05555555, 0.03888888),
        ('DWAYNE', 'DUANE', 0.17777777, 0.16),
        ('DIXON', 'DICKSONX', 0.23333333, 0.18666666),
    )
    for src, tar, jaro_expected, winkler_expected in approx_cases:
        self.assertAlmostEqual(jaro.dist(src, tar), jaro_expected)
        self.assertAlmostEqual(winkler.dist(src, tar), winkler_expected)

    # Each setting below must make dist() raise ValueError. The instance
    # is created outside the assertion so only the call itself is checked.
    rejected_settings = (
        {'boost_threshold': 2},
        {'boost_threshold': -1},
        {'scaling_factor': 0.3},
        {'scaling_factor': -1},
    )
    for kwargs in rejected_settings:
        measure = JaroWinkler(**kwargs)
        with self.assertRaises(ValueError):
            measure.dist('abcd', 'dcba')

    self.assertAlmostEqual(winkler.dist('ABCD', 'EFGH'), 1.0)
def test_sim_jaro_winkler(self):
    """Test abydos.distance.JaroWinkler.sim.

    Covers pairs with exact expected similarities, pairs compared
    approximately, parameter settings for which ``sim`` must raise
    ``ValueError``, the ``long_strings`` option, and the
    ``sim_jaro_winkler`` wrapper function.
    """
    jaro, winkler = self.jaro, self.jaro_winkler

    # Pairs for which both measures must give exactly the same value.
    for src, tar, expected in (
        ('', '', 1),
        ('MARTHA', '', 0),
        ('', 'MARHTA', 0),
        ('MARTHA', 'MARTHA', 1),
    ):
        self.assertEqual(jaro.sim(src, tar), expected)
        self.assertEqual(winkler.sim(src, tar), expected)

    # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
    for src, tar, jaro_expected, winkler_expected in (
        ('MARTHA', 'MARHTA', 0.94444444, 0.96111111),
        ('DWAYNE', 'DUANE', 0.82222222, 0.84),
        ('DIXON', 'DICKSONX', 0.76666666, 0.81333333),
    ):
        self.assertAlmostEqual(jaro.sim(src, tar), jaro_expected)
        self.assertAlmostEqual(winkler.sim(src, tar), winkler_expected)

    # Each setting below must make sim() raise ValueError. The instance
    # is created outside the assertion so only the call itself is checked.
    for kwargs in (
        {'boost_threshold': 2},
        {'boost_threshold': -1},
        {'scaling_factor': 0.3},
        {'scaling_factor': -1},
    ):
        measure = JaroWinkler(**kwargs)
        with self.assertRaises(ValueError):
            measure.sim('abcd', 'dcba')

    self.assertAlmostEqual(winkler.sim('ABCD', 'EFGH'), 0.0)

    # long_strings = True (applies only to Jaro-Winkler, not Jaro)
    long_default = JaroWinkler(long_strings=True)
    self.assertEqual(
        long_default.sim('ABCD', 'EFGH'), jaro.sim('ABCD', 'EFGH')
    )
    long_jaro = JaroWinkler(mode='jaro', long_strings=True)
    self.assertEqual(
        long_jaro.sim('DIXON', 'DICKSONX'), jaro.sim('DIXON', 'DICKSONX')
    )
    for src, tar, expected in (
        ('DIXON', 'DICKSONX', 0.83030303),
        ('MARTHA', 'MARHTA', 0.97083333),
    ):
        # A fresh instance per case, as each assertion builds its own.
        long_winkler = JaroWinkler(mode='winkler', long_strings=True)
        self.assertAlmostEqual(long_winkler.sim(src, tar), expected)

    # Test wrapper
    for mode, expected in (('jaro', 0.76666666), ('winkler', 0.81333333)):
        self.assertAlmostEqual(
            sim_jaro_winkler('DIXON', 'DICKSONX', mode=mode), expected
        )
def test_soft_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (soft).

    Uses the shared ``cmp_j_soft`` instance for the tabulated cases and a
    soft Jaccard built with a ``JaroWinkler`` metric for the final one.
    """
    soft_sim = self.cmp_j_soft.sim

    # Base cases
    for src, tar, expected in (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    ):
        self.assertEqual(soft_sim(src, tar), expected)

    for src, tar, expected in (
        ('abcd', 'efgh', 0.11111111),
        ('Nigel', 'Niall', 0.5),
        ('Niall', 'Nigel', 0.5),
        ('Colin', 'Coiln', 0.6),
        ('Coiln', 'Colin', 0.6),
        ('ATCAACGAGT', 'AACGATTAG', 0.68),
    ):
        self.assertAlmostEqual(soft_sim(src, tar), expected)

    jw_soft = Jaccard(intersection_type='soft', metric=JaroWinkler())
    self.assertAlmostEqual(
        jw_soft.sim('synonym', 'antonym'), 0.777777777777
    )
def test_linkage_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (group linkage).

    Uses the shared ``cmp_j_linkage`` instance for the tabulated cases and
    a linkage Jaccard built with a ``JaroWinkler`` metric and a threshold
    of 0.2 for the final one.
    """
    linkage_sim = self.cmp_j_linkage.sim

    # Base cases
    for src, tar, expected in (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    ):
        self.assertEqual(linkage_sim(src, tar), expected)

    for src, tar, expected in (
        ('abcd', 'efgh', 0.1111111111111111),
        ('Nigel', 'Niall', 0.5),
        ('Niall', 'Nigel', 0.5),
        ('Colin', 'Coiln', 0.6),
        ('Coiln', 'Colin', 0.6),
        ('ATCAACGAGT', 'AACGATTAG', 0.68),
    ):
        self.assertAlmostEqual(linkage_sim(src, tar), expected)

    jw_linkage = Jaccard(
        intersection_type='linkage', metric=JaroWinkler(), threshold=0.2
    )
    self.assertAlmostEqual(jw_linkage.sim('synonym', 'antonym'), 0.6)
def test_soft_jaccard_sim(self):
    """Test abydos.distance.Jaccard.sim (soft).

    Uses the shared ``cmp_j_soft`` instance for the tabulated cases, then
    a whitespace-tokenized soft Jaccard in both argument orders, and
    finally expects ``TypeError`` when a ``JaroWinkler`` metric is combined
    with the whitespace tokenizer.
    """
    soft_sim = self.cmp_j_soft.sim

    # Base cases
    for src, tar, expected in (
        ('', '', 1.0),
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('abc', 'abc', 1.0),
    ):
        self.assertEqual(soft_sim(src, tar), expected)

    for src, tar, expected in (
        ('abcd', 'efgh', 0.11111111),
        ('Nigel', 'Niall', 0.5),
        ('Niall', 'Nigel', 0.5),
        ('Colin', 'Coiln', 0.6),
        ('Coiln', 'Colin', 0.6),
        ('ATCAACGAGT', 'AACGATTAG', 0.68),
    ):
        self.assertAlmostEqual(soft_sim(src, tar), expected)

    # Same expected value in both argument orders; a fresh instance is
    # built for each direction.
    phrases = ('junior system analyst', 'systems analyst')
    for src, tar in (phrases, phrases[::-1]):
        ws_soft = Jaccard(
            intersection_type='soft', tokenizer=WhitespaceTokenizer()
        )
        self.assertAlmostEqual(ws_soft.sim(src, tar), 0.6190476190476191)

    # Construction is kept inside the context, so a TypeError from either
    # the constructor or sim() satisfies the assertion.
    with self.assertRaises(TypeError):
        Jaccard(
            intersection_type='soft',
            metric=JaroWinkler(),
            tokenizer=WhitespaceTokenizer(),
        ).sim('junior system analyst', 'systems analyst')
def __init__(self, model='latin'):
    """Set up the encoders, distance measures and trained models.

    Model files are read from ``models/<model>`` next to this module. When
    that directory does not exist it is created and
    ``models/<model>.tar.gz`` is unpacked into it before anything is
    loaded.

    Args:
        model: Name of the model directory (and tarball) under ``models``.
    """
    self.model = model
    self.impH = input_helpers.InputHelper()
    self.ST = syllable_tokenizer.SyllableTokenizer()
    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # String Distance algorithms
    self.algos = [
        IterativeSubString(),
        BISIM(),
        DiscountedLevenshtein(),
        Prefix(),
        LCSstr(),
        MLIPNS(),
        Strcmp95(),
        MRA(),
        Editex(),
        SAPS(),
        FlexMetric(),
        JaroWinkler(mode='Jaro'),
        HigueraMico(),
        Sift4(),
        Eudex(),
        ALINE(),
        Covington(),
        PhoneticEditDistance(),
    ]
    # Listed in the same order as self.algos.
    self.algo_names = [
        'iterativesubstring',
        'bisim',
        'discountedlevenshtein',
        'prefix',
        'lcsstr',
        'mlipns',
        'strcmp95',
        'mra',
        'editex',
        'saps',
        'flexmetric',
        'jaro',
        'higueramico',
        'sift4',
        'eudex',
        'aline',
        'covington',
        'phoneticeditdistance',
    ]

    # Extract the model tarball into its directory if it does not exist.
    models_root = os.path.join(os.path.dirname(__file__), "models")
    model_dir = os.path.join(models_root, self.model)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        archive_path = os.path.join(models_root, self.model + ".tar.gz")
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): a failed extraction still leaves model_dir behind,
        # so the next run skips unpacking -- confirm whether that matters.
        with tarfile.open(archive_path, "r:gz") as tar:
            tar.extractall(model_dir)

    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

    # Character Embedding Network (Level 0/Base Model)
    self.vocab = preprocess.VocabularyProcessor(
        max_document_length=15, min_frequency=0
    ).restore(os.path.join(model_dir, 'vocab'))

    siamese_model = os.path.join(model_dir, 'siamese')

    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                '{}.meta'.format(siamese_model)
            )
            self.sess.run(tf.global_variables_initializer())
            saver.restore(self.sess, siamese_model)
        # Get the placeholders from the graph by name
        self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
        self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]
        self.dropout_keep_prob = graph.get_operation_by_name(
            'dropout_keep_prob'
        ).outputs[0]
        self.prediction = graph.get_operation_by_name(
            'output/distance'
        ).outputs[0]
        self.sim = graph.get_operation_by_name(
            'accuracy/temp_sim'
        ).outputs[0]

    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
# Featurizer pshp_soundex_first = PSHPSoundexFirst() pe = Ainsworth() iss = IterativeSubString() bisim = BISIM() dlev = DiscountedLevenshtein() prefix = Prefix() lcs = LCSstr() mlipns = MLIPNS() strcmp95 = Strcmp95() mra = MRA() editex = Editex() saps = SAPS() flexmetric = FlexMetric() jaro = JaroWinkler(mode='Jaro') higuera_mico = HigueraMico() sift4 = Sift4() eudex = Eudex() aline = ALINE() phonetic_edit = PhoneticEditDistance() algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps, flexmetric, jaro, higuera_mico, sift4, eudex, aline, phonetic_edit] algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix', 'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps', 'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline', 'phoneticeditdistance'] def sum_ipa(name_a, name_b): feat1 = ipa_to_features(pe.encode(name_a)) feat2 = ipa_to_features(pe.encode(name_b))
class CompleteLinkageTestCases(unittest.TestCase):
    """Test CompleteLinkage functions.

    abydos.distance.CompleteLinkage
    """

    cmp = CompleteLinkage()
    cmp_q4 = CompleteLinkage(tokenizer=QGrams(qval=4, start_stop=''))
    cmp_q4_jw = CompleteLinkage(
        tokenizer=QGrams(qval=4, start_stop=''), metric=JaroWinkler()
    )

    # Pairs in which at least one string is empty (compared exactly).
    _EMPTY_PAIRS = (('', ''), ('a', ''), ('', 'a'), ('abc', ''), ('', 'abc'))
    # Pairs that are compared with assertAlmostEqual in every test.
    _APPROX_PAIRS = (
        ('Nigel', 'Niall'),
        ('Niall', 'Nigel'),
        ('Colin', 'Coiln'),
        ('Coiln', 'Colin'),
        ('ATCAACGAGT', 'AACGATTAG'),
    )

    def test_complete_linkage_dist(self):
        """Test abydos.distance.CompleteLinkage.dist."""
        dist = self.cmp.dist

        # Base cases
        for src, tar in self._EMPTY_PAIRS:
            self.assertEqual(dist(src, tar), 0.0)
        self.assertEqual(dist('abc', 'abc'), 1.0)
        self.assertEqual(dist('abcd', 'efgh'), 1.0)

        for src, tar in self._APPROX_PAIRS:
            self.assertAlmostEqual(dist(src, tar), 1.0)

        # 4-gram tokenization, without and with a JaroWinkler metric
        self.assertEqual(self.cmp_q4.dist('AAAT', 'AATT'), 0.25)
        self.assertAlmostEqual(
            self.cmp_q4_jw.dist('AAAT', 'AATT'), 0.133333333333
        )

    def test_complete_linkage_sim(self):
        """Test abydos.distance.CompleteLinkage.sim."""
        sim = self.cmp.sim

        # Base cases
        for src, tar in self._EMPTY_PAIRS:
            self.assertEqual(sim(src, tar), 1.0)
        self.assertEqual(sim('abc', 'abc'), 0.0)
        self.assertEqual(sim('abcd', 'efgh'), 0.0)

        for src, tar in self._APPROX_PAIRS:
            self.assertAlmostEqual(sim(src, tar), 0.0)

    def test_complete_linkage_dist_abs(self):
        """Test abydos.distance.CompleteLinkage.dist_abs."""
        dist_abs = self.cmp.dist_abs
        neg_inf = float('-inf')

        # Base cases
        for src, tar in self._EMPTY_PAIRS:
            self.assertEqual(dist_abs(src, tar), neg_inf)
        self.assertEqual(dist_abs('abc', 'abc'), 2)
        self.assertEqual(dist_abs('abcd', 'efgh'), 2)

        for src, tar in self._APPROX_PAIRS:
            self.assertAlmostEqual(dist_abs(src, tar), 2)
class JaroWinklerTestCases(unittest.TestCase):
    """Test Jaro(-Winkler) functions.

    abydos.distance.JaroWinkler
    """

    jaro = JaroWinkler(mode='jaro')
    jaro_winkler = JaroWinkler(mode='winkler')

    # Keyword settings for which sim()/dist() are expected to raise
    # ValueError when called.
    _REJECTED_SETTINGS = (
        {'boost_threshold': 2},
        {'boost_threshold': -1},
        {'scaling_factor': 0.3},
        {'scaling_factor': -1},
    )

    def test_sim_jaro_winkler(self):
        """Test abydos.distance.JaroWinkler.sim."""
        jaro, winkler = self.jaro, self.jaro_winkler

        # Pairs for which both measures must give exactly the same value.
        for src, tar, expected in (
            ('', '', 1),
            ('MARTHA', '', 0),
            ('', 'MARHTA', 0),
            ('MARTHA', 'MARTHA', 1),
        ):
            self.assertEqual(jaro.sim(src, tar), expected)
            self.assertEqual(winkler.sim(src, tar), expected)

        # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
        for src, tar, jaro_expected, winkler_expected in (
            ('MARTHA', 'MARHTA', 0.94444444, 0.96111111),
            ('DWAYNE', 'DUANE', 0.82222222, 0.84),
            ('DIXON', 'DICKSONX', 0.76666666, 0.81333333),
        ):
            self.assertAlmostEqual(jaro.sim(src, tar), jaro_expected)
            self.assertAlmostEqual(winkler.sim(src, tar), winkler_expected)

        # The instance is created outside the assertion so that only the
        # sim() call itself is checked for ValueError.
        for kwargs in self._REJECTED_SETTINGS:
            measure = JaroWinkler(**kwargs)
            with self.assertRaises(ValueError):
                measure.sim('abcd', 'dcba')

        self.assertAlmostEqual(winkler.sim('ABCD', 'EFGH'), 0.0)

        # long_strings = True (applies only to Jaro-Winkler, not Jaro)
        long_default = JaroWinkler(long_strings=True)
        self.assertEqual(
            long_default.sim('ABCD', 'EFGH'), jaro.sim('ABCD', 'EFGH')
        )
        long_jaro = JaroWinkler(mode='jaro', long_strings=True)
        self.assertEqual(
            long_jaro.sim('DIXON', 'DICKSONX'),
            jaro.sim('DIXON', 'DICKSONX'),
        )
        for src, tar, expected in (
            ('DIXON', 'DICKSONX', 0.83030303),
            ('MARTHA', 'MARHTA', 0.97083333),
        ):
            # A fresh instance per case, as each assertion builds its own.
            long_winkler = JaroWinkler(mode='winkler', long_strings=True)
            self.assertAlmostEqual(long_winkler.sim(src, tar), expected)

    def test_dist_jaro_winkler(self):
        """Test abydos.distance.JaroWinkler.dist."""
        jaro, winkler = self.jaro, self.jaro_winkler

        # Pairs for which both measures must give exactly the same value.
        for src, tar, expected in (
            ('', '', 0),
            ('MARTHA', '', 1),
            ('', 'MARHTA', 1),
            ('MARTHA', 'MARTHA', 0),
        ):
            self.assertEqual(jaro.dist(src, tar), expected)
            self.assertEqual(winkler.dist(src, tar), expected)

        # https://en.wikipedia.org/wiki/Jaro-Winkler_distance
        for src, tar, jaro_expected, winkler_expected in (
            ('MARTHA', 'MARHTA', 0.05555555, 0.03888888),
            ('DWAYNE', 'DUANE', 0.17777777, 0.16),
            ('DIXON', 'DICKSONX', 0.23333333, 0.18666666),
        ):
            self.assertAlmostEqual(jaro.dist(src, tar), jaro_expected)
            self.assertAlmostEqual(winkler.dist(src, tar), winkler_expected)

        # The instance is created outside the assertion so that only the
        # dist() call itself is checked for ValueError.
        for kwargs in self._REJECTED_SETTINGS:
            measure = JaroWinkler(**kwargs)
            with self.assertRaises(ValueError):
                measure.dist('abcd', 'dcba')

        self.assertAlmostEqual(winkler.dist('ABCD', 'EFGH'), 1.0)
def __init__(self, model='latin', prefilter=True, allow_alt_surname=True, allow_initials=True,
             allow_missing_components=True):
    """Store the user options and load every model component.

    The options are stored on the instance first, then
    ``self.validate_parameters()`` is called; the directory it returns is
    where the base model, vocabulary, siamese network and meta model are
    loaded from.

    Args:
        model: Stored as ``self.model``.
        prefilter: Stored as ``self.prefilter``; when truthy, the
            ``refined_soundex`` letter-to-code table is also created.
        allow_alt_surname: Stored as ``self.allow_alt_surname``.
        allow_initials: Stored as ``self.allow_initials``.
        allow_missing_components: Stored as
            ``self.allow_missing_components``.
    """
    # user-provided parameters
    self.model = model
    self.allow_alt_surname = allow_alt_surname
    self.allow_initials = allow_initials
    self.allow_missing_components = allow_missing_components
    self.prefilter = prefilter
    if self.prefilter:
        # Letters sharing a code are grouped; group position (1-based) is
        # the code assigned to each letter of the group.
        soundex_groups = ('bp', 'fv', 'cks', 'gj', 'qxz', 'dt', 'l', 'mn', 'r')
        self.refined_soundex = {
            letter: code
            for code, letters in enumerate(soundex_groups, start=1)
            for letter in letters
        }

    # verify user-supplied class arguments
    model_dir = self.validate_parameters()

    self.impH = input_helpers.InputHelper()
    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # Soundex Lastname Algorithm
    self.pshp_soundex_last = PSHPSoundexLast()

    # String Distance algorithms, each paired with its feature name
    named_algos = [
        ('iterativesubstring', IterativeSubString()),
        ('bisim', BISIM()),
        ('discountedlevenshtein', DiscountedLevenshtein()),
        ('prefix', Prefix()),
        ('lcsstr', LCSstr()),
        ('mlipns', MLIPNS()),
        ('strcmp95', Strcmp95()),
        ('mra', MRA()),
        ('editex', Editex()),
        ('saps', SAPS()),
        ('flexmetric', FlexMetric()),
        ('jaro', JaroWinkler(mode='Jaro')),
        ('higueramico', HigueraMico()),
        ('sift4', Sift4()),
        ('eudex', Eudex()),
        ('aline', ALINE()),
        ('covington', CovingtonGuard()),
        ('phoneticeditdistance', PhoneticEditDistance()),
    ]
    self.algos = [algo for _, algo in named_algos]
    self.algo_names = [name for name, _ in named_algos]

    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

    # Character Embedding Network (Level 0/Base Model)
    vocab_processor = preprocess.VocabularyProcessor(max_document_length=15, min_frequency=0)
    self.vocab = vocab_processor.restore(os.path.join(model_dir, 'vocab'))

    siamese_model = os.path.join(model_dir, 'siamese')

    # start tensorflow session
    # Use the top-level API when the version string starts with '1',
    # otherwise the tf.compat.v1 namespace.
    tf_api = tf if tf.__version__[0] == '1' else tf.compat.v1
    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf_api.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf_api.train.import_meta_graph('{}.meta'.format(siamese_model))
            self.sess.run(tf_api.global_variables_initializer())
            saver.restore(self.sess, siamese_model)

        # Get the placeholders from the graph by name
        def first_output(op_name):
            return graph.get_operation_by_name(op_name).outputs[0]

        self.input_x1 = first_output('input_x1')
        self.input_x2 = first_output('input_x2')
        self.dropout_keep_prob = first_output('dropout_keep_prob')
        self.prediction = first_output('output/distance')
        self.sim = first_output('accuracy/temp_sim')

    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
    # user scores (mapping dict from name pair tuple to similarity)
    self.user_scores = {}
class SingleLinkageTestCases(unittest.TestCase):
    """Test SingleLinkage functions.

    abydos.distance.SingleLinkage
    """

    cmp = SingleLinkage()
    cmp_jw = SingleLinkage(metric=JaroWinkler())

    # Pairs in which at least one string is empty (compared exactly).
    _EMPTY_PAIRS = (('', ''), ('a', ''), ('', 'a'), ('abc', ''), ('', 'abc'))
    # Pairs that are compared with assertAlmostEqual in every test.
    _APPROX_PAIRS = (
        ('Nigel', 'Niall'),
        ('Niall', 'Nigel'),
        ('Colin', 'Coiln'),
        ('Coiln', 'Colin'),
        ('ATCAACGAGT', 'AACGATTAG'),
    )

    def test_single_linkage_dist(self):
        """Test abydos.distance.SingleLinkage.dist."""
        dist = self.cmp.dist

        # Base cases
        for src, tar in self._EMPTY_PAIRS:
            self.assertEqual(dist(src, tar), 1.0)
        self.assertEqual(dist('abc', 'abc'), 0.0)
        self.assertEqual(dist('abcd', 'efgh'), 0.5)

        for src, tar in self._APPROX_PAIRS:
            self.assertAlmostEqual(dist(src, tar), 0.0)

    def test_single_linkage_sim(self):
        """Test abydos.distance.SingleLinkage.sim."""
        sim = self.cmp.sim

        # Base cases
        for src, tar in self._EMPTY_PAIRS:
            self.assertEqual(sim(src, tar), 0.0)
        self.assertEqual(sim('abc', 'abc'), 1.0)
        self.assertEqual(sim('abcd', 'efgh'), 0.5)

        for src, tar in self._APPROX_PAIRS:
            self.assertAlmostEqual(sim(src, tar), 1.0)

    def test_single_linkage_dist_abs(self):
        """Test abydos.distance.SingleLinkage.dist_abs."""
        dist_abs = self.cmp.dist_abs
        pos_inf = float('inf')

        # Base cases
        for src, tar in self._EMPTY_PAIRS:
            self.assertEqual(dist_abs(src, tar), pos_inf)
        self.assertEqual(dist_abs('abc', 'abc'), 0)
        self.assertEqual(dist_abs('abcd', 'efgh'), 1)

        for src, tar in self._APPROX_PAIRS:
            self.assertAlmostEqual(dist_abs(src, tar), 0)

        # Same measure backed by a JaroWinkler metric
        self.assertAlmostEqual(self.cmp_jw.dist_abs('abcd', 'dj'), 1 / 3)
def test_pairwise_similarity_statistics(self):
    """Test abydos.stats.pairwise_similarity_statistics.

    Every successful call must return a 4-tuple (max, min, mean, std) that
    matches the expected values approximately; invalid arguments must
    raise ``ValueError``.
    """

    def check(stats, expected):
        # Unpacking enforces that exactly four values were returned.
        pw_max, pw_min, pw_mean, pw_std = stats
        for got, want in zip((pw_max, pw_min, pw_mean, pw_std), expected):
            self.assertAlmostEqual(got, want)

    check(
        pairwise_similarity_statistics(NIALL, NIALL),
        (1.0, 0.11764705882352944, 0.4188369879201684, 0.2265099631340623),
    )
    check(
        pairwise_similarity_statistics(NIALL, ('Kneal',)),
        (
            0.8333333333333334,
            0.11764705882352944,
            0.30474877450980387,
            0.1842666797571549,
        ),
    )

    # Test symmetric
    check(
        pairwise_similarity_statistics(NIALL, NIALL, symmetric=True),
        (1.0, 0.11764705882352944, 0.4188369879201679, 0.22650996313406255),
    )
    check(
        pairwise_similarity_statistics(NIALL, ('Kneal',), symmetric=True),
        (
            0.8333333333333334,
            0.11764705882352944,
            0.304748774509804,
            0.18426667975715486,
        ),
    )

    # Test with splittable strings
    check(
        pairwise_similarity_statistics(
            'The quick brown fox', 'jumped over the lazy dog.'
        ),
        (0.6666666666666667, 0.0, 0.08499999999999999, 0.16132265804901677),
    )
    check(
        pairwise_similarity_statistics('The', 'jumped'),
        (0.16666666666666663, 0.16666666666666663, 0.16666666666666663, 0.0),
    )

    # Test with a set metric
    check(
        pairwise_similarity_statistics(NIALL, NIALL, metric=Jaccard().sim),
        (1.0, 0.0, 0.23226906681010506, 0.24747101181262784),
    )
    check(
        pairwise_similarity_statistics(
            NIALL, NIALL, metric=JaroWinkler().dist
        ),
        (1.0, 0.0, 0.3352660334967324, 0.18394505847524578),
    )

    # Test using hmean
    check(
        pairwise_similarity_statistics(NIALL, NIALL, mean_func=hmean),
        (1.0, 0.11764705882352944, 0.30718771249150056, 0.25253182790044676),
    )

    # Test exceptions
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(NIALL, NIALL, mean_func='mean')
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(NIALL, NIALL, metric='Levenshtein')
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(5, NIALL)
    with self.assertRaises(ValueError):
        pairwise_similarity_statistics(NIALL, 5)