def __init__(self, model='latin'):
    self.model = model
    self.impH = input_helpers.InputHelper()
    self.ST = syllable_tokenizer.SyllableTokenizer()

    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()

    # String Distance algorithms
    self.algos = [
        IterativeSubString(), BISIM(), DiscountedLevenshtein(), Prefix(),
        LCSstr(), MLIPNS(), Strcmp95(), MRA(), Editex(), SAPS(),
        FlexMetric(), JaroWinkler(mode='Jaro'), HigueraMico(), Sift4(),
        Eudex(), ALINE(), Covington(), PhoneticEditDistance(),
    ]
    self.algo_names = [
        'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
        'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
        'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
        'covington', 'phoneticeditdistance',
    ]

    # extract the model tarball into the models directory if it doesn't already exist
    model_dir = os.path.join(os.path.dirname(__file__), "models", self.model)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        tar = tarfile.open(
            os.path.join(
                os.path.dirname(__file__), "models", self.model + ".tar.gz"
            ),
            "r:gz",
        )
        tar.extractall(model_dir)
        tar.close()

    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

    # Character Embedding Network (Level 0/Base Model)
    self.vocab = preprocess.VocabularyProcessor(
        max_document_length=15, min_frequency=0
    ).restore(os.path.join(model_dir, 'vocab'))
    siamese_model = os.path.join(model_dir, 'siamese')
    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph('{}.meta'.format(siamese_model))
            self.sess.run(tf.global_variables_initializer())
            saver.restore(self.sess, siamese_model)

            # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]
            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob'
            ).outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance'
            ).outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim'
            ).outputs[0]

    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
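
# Sketch (assumption, not from the original class): given the tensors restored
# in __init__ above, a character-embedding score for one name pair would
# typically be computed along these lines. It assumes self.vocab.transform()
# maps strings to fixed-length id sequences (as tf.contrib.learn's
# VocabularyProcessor does); the method name is illustrative only.
def _siamese_score_sketch(self, name_a, name_b):
    x1 = list(self.vocab.transform([name_a]))
    x2 = list(self.vocab.transform([name_b]))
    distance, sim = self.sess.run(
        [self.prediction, self.sim],
        feed_dict={
            self.input_x1: x1,
            self.input_x2: x2,
            self.dropout_keep_prob: 1.0,  # disable dropout at inference time
        },
    )
    # 'output/distance' is the network's contrastive distance;
    # 'accuracy/temp_sim' is the similarity the graph derives from it.
    return float(distance[0]), float(sim[0])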
# Restored from surrounding context: these two objects are referenced below
# but their assignments fall outside this excerpt.
iss = IterativeSubString()
pe = Ainsworth()

bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps,
         flexmetric, jaro, higuera_mico, sift4, eudex, aline, phonetic_edit]
algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
              'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
              'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
              'phoneticeditdistance']


def sum_ipa(name_a, name_b):
    # Average per-segment phonetic feature similarity between the IPA
    # encodings of the two names.
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    # Divide by at least 1 to guard against an empty encoding.
    score = (sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2))
             / max(len(feat1), 1))
    return score
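
# Sketch (assumption, not from the original module): the parallel algos /
# algo_names lists above are typically zipped into one feature row per name
# pair; featurize() here is an illustrative helper, and it assumes each
# abydos distance object exposes a normalized sim() in [0, 1].
def featurize(name_a, name_b):
    features = {}
    for name, algo in zip(algo_names, algos):
        features[name] = algo.sim(name_a, name_b)
    features['sum_ipa'] = sum_ipa(name_a, name_b)
    return features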
def __init__(self, model='latin', prefilter=True, allow_alt_surname=True,
             allow_initials=True, allow_missing_components=True):

    # user-provided parameters
    self.model = model
    self.allow_alt_surname = allow_alt_surname
    self.allow_initials = allow_initials
    self.allow_missing_components = allow_missing_components
    self.prefilter = prefilter
    if self.prefilter:
        self.refined_soundex = {
            'b': 1, 'p': 1,
            'f': 2, 'v': 2,
            'c': 3, 'k': 3, 's': 3,
            'g': 4, 'j': 4,
            'q': 5, 'x': 5, 'z': 5,
            'd': 6, 't': 6,
            'l': 7,
            'm': 8, 'n': 8,
            'r': 9,
        }

    # verify user-supplied class arguments
    model_dir = self.validate_parameters()

    self.impH = input_helpers.InputHelper()
    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # Soundex Lastname Algorithm
    self.pshp_soundex_last = PSHPSoundexLast()

    # String Distance algorithms
    self.algos = [IterativeSubString(), BISIM(), DiscountedLevenshtein(),
                  Prefix(), LCSstr(), MLIPNS(), Strcmp95(), MRA(), Editex(),
                  SAPS(), FlexMetric(), JaroWinkler(mode='Jaro'),
                  HigueraMico(), Sift4(), Eudex(), ALINE(), CovingtonGuard(),
                  PhoneticEditDistance()]
    self.algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein',
                       'prefix', 'lcsstr', 'mlipns', 'strcmp95', 'mra',
                       'editex', 'saps', 'flexmetric', 'jaro', 'higueramico',
                       'sift4', 'eudex', 'aline', 'covington',
                       'phoneticeditdistance']

    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

    # Character Embedding Network (Level 0/Base Model)
    self.vocab = preprocess.VocabularyProcessor(
        max_document_length=15, min_frequency=0
    ).restore(os.path.join(model_dir, 'vocab'))
    siamese_model = os.path.join(model_dir, 'siamese')

    # start tensorflow session
    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf.Session() if tf.__version__[0] == '1' else tf.compat.v1.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            if tf.__version__[0] == '1':
                saver = tf.train.import_meta_graph('{}.meta'.format(siamese_model))
                self.sess.run(tf.global_variables_initializer())
            else:
                saver = tf.compat.v1.train.import_meta_graph('{}.meta'.format(siamese_model))
                self.sess.run(tf.compat.v1.global_variables_initializer())
            saver.restore(self.sess, siamese_model)

            # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]
            self.dropout_keep_prob = graph.get_operation_by_name('dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name('output/distance').outputs[0]
            self.sim = graph.get_operation_by_name('accuracy/temp_sim').outputs[0]

    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
    # user scores (mapping dict from name pair tuple to similarity)
    self.user_scores = {}
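
# Sketch (assumption, not from the original class): one conventional way a
# consonant-code table like self.refined_soundex above is used is to reduce
# each name to a coarse code so obviously dissimilar pairs can be discarded
# before the expensive base and meta models run. _refined_code() is
# illustrative only and is not claimed to be the class's actual prefilter.
def _refined_code(self, name):
    name = name.lower()
    code = name[:1]  # keep the initial letter
    prev = None
    for ch in name:
        digit = self.refined_soundex.get(ch)
        if digit is not None and digit != prev:
            code += str(digit)  # append consonant code, collapsing repeats
        prev = digit
    return code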
"""Evaluation functions for sequence models.""" __author__ = "Kyle Gorman" import logging import numpy # type: ignore from typing import Any, Iterator, List, Tuple from abydos.distance import PhoneticEditDistance Labels = List[Any] Metrics = [ PhoneticEditDistance(vowel_ignorance=True), PhoneticEditDistance(vowel_dominance=True), PhoneticEditDistance(vowel_ignorance=True, no_features=True), PhoneticEditDistance() ] def edit_distance(x: Labels, y: Labels) -> int: # For a more expressive version of the same, see: # # https://gist.github.com/kylebgorman/8034009 idim = len(x) + 1 jdim = len(y) + 1 table = numpy.zeros((idim, jdim), dtype=numpy.uint8) table[1:, 0] = 1 table[0, 1:] = 1
import argparse

from abydos.distance import Levenshtein, PhoneticEditDistance
from evallib import edit_distance
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--left', type=str)
parser.add_argument('--right', type=str)
parser.add_argument('--out', type=str)
args = parser.parse_args()

PED = PhoneticEditDistance()
PED_IG = PhoneticEditDistance(vowel_ignorance=True)
PED_DOM = PhoneticEditDistance(vowel_dominance=True)
PED_NOF_IG = PhoneticEditDistance(vowel_ignorance=True, no_features=True)
PED_NOF = PhoneticEditDistance(no_features=True)
LEV = Levenshtein()

with open(args.left) as left, open(args.right) as right, open(args.out, 'w') as ouf:
    for left_line, right_line in tqdm(zip(left, right)):
        left_grapheme, left_phoneme = left_line.strip().split('\t')
        right_grapheme, right_phoneme = right_line.strip().split('\t')
        lsplit = left_phoneme.split()
        rsplit = right_phoneme.split()
        gorman_lev = edit_distance(lsplit, rsplit) / len(lsplit)
        left_phon_input = ''.join(lsplit)
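        # Sketch (assumption, not from the original script): the loop is
        # truncated here. A plausible continuation joins the right-hand
        # phoneme sequence the same way and writes one tab-separated row of
        # scores per pair; the exact columns below are illustrative.
        right_phon_input = ''.join(rsplit)
        scores = [
            gorman_lev,
            LEV.dist(left_phon_input, right_phon_input),
            PED.dist(left_phon_input, right_phon_input),
            PED_IG.dist(left_phon_input, right_phon_input),
            PED_DOM.dist(left_phon_input, right_phon_input),
            PED_NOF.dist(left_phon_input, right_phon_input),
            PED_NOF_IG.dist(left_phon_input, right_phon_input),
        ]
        print('\t'.join([left_grapheme, right_grapheme]
                        + ['{:.6f}'.format(s) for s in scores]), file=ouf)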
class PhoneticEditDistanceTestCases(unittest.TestCase):
    """Test phonetic edit distance functions.

    abydos.distance.PhoneticEditDistance
    """

    ped = PhoneticEditDistance()

    def test_phonetic_edit_distance_dist(self):
        """Test abydos.distance.PhoneticEditDistance.dist."""
        # Base cases
        self.assertEqual(self.ped.dist('', ''), 0.0)
        self.assertEqual(self.ped.dist('a', ''), 1.0)
        self.assertEqual(self.ped.dist('', 'a'), 1.0)
        self.assertEqual(self.ped.dist('abc', ''), 1.0)
        self.assertEqual(self.ped.dist('', 'abc'), 1.0)
        self.assertEqual(self.ped.dist('abc', 'abc'), 0.0)
        self.assertEqual(self.ped.dist('abcd', 'efgh'), 0.10483870967741934)

        self.assertAlmostEqual(self.ped.dist('Nigel', 'Niall'), 0.1774193548387097)
        self.assertAlmostEqual(self.ped.dist('Niall', 'Nigel'), 0.1774193548387097)
        self.assertAlmostEqual(self.ped.dist('Colin', 'Coiln'), 0.1741935483870968)
        self.assertAlmostEqual(self.ped.dist('Coiln', 'Colin'), 0.1741935483870968)
        self.assertAlmostEqual(
            self.ped.dist('ATCAACGAGT', 'AACGATTAG'), 0.2370967741935484
        )

    def test_phonetic_edit_distance_dist_abs(self):
        """Test abydos.distance.PhoneticEditDistance.dist_abs."""
        # Base cases
        self.assertEqual(self.ped.dist_abs('', ''), 0)
        self.assertEqual(self.ped.dist_abs('a', ''), 1)
        self.assertEqual(self.ped.dist_abs('', 'a'), 1)
        self.assertEqual(self.ped.dist_abs('abc', ''), 3)
        self.assertEqual(self.ped.dist_abs('', 'abc'), 3)
        self.assertEqual(self.ped.dist_abs('abc', 'abc'), 0)
        self.assertEqual(self.ped.dist_abs('abcd', 'efgh'), 0.4193548387096774)

        self.assertAlmostEqual(self.ped.dist_abs('Nigel', 'Niall'), 0.8870967741935485)
        self.assertAlmostEqual(self.ped.dist_abs('Niall', 'Nigel'), 0.8870967741935485)
        self.assertAlmostEqual(self.ped.dist_abs('Colin', 'Coiln'), 0.870967741935484)
        self.assertAlmostEqual(self.ped.dist_abs('Coiln', 'Colin'), 0.870967741935484)
        self.assertAlmostEqual(
            self.ped.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2.370967741935484
        )

        self.assertEqual(
            PhoneticEditDistance(weights={'syllabic': 1.0}).dist_abs('Nigel', 'Niall'),
            0.0,
        )
        self.assertAlmostEqual(
            PhoneticEditDistance(weights=(1, 1, 1)).dist_abs('Nigel', 'Niall'),
            0.33333333333333326,
        )
        self.assertAlmostEqual(
            PhoneticEditDistance(mode='osa').dist_abs('Niel', 'Neil'),
            0.06451612903225801,
        )

    def test_phonetic_edit_distance_alignment(self):
        """Test abydos.distance.PhoneticEditDistance.alignment."""
        # Base cases
        self.assertEqual(self.ped.alignment('', ''), (0.0, '', ''))
        self.assertEqual(self.ped.alignment('a', ''), (1.0, 'a', '-'))
        self.assertEqual(self.ped.alignment('', 'a'), (1.0, '-', 'a'))
        self.assertEqual(self.ped.alignment('abc', ''), (3.0, 'abc', '---'))
        self.assertEqual(self.ped.alignment('', 'abc'), (3.0, '---', 'abc'))
        self.assertEqual(self.ped.alignment('abc', 'abc'), (0.0, 'abc', 'abc'))
        self.assertEqual(
            self.ped.alignment('abcd', 'efgh'),
            (0.4193548387096774, 'abcd', 'efgh'),
        )

        self.assertEqual(
            self.ped.alignment('Nigel', 'Niall'),
            (0.8870967741935485, 'Nigel', 'Niall'),
        )
        self.assertEqual(
            self.ped.alignment('Niall', 'Nigel'),
            (0.8870967741935485, 'Niall', 'Nigel'),
        )
        self.assertEqual(
            self.ped.alignment('Colin', 'Coiln'),
            (0.870967741935484, 'Colin', 'Coiln'),
        )
        self.assertEqual(
            self.ped.alignment('Coiln', 'Colin'),
            (0.870967741935484, 'Coiln', 'Colin'),
        )

        self.assertEqual(
            PhoneticEditDistance(mode='osa').alignment('Niel', 'Neil'),
            (0.06451612903225801, 'Niel', 'Neil'),
        )
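
# Usage sketch (not part of the original test module): how the metric under
# test is called directly; the expected values are the ones exercised by the
# assertions above, and the helper name is illustrative only.
def _phonetic_edit_distance_demo():
    ped = PhoneticEditDistance()
    print(ped.dist('Niall', 'Nigel'))       # ~0.1774 (normalized distance)
    print(ped.dist_abs('Niall', 'Nigel'))   # ~0.8871 (absolute edit cost)
    print(ped.alignment('Niall', 'Nigel'))  # (~0.8871, 'Niall', 'Nigel')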