Example #1
0
    def test_phonetic_edit_distance_alignment(self):
        """Test abydos.distance.PhoneticEditDistance.alignment.

        ``alignment`` returns a ``(cost, src_aligned, tar_aligned)`` tuple.
        Integer-valued base cases are compared exactly.  Fractional costs
        are compared with ``assertAlmostEqual`` (7 decimal places) so the
        test does not depend on the last bits of floating-point rounding,
        while the aligned strings are still compared exactly.
        """

        def check(ped, src, tar, expected):
            # Compare the float cost approximately and the remaining
            # (aligned source, aligned target) pair exactly.
            result = ped.alignment(src, tar)
            self.assertAlmostEqual(result[0], expected[0])
            self.assertEqual(result[1:], expected[1:])

        # Base cases
        self.assertEqual(self.ped.alignment('', ''), (0.0, '', ''))
        self.assertEqual(self.ped.alignment('a', ''), (1.0, 'a', '-'))
        self.assertEqual(self.ped.alignment('', 'a'), (1.0, '-', 'a'))
        self.assertEqual(self.ped.alignment('abc', ''), (3.0, 'abc', '---'))
        self.assertEqual(self.ped.alignment('', 'abc'), (3.0, '---', 'abc'))
        self.assertEqual(self.ped.alignment('abc', 'abc'), (0.0, 'abc', 'abc'))

        # Fractional costs with the default configuration
        for src, tar, expected in (
            ('abcd', 'efgh', (0.4193548387096774, 'abcd', 'efgh')),
            ('Nigel', 'Niall', (0.8870967741935485, 'Nigel', 'Niall')),
            ('Niall', 'Nigel', (0.8870967741935485, 'Niall', 'Nigel')),
            ('Colin', 'Coiln', (0.870967741935484, 'Colin', 'Coiln')),
            ('Coiln', 'Colin', (0.870967741935484, 'Coiln', 'Colin')),
        ):
            check(self.ped, src, tar, expected)

        # 'osa' mode
        check(
            PhoneticEditDistance(mode='osa'),
            'Niel',
            'Neil',
            (0.06451612903225801, 'Niel', 'Neil'),
        )
Example #2
0
    def test_phonetic_edit_distance_dist_abs(self):
        """Test abydos.distance.PhoneticEditDistance.dist_abs.

        Integer-valued expectations are compared exactly; every fractional
        expectation is compared with ``assertAlmostEqual`` (7 decimal
        places) so the test is not sensitive to floating-point rounding in
        the last bits.
        """
        # Base cases
        self.assertEqual(self.ped.dist_abs('', ''), 0)
        self.assertEqual(self.ped.dist_abs('a', ''), 1)
        self.assertEqual(self.ped.dist_abs('', 'a'), 1)
        self.assertEqual(self.ped.dist_abs('abc', ''), 3)
        self.assertEqual(self.ped.dist_abs('', 'abc'), 3)
        self.assertEqual(self.ped.dist_abs('abc', 'abc'), 0)

        # Fractional distances with the default configuration
        for src, tar, expected in (
            ('abcd', 'efgh', 0.4193548387096774),
            ('Nigel', 'Niall', 0.8870967741935485),
            ('Niall', 'Nigel', 0.8870967741935485),
            ('Colin', 'Coiln', 0.870967741935484),
            ('Coiln', 'Colin', 0.870967741935484),
            ('ATCAACGAGT', 'AACGATTAG', 2.370967741935484),
        ):
            self.assertAlmostEqual(self.ped.dist_abs(src, tar), expected)

        # Custom feature weights, given as a dict and as a tuple
        self.assertEqual(
            PhoneticEditDistance(weights={
                'syllabic': 1.0
            }).dist_abs('Nigel', 'Niall'),
            0.0,
        )
        self.assertAlmostEqual(
            PhoneticEditDistance(weights=(1, 1, 1)).dist_abs('Nigel', 'Niall'),
            0.33333333333333326,
        )

        # 'osa' mode
        self.assertAlmostEqual(
            PhoneticEditDistance(mode='osa').dist_abs('Niel', 'Neil'),
            0.06451612903225801,
        )
Example #3
0
    def __init__(self, model='latin'):
        """Load the model files for ``model`` and set up the matching pipeline.

        Args:
            model: Name of the model bundle.  Its files are read from
                ``models/<model>`` next to this source file; when that
                directory does not exist it is created by extracting
                ``models/<model>.tar.gz``.

        Raises:
            OSError: If the model directory is missing and the archive
                cannot be opened (raised by ``tarfile.open``).
        """
        self.model = model
        self.impH = input_helpers.InputHelper()
        self.ST = syllable_tokenizer.SyllableTokenizer()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # String Distance algorithms; algo_names[i] is the label for algos[i]
        self.algos = [
            IterativeSubString(),
            BISIM(),
            DiscountedLevenshtein(),
            Prefix(),
            LCSstr(),
            MLIPNS(),
            Strcmp95(),
            MRA(),
            Editex(),
            SAPS(),
            FlexMetric(),
            JaroWinkler(mode='Jaro'),
            HigueraMico(),
            Sift4(),
            Eudex(),
            ALINE(),
            Covington(),
            PhoneticEditDistance()
        ]
        self.algo_names = [
            'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
            'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
            'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
            'covington', 'phoneticeditdistance'
        ]

        # Extract the model tarball into its directory if it doesn't exist
        models_root = os.path.join(os.path.dirname(__file__), "models")
        model_dir = os.path.join(models_root, self.model)
        if not os.path.exists(model_dir):
            archive_path = os.path.join(models_root, self.model + ".tar.gz")
            # Open the archive *before* creating the directory: if the
            # archive is missing, no empty model_dir is left behind that
            # would make the next run skip extraction.  The context manager
            # closes the archive even when extraction raises.
            with tarfile.open(archive_path, "r:gz") as tar:
                os.makedirs(model_dir)
                tar.extractall(model_dir)

        # String Distance Pipeline (Level 0/Base Model)
        # NOTE: joblib.load unpickles the file, so the model files must come
        # from a trusted source.
        self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

        # Character Embedding Network (Level 0/Base Model)
        self.vocab = preprocess.VocabularyProcessor(
            max_document_length=15,
            min_frequency=0).restore(os.path.join(model_dir, 'vocab'))

        siamese_model = os.path.join(model_dir, 'siamese')

        graph = tf.Graph()
        with graph.as_default() as graph:
            self.sess = tf.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    '{}.meta'.format(siamese_model))
                self.sess.run(tf.global_variables_initializer())
                saver.restore(self.sess, siamese_model)

            # Get the placeholders and output tensors from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]

            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]

        # Logreg (Level 1/Meta Model)
        self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}

        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
Example #4
0
# One shared instance per string-distance algorithm.
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

# Algorithm instances, in the same order as ``algo_names`` below.
# ``iss`` is not assigned in this span; it must already be defined above.
algos = [
    iss,
    bisim,
    dlev,
    prefix,
    lcs,
    mlipns,
    strcmp95,
    mra,
    editex,
    saps,
    flexmetric,
    jaro,
    higuera_mico,
    sift4,
    eudex,
    aline,
    phonetic_edit,
]

# Label for each entry of ``algos`` (same length, same order).
algo_names = [
    'iterativesubstring',
    'bisim',
    'discountedlevenshtein',
    'prefix',
    'lcsstr',
    'mlipns',
    'strcmp95',
    'mra',
    'editex',
    'saps',
    'flexmetric',
    'jaro',
    'higueramico',
    'sift4',
    'eudex',
    'aline',
    'phoneticeditdistance',
]

def sum_ipa(name_a, name_b):
    """Return the summed feature comparison of two names, length-normalized.

    Both names are encoded with the module-level encoder ``pe`` and turned
    into feature sequences with ``ipa_to_features``.  The sequences are
    compared position by position with ``cmp_features`` (``zip`` stops at
    the shorter one) and the total is divided by the length of the first
    name's feature sequence.

    Args:
        name_a: First name; its feature count is the normalizer.
        name_b: Second name.

    Returns:
        The normalized total of the pairwise feature comparisons.
    """
    features_a = ipa_to_features(pe.encode(name_a))
    features_b = ipa_to_features(pe.encode(name_b))
    total = sum(
        cmp_features(left, right)
        for left, right in zip(features_a, features_b)
    )
    # Sequences of length 0 or 1 are divided by 1, which also avoids a
    # division by zero for an empty encoding.
    return total / max(len(features_a), 1)
Example #5
0
    def __init__(self, model='latin', prefilter=True, allow_alt_surname=True, allow_initials=True,
                 allow_missing_components=True):
        """Store the user options and load every model used for matching.

        Args:
            model: Model name; stored as ``self.model``.
            prefilter: When truthy, a letter-to-code ``refined_soundex``
                table is also created on the instance.
            allow_alt_surname: Stored as ``self.allow_alt_surname``.
            allow_initials: Stored as ``self.allow_initials``.
            allow_missing_components: Stored as
                ``self.allow_missing_components``.

        The model directory is whatever ``self.validate_parameters()``
        returns; the base model, vocabulary, siamese network and meta model
        are all loaded from it.
        """
        # user-provided parameters
        self.model = model
        self.allow_alt_surname = allow_alt_surname
        self.allow_initials = allow_initials
        self.allow_missing_components = allow_missing_components
        self.prefilter = prefilter
        if self.prefilter:
            # Letters sharing a code are grouped together; the group at
            # position i (counting from 1) maps each of its letters to i.
            letter_groups = ('bp', 'fv', 'cks', 'gj', 'qxz', 'dt', 'l', 'mn', 'r')
            self.refined_soundex = {
                letter: code
                for code, letters in enumerate(letter_groups, 1)
                for letter in letters
            }

        # verify user-supplied class arguments
        model_dir = self.validate_parameters()

        self.impH = input_helpers.InputHelper()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # Soundex Lastname Algorithm
        self.pshp_soundex_last = PSHPSoundexLast()

        # String Distance algorithms; algo_names[i] labels algos[i]
        self.algos = [
            IterativeSubString(),
            BISIM(),
            DiscountedLevenshtein(),
            Prefix(),
            LCSstr(),
            MLIPNS(),
            Strcmp95(),
            MRA(),
            Editex(),
            SAPS(),
            FlexMetric(),
            JaroWinkler(mode='Jaro'),
            HigueraMico(),
            Sift4(),
            Eudex(),
            ALINE(),
            CovingtonGuard(),
            PhoneticEditDistance(),
        ]
        self.algo_names = [
            'iterativesubstring',
            'bisim',
            'discountedlevenshtein',
            'prefix',
            'lcsstr',
            'mlipns',
            'strcmp95',
            'mra',
            'editex',
            'saps',
            'flexmetric',
            'jaro',
            'higueramico',
            'sift4',
            'eudex',
            'aline',
            'covington',
            'phoneticeditdistance',
        ]

        # String Distance Pipeline (Level 0/Base Model)
        base_path = os.path.join(model_dir, 'base.pkl')
        self.baseModel = joblib.load(base_path)

        # Character Embedding Network (Level 0/Base Model)
        vocab_path = os.path.join(model_dir, 'vocab')
        processor = preprocess.VocabularyProcessor(max_document_length=15, min_frequency=0)
        self.vocab = processor.restore(vocab_path)

        siamese_model = os.path.join(model_dir, 'siamese')

        # start tensorflow session
        graph = tf.Graph()
        with graph.as_default() as graph:
            # Under TensorFlow 1.x use the top-level API; otherwise use the
            # tf.compat.v1 namespace for the same calls.
            tf_api = tf if tf.__version__[0] == '1' else tf.compat.v1
            self.sess = tf_api.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf_api.train.import_meta_graph('{}.meta'.format(siamese_model))
                self.sess.run(tf_api.global_variables_initializer())
                saver.restore(self.sess, siamese_model)

            # Look up the input placeholders and output tensors by name

            def first_output(op_name):
                return graph.get_operation_by_name(op_name).outputs[0]

            self.input_x1 = first_output('input_x1')
            self.input_x2 = first_output('input_x2')

            self.dropout_keep_prob = first_output('dropout_keep_prob')
            self.prediction = first_output('output/distance')
            self.sim = first_output('accuracy/temp_sim')

        # Logreg (Level 1/Meta Model)
        meta_path = os.path.join(model_dir, 'meta.pkl')
        self.metaModel = joblib.load(meta_path)

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}
        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
        # user scores (mapping dict from name pair tuple to similarity)
        self.user_scores = {}
Example #6
0
"""Evaluation functions for sequence models."""

__author__ = "Kyle Gorman"

import logging

import numpy  # type: ignore

from typing import Any, Iterator, List, Tuple

from abydos.distance import PhoneticEditDistance

# A label sequence: a list of arbitrary items.
Labels = List[Any]

# The PhoneticEditDistance configurations used as metrics, one instance per
# set of keyword options; the last (empty) entry is the default setup.
Metrics = [
    PhoneticEditDistance(**options)
    for options in (
        {"vowel_ignorance": True},
        {"vowel_dominance": True},
        {"vowel_ignorance": True, "no_features": True},
        {},
    )
]


def edit_distance(x: Labels, y: Labels) -> int:
    """Set up the dynamic-programming table for comparing ``x`` with ``y``.

    Only the table initialisation is visible here; the fill loop and the
    return statement are not part of this span.

    Args:
        x: First label sequence.
        y: Second label sequence.
    """
    # For a more expressive version of the same, see:
    #
    #     https://gist.github.com/kylebgorman/8034009
    # One extra row and column beyond the lengths of the two sequences.
    idim = len(x) + 1
    jdim = len(y) + 1
    # NOTE(review): uint8 cells cannot hold values above 255 -- confirm that
    # the sequences compared are short enough for this to be safe.
    table = numpy.zeros((idim, jdim), dtype=numpy.uint8)
    # First column and first row (except the [0, 0] cell) are set to the
    # constant 1.
    # NOTE(review): a textbook edit-distance table uses 1, 2, 3, ... along
    # these borders -- confirm the constant is intended.
    table[1:, 0] = 1
    table[0, 1:] = 1
Example #7
0
import argparse

from abydos.distance import Levenshtein, PhoneticEditDistance
from evallib import edit_distance
from tqdm import tqdm

# Command-line interface: two input file paths and one output file path.
parser = argparse.ArgumentParser()
parser.add_argument('--left', type=str)
parser.add_argument('--right', type=str)
parser.add_argument('--out', type=str)
args = parser.parse_args()

# Distance metrics built once up front: PhoneticEditDistance in several
# keyword configurations, plus plain Levenshtein.
PED = PhoneticEditDistance()
PED_IG = PhoneticEditDistance(vowel_ignorance=True)
PED_DOM = PhoneticEditDistance(vowel_dominance=True)
PED_NOF_IG = PhoneticEditDistance(vowel_ignorance=True, no_features=True)
PED_NOF = PhoneticEditDistance(no_features=True)
LEV = Levenshtein()

# Read the two inputs in lockstep, one line from each per iteration; zip
# stops at the end of the shorter file.
with open(args.left) as left, open(args.right) as right, open(args.out,
                                                              'w') as ouf:
    for left_line, right_line in tqdm(zip(left, right)):
        # Each line must hold exactly two tab-separated fields (grapheme,
        # phoneme); any other field count raises ValueError on unpacking.
        left_grapheme, left_phoneme = left_line.strip().split('\t')
        right_grapheme, right_phoneme = right_line.strip().split('\t')

        # The phoneme field is a whitespace-separated token sequence.
        lsplit = left_phoneme.split()
        rsplit = right_phoneme.split()

        # Token-level edit distance, normalized by the left token count.
        gorman_lev = edit_distance(lsplit, rsplit) / len(lsplit)

        # Left phoneme tokens concatenated with no separator.
        left_phon_input = ''.join(lsplit)
Example #8
0
class PhoneticEditDistanceTestCases(unittest.TestCase):
    """Test phonetic edit distance functions.

    abydos.distance.PhoneticEditDistance

    Integer-valued expectations are compared exactly.  Fractional
    expectations are compared with ``assertAlmostEqual`` (7 decimal places)
    so the tests do not depend on the last bits of floating-point rounding.
    """

    # Default-configured instance shared by all tests.
    ped = PhoneticEditDistance()

    def _assert_alignment(self, ped, src, tar, expected):
        """Check ``ped.alignment(src, tar)`` against ``expected``.

        ``expected`` is a ``(cost, src_aligned, tar_aligned)`` tuple.  The
        float cost is compared approximately; the aligned strings are
        compared exactly.
        """
        result = ped.alignment(src, tar)
        self.assertAlmostEqual(result[0], expected[0])
        self.assertEqual(result[1:], expected[1:])

    def test_phonetic_edit_distance_dist(self):
        """Test abydos.distance.PhoneticEditDistance.dist."""
        # Base cases
        self.assertEqual(self.ped.dist('', ''), 0.0)
        self.assertEqual(self.ped.dist('a', ''), 1.0)
        self.assertEqual(self.ped.dist('', 'a'), 1.0)
        self.assertEqual(self.ped.dist('abc', ''), 1.0)
        self.assertEqual(self.ped.dist('', 'abc'), 1.0)
        self.assertEqual(self.ped.dist('abc', 'abc'), 0.0)

        # Fractional distances
        for src, tar, expected in (
            ('abcd', 'efgh', 0.10483870967741934),
            ('Nigel', 'Niall', 0.1774193548387097),
            ('Niall', 'Nigel', 0.1774193548387097),
            ('Colin', 'Coiln', 0.1741935483870968),
            ('Coiln', 'Colin', 0.1741935483870968),
            ('ATCAACGAGT', 'AACGATTAG', 0.2370967741935484),
        ):
            self.assertAlmostEqual(self.ped.dist(src, tar), expected)

    def test_phonetic_edit_distance_dist_abs(self):
        """Test abydos.distance.PhoneticEditDistance.dist_abs."""
        # Base cases
        self.assertEqual(self.ped.dist_abs('', ''), 0)
        self.assertEqual(self.ped.dist_abs('a', ''), 1)
        self.assertEqual(self.ped.dist_abs('', 'a'), 1)
        self.assertEqual(self.ped.dist_abs('abc', ''), 3)
        self.assertEqual(self.ped.dist_abs('', 'abc'), 3)
        self.assertEqual(self.ped.dist_abs('abc', 'abc'), 0)

        # Fractional distances with the default configuration
        for src, tar, expected in (
            ('abcd', 'efgh', 0.4193548387096774),
            ('Nigel', 'Niall', 0.8870967741935485),
            ('Niall', 'Nigel', 0.8870967741935485),
            ('Colin', 'Coiln', 0.870967741935484),
            ('Coiln', 'Colin', 0.870967741935484),
            ('ATCAACGAGT', 'AACGATTAG', 2.370967741935484),
        ):
            self.assertAlmostEqual(self.ped.dist_abs(src, tar), expected)

        # Custom feature weights, given as a dict and as a tuple
        self.assertEqual(
            PhoneticEditDistance(weights={
                'syllabic': 1.0
            }).dist_abs('Nigel', 'Niall'),
            0.0,
        )
        self.assertAlmostEqual(
            PhoneticEditDistance(weights=(1, 1, 1)).dist_abs('Nigel', 'Niall'),
            0.33333333333333326,
        )

        # 'osa' mode
        self.assertAlmostEqual(
            PhoneticEditDistance(mode='osa').dist_abs('Niel', 'Neil'),
            0.06451612903225801,
        )

    def test_phonetic_edit_distance_alignment(self):
        """Test abydos.distance.PhoneticEditDistance.alignment."""
        # Base cases
        self.assertEqual(self.ped.alignment('', ''), (0.0, '', ''))
        self.assertEqual(self.ped.alignment('a', ''), (1.0, 'a', '-'))
        self.assertEqual(self.ped.alignment('', 'a'), (1.0, '-', 'a'))
        self.assertEqual(self.ped.alignment('abc', ''), (3.0, 'abc', '---'))
        self.assertEqual(self.ped.alignment('', 'abc'), (3.0, '---', 'abc'))
        self.assertEqual(self.ped.alignment('abc', 'abc'), (0.0, 'abc', 'abc'))

        # Fractional costs with the default configuration
        for src, tar, expected in (
            ('abcd', 'efgh', (0.4193548387096774, 'abcd', 'efgh')),
            ('Nigel', 'Niall', (0.8870967741935485, 'Nigel', 'Niall')),
            ('Niall', 'Nigel', (0.8870967741935485, 'Niall', 'Nigel')),
            ('Colin', 'Coiln', (0.870967741935484, 'Colin', 'Coiln')),
            ('Coiln', 'Colin', (0.870967741935484, 'Coiln', 'Colin')),
        ):
            self._assert_alignment(self.ped, src, tar, expected)

        # 'osa' mode
        self._assert_alignment(
            PhoneticEditDistance(mode='osa'),
            'Niel',
            'Neil',
            (0.06451612903225801, 'Niel', 'Neil'),
        )