Example #1
class MraTestCases(unittest.TestCase):
    """Test MRA functions.

    abydos.distance.MRA
    """

    cmp = MRA()

    def test_mra_dist_abs(self):
        """Test abydos.distance.MRA.dist_abs."""
        self.assertEqual(self.cmp.dist_abs('', ''), 6)
        self.assertEqual(self.cmp.dist_abs('a', 'a'), 6)
        self.assertEqual(self.cmp.dist_abs('abcdefg', 'abcdefg'), 6)
        self.assertEqual(self.cmp.dist_abs('abcdefg', ''), 0)
        self.assertEqual(self.cmp.dist_abs('', 'abcdefg'), 0)

        # https://en.wikipedia.org/wiki/Match_rating_approach
        self.assertEqual(self.cmp.dist_abs('Byrne', 'Boern'), 5)
        self.assertEqual(self.cmp.dist_abs('Smith', 'Smyth'), 5)
        self.assertEqual(self.cmp.dist_abs('Catherine', 'Kathryn'), 4)

        self.assertEqual(self.cmp.dist_abs('ab', 'abcdefgh'), 0)
        self.assertEqual(self.cmp.dist_abs('ab', 'ac'), 5)
        self.assertEqual(self.cmp.dist_abs('abcdefik', 'abcdefgh'), 3)
        self.assertEqual(self.cmp.dist_abs('xyz', 'abc'), 0)

    def test_mra_sim(self):
        """Test abydos.distance.MRA.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('a', 'a'), 1)
        self.assertEqual(self.cmp.sim('abcdefg', 'abcdefg'), 1)
        self.assertEqual(self.cmp.sim('abcdefg', ''), 0)
        self.assertEqual(self.cmp.sim('', 'abcdefg'), 0)

        # https://en.wikipedia.org/wiki/Match_rating_approach
        self.assertEqual(self.cmp.sim('Byrne', 'Boern'), 5 / 6)
        self.assertEqual(self.cmp.sim('Smith', 'Smyth'), 5 / 6)
        self.assertEqual(self.cmp.sim('Catherine', 'Kathryn'), 4 / 6)

        self.assertEqual(self.cmp.sim('ab', 'abcdefgh'), 0)
        self.assertEqual(self.cmp.sim('ab', 'ac'), 5 / 6)
        self.assertEqual(self.cmp.sim('abcdefik', 'abcdefgh'), 3 / 6)
        self.assertEqual(self.cmp.sim('xyz', 'abc'), 0)

    def test_mra_dist(self):
        """Test abydos.distance.MRA.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('a', 'a'), 0)
        self.assertEqual(self.cmp.dist('abcdefg', 'abcdefg'), 0)
        self.assertEqual(self.cmp.dist('abcdefg', ''), 1)
        self.assertEqual(self.cmp.dist('', 'abcdefg'), 1)

        # https://en.wikipedia.org/wiki/Match_rating_approach
        self.assertAlmostEqual(self.cmp.dist('Byrne', 'Boern'), 1 / 6)
        self.assertAlmostEqual(self.cmp.dist('Smith', 'Smyth'), 1 / 6)
        self.assertAlmostEqual(self.cmp.dist('Catherine', 'Kathryn'), 2 / 6)

        self.assertEqual(self.cmp.dist('ab', 'abcdefgh'), 1)
        self.assertAlmostEqual(self.cmp.dist('ab', 'ac'), 1 / 6)
        self.assertAlmostEqual(self.cmp.dist('abcdefik', 'abcdefgh'), 3 / 6)
        self.assertEqual(self.cmp.dist('xyz', 'abc'), 1)
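
For quick reference, a minimal usage sketch of the MRA comparator exercised by the tests above (values taken from the Wikipedia examples used in the test cases; assumes abydos is installed):

from abydos.distance import MRA

cmp = MRA()
# dist_abs returns the match rating on a 0-6 scale (6 means a full match)
cmp.dist_abs('Byrne', 'Boern')   # 5
# sim normalizes the rating to [0, 1] as dist_abs / 6
cmp.sim('Byrne', 'Boern')        # 0.8333...
# dist is the complement, 1 - sim
cmp.dist('Byrne', 'Boern')       # 0.1666...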
Example #2
from abydos.phonetic import PSHPSoundexFirst, Ainsworth
from abydos.phones import *
from abydos.distance import (IterativeSubString, BISIM, DiscountedLevenshtein, Prefix, LCSstr, MLIPNS,
                             Strcmp95, MRA, Editex, SAPS, FlexMetric, JaroWinkler, HigueraMico, Sift4,
                             Eudex, ALINE, PhoneticEditDistance)
import re
from sklearn.preprocessing import MinMaxScaler

# Featurizer
pshp_soundex_first = PSHPSoundexFirst()
pe = Ainsworth()
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()
algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps,
         flexmetric, jaro, higuera_mico, sift4, eudex, aline, phonetic_edit]

algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
              'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
              'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
              'phoneticeditdistance']
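
A minimal sketch of how this comparator list might be turned into a feature vector for a pair of names (the featurize helper below is a hypothetical illustration, not part of the original snippet):

def featurize(name1, name2):
    # Hypothetical helper: one similarity score per string-distance algorithm,
    # keyed by the short names defined above.
    return {name: algo.sim(name1, name2) for algo, name in zip(algos, algo_names)}

featurize('Catherine', 'Kathryn')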
Example #3
File: matcher.py Project: olacharles/hmni
    def __init__(self, model='latin'):
        self.model = model
        self.impH = input_helpers.InputHelper()
        self.ST = syllable_tokenizer.SyllableTokenizer()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # String Distance algorithms
        self.algos = [
            IterativeSubString(),
            BISIM(),
            DiscountedLevenshtein(),
            Prefix(),
            LCSstr(),
            MLIPNS(),
            Strcmp95(),
            MRA(),
            Editex(),
            SAPS(),
            FlexMetric(),
            JaroWinkler(mode='Jaro'),
            HigueraMico(),
            Sift4(),
            Eudex(),
            ALINE(),
            Covington(),
            PhoneticEditDistance()
        ]
        self.algo_names = [
            'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
            'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
            'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
            'covington', 'phoneticeditdistance'
        ]

        # extract model tarball into directory if it doesn't exist
        model_dir = os.path.join(os.path.dirname(__file__), "models",
                                 self.model)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
            tar = tarfile.open(
                os.path.join(os.path.dirname(__file__), "models",
                             self.model + ".tar.gz"), "r:gz")
            tar.extractall(model_dir)
            tar.close()

        # String Distance Pipeline (Level 0/Base Model)
        self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

        # Character Embedding Network (Level 0/Base Model)
        self.vocab = preprocess.VocabularyProcessor(
            max_document_length=15,
            min_frequency=0).restore(os.path.join(model_dir, 'vocab'))

        siamese_model = os.path.join(model_dir, 'siamese')

        graph = tf.Graph()
        with graph.as_default() as graph:
            self.sess = tf.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    '{}.meta'.format(siamese_model))
                self.sess.run(tf.global_variables_initializer())
                saver.restore(self.sess, siamese_model)
                # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]

            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]

        # Logreg (Level 1/Meta Model)
        self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}

        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
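
A minimal sketch of how the restored placeholders might be fed at inference time (hypothetical helper; the project's actual prediction path is not shown in this snippet):

    def _siamese_distance(self, x1, x2):
        # Hypothetical: x1 and x2 are vocab-transformed index arrays for the
        # two names; dropout is disabled (kept at 1.0) at inference time.
        return self.sess.run(self.prediction,
                             feed_dict={self.input_x1: x1,
                                        self.input_x2: x2,
                                        self.dropout_keep_prob: 1.0})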
Example #4
    def __init__(self, model='latin', prefilter=True, allow_alt_surname=True, allow_initials=True,
                 allow_missing_components=True):

        # user-provided parameters
        self.model = model
        self.allow_alt_surname = allow_alt_surname
        self.allow_initials = allow_initials
        self.allow_missing_components = allow_missing_components
        self.prefilter = prefilter
        if self.prefilter:
            self.refined_soundex = {
                'b': 1, 'p': 1,
                'f': 2, 'v': 2,
                'c': 3, 'k': 3, 's': 3,
                'g': 4, 'j': 4,
                'q': 5, 'x': 5, 'z': 5,
                'd': 6, 't': 6,
                'l': 7,
                'm': 8, 'n': 8,
                'r': 9
            }

        # verify user-supplied class arguments
        model_dir = self.validate_parameters()

        self.impH = input_helpers.InputHelper()
        # Phonetic Encoder
        self.pe = Ainsworth()
        # Soundex Firstname Algorithm
        self.pshp_soundex_first = PSHPSoundexFirst()
        # Soundex Lastname Algorithm
        self.pshp_soundex_last = PSHPSoundexLast()

        # String Distance algorithms
        self.algos = [IterativeSubString(), BISIM(), DiscountedLevenshtein(), Prefix(), LCSstr(), MLIPNS(),
                      Strcmp95(), MRA(), Editex(), SAPS(), FlexMetric(), JaroWinkler(mode='Jaro'), HigueraMico(),
                      Sift4(), Eudex(), ALINE(), CovingtonGuard(), PhoneticEditDistance()]
        self.algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix', 'lcsstr', 'mlipns',
                           'strcmp95', 'mra', 'editex', 'saps', 'flexmetric', 'jaro', 'higueramico',
                           'sift4', 'eudex', 'aline', 'covington', 'phoneticeditdistance']

        # String Distance Pipeline (Level 0/Base Model)
        self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

        # Character Embedding Network (Level 0/Base Model)
        self.vocab = preprocess.VocabularyProcessor(max_document_length=15, min_frequency=0).restore(
            os.path.join(model_dir, 'vocab'))

        siamese_model = os.path.join(model_dir, 'siamese')

        # start tensorflow session
        graph = tf.Graph()
        with graph.as_default() as graph:
            self.sess = tf.Session() if tf.__version__[0] == '1' else tf.compat.v1.Session()
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                if tf.__version__[0] == '1':
                    saver = tf.train.import_meta_graph('{}.meta'.format(siamese_model))
                    self.sess.run(tf.global_variables_initializer())
                else:
                    saver = tf.compat.v1.train.import_meta_graph('{}.meta'.format(siamese_model))
                    self.sess.run(tf.compat.v1.global_variables_initializer())
                saver.restore(self.sess, siamese_model)
                # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]

            self.dropout_keep_prob = graph.get_operation_by_name('dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name('output/distance').outputs[0]
            self.sim = graph.get_operation_by_name('accuracy/temp_sim').outputs[0]

        # Logreg (Level 1/Meta Model)
        self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

        # seen names (mapping dict from raw name to processed name)
        self.seen_names = {}
        # seen pairs (mapping dict from name pair tuple to similarity)
        self.seen_pairs = {}
        # user scores (mapping dict from name pair tuple to similarity)
        self.user_scores = {}
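
One plausible way the refined_soundex map defined above could serve as a cheap prefilter, comparing the consonant class of the first coded letter of each name (hypothetical illustration; the project's actual prefilter logic is not shown in this snippet):

    def _same_first_class(self, name1, name2):
        # Hypothetical: return True when the first mapped consonants of both
        # names fall into the same refined-Soundex class.
        def first_class(name):
            for ch in name.lower():
                if ch in self.refined_soundex:
                    return self.refined_soundex[ch]
            return None
        return first_class(name1) == first_class(name2)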