Python unicode_normalize Examples, rltk.utils.unicode_normalize Python Examples

Example #1

0

Show file

def needleman_wunsch_score(s1,
                           s2,
                           match=2,
                           mismatch=-1,
                           gap=-0.5,
                           score_table=None):
    """
    Needleman-Wunsch global alignment score of two strings, computed with
    dynamic programming over all possible alignments.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        match (int, optional): Forwarded to ``_get_score``; presumably the score of two equal characters. Defaults to 2.
        mismatch (int, optional): Forwarded to ``_get_score``; presumably the score of two different characters. Defaults to -1.
        gap (float, optional): Score added for every character aligned against a gap. Defaults to -0.5.
        score_table (dict, optional): Forwarded unchanged to ``_get_score``
            (presumably per-character-pair overrides -- confirm against that
            helper). Defaults to None, which is treated as an empty dict.

    Returns:
        int or float: Highest score over all alignments; 0 when both strings are empty.
    """
    # A literal ``{}`` default would be one dict object shared by every call.
    if score_table is None:
        score_table = {}

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # dp[i][j] holds the best score of aligning s1[:i] with s2[:j].
    dp = [[0] * (n2 + 1) for _ in xrange(n1 + 1)]

    # Borders: a prefix aligned against the empty string is all gaps.
    for j in xrange(1, n2 + 1):
        dp[0][j] = gap + dp[0][j - 1]
    for i in xrange(1, n1 + 1):
        dp[i][0] = gap + dp[i - 1][0]

    for i in xrange(1, n1 + 1):
        for j in xrange(1, n2 + 1):
            dp[i][j] = max(
                dp[i][j - 1] + gap,
                dp[i - 1][j] + gap,
                dp[i - 1][j - 1] + _get_score(
                    s1[i - 1], s2[j - 1], match, mismatch, score_table))

    return dp[n1][n2]

Example #2

0

Show file

def damerau_levenshtein_distance(s1, s2):
    """
    Damerau-Levenshtein distance: the minimum number of single-character
    insertions, deletions and substitutions, plus transpositions of two
    adjacent characters, needed to turn one string into the other.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Damerau Levenshtein Distance.

    Examples:
        >>> rltk.damerau_levenshtein_distance('abcd', 'acbd')
        1
        >>> rltk.damerau_levenshtein_distance('abbd', 'acad')
        2
    """

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1, s2 = utils.unicode_normalize(s1), utils.unicode_normalize(s2)

    rows, cols = len(s1), len(s2)
    sentinel = rows + cols  # plays the role of "infinity" in the outer border

    # Last (1-based) row of s1 in which each character was seen; 0 if never.
    last_row_of = defaultdict(int)

    # The table is shifted by one in both directions: index 0 is the sentinel
    # border, index 1 is the "empty prefix" row/column.
    table = [[0] * (cols + 2) for _ in xrange(rows + 2)]
    table[0][0] = sentinel
    for r in xrange(rows + 1):
        table[r + 1][0] = sentinel
        table[r + 1][1] = r
    for c in xrange(cols + 1):
        table[0][c + 1] = sentinel
        table[1][c + 1] = c

    for r, ch1 in enumerate(s1, 1):
        last_match_col = 0  # last column in this row where the characters matched
        for c, ch2 in enumerate(s2, 1):
            prev_row = last_row_of[ch2]
            prev_col = last_match_col
            if ch1 == ch2:
                step = 0
                last_match_col = c
            else:
                step = 1

            substitution = table[r][c] + step
            insertion = table[r + 1][c] + 1
            deletion = table[r][c + 1] + 1
            transposition = (table[prev_row][prev_col]
                             + (r - prev_row - 1) + 1 + (c - prev_col - 1))
            table[r + 1][c + 1] = min(substitution, insertion, deletion,
                                      transposition)
        last_row_of[ch1] = r

    return table[rows + 1][cols + 1]

Example #3

0

Show file

File: soundex.py Project: vishalbelsare/rltk

def soundex(s):
    """
    The standard used for this implementation is provided by `U.S. Census Bureau <https://www.archives.gov/research/census/soundex.html>`_.

    The code is the first character of the (upper-cased) input followed by
    up to three digits, right-padded with '0' to a length of four.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Raises:
        ValueError: If ``s`` is empty after normalization.

    Examples:
        >>> rltk.soundex('ashcraft')
        'A261'
        >>> rltk.soundex('pineapple')
        'P514'
        >>> rltk.soundex('schmidt')
        'S530'
    """

    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()

    CODES = (
        ('BFPV', '1'),
        ('CGJKQSXZ', '2'),
        ('DT', '3'),
        ('L', '4'),
        ('MN', '5'),
        ('R', '6'),
        ('AEIOUHWY', '.')  # placeholder
    )
    CODE_DICT = dict((c, replace) for chars, replace in CODES for c in chars)

    sdx = s[0]
    for i in range(1, len(s)):
        if s[i] not in CODE_DICT:
            continue

        code = CODE_DICT[s[i]]
        if code == '.':
            continue
        if s[i] == s[i - 1]:  # ignore same letter
            continue
        if CODE_DICT.get(s[i - 1]) == code:  # 'side-by-side' rule
            continue
        # Consonant separators: H or W between two consonants that share the
        # SAME code hides the second one. Consonants with different codes on
        # either side of H/W are both coded (e.g. 'SCHMIDT' -> 'S530'). Like
        # the side-by-side rule, this also applies to the first letter, hence
        # ``i - 2 >= 0`` (the bound only prevents wrapping to s[-1]).
        if s[i - 1] in ('H', 'W') and i - 2 >= 0 and \
                CODE_DICT.get(s[i - 2]) == code:
            continue

        sdx += code

    sdx = sdx[0:4].ljust(4, '0')

    return sdx

Example #4

0

Show file

def hamming_distance(s1, s2):
    """
    Count the positions at which two equally long sequences differ.

    String arguments are unicode-normalized before comparison. No string
    type check is enforced, so other sequence types are accepted as long
    as both arguments have exactly the same type.

    Args:
        s1: Sequence 1.
        s2: Sequence 2.

    Returns:
        int: Number of positions whose elements are not equal.

    Raises:
        TypeError: If the two arguments are not of the same type.
        ValueError: If the two arguments have different lengths.
    """

    utils.check_for_none(s1, s2)

    if type(s1) != type(s2):
        raise TypeError('Different type')

    if all(isinstance(seq, basestring) for seq in (s1, s2)):
        s1, s2 = utils.unicode_normalize(s1), utils.unicode_normalize(s2)

    if len(s1) != len(s2):
        raise ValueError('Unequal length')

    mismatches = 0
    for left, right in zip(s1, s2):
        mismatches = mismatches + (left != right)
    return mismatches

Example #5

0

Show file

File: levenshtein.py Project: cybergla/rltk

def optimal_string_alignment_distance(s1, s2):
    """
    Restricted variant of the Damerau-Levenshtein distance: the edit
    distance counting deletion, insertion, substitution and transposition
    of adjacent characters, under the condition that no substring is
    edited more than once.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Optimal String Alignment Distance.

    Examples:
        >>> rltk.optimal_string_alignment_distance('abcd', 'acbd')
        1
        >>> rltk.optimal_string_alignment_distance('ca', 'abc')
        3
    """

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1, s2 = utils.unicode_normalize(s1), utils.unicode_normalize(s2)
    rows, cols = len(s1), len(s2)

    # table[r][c] is the distance between s1[:r] and s2[:c]; the borders
    # are the cost of building a prefix from / reducing it to nothing.
    table = [[r] + [0] * cols for r in xrange(rows + 1)]
    table[0] = list(xrange(cols + 1))

    for r in xrange(1, rows + 1):
        ch1 = s1[r - 1]
        for c in xrange(1, cols + 1):
            ch2 = s2[c - 1]
            if ch1 == ch2:
                step = 0
            else:
                step = 1

            best = min(table[r][c - 1] + 1,
                       table[r - 1][c] + 1,
                       table[r - 1][c - 1] + step)

            # The last two characters of both prefixes are swapped.
            swapped = (r > 1 and c > 1
                       and ch1 == s2[c - 2] and s1[r - 2] == ch2)
            if swapped:
                best = min(best, table[r - 2][c - 2] + step)

            table[r][c] = best

    return table[rows][cols]

Example #6

0

Show file

File: jaro.py Project: mit2nil/rltk

def _jaro_distance(s1, s2):
    """
    Jaro score of two strings, compared case-insensitively.

    The result is the mean of three ratios: matched characters over the
    length of the shorter string, matched characters over the length of
    the longer string, and non-transposed matches over all matches.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: The score; 0.0 when no matching characters are found.
    """
    # code from https://github.com/nap/jaro-winkler-distance
    # Copyright Jean-Bernard Ratte

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    # On equal lengths s1 is treated as the shorter string.
    if len(s1) > len(s2):
        shorter, longer = s2.lower(), s1.lower()
    else:
        shorter, longer = s1.lower(), s2.lower()

    m1 = _get_matching_characters(shorter, longer)
    m2 = _get_matching_characters(longer, shorter)

    count1, count2 = len(m1), len(m2)
    if count1 == 0 or count2 == 0:
        return 0.0

    short_ratio = float(count1) / len(shorter)
    long_ratio = float(count2) / len(longer)
    order_ratio = float(count1 - _transpositions(m1, m2)) / count1

    return (short_ratio + long_ratio + order_ratio) / 3.0

Example #7

0

Show file

File: metaphone.py Project: mit2nil/rltk

def _metaphone(s):
    """
    Metaphone fundamentally improves on the Soundex algorithm by using information about variations and inconsistencies in English spelling and pronunciation to produce a more accurate encoding, which does a better job of matching words and names which sound similar. As with Soundex, similar-sounding words should share the same keys. Metaphone is available as a built-in operator in a number of systems.

    The input is normalized and lower-cased, scanned left to right one
    character at a time (some rules consume one or two extra characters),
    and the collected code letters are joined and returned upper-cased.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Raises:
        ValueError: If ``s`` is empty after normalization.

    Examples:
        >>> rltk.metaphone('ashcraft')
        'AXKRFT'
        >>> rltk.metaphone('pineapple')
        'PNPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(basestring, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.lower()
    result = []

    # skip first character if s starts with these
    if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
        s = s[1:]

    i = 0

    while i < len(s):
        c = s[i]
        # One- and two-character lookahead. Past the end of the string both
        # become the sentinel '*****', which never equals a single character
        # and is never a substring of the letter sets tested below.
        # NOTE(review): `next` shadows the builtin of the same name.
        next = s[i + 1] if i < len(s) - 1 else '*****'
        nextnext = s[i + 2] if i < len(s) - 2 else '*****'

        # skip doubles except for cc
        if c == next and c != 'c':
            i += 1
            continue

        if c in 'aeiou':
            # vowels are kept only at the start of the string or of a word
            if i == 0 or s[i - 1] == ' ':
                result.append(c)
        elif c == 'b':
            # NOTE(review): `next` is never empty (it is a character or the
            # sentinel), so `or next` makes this condition always true and
            # 'b' is always emitted, even directly after 'm'.
            if (not (i != 0 and s[i - 1] == 'm')) or next:
                result.append('b')
        elif c == 'c':
            # 'cia' or 'ch' -> 'x'; 'ci', 'ce', 'cy' -> 's'; otherwise 'k'.
            # The first two cases also consume the following character.
            if next == 'i' and nextnext == 'a' or next == 'h':
                result.append('x')
                i += 1
            elif next in 'iey':
                result.append('s')
                i += 1
            else:
                result.append('k')
        elif c == 'd':
            # 'dg' followed by i/e/y -> 'j' (consumes two more characters)
            if next == 'g' and nextnext in 'iey':
                result.append('j')
                i += 2
            else:
                result.append('t')
        elif c in 'fjlmnr':
            result.append(c)
        elif c == 'g':
            # 'g' before i/e/y -> 'j'; 'g' not before h/n -> 'k';
            # 'gh' before a non-vowel emits nothing and consumes the 'h'.
            # NOTE(review): `nextnext` is always truthy (character or
            # sentinel), so the middle operand of the last test is a no-op.
            if next in 'iey':
                result.append('j')
            elif next not in 'hn':
                result.append('k')
            elif next == 'h' and nextnext and nextnext not in 'aeiou':
                i += 1
        elif c == 'h':
            # kept at the start, before a vowel, or after a non-vowel
            if i == 0 or next in 'aeiou' or s[i - 1] not in 'aeiou':
                result.append('h')
        elif c == 'k':
            # dropped when directly preceded by 'c'
            if i == 0 or s[i - 1] != 'c':
                result.append('k')
        elif c == 'p':
            # 'ph' -> 'f' (consumes the 'h')
            if next == 'h':
                result.append('f')
                i += 1
            else:
                result.append('p')
        elif c == 'q':
            result.append('k')
        elif c == 's':
            # 'sh' -> 'x' (consumes the 'h'); 'sio'/'sia' -> 'x' (consumes two)
            if next == 'h':
                result.append('x')
                i += 1
            elif next == 'i' and nextnext in 'oa':
                result.append('x')
                i += 2
            else:
                result.append('s')
        elif c == 't':
            # 'tio'/'tia' -> 'x' (nothing extra consumed); 'th' -> '0'
            # (consumes the 'h'); 'tch' emits nothing for the 't';
            # otherwise 't'.
            if next == 'i' and nextnext in 'oa':
                result.append('x')
            elif next == 'h':
                result.append('0')
                i += 1
            elif next != 'c' or nextnext != 'h':
                result.append('t')
        elif c == 'v':
            result.append('f')
        elif c == 'w':
            # A leading 'wh' consumes the 'h'. 'w' is emitted only when the
            # character two positions after it is a vowel or does not exist.
            # NOTE(review): `nextnext` was read before `i` was advanced, so
            # it is still relative to the position of the 'w'.
            if i == 0 and next == 'h':
                i += 1
            if nextnext in 'aeiou' or nextnext == '*****':
                result.append('w')
        elif c == 'x':
            # at the start: 'xh', 'xio', 'xia' -> 'x', otherwise 's';
            # anywhere else 'x' -> 'ks'
            if i == 0:
                if next == 'h' or (next == 'i' and nextnext in 'oa'):
                    result.append('x')
                else:
                    result.append('s')
            else:
                result.append('k')
                result.append('s')
        elif c == 'y':
            # kept only before a vowel
            if next in 'aeiou':
                result.append('y')
        elif c == 'z':
            result.append('s')
        elif c == ' ':
            # keep a single separating space, never a leading or repeated one
            if len(result) > 0 and result[-1] != ' ':
                result.append(' ')

        i += 1

    return ''.join(result).upper()

Example #8

0

Show file

def levenshtein_distance(s1, s2, insert=None, delete=None, substitute=None,
                 insert_default=1, delete_default=1, substitute_default=1):
    """
    The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        insert (dict(str, int), optional): Insert cost of characters. Defaults to None, treated as an empty dict.
        delete (dict(str, int), optional): Delete cost of characters. Defaults to None, treated as an empty dict.
        substitute (dict(str, dict(str, int)), optional): Substitute cost of characters. Defaults to None, treated as an empty dict.
        insert_default (int, optional): Default value of insert cost. Defaults to 1.
        delete_default (int, optional): Default value of delete cost. Defaults to 1.
        substitute_default (int, optional): Default value of substitute cost. Defaults to 1.

    Returns:
        int: Levenshtein Distance.

    Examples:
        >>> rltk.levenshtein_distance('ab', 'abc')
        1
        >>> rltk.levenshtein_distance('a', 'abc', insert = {'c':50},
        ... insert_default=100, delete_default=100, substitute_default=100)
        150
    """
    # Literal ``{}`` defaults would be single dict objects shared by all calls.
    insert = {} if insert is None else insert
    delete = {} if delete is None else delete
    substitute = {} if substitute is None else substitute

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)

    # dp[i][j] is the cheapest way to turn s1[:i] into s2[:j]; dp[0][0] == 0.
    dp = [[0] * (n2 + 1) for _ in xrange(n1 + 1)]

    # Top row: build s2[:j] from nothing using insertions only.
    for j in xrange(1, n2 + 1):
        dp[0][j] = insert.get(s2[j - 1], insert_default) + dp[0][j - 1]

    for i in xrange(1, n1 + 1):
        c1 = s1[i - 1]
        delete_cost = delete.get(c1, delete_default)

        # Left column: reduce s1[:i] to nothing using deletions only.
        dp[i][0] = delete_cost + dp[i - 1][0]

        for j in xrange(1, n2 + 1):
            c2 = s2[j - 1]
            if c1 == c2:
                # Equal characters cost nothing.
                dp[i][j] = dp[i - 1][j - 1]
                continue

            insert_cost = insert.get(c2, insert_default)
            substitute_cost = substitute.get(c1, {}).get(c2, substitute_default)
            dp[i][j] = min(dp[i][j - 1] + insert_cost,
                           dp[i - 1][j] + delete_cost,
                           dp[i - 1][j - 1] + substitute_cost)

    return dp[n1][n2]

Example #9

0

Show file

File: nysiis.py Project: mit2nil/rltk

def _nysiis(s):
    """
    New York State Identification and Intelligence System (NYSIIS) phonetic
    code of a string.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Raises:
        ValueError: If ``s`` is empty after normalization.

    Examples:
        >>> rltk.nysiis('ashcraft')
        'ASCRAFT'
        >>> rltk.nysiis('pineapple')
        'PANAPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(basestring, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()
    vowels = 'AEIOU'

    # step 1 - prefixes (first match wins, so 'KN' is tried before 'K')
    prefix_rules = (('MAC', 'MCC'), ('KN', 'N'), ('K', 'C'),
                    ('PH', 'FF'), ('PF', 'FF'), ('SCH', 'SSS'))
    for prefix, replacement in prefix_rules:
        if s.startswith(prefix):
            s = replacement + s[len(prefix):]
            break

    # step 2 - suffixes
    tail = s[-2:]
    if tail in ('IE', 'EE'):
        s = s[:-2] + 'Y'
    elif tail in ('DT', 'RT', 'RD', 'NT', 'ND'):
        s = s[:-2] + 'D'

    # step 3 - first character of key comes from name
    pieces = [s[0]]

    # step 4 - translate remaining chars
    single_letter = {'Q': 'G', 'Z': 'S', 'M': 'N'}
    total = len(s)
    pos = 1
    while pos < total:
        current = s[pos]
        before = s[pos - 1]
        after = s[pos + 1] if pos + 1 < total else None  # None past the end

        if current == 'E' and after == 'V':
            token = 'AF'
            pos += 1
        elif current in vowels:
            token = 'A'
        elif current in single_letter:
            token = single_letter[current]
        elif current == 'K':
            token = 'N' if after == 'N' else 'C'
        elif current == 'S' and s[pos + 1:pos + 3] == 'CH':
            token = 'SS'
            pos += 2
        elif current == 'P' and after == 'H':
            token = 'F'
            pos += 1
        elif current == 'H' and (before not in vowels or
                                 (after is not None and after not in vowels)):
            token = 'A' if before in vowels else before
        elif current == 'W' and before in vowels:
            token = before
        else:
            token = current

        # never repeat the character the key currently ends with
        if token[-1] != pieces[-1][-1]:
            pieces.append(token)

        pos += 1

    code = ''.join(pieces)

    # step 5 - remove trailing S
    if code != 'S' and code[-1:] == 'S':
        code = code[:-1]

    # step 6 - replace AY w/ Y
    if code[-2:] == 'AY':
        code = code[:-2] + 'Y'

    # step 7 - remove trailing A
    if code != 'A' and code[-1:] == 'A':
        code = code[:-1]

    # step 8 was already done

    return code