Code example #1
File: hybrid.py  Project: linqyd/rltk
def monge_elkan_similarity(bag1,
                           bag2,
                           function=jaro_winkler_similarity,
                           parameters={}):
    """
    Monge Elkan similarity.

    Args:
        bag1 (list): Bag 1.
        bag2 (list): Bag 2.
        function (function, optional): A reference to a similarity measure function. \
            It should return a value in the range [0,1]. If it is set to None, \
            `jaro_winkler_similarity` will be used.
        parameters (dict, optional): Other parameters of function. Defaults to empty dict.

    Returns:
        float: Monge Elkan similarity.
    """

    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    if len(bag1) == 0:
        return 0.0

    score_sum = 0
    for ele1 in bag1:
        max_score = MIN_FLOAT
        for ele2 in bag2:
            max_score = max(max_score, function(ele1, ele2, **parameters))
        score_sum += max_score

    return float(score_sum) / float(len(bag1))
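The measure averages, over every element of bag1, its best score against bag2. A minimal self-contained sketch (not rltk code) with a toy exact-match inner function, so the arithmetic is checkable by hand:

def exact_match(a, b):
    return 1.0 if a == b else 0.0

bag1 = ['paul', 'johnson']
bag2 = ['johson', 'paul']
# 'paul'    -> best score 1.0 (matches 'paul')
# 'johnson' -> best score 0.0 (no exact match)
score = sum(max(exact_match(e1, e2) for e2 in bag2) for e1 in bag1) / len(bag1)
assert score == 0.5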
Code example #2
def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False):
    """
    Computes TF/IDF measure. This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that are relevant to keyword queries. The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms.
        
    Note:
        If you call this function many times, :meth:`TF_IDF` is more efficient.

    Args:
        bag1 (list): Bag 1.
        bag2 (list): Bag 2.
        df_corpus (dict): The pre-calculated document frequencies of the corpus.
        doc_size (int): Total number of documents in the corpus.
        math_log (bool, optional): Flag to indicate whether math.log() should be used in TF and IDF formulas. Defaults to False.

    Returns:
        float: TF/IDF cosine similarity.

    Examples:
        >>> rltk.tfidf(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':1, 'c':1}, 3)
        0.17541160386140586
        >>> rltk.tfidf(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':2, 'c':1}, 4, True)
        0.12977804138
        >>> rltk.tfidf(['a', 'b', 'a'], ['a'], {'a':3, 'b':1, 'c':1}, 3)
        0.5547001962252291
    """
    # http://www.tfidf.com/

    utils.check_for_none(bag1, bag2, df_corpus)
    utils.check_for_type(list, bag1, bag2)

    # term frequency for input strings
    t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
    tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()}
    tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()}

    # unique element
    total_unique_elements = set()
    total_unique_elements.update(bag1)
    total_unique_elements.update(bag2)

    idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

    # tfidf calculation
    for element in total_unique_elements:
        if element not in df_corpus:
            continue
        idf_element = doc_size * 1.0 / df_corpus[element]

        v_x = 0 if element not in tf_x else \
            (math.log(idf_element) if math_log else idf_element) * tf_x[element]
        v_y = 0 if element not in tf_y else \
            (math.log(idf_element) if math_log else idf_element) * tf_y[element]
        v_x_y += v_x * v_y
        v_x_2 += v_x * v_x
        v_y_2 += v_y * v_y

    # cosine similarity
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
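Tracing the first docstring example by hand shows how the pieces combine (math_log=False):

# bag1 = ['a', 'b', 'a'], bag2 = ['a', 'c'], df_corpus = {'a': 3, 'b': 1, 'c': 1}, doc_size = 3
# tf_x = {'a': 2/3, 'b': 1/3}; tf_y = {'a': 1/2, 'c': 1/2}
# idf: a -> 3/3 = 1, b -> 3/1 = 3, c -> 3/1 = 3
# v_x over (a, b, c) = (2/3, 1, 0); v_y = (1/2, 0, 3/2)
# v_x_y = 1/3; v_x_2 = 4/9 + 1 = 13/9; v_y_2 = 1/4 + 9/4 = 5/2
# similarity = (1/3) / (sqrt(13/9) * sqrt(5/2)) ~= 0.1754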
Code example #3
File: soundex.py  Project: vishalbelsare/rltk
def soundex(s):
    """
    The standard used for this implementation is provided by `U.S. Census Bureau <https://www.archives.gov/research/census/soundex.html>`_.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.soundex('ashcraft')
        'A261'
        >>> rltk.soundex('pineapple')
        'P514'
    """

    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()

    CODES = (
        ('BFPV', '1'),
        ('CGJKQSXZ', '2'),
        ('DT', '3'),
        ('L', '4'),
        ('MN', '5'),
        ('R', '6'),
        ('AEIOUHWY', '.')  # placeholder
    )
    CODE_DICT = dict((c, replace) for chars, replace in CODES for c in chars)

    sdx = s[0]
    for i in range(1, len(s)):
        if s[i] not in CODE_DICT:
            continue

        code = CODE_DICT[s[i]]
        if code == '.':
            continue
        if s[i] == s[i - 1]:  # ignore same letter
            continue
        if s[i - 1] in CODE_DICT and CODE_DICT[s[i - 1]] == code:  # 'side-by-side' rule
            continue
        if s[i - 1] in ('H', 'W') and i - 2 > 0 and \
                        s[i - 2] in CODE_DICT and CODE_DICT[s[i - 2]] != '.':  # consonant separators
            continue

        sdx += code

    sdx = sdx[0:4].ljust(4, '0')

    return sdx
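Tracing the first docstring example through these rules:

# soundex('ashcraft') -> 'A261':
#   A -> kept as the first letter
#   S -> '2'; H -> placeholder, skipped
#   C -> '2' but dropped by the consonant-separator rule (S...H...C)
#   R -> '6'; A -> skipped; F -> '1'; T -> '3'
#   'A2613' -> first four characters -> 'A261'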
Code example #4
def longest_common_subsequence_distance(s1, s2):
    """
    The LCS distance between strings X (of length n) and Y (of length m) is n + m - 2|LCS(X, Y)|; its minimum is 0 and its maximum is n + m.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Longest Common Subsequence Distance.

    Examples:
        >>> rltk.longest_common_subsequence_distance('abcd', 'acbd')
        2
        >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef')
        3
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    m, n = len(s1), len(s2)

    lcs = _lcs(s1, s2)
    return n + m - 2 * lcs
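Both LCS-based measures (this one and the next example) delegate to a helper `_lcs` that is not part of this excerpt; a minimal dynamic-programming sketch of what it is assumed to compute:

def _lcs(s1, s2):
    # length of the longest common subsequence via O(m*n) DP
    m, n = len(s1), len(s2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]

# 'abcd' vs 'acbd': LCS 'abd' has length 3, so the distance is 4 + 4 - 2*3 = 2,
# matching the first docstring example above.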
Code example #5
def metric_longest_common_subsequence(s1, s2):
    """
    The Metric LCS distance between two strings builds on the LCS: it is computed as 1 - |LCS(s1, s2)| / max(|s1|, |s2|).

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Metric Longest Common Subsequence Distance.

    Examples:
        >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL')
        0.4
        # LCS: ABCDEF => length = 6
        # longest = s2 => length = 10
        # => 1 - 6/10 = 0.4

        >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF')
        0.2
        # LCS: ABDF => length = 4
        # longest = ABDEF => length = 5
        # => 1 - 4/5 = 0.2
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    lcs = _lcs(s1, s2)
    return 1 - float(lcs) / max(len(s1), len(s2), 1)
Code example #6
File: cosine.py  Project: vishalbelsare/rltk
def cosine_similarity(vec1, vec2):
    """
    The cosine similarity between two vectors.

    Args:
        vec1 (list): Vector 1. List of integer or float.
        vec2 (list): Vector 2. List of integer or float. It should have the same length as vec1.

    Returns:
        float: Cosine similarity.

    Examples:
        >>> rltk.cosine_similarity([1, 2, 1, 3], [2, 5, 2, 3])
        0.91634193
    """

    utils.check_for_none(vec1, vec2)
    utils.check_for_type(list, vec1, vec2)
    if len(vec1) != len(vec2):
        raise ValueError('vec1 and vec2 should have same length')

    v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    for v1, v2 in zip(vec1, vec2):  # list of int / float
        v_x_y += v1 * v2
        v_x_2 += v1 * v1
        v_y_2 += v2 * v2

    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
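The docstring example can be verified by hand:

# dot([1, 2, 1, 3], [2, 5, 2, 3]) = 2 + 10 + 2 + 9 = 23
# |v1| = sqrt(1 + 4 + 1 + 9) = sqrt(15); |v2| = sqrt(4 + 25 + 4 + 9) = sqrt(42)
# 23 / sqrt(15 * 42) = 23 / sqrt(630) ~= 0.91634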
Code example #7
File: cosine.py  Project: vishalbelsare/rltk
def string_cosine_similarity(bag1, bag2):
    """
    The similarity between the two strings is the cosine of the angle between their vector representations.

    Args:
        bag1 (list): Bag1, tokenized string sequence.
        bag2 (list): Bag2, tokenized string sequence.

    Returns:
        float: Cosine similarity.
    """

    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    d1 = collections.Counter(bag1)
    d2 = collections.Counter(bag2)

    intersection = set(d1.keys()) & set(d2.keys())
    v_x_y = sum([d1[x] * d2[x] for x in intersection])
    v_x_2 = sum([v * v for k, v in d1.items()])
    v_y_2 = sum([v * v for k, v in d2.items()])

    return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) *
                                                  math.sqrt(v_y_2))
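A worked example (hypothetical bags, arithmetic by hand):

# bag1 = ['a', 'b', 'a'] -> d1 = {'a': 2, 'b': 1}
# bag2 = ['a', 'c']      -> d2 = {'a': 1, 'c': 1}
# v_x_y = 2 * 1 = 2; v_x_2 = 4 + 1 = 5; v_y_2 = 1 + 1 = 2
# similarity = 2 / (sqrt(5) * sqrt(2)) = 2 / sqrt(10) ~= 0.6325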
Code example #8
File: needleman.py  Project: msuryaprakash/rltk
def needleman_wunsch_score(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table=None):
    """
    Needleman-Wunsch score.
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    score_table = score_table if isinstance(score_table, dict) else {}

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # construct matrix to get max score of all possible alignments
    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in range(n1 + 1):
        for j in range(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # most top row
                dp[i][j] = gap + dp[i][j - 1]
            elif j == 0:  # most left column
                dp[i][j] = gap + dp[i - 1][j]
            else:
                dp[i][j] = max(dp[i][j - 1] + gap,
                               dp[i - 1][j] + gap,
                               dp[i - 1][j - 1] + _get_score(s1[i - 1], s2[j - 1], match, mismatch, score_table))

    return dp[n1][n2]
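`_get_score` is not shown in this excerpt; a minimal sketch under the assumption that it consults an optional per-pair score table and otherwise falls back to the flat match/mismatch values (rltk's actual helper may differ):

def _get_score(c1, c2, match, mismatch, score_table):
    # per-pair override first, then flat match/mismatch
    if c1 in score_table and c2 in score_table[c1]:
        return score_table[c1][c2]
    return match if c1 == c2 else mismatch

# With the defaults (match=2, mismatch=-1, gap=-0.5):
#   needleman_wunsch_score('ab', 'ab') -> 4    (two matches)
#   needleman_wunsch_score('ab', 'b')  -> 1.5  (one gap, one match: -0.5 + 2)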
Code example #9
File: hybrid.py  Project: mit2nil/rltk
def hybrid_jaccard_similarity(set1, set2, threshold=0.5, function=jaro_winkler_similarity, parameters={}):

    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    matching_score = []
    for s1 in set1:
        inner = []
        for s2 in set2:
            score = function(s1, s2, **parameters)
            if score < threshold:
                score = 0.0
            inner.append(1.0 - score)  # munkres minimizes, so store costs (1 - similarity)
        matching_score.append(inner)

    indexes = munkres.Munkres().compute(matching_score)

    score_sum, matching_count = 0.0, 0
    for r, c in indexes:
        matching_count += 1
        score_sum += 1.0 - matching_score[r][c]  # go back to similarity

    if len(set1) + len(set2) - matching_count == 0:
        return 1.0
    return float(score_sum) / float(len(set1) + len(set2) - matching_count)
Code example #10
def _jaccard_index(set1, set2):
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return float(len(set1 & set2)) / float(len(set1 | set2))
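The index is simply the shared fraction of all distinct elements:

set1, set2 = {'a', 'b'}, {'b', 'c'}
assert len(set1 & set2) / len(set1 | set2) == 1 / 3  # one shared element out of three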
Code example #11
File: dice.py  Project: mit2nil/rltk
def dice_similarity(set1, set2):

    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
Code example #12
def damerau_levenshtein_distance(s1, s2):
    """
    Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to transform\
     one string into the other, where an operation is defined as an insertion, deletion, or substitution of \
     a single character, or a transposition of two adjacent characters.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Damerau Levenshtein Distance.

    Examples:
        >>> rltk.damerau_levenshtein_distance('abcd', 'acbd')
        1
        >>> rltk.damerau_levenshtein_distance('abbd', 'acad')
        2
    """

    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    infinite = n1 + n2

    char_arr = defaultdict(int)
    dp = [[0] * (n2 + 2) for _ in range(n1 + 2)]

    dp[0][0] = infinite
    for i in range(0, n1 + 1):
        dp[i + 1][0] = infinite
        dp[i + 1][1] = i
    for i in range(0, n2 + 1):
        dp[0][i + 1] = infinite
        dp[1][i + 1] = i

    for i in range(1, n1 + 1):
        db = 0
        for j in range(1, n2 + 1):
            i1 = char_arr[s2[j - 1]]
            j1 = db
            cost = 1
            if s1[i - 1] == s2[j - 1]:
                cost = 0
                db = j

            dp[i + 1][j + 1] = min(
                dp[i][j] + cost, dp[i + 1][j] + 1, dp[i][j + 1] + 1,
                dp[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1))
        char_arr[s1[i - 1]] = i

    return dp[n1 + 1][n2 + 1]
Code example #13
File: jaccard.py  Project: vishalbelsare/rltk
def _jaccard_index(set1, set2):
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    # return float(len(set1 & set2)) / float(len(set1 | set2))

    inter_len = len(set1 & set2)
    return float(inter_len) / (len(set1) + len(set2) - inter_len)
Code example #14
def hybrid_jaccard_similarity(set1,
                              set2,
                              threshold=0.5,
                              function=jaro_winkler_similarity,
                              parameters=None):
    """
    Generalized Jaccard Measure.

    Args:
        set1 (set): Set 1.
        set2 (set): Set 2.
        threshold (float, optional): The threshold to keep the score of similarity function. \
            Defaults to 0.5.
        function (function, optional): A reference to a similarity measure function. \
            It should return a value in the range [0,1]. If it is set to None, \
            `jaro_winkler_similarity` will be used.
        parameters (dict, optional): Other parameters of function. Defaults to None.

    Returns:
        float: Hybrid Jaccard similarity.

    Examples:
        >>> def hybrid_test_similarity(m, n):
        ...     ...
        >>> rltk.hybrid_jaccard_similarity(set(['a','b','c']), set(['p', 'q']), function=hybrid_test_similarity)
        0.533333333333
    """

    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    parameters = parameters if isinstance(parameters, dict) else {}

    matching_score = []
    for s1 in set1:
        inner = []
        for s2 in set2:
            score = function(s1, s2, **parameters)
            if score < threshold:
                score = 0.0
            inner.append(1.0 - score)  # munkres minimizes, so store costs (1 - similarity)
        matching_score.append(inner)

    row_idx, col_idx = linear_sum_assignment(matching_score)

    score_sum, matching_count = 0.0, 0
    for r, c in zip(row_idx, col_idx):
        matching_count += 1
        score_sum += 1.0 - matching_score[r][c]  # go back to similarity

    if len(set1) + len(set2) - matching_count == 0:
        return 1.0
    return float(score_sum) / float(len(set1) + len(set2) - matching_count)
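`linear_sum_assignment` here is assumed to come from scipy.optimize; it finds the minimum-cost one-to-one matching. A hand-checkable trace with a hypothetical exact-match inner function:

# set1 = {'a', 'b'}, set2 = {'a', 'c'}, threshold = 0.5
# scores: a-a = 1.0; every other pair scores 0.0 (below threshold)
# cost matrix (1 - score): [[0.0, 1.0], [1.0, 1.0]]
# optimal assignment: a-a (cost 0.0) and b-c (cost 1.0) -> score_sum = 1.0
# similarity = 1.0 / (2 + 2 - 2) = 0.5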
Code example #15
File: equal.py  Project: vishalbelsare/rltk
def number_equal(num1, num2, epsilon=0):
    """
    Args:
        num1 (int / float): Number 1.
        num2 (int / float): Number 2.
        epsilon (float, optional): Approximation margin.

    Returns:
        int: 0 for unequal and 1 for equal.
    """

    utils.check_for_type((int, float), num1, num2)
    return int(abs(num1 - num2) <= epsilon)
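The epsilon margin makes floating-point comparison practical:

>>> number_equal(1, 1)
1
>>> number_equal(0.1 + 0.2, 0.3)                 # exact float comparison fails
0
>>> number_equal(0.1 + 0.2, 0.3, epsilon=1e-9)
1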
Code example #16
File: equal.py  Project: vishalbelsare/rltk
def string_equal(str1, str2):
    """
    Args:
        str1 (str): String 1.
        str2 (str): String 2.

    Returns:
        int: 0 for unequal and 1 for equal.
    """

    utils.check_for_none(str1, str2)
    utils.check_for_type(str, str1, str2)
    return int(str1 == str2)
Code example #17
def monge_elkan_similarity(bag1,
                           bag2,
                           function=jaro_winkler_similarity,
                           parameters=None,
                           lower_bound=None):
    """
    Monge Elkan similarity.

    Args:
        bag1 (list): Bag 1.
        bag2 (list): Bag 2.
        function (function, optional): A reference to a similarity measure function. \
            It should return a value in the range [0,1]. If it is set to None, \
            `jaro_winkler_similarity` will be used.
        parameters (dict, optional): Other parameters of function. Defaults to None.
        lower_bound (float): Enables early exit. If the similarity cannot possibly reach this value, \
            the function returns 0.0 immediately. Defaults to None.

    Returns:
        float: Monge Elkan similarity.

    Note:
        The order of bag1 and bag2 matters. \
            Alternatively, `symmetric_monge_elkan_similarity` is not sensitive to the order.
        If the `lower_bound` is set, the early exit condition is more easy to be triggered if bag1 has bigger size.
    """

    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    parameters = parameters if isinstance(parameters, dict) else {}

    score_sum = 0
    for idx, ele1 in enumerate(bag1):
        max_score = utils.MIN_FLOAT
        for ele2 in bag2:
            max_score = max(max_score, function(ele1, ele2, **parameters))
        score_sum += max_score

        # if it satisfies early exit condition
        if lower_bound:
            rest_max = len(bag1) - 1 - idx  # assume the rest scores are all 1
            if float(score_sum + rest_max) / float(len(bag1)) < lower_bound:
                return 0.0

    sim = float(score_sum) / float(len(bag1))
    if lower_bound and sim < lower_bound:
        return 0.0
    return sim
Code example #18
File: cosine.py  Project: mit2nil/rltk
def string_cosine_similarity(bag1, bag2):

    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    d1 = collections.Counter(bag1)
    d2 = collections.Counter(bag2)

    intersection = set(d1.keys()) & set(d2.keys())
    v_x_y = sum([d1[x] * d2[x] for x in intersection])
    v_x_2 = sum([v * v for k, v in d1.items()])
    v_y_2 = sum([v * v for k, v in d2.items()])

    return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) *
                                                  math.sqrt(v_y_2))
Code example #19
def optimal_string_alignment_distance(s1, s2):
    """
        This is a variation of the Damerau-Levenshtein distance that returns the strings' edit distance
        taking into account deletion, insertion, substitution, and transposition, under the condition
        that no substring is edited more than once.

        Args:
            s1 (str): Sequence 1.
            s2 (str): Sequence 2.

        Returns:
            float: Optimal String Alignment Distance.

        Examples:
            >>> rltk.optimal_string_alignment_distance('abcd', 'acbd')
            1
            >>> rltk.optimal_string_alignment_distance('ca', 'abc')
            3
        """

    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)

    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]

    for i in range(0, n1 + 1):
        dp[i][0] = i
    for j in range(0, n2 + 1):
        dp[0][j] = j

    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1

            dp[i][j] = min(dp[i][j - 1] + 1, dp[i - 1][j] + 1,
                           dp[i - 1][j - 1] + cost)

            if i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1]:
                dp[i][j] = min(dp[i][j], dp[i - 2][j - 2] + cost)

    return dp[n1][n2]
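The second docstring example shows where OSA and unrestricted Damerau-Levenshtein diverge:

# 'ca' -> 'abc': OSA never edits a substring twice, so the transposition
# cannot be combined with an edit between the transposed characters:
#   OSA:                 'ca' -> 'a' -> 'ab' -> 'abc'                  = 3
#   Damerau-Levenshtein: 'ca' -> 'ac' (transpose) -> 'abc' (insert b)  = 2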
Code example #20
File: hybrid.py  Project: mit2nil/rltk
def monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters={}):

    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    if len(bag1) == 0:
        return 0.0

    score_sum = 0
    for ele1 in bag1:
        max_score = MIN_FLOAT
        for ele2 in bag2:
            max_score = max(max_score, function(ele1, ele2, **parameters))
        score_sum += max_score

    return float(score_sum) / float(len(bag1))
Code example #21
File: cosine.py  Project: mit2nil/rltk
def cosine_similarity(vec1, vec2):
    """
    vec1 and vec2 should have the same length, and the elements of each vector should be int or float.
    """

    utils.check_for_none(vec1, vec2)
    utils.check_for_type(list, vec1, vec2)
    if len(vec1) != len(vec2):
        raise ValueError('vec1 and vec2 should have same length')

    v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    for v1, v2 in zip(vec1, vec2):  # list of int / float
        v_x_y += v1 * v2
        v_x_2 += v1 * v1
        v_y_2 += v2 * v2

    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
Code example #22
File: distance.py  Project: vishalbelsare/rltk
def manhattan_distance(vec1, vec2, weights=None):
    """
    Manhattan distance.

    Args:
        vec1 (list): Vector 1. List of integer or float.
        vec2 (list): Vector 2. List of integer or float. It should have the same length as vec1.
        weights (list): Weights for each value in the vectors. If None, all weights are 1.0. Defaults to None.

    Returns:
        float: Manhattan distance.
    """
    utils.check_for_none(vec1, vec2)
    utils.check_for_type(list, vec1, vec2)
    if weights:
        utils.check_for_type(list, weights)
    if len(vec1) != len(vec2):
        raise ValueError('vec1 and vec2 should have same length')

    return cityblock(vec1, vec2, weights)
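`cityblock` here is scipy.spatial.distance.cityblock, the (optionally weighted) sum of absolute differences:

# manhattan_distance([1, 2], [4, 6])                     -> |1-4| + |2-6| = 7
# manhattan_distance([1, 2], [4, 6], weights=[0.5, 1.0]) -> 0.5*3 + 1.0*4 = 5.5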
Code example #23
def dice_similarity(set1, set2):
    """
    The Dice similarity score is defined as twice the size of the intersection of two sets divided by the sum of their sizes.

    Args:
        set1 (set): Set 1.
        set2 (set): Set 2.

    Returns:
        float: Dice similarity.

    Examples:
        >>> rltk.dice_similarity(set(['a', 'b']), set(['c', 'b']))
        0.5
    """

    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
Code example #24
File: jaro.py  Project: vishalbelsare/rltk
def _jaro_distance(s1, s2):
    # code from https://github.com/nap/jaro-winkler-distance
    # Copyright Jean-Bernard Ratte

    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    shorter, longer = s1.lower(), s2.lower()

    if len(s1) > len(s2):
        longer, shorter = shorter, longer

    m1 = _get_matching_characters(shorter, longer)
    m2 = _get_matching_characters(longer, shorter)

    if len(m1) == 0 or len(m2) == 0:
        return 0.0

    return (float(len(m1)) / len(shorter) + float(len(m2)) / len(longer) +
            float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0
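The classic Winkler test pair, traced through this formula:

# s1 = 'martha', s2 = 'marhta':
#   m1 = m2 = 6 matching characters; 't'/'h' are misaligned -> 1 transposition
#   (6/6 + 6/6 + (6 - 1)/6) / 3 ~= 0.9444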
Code example #25
File: levenshtein.py  Project: msuryaprakash/rltk
def levenshtein_distance(s1, s2, insert=None, delete=None, substitute=None,
                         insert_default=1, delete_default=1, substitute_default=1):
    """
    The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        insert (dict(str, int), optional): Insert cost of characters. Defaults to None.
        delete (dict(str, int), optional): Delete cost of characters. Defaults to None.
        substitute (dict(str, dict(str, int)), optional): Substitute cost of characters. Defaults to None.
        insert_default (int, optional): Default value of insert cost. Defaults to 1.
        delete_default (int, optional): Default value of delete cost. Defaults to 1.
        substitute_default (int, optional): Default value of substitute cost. Defaults to 1.

    Returns:
        int: Levenshtein Distance.

    Examples:
        >>> rltk.levenshtein_distance('ab', 'abc')
        1
        >>> rltk.levenshtein_distance('a', 'abc', insert = {'c':50},
        ... insert_default=100, delete_default=100, substitute_default=100)
        150
    """

    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    insert = insert if isinstance(insert, dict) else {}
    delete = delete if isinstance(delete, dict) else {}
    substitute = substitute if isinstance(substitute, dict) else {}

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # if n1 == 0 or n2 == 0:
    #     return max(n1, n2)

    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in range(n1 + 1):
        for j in range(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # most top row
                c = s2[j - 1]
                dp[i][j] = insert[c] if c in insert else insert_default
                dp[i][j] += dp[i][j - 1]
            elif j == 0:  # most left column
                c = s1[i - 1]
                dp[i][j] = delete[c] if c in delete else delete_default
                dp[i][j] += dp[i - 1][j]
            else:
                c1, c2 = s1[i - 1], s2[j - 1]
                insert_cost = insert[c2] if c2 in insert else insert_default
                delete_cost = delete[c1] if c1 in delete else delete_default
                substitute_cost = substitute[c1][c2] \
                    if c1 in substitute and c2 in substitute[c1] else substitute_default

                if c1 == c2:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = min(dp[i][j - 1] + insert_cost,
                                   dp[i - 1][j] + delete_cost,
                                   dp[i - 1][j - 1] + substitute_cost)
    return dp[n1][n2]
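The second docstring example decomposes as:

# 'a' -> 'abc' needs two insertions: 'b' at insert_default = 100 and
# 'c' at the custom cost insert['c'] = 50, so the distance is 100 + 50 = 150.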
Code example #26
def hybrid_jaccard_similarity(set1,
                              set2,
                              threshold=0.5,
                              function=jaro_winkler_similarity,
                              parameters=None,
                              lower_bound=None):
    """
    Generalized Jaccard Measure.

    Args:
        set1 (set): Set 1.
        set2 (set): Set 2.
        threshold (float, optional): The threshold to keep the score of similarity function. \
            Defaults to 0.5.
        function (function, optional): A reference to a similarity measure function. \
            It should return a value in the range [0,1]. If it is set to None, \
            `jaro_winkler_similarity` will be used.
        parameters (dict, optional): Other parameters of function. Defaults to None.
        lower_bound (float): Enables early exit. If the similarity cannot possibly reach this value, \
            the function returns 0.0 immediately. Defaults to None.

    Returns:
        float: Hybrid Jaccard similarity.

    Examples:
        >>> def hybrid_test_similarity(m, n):
        ...     ...
        >>> rltk.hybrid_jaccard_similarity(set(['a','b','c']), set(['p', 'q']), function=hybrid_test_similarity)
        0.533333333333
    """

    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    parameters = parameters if isinstance(parameters, dict) else {}

    if len(set1) > len(set2):
        set1, set2 = set2, set1
    total_num_matches = len(set1)

    matching_score = [[1.0] * len(set2) for _ in range(len(set1))]
    row_max = [0.0] * len(set1)
    for i, s1 in enumerate(set1):
        for j, s2 in enumerate(set2):
            score = function(s1, s2, **parameters)
            if score < threshold:
                score = 0.0
            row_max[i] = max(row_max[i], score)
            matching_score[i][j] = 1.0 - score  # munkres minimizes, so store costs (1 - similarity)

        if lower_bound:
            max_possible_score_sum = sum(row_max[:i + 1] + [1] *
                                         (total_num_matches - i - 1))
            max_possible = 1.0 * max_possible_score_sum / float(
                len(set1) + len(set2) - total_num_matches)
            if max_possible < lower_bound:
                return 0.0

    # run munkres, finds the min score (max similarity) for each row
    row_idx, col_idx = linear_sum_assignment(matching_score)

    # recover scores
    score_sum = 0.0
    for r, c in zip(row_idx, col_idx):
        score_sum += 1.0 - matching_score[r][c]

    if len(set1) + len(set2) - total_num_matches == 0:
        return 1.0
    sim = float(score_sum) / float(len(set1) + len(set2) - total_num_matches)
    if lower_bound and sim < lower_bound:
        return 0.0
    return sim
Code example #27
def levenshtein_similarity(s1,
                           s2,
                           insert=None,
                           delete=None,
                           substitute=None,
                           insert_default=1,
                           delete_default=1,
                           substitute_default=1,
                           lower_bound=None):
    """
    Computed as 1 - levenshtein_distance(s1, s2) / max_cost(s1, s2), where max_cost is the larger of the two strings' total per-character costs.
    """
    insert = insert if isinstance(insert, dict) else {}
    delete = delete if isinstance(delete, dict) else {}
    substitute = substitute if isinstance(substitute, dict) else {}

    def compute_max_cost(s):
        # substitute is a dict of dicts (see levenshtein_distance), so the
        # per-character substitution cost is the extremum over its inner values
        return sum([
            max(insert[c] if c in insert else insert_default,
                delete[c] if c in delete else delete_default,
                max(substitute[c].values()) if c in substitute else substitute_default)
            for c in s
        ])

    def estimate_min_char_cost(s):
        return min([
            min(insert[c] if c in insert else insert_default,
                delete[c] if c in delete else delete_default,
                min(substitute[c].values()) if c in substitute else substitute_default)
            for c in s
        ])

    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    max_cost = max(compute_max_cost(s1), compute_max_cost(s2))

    if lower_bound:
        diff = abs(len(s1) - len(s2))
        if len(s1) == 0 and len(s2) == 0:
            return 1.0
        elif len(s1) == 0:
            min_lev = float(diff * estimate_min_char_cost(s2))
        elif len(s2) == 0:
            min_lev = float(diff * estimate_min_char_cost(s1))
        else:
            min_lev = float(
                diff *
                min(estimate_min_char_cost(s1), estimate_min_char_cost(s2)))
        est_sim = 1.0 - min_lev / max_cost
        if est_sim < lower_bound:
            return 0.0

    lev = levenshtein_distance(s1, s2, insert, delete, substitute,
                               insert_default, delete_default,
                               substitute_default)

    if max_cost < lev:
        raise ValueError('Illegal value of operation cost')

    if max_cost == 0:
        return 1.0

    lev_sim = 1.0 - float(lev) / max_cost
    if lower_bound and lev_sim < lower_bound:
        return 0.0
    return lev_sim
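With the default unit costs, max_cost reduces to max(len(s1), len(s2)) and the measure becomes the familiar normalized Levenshtein similarity:

# levenshtein_distance('kitten', 'sitting') = 3   (2 substitutions + 1 insertion)
# levenshtein_similarity('kitten', 'sitting') = 1 - 3/7 ~= 0.5714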
Code example #28
File: nysiis.py  Project: mit2nil/rltk
def _nysiis(s):
    """
    New York State Immunization Information System (NYSIIS) Phonetic Code is a phonetic algorithm created by `The New York State Department of Health's (NYSDOH) Bureau of Immunization
    <https://www.health.ny.gov/prevention/immunization/information_system/>`_.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.nysiis('ashcraft')
        'ASCRAFT'
        >>> rltk.nysiis('pineapple')
        'PANAPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()
    key = []

    # step 1 - prefixes
    if s.startswith('MAC'):
        s = 'MCC' + s[3:]
    elif s.startswith('KN'):
        s = s[1:]
    elif s.startswith('K'):
        s = 'C' + s[1:]
    elif s.startswith(('PH', 'PF')):
        s = 'FF' + s[2:]
    elif s.startswith('SCH'):
        s = 'SSS' + s[3:]

    # step 2 - suffixes
    if s.endswith(('IE', 'EE')):
        s = s[:-2] + 'Y'
    elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
        s = s[:-2] + 'D'

    # step 3 - first character of key comes from name
    key.append(s[0])

    # step 4 - translate remaining chars
    i = 1
    len_s = len(s)
    while i < len_s:
        ch = s[i]
        if ch == 'E' and i + 1 < len_s and s[i + 1] == 'V':
            ch = 'AF'
            i += 1
        elif ch in 'AEIOU':
            ch = 'A'
        elif ch == 'Q':
            ch = 'G'
        elif ch == 'Z':
            ch = 'S'
        elif ch == 'M':
            ch = 'N'
        elif ch == 'K':
            if i + 1 < len(s) and s[i + 1] == 'N':
                ch = 'N'
            else:
                ch = 'C'
        elif ch == 'S' and s[i + 1:i + 3] == 'CH':
            ch = 'SS'
            i += 2
        elif ch == 'P' and i + 1 < len(s) and s[i + 1] == 'H':
            ch = 'F'
            i += 1
        elif ch == 'H' and (s[i - 1] not in 'AEIOU' or
                            (i + 1 < len(s) and s[i + 1] not in 'AEIOU')):
            if s[i - 1] in 'AEIOU':
                ch = 'A'
            else:
                ch = s[i - 1]
        elif ch == 'W' and s[i - 1] in 'AEIOU':
            ch = s[i - 1]

        if ch[-1] != key[-1][-1]:
            key.append(ch)

        i += 1

    key = ''.join(key)

    # step 5 - remove trailing S
    if key.endswith('S') and key != 'S':
        key = key[:-1]

    # step 6 - replace AY w/ Y
    if key.endswith('AY'):
        key = key[:-2] + 'Y'

    # step 7 - remove trailing A
    if key.endswith('A') and key != 'A':
        key = key[:-1]

    # step 8 was already done

    return key
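Tracing 'pineapple' through the steps above:

# no prefix/suffix rule applies; the first letter 'P' is kept
# I -> A, N -> N, E -> A, A -> dropped (repeats the previous 'A')
# P -> P, second P -> dropped (double), L -> L, E -> A  => 'PANAPLA'
# step 7 strips the trailing 'A'                        => 'PANAPL'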
Code example #29
def ngram_similarity(s0, s1, n=2):
    """
    N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance" String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.

    Args:
        s0 (str): Sequence 1.
        s1 (str): Sequence 2.
        n (int, optional): Size of the n-gram. Defaults to 2.

    Returns:
        float: NGram Similarity.

    Examples:
        >>> rltk.ngram_similarity('ABCD', 'ABTUIO')
        0.4166666666666667
    """

    utils.check_for_none(s0, s1)
    utils.check_for_type(str, s0, s1)

    n1, n2 = len(s0), len(s1)
    special = "\n"

    if (n1 == 0 or n2 == 0):
        return 0

    if (s0 == s1):
        return 1

    cost = 0
    if (n1 < n or n2 < n):
        return 0

    # Adding special chars (n-1) to s0
    sa = special * (n - 1) + s0

    s2_j = [None] * n  # jth n-gram of s2
    d = [0] * (n1 + 1)  # cost array, horizontally
    p = [0] * (n1 + 1)  # 'previous' cost array, horizontally

    for i in range(n1 + 1):
        p[i] = 0

    for j in range(1, n2 + 1):
        # Construct s2_j n-gram
        if (j < n):
            for ti in range(n - j):
                s2_j[ti] = special

            for ti in range(n - j, n):
                s2_j[ti] = s1[ti - (n - j)]

        else:
            s2_j = list(s1[j - n:j])

        d[0] = 0

        for i in range(1, n1 + 1):
            cost = 0
            tn = n
            # Compare sa to s2_j
            for ni in range(n):
                if sa[i - 1 + ni] == s2_j[ni] and sa[i - 1 + ni] != "\n":
                    cost += 1
                elif sa[i - 1 + ni] == special:
                    tn -= 1

            ec = float(cost) / tn
            # take the best of: cell to the left, cell above,
            # and the diagonal plus the n-gram match score
            d[i] = max(d[i - 1], p[i], p[i - 1] + ec)

        d2 = p
        p = d
        d = d2
    return float(p[n1]) / max(n2, n1)
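The key preprocessing step is padding the first string with n-1 special characters so that its leading characters still form full n-grams; for the default n = 2:

s0 = 'ABCD'
sa = '\n' * (2 - 1) + s0                             # '\nABCD'
bigrams = [sa[i:i + 2] for i in range(len(sa) - 1)]
assert bigrams == ['\nA', 'AB', 'BC', 'CD']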
Code example #30
File: metaphone.py  Project: mit2nil/rltk
def _metaphone(s):
    """
    Metaphone fundamentally improves on the Soundex algorithm by using information about variations and inconsistencies in English spelling and pronunciation to produce a more accurate encoding, which does a better job of matching words and names which sound similar. As with Soundex, similar-sounding words should share the same keys. Metaphone is available as a built-in operator in a number of systems.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.metaphone('ashcraft')
        'AXKRFT'
        >>> rltk.metaphone('pineapple')
        'PNPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.lower()
    result = []

    # skip first character if s starts with these
    if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
        s = s[1:]

    i = 0

    while i < len(s):
        c = s[i]
        next = s[i + 1] if i < len(s) - 1 else '*****'
        nextnext = s[i + 2] if i < len(s) - 2 else '*****'

        # skip doubles except for cc
        if c == next and c != 'c':
            i += 1
            continue

        if c in 'aeiou':
            if i == 0 or s[i - 1] == ' ':
                result.append(c)
        elif c == 'b':
            if (not (i != 0 and s[i - 1] == 'm')) or next:
                result.append('b')
        elif c == 'c':
            if next == 'i' and nextnext == 'a' or next == 'h':
                result.append('x')
                i += 1
            elif next in 'iey':
                result.append('s')
                i += 1
            else:
                result.append('k')
        elif c == 'd':
            if next == 'g' and nextnext in 'iey':
                result.append('j')
                i += 2
            else:
                result.append('t')
        elif c in 'fjlmnr':
            result.append(c)
        elif c == 'g':
            if next in 'iey':
                result.append('j')
            elif next not in 'hn':
                result.append('k')
            elif next == 'h' and nextnext and nextnext not in 'aeiou':
                i += 1
        elif c == 'h':
            if i == 0 or next in 'aeiou' or s[i - 1] not in 'aeiou':
                result.append('h')
        elif c == 'k':
            if i == 0 or s[i - 1] != 'c':
                result.append('k')
        elif c == 'p':
            if next == 'h':
                result.append('f')
                i += 1
            else:
                result.append('p')
        elif c == 'q':
            result.append('k')
        elif c == 's':
            if next == 'h':
                result.append('x')
                i += 1
            elif next == 'i' and nextnext in 'oa':
                result.append('x')
                i += 2
            else:
                result.append('s')
        elif c == 't':
            if next == 'i' and nextnext in 'oa':
                result.append('x')
            elif next == 'h':
                result.append('0')
                i += 1
            elif next != 'c' or nextnext != 'h':
                result.append('t')
        elif c == 'v':
            result.append('f')
        elif c == 'w':
            if i == 0 and next == 'h':
                i += 1
            if nextnext in 'aeiou' or nextnext == '*****':
                result.append('w')
        elif c == 'x':
            if i == 0:
                if next == 'h' or (next == 'i' and nextnext in 'oa'):
                    result.append('x')
                else:
                    result.append('s')
            else:
                result.append('k')
                result.append('s')
        elif c == 'y':
            if next in 'aeiou':
                result.append('y')
        elif c == 'z':
            result.append('s')
        elif c == ' ':
            if len(result) > 0 and result[-1] != ' ':
                result.append(' ')

        i += 1

    return ''.join(result).upper()
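Tracing the first docstring example through the rules above:

# metaphone('ashcraft') -> 'AXKRFT':
#   a -> kept (initial vowel)
#   s + h -> 'x'; c -> 'k'; r -> 'r'
#   a -> dropped (non-initial vowel); f -> 'f'; t -> 't'
#   ''.join(...).upper() == 'AXKRFT'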