コード例 #1
0
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6



    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0.0

    ins_cost, del_cost, sub_cost, trans_cost = (1, 1, 1, 1)

    len_str1 = len(string1)
    len_str2 = len(string2)

    if len_str1 == 0:
        return len_str2 * ins_cost

    if len_str2 == 0:
        return len_str1 * del_cost

    d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.int)

    for i in _range(len_str1 + 1):
        d_mat[i, 0] = i * del_cost

    for j in _range(len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    for i in _range(len_str1):
        for j in _range(len_str2):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost, d_mat[i, j + 1] + del_cost,
                d_mat[i, j] + (sub_cost if string1[i] != string2[j] else 0))

    return d_mat[len_str1, len_str2]
コード例 #2
0
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6


    Note:
        This implementation internally uses python-levenshtein package to compute the Levenshtein distance

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    # using Levenshtein library
    return Levenshtein.distance(string1, string2)
コード例 #3
0
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6


    Note:
        This implementation internally uses python-levenshtein package to compute the Levenshtein distance

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    # using Levenshtein library
    return Levenshtein.distance(string1, string2)
コード例 #4
0
def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Needleman-Wunsch measure between two strings.

    The Needleman-Wunsch generalizes the Levenshtein distance and considers global alignment between two strings.
    Specifically, it is computed by assigning a score to each alignment between two input strings and choosing the
    score of the best alignment, that is, the maximal score.

    An alignment between two strings is a set of correspondences between the characters of between them, allowing for
    gaps.

    Args:
        string1,string2 (str) : Input strings

        gap_cost (float) : Cost of gap (defaults to 1.0)

        sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults
            to an identity function, where if two characters are same it returns 1.0 else returns 0.


    Returns:
        Needleman-Wunsch measure (float)


    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> needleman_wunsch('dva', 'deeva')
        1.0
        >>> needleman_wunsch('dva', 'deeve', 0.0)
        2.0
        >>> needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
        1.0
        >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
        2.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
    # DP initialization
    for i in _range(len(string1) + 1):
        dist_mat[i, 0] = -(i * gap_cost)
    # DP initialization
    for j in _range(len(string2) + 1):
        dist_mat[0, j] = -(j * gap_cost)
    # Needleman-Wunsch DP calculation
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1],
                                                       string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(match, delete, insert)
    return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
コード例 #5
0
def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Needleman-Wunsch measure between two strings.

    The Needleman-Wunsch generalizes the Levenshtein distance and considers global alignment between two strings.
    Specifically, it is computed by assigning a score to each alignment between two input strings and choosing the
    score of the best alignment, that is, the maximal score.

    An alignment between two strings is a set of correspondences between the characters of between them, allowing for
    gaps.

    Args:
        string1,string2 (str) : Input strings

        gap_cost (float) : Cost of gap (defaults to 1.0)

        sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults
            to an identity function, where if two characters are same it returns 1.0 else returns 0.


    Returns:
        Needleman-Wunsch measure (float)


    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> needleman_wunsch('dva', 'deeva')
        1.0
        >>> needleman_wunsch('dva', 'deeve', 0.0)
        2.0
        >>> needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
        1.0
        >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
        2.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
    # DP initialization
    for i in _range(len(string1) + 1):
        dist_mat[i, 0] = -(i * gap_cost)
    # DP initialization
    for j in _range(len(string2) + 1):
        dist_mat[0, j] = -(j * gap_cost)
    # Needleman-Wunsch DP calculation
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(match, delete, insert)
    return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
コード例 #6
0
    def get_raw_score(self, string1, string2):
        """
        Computes the bag distance between two strings.

        For two strings X and Y, the Bag distance is:
        :math:`max( |bag(string1)-bag(string2)|, |bag(string2)-bag(string1)| )`

        Args:
            string1,string2 (str): Input strings

        Returns:
            Bag distance (int)

        Raises:
            TypeError : If the inputs are not strings

        Examples:
            >>> bd = BagDistance()
            >>> bd.get_raw_score('cat', 'hat')
            1
            >>> bd.get_raw_score('Niall', 'Neil')
            2
            >>> bd.get_raw_score('aluminum', 'Catalan')
            5
            >>> bd.get_raw_score('ATCG', 'TAGC')
            0
            >>> bd.get_raw_score('abcde', 'xyz')
            5

        References:
            * http://www.icmlc.org/icmlc2011/018_icmlc2011.pdf
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        len_str1 = len(string1)
        len_str2 = len(string2)

        if len_str1 == 0:
            return len_str2

        if len_str2 == 0:
            return len_str1

        bag1 = collections.Counter(string1)
        bag2 = collections.Counter(string2)

        size1 = sum((bag1 - bag2).values())
        size2 = sum((bag2 - bag1).values())

        # returning the max of difference of sets
        return max(size1, size2)
コード例 #7
0
    def get_raw_score(self, string1, string2):
        """
        Computes the bag distance between two strings.

        For two strings X and Y, the Bag distance is:
        :math:`max( |bag(string1)-bag(string2)|, |bag(string2)-bag(string1)| )`

        Args:
            string1,string2 (str): Input strings

        Returns:
            Bag distance (int)

        Raises:
            TypeError : If the inputs are not strings

        Examples:
            >>> bd = BagDistance()
            >>> bd.get_raw_score('cat', 'hat')
            1
            >>> bd.get_raw_score('Niall', 'Neil')
            2
            >>> bd.get_raw_score('aluminum', 'Catalan')
            5
            >>> bd.get_raw_score('ATCG', 'TAGC')
            0
            >>> bd.get_raw_score('abcde', 'xyz')
            5

        References:
            * String Matching with Metric Trees Using an Approximate Distance: http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        len_str1 = len(string1)
        len_str2 = len(string2)

        if len_str1 == 0:
            return len_str2

        if len_str2 == 0:
            return len_str1

        bag1 = collections.Counter(string1)
        bag2 = collections.Counter(string2)

        size1 = sum((bag1 - bag2).values())
        size2 = sum((bag2 - bag1).values())

        # returning the max of difference of sets
        return max(size1, size2)
コード例 #8
0
    def get_raw_score(self,
                      string1,
                      string2,
                      force_ascii=True,
                      full_process=True):
        """
        Computes the Fuzzy Wuzzy token sort measure raw score between two strings.
        This score is in the range [0,100].

        Args:
            string1,string2 (str), : Input strings
            force_ascii (boolean) : Flag to remove non-ascii characters or not
            full_process (boolean) : Flag to process the string or not. Processing includes
            removing non alphanumeric characters, converting string to lower case and 
            removing leading and trailing whitespaces.

        Returns:
            Token Sort measure raw score (int) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = TokenSort()
            >>> s.get_raw_score('great is scala', 'java is great')
            81
            >>> s.get_raw_score('Sue', 'sue')
            100
            >>> s.get_raw_score('C++ and Java', 'Java and Python')
            64

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        sorted1 = self._process_string_and_sort(string1,
                                                force_ascii,
                                                full_process=full_process)
        sorted2 = self._process_string_and_sort(string2,
                                                force_ascii,
                                                full_process=full_process)
        ratio = Ratio()
        return ratio.get_raw_score(sorted1, sorted2)
コード例 #9
0
def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Smith-Waterman measure between two strings.

    The Smith–Waterman algorithm performs local sequence alignment; that is, for determining similar regions
    between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of
    all possible lengths and optimizes the similarity measure.


    Args:
        string1,string2 (str) : Input strings

        gap_cost (float) : Cost of gap (defaults to 1.0)

        sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults
            to an identity function, where if two characters are same it returns 1 else returns 0.

    Returns:
        Smith-Waterman measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> smith_waterman('cat', 'hat')
        2.0
        >>> smith_waterman('dva', 'deeve', 2.2)
        1.0
        >>> smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2 : (2 if s1 == s2 else -1))
        2.0
        >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
        6.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
    max_value = 0
    # Smith Waterman DP calculations
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1],
                                                       string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(0, match, delete, insert)
            max_value = max(max_value, dist_mat[i, j])
    return max_value
コード例 #10
0
def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Smith-Waterman measure between two strings.

    The Smith–Waterman algorithm performs local sequence alignment; that is, for determining similar regions
    between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of
    all possible lengths and optimizes the similarity measure.


    Args:
        string1,string2 (str) : Input strings

        gap_cost (float) : Cost of gap (defaults to 1.0)

        sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults
            to an identity function, where if two characters are same it returns 1 else returns 0.

    Returns:
        Smith-Waterman measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> smith_waterman('cat', 'hat')
        2.0
        >>> smith_waterman('dva', 'deeve', 2.2)
        1.0
        >>> smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2 : (2 if s1 == s2 else -1))
        2.0
        >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
        6.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
    max_value = 0
    # Smith Waterman DP calculations
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(0, match, delete, insert)
            max_value = max(max_value, dist_mat[i, j])
    return max_value
コード例 #11
0
    def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
        """
        Computes the Fuzzy Wuzzy partial token sort measure raw score between two strings.
        This score is in the range [0,100].

        Args:
            string1,string2 (str), : Input strings
            force_ascii (boolean) : Flag to remove non-ascii characters or not
            full_process (boolean) : Flag to process the string or not. Processing includes
            removing non alphanumeric characters, converting string to lower case and 
            removing leading and trailing whitespaces.

        Returns:
            Partial Token Sort measure raw score (int) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = PartialTokenSort()
            >>> s.get_raw_score('great is scala', 'java is great')
            81
            >>> s.get_raw_score('Sue', 'sue')
            100
            >>> s.get_raw_score('C++ and Java', 'Java and Python')
            64

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        sorted1 = self._process_string_and_sort(string1, force_ascii, full_process=full_process)
        sorted2 = self._process_string_and_sort(string2, force_ascii, full_process=full_process)
        partialRatio = PartialRatio()
        return partialRatio.get_raw_score(sorted1, sorted2)
コード例 #12
0
    def get_raw_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy ratio measure raw score between two strings.
        This score is in the range [0,100].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Ratio measure raw score (int) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = Ratio()
            >>> s.get_raw_score('Robert', 'Rupert')
            67
            >>> s.get_raw_score('Sue', 'sue')
            67
            >>> s.get_raw_score('example', 'samples')
            71

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        sm = SequenceMatcher(None, string1, string2)
        return int(round(100 * sm.ratio()))
コード例 #13
0
    def get_sim_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy ratio similarity score between two strings.
        This score is in the range [0,1].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Ratio measure similarity score (float) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = Ratio()
            >>> s.get_sim_score('Robert', 'Rupert')
            0.67
            >>> s.get_sim_score('Sue', 'sue')
            0.67
            >>> s.get_sim_score('example', 'samples')
            0.71

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        raw_score = 1.0 * self.get_raw_score(string1, string2)
        sim_score = raw_score / 100
        return sim_score
コード例 #14
0
    def get_sim_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy partial ratio similarity score between two strings.
        This score is in the range [0,1].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Partial Ratio measure similarity score (float) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = PartialRatio()
            >>> s.get_sim_score('Robert Rupert', 'Rupert')
            1.0
            >>> s.get_sim_score('Sue', 'sue')
            0.67
            >>> s.get_sim_score('example', 'samples')
            0.86
        
        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        raw_score = 1.0 * self.get_raw_score(string1, string2)
        sim_score = raw_score / 100
        return sim_score
コード例 #15
0
    def get_raw_score(self, string1, string2):
        """
        Computes the editex distance between two strings.

        As described on pages 3 & 4 of
        Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons from
        information retrieval. In: Proceedings of the ACM-SIGIR Conference on
        Research and Development in Information Retrieval, Zurich, Switzerland.
        166–173. http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf

        The local variant is based on
        Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding ‘Lucy in
        Disguise’: The Misheard Lyric Matching Problem. In: Proceedings of the 5th
        Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
        http://www.seg.rmit.edu.au/research/download.php?manuscript=404

        Args:
            string1,string2 (str): Input strings

        Returns:
            Editex distance (int)

        Raises:
            TypeError : If the inputs are not strings

        Examples:
            >>> ed = Editex()
            >>> ed.get_raw_score('cat', 'hat')
            2
            >>> ed.get_raw_score('Niall', 'Neil')
            2
            >>> ed.get_raw_score('aluminum', 'Catalan')
            12
            >>> ed.get_raw_score('ATCG', 'TAGC')
            6

        References:
            * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py

        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        # convert both the strings to NFKD normalized unicode
        string1 = unicodedata.normalize('NFKD', text_type(string1.upper()))
        string2 = unicodedata.normalize('NFKD', text_type(string2.upper()))

        # convert ß to SS (for Python2)
        string1 = string1.replace('ß', 'SS')
        string2 = string2.replace('ß', 'SS')

        if len(string1) == 0:
            return len(string2) * self.mismatch_cost
        if len(string2) == 0:
            return len(string1) * self.mismatch_cost

        d_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.int)
        len1 = len(string1)
        len2 = len(string2)
        string1 = ' ' + string1
        string2 = ' ' + string2
        editex_helper = EditexHelper(self.match_cost, self.mismatch_cost,
                                     self.group_cost)

        if not self.local:
            for i in xrange(1, len1 + 1):
                d_mat[i, 0] = d_mat[i - 1, 0] + editex_helper.d_cost(
                                                    string1[i - 1], string1[i])

        for j in xrange(1, len2 + 1):
            d_mat[0, j] = d_mat[0, j - 1] + editex_helper.d_cost(string2[j - 1],
                                                                 string2[j])

        for i in xrange(1, len1 + 1):
            for j in xrange(1, len2 + 1):
                d_mat[i, j] = min(d_mat[i - 1, j] + editex_helper.d_cost(
                                                    string1[i - 1], string1[i]),
                                  d_mat[i, j - 1] + editex_helper.d_cost(
                                                    string2[j - 1], string2[j]),
                                  d_mat[i - 1, j - 1] + editex_helper.r_cost(
                                                        string1[i], string2[j]))

        return d_mat[len1, len2]
コード例 #16
0
    def get_raw_score(self, string1, string2):
        """
        Computes the Soundex phonetic similarity between two strings.

        Phonetic measure such as soundex match string based on their sound. These
        measures have been especially effective in matching names, since names are
        often spelled in different ways that sound the same. For example, Meyer, Meier,
        and Mire sound the same, as do Smith, Smithe, and Smythe.

        Soundex is used primarily to match surnames. It does not work as well for names
        of East Asian origins, because much of the discriminating power of these names
        resides in the vowel sounds, which the code ignores.

        Args:
            string1,string2 (str): Input strings

        Returns:
            Soundex similarity score (int) is returned

        Raises:
            TypeError : If the inputs are not strings

        Examples:
            >>> s = Soundex()
            >>> s.get_raw_score('Robert', 'Rupert')
            1
            >>> s.get_raw_score('Sue', 's')
            1
            >>> s.get_raw_score('Gough', 'Goff')
            0
            >>> s.get_raw_score('a,,li', 'ali')
            1

        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # remove all chars but alphanumeric characters
        string1 = re.sub("[^a-zA-Z0-9]", "", string1)
        string2 = re.sub("[^a-zA-Z0-9]", "", string2)

        utils.sim_check_for_zero_len(string1, string2)

        if utils.sim_check_for_exact_match(string1, string2):
            return 1

        string1, string2 = string1.upper(), string2.upper()
        first_letter1, first_letter2 = string1[0], string2[0]
        string1, string2 = string1[1:], string2[1:]

        # remove occurrences of vowels, 'y', 'w' and 'h'
        string1 = re.sub('[AEIOUYWH]', '', string1)
        string2 = re.sub('[AEIOUYWH]', '', string2)

        # replace (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4
        # (M,N)->5 (R)->6
        string1 = re.sub('[BFPV]', '1', string1)
        string1 = re.sub('[CGJKQSXZ]', '2', string1)
        string1 = re.sub('[DT]', '3', string1)
        string1 = re.sub('[L]', '4', string1)
        string1 = re.sub('[MN]', '5', string1)
        string1 = re.sub('[R]', '6', string1)

        string2 = re.sub('[BFPV]', '1', string2)
        string2 = re.sub('[CGJKQSXZ]', '2', string2)
        string2 = re.sub('[DT]', '3', string2)
        string2 = re.sub('[L]', '4', string2)
        string2 = re.sub('[MN]', '5', string2)
        string2 = re.sub('[R]', '6', string2)

        string1 = first_letter1 + string1[:3]
        string2 = first_letter2 + string2[:3]

        return 1 if string1 == string2 else 0
コード例 #17
0
    def get_raw_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy partial ratio measure raw score between two strings.
        This score is in the range [0,100].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Partial Ratio measure raw score (int) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = PartialRatio()
            >>> s.get_raw_score('Robert Rupert', 'Rupert')
            100
            >>> s.get_raw_score('Sue', 'sue')
            67
            >>> s.get_raw_score('example', 'samples')
            86

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        # string1 should be smaller in length than string2. If this is not the case
        # then swap string1 and string2
        if len(string1) > len(string2):
            temp = string1
            string1 = string2
            string2 = temp

        sm = SequenceMatcher(None, string1, string2)
        matching_blocks = sm.get_matching_blocks()

        scores = []
        for block in matching_blocks:
            string2_starting_index = 0
            if (block[1] - block[0] > 0):
                string2_starting_index = block[1] - block[0]
            string2_ending_index = string2_starting_index + len(string1)
            string2_substr = string2[string2_starting_index:string2_ending_index]

            sm2 = SequenceMatcher(None, string1, string2_substr)
            similarity_ratio = sm2.ratio()
            if similarity_ratio > .995:
                return 100
            else:
                scores.append(similarity_ratio)

        return int(round(100 * max(scores)))
コード例 #18
0
def soundex(string1, string2):
    """
    Computes the Soundex phonetic similarity between two strings.

    Phonetic measure such as soundex match string based on their sound. These
    measures have been especially effective in matching names, since names are
    often spelled in different ways that sound the same. For example, Meyer, Meier,
    and Mire sound the same, as do Smith, Smithe, and Smythe.

    Soundex is used primarily to match surnames. It does not work as well for names
    of East Asian origins, because much of the discriminating power of these names
    resides in the vowel sounds, which the code ignores.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Soundex similarity score (int) is returned

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> soundex('Robert', 'Rupert')
        1
        >>> soundex('Sue', 's')
        1
        >>> soundex('Gough', 'Goff')
        0
        >>> soundex('a,,li', 'ali')
        1

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 1
    utils.sim_check_for_zero_len(string1, string2)
    string1, string2 = string1.upper(), string2.upper()
    firstLetter1, firstLetter2 = string1[0], string2[0]
    string1, string2 = string1[1:], string2[1:]
    # remove occurrences of vowels, 'y', 'w' and 'h'
    string1 = re.sub('[AEIOUYWH]', '', string1)
    string2 = re.sub('[AEIOUYWH]', '', string2)

    # replace (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 (M,N)->5 (R)->6
    string1 = re.sub('[BFPV]', '1', string1)
    string1 = re.sub('[CGJKQSXZ]', '2', string1)
    string1 = re.sub('[DT]', '3', string1)
    string1 = re.sub('[L]', '4', string1)
    string1 = re.sub('[MN]', '5', string1)
    string1 = re.sub('[R]', '6', string1)

    string2 = re.sub('[BFPV]', '1', string2)
    string2 = re.sub('[CGJKQSXZ]', '2', string2)
    string2 = re.sub('[DT]', '3', string2)
    string2 = re.sub('[L]', '4', string2)
    string2 = re.sub('[MN]', '5', string2)
    string2 = re.sub('[R]', '6', string2)

    # remove all chars but digits
    string1 = re.sub("\D", "", string1)
    string2 = re.sub("\D", "", string2)

    string1 = firstLetter1 + string1[:3]
    string2 = firstLetter2 + string2[:3]
    return 1 if string1 == string2 else 0
コード例 #19
0
def editex(string1, string2, match_cost=0, group_cost=1, mismatch_cost=2, local=False):
    """
    Computes the editex distance between two strings.

    As described on pages 3 & 4 of
    Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons from
    information retrieval. In: Proceedings of the ACM-SIGIR Conference on
    Research and Development in Information Retrieval, Zurich, Switzerland.
    166–173. http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf

    The local variant is based on
    Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding ‘Lucy in
    Disguise’: The Misheard Lyric Matching Problem. In: Proceedings of the 5th
    Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
    http://www.seg.rmit.edu.au/research/download.php?manuscript=404

    Args:
        string1,string2 (str): Input strings
        match_cost (int): Weight to give the correct char match, default=0
        group_cost (int): Weight to give if the chars are in the same editex group, default=1
        mismatch_cost (int): Weight to give the incorrect char match, default=2
        local (boolean): Local variant on/off, default=False

    Returns:
        Editex distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> editex('cat', 'hat')
        2
        >>> editex('Niall', 'Neil')
        2
        >>> editex('aluminum', 'Catalan')
        12
        >>> editex('ATCG', 'TAGC')
        6

    References:
        * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0
    # convert both the strings to NFKD normalized unicode
    string1 = unicodedata.normalize('NFKD', _unicode(string1.upper()))
    string2 = unicodedata.normalize('NFKD', _unicode(string2.upper()))
    # convert ß to SS (for Python2)
    string1 = string1.replace('ß', 'SS')
    string2 = string2.replace('ß', 'SS')

    if string1 == string2:
        return 0
    if len(string1) == 0:
        return len(string2) * mismatch_cost
    if len(string2) == 0:
        return len(string1) * mismatch_cost

    d_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.int)
    len1 = len(string1)
    len2 = len(string2)
    string1 = ' ' + string1
    string2 = ' ' + string2
    editex_helper = utils.Editex(match_cost, mismatch_cost, group_cost)
    if not local:
        for i in _range(1, len1 + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + editex_helper.d_cost(string1[i - 1], string1[i])
    for j in _range(1, len2 + 1):
        d_mat[0, j] = d_mat[0, j - 1] + editex_helper.d_cost(string2[j - 1], string2[j])

    for i in _range(1, len1 + 1):
        for j in _range(1, len2 + 1):
            d_mat[i, j] = min(d_mat[i - 1, j] + editex_helper.d_cost(string1[i - 1], string1[i]),
                              d_mat[i, j - 1] + editex_helper.d_cost(string2[j - 1], string2[j]),
                              d_mat[i - 1, j - 1] + editex_helper.r_cost(string1[i], string2[j]))

    return d_mat[len1, len2]
コード例 #20
0
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6



    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0.0

    ins_cost, del_cost, sub_cost, trans_cost = (1, 1, 1, 1)

    len_str1 = len(string1)
    len_str2 = len(string2)

    if len_str1 == 0:
        return len_str2 * ins_cost

    if len_str2 == 0:
        return len_str1 * del_cost

    d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.int)

    for i in _range(len_str1 + 1):
        d_mat[i, 0] = i * del_cost

    for j in _range(len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    for i in _range(len_str1):
        for j in _range(len_str2):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,
                d_mat[i, j + 1] + del_cost,
                d_mat[i, j] + (sub_cost if string1[i] != string2[j] else 0)
            )

    return d_mat[len_str1, len_str2]