def get_sim_score(self, string1, string2):
        """Computes the normalized Hamming similarity score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Hamming similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.
            ValueError : If the input strings are not of same length.

        Examples:
            >>> hd = HammingDistance()
            >>> hd.get_sim_score('', '')
            1.0
            >>> hd.get_sim_score('alex', 'john')
            0.0
            >>> hd.get_sim_score(' ', 'a')
            0.0
            >>> hd.get_sim_score('JOHN', 'john')
            0.0
        """

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)
        
        raw_score = self.get_raw_score(string1, string2)

        common_len = len(string1)
        if common_len == 0:
            return 1.0
        return 1 - (raw_score / common_len)
    def get_raw_score(self, string1, string2):
        """Computes the raw Levenshtein distance between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Levenshtein distance (int).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_raw_score('a', '')
            1
            >>> lev.get_raw_score('example', 'samples')
            3
            >>> lev.get_raw_score('levenshtein', 'frankenstein')
            6
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        if utils.sim_check_for_exact_match(string1, string2):
            return 0.0

        return levenshtein(string1, string2)
Beispiel #3
0
    def get_sim_score(self, string1, string2):
        """Computes the normalized Levenshtein similarity score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Levenshtein similarity (float).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_sim_score('a', '')
            0.0
            >>> lev.get_sim_score('example', 'samples')
            0.5714285714285714
            >>> lev.get_sim_score('levenshtein', 'frankenstein')
            0.5

        """

        # convert input strings to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        raw_score = self.get_raw_score(string1, string2)
        max_len = max(len(string1), len(string2))
        if max_len == 0:
            return 1.0
        return 1 - (raw_score / max_len)
Beispiel #4
0
    def get_raw_score(self, string1, string2):
        """Computes the raw Levenshtein distance between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Levenshtein distance (int).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_raw_score('a', '')
            1
            >>> lev.get_raw_score('example', 'samples')
            3
            >>> lev.get_raw_score('levenshtein', 'frankenstein')
            6
        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        if utils.sim_check_for_exact_match(string1, string2):
            return 0.0

        return levenshtein(string1, string2)
    def get_sim_score(self, string1, string2):
        """Computes the normalized Levenshtein similarity score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Normalized Levenshtein similarity (float).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_sim_score('a', '')
            0.0
            >>> lev.get_sim_score('example', 'samples')
            0.5714285714285714
            >>> lev.get_sim_score('levenshtein', 'frankenstein')
            0.5

        """

        # convert input strings to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        raw_score = self.get_raw_score(string1, string2)
        max_len = max(len(string1), len(string2))
        if max_len == 0:
            return 1.0
        return 1 - (raw_score / max_len)
    def get_raw_score(self, string1, string2):
        """Computes the raw Needleman-Wunsch score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Needleman-Wunsch similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> nw = NeedlemanWunsch()
            >>> nw.get_raw_score('dva', 'deeva')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.0)
            >>> nw.get_raw_score('dva', 'deeve')
            2.0
            >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('dva', 'deeve')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
            2.5
        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
                            dtype=np.float)

        # DP initialization
        for i in xrange(len(string1) + 1):
            dist_mat[i, 0] = -(i * self.gap_cost)

        # DP initialization
        for j in xrange(len(string2) + 1):
            dist_mat[0, j] = -(j * self.gap_cost)

        # Needleman-Wunsch DP calculation
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(
                    string1[i - 1], string2[j - 1])
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(match, delete, insert)

        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
    def get_raw_score(self, string1, string2):
        """Computes the raw Smith-Waterman score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Smith-Waterman similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> sw = SmithWaterman()
            >>> sw.get_raw_score('cat', 'hat')
            2.0
            >>> sw = SmithWaterman(gap_cost=2.2)
            >>> sw.get_raw_score('dva', 'deeve')
            1.0
            >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
            >>> sw.get_raw_score('dva', 'deeve')
            2.0
            >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
            >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
            6.5
        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
                            dtype=np.float)
        max_value = 0
        # Smith Waterman DP calculations
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(
                    string1[i - 1], string2[j - 1])
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(0, match, delete, insert)
                max_value = max(max_value, dist_mat[i, j])

        return max_value
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro-Winkler score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro-Winkler similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_raw_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_raw_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_raw_score('DIXON', 'DICKSONX')
            0.8133333333333332

        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        jw_score = Jaro().get_raw_score(string1, string2)
        min_len = min(len(string1), len(string2))

        # prefix length can be at max 4
        j = min(min_len, 4)
        i = 0
        while i < j and string1[i] == string2[i] and string1[i]:
            i += 1

        if i:
            jw_score += i * self.prefix_weight * (1 - jw_score)

        return jw_score
    def get_raw_score(self, string1, string2):
        """Computes the raw hamming distance between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Hamming distance (int).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.
            ValueError : If the input strings are not of same length.

        Examples:
            >>> hd = HammingDistance()
            >>> hd.get_raw_score('', '')
            0
            >>> hd.get_raw_score('alex', 'john')
            4
            >>> hd.get_raw_score(' ', 'a')
            1
            >>> hd.get_raw_score('JOHN', 'john')
            4
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # for Hamming Distance string length should be same
        utils.sim_check_for_same_len(string1, string2)

        # sum all the mismatch characters at the corresponding index of
        # input strings
        return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2))
    def get_raw_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy ratio measure raw score between two strings.
        This score is in the range [0,100].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Ratio measure raw score (int) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = Ratio()
            >>> s.get_raw_score('Robert', 'Rupert')
            67
            >>> s.get_raw_score('Sue', 'sue')
            67
            >>> s.get_raw_score('example', 'samples')
            71

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        sm = SequenceMatcher(None, string1, string2)
        return int(round(100 * sm.ratio()))
Beispiel #11
0
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score betwen the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return affine(string1, string2, self.gap_start, self.gap_continuation,
                      self.sim_func)
    def get_raw_score(self, string1, string2):
        """Computes the raw Needleman-Wunsch score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Needleman-Wunsch similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> nw = NeedlemanWunsch()
            >>> nw.get_raw_score('dva', 'deeva')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.0)
            >>> nw.get_raw_score('dva', 'deeve')
            2.0
            >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('dva', 'deeve')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
            2.5
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # returns the similarity score from the cython function
        return needleman_wunsch(string1, string2, self.gap_cost, self.sim_func)
    def get_raw_score(self, string1, string2):
        """Computes the raw Smith-Waterman score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Smith-Waterman similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> sw = SmithWaterman()
            >>> sw.get_raw_score('cat', 'hat')
            2.0
            >>> sw = SmithWaterman(gap_cost=2.2)
            >>> sw.get_raw_score('dva', 'deeve')
            1.0
            >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
            >>> sw.get_raw_score('dva', 'deeve')
            2.0
            >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
            >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
            6.5
        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # Returns smith waterman similarity score from cython function
        return smith_waterman(string1, string2, self.gap_cost, self.sim_func)
    def get_raw_score(self, string1, string2):
        """Computes the raw Smith-Waterman score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Smith-Waterman similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> sw = SmithWaterman()
            >>> sw.get_raw_score('cat', 'hat')
            2.0
            >>> sw = SmithWaterman(gap_cost=2.2)
            >>> sw.get_raw_score('dva', 'deeve')
            1.0
            >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
            >>> sw.get_raw_score('dva', 'deeve')
            2.0
            >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
            >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
            6.5
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # Returns smith waterman similarity score from cython function
        return smith_waterman(string1,string2,self.gap_cost,self.sim_func)
Beispiel #15
0
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score betwen the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return affine(string1, string2, self.gap_start, self.gap_continuation, self.sim_func)
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return jaro(string1, string2)
Beispiel #17
0
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro-Winkler score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro-Winkler similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_raw_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_raw_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_raw_score('DIXON', 'DICKSONX')
            0.8133333333333332

        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return jaro_winkler(string1, string2, self.prefix_weight)
Beispiel #18
0
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return jaro(string1, string2)
Beispiel #19
0
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score betwen the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        gap_start = -self.gap_start
        gap_continuation = -self.gap_continuation
        m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
        x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
        y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)

        # DP initialization
        for i in xrange(1, len(string1) + 1):
            m[i][0] = -float("inf")
            x[i][0] = gap_start + (i - 1) * gap_continuation
            y[i][0] = -float("inf")

        # DP initialization
        for j in xrange(1, len(string2) + 1):
            m[0][j] = -float("inf")
            x[0][j] = -float("inf")
            y[0][j] = gap_start + (j - 1) * gap_continuation

        # affine gap calculation using DP
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                # best score between x_1....x_i and y_1....y_j
                # given that x_i is aligned to y_j
                m[i][j] = (
                    self.sim_func(string1[i - 1], string2[j - 1]) +
                    max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1]))

                # the best score given that x_i is aligned to a gap
                x[i][j] = max(gap_start + m[i - 1][j],
                              gap_continuation + x[i - 1][j])

                # the best score given that y_j is aligned to a gap
                y[i][j] = max(gap_start + m[i][j - 1],
                              gap_continuation + y[i][j - 1])

        return max(m[len(string1)][len(string2)],
                   x[len(string1)][len(string2)],
                   y[len(string1)][len(string2)])
    def get_raw_score(self, string1, string2):
        """
        Computes the Fuzzy Wuzzy partial ratio measure raw score between two strings.
        This score is in the range [0,100].

        Args:
            string1,string2 (str): Input strings

        Returns:
            Partial Ratio measure raw score (int) is returned

        Raises:
            TypeError: If the inputs are not strings

        Examples:
            >>> s = PartialRatio()
            >>> s.get_raw_score('Robert Rupert', 'Rupert')
            100
            >>> s.get_raw_score('Sue', 'sue')
            67
            >>> s.get_raw_score('example', 'samples')
            86

        References:
            * https://pypi.python.org/pypi/fuzzywuzzy
        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        # string1 should be smaller in length than string2. If this is not the case
        # then swap string1 and string2
        if len(string1) > len(string2):
            temp = string1
            string1 = string2
            string2 = temp

        sm = SequenceMatcher(None, string1, string2)
        matching_blocks = sm.get_matching_blocks()

        scores = []
        for block in matching_blocks:
            string2_starting_index = 0
            if (block[1] - block[0] > 0):
                string2_starting_index = block[1] - block[0]
            string2_ending_index = string2_starting_index + len(string1)
            string2_substr = string2[string2_starting_index:string2_ending_index]

            sm2 = SequenceMatcher(None, string1, string2_substr)
            similarity_ratio = sm2.ratio()
            if similarity_ratio > .995:
                return 100
            else:
                scores.append(similarity_ratio)

        return int(round(100 * max(scores)))
Beispiel #21
0
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        len_s1 = len(string1)
        len_s2 = len(string2)

        max_len = max(len_s1, len_s2)
        search_range = (max_len // 2) - 1
        if search_range < 0:
            search_range = 0

        flags_s1 = [False] * len_s1
        flags_s2 = [False] * len_s2

        common_chars = 0
        for i, ch_s1 in enumerate(string1):
            low = i - search_range if i > search_range else 0
            high = i + search_range if i + search_range < len_s2 else len_s2 - 1
            for j in xrange(low, high + 1):
                if not flags_s2[j] and string2[j] == ch_s1:
                    flags_s1[i] = flags_s2[j] = True
                    common_chars += 1
                    break

        if not common_chars:
            return 0

        k = trans_count = 0
        for i, f_s1 in enumerate(flags_s1):
            if f_s1:
                for j in xrange(k, len_s2):
                    if flags_s2[j]:
                        k = j + 1
                        break
                if string1[i] != string2[j]:
                    trans_count += 1

        trans_count /= 2
        common_chars = float(common_chars)
        weight = ((common_chars / len_s1 + common_chars / len_s2 +
                   (common_chars - trans_count) / common_chars)) / 3
        return weight
Beispiel #22
0
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        len_s1 = len(string1)
        len_s2 = len(string2)

        max_len = max(len_s1, len_s2)
        search_range = (max_len // 2) - 1
        if search_range < 0:
            search_range = 0

        flags_s1 = [False] * len_s1
        flags_s2 = [False] * len_s2

        common_chars = 0
        for i, ch_s1 in enumerate(string1):
            low = i - search_range if i > search_range else 0
            high = i + search_range if i + search_range < len_s2 else len_s2 - 1
            for j in xrange(low, high + 1):
                if not flags_s2[j] and string2[j] == ch_s1:
                    flags_s1[i] = flags_s2[j] = True
                    common_chars += 1
                    break

        if not common_chars:
            return 0

        k = trans_count = 0
        for i, f_s1 in enumerate(flags_s1):
            if f_s1:
                for j in xrange(k, len_s2):
                    if flags_s2[j]:
                        k = j + 1
                        break
                if string1[i] != string2[j]:
                    trans_count += 1

        trans_count /= 2
        common_chars = float(common_chars)
        weight = ((common_chars / len_s1 + common_chars / len_s2 +
                   (common_chars - trans_count) / common_chars)) / 3
        return weight
Beispiel #23
0
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score betwen the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        gap_start = -self.gap_start
        gap_continuation = -self.gap_continuation
        m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
        x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
        y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)

        # DP initialization
        for i in xrange(1, len(string1) + 1):
            m[i][0] = -float("inf")
            x[i][0] = gap_start + (i - 1) * gap_continuation
            y[i][0] = -float("inf")

        # DP initialization
        for j in xrange(1, len(string2) + 1):
            m[0][j] = -float("inf")
            x[0][j] = -float("inf")
            y[0][j] = gap_start + (j - 1) * gap_continuation

        # affine gap calculation using DP
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                # best score between x_1....x_i and y_1....y_j
                # given that x_i is aligned to y_j
                m[i][j] = (self.sim_func(string1[i - 1], string2[j - 1]) +
                           max(m[i - 1][j - 1], x[i - 1][j - 1],
                               y[i - 1][j - 1]))

                # the best score given that x_i is aligned to a gap
                x[i][j] = max(gap_start + m[i - 1][j],
                              gap_continuation + x[i - 1][j])

                # the best score given that y_j is aligned to a gap
                y[i][j] = max(gap_start + m[i][j - 1],
                              gap_continuation + y[i][j - 1])

        return max(m[len(string1)][len(string2)], x[len(string1)][len(string2)],
                   y[len(string1)][len(string2)])
def bert_embed(bert_model, string):
    with torch.no_grad():
        return bert_model.encode(utils.convert_to_unicode(string))