Code Example #1
def delimiter(input_string, delim_str=' '):
    """
    Tokenizes input string based on the given delimiter.

    Args:
        input_string (str): Input string

        delim_str (str): Delimiter string


    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> delimiter('data science')
        ['data', 'science']
        >>> delimiter('data$#$science', '$#$')
        ['data', 'science']
        >>> delimiter('data science', ',')
        ['data science']

    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    return input_string.split(delim_str)
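
For reference, here is a minimal self-contained sketch of the same behaviour using only the standard library; the utils validation helpers used above are not shown in this excerpt, and the name delimiter_sketch is purely illustrative, not part of py_stringmatching.

# Hypothetical standalone sketch: delimiter tokenization reduces to str.split
# plus a type check. Not the library's code.
def delimiter_sketch(input_string, delim_str=' '):
    if not isinstance(input_string, str):
        raise TypeError('Input is expected to be a string')
    return input_string.split(delim_str)

assert delimiter_sketch('data science') == ['data', 'science']
assert delimiter_sketch('data$#$science', '$#$') == ['data', 'science']
assert delimiter_sketch('data science', ',') == ['data science']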
Code Example #2
def whitespace(input_string):
    """
    Tokenizes input string based on white space.

    Args:
        input_string (str): Input string

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> whitespace('data science')
        ['data', 'science']
        >>> whitespace('data        science')
        ['data', 'science']
        >>> whitespace('data\tscience')
        ['data', 'science']

    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    return input_string.split()
Code Example #4
    def get_raw_score(self, string1, string2):
        """Computes the raw Levenshtein distance between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Levenshtein distance (int).

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_raw_score('a', '')
            1
            >>> lev.get_raw_score('example', 'samples')
            3
            >>> lev.get_raw_score('levenshtein', 'frankenstein')
            6
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        return levenshtein(string1, string2)
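
The levenshtein call above refers to a compiled helper that is not part of this excerpt. As a rough illustration of what it computes, here is a minimal pure-Python sketch of the standard dynamic-programming recurrence (not the library's implementation; the name is hypothetical):

# Illustrative two-row DP for Levenshtein distance.
def levenshtein_sketch(s1, s2):
    prev = list(range(len(s2) + 1))           # distances from the empty prefix of s1
    for i, c1 in enumerate(s1, 1):
        curr = [i]                            # distance of s1[:i] vs the empty string
        for j, c2 in enumerate(s2, 1):
            cost = 0 if c1 == c2 else 1
            curr.append(min(prev[j] + 1,          # deletion
                            curr[j - 1] + 1,      # insertion
                            prev[j - 1] + cost))  # substitution / match
        prev = curr
    return prev[-1]

assert levenshtein_sketch('a', '') == 1
assert levenshtein_sketch('example', 'samples') == 3
assert levenshtein_sketch('levenshtein', 'frankenstein') == 6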
Code Example #5
    def tokenize(self, input_string):
        """Tokenizes input string into alphanumeric tokens.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which represents a set of tokens if the flag return_set is true, and a bag of tokens otherwise. 

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> alnum_tok = AlphanumericTokenizer()
            >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
            ['data9', 'science', 'data9', 'integration', '88']
            >>> alnum_tok.tokenize('#.&')
            []
            >>> alnum_tok = AlphanumericTokenizer(return_set=True) 
            >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
            ['data9', 'science', 'integration', '88']
                      
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        token_list = list(filter(None,
                                 self.__alnum_regex.findall(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
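
The __alnum_regex attribute is built in the tokenizer's constructor, which is not included in this excerpt. A pattern along the lines of [a-zA-Z0-9]+ reproduces the documented outputs; the snippet below is an assumption-based sketch, not the library's actual definition.

import re

# Assumed pattern; the real regex lives in AlphanumericTokenizer.__init__.
_alnum_regex = re.compile('[a-zA-Z0-9]+')

assert _alnum_regex.findall('data9,(science), data9#.(integration).88') == \
    ['data9', 'science', 'data9', 'integration', '88']
assert _alnum_regex.findall('#.&') == []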
Code Example #7
    def tokenize(self, input_string):
        """Tokenizes input string into alphabetical tokens.
        
        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise. 

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> al_tok = AlphabeticTokenizer()
            >>> al_tok.tokenize('data99science, data#integration.')
            ['data', 'science', 'data', 'integration']
            >>> al_tok.tokenize('99')
            []
            >>> al_tok = AlphabeticTokenizer(return_set=True) 
            >>> al_tok.tokenize('data99science, data#integration.')
            ['data', 'science', 'integration']
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        token_list = list(filter(None, self.__al_regex.findall(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
Code Example #8
def delimiter(input_string, delim_str=' '):
    """
    Tokenizes input string based on the given delimiter.

    Args:
        input_string (str): Input string

        delim_str (str): Delimiter string


    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> delimiter('data science')
        ['data', 'science']
        >>> delimiter('data$#$science', '$#$')
        ['data', 'science']
        >>> delimiter('data science', ',')
        ['data science']

    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    return input_string.split(delim_str)
Code Example #9
    def get_raw_score(self, string1, string2):
        """Computes the raw Needleman-Wunsch score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Needleman-Wunsch similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> nw = NeedlemanWunsch()
            >>> nw.get_raw_score('dva', 'deeva')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.0)
            >>> nw.get_raw_score('dva', 'deeve')
            2.0
            >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('dva', 'deeve')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
            2.5
        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
                            dtype=np.float64)

        # DP initialization
        for i in xrange(len(string1) + 1):
            dist_mat[i, 0] = -(i * self.gap_cost)

        # DP initialization
        for j in xrange(len(string2) + 1):
            dist_mat[0, j] = -(j * self.gap_cost)

        # Needleman-Wunsch DP calculation
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(
                    string1[i - 1], string2[j - 1])
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(match, delete, insert)

        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
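
As a sanity check on the recurrence above, the first documented value can be reproduced by hand, assuming the defaults are a gap cost of 1.0 and an identity sim_func that scores 1 for a match and 0 otherwise (an assumption consistent with the doctest output):

# Hand trace for nw.get_raw_score('dva', 'deeva') == 1.0 under those assumptions:
#
#   d  -  -  v  a     (align 'dva' against 'deeva' with two gaps)
#   d  e  e  v  a
#  +1 -1 -1 +1 +1  ->  total = 1.0
#
# Every alignment needs at least two gaps (the length difference) and has at most
# three matches, so 3 - 2 = 1.0 is indeed the maximum.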
Code Example #10
    def tokenize(self, input_string):
        """Tokenizes input string into qgrams.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list, which is a set or a bag of qgrams, depending on whether return_set flag is True or False. 

        Raises:
            TypeError : If the input is not a string

        Examples:
            >>> qg2_tok = QgramTokenizer()
            >>> qg2_tok.tokenize('database')
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
            >>> qg2_tok.tokenize('a')
            ['#a', 'a$']
            >>> qg3_tok = QgramTokenizer(qval=3)
            >>> qg3_tok.tokenize('database')
            ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
            >>> qg3_nopad = QgramTokenizer(padding=False)
            >>> qg3_nopad.tokenize('database')
            ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
            >>> qg3_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
            >>> qg3_diffpads.tokenize('database')
            ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
                      
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        qgram_list = []

        # If the padding flag is set to true, add q-1 "prefix_pad" characters
        # in front of the input string and  add q-1 "suffix_pad" characters at
        # the end of the input string.
        if self.padding:
            input_string = (self.prefix_pad * (self.qval - 1)) + input_string \
                           + (self.suffix_pad * (self.qval - 1))

        if len(input_string) < self.qval:
            return qgram_list

        qgram_list = [
            input_string[i:i + self.qval]
            for i in xrange(len(input_string) - (self.qval - 1))
        ]
        qgram_list = list(filter(None, qgram_list))

        if self.return_set:
            return utils.convert_bag_to_set(qgram_list)

        return qgram_list
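
The padding step is plain string concatenation followed by a sliding window. The following self-contained sketch mirrors the logic above with illustrative names; it is not the library's API.

# Illustrative q-gram extraction with optional padding.
def qgrams_sketch(s, q=2, prefix_pad='#', suffix_pad='$', padding=True):
    if padding:
        s = prefix_pad * (q - 1) + s + suffix_pad * (q - 1)
    if len(s) < q:
        return []
    return [s[i:i + q] for i in range(len(s) - q + 1)]

assert qgrams_sketch('database') == \
    ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
assert qgrams_sketch('a') == ['#a', 'a$']
assert qgrams_sketch('database', padding=False) == \
    ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']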
Code Example #11
    def tokenize(self, input_string):
        """Tokenizes input string into qgrams.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list, which is a set or a bag of qgrams, depending on whether return_set flag is True or False. 

        Raises:
            TypeError : If the input is not a string

        Examples:
            >>> qg2_tok = QgramTokenizer()
            >>> qg2_tok.tokenize('database')
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
            >>> qg2_tok.tokenize('a')
            ['#a', 'a$']
            >>> qg3_tok = QgramTokenizer(qval=3)
            >>> qg3_tok.tokenize('database')
            ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
            >>> qg3_nopad = QgramTokenizer(padding=False)
            >>> qg3_nopad.tokenize('database')
            ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
            >>> qg3_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
            >>> qg3_diffpads.tokenize('database')
            ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
                      
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        qgram_list = []

        # If the padding flag is set to true, add q-1 "prefix_pad" characters
        # in front of the input string and  add q-1 "suffix_pad" characters at
        # the end of the input string.
        if self.padding:
            input_string = (self.prefix_pad * (self.qval - 1)) + input_string \
                           + (self.suffix_pad * (self.qval - 1))

        if len(input_string) < self.qval:
            return qgram_list

        qgram_list = [input_string[i:i + self.qval] for i in
                      xrange(len(input_string) - (self.qval - 1))]
        qgram_list = list(filter(None, qgram_list))

        if self.return_set:
            return utils.convert_bag_to_set(qgram_list)

        return qgram_list
Code Example #12
    def get_raw_score(self, string1, string2):
        """Computes the raw Smith-Waterman score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Smith-Waterman similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> sw = SmithWaterman()
            >>> sw.get_raw_score('cat', 'hat')
            2.0
            >>> sw = SmithWaterman(gap_cost=2.2)
            >>> sw.get_raw_score('dva', 'deeve')
            1.0
            >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
            >>> sw.get_raw_score('dva', 'deeve')
            2.0
            >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
            >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
            6.5
        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        dist_mat = np.zeros((len(string1) + 1, len(string2) + 1),
                            dtype=np.float64)
        max_value = 0
        # Smith Waterman DP calculations
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(
                    string1[i - 1], string2[j - 1])
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(0, match, delete, insert)
                max_value = max(max_value, dist_mat[i, j])

        return max_value
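
A quick hand check of the first documented value, assuming the defaults are a gap cost of 1.0 and a 1/0 identity sim_func (an assumption consistent with the doctest output):

# sw.get_raw_score('cat', 'hat') == 2.0: the best local alignment is 'at' vs 'at'
# (two matches, no gaps), so the largest cell in dist_mat is 1 + 1 = 2.0.
# The leading 'c'/'h' mismatch is simply dropped, because Smith-Waterman clamps
# every cell at 0 and reports the maximum over the whole matrix.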
Code Example #13
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro-Winkler score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro-Winkler similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_raw_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_raw_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_raw_score('DIXON', 'DICKSONX')
            0.8133333333333332

        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        jw_score = Jaro().get_raw_score(string1, string2)
        min_len = min(len(string1), len(string2))

        # prefix length can be at max 4
        j = min(min_len, 4)
        i = 0
        while i < j and string1[i] == string2[i] and string1[i]:
            i += 1

        if i:
            jw_score += i * self.prefix_weight * (1 - jw_score)

        return jw_score
Code Example #14
def jaro_winkler(string1, string2, prefix_weight=0.1):
    """
    Computes the Jaro-Winkler measure between two strings.

    The Jaro-Winkler measure is designed to capture cases where two strings have a low Jaro score, but share a prefix
    and thus are likely to match.


    Args:
        string1,string2 (str): Input strings

        prefix_weight (float): Weight to give the prefix (defaults to 0.1)

    Returns:
        Jaro-Winkler measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.


    Examples:
        >>> jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> jaro_winkler('DWAYNE', 'DUANE')
        0.84
        >>> jaro_winkler('DIXON', 'DICKSONX')
        0.8133333333333332

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)
    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    jw_score = jaro(string1, string2)
    min_len = min(len(string1), len(string2))
    # prefix length can be at max 4
    j = min(min_len, 4)
    i = 0
    while i < j and string1[i] == string2[i] and string1[i]:
        i += 1
    if i:
        jw_score += i * prefix_weight * (1 - jw_score)
    return jw_score
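
The prefix boost in the last few lines can be checked by hand against the first documented value:

# jaro_winkler('MARTHA', 'MARHTA') == 0.9611...:
# the plain Jaro score for this pair is 0.9444... (see the jaro examples further
# below), the shared prefix 'MAR' has length 3 (capped at 4), and prefix_weight
# defaults to 0.1, so
#   jw = 0.9444... + 3 * 0.1 * (1 - 0.9444...) = 0.9611...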
Code Example #16
def qgram(input_string, qval=2):
    """
    Tokenizes input string into q-grams.

    A q-gram is a sequence of q consecutive characters within the input string. Q-grams are also known as n-grams and
    k-grams.

    Args:
        input_string (str): Input string

        qval (int): Q-gram length (defaults to 2)

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qgram('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qgram('a')
        []
        >>> qgram('database', 3)
        ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']


    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    if len(input_string) < qval or qval < 1:
        return qgram_list

    qgram_list = [
        input_string[i:i + qval]
        for i in _range(len(input_string) - (qval - 1))
    ]
    return qgram_list
Code Example #17
    def tokenize(self, input_string):
        """Tokenizes input string based on the set of delimiters.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list which is a set or a bag of tokens, depending on whether return_set flag is set to True or False. 

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> delim_tok = DelimiterTokenizer() 
            >>> delim_tok.tokenize('data science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer(['$#$']) 
            >>> delim_tok.tokenize('data$#$science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer([',', '.']) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'data', 'integration']
            >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'integration']

        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)
    
        if self.__use_split:
            token_list = list(filter(None,
                                     input_string.split(self.__delim_str)))
        else:
            token_list = list(filter(None,
                                     self.__delim_regex.split(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
Code Example #18
    def tokenize(self, input_string):
        """Tokenizes input string based on the set of delimiters.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list which is a set or a bag of tokens, depending on whether return_set flag is set to True or False. 

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> delim_tok = DelimiterTokenizer() 
            >>> delim_tok.tokenize('data science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer(['$#$']) 
            >>> delim_tok.tokenize('data$#$science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer([',', '.']) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'data', 'integration']
            >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'integration']

        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        if self.__use_split:
            token_list = list(
                filter(None, input_string.split(self.__delim_str)))
        else:
            token_list = list(
                filter(None, self.__delim_regex.split(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
Code Example #19
    def tokenize(self, input_string):
        """Tokenizes input string into numeric tokens.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which represents a set of tokens if the flag return_set is true, and a bag of tokens otherwise. 

        Raises:
            TypeError : If the input is not a string.
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        token_list = list(filter(None, self.__num_regex.findall(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
Code Example #20
    def get_raw_score(self, string1, string2):
        """Computes the raw hamming distance between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Hamming distance (int).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.
            ValueError : If the input strings are not of same length.

        Examples:
            >>> hd = HammingDistance()
            >>> hd.get_raw_score('', '')
            0
            >>> hd.get_raw_score('alex', 'john')
            4
            >>> hd.get_raw_score(' ', 'a')
            1
            >>> hd.get_raw_score('JOHN', 'john')
            4
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # for Hamming Distance string length should be same
        utils.sim_check_for_same_len(string1, string2)

        # sum all the mismatch characters at the corresponding index of
        # input strings
        return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2))
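
The ord-based expression above counts mismatching positions. An equivalent, arguably more direct formulation, shown only as an illustration and not as the library's code:

# Equivalent sketch: count positions where the characters differ.
def hamming_sketch(s1, s2):
    if len(s1) != len(s2):
        raise ValueError('Strings must be of the same length')
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

assert hamming_sketch('alex', 'john') == 4
assert hamming_sketch(' ', 'a') == 1
assert hamming_sketch('JOHN', 'john') == 4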
Code Example #21
def qgram(input_string, qval=2):
    """
    Tokenizes input string into q-grams.

    A q-gram is a sequence of q consecutive characters within the input string. Q-grams are also known as n-grams and
    k-grams.

    Args:
        input_string (str): Input string

        qval (int): Q-gram length (defaults to 2)

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qgram('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qgram('a')
        []
        >>> qgram('database', 3)
        ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']


    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    if len(input_string) < qval or qval < 1:
        return qgram_list

    qgram_list = [input_string[i:i + qval] for i in _range(len(input_string) - (qval - 1))]
    return qgram_list
Code Example #22
    def get_raw_score(self, string1, string2):
        """Computes the raw Smith-Waterman score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Smith-Waterman similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> sw = SmithWaterman()
            >>> sw.get_raw_score('cat', 'hat')
            2.0
            >>> sw = SmithWaterman(gap_cost=2.2)
            >>> sw.get_raw_score('dva', 'deeve')
            1.0
            >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
            >>> sw.get_raw_score('dva', 'deeve')
            2.0
            >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
            >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
            6.5
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # Returns smith waterman similarity score from cython function
        return smith_waterman(string1,string2,self.gap_cost,self.sim_func)
Code Example #23
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score between the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return affine(string1, string2, self.gap_start, self.gap_continuation,
                      self.sim_func)
Code Example #24
    def get_raw_score(self, string1, string2):
        """Computes the raw Needleman-Wunsch score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Needleman-Wunsch similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> nw = NeedlemanWunsch()
            >>> nw.get_raw_score('dva', 'deeva')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.0)
            >>> nw.get_raw_score('dva', 'deeve')
            2.0
            >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('dva', 'deeve')
            1.0
            >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
            >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
            2.5
        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # returns the similarity score from the cython function
        return needleman_wunsch(string1, string2, self.gap_cost, self.sim_func)
Code Example #25
    def get_raw_score(self, string1, string2):
        """Computes the raw Smith-Waterman score between two strings.

        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Smith-Waterman similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> sw = SmithWaterman()
            >>> sw.get_raw_score('cat', 'hat')
            2.0
            >>> sw = SmithWaterman(gap_cost=2.2)
            >>> sw.get_raw_score('dva', 'deeve')
            1.0
            >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
            >>> sw.get_raw_score('dva', 'deeve')
            2.0
            >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
            >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
            6.5
        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # Returns smith waterman similarity score from cython function
        return smith_waterman(string1, string2, self.gap_cost, self.sim_func)
Code Example #26
def hamming_distance(string1, string2):
    """
    Computes the Hamming distance between two strings.

    The Hamming distance between two strings of equal length is the number of positions at which the corresponding
    symbols are different. Equivalently, it is the minimum number of substitutions required to change
    one string into the other, or the minimum number of errors that could have transformed one string into the other.


    Args:
        string1,string2 (str): Input strings

    Returns:
        Hamming distance (int)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.
        ValueError : If the input strings are not of same length


    Examples:
        >>> hamming_distance('', '')
        0
        >>> hamming_distance('alex', 'john')
        4
        >>> hamming_distance(' ', 'a')
        1
        >>> hamming_distance('JOHN', 'john')
        4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)
    # for Hamming Distance string length should be same
    utils.sim_check_for_same_len(string1, string2)
    # sum all the mismatch characters at the corresponding index of
    # input strings
    return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2))
Code Example #28
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score between the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return affine(string1, string2, self.gap_start, self.gap_continuation, self.sim_func)
Code Example #29
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return jaro(string1, string2)
Code Example #30
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro-Winkler score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro-Winkler similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_raw_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_raw_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_raw_score('DIXON', 'DICKSONX')
            0.8133333333333332

        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return jaro_winkler(string1, string2, self.prefix_weight)
Code Example #31
File: jaro.py  Project: anhaidgroup/py_stringmatching
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        return jaro(string1, string2)
Code Example #32
def jaro(string1, string2):
    """
    Computes the Jaro measure between two strings.

    The Jaro measure is a type of edit distance. It was developed mainly to compare short strings,
    such as first and last names.


    Args:
        string1,string2 (str): Input strings

    Returns:
        Jaro measure (float)


    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.


    Examples:
        >>> jaro('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> jaro('DWAYNE', 'DUANE')
        0.8222222222222223
        >>> jaro('DIXON', 'DICKSONX')
        0.7666666666666666


    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)
    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    len_s1 = len(string1)
    len_s2 = len(string2)

    max_len = max(len_s1, len_s2)
    search_range = (max_len // 2) - 1
    if search_range < 0:
        search_range = 0

    flags_s1 = [False] * len_s1
    flags_s2 = [False] * len_s2

    common_chars = 0
    for i, ch_s1 in enumerate(string1):
        low = i - search_range if i > search_range else 0
        hi = i + search_range if i + search_range < len_s2 else len_s2 - 1
        for j in _range(low, hi + 1):
            if not flags_s2[j] and string2[j] == ch_s1:
                flags_s1[i] = flags_s2[j] = True
                common_chars += 1
                break
    if not common_chars:
        return 0
    k = trans_count = 0
    for i, f_s1 in enumerate(flags_s1):
        if f_s1:
            for j in _range(k, len_s2):
                if flags_s2[j]:
                    k = j + 1
                    break
            if string1[i] != string2[j]:
                trans_count += 1
    trans_count /= 2
    common_chars = float(common_chars)
    weight = ((common_chars / len_s1 + common_chars / len_s2 +
               (common_chars - trans_count) / common_chars)) / 3
    return weight
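
The weight formula in the last lines can be checked by hand against the first documented value:

# jaro('MARTHA', 'MARHTA') == 0.9444...:
# all 6 characters are common (common_chars = 6) and the T/H pair is out of
# order, giving trans_count = 2 / 2 = 1, so
#   weight = (6/6 + 6/6 + (6 - 1)/6) / 3 = (1 + 1 + 0.8333...) / 3 = 0.9444...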
Code Example #33
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """

        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        len_s1 = len(string1)
        len_s2 = len(string2)

        max_len = max(len_s1, len_s2)
        search_range = (max_len // 2) - 1
        if search_range < 0:
            search_range = 0

        flags_s1 = [False] * len_s1
        flags_s2 = [False] * len_s2

        common_chars = 0
        for i, ch_s1 in enumerate(string1):
            low = i - search_range if i > search_range else 0
            high = i + search_range if i + search_range < len_s2 else len_s2 - 1
            for j in xrange(low, high + 1):
                if not flags_s2[j] and string2[j] == ch_s1:
                    flags_s1[i] = flags_s2[j] = True
                    common_chars += 1
                    break

        if not common_chars:
            return 0

        k = trans_count = 0
        for i, f_s1 in enumerate(flags_s1):
            if f_s1:
                for j in xrange(k, len_s2):
                    if flags_s2[j]:
                        k = j + 1
                        break
                if string1[i] != string2[j]:
                    trans_count += 1

        trans_count /= 2
        common_chars = float(common_chars)
        weight = ((common_chars / len_s1 + common_chars / len_s2 +
                   (common_chars - trans_count) / common_chars)) / 3
        return weight
Code Example #34
def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_ident):
    """
    Computes the Affine gap score between two strings.

    The Affine gap measure is an extension of the Needleman-Wunsch measure that handles longer gaps more
    gracefully.

    For more information refer to string matching chapter in the DI book.

    Args:
        string1,string2 (str) : Input strings

        gap_start (float): Cost for the gap at the start (defaults to 1)

        gap_continuation (float) : Cost for the gap continuation (defaults to 0.5)

        sim_score (function) : Function computing similarity score between two chars, represented as strings
            (defaults to identity).

    Returns:
        Affine gap score (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> affine('dva', 'deeva')
        1.5
        >>> affine('dva', 'deeve', gap_start=2, gap_continuation=0.5)
        -0.5
        >>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        4.4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)
    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    gap_start = -gap_start
    gap_continuation = -gap_continuation
    m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)
    x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)
    y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)
    # DP initialization
    for i in _range(1, len(string1) + 1):
        m[i][0] = -float("inf")
        x[i][0] = gap_start + (i - 1) * gap_continuation
        y[i][0] = -float("inf")
    # DP initialization
    for j in _range(1, len(string2) + 1):
        m[0][j] = -float("inf")
        x[0][j] = -float("inf")
        y[0][j] = gap_start + (j - 1) * gap_continuation
    # affine gap calculation using DP
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            # best score between x_1....x_i and y_1....y_j given that x_i is aligned to y_j
            m[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1])
            # the best score given that x_i is aligned to a gap
            x[i][j] = max(gap_start + m[i - 1][j], gap_continuation + x[i - 1][j])
            # the best score given that y_j is aligned to a gap
            y[i][j] = max(gap_start + m[i][j - 1], gap_continuation + y[i][j - 1])
    return max(m[len(string1)][len(string2)], x[len(string1)][len(string2)], y[len(string1)][len(string2)])
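
A quick hand check of the first documented value with the defaults shown in the signature (gap_start=1, gap_continuation=0.5, identity sim_score):

# affine('dva', 'deeva') == 1.5:
#   d  -  -  v  a
#   d  e  e  v  a
# scores +1 (d/d) - 1 (gap open) - 0.5 (gap continuation) + 1 (v/v) + 1 (a/a) = 1.5.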
Code Example #35
File: jaro.py  Project: kvpradap/conda-pysm-appveyor
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro score between two strings.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jaro = Jaro()
            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
            0.9444444444444445
            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
            0.8222222222222223
            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
            0.7666666666666666

        """
        
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        len_s1 = len(string1)
        len_s2 = len(string2)

        max_len = max(len_s1, len_s2)
        search_range = (max_len // 2) - 1
        if search_range < 0:
            search_range = 0

        flags_s1 = [False] * len_s1
        flags_s2 = [False] * len_s2

        common_chars = 0
        for i, ch_s1 in enumerate(string1):
            low = i - search_range if i > search_range else 0
            high = i + search_range if i + search_range < len_s2 else len_s2 - 1
            for j in xrange(low, high + 1):
                if not flags_s2[j] and string2[j] == ch_s1:
                    flags_s1[i] = flags_s2[j] = True
                    common_chars += 1
                    break

        if not common_chars:
            return 0

        k = trans_count = 0
        for i, f_s1 in enumerate(flags_s1):
            if f_s1:
                for j in xrange(k, len_s2):
                    if flags_s2[j]:
                        k = j + 1
                        break
                if string1[i] != string2[j]:
                    trans_count += 1

        trans_count /= 2
        common_chars = float(common_chars)
        weight = ((common_chars / len_s1 + common_chars / len_s2 +
                   (common_chars - trans_count) / common_chars)) / 3
        return weight
Code Example #36
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score between the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        gap_start = -self.gap_start
        gap_continuation = -self.gap_continuation
        m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)
        x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)
        y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)

        # DP initialization
        for i in xrange(1, len(string1) + 1):
            m[i][0] = -float("inf")
            x[i][0] = gap_start + (i - 1) * gap_continuation
            y[i][0] = -float("inf")

        # DP initialization
        for j in xrange(1, len(string2) + 1):
            m[0][j] = -float("inf")
            x[0][j] = -float("inf")
            y[0][j] = gap_start + (j - 1) * gap_continuation

        # affine gap calculation using DP
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                # best score between x_1....x_i and y_1....y_j
                # given that x_i is aligned to y_j
                m[i][j] = (self.sim_func(string1[i - 1], string2[j - 1]) +
                           max(m[i - 1][j - 1], x[i - 1][j - 1],
                               y[i - 1][j - 1]))

                # the best score given that x_i is aligned to a gap
                x[i][j] = max(gap_start + m[i - 1][j],
                              gap_continuation + x[i - 1][j])

                # the best score given that y_j is aligned to a gap
                y[i][j] = max(gap_start + m[i][j - 1],
                              gap_continuation + y[i][j - 1])

        return max(m[len(string1)][len(string2)], x[len(string1)][len(string2)],
                   y[len(string1)][len(string2)])
Code Example #37
File: affine.py  Project: rishab93/py_stringmatching
    def get_raw_score(self, string1, string2):
        """Computes the affine gap score between two strings. This score can be outside the range [0,1].
        
        Args:
            string1,string2 (str) : Input strings.

        Returns:
            Affine gap score between the two input strings (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> aff = Affine()
            >>> aff.get_raw_score('dva', 'deeva')
            1.5
            >>> aff = Affine(gap_start=2, gap_continuation=0.5)
            >>> aff.get_raw_score('dva', 'deeve')
            -0.5
            >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
            >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
            4.4
        """
        # input validations
        utils.sim_check_for_none(string1, string2)

        # convert input to unicode.
        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # if one of the strings is empty return 0
        if utils.sim_check_for_empty(string1, string2):
            return 0

        gap_start = -self.gap_start
        gap_continuation = -self.gap_continuation
        m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)
        x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)
        y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float64)

        # DP initialization
        for i in xrange(1, len(string1) + 1):
            m[i][0] = -float("inf")
            x[i][0] = gap_start + (i - 1) * gap_continuation
            y[i][0] = -float("inf")

        # DP initialization
        for j in xrange(1, len(string2) + 1):
            m[0][j] = -float("inf")
            x[0][j] = -float("inf")
            y[0][j] = gap_start + (j - 1) * gap_continuation

        # affine gap calculation using DP
        for i in xrange(1, len(string1) + 1):
            for j in xrange(1, len(string2) + 1):
                # best score between x_1....x_i and y_1....y_j
                # given that x_i is aligned to y_j
                m[i][j] = (
                    self.sim_func(string1[i - 1], string2[j - 1]) +
                    max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1]))

                # the best score given that x_i is aligned to a gap
                x[i][j] = max(gap_start + m[i - 1][j],
                              gap_continuation + x[i - 1][j])

                # the best score given that y_j is aligned to a gap
                y[i][j] = max(gap_start + m[i][j - 1],
                              gap_continuation + y[i][j - 1])

        return max(m[len(string1)][len(string2)],
                   x[len(string1)][len(string2)],
                   y[len(string1)][len(string2)])