def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Needleman-Wunsch measure between two strings.

    The Needleman-Wunsch measure generalizes the Levenshtein distance and considers global alignment between two
    strings. Specifically, it is computed by assigning a score to each alignment between the two input strings and
    choosing the score of the best alignment, that is, the maximal score.

    An alignment between two strings is a set of correspondences between their characters, allowing for gaps.

    Args:
        string1,string2 (str) : Input strings
        gap_cost (float) : Cost of gap (defaults to 1.0)
        sim_score (function) : Similarity function to give a score for the correspondence between characters.
            Defaults to an identity function, which returns 1.0 if the two characters are the same and 0.0 otherwise.

    Returns:
        Needleman-Wunsch measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> needleman_wunsch('dva', 'deeva')
        1.0
        >>> needleman_wunsch('dva', 'deeve', 0.0)
        2.0
        >>> needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
        1.0
        >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
        2.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)

    # DP initialization: aligning a prefix of string1 against the empty string costs one gap per character
    for i in _range(len(string1) + 1):
        dist_mat[i, 0] = -(i * gap_cost)

    # DP initialization: aligning a prefix of string2 against the empty string costs one gap per character
    for j in _range(len(string2) + 1):
        dist_mat[0, j] = -(j * gap_cost)

    # Needleman-Wunsch DP calculation
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(match, delete, insert)

    return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
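

# Worked example (derived from the first doctest above, shown for clarity): for
# needleman_wunsch('dva', 'deeva') with the default identity sim_score and gap_cost=1.0,
# the best global alignment is
#     d - - v a
#     d e e v a
# which scores 1 (d/d) - 1 (gap) - 1 (gap) + 1 (v/v) + 1 (a/a) = 1.0.
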
def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident):
    """
    Computes the Smith-Waterman measure between two strings.

    The Smith–Waterman algorithm performs local sequence alignment; that is, it determines similar regions between
    two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of all
    possible lengths and optimizes the similarity measure.

    Args:
        string1,string2 (str) : Input strings
        gap_cost (float) : Cost of gap (defaults to 1.0)
        sim_score (function) : Similarity function to give a score for the correspondence between characters.
            Defaults to an identity function, which returns 1.0 if the two characters are the same and 0.0 otherwise.

    Returns:
        Smith-Waterman measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> smith_waterman('cat', 'hat')
        2.0
        >>> smith_waterman('dva', 'deeve', 2.2)
        1.0
        >>> smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2 : (2 if s1 == s2 else -1))
        2.0
        >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
        6.5
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)

    max_value = 0.0
    # Smith-Waterman DP calculation; entries are clamped at 0 so a local alignment can start anywhere
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(0, match, delete, insert)
            max_value = max(max_value, dist_mat[i, j])

    return max_value
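

# Worked example (derived from the first doctest above): for smith_waterman('cat', 'hat')
# with the default identity sim_score and gap_cost=1.0, the best locally aligned region is
# 'at' against 'at', scoring 1 (a/a) + 1 (t/t) = 2.0; the mismatching 'c'/'h' prefix is
# simply left out of the local alignment rather than penalized.
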
def qgram(input_string, qval=2):
    """
    Tokenizes input string into q-grams.

    A q-gram is a contiguous sequence of q characters of the input string. Q-grams are also known as n-grams and
    k-grams.

    Args:
        input_string (str) : Input string
        qval (int) : Q-gram length (defaults to 2)

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qgram('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qgram('a')
        []
        >>> qgram('database', 3)
        ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']
    """
    # input validations
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    # a string shorter than qval (or a non-positive qval) yields no q-grams
    if len(input_string) < qval or qval < 1:
        return qgram_list

    qgram_list = [input_string[i:i + qval] for i in _range(len(input_string) - (qval - 1))]

    return qgram_list
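

# Illustrative sketch (not part of the original module): q-gram token lists are typically
# consumed by set-based similarity measures. The hypothetical helper below shows a plain
# Jaccard similarity over the q-gram sets of two strings, using only qgram() from above.
def _qgram_jaccard_example(s1, s2, qval=2):
    # tokenize both strings and compare their q-gram sets
    set1, set2 = set(qgram(s1, qval)), set(qgram(s2, qval))
    if not set1 and not set2:
        return 1.0
    return len(set1 & set2) / float(len(set1 | set2))

# e.g. _qgram_jaccard_example('database', 'databases') == 7 / 8 == 0.875
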
def jaro(string1, string2):
    """
    Computes the Jaro measure between two strings.

    The Jaro measure is a type of edit distance. It was developed mainly to compare short strings, such as first and
    last names.

    Args:
        string1,string2 (str) : Input strings

    Returns:
        Jaro measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jaro('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> jaro('DWAYNE', 'DUANE')
        0.8222222222222223
        >>> jaro('DIXON', 'DICKSONX')
        0.7666666666666666
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    len_s1 = len(string1)
    len_s2 = len(string2)

    max_len = max(len_s1, len_s2)
    search_range = (max_len // 2) - 1
    if search_range < 0:
        search_range = 0

    flags_s1 = [False] * len_s1
    flags_s2 = [False] * len_s2

    # count characters common to both strings within the search window
    common_chars = 0
    for i, ch_s1 in enumerate(string1):
        low = i - search_range if i > search_range else 0
        hi = i + search_range if i + search_range < len_s2 else len_s2 - 1
        for j in _range(low, hi + 1):
            if not flags_s2[j] and string2[j] == ch_s1:
                flags_s1[i] = flags_s2[j] = True
                common_chars += 1
                break

    if not common_chars:
        return 0

    # count transpositions among the common characters
    k = trans_count = 0
    for i, f_s1 in enumerate(flags_s1):
        if f_s1:
            for j in _range(k, len_s2):
                if flags_s2[j]:
                    k = j + 1
                    break
            if string1[i] != string2[j]:
                trans_count += 1
    trans_count /= 2

    common_chars = float(common_chars)
    weight = (common_chars / len_s1 + common_chars / len_s2 +
              (common_chars - trans_count) / common_chars) / 3
    return weight
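

# Worked example (derived from the first doctest above): for jaro('MARTHA', 'MARHTA') the
# number of common characters is c = 6 and the transposed pair 'TH'/'HT' contributes
# t = 1 transposition, so the measure is
#     (c/6 + c/6 + (c - t)/c) / 3 = (1 + 1 + 5/6) / 3 = 17/18 ≈ 0.944.
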
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str) : Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    if utils.sim_check_for_exact_match(string1, string2):
        return 0

    ins_cost, del_cost, sub_cost, trans_cost = (1, 1, 1, 1)

    len_str1 = len(string1)
    len_str2 = len(string2)

    if len_str1 == 0:
        return len_str2 * ins_cost

    if len_str2 == 0:
        return len_str1 * del_cost

    d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=int)

    # DP initialization: transforming a prefix into the empty string costs one deletion per character
    for i in _range(len_str1 + 1):
        d_mat[i, 0] = i * del_cost

    # DP initialization: building a prefix from the empty string costs one insertion per character
    for j in _range(len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    # Levenshtein DP calculation
    for i in _range(len_str1):
        for j in _range(len_str2):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,
                d_mat[i, j + 1] + del_cost,
                d_mat[i, j] + (sub_cost if string1[i] != string2[j] else 0)
            )

    return d_mat[len_str1, len_str2]
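

# Worked example (derived from the second doctest above): levenshtein('example', 'samples')
# is 3, e.g. via delete 'x' ('example' -> 'eample'), substitute 'e' -> 's'
# ('eample' -> 'sample'), and insert 's' ('sample' -> 'samples').
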
def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_ident):
    """
    Computes the affine gap score between two strings.

    The affine gap measure is an extension of the Needleman-Wunsch measure that handles longer gaps more gracefully.
    For more information, refer to the string matching chapter in the DI book.

    Args:
        string1,string2 (str) : Input strings
        gap_start (float) : Cost for the gap at the start (defaults to 1)
        gap_continuation (float) : Cost for the gap continuation (defaults to 0.5)
        sim_score (function) : Function computing similarity score between two characters, represented as strings
            (defaults to an identity function).

    Returns:
        Affine gap score (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> affine('dva', 'deeva')
        1.5
        >>> affine('dva', 'deeve', gap_start=2, gap_continuation=0.5)
        -0.5
        >>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        4.4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    gap_start = -gap_start
    gap_continuation = -gap_continuation

    m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)
    x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)
    y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)

    # DP initialization: gaps spanning the whole prefix of string1
    for i in _range(1, len(string1) + 1):
        m[i][0] = -float("inf")
        x[i][0] = gap_start + (i - 1) * gap_continuation
        y[i][0] = -float("inf")

    # DP initialization: gaps spanning the whole prefix of string2
    for j in _range(1, len(string2) + 1):
        m[0][j] = -float("inf")
        x[0][j] = -float("inf")
        y[0][j] = gap_start + (j - 1) * gap_continuation

    # affine gap calculation using DP
    for i in _range(1, len(string1) + 1):
        for j in _range(1, len(string2) + 1):
            # best score between x_1....x_i and y_1....y_j given that x_i is aligned to y_j
            m[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(m[i - 1][j - 1], x[i - 1][j - 1],
                                                                      y[i - 1][j - 1])
            # the best score given that x_i is aligned to a gap
            x[i][j] = max(gap_start + m[i - 1][j], gap_continuation + x[i - 1][j])
            # the best score given that y_j is aligned to a gap
            y[i][j] = max(gap_start + m[i][j - 1], gap_continuation + y[i][j - 1])

    return max(m[len(string1)][len(string2)], x[len(string1)][len(string2)], y[len(string1)][len(string2)])
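

# Worked example (derived from the first doctest above): for affine('dva', 'deeva') with the
# default parameters, the best alignment is
#     d - - v a
#     d e e v a
# with three identity matches (+3.0) and a single gap of length two costing
# gap_start + gap_continuation = 1.0 + 0.5 = 1.5, giving 3.0 - 1.5 = 1.5.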