def search_editdist_zalg(text, pat):
    """Report positions where ``pat`` occurs in ``text`` within edit distance 1.

    Two Z-arrays are obtained from ``calculateZ`` (defined elsewhere in this
    file; assumed to return prefix-match lengths per position -- confirm):
    one for ``pat + "$" + text`` and one for the same concatenation built
    from the reversed pattern and reversed text.  For every candidate start
    ``i`` in the forward string, the forward value at ``i`` is added to the
    backward value at the mirrored end of a window of length ``len(pat)``,
    ``len(pat) + 1`` or ``len(pat) - 1``.

    Cases are tested in a fixed order and only the first hit is recorded:
    exact match, one substitution, window one shorter than the pattern,
    window one longer than the pattern.  The last case is skipped when an
    exact match starts at ``i + 1``.

    Each hit is also printed (label, index, score, then a 5-character
    preview of the forward string).

    :param text: string to search in.
    :param pat: pattern to look for.
    :return: list of ``[i - len(pat), distance]`` pairs, where ``i`` is the
        index in the concatenated forward string and ``distance`` is 0 or 1.
    """
    m = len(pat)
    n = len(text)
    forward = pat + "$" + text
    total = len(forward)
    forward_z = calculateZ(forward)
    backward_z = calculateZ(pat[::-1] + "$" + text[::-1])

    hits = []
    for i in range(m + 1, total - m + 2):
        prefix_len = forward_z[i]
        # Index in the reversed string of the last character of the
        # length-m window that starts at i in the forward string.
        mirror = n + m + 2 - i

        same_len = prefix_len + backward_z[mirror]      # window of length m
        longer = prefix_len + backward_z[mirror - 1]    # window of length m + 1
        shorter = prefix_len + backward_z[mirror + 1]   # window of length m - 1

        if same_len == 2 * m:
            # Prefix and suffix matches both span the whole pattern.
            label, score, distance = "match", same_len, 0
        elif same_len == m - 1:
            # Exactly one position is left uncovered: Hamming distance 1.
            label, score, distance = "sub", same_len, 1
        elif shorter == m - 1:
            label, score, distance = "insert", shorter, 1
        elif longer == m and forward_z[i + 1] + backward_z[mirror - 1] != 2 * m:
            # Guard against the case where an exact match starts at the
            # very next position; that one is reported on its own.
            label, score, distance = "delete", longer, 1
        else:
            continue

        print(label, i, score)
        print(forward[i:i + 5])
        hits.append([i - m, distance])
    return hits
def goodsufix(pat):
    """Build a good-suffix style table for ``pat`` from suffix Z-values.

    ``calculateZ`` (defined elsewhere in this file) is run on the reversed
    pattern and its result is reversed again, giving one value per position
    of ``pat``.  For every position ``p`` the slot ``len(pat) - z[p]`` is set
    to ``p``; later positions overwrite earlier ones.  Slots never written
    stay at ``-1``.  The table is built with ``len(pat) + 1`` slots and the
    final slot is discarded before returning.

    :param pat: pattern string.
    :return: list of ``len(pat)`` integers.
    """
    length = len(pat)
    suffix_z = calculateZ(pat[::-1])[::-1]
    table = [-1] * (length + 1)
    for pos in range(length):
        table[length - suffix_z[pos]] = pos
    # Drop the extra trailing slot.
    return table[:-1]
def computeSP(pat):
    """Derive an SP table for ``pat`` from its Z-values.

    Positions are scanned from right to left.  For a start ``j`` with
    Z-value ``z[j]`` the box ends at ``j + z[j] - 1`` and that slot receives
    ``z[j]``; because the scan runs right to left, a smaller ``j`` (longer
    box) ending at the same slot overwrites the earlier value.  ``calculateZ``
    is defined elsewhere in this file.

    :param pat: pattern string.
    :return: list of ``len(pat)`` integers, 0 where no box ends.
    """
    size = len(pat)
    z_values = calculateZ(pat)
    table = [0] * size
    for start in reversed(range(size)):
        box_len = z_values[start]
        if start + box_len == 0:
            # End index would be -1, which Python would treat as the last
            # slot of the table -- stop instead of writing there.
            break
        table[start + box_len - 1] = box_len
    return table
def edit_distance(text, pat):
    """Find positions where ``pat`` matches ``text`` with edit distance <= 1.

    Z-arrays are computed with ``calculateZ`` (defined elsewhere in this
    file) for ``pat + "$" + text`` and for the concatenation of the reversed
    pattern, ``"$"`` and the reversed text.  For each candidate index ``i``
    of the forward string, the forward value at ``i`` is combined with the
    backward value at the mirrored end of a window of length ``len(pat)``,
    ``len(pat) + 1`` or ``len(pat) - 1``.

    The first case that applies wins: exact match, substitution, shorter
    window, longer window.  Each non-exact case also requires enough text to
    remain after ``i``.  Diagnostic lines are printed for every hit (and the
    5-character preview is printed in the longer-window case even when the
    hit is then rejected).

    :param text: string to search in.
    :param pat: pattern to look for.
    :return: list of ``[i - len(pat), distance]`` pairs with distance 0 or 1.
    """
    combined = pat + "$" + text
    forward_z = calculateZ(combined)
    backward_z = calculateZ(pat[::-1] + "$" + text[::-1])
    total = len(combined)
    m = len(pat)

    hits = []
    for i in range(m + 1, total - m + 2):
        # Same test as ``total - i + 2 <= total - 1``: the mirrored index for
        # the shorter window must stay inside the backward array.
        if i < 3:
            continue

        prefix_len = forward_z[i]
        mirror = total - i
        # Forward value plus the backward value at the window's last char.
        substitution = prefix_len + backward_z[mirror + 1]
        # ... at the character just past the window's last char.
        deletion = prefix_len + backward_z[mirror]
        # ... at the character just before the window's last char.
        insertion = prefix_len + backward_z[mirror + 2]

        if prefix_len == m:
            print(combined[i:i + 5])
            print("match", i)
            hits.append([i - m, 0])
            continue

        # Substitution is only valid when at least len(pat) characters of
        # text remain from i onwards.
        if substitution == m - 1 and i <= total - m:
            print(combined[i:i + 5])
            print("sub", i, substitution)
            hits.append([i - m, 1])
            continue

        if insertion == m - 1 and i <= total - m + 1:
            print(combined[i:i + 5])
            print("insert", i, insertion)
            hits.append([i - m, 1])
            continue

        if deletion == m and i <= total - m - 1:
            print(combined[i:i + 5])
            # If the pattern matches exactly starting at i + 1, the hit at i
            # is redundant and is ignored.
            if forward_z[i + 1] != m:
                print("delete", i, deletion)
                hits.append([i - m, 1])
    return hits
def matchedprefix(pat):
    """Build a matched-prefix table for ``pat`` from its Z-values.

    Scanning right to left, a position whose Z-box reaches the end of the
    pattern stores its Z-value; the last position stores 0 when it has no
    such box; any other position inherits the value of its right neighbour.
    After the scan, entry 0 is overwritten with ``len(pat) - 1``.
    ``calculateZ`` is defined elsewhere in this file.

    :param pat: pattern string.
    :return: list of ``len(pat)`` integers.
    """
    size = len(pat)
    table = [-1] * size
    z_values = calculateZ(pat)
    final = size - 1
    for pos in reversed(range(size)):
        if pos + z_values[pos] == size:
            # The Z-box starting here ends exactly at the pattern's end.
            table[pos] = z_values[pos]
        elif pos == final:
            # Last position and no match.
            table[pos] = 0
        else:
            table[pos] = table[pos + 1]
    table[0] = final
    return table
def computeSPx(pat):
    """Derive a per-character SP table for ``pat`` from its Z-values.

    The result has one row per pattern position and ``CHARACTER`` columns
    (``CHARACTER`` is a module-level constant defined elsewhere; the original
    comment speaks of 75 columns -- confirm).  For a start ``j`` with Z-value
    ``z[j]``, the row at the box end ``j + z[j] - 1`` records ``z[j]`` in the
    column of the character ``pat[z[j]]``, i.e. the character following the
    matched prefix.  Columns are indexed by ``ord(ch) - ord('0')``.
    Scanning right to left lets a longer box with the same end and column
    overwrite a shorter one.  ``calculateZ`` is defined elsewhere in this
    file.

    :param pat: pattern string.
    :return: list of ``len(pat)`` rows, each a list of ``CHARACTER`` integers.
    """
    size = len(pat)
    z_values = calculateZ(pat)
    table = [[0] * CHARACTER for _ in range(size)]
    base = ord('0')
    for start in reversed(range(size)):
        box_len = z_values[start]
        if start + box_len == 0:
            # End index would be -1, which Python would read as the last
            # row of the table -- stop instead of writing there.
            break
        # Record the box length together with the character that follows
        # the matched prefix.
        column = ord(pat[box_len]) - base
        table[start + box_len - 1][column] = box_len
    return table