def empirical_cost_edit_distance(r, q, uniform_cost=0.1, p_r_qr=0.95, mu=1.0):
    """
    Score candidate q for the observed string r with an empirical noisy channel.

    Returns the log of  P(r|q) * P(q)^mu, where the channel term P(r|q) is:
      * p_r_qr                                   if no edit separates r and q
      * P_empirical(r|q)                         if edit_distance(r, q) == 1
      * P_empirical(r|q) * uniform_cost^(d - 1)  if edit_distance(r, q) == d > 1

    P_empirical is an add-1 smoothed ratio: the count of the edit found by
    findEditOperation(r, q) in the matching module-level confusion counter,
    divided by the count of the character (or character pair) it applies to.

    Arguments:
      r            -- the observed (possibly misspelled) string
      q            -- the candidate intended string
      uniform_cost -- probability charged for every edit beyond the first
      p_r_qr       -- channel probability used when no edit is needed
      mu           -- exponent (weight) applied to the language-model term P(q)

    The language-model term is weighted by mu in every branch, matching
    uniform_cost_edit_distance.
    """
    log_prob_q = calculate_log_prob(q)
    d = edit_distance(r, q)
    editOperation = findEditOperation(r, q)

    # No edit needed (or none could be identified): score as an exact match.
    if d == 0 or len(editOperation) == 0:
        return log(p_r_qr) + mu * log_prob_q

    # Indexed by edit code: 0 deletion, 1 substitution, 2 transposition,
    # 3 insertion.
    confusion_matrices = [
        edits_del_counter,
        edits_sub_counter,
        edits_tra_counter,
        edits_ins_counter,
    ]

    # editOperation e.g. [0, ('#', 's')]  from: actual = un; intended = sun
    editName = editOperation[0]
    editArguments = editOperation[1]

    # How many such edits were found in the training file for the noisy model.
    numerator = confusion_matrices[editName][editArguments]

    # Normalise by how often the affected context occurred.
    if editName == 0 or editName == 2:  # deletion / transposition
        denominator = edits_bichar_counter[editArguments]
    elif editName == 1:  # substitution
        denominator = edits_char_counter[editArguments[1]]
    elif editName == 3:  # insertion
        denominator = edits_char_counter[editArguments[0]]

    # Add-1 smoothing over the observed alphabet.
    numberOfCharsInAlphabet = len(edits_char_counter)
    prob_r_q = float(numerator + 1) / float(denominator + numberOfCharsInAlphabet)

    # The first edit is charged empirically, each remaining edit uniformly.
    return log(prob_r_q) + (d - 1) * log(uniform_cost) + mu * log_prob_q
def is_good_candidate(candidate, word, jaccard_cutoff=0.2, edit_cutoff=3):
    '''
    Heuristically decide whether candidate is a plausible correction of word.

    A candidate is rejected (returns False) when any of these holds:
      * either string is empty
      * the first letters differ
      * the lengths differ by edit_cutoff or more
      * jaccard_coeff(candidate, word) is at or below the Jaccard cutoff
        (the cutoff is raised to at least 0.5 when word is longer than
        10 characters)
      * edit_distance(candidate, word) is edit_cutoff or more

    Otherwise returns True. Cheap checks run first so the costlier
    similarity computations are only reached by surviving candidates.
    '''
    # Empty strings have no first letter to compare and are never a match.
    if not word or not candidate:
        return False

    # Candidate should start with the same letter.
    if word[0] != candidate[0]:
        return False

    # Candidate length should be within edit_cutoff of the word length.
    if abs(len(candidate) - len(word)) >= edit_cutoff:
        return False

    # Jaccard overlap; long words must clear a stricter threshold.
    if len(word) > 10:
        jaccard_cutoff = max(jaccard_cutoff, 0.5)
    if jaccard_coeff(candidate, word) <= jaccard_cutoff:
        return False

    # Edit distance should be strictly below edit_cutoff.
    if edit_distance(candidate, word) >= edit_cutoff:
        return False

    return True
def uniform_cost_edit_distance(r, q, cost=0.001, p_r_qr=0.95, mu=1.0):
    """
    Score candidate q for the observed string r with a uniform edit cost.

    Every edit counted by edit_distance(r, q) is charged the same
    probability 'cost'.

    Returns a log score:
      if r != q:  edit_distance(r, q) * log(cost) + mu * log P(q)
      if r == q:  log(p_r_qr)                     + mu * log P(q)
    where log P(q) is calculate_log_prob(q).
    """
    lm_log_prob = calculate_log_prob(q)

    if r != q:
        distance = edit_distance(r, q)
        return distance * log(cost) + mu * lm_log_prob

    return log(p_r_qr) + mu * lm_log_prob