def empirical_cost_edit_distance(r, q, uniform_cost=0.1, p_r_qr=0.95, mu=1.0):
    """Estimate log P(q|r), where q is a candidate spelling of the observed term r.

    The cost of a single edit (Damerau-Levenshtein) is taken from the empirical
    noisy-channel model built by scan_edits():
      - if edit_distance(r, q) == 1: P(r|q) is the empirical channel probability
      - if edit_distance(r, q) >  1: P(r|q) = P_empirical(r|q) * uniform_cost**(d-1)

    Scoring (unnormalized Bayes, in log space):
      - if r == q: log P(q|r) = log(p_r_qr) + mu * log P(q)
      - if r != q: log P(q|r) = log(cost)   + mu * log P(q)

    Parameters:
      r            -- the observed (possibly misspelled) term
      q            -- the candidate correction
      uniform_cost -- per-edit penalty applied for each edit beyond the first
      p_r_qr       -- channel probability P(r|q) when no edit is needed (r == q)
      mu           -- weight on the language-model term log P(q)

    Returns:
      log P(q|r) as a float (natural log).
    """
    log_prob_q = calculate_log_prob(q)
    d = edit_distance(r, q)
    editOperation = findEditOperation(r, q)

    if d == 0 or len(editOperation) == 0:
        # No edit: channel term is the fixed "typed correctly" probability.
        return log(p_r_qr) + mu * log_prob_q

    # editOperation e.g. [0, ('#','s')] from: actual = un; intended = sun
    confusion_matrices = [edits_del_counter, edits_sub_counter,
                          edits_tra_counter, edits_ins_counter]
    editName = editOperation[0]
    editArguments = editOperation[1]

    # How many such edits were observed in the noisy-model training file.
    numerator = confusion_matrices[editName][editArguments]

    if editName == 0:    # deletion: count(xy typed as x)
        denominator = edits_bichar_counter[editArguments]
    elif editName == 1:  # substitution: count(y typed as x)
        denominator = edits_char_counter[editArguments[1]]
    elif editName == 2:  # transposition: count(xy typed as yx)
        denominator = edits_bichar_counter[editArguments]
    else:                # insertion (editName == 3): count(x typed as xy)
        denominator = edits_char_counter[editArguments[0]]

    # Add-1 (Laplace) smoothing over the character alphabet.
    numberOfCharsInAlphabet = len(edits_char_counter)
    prob_r_q = float(numerator + 1) / float(denominator + numberOfCharsInAlphabet)

    # Each edit beyond the first is charged the uniform per-edit cost.
    # NOTE: mu is applied here as in the d == 0 branch so both return paths
    # weight P(q) consistently (identical behavior at the default mu=1.0).
    return log(prob_r_q) + (d - 1) * log(uniform_cost) + mu * log_prob_q
def scan_edits(trainingFile):
    """Build a Noisy Channel model from edit data in *trainingFile*.

    Each line of the training file is expected to be
    ``actual_query<TAB>intended_query``; lines without a tab are skipped.
    Query pairs with a differing number of words (splits/merges) are ignored.

    The Noisy Channel model is represented by
      - 4 confusion matrices: delMatrix, subMatrix, traMatrix, insMatrix
      - 2 indexes: uniChar and biChar

    Confusion matrices and indexes are Counters mapping (char1, char2) -> counts.
    The ordering of (char1, char2) follows Kernighan, Church and Gale,
    'A Spelling Correction Program Based on a Noisy Channel Model':
        del[(x,y)] = count(xy typed as x)
        sub[(x,y)] = count(y typed as x)
        tra[(x,y)] = count(xy typed as yx)
        ins[(x,y)] = count(x typed as xy)

    Side effects -- writes 6 files to disk:
        edits_del_counter.mrshl   edits_sub_counter.mrshl
        edits_tra_counter.mrshl   edits_ins_counter.mrshl
        edits_char_counter.mrshl  edits_bichar_counter.mrshl

    Returns:
        [delMatrix, subMatrix, traMatrix, insMatrix, uniChar, biChar]
    """
    delCounter = Counter()
    subCounter = Counter()
    traCounter = Counter()
    insCounter = Counter()
    matrices = [delCounter, subCounter, traCounter, insCounter]

    with open(trainingFile) as fTraining:
        for line in fTraining:
            # Robustness: a malformed line without a tab would raise
            # ValueError on unpacking -- skip it instead.
            if '\t' not in line:
                continue
            actualQuery, intendedQuery = line.split('\t', 1)
            actualQuery = actualQuery.split()
            intendedQuery = intendedQuery.split()
            # Not considering splits or merges right now: only align
            # queries with the same number of words.
            if len(actualQuery) == len(intendedQuery):
                for actualWord, intendedWord in zip(actualQuery, intendedQuery):
                    edit1 = findEditOperation(actualWord, intendedWord)
                    # An empty list means no (single, supported) edit was found.
                    if edit1:
                        # edit1[0] selects the matrix (0=del,1=sub,2=tra,3=ins);
                        # edit1[1] is the (char1, char2) key.
                        matrices[edit1[0]][edit1[1]] += 1

    serialize_data(delCounter, "edits_del_counter.mrshl")
    serialize_data(subCounter, "edits_sub_counter.mrshl")
    serialize_data(traCounter, "edits_tra_counter.mrshl")
    serialize_data(insCounter, "edits_ins_counter.mrshl")

    ngram_indexes = generateNGramsFromNoisyFile(trainingFile)
    serialize_data(ngram_indexes[0], "edits_char_counter.mrshl")
    serialize_data(ngram_indexes[1], "edits_bichar_counter.mrshl")

    return matrices + ngram_indexes