Example 1
from math import log

# The edits_* counters referenced below are module-level globals, loaded from
# the files written by scan_edits (Example 2).
def empirical_cost_edit_distance(r, q, uniform_cost=0.1, p_r_qr=0.95, mu=1.0):
  """
  Estimates the probability P(q|r) where q is a candidate spelling of r
  The cost of a single edit in the Damerau-Levenshtein distance is calculated from a noisy chanel model

  if editDistance(r,q) == 1 then P(r|q) is taken from the empirical noisy model
  if editDistance(r,q) > 1 then P(r|q) = P_empirical(r|q) * P_uniform(r|q)^(distance-1)
  
  Returns log( P(q|r) ) if r != q then P(q|r) = cost * P(q))
                        if r == q then P(q|r) = p_r_qr * P(q)
                        
                        if editDistance(r,q) == 1 then cost = P_empirical(r|q)
                        if editDistance(r,q) > 1  then cost = P_empirical(r|q) * (uniform_cost^(distance -1))
  
  """
  
  log_prob_q    = calculate_log_prob(q)    # language model prior: log P(q)
  d             = edit_distance(r, q)      # Damerau-Levenshtein distance
  editOperation = findEditOperation(r, q)  # [editType, (char1, char2)] or []

  if d == 0 or len(editOperation) == 0:
    # r == q (no edit detected): P(q|r) = p_r_qr * P(q)^mu
    return log(p_r_qr) + mu*log_prob_q
  else:
    confusion_matrices = [edits_del_counter, edits_sub_counter, edits_tra_counter, edits_ins_counter]
    
    # editOperation e.g. [0, ('#','s')]  from: actual = un; intended = sun
    editName      = editOperation[0]
    editArguments = editOperation[1]

    # How many such edits were observed in the training file for the noisy model
    numerator = confusion_matrices[editName][editArguments]
    
    if editName == 0:   # deletion: xy typed as x, normalize by count(xy)
        denominator = edits_bichar_counter[editArguments]
    elif editName == 1: # substitution: y typed as x, normalize by count(y)
        denominator = edits_char_counter[editArguments[1]]
    elif editName == 2: # transposition: xy typed as yx, normalize by count(xy)
        denominator = edits_bichar_counter[editArguments]
    elif editName == 3: # insertion: x typed as xy, normalize by count(x)
        denominator = edits_char_counter[editArguments[0]]
    
    # Add-1 smoothing
    numberOfCharsInAlphabet = len(edits_char_counter)
    prob_r_q = float(numerator + 1) / float(denominator + numberOfCharsInAlphabet)
    # Each edit beyond the first is charged a uniform per-edit cost; mu weights
    # the language model prior, matching the d == 0 branch above
    log_prob_q_r = log(prob_r_q) + (d-1)*log(uniform_cost) + mu*log_prob_q
    
    return log_prob_q_r
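
Both examples depend on a findEditOperation helper that is not shown. The sketch below is a hypothetical reconstruction based only on the comment above (editOperation = [0, ('#','s')] for actual = un, intended = sun) and on the tuple conventions documented in scan_edits (Example 2); the original implementation may differ.

def findEditOperation(actual, intended):
  """
  Returns [editType, (char1, char2)] for the single Damerau-Levenshtein edit
  that turns `intended` into `actual`, or [] when the words are identical or
  more than one edit apart.
  editType: 0 = deletion, 1 = substitution, 2 = transposition, 3 = insertion
  """
  if actual == intended:
    return []
  a, i = '#' + actual, '#' + intended         # '#' marks the word start

  if len(a) == len(i) - 1:                    # deletion: xy typed as x
    for k in range(len(i)):
      if k >= len(a) or a[k] != i[k]:
        if a[k:] == i[k+1:]:
          return [0, (i[k-1], i[k])]
        return []

  if len(a) == len(i) + 1:                    # insertion: x typed as xy
    for k in range(len(a)):
      if k >= len(i) or a[k] != i[k]:
        if a[k+1:] == i[k:]:
          return [3, (i[k-1], a[k])]
        return []

  if len(a) == len(i):
    diffs = [k for k in range(len(a)) if a[k] != i[k]]
    if len(diffs) == 1:                       # substitution: y typed as x
      k = diffs[0]
      return [1, (a[k], i[k])]
    if (len(diffs) == 2 and diffs[1] == diffs[0] + 1
        and a[diffs[0]] == i[diffs[1]] and a[diffs[1]] == i[diffs[0]]):
      return [2, (i[diffs[0]], i[diffs[1]])]  # transposition: xy typed as yx
  return []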
Example 2
from collections import Counter

def scan_edits(trainingFile):
  """
  Builds a model for Noisy Channel using edits data from trainingFile argument
  The Noisy Channel model is represented by
  - 4 confusion matrices: delMatrix,subMatrix,traMatrix,insMatrix
  - 2 indexes: uniChar and biChar
  
  Confusion matrices and the biChar index are implemented as Counters mapping (char1,char2) -> counts; the uniChar index maps single chars -> counts.
  The order of the elements in the tuple (char1,char2) follows the convention described by Kernighan, Church and Gale in 'A Spelling Correction Program Based on a Noisy Channel Model':
  
  del[(x,y)] = count(xy typed as x)
  sub[(x,y)] = count(y typed as x)
  tra[(x,y)] = count(xy typed as yx)
  ins[(x,y)] = count(x typed as xy)
  
  It writes 6 files to disk:
 
  edits_del_counter.mrshl
  edits_sub_counter.mrshl
  edits_tra_counter.mrshl
  edits_ins_counter.mrshl
  edits_char_counter.mrshl
  edits_bichar_counter.mrshl
  
  It returns a list with the 4 Confusion matrices and the 2 indexes
  [delMatrix,subMatrix,traMatrix,insMatrix,uniChar,biChar]
  
  """
  delCounter = Counter()
  subCounter = Counter()
  traCounter = Counter()
  insCounter = Counter()
  
  matrices = [delCounter,subCounter,traCounter,insCounter]
  
  with open(trainingFile) as fTraining:
    for line in fTraining:
      actualQuery, intendedQuery = line.split('\t', 1)
      
      actualQuery = actualQuery.split()
      intendedQuery = intendedQuery.split()
      noOperation = []
      
      # Not considering splits or merges right now
      if len(actualQuery) == len(intendedQuery):
        for idx in range(len(actualQuery)):
          edit1 = findEditOperation(actualQuery[idx], intendedQuery[idx])

          if edit1 != noOperation:  # an empty list means the two words already match
            matrix = matrices[edit1[0]]
            matrix[edit1[1]] += 1
  
  serialize_data(delCounter,"edits_del_counter.mrshl")
  serialize_data(subCounter,"edits_sub_counter.mrshl")
  serialize_data(traCounter,"edits_tra_counter.mrshl")
  serialize_data(insCounter,"edits_ins_counter.mrshl")
  
  ngram_indexes = generateNGramsFromNoisyFile(trainingFile)
  serialize_data(ngram_indexes[0],"edits_char_counter.mrshl")
  serialize_data(ngram_indexes[1],"edits_bichar_counter.mrshl")
  
  return matrices + ngram_indexes
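
scan_edits also relies on two helpers that are not shown: serialize_data (presumably a marshal-style dump, given the .mrshl extension) and generateNGramsFromNoisyFile. A minimal sketch of the latter, assuming it counts character unigrams and bigrams over the intended side of the training data with the same '#' word-start marker, so that keys like ('#','s') exist in the biChar index used as a denominator in Example 1:

def generateNGramsFromNoisyFile(trainingFile):
  uniChar = Counter()
  biChar  = Counter()
  with open(trainingFile) as f:
    for line in f:
      _, intendedQuery = line.split('\t', 1)
      for word in intendedQuery.split():
        word = '#' + word                   # same word-start marker as the edits
        uniChar.update(word)                # char -> counts
        biChar.update(zip(word, word[1:]))  # (char1,char2) -> counts
  return [uniChar, biChar]

A hypothetical end-to-end run, assuming a tab-separated file of actual/intended query pairs (the file name is made up) and that the edits_* globals read by empirical_cost_edit_distance are bound to scan_edits' return values:

(edits_del_counter, edits_sub_counter, edits_tra_counter,
 edits_ins_counter, edits_char_counter, edits_bichar_counter) = scan_edits("noisy_queries.tsv")

# Rank two candidate corrections for the observed word "un"; the candidate
# with the higher log P(q|r) wins. calculate_log_prob (the language model)
# and edit_distance are also assumed to be defined elsewhere.
for candidate in ("sun", "fun"):
  print(candidate, empirical_cost_edit_distance("un", candidate))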