def test_subsequences(self):
    # subsequences() is expected to produce the contiguous substrings of the
    # word, ordered by start position and then by increasing length. The
    # repeated 'h' in "which" means 'h' legitimately appears twice.
    word = "which"
    wanted = [
        'w', 'wh', 'whi', 'whic', 'which',
        'h', 'hi', 'hic', 'hich',
        'i', 'ic', 'ich',
        'c', 'ch',
        'h',
    ]
    produced = list(subsequences(word))
    # Compare lengths first so a count mismatch is reported on its own.
    self.assertEqual(len(wanted), len(produced))
    self.assertEqual(wanted, produced)
def run(self):
    """Generate artificial errors for every word in ``self.real_words``.

    For each word, every edit that ``self.edit_db`` reports with a positive
    count for any of the word's subsequences is a candidate. Candidates are
    sampled without replacement with probability proportional to their
    count, in groups of 1 to ``self.max_edits_per_error`` (a group of size
    ``k`` is chosen with probability proportional to ``1/k``). A group is
    rejected if any callable in ``self.constraints`` returns a true value
    for one of its edits, or if the resulting error is in
    ``self.blacklist``. Sampling for a word stops when
    ``self.enough_errors_for_word`` returns true, or when after more than
    10 attempts fewer than 10% of the attempts have produced an error.

    Returns:
        A list of ``(word, number_of_candidate_edits, edit, error)``
        tuples, where ``edit`` is the list of
        ``(subsequence, error_subsequence)`` pairs that were applied and
        ``error`` is the value returned by ``EditFinder.apply``.
    """
    errors = []
    pbar = build_progressbar(self.real_words)
    finder = EditFinder()

    # The distribution over the number of edits per error is the same for
    # every word and every attempt, so build it once.
    edit_sizes = np.arange(1, self.max_edits_per_error + 1)
    edit_size_probs = 1. / edit_sizes
    edit_size_probs /= edit_size_probs.sum()

    for word_idx, word in enumerate(self.real_words):
        pbar.update(word_idx + 1)

        # Find all the edits we can make to this word, keeping only those
        # with a positive count so they can be used as sampling weights.
        possible_edits = []
        counts = []
        for subseq in subsequences(word):
            for _, error_subseq, count in self.edit_db.edits(subseq):
                if count > 0:
                    possible_edits.append((subseq, error_subseq))
                    counts.append(count)

        if not possible_edits:
            continue

        probs = np.array(counts)
        probs = probs / float(probs.sum())

        # Each candidate edit is used at most once per word.
        seen_edits = set()
        errors_for_word = []
        attempts = 0.

        # Try to generate up to the requested number of errors per word.
        while True:
            try:
                attempts += 1.
                if self.enough_errors_for_word(word, errors_for_word):
                    # Generated enough errors for this word.
                    break
                if attempts > 10 and len(errors_for_word) / attempts < 0.1:
                    # Not finding many errors to apply. Break out.
                    break

                # Sample the number of edits.
                size = self.random_state.choice(
                    edit_sizes, size=1, replace=False,
                    p=edit_size_probs)[0]
                # Sampling without replacement cannot return more items
                # than there are candidates (numpy raises ValueError), so
                # cap the request for words with few candidate edits.
                size = min(size, len(possible_edits))

                # Sample edits with probability proportional to the edit's
                # frequency.
                edit_idx = self.random_state.choice(
                    len(probs), size=size, replace=False, p=probs)

                edit = []
                for idx in edit_idx:
                    candidate = possible_edits[idx]
                    if candidate in seen_edits:
                        continue
                    seen_edits.add(candidate)
                    edit.append(candidate)

                if not edit:
                    # Everything sampled this round was already used.
                    continue

                # Avoid applying edits that result in unlikely errors.
                for constraint in self.constraints:
                    for pair in edit:
                        if constraint(word, pair):
                            raise EditConstraintError(
                                "can't apply edit %s=>%s to word '%s'" %
                                (pair[0], pair[1], word))

                error = finder.apply(word, edit)
                if error in self.blacklist:
                    # Skip blacklisted words (i.e. non-words in a corpus
                    # used to generate the edit patterns in the edit
                    # database).
                    continue

                errors_for_word.append(
                    (word, len(possible_edits), edit, error))
            except EditConstraintError as exc:
                # A rejected edit still counts as an attempt; its
                # candidates stay marked as seen.
                if self.verbose:
                    print(exc)

        errors.extend(errors_for_word)

    pbar.finish()
    return errors