Esempio n. 1
0
 def test_subsequences(self):
     real_word = "which"
     expected = [
         'w', 'wh', 'whi', 'whic', 'which',
         'h', 'hi', 'hic', 'hich',
         'i', 'ic', 'ich',
         'c', 'ch', 
         'h'
         ]
     actual = [s for s in subsequences(real_word)]
     self.assertEqual(len(expected), len(actual))
     self.assertEqual(expected, actual)
Esempio n. 2
0
    def run(self):
        errors = []
        pbar = build_progressbar(self.real_words)

        finder = EditFinder()

        for i,word in enumerate(self.real_words):
            pbar.update(i+1)

            # Find all the edits we can make to this word.
            possible_edits = list()
            probs = list()
            for subseq in subsequences(word):
                # Probably delete this if statement as redundant.
                for e in self.edit_db.edits(subseq):
                    _, error_subseq, count = e
                    possible_edit = (subseq, error_subseq)
                    if count > 0:
                        possible_edits.append(possible_edit)
                        probs.append(count)

            if len(possible_edits) == 0:
                continue

            probs = np.array(probs)
            probs = probs / float(probs.sum())

            seen_edits = set()
            errors_for_word = []
            attempts = 0.

            # Try to generate up to the requested number of errors per word.
            while True:
                try:
                    attempts += 1.

                    if self.enough_errors_for_word(word, errors_for_word):
                        # Generated enough errors for this word.
                        break
                    elif attempts > 10 and len(errors_for_word) / attempts < 0.1:
                        # Not finding many errors to apply.  Break out.
                        break

                    # Sample the number of edits.
                    edit_sizes = np.arange(1, self.max_edits_per_error+1)
                    edit_size_probs = 1. / edit_sizes
                    edit_size_probs /= edit_size_probs.sum()
                    size = self.random_state.choice(edit_sizes, size=1, replace=False,
                            p=edit_size_probs)[0]

                    # Sample edits with probability proportional to the edit's frequency.
                    edit_idx = self.random_state.choice(len(probs), size=size, replace=False, p=probs)

                    edit = []
                    for i in edit_idx:
                        pe = possible_edits[i]
                        if pe in seen_edits:
                            continue
                        seen_edits.add(pe)
                        edit.append(pe)

                    if len(edit) == 0:
                        continue
    
                    # Avoid applying edits that result in unlikely errors.
                    for constraint in self.constraints:
                        for e in edit:
                            if constraint(word, e):
                                raise EditConstraintError("can't apply edit %s=>%s to word '%s'" % \
                                        (e[0], e[1], word))

                    error = finder.apply(word, edit)
                    if error in self.blacklist:
                        # Skip blacklisted words (i.e. non-words in a corpus used to generate the
                        # edit patterns in the edit database).
                        continue

                    errors_for_word.append((word, len(possible_edits), edit, error))

                except EditConstraintError as e:
                    if self.verbose:
                        print(e)

            errors.extend(errors_for_word)

        pbar.finish()
    
        return errors