Example #1
0
class TestEditor(unittest.TestCase):
    """Spot-check each ``Editor`` operation against one known edit of "food".

    Every test asserts that a single expected string is among the edits
    returned for the word "food"; the full set of returned edits is not
    checked.
    """

    def setUp(self):
        """Create the ``Editor`` instance used by the test methods."""
        self.editor = Editor()

    def test_insert(self):
        """Insertion is reachable via ``insert`` and via ``edit(..., "insert")``."""
        edits = self.editor.insert("food")
        self.assertIn('fozod', edits)
        # The generic entry point is expected to yield the same insertion.
        edits = self.editor.edit("food", "insert")
        self.assertIn('fozod', edits)

    def test_delete(self):
        """Deleting one character of "food" can yield "fod"."""
        edits = self.editor.delete("food")
        self.assertIn('fod', edits)

    def test_substitute(self):
        """Substituting one character of "food" can yield "zood"."""
        edits = self.editor.substitute("food")
        self.assertIn('zood', edits)

    def test_transpose(self):
        """Swapping adjacent characters of "food" can yield "ofod"."""
        edits = self.editor.transpose("food")
        self.assertIn('ofod', edits)

    def test_split(self):
        """Splitting "food" with a space can yield "fo od"."""
        edits = self.editor.split("food")
        self.assertIn('fo od', edits)
Example #2
0
 def setUp(self):
     """Build a new ``Editor`` and store it on ``self.editor``."""
     fresh_editor = Editor()
     self.editor = fresh_editor
def build_operation_corpus(distance, operation, words, n=3, random_state=17):
    """Build a corpus of sampled edits of ``words`` for a single edit operation.

    For each word, ``operation`` is applied for up to ``distance`` rounds via
    ``Editor.edit``.  After every round the source word and any candidate
    that appears in ``words`` are discarded, and at most ``n`` of the
    remaining candidates are sampled (without replacement) to seed the next
    round.  The edits that survive the last completed round are sampled down
    to ``n`` once more and appended to the corpus.

    Args:
        distance: Number of edit rounds to apply to each word.  Also stored
            for every row under ``corpus['distance']``.
        operation: Operation identifier forwarded to ``Editor.edit``.  Also
            stored for every row under ``corpus['operation']``.
        words: Re-iterable collection of hashable words.  It is iterated as
            the source vocabulary and also used as the set of real words to
            exclude from the generated edits.
        n: Maximum number of edits kept per round and per word.
        random_state: Integer seed (wrapped in ``np.random.RandomState``) or
            an object exposing a compatible ``choice`` method.

    Returns:
        The object returned by ``init_corpus()`` after appending one entry
        per emitted edit to its ``'word'``, ``'marked_word'``,
        ``'real_word'``, ``'binary_target'``, ``'multiclass_target'``,
        ``'orig_pattern'`` and ``'changed_pattern'`` entries, and after
        setting ``'distance'`` and ``'operation'`` to per-row lists.

    Raises:
        ValueError: Propagated from ``random_state.choice`` when the per-round
            sample size is invalid (for example a negative ``n``).
    """
    if isinstance(random_state, int):
        random_state = np.random.RandomState(seed=random_state)

    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text type.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    editor = Editor()
    edit_finder = EditFinder()
    pbar = build_progressbar(words)

    corpus = init_corpus()

    words_set = set(words)

    for i, w in enumerate(words):
        pbar.update(i + 1)
        edits = {w}
        for _ in range(distance):
            new_edits = set()
            for edit in edits:
                new_edits.update(editor.edit(edit, operation))

            # Keep only candidates that are neither the source word nor any
            # other real word.
            new_edits.discard(w)
            new_edits -= words_set

            # No usable candidates this round: stop and keep the edits from
            # the previous round.
            # NOTE(review): if this happens in the first round (or if
            # distance is 0), ``edits`` is still {w}, so the real word itself
            # is emitted below with a 0 target -- confirm that is intended.
            if not new_edits:
                break

            n_choice = min(n, len(new_edits))

            # Sort before sampling: set iteration order depends on string
            # hashing, so sampling from list(set) would let the same seed
            # produce different corpora across runs or interpreters.
            edits = random_state.choice(
                sorted(new_edits), size=n_choice, replace=False)

        try:
            edits = random_state.choice(list(edits), size=n, replace=False)
        except ValueError:
            # ``choice`` rejects a sample larger than the population when
            # replace=False; in that case keep every available edit.
            pass

        for edit in edits:
            corpus['word'].append(to_text(edit))
            # Use start-of-word and end-of-word markers as in http://arxiv.org/abs/1602.02410.
            corpus['marked_word'].append('^' + edit + '$')
            corpus['real_word'].append(w)
            corpus['binary_target'].append(0)
            corpus['multiclass_target'].append(0)

            # ``EditFinder.find`` yields (original, changed) pairs; each side
            # is joined into a '-'-separated pattern string.
            orig_chars = []
            changed_chars = []
            for orig, changed in edit_finder.find(w, edit):
                orig_chars.append(orig)
                changed_chars.append(changed)
            corpus['orig_pattern'].append('-'.join(orig_chars))
            corpus['changed_pattern'].append('-'.join(changed_chars))

    pbar.finish()

    # Every row shares the same distance and operation.
    corpus['distance'] = [distance for _ in corpus['word']]
    corpus['operation'] = [operation for _ in corpus['word']]

    return corpus