class TestEditor(unittest.TestCase):
    """Spot-check each ``Editor`` operation against one known edit of "food".

    Every test asks the editor for the candidate edits of the word "food"
    and checks that a single expected candidate is among them.
    """

    def setUp(self):
        # unittest runs this before every test method, so each test works
        # with its own Editor instance.
        self.editor = Editor()

    def test_insert(self):
        # Inserting "z" after "fo" must be one of the candidates.
        edits = self.editor.insert("food")
        self.assertIn('fozod', edits)

        # The same candidate is expected when the operation is selected
        # by name through edit().
        edits = self.editor.edit("food", "insert")
        self.assertIn('fozod', edits)

    def test_delete(self):
        # Dropping one of the two "o" characters.
        edits = self.editor.delete("food")
        self.assertIn('fod', edits)

    def test_substitute(self):
        # Replacing the leading "f" with "z".
        edits = self.editor.substitute("food")
        self.assertIn('zood', edits)

    def test_transpose(self):
        # Swapping the first two characters.
        edits = self.editor.transpose("food")
        self.assertIn('ofod', edits)

    def test_split(self):
        # Breaking the word in two with a space.
        edits = self.editor.split("food")
        self.assertIn('fo od', edits)
def setUp(self):
    """Create an ``Editor`` and store it on ``self.editor``."""
    # NOTE(review): the enclosing class is not visible in this part of the
    # file and the body is the same as TestEditor.setUp -- confirm which
    # class this fixture belongs to.
    self.editor = Editor()
def build_operation_corpus(distance, operation, words, n=3, random_state=17):
    """Build a corpus of non-word edits of ``words`` using a single operation.

    For each word, ``operation`` is applied ``distance`` times in succession
    through ``Editor.edit``.  After every round, candidates that appear in
    ``words`` (which includes the source word itself) are discarded and at
    most ``n`` of the remaining candidates are sampled without replacement
    to seed the next round.  If a round leaves no candidates, the edits of
    the previous round are kept; when that happens in the first round (or
    when ``distance`` is 0) the word itself is emitted as its only "edit".

    Parameters
    ----------
    distance : int
        Number of successive rounds of ``operation``.  Also stored in the
        ``'distance'`` column of every row.
    operation : str
        Operation name handed to ``Editor.edit`` (e.g. ``"insert"``).  Also
        stored in the ``'operation'`` column of every row.
    words : sequence of str
        Real words to corrupt.  Iterated once, and also used as the
        vocabulary of strings that must not be produced as edits.
    n : int, optional
        Maximum number of edits sampled per round, and hence per word.
    random_state : None, int or RandomState-like, optional
        ``None`` or an integer (Python or NumPy) is used to seed a new
        ``np.random.RandomState``.  Any other object is used as-is and must
        provide ``choice(a, size=..., replace=...)``.

    Returns
    -------
    corpus
        The object returned by ``init_corpus()`` -- presumably a dict of
        lists; confirm against ``init_corpus`` -- with one entry per
        generated edit appended to ``'word'``, ``'marked_word'``,
        ``'real_word'``, ``'binary_target'``, ``'multiclass_target'``,
        ``'orig_pattern'`` and ``'changed_pattern'``, and with ``'distance'``
        and ``'operation'`` set to constant columns of the same length as
        ``'word'``.
    """
    if random_state is None or isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(seed=random_state)

    editor = Editor()
    edit_finder = EditFinder()
    pbar = build_progressbar(words)
    corpus = init_corpus()
    words_set = set(words)

    for position, w in enumerate(words, 1):
        pbar.update(position)

        edits = {w}
        for _ in range(distance):
            # Expand every edit from the previous round by one more
            # application of the operation.
            new_edits = set()
            for edit in edits:
                new_edits.update(editor.edit(edit, operation))

            # Remove real words from the edits.  `w` comes from `words`, so
            # this also removes the word itself.
            new_edits -= words_set

            # Stop if we can't make any new edits; `edits` keeps the result
            # of the previous round.
            if not new_edits:
                break

            # NOTE(review): `new_edits` is a set, so the order handed to the
            # sampler is the set's iteration order.  A fixed seed therefore
            # only reproduces the same sample while that order is stable
            # (string hash randomisation changes it).  Left as-is so that
            # previously generated corpora are unaffected.
            n_choice = min(n, len(new_edits))
            edits = random_state.choice(
                list(new_edits), size=n_choice, replace=False)

        try:
            edits = random_state.choice(list(edits), size=n, replace=False)
        except ValueError:
            # choice() refuses to draw more samples than there are edits
            # when replace=False; in that case keep all of them.
            pass

        for edit in edits:
            # NOTE(review): `unicode` is the Python 2 builtin; on Python 3
            # this name has to be provided elsewhere in the file.
            corpus['word'].append(unicode(edit))
            # Use start-of-word and end-of-word markers as in
            # http://arxiv.org/abs/1602.02410.
            corpus['marked_word'].append('^' + edit + '$')
            corpus['real_word'].append(w)
            corpus['binary_target'].append(0)
            corpus['multiclass_target'].append(0)

            # EditFinder.find yields (original, changed) string pairs; each
            # side is joined with '-' into one pattern string.
            pairs = list(edit_finder.find(w, edit))
            corpus['orig_pattern'].append(
                '-'.join(orig for orig, _ in pairs))
            corpus['changed_pattern'].append(
                '-'.join(changed for _, changed in pairs))

    pbar.finish()

    n_rows = len(corpus['word'])
    corpus['distance'] = [distance] * n_rows
    corpus['operation'] = [operation] * n_rows

    return corpus