def test_preprocess_with_offsets(self):
    """Blank lines split paragraphs; each paragraph is reported with its
    character offset into the original document."""
    document = " ab\n \n cd- \n \n ef \n\n"
    expected = [
        (0, " ab"),
        (6, " cd- "),
        (14, " ef "),
        (20, ""),
    ]
    self.assertListEqual(expected, segmenter.preprocess_with_offsets(document))
def _analyze(self, document: str, bracket_skip_len=None) -> Iterator[Iterator[List[Token]]]:
    """Split *document* into paragraphs, tokenize each one, and yield the
    segmented sentences for every paragraph.

    :param document: raw input text
    :param bracket_skip_len: passed through to ``segmenter.segment``
    :return: one iterator of sentences (token lists) per paragraph
    """
    tokenizer = Tokenizer(
        replace_not_contraction=False,
        emit_hyphen_or_underscore_sep=True,
    )
    # Offsets from preprocessing are forwarded so token positions refer
    # to the original document, not to the individual paragraph.
    for start, paragraph in segmenter.preprocess_with_offsets(document):
        yield segmenter.segment(tokenizer.tokenize(paragraph, start), bracket_skip_len)
def test_preprocess_with_offsets_nbsp(self):
    """A line of only whitespace (including NBSP, U+00A0) splits paragraphs,
    while consecutive non-blank lines ("cd- " / " ef ") stay in one paragraph;
    offsets are character positions into the original document.

    NOTE(review): renamed from ``test_preprocess_with_offsets`` — another test
    in this file already uses that name, and two identically named methods in
    the same TestCase would make the earlier one silently never run.
    """
    text = " ab\n\u00a0 \n cd- \n ef \n\n g \n \n"
    result = segmenter.preprocess_with_offsets(text)
    expected = [
        (0, " ab"),
        (7, " cd- \n ef "),
        (19, " g "),
        (25, ""),
    ]
    self.assertListEqual(expected, result)