Ejemplo n.º 1
0
 def test_preprocess_with_offsets(self):
     """Blank lines split paragraphs; each paragraph keeps its start offset."""
     sample = " ab\n \n cd- \n \n ef \n\n"
     expected = [(0, " ab"), (6, " cd- "), (14, " ef "), (20, "")]
     self.assertListEqual(expected, segmenter.preprocess_with_offsets(sample))
Ejemplo n.º 2
0
    def _analyze(self, document: str, bracket_skip_len=None) -> Iterator[Iterator[List[Token]]]:
        """Tokenize each paragraph of *document* and yield its sentence segmentation.

        One segmentation iterator is yielded per paragraph produced by
        ``segmenter.preprocess_with_offsets``; token offsets stay anchored to
        the original document via each paragraph's start offset.
        """
        tokenizer = Tokenizer(replace_not_contraction=False, emit_hyphen_or_underscore_sep=True)

        for start, para in segmenter.preprocess_with_offsets(document):
            yield segmenter.segment(tokenizer.tokenize(para, start), bracket_skip_len)
Ejemplo n.º 3
0
 def test_preprocess_with_offsets(self):
     """Whitespace-only lines (incl. NBSP) split paragraphs; single newlines don't."""
     sample = " ab\n\u00a0 \n cd- \n ef \n\n g \n \n"
     expected = [(0, " ab"), (7, " cd- \n ef "), (19, " g "), (25, "")]
     self.assertListEqual(expected, segmenter.preprocess_with_offsets(sample))