def get_tokens_and_token_offsets(self, tokenizer):
    # Tokenize each sentence, then map every token to its character
    # offset, shifted by the sentence's start offset in the document.
    tokens = [tokenizer(sentence) for sentence in self.sentences]
    token_offsets = [
        utils.get_offsets(sentence, sentence_tokens, offset)
        for sentence, sentence_tokens, offset
        in zip(self.sentences, tokens, self.sentence_offsets)
    ]
    return tokens, token_offsets
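A brief usage sketch may help here. It assumes the enclosing class is named Example (the class statement is not part of this excerpt) and that any callable mapping a string to a list of tokens, such as str.split, can serve as the tokenizer; the sample text and expected values are illustrative only.

    # Hypothetical usage (the class name Example is an assumption):
    example = Example({'id': 1, 'text': 'This is a pen. That is a dog.'})
    tokens, token_offsets = example.get_tokens_and_token_offsets(str.split)
    # token_offsets[i] holds document-level character offsets for the
    # tokens of sentence i, e.g. token_offsets[0] could start [0, 5, 8].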
def __init__(self, raw: dict) -> None:
    self.raw = raw
    self.id = raw['id']
    self.text = raw['text']
    # Split the text into sentences and record each sentence's start
    # offset; the total text length is appended as a sentinel end offset.
    self.sentences = utils.split_sentences(raw['text'])
    self.sentence_offsets = utils.get_offsets(raw['text'], self.sentences)
    self.sentence_offsets.append(len(raw['text']))
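For readers without the utils module at hand, here is a minimal stand-in for split_sentences, assuming newline-delimited sentences; the library's actual splitter may use a different rule.

    # A minimal stand-in for utils.split_sentences (assumption: the
    # real implementation may split differently):
    def split_sentences(text: str) -> list:
        # Treat each non-empty line as one sentence.
        return [line for line in text.split('\n') if line]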
def test_get_offsets(self):
    # Offsets are the character positions of each token within the raw
    # text, so leading and inter-token whitespace shifts them right.
    text = ' This is Doccano Transformer . '
    tokens = text.split()
    result = utils.get_offsets(text, tokens)
    expected = [1, 6, 9, 17, 29]
    self.assertListEqual(result, expected)
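To make the expected values concrete, here is a sketch of get_offsets that is consistent with this test and with the three-argument call in get_tokens_and_token_offsets; the library's real implementation may differ. The optional offset parameter shifts the returned positions, which is what passing document-level sentence offsets requires.

    # A sketch consistent with the test above (assumption: not
    # necessarily the library's exact implementation):
    def get_offsets(text: str, tokens: list, offset: int = 0) -> list:
        offsets = []
        pos = 0
        for token in tokens:
            # Find each token left to right; intervening whitespace is
            # skipped because index() resumes at the previous token's end.
            pos = text.index(token, pos)
            offsets.append(pos + offset)
            pos += len(token)
        return offsets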