Example #1
 def get_tokens_and_token_offsets(self, tokenizer):
     # Tokenize every sentence, then map each token back to its character
     # offset in the original text using the sentence's start offset.
     tokens = [tokenizer(sentence) for sentence in self.sentences]
     token_offsets = [
         utils.get_offsets(sentence, tokens, offset)
         # The loop variable ``tokens`` shadows the outer list inside the
         # comprehension; zip() still receives the outer list because it is
         # evaluated before the loop variables are bound.
         for sentence, tokens, offset
         in zip(self.sentences, tokens, self.sentence_offsets)
     ]
     return tokens, token_offsets
Example #2
 def __init__(self, raw: dict) -> None:
     self.raw = raw
     self.id = raw['id']
     self.text = raw['text']
     self.sentences = utils.split_sentences(raw['text'])
     self.sentence_offsets = utils.get_offsets(raw['text'], self.sentences)
     self.sentence_offsets.append(len(raw['text']))
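The constructor above reads only the 'id' and 'text' keys from raw before splitting the text into sentences and recording their offsets. A minimal input dict, with purely illustrative values, would therefore look like this:

 raw = {
     'id': 1,
     'text': 'This is Doccano Transformer. It maps annotations to token offsets.',
 }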
Example #3
 def test_get_offsets(self):
     text = ' This is Doccano Transformer . '
     tokens = text.split()
     result = utils.get_offsets(text, tokens)
     expected = [1, 6, 9, 17, 29]
     self.assertListEqual(result, expected)
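The test above pins down what utils.get_offsets returns for a whitespace-tokenized string: the start character offset of each token within the text. The snippet below is a minimal sketch of a function with that behaviour, written for illustration only; the optional offset argument (seen in Example #1) is assumed to shift every returned position by a base value, which may differ from the library's actual signature.

 def get_offsets(text, tokens, offset=0):
     # Locate each token in order and record its start position,
     # shifted by the optional base ``offset``.
     offsets = []
     position = 0
     for token in tokens:
         position = text.index(token, position)
         offsets.append(position + offset)
         position += len(token)
     return offsets

 # Reproduces the expectation checked in test_get_offsets:
 # get_offsets(' This is Doccano Transformer . ',
 #             ['This', 'is', 'Doccano', 'Transformer', '.'])
 # returns [1, 6, 9, 17, 29]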