Example #1
0
 def __init__(self, vocab_file):
     """Load a SentencePiece model and the matching HuggingFace tokenizer.

     Args:
         vocab_file: Path to the SentencePiece model; also handed to
             ``XLMRobertaTokenizer.from_pretrained`` to build the
             HuggingFace tokenizer and its vocabulary mapping.
     """
     self.sp = spm.SentencePieceProcessor()
     self.sp.load(vocab_file)
     # NOTE(review): encode() builds its own local SentencePieceText, so this
     # instance attribute looks unused — confirm against the rest of the file
     # before removing it.
     self.spt = sentencepiece_pb2.SentencePieceText()
     self.huggingface = XLMRobertaTokenizer.from_pretrained(vocab_file)
     # token -> id mapping from the HuggingFace tokenizer
     self.vocab = self.huggingface.get_vocab()
Example #2
0
 def encode(self, sentence):
     """Encode *sentence* with the SentencePiece model.

     Returns:
         A ``(tokens, offsets)`` pair: ``tokens`` is the list of piece ids,
         ``offsets`` the list of ``(begin, end)`` character spans, one per
         piece, in the same order.
     """
     proto = sentencepiece_pb2.SentencePieceText()
     # The serialized-proto API is used (rather than encode_as_ids) because
     # it carries the per-piece character offsets.
     proto.ParseFromString(self.sp.encode_as_serialized_proto(sentence))
     token_ids = [piece.id for piece in proto.pieces]
     spans = [(piece.begin, piece.end) for piece in proto.pieces]
     return token_ids, spans