import sentencepiece as spm
from sentencepiece import sentencepiece_pb2
from transformers import XLMRobertaTokenizer


# NOTE: the enclosing class was missing from the original snippet;
# the name below is a placeholder.
class SentencePieceOffsetTokenizer:
    """Encodes text with SentencePiece and returns per-piece character offsets."""

    def __init__(self, vocab_file):
        # Raw SentencePiece processor, used for offset-aware encoding.
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(vocab_file)
        # HuggingFace tokenizer built from the same SentencePiece model file,
        # kept as the source of the token-to-id vocabulary.
        self.huggingface = XLMRobertaTokenizer.from_pretrained(vocab_file)
        self.vocab = self.huggingface.get_vocab()

    def encode(self, sentence):
        # Encode via the serialized proto so each piece carries its
        # character span (begin/end) in the original sentence.
        spt = sentencepiece_pb2.SentencePieceText()
        spt.ParseFromString(self.sp.encode_as_serialized_proto(sentence))
        tokens = []
        offsets = []
        for piece in spt.pieces:
            tokens.append(piece.id)
            offsets.append((piece.begin, piece.end))
        return tokens, offsets
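
# Usage sketch. The model path and sample sentence are illustrative
# assumptions, not part of the original snippet; any XLM-R style
# SentencePiece model file available locally would work.
if __name__ == "__main__":
    tokenizer = SentencePieceOffsetTokenizer("sentencepiece.bpe.model")
    token_ids, offsets = tokenizer.encode("Hello world")
    for tid, (begin, end) in zip(token_ids, offsets):
        # Each piece id pairs with the character span it covers in the input.
        print(tid, (begin, end))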