import itertools
from typing import List

from transformers import GPT2Tokenizer


def tokenized_index(tokenizer: GPT2Tokenizer, text: str) -> List[int]:
    # Tokenize with GPT-2's byte-level BPE, then map each token string back to
    # its raw bytes via the tokenizer's byte decoder so lengths are measured in
    # UTF-8 bytes of the original text.
    token_list = tokenizer.tokenize(text)
    token_bytes = [
        bytes([tokenizer.byte_decoder[c] for c in token]) for token in token_list
    ]
    # Running total of byte lengths: the i-th entry is the byte offset at which
    # the i-th token ends in `text`.
    token_indexes = list(itertools.accumulate(map(len, token_bytes)))
    return token_indexes
def tokenize_list(content_list: List[str], tokenizer: GPT2Tokenizer) -> List[List[str]]:
    # Tokenize each sentence independently, preserving input order.
    return [tokenizer.tokenize(sentence) for sentence in content_list]
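

# A minimal usage sketch, not part of the original module: it assumes the slow
# Hugging Face tokenizer loaded via GPT2Tokenizer.from_pretrained("gpt2"); the
# checkpoint name and sample strings are illustrative assumptions only.
if __name__ == "__main__":
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Cumulative byte offsets where each token ends in the UTF-8 text,
    # e.g. [5, 11] for the two tokens "Hello" / " world".
    print(tokenized_index(tokenizer, "Hello world"))

    # One token list per input sentence.
    print(tokenize_list(["First sentence.", "Second sentence."], tokenizer))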