Example #1
import itertools
from typing import List

from transformers import GPT2Tokenizer

def tokenized_index(tokenizer: GPT2Tokenizer, text: str) -> List[int]:
    # GPT-2's byte-level BPE renders tokens as unicode characters; the
    # tokenizer's byte_decoder maps each character back to its original byte.
    token_list = tokenizer.tokenize(text)
    token_bytes = [
        bytes([tokenizer.byte_decoder[c] for c in token]) for token in token_list
    ]
    # Accumulating the byte lengths yields the end offset (in bytes) of each
    # token within the original text.
    token_indexes = list(itertools.accumulate(map(len, token_bytes)))
    return token_indexes
def tokenize_list(content_list: List[str], tokenizer: GPT2Tokenizer) -> List[List[str]]:
    # Tokenize each sentence independently, keeping one token list per sentence.
    return [tokenizer.tokenize(sentence) for sentence in content_list]
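
A minimal usage sketch follows; the "gpt2" checkpoint name and the sample strings are illustrative assumptions, not part of the example above.

# Illustrative usage; the checkpoint and sample inputs are assumptions.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

text = "Hello world, this is a test."
# Prints the cumulative byte end-offset of each token in `text`.
print(tokenized_index(tokenizer, text))

sentences = ["Hello world.", "Byte-level BPE splits words into subword tokens."]
# Prints one list of token strings per input sentence.
print(tokenize_list(sentences, tokenizer))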