Example #1
# Needed imports (the enclosing class definition is omitted in this excerpt):
from tokenizers import Tokenizer, models, pre_tokenizers, decoders

def __init__(self, vocab_path, bpe_merges_path):
    tokenizer = Tokenizer(models.BPE.from_files(vocab_path, bpe_merges_path))
    # Use the byte level
    add_prefix_spaces = False  # Whether to automatically prefix the sequences with a space if none found
    tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_spaces))
    tokenizer.with_decoder(decoders.ByteLevel.new())
    # Setup truncation if needed
    truncate = False
    max_length = 1024
    if truncate:
        stride = 0
        strategy = 'longest_first'  # Can also be `only_first` or `only_second`
        tokenizer.with_truncation(max_length, stride, strategy)
    # Setup padding if needed
    padding = False
    # Whether to always pad to max_length. If this is false, we will pad to the
    # longest sequence in the batch.
    pad_to_max_length = False
    padding_side = "right"  # Can also be "left"
    pad_token_id = 0
    pad_token_type_id = 0
    pad_token = "[PAD]"
    if padding:
        tokenizer.with_padding(
            max_length if pad_to_max_length else None,
            padding_side,
            pad_token_id,
            pad_token_type_id,
            pad_token,
        )
    self.tokenizer = tokenizer
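
Once constructed, the wrapper's tokenizer can encode and decode text. A minimal usage sketch, assuming the `encode`/`decode` methods and the `Encoding` attributes (`ids`, `tokens`) of the tokenizers API these examples target; the wrapper class name `GPT2BPETokenizer` and the file paths are placeholders, not names from the original:

# Hypothetical wrapper name and paths, for illustration only:
wrapper = GPT2BPETokenizer("encoder.json", "vocab.bpe")
encoding = wrapper.tokenizer.encode("Hello, world!")
print(encoding.tokens)  # byte-level BPE tokens
print(encoding.ids)     # their vocabulary ids
print(wrapper.tokenizer.decode(encoding.ids))  # round-trips via the ByteLevel decoder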
Example #2
from tokenizers import Tokenizer, models, pre_tokenizers, decoders

# `args.model_name` comes from the surrounding script's argument parser.
# The vocab path below is an assumption based on the standard GPT-2 release
# layout (encoder.json alongside vocab.bpe); the original line was not shown.
vocab = "./models/%s/encoder.json" % args.model_name
merges = "./models/%s/vocab.bpe" % args.model_name
tokenizer = Tokenizer(models.BPE.from_files(vocab, merges))

# Use the byte level
add_prefix_spaces = False  # Whether to automatically prefix the sequences with a space if none found
tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_spaces))
tokenizer.with_decoder(decoders.ByteLevel.new())

# Setup truncation if needed
truncate = False
max_length = 1024

if truncate:
    stride = 0
    strategy = 'longest_first'  # Can also be `only_first` or `only_second`
    tokenizer.with_truncation(max_length, stride, strategy)

# Setup padding if needed
padding = False
# Whether to always pad to max_length. If this is false, we will pad to the
# longest sequence in the batch.
pad_to_max_length = False
padding_side = "right"  # Can also be "left"
pad_token_id = 0
pad_token_type_id = 0
pad_token = "[PAD]"

if padding:
    tokenizer.with_padding(max_length if pad_to_max_length else None,
                           padding_side, pad_token_id, pad_token_type_id,
                           pad_token)
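
With `truncate` and `padding` set to True before the setup above runs, the same tokenizer yields fixed-shape batches. A short sketch, assuming `encode_batch` exists in this version of the API as it does in later ones; the sample sentences are placeholders:

# Sketch: assumes `truncate = True` and `padding = True` were set before the
# setup above ran, so with_truncation/with_padding were actually applied.
encodings = tokenizer.encode_batch(["a short sentence",
                                    "a noticeably longer sample sentence"])
for enc in encodings:
    print(len(enc.ids), enc.tokens)  # equal lengths once padding is applied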