Example #1
# Needed imports (the enclosing class definition is omitted in this excerpt):
from tokenizers import Tokenizer, models, pre_tokenizers, decoders

def __init__(self, vocab_path, bpe_merges_path):
    tokenizer = Tokenizer(models.BPE.from_files(vocab_path, bpe_merges_path))
    # Use the byte level
    add_prefix_spaces = False  # Whether to automatically prefix the sequences with a space if none found
    tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_spaces))
    tokenizer.with_decoder(decoders.ByteLevel.new())
    # Setup truncation if needed
    truncate = False
    max_length = 1024
    if truncate:
        stride = 0
        strategy = 'longest_first'  # Can also be `only_first` or `only_second`
        tokenizer.with_truncation(max_length, stride, strategy)
    # Setup padding if needed
    padding = False
    # Whether to always pad to max_length. If this is false, we will pad to the
    # longest sequence in the batch.
    pad_to_max_length = False
    padding_side = "right"  # Can also be "left"
    pad_token_id = 0
    pad_token_type_id = 0
    pad_token = "[PAD]"
    if padding:
        tokenizer.with_padding(
            max_length if pad_to_max_length else None,
            padding_side,
            pad_token_id,
            pad_token_type_id,
            pad_token,
        )
    self.tokenizer = tokenizer
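
Once constructed, the wrapper's tokenizer can encode and decode text. A minimal usage sketch, assuming the `encode`/`decode` methods and the `Encoding` attributes (`ids`, `tokens`) of the tokenizers API these examples target; the wrapper class name `GPT2BPETokenizer` and the file paths are placeholders, not names from the original:

# Hypothetical wrapper name and paths, for illustration only:
wrapper = GPT2BPETokenizer("encoder.json", "vocab.bpe")
encoding = wrapper.tokenizer.encode("Hello, world!")
print(encoding.tokens)  # byte-level BPE tokens
print(encoding.ids)     # their vocabulary ids
print(wrapper.tokenizer.decode(encoding.ids))  # round-trips via the ByteLevel decoder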
Example #2
from tokenizers import Tokenizer, models, pre_tokenizers, decoders

# `args.model_name` comes from the surrounding script's argument parser.
# The vocab path below is an assumption based on the standard GPT-2 release
# layout (encoder.json alongside vocab.bpe); the original line was not shown.
vocab = "./models/%s/encoder.json" % args.model_name
merges = "./models/%s/vocab.bpe" % args.model_name
tokenizer = Tokenizer(models.BPE.from_files(vocab, merges))

# Use the byte level
add_prefix_spaces = False  # Whether to automatically prefix the sequences with a space if none found
tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_spaces))
tokenizer.with_decoder(decoders.ByteLevel.new())

# Setup truncation if needed
truncate = False
max_length = 1024

if truncate:
    stride = 0
    strategy = 'longest_first'  # Can also be `only_first` or `only_second`
    tokenizer.with_truncation(max_length, stride, strategy)

# Setup padding if needed
padding = False
# Whether to always pad to max_length. If this is false, we will pad to the
# longest sequence in the batch.
pad_to_max_length = False
padding_side = "right"  # Can also be "left"
pad_token_id = 0
pad_token_type_id = 0
pad_token = "[PAD]"

if padding:
    tokenizer.with_padding(max_length if pad_to_max_length else None,
                           padding_side, pad_token_id, pad_token_type_id,
                           pad_token)
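
With `truncate` and `padding` set to True before the setup above runs, the same tokenizer yields fixed-shape batches. A short sketch, assuming `encode_batch` exists in this version of the API as it does in later ones; the sample sentences are placeholders:

# Sketch: assumes `truncate = True` and `padding = True` were set before the
# setup above ran, so with_truncation/with_padding were actually applied.
encodings = tokenizer.encode_batch(["a short sentence",
                                    "a noticeably longer sample sentence"])
for enc in encodings:
    print(len(enc.ids), enc.tokens)  # equal lengths once padding is applied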