def load_tokenizer(tokenizer_name=None):
    tokenizer_name = tokenizer_name or DEFAULT_TOKENIZER_NAME
    if tokenizer_name not in POSSIBLE_TOKENIZER_NAMES:
        raise ValueError(
            f"Tokenizer name should be from {POSSIBLE_TOKENIZER_NAMES}")

    if tokenizer_name.lower() == "my_ru":
        # *** My RU tokenizer *** #
        my_ru_tokenizer = RobertaTokenizerFast(
            f"{data_paths['ru']}/my_ru_tokenizer/vocab.json",
            f"{data_paths['ru']}/my_ru_tokenizer/merges.txt",
            f"{data_paths['ru']}/my_ru_tokenizer/tokenizer.json",
            **special_tokens["<>"]
        )
        my_ru_tokenizer.add_special_tokens(special_tokens["<>"])
        return my_ru_tokenizer
    elif tokenizer_name.lower() == "rubert":
        # *** RuBERT tokenizer *** #
        rubert_tokenizer = AutoTokenizer.from_pretrained(
            "DeepPavlov/rubert-base-cased", **special_tokens["[]"])
        rubert_tokenizer.add_special_tokens(special_tokens["[]"])
        return rubert_tokenizer
    elif tokenizer_name.lower() == "xlm":
        # *** XLM tokenizer *** #
        xlm_tokenizer = AutoTokenizer.from_pretrained(
            "xlm-roberta-base", **special_tokens["<>"])
        xlm_tokenizer.add_special_tokens(special_tokens["<>"])
        return xlm_tokenizer
    elif tokenizer_name.lower() == "helsinki-nlp-ru":
        # *** Helsinki-NLP RU tokenizer *** #
        ru_tokenizer = AutoTokenizer.from_pretrained(
            "Helsinki-NLP/opus-mt-ru-en", **special_tokens["<>"])
        ru_tokenizer.add_special_tokens(special_tokens["<>"])
        return ru_tokenizer
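# Example usage (illustrative sketch): assumes the module-level DEFAULT_TOKENIZER_NAME,
# POSSIBLE_TOKENIZER_NAMES, data_paths and special_tokens used above are defined, and that
# the "my_ru" vocab/merges/tokenizer files exist on disk.
# tokenizer = load_tokenizer("rubert")
# print(tokenizer.tokenize("Пример текста для токенизации"))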
def get_bert_tokenizer(bert_model_type):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return BertTokenizerFast(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                                 do_lower_case=do_lower_case)
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        return RobertaTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
    elif bert_model_type in ['xlnet-base-cased']:
        if '-uncased' in bert_model_type:
            do_lower_case = True
        else:
            do_lower_case = False  # default
        return XLNetTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                              do_lower_case=do_lower_case)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return AlbertTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        tokenizer = GPT2TokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
        # GPT-2 has no padding token by default; reuse EOS as PAD.
        # https://github.com/huggingface/transformers/issues/3859
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    elif bert_model_type in ['transfo-xl']:
        return TransfoXLTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return DistilBertTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            do_lower_case=do_lower_case)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
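# Example usage (illustrative sketch): assumes BERT_VOCAB_FILE / BERT_MERGE_FILE map the
# requested model name to vocab and merges files that exist locally.
# tokenizer = get_bert_tokenizer('roberta-base')
# print(tokenizer.tokenize('Hello world'))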
def save_tmp_tokenizer():
    paths = [str(dataset_path / 'oscar.eo.1000.txt')]

    # Initialize a tokenizer
    tokenizer_tmp = ByteLevelBPETokenizer()

    # Customize training
    tokenizer_tmp.train(files=paths,
                        vocab_size=10_000,
                        min_frequency=2,
                        special_tokens=[
                            "<s>",
                            "<pad>",
                            "</s>",
                            "<unk>",
                            "<mask>",
                        ])

    # Save files to disk
    tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
    tokenizer_tmp.save_model(str(tokenizer_tmp_path))


save_tmp_tokenizer()

tokenizer = RobertaTokenizerFast(
    tokenizer_tmp_path / 'vocab.json',
    tokenizer_tmp_path / 'merges.txt'
)
tokenizer.save_pretrained(tokenizer_path)

# from transformers import AutoTokenizer, RobertaConfig
# tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, config=RobertaConfig())
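# Optional sanity check (sketch): reload the tokenizer saved above and tokenize a sample
# sentence (the Esperanto text is only an illustration of the oscar.eo training data).
# reloaded = RobertaTokenizerFast.from_pretrained(str(tokenizer_path))
# print(reloaded.tokenize("Mi amas la esperantan lingvon."))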
import os

from transformers import RobertaTokenizerFast

import config

tokenizer = RobertaTokenizerFast(
    os.path.join(config.TOKENIZER_PATH, 'vocab.json'),
    os.path.join(config.TOKENIZER_PATH, 'merges.txt'),
    bos_token=config.BOS_TOKEN,
    eos_token=config.EOS_TOKEN,
    sep_token=config.SEP_TOKEN,
    cls_token=config.CLS_TOKEN,
    unk_token=config.UNK_TOKEN,
    pad_token=config.PAD_TOKEN,
    mask_token=config.MASK_TOKEN
)
# tokenizer.enable_truncation(max_length=512)

print(
    tokenizer.convert_ids_to_tokens(tokenizer.encode('Hello world World WorLd'))
)
print(
    tokenizer('Hello world World WorLd')
)
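# Note: `enable_truncation` is a method of the underlying `tokenizers` backend
# (tokenizer.backend_tokenizer), not of RobertaTokenizerFast itself; with the transformers
# wrapper, truncation is usually requested per call instead, e.g.:
# print(tokenizer('Hello world World WorLd', truncation=True, max_length=512))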
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank)
                    else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = RobertaTokenizerFast(
            os.path.join(model_args.tokenizer_name, 'vocab.json'),
            os.path.join(model_args.tokenizer_name, 'merges.txt'),
            bos_token=cg.BOS_TOKEN,
            eos_token=cg.EOS_TOKEN,
            sep_token=cg.SEP_TOKEN,
            cls_token=cg.CLS_TOKEN,
            unk_token=cg.UNK_TOKEN,
            pad_token=cg.PAD_TOKEN,
            mask_token=cg.MASK_TOKEN,
            max_len=cg.MAX_LEN)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warn(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warn(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples["text"],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding if the model supported it instead of this drop. You can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_val_samples))

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(
                model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_val_samples = (data_args.max_val_samples
                           if data_args.max_val_samples is not None else
                           len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
        perplexity = math.exp(metrics["eval_loss"])
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
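# Standard entry point, plus an illustrative launch command (the script name, paths and
# flag values below are placeholders, not taken from this repo):
#
#   python run_mlm.py \
#       --model_type roberta \
#       --tokenizer_name ./tokenizer \
#       --train_file ./data/train.txt \
#       --validation_file ./data/valid.txt \
#       --do_train --do_eval \
#       --output_dir ./output \
#       --overwrite_output_dir
if __name__ == "__main__":
    main()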