        data_collator=collator,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(output_dir)

    # Evaluate each checkpoint
    dev_results = {}
    test_results = {}
    checkpoints_list = trainer._sorted_checkpoints()
    logger.info("Saved checkpoints:")
    logger.info(str(checkpoints_list))
    cnt = 0
    max_dev_f1 = -0.1
    max_f1_checkpoint_name = None
    for checkpoint_name in checkpoints_list:
        path = checkpoint_name  # os.path.join(training_args.output_dir, checkpoint_name)
        model_new = SeqClassModel.from_pretrained(
            path,
            params_dict=config_dict,
        )
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    hf_logging.set_verbosity_info()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        logger.info(
            f"Output dir ({training_args.output_dir}) is not empty, will try to reload from there."
        )
        model_args.model_name_or_path = training_args.output_dir
        # raise ValueError(
        #     f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        # )

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can
    # concurrently download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. "
            "This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logger.info(model)
    num_params = sum(p.numel() for p in model.parameters())
    logger.info('Model has %d parameters' % num_params)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info('Model has %d trainable parameters' % num_params)

    # Add special tokens
    tokenizer.pad_token = tokenizer.eos_token
    special_tokens_dict = {
        'additional_special_tokens':
        ['<STORY>', '<QUERY>', '<PROOF>', '<ANSWER>']
    }
    # NOTE: should also have added "ent_1", "ent_2", ..., "ent_20" :/
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    logger.info(f'We have added {num_added_toks} tokens')
    '''
    if tokenizer.pad_token_id is None and data_args.line_by_line:
        # See PR 3388. Some tokenizers don't have pad tokens, which causes errors
        # at the encoding step in the collate_fn. We give here the option to force
        # the addition of a pad token. The attention mask is used to ignore this
        # token when feeding to the model.
        # tokenizer.pad_token = tokenizer.eos_token
        num_added_toks = tokenizer.add_special_tokens({"pad_token": "<pad>"})
    '''
    model.resize_token_embeddings(len(tokenizer))
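    # Illustrative sanity check (not executed; assumes a GPT-2-style tokenizer):
    # after add_special_tokens/resize_token_embeddings above, each marker maps
    # to a single new id and the embedding matrix matches the enlarged vocab:
    #
    #   ids = tokenizer.convert_tokens_to_ids(['<STORY>', '<QUERY>'])
    #   assert tokenizer.unk_token_id not in ids
    #   assert model.get_input_embeddings().num_embeddings == len(tokenizer)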
    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.model_max_length
    else:
        data_args.block_size = min(data_args.block_size,
                                   tokenizer.model_max_length)

    # Get datasets
    train_dataset = (get_dataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                evaluate=True,
                                cache_dir=model_args.cache_dir)
                    if (training_args.do_eval
                        or training_args.evaluate_during_training) else None)

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability)
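    # Illustrative (not executed): with mlm=False the collator builds next-token
    # prediction targets; labels are a copy of input_ids with pad positions
    # replaced by -100 so the loss ignores them:
    #
    #   batch = data_collator(
    #       [torch.tensor(tokenizer("ent_1 is the father of ent_2")["input_ids"])])
    #   # batch["labels"] equals batch["input_ids"] except at pad positions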
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # start by saving tokenizer so that we can restart training!
    # if trainer.is_world_master():
    #     tokenizer.save_pretrained(training_args.output_dir)

    results = {}

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        logger.info(f"model_path: {model_path}")
        if model_path is not None:
            # Grab the most recent checkpoint
            checkpoints_sorted = trainer._sorted_checkpoints(use_mtime=True)
            assert len(checkpoints_sorted) > 0
            checkpoint_most_recent = checkpoints_sorted[-1]
            logger.info(
                f"most recent checkpoint: {checkpoint_most_recent}. Setting model_path to this."
            )
            # TODO: find a way to set:
            # - patience_best_eval_loss = None
            # - patience_evals_without_improvement = 0
            # - patience_should_stop = False
            model_path = checkpoint_most_recent
        train_results = trainer.train(model_path=model_path)
        results["train_step"] = train_results.global_step
        results["train_loss"] = train_results.training_loss
        # perplexity = exp(average cross-entropy loss)
        results["train_ppl"] = math.exp(train_results.training_loss)
        # trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        # if trainer.is_world_master():
        #     tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        results["valid_loss"] = eval_output["eval_loss"]
        results["valid_ppl"] = math.exp(eval_output["eval_loss"])
        output_eval_file = os.path.join(training_args.output_dir,
                                        "results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** results *****")
                for key in sorted(results.keys()):
                    logger.info("  %s = %s", key, str(results[key]))
                    writer.write("%s = %s\n" % (key, str(results[key])))

    return results
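
# Assumed entry point (the upstream transformers example scripts end the same
# way). The invocation below is illustrative only: the script name and data
# files are placeholders, while the flags come from the dataclasses parsed in
# main():
#
#   python run_lm_finetune.py --model_name_or_path gpt2 \
#       --train_data_file train.txt --eval_data_file valid.txt \
#       --block_size 512 --do_train --do_eval --output_dir out/
if __name__ == "__main__":
    main()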