def main(args_file=None): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if (len(sys.argv) == 2 and sys.argv[1].endswith(".json")) or args_file is not None: # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. args_file_path = os.path.abspath( sys.argv[1]) if args_file is None else args_file model_args, data_args, training_args = parser.parse_json_file( json_file=args_file_path) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) assert model_args.model_type in list( MODEL_TYPE_TO_TOKENIZER.keys()), "model type should be 't5' or 'bart'" if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Set project name os.environ["WANDB_PROJECT"] = "question-generation" # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. tokenizer_cls = MODEL_TYPE_TO_TOKENIZER[model_args.model_type] tokenizer = tokenizer_cls.from_pretrained( model_args.tokenizer_name_or_path if model_args.tokenizer_name_or_path else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model.resize_token_embeddings(len(tokenizer)) if model_args.freeze_embeds: logger.info("freezing embeddings of the model") freeze_embeds(model) assert_not_all_frozen(model) # Get datasets logger.info('loading dataset') train_dataset = torch.load( data_args.train_file_path) if training_args.do_train else None valid_dataset = torch.load( data_args.valid_file_path) if training_args.do_eval else None logger.info('finished loading dataset') # Initialize data_collator data_collator = T2TDataCollator(tokenizer=tokenizer, model_type=model_args.model_type, mode="training", using_tpu=training_args.tpu_num_cores is not None) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=valid_dataset, data_collator=data_collator, #prediction_loss_only=True, label_smoothing=model_args.label_smoothing) # disable wandb console logs logging.getLogger('wandb.run_manager').setLevel(logging.WARNING) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) #if trainer.is_world_master(): # tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval and training_args.local_rank in [-1, 0]: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(eval_output.keys()): logger.info(" %s = %s", key, str(eval_output[key])) writer.write("%s = %s\n" % (key, str(eval_output[key]))) results.update(eval_output) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) check_output_dir(training_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = BartConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr( config, p ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = BartTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = BartForConditionalGeneration.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): assert (data_args.tgt_lang is not None and data_args.src_lang is not None), "mBart requires --tgt_lang and --src_lang" model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.tgt_lang] if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None) trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), compute_metrics=compute_metrics_fn, tokenizer=tokenizer, ) all_metrics = {} # Training if training_args.do_train: logger.info("*** Train ***") train_result = trainer.train( model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) metrics = train_result.metrics metrics["train_n_objs"] = data_args.n_train trainer.save_model() # this also saves the tokenizer if trainer.is_world_process_zero(): handle_metrics("train", metrics, training_args.output_dir) all_metrics.update(metrics) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) tokenizer.save_pretrained(training_args.output_dir) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams) metrics["val_n_objs"] = data_args.n_val metrics["val_loss"] = round(metrics["val_loss"], 4) if trainer.is_world_process_zero(): handle_metrics("val", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.do_predict: logger.info("*** Predict ***") test_output = trainer.predict( test_dataset=test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) metrics = test_output.metrics metrics["test_n_objs"] = data_args.n_test if trainer.is_world_process_zero(): metrics["test_loss"] = round(metrics["test_loss"], 4) handle_metrics("test", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = lmap(str.strip, test_preds) write_txt_file( test_preds, os.path.join(training_args.output_dir, "test_generations.txt")) if trainer.is_world_process_zero(): save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json")) return all_metrics
def __init__(self, hparams, **kwargs): if hparams.sortish_sampler and hparams.gpus > 1: hparams.replace_sampler_ddp = False elif hparams.max_tokens_per_batch is not None: if hparams.gpus > 1: raise NotImplementedError( "Dynamic Batch size does not work for multi-gpu training") if hparams.sortish_sampler: raise ValueError( "--sortish_sampler and --max_tokens_per_batch may not be used simultaneously" ) super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs) use_task_specific_params(self.model, "summarization") save_git_info(self.hparams.output_dir) self.metrics_save_path = Path(self.output_dir) / "metrics.json" self.hparams_save_path = Path(self.output_dir) / "hparams.pkl" pickle_save(self.hparams, self.hparams_save_path) self.step_count = 0 self.metrics = defaultdict(list) self.model_type = self.config.model_type self.vocab_size = self.config.tgt_vocab_size if self.model_type == "fsmt" else self.config.vocab_size self.dataset_kwargs: dict = dict( data_dir=self.hparams.data_dir, max_source_length=self.hparams.max_source_length, prefix=self.model.config.prefix or "", ) n_observations_per_split = { "train": self.hparams.n_train, "val": self.hparams.n_val, "test": self.hparams.n_test, } self.n_obs = { k: v if v >= 0 else None for k, v in n_observations_per_split.items() } self.target_lens = { "train": self.hparams.max_target_length, "val": self.hparams.val_max_target_length, "test": self.hparams.test_max_target_length, } assert self.target_lens["train"] <= self.target_lens[ "val"], f"target_lens: {self.target_lens}" assert self.target_lens["train"] <= self.target_lens[ "test"], f"target_lens: {self.target_lens}" if self.hparams.freeze_embeds: freeze_embeds(self.model) if self.hparams.freeze_encoder: freeze_params(self.model.get_encoder()) assert_all_frozen(self.model.get_encoder()) self.hparams.git_sha = get_git_info()["repo_sha"] self.num_workers = hparams.num_workers self.decoder_start_token_id = None # default to config if self.model.config.decoder_start_token_id is None and isinstance( self.tokenizer, MBartTokenizer): self.decoder_start_token_id = self.tokenizer.lang_code_to_id[ hparams.tgt_lang] self.model.config.decoder_start_token_id = self.decoder_start_token_id self.dataset_class = (Seq2SeqDataset if hasattr( self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset) self.already_saved_batch = False self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams if self.hparams.eval_max_gen_length is not None: self.eval_max_length = self.hparams.eval_max_gen_length else: self.eval_max_length = self.model.config.max_length self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr( config, p ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): assert (data_args.tgt_lang is not None and data_args.src_lang is not None), "mBart requires --tgt_lang and --src_lang" model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.tgt_lang] if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset if hasattr( tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None) trainer = Seq2SeqTrainer( model=model, config=config, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), compute_metrics=compute_metrics_fn, data_args=data_args, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_process_zero(): trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() if trainer.is_world_process_zero(): logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) save_json( result, os.path.join(training_args.output_dir, "eval_results.json")) eval_results.update(result) if training_args.do_predict: logging.info("*** Test ***") test_output = trainer.predict(test_dataset=test_dataset) test_metrics = { k.replace("eval", "test"): v for k, v in test_output.metrics.items() } if trainer.is_world_process_zero(): logger.info("***** Test results *****") for key, value in test_metrics.items(): logger.info(" %s = %s", key, value) save_json( test_metrics, os.path.join(training_args.output_dir, "test_results.json")) eval_results.update(test_metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = lmap(str.strip, test_preds) write_txt_file( test_preds, os.path.join(training_args.output_dir, "test_generations.txt")) if trainer.is_world_process_zero(): save_json(eval_results, "all_results.json") return eval_results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) check_output_dir(training_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr( config, p ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): assert (data_args.tgt_lang is not None and data_args.src_lang is not None), "mBart requires --tgt_lang and --src_lang" model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.tgt_lang] if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None) trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), compute_metrics=compute_metrics_fn, tokenizer=tokenizer, ) all_metrics = {} # Training if training_args.do_train: logger.info("*** Train ***") train_result = trainer.train( model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) metrics = train_result.metrics metrics["train_n_objs"] = data_args.n_train trainer.save_model() # this also saves the tokenizer if trainer.is_world_process_zero(): handle_metrics("train", metrics, training_args.output_dir) all_metrics.update(metrics) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) tokenizer.save_pretrained(training_args.output_dir) if training_args.tune: def eval_func_for_lpot(model): trainer.model = model results = trainer.evaluate( eval_dataset=eval_dataset, metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams) assert data_args.task.startswith("summarization") or data_args.task.startswith("translation") , \ "data_args.task should startswith summarization or translation" task_metrics_keys = [ 'val_bleu', 'val_rouge1', 'val_rouge2', 'val_rougeL', 'val_rougeLsum' ] for key in task_metrics_keys: if key in results.keys(): logger.info("Finally Eval {}:{}".format(key, results[key])) if 'bleu' in key: acc = results[key] break if 'rouge' in key: acc = sum( [v for k, v in results.items() if "rouge" in k]) / 4 break return acc from lpot.experimental import Quantization, common quantizer = Quantization("./conf.yaml") quantizer.model = common.Model(model) quantizer.calib_dataloader = common.DataLoader( eval_dataset, batch_size=training_args.eval_batch_size, collate_fn=Seq2SeqDataCollator_lpot(tokenizer, data_args, training_args.tpu_num_cores)) quantizer.eval_func = eval_func_for_lpot q_model = quantizer() q_model.save(training_args.tuned_checkpoint) exit(0) if training_args.benchmark: if training_args.int8: from lpot.utils.pytorch import load new_model = load( os.path.abspath( os.path.expanduser(training_args.tuned_checkpoint)), model) else: new_model = model trainer.model = new_model results = trainer.evaluate( eval_dataset=eval_dataset, metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, iters=training_args.iters, warmup_iter=training_args.warmup_iter, ) if data_args.task.startswith("summarization"): print('Accuracy: %.4f' % (sum([v for k, v in results.items() if "rouge" in k]) / 4)) if data_args.task.startswith("translation"): print('Accuracy: %.4f' % (results['val_bleu'])) print('Throughput: %.3f samples/sec' % (results["val_samples_per_second"])) print('Latency: %.3f ms' % (1 * 1000 / results["val_samples_per_second"])) print('Batch size = %d' % training_args.per_device_eval_batch_size) exit(0) if training_args.accuracy_only: if training_args.int8: from lpot.utils.pytorch import load new_model = load( os.path.abspath( os.path.expanduser(training_args.tuned_checkpoint)), model) else: new_model = model trainer.model = new_model results = trainer.evaluate( eval_dataset=eval_dataset, metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) if data_args.task.startswith("summarization"): print('Accuracy: %.4f' % (sum([v for k, v in results.items() if "rouge" in k]) / 4)) if data_args.task.startswith("translation"): print('Accuracy: %.4f' % (results['val_bleu'])) print('Latency: %.3f ms' % (1 * 1000 / results["val_samples_per_second"])) print('Batch size = %d' % training_args.per_device_eval_batch_size) exit(0) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) metrics["val_n_objs"] = data_args.n_val metrics["val_loss"] = round(metrics["val_loss"], 4) if trainer.is_world_process_zero(): handle_metrics("val", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.do_predict: logger.info("*** Predict ***") test_output = trainer.predict( test_dataset=test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) metrics = test_output.metrics metrics["test_n_objs"] = data_args.n_test if trainer.is_world_process_zero(): metrics["test_loss"] = round(metrics["test_loss"], 4) handle_metrics("test", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = lmap(str.strip, test_preds) write_txt_file( test_preds, os.path.join(training_args.output_dir, "test_generations.txt")) if trainer.is_world_process_zero(): save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json")) return all_metrics
def run(args, logger): tokenizer = BartTokenizer.from_pretrained(args.model) train_tasks = get_tasks_list(args.custom_tasks_splits, "train") logger.info("Training on the following tasks: {}".format(train_tasks)) train_data = NLPFewshotGymMetaLearningData(logger, args, args.train_dir, tasks=train_tasks, data_type="train", is_training=True) # dev_data = NLPFewshotGymMetaLearningData(logger, args, args.train_dir, tasks=DEFAULT_SPLIT["dev"], data_type="dev", is_training=False) dev_data = None train_data.load_dataset(tokenizer) train_data.load_dataloader() # dev_data.load_dataset(tokenizer) # dev_data.load_dataloader() if args.do_train: if args.checkpoint is not None: def convert_to_single_gpu(state_dict): def _convert(key): if key.startswith('module.'): return key[7:] return key return { _convert(key): value for key, value in state_dict.items() } model = MyBart.from_pretrained(args.model, state_dict=convert_to_single_gpu( torch.load(args.checkpoint))) else: model = MyBart.from_pretrained(args.model) if args.freeze_embeds: logger.info("Freezing embeddings") freeze_embeds(model) if args.n_gpu > 1: model = torch.nn.DataParallel(model) if torch.cuda.is_available(): model.to(torch.device("cuda")) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=args.total_steps) train(args, logger, model, train_data, dev_data, optimizer, scheduler)
def run(args, logger): tokenizer = BartTokenizer.from_pretrained(args.model) train_data = MyDatasetCollection(logger, args, args.train_file, True) dev_data = MyDatasetCollection(logger, args, args.predict_file, False) train_data.load_dataset(tokenizer) train_data.load_dataloader() dev_data.load_dataset(tokenizer) dev_data.load_dataloader() if args.do_train: config = BartWithAdapterConfig.from_pretrained(args.model) config.adapter_dim = args.adapter_dim config.adapt_layer_norm = args.adapt_layer_norm config.unfreeze_hyper_encoder = args.unfreeze_hyper_encoder bart = MyBartWithAdapter(config) if args.checkpoint is not None: def convert_to_single_gpu(state_dict): def _convert(key): if key.startswith('module.'): return key[7:] return key return { _convert(key): value for key, value in state_dict.items() } bart_old = MyBart.from_pretrained(args.model, state_dict=convert_to_single_gpu( torch.load(args.checkpoint))) bart.model.load_state_dict(bart_old.model.state_dict(), strict=False) logger.info("Loading checkpoint from {}".format(args.checkpoint)) else: bart_old = MyBart.from_pretrained(args.model) bart.model.load_state_dict(bart_old.model.state_dict(), strict=False) model = bart if args.freeze_embeds: logger.info("Freezing embeddings") freeze_embeds(model) if args.n_gpu > 1: model = torch.nn.DataParallel(model) if torch.cuda.is_available(): model.to(torch.device("cuda")) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) logger.info("#Params: {}".format(num_parameters)) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=args.total_steps) train(args, logger, model, train_data, dev_data, optimizer, scheduler) if args.do_predict: checkpoint = os.path.join(args.output_dir, args.predict_checkpoint) def convert_to_single_gpu(state_dict): def _convert(key): if key.startswith('module.'): return key[7:] return key return {_convert(key): value for key, value in state_dict.items()} config = BartWithAdapterConfig.from_pretrained(args.model) config.adapter_dim = args.adapter_dim # model = MyBartWithAdapter.from_pretrained(args.model, # state_dict=convert_to_single_gpu(torch.load(checkpoint))) model = MyBartWithAdapter(config) model.load_state_dict(convert_to_single_gpu(torch.load(checkpoint)), strict=True) # generator = ParameterGenerator(config) # model = GrowingBart(bart, generator, config) # model.load_state_dict(convert_to_single_gpu(torch.load(checkpoint)), strict=False) logger.info("Loading checkpoint from {}".format(checkpoint)) if torch.cuda.is_available(): model.to(torch.device("cuda")) model.eval() ems = inference(model, dev_data, save_predictions=True, verbose=True) logger.info("%s on %s data: %.2f" % (dev_data.metric, dev_data.data_type, np.mean(ems) * 100))
def run(args, logger): tokenizer = BartTokenizer.from_pretrained(args.model) train_data = NLPFewshotGymSingleTaskData(logger, args, args.train_file, data_type="train", is_training=True) dev_data = NLPFewshotGymSingleTaskData(logger, args, args.dev_file, data_type="dev", is_training=False) train_data.load_dataset(tokenizer) train_data.load_dataloader() dev_data.load_dataset(tokenizer) dev_data.load_dataloader() best_dev_performance = None test_performance = None best_model_state_dict = None if args.do_train: if args.checkpoint is not None and args.checkpoint != "None": def convert_to_single_gpu(state_dict): def _convert(key): if key.startswith('module.'): return key[7:] return key return {_convert(key):value for key, value in state_dict.items()} model = MyBart.from_pretrained(args.model, state_dict=convert_to_single_gpu(torch.load(args.checkpoint))) else: model = MyBart.from_pretrained(args.model) if args.freeze_embeds: logger.info("Freezing embeddings") freeze_embeds(model) if args.n_gpu>1: model = torch.nn.DataParallel(model) if torch.cuda.is_available(): model.to(torch.device("cuda")) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=args.total_steps) best_dev_performance, best_model_state_dict = train(args, logger, model, train_data, dev_data, optimizer, scheduler) if args.do_predict: if args.do_train and best_model_state_dict is not None: model = MyBart.from_pretrained(args.model, state_dict=best_model_state_dict) logger.info("Loading checkpoint from CPU") else: checkpoint = os.path.join(args.output_dir, args.predict_checkpoint) def convert_to_single_gpu(state_dict): def _convert(key): if key.startswith('module.'): return key[7:] return key return {_convert(key):value for key, value in state_dict.items()} model = MyBart.from_pretrained(args.model, state_dict=convert_to_single_gpu(torch.load(checkpoint))) logger.info("Loading checkpoint from {}".format(checkpoint)) if torch.cuda.is_available(): model.to(torch.device("cuda")) model.eval() data_type = "test" if "test" in args.test_file else "dev" test_data = NLPFewshotGymSingleTaskData(logger, args, args.test_file, data_type=data_type, is_training=False) test_data.load_dataset(tokenizer) test_data.load_dataloader() test_performance = inference(model, test_data, save_predictions=True, verbose=True) logger.info("%s on %s data: %.2f" % (test_data.metric, test_data.data_type, test_performance)) return best_dev_performance, test_performance