def evaluate(self, eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
    """
    Run evaluation and return metrics.

    Adds extra VAE tests:
    - Interpolation between samples in latent space.
    - Decoding random latent codes drawn from a normal distribution.
    - t-SNE plots with class-label colouring (if a class column is provided).
    """
    if is_wandb_available():
        # Keep the trainer's step counter in sync with wandb's history step so
        # that the wandb.log() calls below are not rejected as out-of-order.
        if self.state.global_step < wandb.run.history._step:
            self.state.global_step = wandb.run.history._step

        start_eval = time.time()
        with torch.no_grad():
            self.model.eval()
            self._evaluate_latent_samples(eval_dataset=eval_dataset)
        generate_time = time.time() - start_eval

    output_metrics = super().evaluate(eval_dataset=eval_dataset)

    if is_wandb_available():
        # Time spent computing the test loss, excluding the generation step above.
        wandb.log(
            {"eval_get_test_loss_time": time.time() - start_eval - generate_time},
            step=self.state.global_step)  # type: ignore
        wandb.log({"eval_generate_time": generate_time},
                  step=self.state.global_step)  # type: ignore

    return output_metrics
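# A minimal sketch of the latent-space interpolation test named in the docstring
# above. `encode_to_latent` and `decode_from_latent` are assumed hooks standing in
# for the model's real encode/decode path; this is not the actual
# `_evaluate_latent_samples` implementation.
import torch


def interpolate_latents(encode_to_latent, decode_from_latent, input_a, input_b, steps=5):
    """Decode evenly spaced points on the line between two latent codes."""
    with torch.no_grad():
        z_a = encode_to_latent(input_a)
        z_b = encode_to_latent(input_b)
        outputs = []
        for alpha in torch.linspace(0.0, 1.0, steps):
            # Linear interpolation in latent space.
            z = (1 - alpha) * z_a + alpha * z_b
            outputs.append(decode_from_latent(z))
    return outputs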
def log(self, logs, mode="train"):
    self._setup_loggers()
    if self.global_step is None:
        # when logging evaluation metrics without training
        self.global_step = 0
    if self.tb_writer:
        for k, v in logs.items():
            if isinstance(v, (int, float)):
                self.tb_writer.add_scalar(k, v, self.global_step)
            else:
                logger.warning(
                    "Trainer is attempting to log a value of "
                    '"%s" of type %s for key "%s" as a scalar. '
                    "This invocation of Tensorboard's writer.add_scalar() "
                    "is incorrect so we dropped this attribute.",
                    v,
                    type(v),
                    k,
                )
        self.tb_writer.flush()
    if is_wandb_available():
        if self.is_world_process_zero():
            wandb.log(logs, step=self.global_step)
    if is_comet_available():
        if self.is_world_process_zero():
            experiment = comet_ml.config.get_global_experiment()
            if experiment is not None:
                experiment._log_metrics(
                    logs, step=self.global_step, epoch=self.epoch, framework="transformers"
                )
    output = {**logs, **{"step": self.global_step}}
    if self.is_world_process_zero():
        self.log_history.append(output)
def log(self, logs: Dict[str, float]) -> None:
    """
    Log :obj:`logs` on the various objects watching training.

    Subclass and override this method to inject custom behavior.

    Args:
        logs (:obj:`Dict[str, float]`):
            The values to log.
    """
    logs["epoch"] = self.epoch_logging
    # if self.tb_writer:
    #     with self.tb_writer.as_default():
    #         for k, v in logs.items():
    #             tf.summary.scalar(k, v, step=self.global_step)
    #     self.tb_writer.flush()
    if is_wandb_available():
        wandb.log(logs, step=self.global_step)
    if is_comet_available():
        experiment = comet_ml.config.get_global_experiment()
        if experiment is not None:
            experiment._log_metrics(
                logs, step=self.global_step, epoch=self.epoch_logging, framework="transformers"
            )
    output = {**logs, **{"step": self.global_step}}
    logger.info(output)
def load_model_and_tokenizer(model_args):
    # Distributed training:
    # The `.from_pretrained` methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.set_seq_size and model_args.set_seq_size <= 4:
        logger.warning(
            '`set_seq_size` is too small to work with the Funnel transformer; using set_seq_size=5 instead.'
        )
        model_args.set_seq_size = 5
    if model_args.config_path:
        config = Funnel_T5_VAE_Config.from_pretrained(
            model_args.config_path, cache_dir=model_args.cache_dir)
    elif model_args.model_path:
        config = Funnel_T5_VAE_Config.from_pretrained(
            model_args.model_path, cache_dir=model_args.cache_dir)
    else:
        config = Funnel_T5_VAE_Config(use_extra_logs=is_wandb_available(),
                                      **model_args.__dict__)
        logger.warning(
            "You are instantiating a new config instance from scratch (still using T5 checkpoint)."
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer)
    if model_args.model_path:
        model = Funnel_T5_VAE_Model.from_pretrained(
            model_args.model_path,
            from_tf=bool(".ckpt" in model_args.model_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
        model.resize_token_embeddings(len(tokenizer))
    else:
        vocab_size = len(tokenizer)
        config.funnel.vocab_size = vocab_size
        config.t5.vocab_size = vocab_size
        config.vocab_size = vocab_size
        logger.info("Training new model from scratch")
        model = Funnel_T5_VAE_Model(config)
    if model_args.set_seq_size:
        tokenizer.model_max_length = model_args.set_seq_size
    tokenizer.mask_token = tokenizer.unk_token
    return model, tokenizer
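# A hypothetical call site, assuming `ModelArguments` carries the fields used
# above (tokenizer_name, set_seq_size, cache_dir, etc.); arguments are parsed
# from the command line the same way the training scripts below parse theirs.
from transformers import HfArgumentParser

parser = HfArgumentParser((ModelArguments,))
(model_args,) = parser.parse_args_into_dataclasses()
model, tokenizer = load_model_and_tokenizer(model_args)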
def init_ray_wandb_logger_callback(training_args):
    """
    Initialize the Ray wandb integration, used specifically for hyperparameter tuning.

    Returns either None or a list containing the initialized callback, so the
    output can be passed directly to hp_search_kwargs.
    """
    has_wandb = is_wandb_available()
    if not has_wandb:
        return None
    project = os.getenv("WANDB_PROJECT", "huggingface")
    group = training_args.run_name
    callbacks = [WandbLoggerCallback(
        project=project,
        group=group,
    )]
    return callbacks
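# A minimal sketch of how the returned callback list might be forwarded to Ray
# Tune via `Trainer.hyperparameter_search`; `trainer`, `hp_space`, and
# `training_args` are assumed to be defined elsewhere, and extra kwargs are
# passed through to `ray.tune.run`.
hp_search_kwargs = {}
callbacks = init_ray_wandb_logger_callback(training_args)
if callbacks is not None:
    hp_search_kwargs["callbacks"] = callbacks

best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="ray",
    n_trials=10,
    **hp_search_kwargs,
)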
def early_init(cls, trainer_args, local_rank):
    has_wandb = is_wandb_available()
    assert has_wandb, \
        "WandbCallback requires wandb to be installed. Run `pip install wandb`."
    logger.info("Initializing wandb on rank %s", local_rank)
    if local_rank not in [-1, 0]:
        return
    # Deduce run name and group.
    init_args = {}
    if hasattr(trainer_args, "trial_name") and trainer_args.trial_name is not None:
        run_name = trainer_args.trial_name
        init_args["group"] = trainer_args.run_name
    else:
        run_name = trainer_args.run_name
    wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"),
               name=run_name,
               reinit=True,
               **init_args)
    return wandb.run.id
def log(self, logs: Dict[str, float]) -> None:
    """
    Log :obj:`logs` on the various objects watching training.

    Subclass and override this method to inject custom behavior.

    Args:
        logs (:obj:`Dict[str, float]`):
            The values to log.
    """
    if hasattr(self, "_log"):
        warnings.warn(
            "The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
            FutureWarning,
        )
        return self._log(logs)
    logs["epoch"] = self.epoch_logging
    # if self.tb_writer:
    #     with self.tb_writer.as_default():
    #         for k, v in logs.items():
    #             tf.summary.scalar(k, v, step=self.global_step)
    #     self.tb_writer.flush()
    if is_wandb_available():
        wandb.log(logs, step=self.global_step)
    if is_comet_available():
        experiment = comet_ml.config.get_global_experiment()
        if experiment is not None:
            experiment._log_metrics(
                logs, step=self.global_step, epoch=self.epoch_logging, framework="transformers"
            )
    output = {**logs, **{"step": self.global_step}}
    logger.info(output)
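# A small sketch of the subclass-and-override pattern the docstring invites.
# `MyTrainer` is a hypothetical subclass (and `Trainer` an assumed base class)
# that prefixes every metric key before delegating to the parent `log` above.
class MyTrainer(Trainer):
    def log(self, logs: Dict[str, float]) -> None:
        prefixed = {f"custom/{k}": v for k, v in logs.items()}
        super().log(prefixed)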
    _use_apex = True
else:
    _use_native_amp = True

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met
    import torch_xla.distributed.parallel_loader as pl

if is_tensorboard_available():
    try:
        from torch.utils.tensorboard import SummaryWriter
    except ImportError:
        from tensorboardX import SummaryWriter

if is_wandb_available():
    import wandb

if is_comet_available():
    import comet_ml

if is_optuna_available():
    import optuna

if is_ray_available():
    from ray import tune

from length_adaptive_transformer.drop_and_restore_utils import (
    LengthDropArguments,
    sample_length_configuration,
def main():
    cmd_parser = argparse.ArgumentParser()
    cmd_parser.add_argument("experiments", nargs="+",
                            choices=list(CONFIGS.keys()),
                            help="Available experiments")
    cmd_parser.add_argument("--local_rank", default=None,
                            help="added by torch.distributed.launch")
    cmd_args = cmd_parser.parse_args()

    for experiment in cmd_args.experiments:
        config_dict = CONFIGS[experiment]
        local_rank = int(cmd_args.local_rank or -1)
        config_dict["local_rank"] = local_rank

        # See all possible arguments in transformers/training_args.py and ./run_args.py
        exp_parser = HfArgumentParser(
            (ModelArguments, DataTrainingArguments, CustomTrainingArguments))
        model_args, data_args, training_args = exp_parser.parse_dict(config_dict)

        # Override the default TrainingArguments behavior of setting the run
        # name equal to output_dir when none is given.
        if training_args.run_name == training_args.output_dir:
            training_args.run_name = experiment

        # Run name (or experiment name) is appended to the output_dir.
        training_args.output_dir = os.path.join(training_args.output_dir,
                                                training_args.run_name)

        # Initialize wandb now to include the logs that follow.
        # For now, only support early wandb logging when running one experiment.
        distributed_initialized = torch.distributed.is_initialized()
        rank = -1 if not distributed_initialized else torch.distributed.get_rank()
        if is_wandb_available() and len(cmd_args.experiments) == 1:
            CustomWandbCallback.early_init(training_args, rank)

        # Detect the last checkpoint.
        last_checkpoint = None
        if (os.path.isdir(training_args.output_dir) and training_args.do_train
                and not training_args.overwrite_output_dir):
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            logging.warning(f"Loading from checkpoint: {last_checkpoint}")
            if (last_checkpoint is None
                    and len(os.listdir(training_args.output_dir)) > 0):
                raise ValueError(
                    f"Output directory ({training_args.output_dir}) already exists and "
                    "is not empty. Use --overwrite_output_dir to overcome.")
            elif last_checkpoint is not None:
                logging.info(
                    f"Checkpoint detected, resuming training at {last_checkpoint}. To "
                    "avoid this behavior, change the `--output_dir` or add "
                    "`--overwrite_output_dir` to train from scratch.")

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            handlers=[logging.StreamHandler(sys.stdout)],
            level=(logging.INFO if is_main_process(training_args.local_rank)
                   else logging.WARN))

        # Log config.
        logging.info(f"Running with config:\n{pformat(config_dict, indent=4)}")

        # Log on each process the small summary:
        logging.warning(
            f"Process rank: {training_args.local_rank}, "
            f"device: {training_args.device}, n_gpu: {training_args.n_gpu} "
            f"distributed training: {bool(training_args.local_rank != -1)}, "
            f"16-bits training: {training_args.fp16}")

        # Set the verbosity of the Transformers logging to info (on main process only):
        if is_main_process(training_args.local_rank):
            transformers.utils.logging.set_verbosity_info()
            transformers.utils.logging.enable_default_handler()
            transformers.utils.logging.enable_explicit_format()
        logging.info("Training/evaluation parameters %s", training_args)
        logging.info("Model parameters: %s", model_args)
        logging.info("Data parameters: %s", data_args)

        # Set seed before initializing model.
        set_seed(training_args.seed)
        logging.info(f"Seed to reproduce: {training_args.seed}")

        if model_args.finetuning:
            run_finetuning_multiple_tasks(model_args, data_args, training_args,
                                          last_checkpoint=last_checkpoint)
        else:
            run_pretraining(model_args, data_args, training_args,
                            last_checkpoint=last_checkpoint)

        # Destroy the process group before launching another experiment.
        if cmd_args.local_rank:
            torch.distributed.destroy_process_group()
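# A hypothetical entry point; under torch.distributed.launch each worker gets
# --local_rank injected, e.g. (assuming this file is saved as run_experiments.py):
#
#   python -m torch.distributed.launch --nproc_per_node=2 run_experiments.py my_experiment
#
if __name__ == "__main__":
    main()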
def load_model_and_tokenizer(model_args):
    # Distributed training:
    # The `.from_pretrained` methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_path:
        config = CONFIG[model_args.transformer_type].from_pretrained(
            model_args.config_path, cache_dir=model_args.cache_dir)
    elif model_args.model_path:
        config = CONFIG[model_args.transformer_type].from_pretrained(
            model_args.model_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG[model_args.transformer_type](
            latent_size=model_args.latent_size,
            transformer_name=model_args.transformer_name,
            transformer_decoder_name=model_args.transformer_decoder_name,
            encoder_model=model_args.encoder_model,
            decoder_model=model_args.decoder_model,
            set_seq_size=model_args.set_seq_size,
            encoded_seq_size=model_args.encoded_seq_size,
            n_previous_latent_codes=model_args.n_previous_latent_codes,
            mmd_batch_size=model_args.mmd_batch_size,
            use_reg_loss=(not model_args.dont_use_reg_loss),
            reg_schedule_k=model_args.reg_schedule_k,
            reg_schedule_b=model_args.reg_schedule_b,
            skip_schedule_k=model_args.skip_schedule_k,
            skip_schedule_b=model_args.skip_schedule_b,
            n_latent_tokens=model_args.n_latent_tokens,
            use_extra_logs=is_wandb_available(),
            use_skip_connection=model_args.use_skip_connection,
            use_latent_dropout=model_args.use_latent_dropout,
            max_latent_dropout_rate=model_args.max_latent_dropout_rate,
            latent_dropout_schedule_k=model_args.latent_dropout_schedule_k,
            latent_dropout_schedule_b=model_args.latent_dropout_schedule_b,
        )
        logger.warning(
            "You are instantiating a new config instance from scratch (still using T5 checkpoint)."
        )
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
        if 'gpt' in model_args.tokenizer_name:
            tokenizer.pad_token = tokenizer.eos_token
    elif model_args.model_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.transformer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.transformer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    if model_args.model_path:
        model = MODEL[model_args.transformer_type].from_pretrained(
            model_args.model_path,
            from_tf=bool(".ckpt" in model_args.model_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = MODEL[model_args.transformer_type](config)
    model.resize_token_embeddings(len(tokenizer))
    if model_args.set_seq_size:
        tokenizer.model_max_length = model_args.set_seq_size
    tokenizer.mask_token = tokenizer.unk_token
    return model, tokenizer