Example #1
    def evaluate(self,
                 eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
        """
        Run evaluation and returns metrics.

        Adds extra VAE tests:
        - Interpolation between samples in latent space.
        - Random latent codes from normal distribution.
        if class column provided?
        - tSNE plots with class-label colouring.
        """
        if self.state.global_step < wandb.run.history._step:
            self.state.global_step = wandb.run.history._step
        if is_wandb_available():
            start_eval = time.time()
            with torch.no_grad():
                self.model.eval()
                self._evaluate_latent_samples(eval_dataset=eval_dataset)
            generate_time = time.time() - start_eval
        output_metrics = super().evaluate(eval_dataset=eval_dataset)
        if is_wandb_available():
            wandb.log(
                {
                    "eval_get_test_loss_time":
                    time.time() - start_eval + generate_time
                },
                step=self.state.global_step)  # type: ignore
            wandb.log({"eval_generate_time": generate_time},
                      step=self.state.global_step)  # type: ignore
        return output_metrics
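Example #2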
    def log(self, logs, mode="train"):
        self._setup_loggers()
        if self.global_step is None:
            # when logging evaluation metrics without training
            self.global_step = 0
        if self.tb_writer:
            for k, v in logs.items():
                if isinstance(v, (int, float)):
                    self.tb_writer.add_scalar(k, v, self.global_step)
                else:
                    logger.warning(
                        "Trainer is attempting to log a value of "
                        '"%s" of type %s for key "%s" as a scalar. '
                        "This invocation of Tensorboard's writer.add_scalar() "
                        "is incorrect so we dropped this attribute.",
                        v,
                        type(v),
                        k,
                    )
            self.tb_writer.flush()
        if is_wandb_available():
            if self.is_world_process_zero():
                wandb.log(logs, step=self.global_step)
        if is_comet_available():
            if self.is_world_process_zero():
                experiment = comet_ml.config.get_global_experiment()
                if experiment is not None:
                    experiment._log_metrics(logs, step=self.global_step, epoch=self.epoch, framework="transformers")
        output = {**logs, **{"step": self.global_step}}
        if self.is_world_process_zero():
            self.log_history.append(output)
Example #3
    def log(self, logs: Dict[str, float]) -> None:
        """
        Log :obj:`logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (:obj:`Dict[str, float]`):
                The values to log.
        """
        logs["epoch"] = self.epoch_logging

        # if self.tb_writer:
        #     with self.tb_writer.as_default():
        #         for k, v in logs.items():
        #             tf.summary.scalar(k, v, step=self.global_step)
        #     self.tb_writer.flush()

        if is_wandb_available():
            wandb.log(logs, step=self.global_step)

        if is_comet_available():
            experiment = comet_ml.config.get_global_experiment()
            if experiment is not None:
                experiment._log_metrics(
                    logs, step=self.global_step, epoch=self.epoch_logging, framework="transformers"
                )

        output = {**logs, **{"step": self.global_step}}

        logger.info(output)
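The docstring invites subclassing to inject custom behavior. As a loosely hedged sketch (the class and the extra metric name are made up here, and it assumes a `Trainer` whose `log` accepts the metrics dict as shown), an override that adds a derived value before deferring to the stock logging path could look like:

from transformers import Trainer

class MyTrainer(Trainer):
    def log(self, logs):
        # Add a derived metric, then defer to the parent implementation,
        # which handles the TensorBoard / wandb / comet reporting.
        logs = dict(logs)
        if "loss" in logs:
            logs["loss_doubled"] = 2 * logs["loss"]
        super().log(logs)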
Example #4
def load_model_and_tokenizer(model_args):
    # Distributed training:
    # The `.from_pretrained` methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.set_seq_size and model_args.set_seq_size <= 4:
        logger.warning(
            '`set_seq_size` is too small to work with the Funnel transformer; using set_seq_size=5 instead.'
        )
        model_args.set_seq_size = 5

    if model_args.config_path:
        config = Funnel_T5_VAE_Config.from_pretrained(
            model_args.config_path, cache_dir=model_args.cache_dir)
    elif model_args.model_path:
        config = Funnel_T5_VAE_Config.from_pretrained(
            model_args.model_path, cache_dir=model_args.cache_dir)
    else:
        config = Funnel_T5_VAE_Config(use_extra_logs=is_wandb_available(),
                                      **model_args.__dict__)
        logger.warning(
            "You are instantiating a new config instance from scratch (still using T5 checkpoint)."
        )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer)

    if model_args.model_path:
        model = Funnel_T5_VAE_Model.from_pretrained(
            model_args.model_path,
            from_tf=bool(".ckpt" in model_args.model_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
        model.resize_token_embeddings(len(tokenizer))
    else:
        vocab_size = len(tokenizer)
        config.funnel.vocab_size = vocab_size
        config.t5.vocab_size = vocab_size
        config.vocab_size = vocab_size
        logger.info("Training new model from scratch")
        model = Funnel_T5_VAE_Model(config)

    if model_args.set_seq_size:
        tokenizer.model_max_length = model_args.set_seq_size
    tokenizer.mask_token = tokenizer.unk_token

    return model, tokenizer
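Example #5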
def init_ray_wandb_logger_callback(training_args):
    """
    Initialize the ray wandb integration, used specifically for hyperparameter
    tuning. Returns either None or a list containing the initialized callback,
    so the output can be passed directly to hp_search_kwargs.
    """
    has_wandb = is_wandb_available()
    if not has_wandb:
        return None

    project = os.getenv("WANDB_PROJECT", "huggingface")
    group = training_args.run_name
    callbacks = [WandbLoggerCallback(
        project=project,
        group=group,
    )]

    return callbacks
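The docstring notes that the return value can go straight into the hyperparameter-search kwargs. A minimal usage sketch, assuming a `trainer` and `training_args` already exist in the surrounding script and Ray Tune is the backend (extra keyword arguments, including `callbacks`, are forwarded to `tune.run`):

# `trainer` and `training_args` are assumed to be defined by the surrounding script.
hp_search_kwargs = {}
callbacks = init_ray_wandb_logger_callback(training_args)
if callbacks is not None:
    hp_search_kwargs["callbacks"] = callbacks

best_run = trainer.hyperparameter_search(
    direction="minimize",
    backend="ray",
    n_trials=10,
    **hp_search_kwargs,
)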
Example #6
    def early_init(cls, trainer_args, local_rank):
        has_wandb = is_wandb_available()
        assert has_wandb, \
            "WandbCallback requires wandb to be installed. Run `pip install wandb`."

        logger.info("Initializing wandb on rank", local_rank)
        if local_rank not in [-1, 0]:
            return

        # Deduce run name and group.
        init_args = {}
        if hasattr(trainer_args,
                   "trial_name") and trainer_args.trial_name is not None:
            run_name = trainer_args.trial_name
            init_args["group"] = trainer_args.run_name
        else:
            run_name = trainer_args.run_name

        wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"),
                   name=run_name,
                   reinit=True,
                   **init_args)

        return wandb.run.id
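A hypothetical call site (Example #9 below contains the one from this collection): invoke it once, before the Trainer is built, so that anything logged during setup lands in the same wandb run; the returned run id can be kept for later reference.

# `training_args` is assumed to come from HfArgumentParser in the surrounding script.
run_id = CustomWandbCallback.early_init(training_args, training_args.local_rank)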
Example #7
    def log(self, logs: Dict[str, float]) -> None:
        """
        Log :obj:`logs` on the various objects watching training.
        Subclass and override this method to inject custom behavior.
        Args:
            logs (:obj:`Dict[str, float]`):
                The values to log.
        """
        if hasattr(self, "_log"):
            warnings.warn(
                "The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
                FutureWarning,
            )
            return self._log(logs)
        logs["epoch"] = self.epoch_logging

        # if self.tb_writer:
        #     with self.tb_writer.as_default():
        #         for k, v in logs.items():
        #             tf.summary.scalar(k, v, step=self.global_step)
        #     self.tb_writer.flush()

        if is_wandb_available():
            wandb.log(logs, step=self.global_step)

        if is_comet_available():
            experiment = comet_ml.config.get_global_experiment()
            if experiment is not None:
                experiment._log_metrics(logs,
                                        step=self.global_step,
                                        epoch=self.epoch_logging,
                                        framework="transformers")

        output = {**logs, **{"step": self.global_step}}

        logger.info(output)
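Example #8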
    _use_apex = True
else:
    _use_native_amp = True

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met
    import torch_xla.distributed.parallel_loader as pl

if is_tensorboard_available():
    try:
        from torch.utils.tensorboard import SummaryWriter
    except ImportError:
        from tensorboardX import SummaryWriter

if is_wandb_available():
    import wandb

if is_comet_available():
    import comet_ml

if is_optuna_available():
    import optuna

if is_ray_available():
    from ray import tune


from length_adaptive_transformer.drop_and_restore_utils import (
    LengthDropArguments,
    sample_length_configuration,
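For context, availability guards like `is_wandb_available()` are essentially import probes. A minimal sketch of the idea (not the exact `transformers` implementation) is:

import importlib.util

def is_wandb_available() -> bool:
    # True only if the wandb package can be imported in this environment.
    return importlib.util.find_spec("wandb") is not None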
Example #9
def main():
    cmd_parser = argparse.ArgumentParser()
    cmd_parser.add_argument("experiments",
                            nargs="+",
                            choices=list(CONFIGS.keys()),
                            help="Available experiments")
    cmd_parser.add_argument("--local_rank",
                            default=None,
                            help="added by torch.distributed.launch")

    cmd_args = cmd_parser.parse_args()

    for experiment in cmd_args.experiments:
        config_dict = CONFIGS[experiment]
        local_rank = int(cmd_args.local_rank or -1)
        config_dict["local_rank"] = local_rank

        # See all possible arguments in transformers/training_args.py and ./run_args.py
        exp_parser = HfArgumentParser(
            (ModelArguments, DataTrainingArguments, CustomTrainingArguments))
        model_args, data_args, training_args = exp_parser.parse_dict(
            config_dict)

        # Override the default TrainingArguments behavior of setting the run
        # name equal to output_dir when no run name is provided.
        if training_args.run_name == training_args.output_dir:
            training_args.run_name = experiment
        # Run name (or experiment name) is added to the output_dir
        training_args.output_dir = os.path.join(training_args.output_dir,
                                                training_args.run_name)

        # Initialize wandb now to include the logs that follow.
        # For now, only support early wandb logging when running one experiment.
        distributed_initialized = torch.distributed.is_initialized()
        rank = torch.distributed.get_rank() if distributed_initialized else -1
        if is_wandb_available() and len(cmd_args.experiments) == 1:
            CustomWandbCallback.early_init(training_args, rank)

        # Detecting last checkpoint.
        last_checkpoint = None
        if (os.path.isdir(training_args.output_dir) and training_args.do_train
                and not training_args.overwrite_output_dir):
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            logging.warning(f"Loading from checkpoint: {last_checkpoint} ")
            if (last_checkpoint is None
                    and len(os.listdir(training_args.output_dir)) > 0):
                raise ValueError(
                    f"Output directory ({training_args.output_dir}) already exists and "
                    "is not empty. Use --overwrite_output_dir to overcome.")
            elif last_checkpoint is not None:
                logging.info(
                    f"Checkpoint detected, resuming training at {last_checkpoint}. To "
                    "avoid this behavior, change the `--output_dir` or add "
                    "`--overwrite_output_dir` to train from scratch.")

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            handlers=[logging.StreamHandler(sys.stdout)],
            level=(logging.INFO if is_main_process(training_args.local_rank)
                   else logging.WARN))

        # Log config.
        logging.info(f"Running with config:\n{pformat(config_dict, indent=4)}")

        # Log on each process the small summary:
        logging.warning(
            f"Process rank: {training_args.local_rank}, "
            f"device: {training_args.device}, n_gpu: {training_args.n_gpu} "
            f"distributed training: {bool(training_args.local_rank != -1)}, "
            f"16-bits training: {training_args.fp16}")
        # Set the verbosity to info of the Transformers logging (on main process only):
        if is_main_process(training_args.local_rank):
            transformers.utils.logging.set_verbosity_info()
            transformers.utils.logging.enable_default_handler()
            transformers.utils.logging.enable_explicit_format()
        logging.info("Training/evaluation parameters %s", training_args)
        logging.info("Model parameters: %s", model_args)
        logging.info("Data parameters: %s", data_args)

        # Set seed before initializing model.
        set_seed(training_args.seed)
        logging.info(f"Seed to reproduce: {training_args.seed}")

        if model_args.finetuning:
            run_finetuning_multiple_tasks(model_args,
                                          data_args,
                                          training_args,
                                          last_checkpoint=last_checkpoint)
        else:
            run_pretraining(model_args,
                            data_args,
                            training_args,
                            last_checkpoint=last_checkpoint)

        # destroy process group before launching another experiment
        if cmd_args.local_rank:
            torch.distributed.destroy_process_group()
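The `--local_rank` argument parsed above is injected automatically when this script is started through `torch.distributed.launch`; a hypothetical two-GPU invocation (script and experiment names are placeholders) would be `python -m torch.distributed.launch --nproc_per_node=2 run_experiments.py my_experiment`.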
Example #10
def load_model_and_tokenizer(model_args):
    # Distributed training:
    # The `.from_pretrained` methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_path:
        config = CONFIG[model_args.transformer_type].from_pretrained(
            model_args.config_path, cache_dir=model_args.cache_dir)
    elif model_args.model_path:
        config = CONFIG[model_args.transformer_type].from_pretrained(
            model_args.model_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG[model_args.transformer_type](
            latent_size=model_args.latent_size,
            transformer_name=model_args.transformer_name,
            transformer_decoder_name=model_args.transformer_decoder_name,
            encoder_model=model_args.encoder_model,
            decoder_model=model_args.decoder_model,
            set_seq_size=model_args.set_seq_size,
            encoded_seq_size=model_args.encoded_seq_size,
            n_previous_latent_codes=model_args.n_previous_latent_codes,
            mmd_batch_size=model_args.mmd_batch_size,
            use_reg_loss=(not model_args.dont_use_reg_loss),
            reg_schedule_k=model_args.reg_schedule_k,
            reg_schedule_b=model_args.reg_schedule_b,
            skip_schedule_k=model_args.skip_schedule_k,
            skip_schedule_b=model_args.skip_schedule_b,
            n_latent_tokens=model_args.n_latent_tokens,
            use_extra_logs=is_wandb_available(),
            use_skip_connection=model_args.use_skip_connection,
            use_latent_dropout=model_args.use_latent_dropout,
            max_latent_dropout_rate=model_args.max_latent_dropout_rate,
            latent_dropout_schedule_k=model_args.latent_dropout_schedule_k,
            latent_dropout_schedule_b=model_args.latent_dropout_schedule_b,
        )
        logger.warning(
            "You are instantiating a new config instance from scratch (still using T5 checkpoint)."
        )

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
        if 'gpt' in model_args.tokenizer_name:
            tokenizer.pad_token = tokenizer.eos_token
    elif model_args.model_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.transformer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.transformer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_path:
        model = MODEL[model_args.transformer_type].from_pretrained(
            model_args.model_path,
            from_tf=bool(".ckpt" in model_args.model_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = MODEL[model_args.transformer_type](config)

    model.resize_token_embeddings(len(tokenizer))
    if model_args.set_seq_size:
        tokenizer.model_max_length = model_args.set_seq_size
    tokenizer.mask_token = tokenizer.unk_token

    return model, tokenizer