Code example #1
# Keep the checkpoint with the best 'swa_loss_no_reg' metric.
checkpointer = ModelCheckpoint(filepath=checkpoint_filename + '.ckpt',
                               monitor='swa_loss_no_reg')

trainer = Trainer(gpus=1,
                  num_nodes=1,
                  max_epochs=epochs,
                  logger=logger,
                  callbacks=[lr_logger],
                  checkpoint_callback=checkpointer,
                  benchmark=True,
                  terminate_on_nan=True,
                  gradient_clip_val=max_l2_norm)

# With terminate_on_nan=True, older Lightning versions raise a ValueError
# when the loss becomes NaN/inf, so treat that as an early exit.
try:
    trainer.fit(swag_model)
except ValueError:
    print("Model", checkpoint_filename, 'exited early!', flush=True)
    exit(1)

# Log the best score and hyper-parameters, then save the model:

logger.log_hyperparams(
    params=swag_model.hparams,
    metrics={'swa_loss_no_reg': checkpointer.best_model_score.item()})
logger.save()
logger.finalize('success')

# Persist the SWAG model and the fitted input scaler (ssX) for later inference.
spock_reg_model.save_swag(swag_model, output_filename + '.pkl')
import pickle as pkl
pkl.dump(swag_model.ssX, open(output_filename + '_ssX.pkl', 'wb'))
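
For reference, here is a minimal sketch of how the pickled scaler could be loaded
back for inference. It only mirrors the pkl.dump call above (same filename
convention); how the SWAG weights themselves are restored depends on
spock_reg_model's own API and is not shown.

import pickle as pkl

# Reload the fitted input scaler saved above (assumes the same output_filename).
with open(output_filename + '_ssX.pkl', 'rb') as f:
    ssX = pkl.load(f)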
Code example #2
class TrainerLoggingMixin(ABC):

    def __init__(self):
        # This is just a summary of the variables used in this abstract class;
        # the proper values/initialisation should be done in the child class.
        self.current_epoch = None
        self.on_gpu = None
        self.log_gpu_memory = None
        self.logger = None
        self.tqdm_metrics = None
        self.global_step = None
        self.proc_rank = None
        self.use_dp = None
        self.use_ddp2 = None
        self.num_gpus = None

    def configure_logger(self, logger):
        if logger is True:
            # default logger
            self.logger = TensorBoardLogger(
                save_dir=self.default_save_path,
                version=self.slurm_job_id,
                name='lightning_logs'
            )
            self.logger.rank = 0
        elif logger is False:
            self.logger = None
        else:
            self.logger = logger
            self.logger.rank = 0

    def log_metrics(self, metrics, grad_norm_dic, step=None):
        """Logs the metric dict passed in.
        If `step` parameter is None and `step` key is presented is metrics,
        uses metrics["step"] as a step
        :param metrics (dict): Metric values
        :param grad_norm_dic (dict): Gradient norms
        :param step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
        """
        # add gpu memory
        if self.on_gpu and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            metrics.update(mem_map)

        # add norms
        metrics.update(grad_norm_dic)

        # turn all tensors to scalars
        scalar_metrics = self.metrics_to_scalars(metrics)

        if "step" in scalar_metrics and step is None:
            step = scalar_metrics.pop("step")
        else:
            # metrics added by Lightning for convenience
            metrics['epoch'] = self.current_epoch
            step = step if step is not None else self.global_step
        # log actual metrics
        if self.proc_rank == 0 and self.logger is not None:
            self.logger.log_metrics(scalar_metrics, step=step)
            self.logger.save()

    def add_tqdm_metrics(self, metrics):
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()

            self.tqdm_metrics[k] = v

    def metrics_to_scalars(self, metrics):
        new_metrics = {}
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()

            if isinstance(v, dict):
                v = self.metrics_to_scalars(v)

            new_metrics[k] = v

        return new_metrics

    def process_output(self, output, train=False):
        """Reduces output according to the training mode.

        Separates loss from logging and tqdm metrics
        :param output:
        :return:
        """
        # ---------------
        # EXTRACT CALLBACK KEYS
        # ---------------
        # all keys not progress_bar or log are candidates for callbacks
        callback_metrics = {}
        for k, v in output.items():
            if k not in ['progress_bar', 'log', 'hiddens']:
                callback_metrics[k] = v

        if train and (self.use_dp or self.use_ddp2):
            num_gpus = self.num_gpus
            callback_metrics = self.reduce_distributed_output(callback_metrics, num_gpus)

        for k, v in callback_metrics.items():
            if isinstance(v, torch.Tensor):
                callback_metrics[k] = v.item()

        # ---------------
        # EXTRACT PROGRESS BAR KEYS
        # ---------------
        try:
            progress_output = output['progress_bar']

            # reduce progress metrics for tqdm when using dp
            if train and (self.use_dp or self.use_ddp2):
                num_gpus = self.num_gpus
                progress_output = self.reduce_distributed_output(progress_output, num_gpus)

            progress_bar_metrics = progress_output
        except Exception:
            progress_bar_metrics = {}

        # ---------------
        # EXTRACT LOGGING KEYS
        # ---------------
        # extract metrics to log to experiment
        try:
            log_output = output['log']

            # reduce log metrics when using dp/ddp2
            if train and (self.use_dp or self.use_ddp2):
                num_gpus = self.num_gpus
                log_output = self.reduce_distributed_output(log_output, num_gpus)

            log_metrics = log_output
        except Exception:
            log_metrics = {}

        # ---------------
        # EXTRACT LOSS
        # ---------------
        # if output dict doesn't have the keyword loss
        # then assume the output=loss if scalar
        loss = None
        if train:
            try:
                loss = output['loss']
            except Exception:
                if isinstance(output, torch.Tensor):
                    loss = output
                else:
                    raise RuntimeError(
                        'No `loss` value in the dictionary returned from `model.training_step()`.'
                    )

            # when using dp need to reduce the loss
            if self.use_dp or self.use_ddp2:
                loss = self.reduce_distributed_output(loss, self.num_gpus)

        # ---------------
        # EXTRACT HIDDEN
        # ---------------
        hiddens = output.get('hiddens')

        # use every metric passed in as a candidate for callback
        callback_metrics.update(progress_bar_metrics)
        callback_metrics.update(log_metrics)

        # convert tensors to numpy
        for k, v in callback_metrics.items():
            if isinstance(v, torch.Tensor):
                callback_metrics[k] = v.item()

        return loss, progress_bar_metrics, log_metrics, callback_metrics, hiddens

    def reduce_distributed_output(self, output, num_gpus):
        if num_gpus <= 1:
            return output

        # when using DP, we get one output per gpu
        # average outputs and return
        if isinstance(output, torch.Tensor):
            return output.mean()

        for k, v in output.items():
            # recurse on nested dicts
            if isinstance(output[k], dict):
                output[k] = self.reduce_distributed_output(output[k], num_gpus)

            # do nothing when there's a scalar
            elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0:
                pass

            # reduce only metrics that have the same number of gpus
            elif output[k].size(0) == num_gpus:
                reduced = torch.mean(output[k])
                output[k] = reduced
        return output
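
For context, process_output() consumes the dict returned by the model's step hooks
(training_step and friends). A minimal sketch of the shape it expects is shown
below; the key names ('loss', 'progress_bar', 'log', 'hiddens') come from the code
above, while the metric names and values are illustrative only.

import torch

# Illustrative output dict for one training step.
output = {
    'loss': torch.tensor(0.25),                           # required when train=True
    'progress_bar': {'train_loss': torch.tensor(0.25)},   # shown in the tqdm bar
    'log': {'train_loss': torch.tensor(0.25)},            # sent to the logger
    'hiddens': None,                                       # optional, e.g. for truncated BPTT
}
# process_output(output, train=True) would then return the tuple
# (loss, progress_bar_metrics, log_metrics, callback_metrics, hiddens).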
Code example #3
def main(arguments: argparse.Namespace) -> None:
    """Train the model.
    Args:
        arguments: Model hyper-parameters
    Note:
        For the sake of the example, the images dataset will be downloaded
        to a temporary directory.
    """

    print_system_info()
    print("Using following configuration: ")
    pprint(vars(arguments))

    for fold in range(arguments.folds):

        if arguments.only_fold != -1:
            fold = arguments.only_fold

        print(f"Fold {fold}: Training is starting...")
        arguments.fold = fold
        model = OneCycleModule(arguments)
        logger = TensorBoardLogger("../logs",
                                   name=f"{arguments.backbone}-fold-{fold}")

        early_stop_callback = EarlyStopping(monitor='val_f1',
                                            min_delta=0.00,
                                            patience=5,
                                            verbose=True,
                                            mode='max')
        checkpoint_callback = ModelCheckpoint(filepath=os.path.join(
            arguments.save_model_path,
            f"checkpoint-{arguments.backbone}-fold-{fold}" +
            "-{epoch:02d}-{val_f1:.2f}"),
                                              save_top_k=arguments.save_top_k,
                                              monitor="val_f1",
                                              mode="max",
                                              verbose=True)
        trainer = pl.Trainer(
            weights_summary=None,
            num_sanity_val_steps=0,
            gpus=arguments.gpus,
            min_epochs=arguments.epochs,
            max_epochs=arguments.epochs,
            logger=logger,
            deterministic=True,
            benchmark=True,
            early_stop_callback=early_stop_callback,
            checkpoint_callback=checkpoint_callback,
            callbacks=[lr_logger],
            precision=arguments.precision,
            row_log_interval=10,
            # val_check_interval=0.5,
            accumulate_grad_batches=1
            # fast_dev_run=True
        )

        trainer.fit(model)

        logger.log_hyperparams(
            arguments,
            {"hparams/val_f1": checkpoint_callback.best_model_score.item()})
        logger.save()

        print("-" * 80)
        print(f"Testing the model on fold: {fold}")
        trainer.test(model)

        model.cpu()
        del model
        del trainer
        del logger
        del early_stop_callback
        del checkpoint_callback

        # end CV loop if we only train on one fold
        if arguments.only_fold != -1:
            break
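
The function assumes an argparse.Namespace carrying the fields it reads. A minimal
sketch of a matching parser follows; the field names mirror those accessed on
arguments above, while all default values are assumptions.

import argparse

def build_parser() -> argparse.ArgumentParser:
    # Defaults are illustrative only; adjust to the actual experiment setup.
    parser = argparse.ArgumentParser()
    parser.add_argument('--folds', type=int, default=5)
    parser.add_argument('--only_fold', type=int, default=-1)      # -1 = run all folds
    parser.add_argument('--backbone', type=str, default='resnet50')
    parser.add_argument('--epochs', type=int, default=30)
    parser.add_argument('--gpus', type=int, default=1)
    parser.add_argument('--precision', type=int, default=32)
    parser.add_argument('--save_model_path', type=str, default='./checkpoints')
    parser.add_argument('--save_top_k', type=int, default=1)
    return parser

if __name__ == '__main__':
    main(build_parser().parse_args())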