Example #1
    def __set_fit_dataloaders(self, model, train_dataloader, val_dataloaders, test_dataloaders):
        # when dataloader is passed via fit, patch the train_dataloader
        # functions to overwrite with these implementations
        if train_dataloader is not None:
            if not self.is_overriden('training_step', model):
                m = 'You called .fit() with a train_dataloader but did not define training_step()'
                raise MisconfigurationException(m)

            def patch_train_dataloader():
                return train_dataloader

            model.train_dataloader = patch_train_dataloader

        if val_dataloaders is not None:
            if not self.is_overriden('validation_step', model):
                m = 'You called .fit() with a val_dataloaders but did not define validation_step()'
                raise MisconfigurationException(m)

            def patch_val_dataloader():
                return val_dataloaders

            model.val_dataloader = patch_val_dataloader

        if test_dataloaders is not None:
            if not self.is_overriden('test_step', model):
                m = 'You called .fit() with a test_dataloaders but did not define test_step()'
                raise MisconfigurationException(m)

            def patch_test_dataloader():
                return test_dataloaders

            model.test_dataloader = patch_test_dataloader
Example #2
    def __attach_dataloaders(self, model, train_dataloader, val_dataloaders,
                             test_dataloaders):
        # when dataloader is passed via fit, patch the train_dataloader
        # functions to overwrite with these implementations
        if train_dataloader is not None:
            if not self.is_overriden('training_step', model):
                raise MisconfigurationException(
                    'You called `.fit()` with a `train_dataloader` but did not define `training_step()`'
                )

            model.train_dataloader = _PatchDataLoader(train_dataloader)

        if val_dataloaders is not None:
            if not self.is_overriden('validation_step', model):
                raise MisconfigurationException(
                    'You called `.fit()` with a `val_dataloaders` but did not define `validation_step()`'
                )

            model.val_dataloader = _PatchDataLoader(val_dataloaders)

        if test_dataloaders is not None:
            if not self.is_overriden('test_step', model):
                raise MisconfigurationException(
                    'You called `.fit()` with a `test_dataloaders` but did not define `test_step()`'
                )

            model.test_dataloader = _PatchDataLoader(test_dataloaders)
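This refactor of Example #1 swaps the nested patch_* closures for a _PatchDataLoader helper, whose definition is not part of the snippet. A minimal sketch of what such a helper might look like, assuming the trainer only ever calls model.train_dataloader() (and friends) and uses whatever is returned:

class _PatchDataLoader(object):
    """Sketch: callable that hands back the dataloader(s) passed to `.fit()`.

    Illustrative only; the real helper may carry extra bookkeeping.
    """

    def __init__(self, dataloader):
        self.dataloader = dataloader

    def __call__(self):
        return self.dataloader

A plain callable object like this is picklable, unlike the nested functions in Example #1, which can matter when the model is sent to worker processes for distributed training.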
Example #3
    def __set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
        # skip for CPU
        if self.num_gpus == 0:
            return

        # single GPU case
        if self.num_gpus == 1:
            self.single_gpu = True

            if distributed_backend is not None:
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'

        # multiple GPU case
        elif self.num_gpus > 1:
            if distributed_backend is not None:
                # DP, DDP case
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'

            elif distributed_backend is None:
                m = 'When using multiple GPUs set ' \
                    'Trainer(distributed_backend=dp) (or ddp)'
                raise MisconfigurationException(m)

        # use ddp automatically if nb_gpu_nodes > 1
        if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
            self.use_ddp = True
            self.use_dp = False
            w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
                'Switching to DistributedDataParallel for you. ' \
                'To silence this warning set distributed_backend=ddp'
            warnings.warn(w)

        print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))
Example #4
    def update_learning_rates(self, interval):
        ''' Update learning rates
        Args:
            interval (str): either 'epoch' or 'step'.
        '''
        if not self.lr_schedulers:
            return

        for lr_scheduler in self.lr_schedulers:
            current_idx = self.batch_idx if interval == 'step' else self.current_epoch
            current_idx += 1  # account for both batch and epoch starting from 0
            # Take step if the call to update_learning_rates matches the interval key and
            # the current step modulo the scheduler's frequency is zero
            if (lr_scheduler['interval'] == interval
                    and current_idx % lr_scheduler['frequency'] == 0):
                # If instance of ReduceLROnPlateau, we need to pass validation loss
                if lr_scheduler['reduce_on_plateau']:
                    monitor_key = lr_scheduler['monitor']
                    monitor_val = self.callback_metrics.get(monitor_key)
                    if monitor_val is None:
                        avail_metrics = ','.join(
                            list(self.callback_metrics.keys()))
                        m = f'ReduceLROnPlateau conditioned on metric {monitor_key} ' \
                            f'which is not available. Available metrics are: {avail_metrics}. ' \
                            'Condition can be set using `monitor` key in lr scheduler dict'
                        raise MisconfigurationException(m)
                    lr_scheduler['scheduler'].step(monitor_val)
                else:
                    lr_scheduler['scheduler'].step()
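update_learning_rates() reads the keys 'scheduler', 'interval', 'frequency', 'reduce_on_plateau' and 'monitor' from each entry of self.lr_schedulers. A hedged sketch of one such entry (the optimizer and scheduler objects below are illustrative, not taken from this example):

import torch

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)  # assumes an existing `model`
scheduler_config = {
    'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer),
    'interval': 'epoch',        # compared against the `interval` argument above
    'frequency': 1,             # step on every matching interval
    'reduce_on_plateau': True,  # if True, .step() receives the monitored metric
    'monitor': 'val_loss',      # key looked up in self.callback_metrics
}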
Example #5
def parse_gpu_ids(gpus):
    """
    :param gpus: Int, string or list
        An int -1 or string '-1' indicates that all available GPUs should be used.
        A list of ints or a string containing a comma-separated list of integers
        indicates specific GPUs to use.
        An int 0 means that no GPUs should be used.
        Any int N > 0 indicates that GPUs [0..N) should be used.
    :return: List of gpus to be used

        If no GPUs are available but the value of the gpus variable requests GPUs,
        a MisconfigurationException is raised.
    """

    # Check that gpus param is None, Int, String or List
    check_gpus_data_type(gpus)

    # Handle the case when no gpus are requested
    if gpus is None or (isinstance(gpus, int) and gpus == 0):
        return None

    # We know user requested GPUs therefore if some of the
    # requested GPUs are not available an exception is thrown.

    gpus = normalize_parse_gpu_string_input(gpus)
    gpus = normalize_parse_gpu_input_to_list(gpus)
    gpus = sanitize_gpu_ids(gpus)

    if not gpus:
        raise MisconfigurationException(
            "GPUs requested but none are available.")
    return gpus
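For reference, the input formats described in the docstring map to outputs roughly as follows (a sketch assuming a machine with two visible GPUs; the normalize_* and sanitize helpers are defined elsewhere):

# sketch: expected results on a machine with GPUs 0 and 1
parse_gpu_ids(None)     # -> None (no GPUs requested)
parse_gpu_ids(0)        # -> None (no GPUs requested)
parse_gpu_ids(2)        # -> [0, 1]   (int N > 0 selects GPUs [0..N))
parse_gpu_ids(-1)       # -> [0, 1]   (all available GPUs)
parse_gpu_ids('-1')     # -> [0, 1]
parse_gpu_ids([0, 1])   # -> [0, 1]
parse_gpu_ids('0,1')    # -> [0, 1]   (comma-separated string)
parse_gpu_ids([0, 3])   # -> raises MisconfigurationException (GPU 3 is not available)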
Example #6
    def get_dataloaders(self, model):
        """
        Dataloaders are provided by the model
        :param model:
        :return:
        """
        self.tng_dataloader = model.tng_dataloader
        self.test_dataloader = model.test_dataloader
        self.val_dataloader = model.val_dataloader

        if self.use_ddp and not isinstance(self.tng_dataloader.sampler,
                                           DistributedSampler):
            msg = """
when using multiple gpus and multiple nodes you must pass
 a DistributedSampler to DataLoader(sampler).

i.e. this:
dataset = myDataset()
dataloader = DataLoader(dataset)

becomes:
dataset = myDataset()
dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=dist_sampler)
"""
            raise MisconfigurationException(msg)
Example #7
    def _load_model_state(cls, checkpoint):
        cls_takes_hparams = 'hparams' in inspect.signature(
            cls.__init__).parameters
        ckpt_hparams = checkpoint.get('hparams')

        if cls_takes_hparams:
            if ckpt_hparams is not None:
                hparams = Namespace(**ckpt_hparams)
            else:
                warnings.warn(
                    f"Checkpoint does not contain hyperparameters but {cls.__name__}'s __init__ contains"
                    " argument 'hparams'. Will pass in an empty Namespace instead."
                    " Did you forget to store your model hyperparameters in self.hparams?"
                )
                hparams = Namespace()
        else:  # The user's LightningModule does not define a hparams argument
            if ckpt_hparams is None:
                hparams = None
            else:
                raise MisconfigurationException(
                    f"Checkpoint contains hyperparameters but {cls.__name__}'s __init__ is missing the"
                    " argument 'hparams'. Are you loading the correct checkpoint?"
                )

        # load the state_dict on the model automatically
        model_args = [hparams] if hparams else []
        model = cls(*model_args)
        model.load_state_dict(checkpoint['state_dict'])

        # give model a chance to load something
        model.on_load_checkpoint(checkpoint)

        return model
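Only two checkpoint keys matter here: 'state_dict' (required) and 'hparams' (optional, forwarded to __init__ when the class accepts an hparams argument). A hedged usage sketch, assuming _load_model_state is exposed as a classmethod on a user-defined MyModule(LightningModule):

import torch

# 'example.ckpt' is an illustrative path to a checkpoint written by the trainer
checkpoint = torch.load('example.ckpt', map_location='cpu')
model = MyModule._load_model_state(checkpoint)
model.eval()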
Example #8
    def dp_train(self, model):

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(
            model.configure_optimizers())

        model.cuda(self.root_gpu)

        # check for this bug (amp + dp + !O1 doesn't work)
        # https://github.com/NVIDIA/apex/issues/227
        if self.use_dp and self.use_amp:
            if self.amp_level == 'O2':  # pragma: no cover
                m = f"""
                Amp level {self.amp_level} with DataParallel is not supported.
                See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
                We recommend you switch to ddp if you want to use amp
                """
                raise MisconfigurationException(m)
            else:
                model, optimizers = model.configure_apex(
                    amp, model, self.optimizers, self.amp_level)

        # create list of device ids
        device_ids = self.data_parallel_device_ids
        if isinstance(device_ids, int):
            device_ids = list(range(device_ids))

        model = LightningDataParallel(model, device_ids=device_ids)

        self.run_pretrain_routine(model)
Example #9
    def get_dataloaders(self, model):
        """
        Dataloaders are provided by the model
        :param model:
        :return:
        """

        self.init_train_dataloader(model)
        self.init_test_dataloader(model)
        self.init_val_dataloader(model)

        if self.use_ddp or self.use_ddp2:
            # wait for all processes to catch up
            dist.barrier()

            # load each dataloader
            self.get_train_dataloader()
            self.get_test_dataloaders()
            self.get_val_dataloaders()

        # support IterableDataset for train data
        self.is_iterable_train_dataloader = (EXIST_ITER_DATASET and isinstance(
            self.get_train_dataloader().dataset, IterableDataset))
        if self.is_iterable_train_dataloader and not isinstance(
                self.val_check_interval, int):
            m = '''
            When using an IterableDataset for `train_dataloader`,
            `Trainer(val_check_interval)` must be an int.
            An int k specifies checking validation every k training batches
            '''
            raise MisconfigurationException(m)
Example #10
    def __set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
        # make DP and DDP mutually exclusive
        # single GPU will also use DP with devices=[0]
        requested_gpus = self.data_parallel_device_ids is not None

        num_gpus = self.num_gpus
        if num_gpus > 0:
            # single GPU case
            if num_gpus == 1:
                self.single_gpu = True

            elif num_gpus > 1 and distributed_backend is not None:
                # DP, DDP case
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'

                # use ddp automatically if nb_gpu_nodes > 1
                if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
                    self.use_ddp = True
                    self.use_dp = False
                    w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
                        'Switching to DistributedDataParallel for you. ' \
                        'To silence this warning set distributed_backend=ddp'
                    warnings.warn(w)

            elif distributed_backend is None:
                m = 'When using multiple GPUs set ' \
                    'Trainer(distributed_backend=dp) (or ddp)'
                raise MisconfigurationException(m)

        print('gpu available: {}, used: {}'.format(torch.cuda.is_available(),
                                                   self.on_gpu))
Example #11
    def __dp_train(self, model):

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(
            model.configure_optimizers())

        root_gpu = 0
        if type(self.data_parallel_device_ids) is list:
            root_gpu = self.data_parallel_device_ids[0]
        model.cuda(root_gpu)

        # check for this bug (amp + dp + !O1 doesn't work)
        # https://github.com/NVIDIA/apex/issues/227
        if self.use_dp and self.use_amp:
            m = f"""
            Amp level {self.amp_level} with DataParallel is not supported.
            See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
            We recommend you switch to ddp if you want to use amp
            """
            raise MisconfigurationException(m)

        model = LightningDataParallel(model,
                                      device_ids=self.data_parallel_device_ids)

        self.__run_pretrain_routine(model)
Example #12
    def set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
        # skip for CPU
        if self.num_gpus == 0:
            return

        # single GPU case
        # in single gpu case we allow ddp so we can train on multiple
        # nodes, 1 gpu per node
        if self.num_gpus == 1:
            self.single_gpu = True

            if distributed_backend is not None:
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'
                self.use_ddp2 = distributed_backend == 'ddp2'

                # disable single gpu when using ddp2
                if self.use_ddp2:
                    self.single_gpu = False

        # multiple GPU case
        elif self.num_gpus > 1:
            if distributed_backend is not None:
                # DP, DDP case
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'
                self.use_ddp2 = distributed_backend == 'ddp2'

            elif distributed_backend is None:
                m = 'When using multiple GPUs set ' \
                    'Trainer(distributed_backend=dp) (or ddp)'
                raise MisconfigurationException(m)

        # throw error to force user ddp or ddp2 choice
        if nb_gpu_nodes > 1 and not (self.use_ddp2
                                     or self.use_ddp):  # pragma: no cover
            w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
                'To use multiple nodes set distributed_backend=ddp ' \
                'or distributed_backend=ddp2'
            raise MisconfigurationException(w)

        logging.info(
            f'gpu available: {torch.cuda.is_available()}, used: {self.on_gpu}')
Example #13
    def init_train_dataloader(self, model):
        """
        Dataloaders are provided by the model
        :param model:
        :return:
        """
        self.get_train_dataloader = model.train_dataloader
        self._percent_range_check('train_percent_check')

        # determine number of training batches
        if DALI_AVAILABLE and isinstance(self.get_train_dataloader(),
                                         DALIGenericIterator):
            self._dali_iterator_check(model, 'train')
            self.num_training_batches = self._get_dali_batch_count(
                self.get_train_dataloader())
        elif ITER_DATASET_AVAILABLE and isinstance(
                self.get_train_dataloader().dataset, IterableDataset):
            self.num_training_batches = float('inf')
        else:
            self.num_training_batches = len(self.get_train_dataloader())
            self.num_training_batches = int(self.num_training_batches *
                                            self.train_percent_check)

        # determine when to check validation
        # if int passed in, val checks that often
        # otherwise, it checks in [0, 1.0] % range of a training epoch
        if isinstance(self.val_check_interval, int):
            self.val_check_batch = self.val_check_interval
            if self.val_check_batch > self.num_training_batches:
                raise ValueError(
                    f"`val_check_interval` ({self.val_check_interval}) must be less than or equal "
                    f"to the number of the training batches ({self.num_training_batches}). "
                    f"If you want to disable validation set `val_percent_check` to 0.0 instead."
                )
        else:
            self._percent_range_check('val_check_interval')

            self.val_check_batch = int(self.num_training_batches *
                                       self.val_check_interval)
            self.val_check_batch = max(1, self.val_check_batch)

        self._ddp_sampler_check(self.get_train_dataloader(), 'train')

        # support IterableDataset for train data
        self.is_iterable_train_dataloader = (
            isinstance(self.get_train_dataloader(), DataLoader)
            and ITER_DATASET_AVAILABLE and isinstance(
                self.get_train_dataloader().dataset, IterableDataset))
        if self.is_iterable_train_dataloader and not isinstance(
                self.val_check_interval, int):
            m = '''
            When using an IterableDataset for `train_dataloader`,
            `Trainer(val_check_interval)` must be an int.
            An int k specifies checking validation every k training batches
            '''
            raise MisconfigurationException(m)
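Several of the dataloader helpers above and below call self._percent_range_check(name) before scaling batch counts. Its body is not shown in these examples; a minimal sketch consistent with how it is used (validating that the named Trainer attribute lies in [0, 1]) could be:

    def _percent_range_check(self, name):
        # sketch: e.g. name='train_percent_check' -> check self.train_percent_check
        value = getattr(self, name)
        if not 0.0 <= value <= 1.0:
            raise ValueError(f'`{name}` must be between 0.0 and 1.0, got {value}.')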
Example #14
    def reset_train_dataloader(self, model):
        """
        Dataloaders are provided by the model
        :param model:
        :return:
        """

        self.train_dataloader = self.request_data_loader(
            model.train_dataloader)
        self.num_training_batches = 0

        # automatically add samplers
        self.train_dataloader = self.auto_add_sampler(self.train_dataloader,
                                                      train=True)

        # determine number of training batches
        if EXIST_ITER_DATASET and isinstance(self.train_dataloader.dataset,
                                             IterableDataset):
            self.num_training_batches = float('inf')
        else:
            self._percent_range_check('train_percent_check')

            self.num_training_batches = len(self.train_dataloader)
            self.num_training_batches = int(self.num_training_batches *
                                            self.train_percent_check)

        # determine when to check validation
        # if int passed in, val checks that often
        # otherwise, it checks in [0, 1.0] % range of a training epoch
        if isinstance(self.val_check_interval, int):
            self.val_check_batch = self.val_check_interval
            if self.val_check_batch > self.num_training_batches:
                raise ValueError(
                    f"`val_check_interval` ({self.val_check_interval}) must be less than or equal "
                    f"to the number of the training batches ({self.num_training_batches}). "
                    f"If you want to disable validation set `val_percent_check` to 0.0 instead."
                )
        else:
            self._percent_range_check('val_check_interval')

            self.val_check_batch = int(self.num_training_batches *
                                       self.val_check_interval)
            self.val_check_batch = max(1, self.val_check_batch)

        # support IterableDataset for train data
        self.is_iterable_train_dataloader = (EXIST_ITER_DATASET and isinstance(
            self.train_dataloader.dataset, IterableDataset))
        if self.is_iterable_dataloader(
                self.train_dataloader) and not isinstance(
                    self.val_check_interval, int):
            m = '''
            When using an IterableDataset for `train_dataloader`,
            `Trainer(val_check_interval)` must be an int.
            An int k specifies checking validation every k training batches
            '''
            raise MisconfigurationException(m)
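The EXIST_ITER_DATASET flag guards the IterableDataset checks here and in Example #9, since IterableDataset only exists from torch 1.2 onwards. A plausible module-level definition, assuming the usual try/except import guard:

try:
    # IterableDataset was added in torch 1.2; older versions do not have it
    from torch.utils.data import IterableDataset
    EXIST_ITER_DATASET = True
except ImportError:
    EXIST_ITER_DATASET = False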
Example #15
def check_gpus_data_type(gpus):
    """
    :param gpus: gpus parameter as passed to the Trainer
        Function checks that it is one of: None, Int, String or List
        Throws otherwise
    :return: return unmodified gpus variable
    """

    if gpus is not None and type(gpus) not in (int, str, list):
        raise MisconfigurationException("GPUs must be int, string or list of ints or None.")
Example #16
    def __run_evaluation(self, test=False):
        # when testing make sure user defined a test step
        can_run_test_step = False
        if test:
            can_run_test_step = self.__is_overriden('test_step') and self.__is_overriden('test_end')
            if not can_run_test_step:
                m = '''You called .test() without defining a test step or test_end.
                Please define and try again'''
                raise MisconfigurationException(m)

        # validate only if model has validation_step defined
        # test only if test_step or validation_step are defined
        run_val_step = self.__is_overriden('validation_step')

        if run_val_step or can_run_test_step:

            # hook
            model = self.__get_model()
            model.on_pre_performance_check()

            # select dataloaders
            dataloaders = self.val_dataloader
            max_batches = self.nb_val_batches

            # calculate max batches to use
            if test:
                dataloaders = self.test_dataloader
                max_batches = self.nb_test_batches

            # cap max batches to 1 when using fast_dev_run
            if self.fast_dev_run:
                max_batches = 1

            for ds_i, dataloader in enumerate(dataloaders):
                eval_out_metrics = self.evaluate(self.model,
                                                 dataloader,
                                                 max_batches,
                                                 ds_i,
                                                 test)

                self.__add_tqdm_metrics(eval_out_metrics)

                # hook
                model.on_post_performance_check()

            if self.show_progress_bar:
                # add model specific metrics
                tqdm_metrics = self.__tng_tqdm_dic
                self.progress_bar.set_postfix(**tqdm_metrics)

        # model checkpointing
        if self.proc_rank == 0 and self.checkpoint_callback is not None and not test:
            print('save callback...')
            self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch,
                                                  logs=self.__tng_tqdm_dic)
Example #17
    def num_gpus(self):
        gpus = self.data_parallel_device_ids
        if gpus is None:
            return 0
        if type(gpus) is list:
            return len(gpus)
        if type(gpus) is int:
            return gpus

        m = 'gpus must be int, none or list of ints'
        raise MisconfigurationException(m)
Example #18
    def fit(self, model):

        # when using multi-node or DDP within a node start each module in a separate process
        if self.use_ddp:
            # must copy only the meta of the exp so it survives pickle/unpickle
            #  when going to new process
            if self.experiment is not None:
                self.experiment = self.experiment.get_meta_copy()

            if self.is_slurm_managing_tasks:
                task = int(os.environ['SLURM_LOCALID'])
                self.ddp_train(task, model)
            else:
                msg = """
You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
We will launch %(nb_gpus)s processes for you.
We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
If you're not using SLURM, ignore this message!
""" % {
                    'nb_gpus': self.nb_requested_gpus,
                    'nb_tasks': self.nb_slurm_tasks
                }
                warnings.warn(msg)
                mp.spawn(self.ddp_train,
                         nprocs=len(self.data_parallel_device_ids),
                         args=(model, ))

        # 1 gpu or dp option triggers training using DP module
        # easier to avoid NCCL issues
        elif self.use_dp:
            self.__dp_train(model)

        elif self.single_gpu:
            self.__single_gpu_train(model)

        # ON CPU
        else:
            # run through amp wrapper
            if self.use_amp:
                raise MisconfigurationException('amp + cpu is not supported.'
                                                ' Please use a GPU option')

            # CHOOSE OPTIMIZER
            # allow for lr schedulers as well
            self.optimizers = model.configure_optimizers()
            if len(self.optimizers) == 2 and type(self.optimizers[0]) is list:
                self.optimizers, self.lr_schedulers = self.optimizers

            self.__run_pretrain_routine(model)

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        return 1
Example #19
    def fit(self, model):
        r"""
        Runs the full optimization routine.

        Example::

            trainer = Trainer()
            model = LightningModule()

            trainer.fit(model)
        """
        # when using multi-node or DDP within a node start each module in a separate process
        if self.use_ddp2:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)

        elif self.use_ddp:
            if self.is_slurm_managing_tasks:
                task = int(os.environ['SLURM_LOCALID'])
                self.ddp_train(task, model)
            else:
                mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

        # 1 gpu or dp option triggers training using DP module
        # easier to avoid NCCL issues
        elif self.use_dp:
            self.dp_train(model)

        elif self.single_gpu:
            self.single_gpu_train(model)

        elif self.use_tpu:
            log.info(f'training on {self.num_tpu_cores} TPU cores')

            #  COLAB_GPU is an env var available by default in Colab environments.
            start_method = 'fork' if os.getenv('COLAB_GPU') else 'spawn'
            xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)

        # ON CPU
        else:
            # run through amp wrapper
            if self.use_amp:
                raise MisconfigurationException('amp + cpu is not supported.  Please use a GPU option')

            # CHOOSE OPTIMIZER
            # allow for lr schedulers as well
            self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

            self.run_pretrain_routine(model)

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        return 1
Example #20
    def reset_train_dataloader(self, model: LightningModule) -> None:
        """Resets the train dataloader and initialises required variables
        (number of batches, when to validate, etc.).

        Args:
            model: The current `LightningModule`
        """
        self.train_dataloader = self.request_dataloader(model.train_dataloader)
        self.num_training_batches = 0

        # automatically add samplers
        self.train_dataloader = self.auto_add_sampler(self.train_dataloader,
                                                      train=True)

        self._percent_range_check('train_percent_check')

        if not _has_len(self.train_dataloader):
            self.num_training_batches = float('inf')
        else:
            # try getting the length
            self.num_training_batches = len(self.train_dataloader)
            self.num_training_batches = int(self.num_training_batches *
                                            self.train_percent_check)

        # determine when to check validation
        # if int passed in, val checks that often
        # otherwise, it checks in [0, 1.0] % range of a training epoch
        if isinstance(self.val_check_interval, int):
            self.val_check_batch = self.val_check_interval
            if self.val_check_batch > self.num_training_batches:
                raise ValueError(
                    f'`val_check_interval` ({self.val_check_interval}) must be less than or equal '
                    f'to the number of the training batches ({self.num_training_batches}). '
                    'If you want to disable validation set `val_percent_check` to 0.0 instead.'
                )
        else:
            if not _has_len(self.train_dataloader):
                if self.val_check_interval == 1.0:
                    self.val_check_batch = float('inf')
                else:
                    raise MisconfigurationException(
                        'When using an infinite DataLoader (e.g. with an IterableDataset or when '
                        'DataLoader does not implement `__len__`) for `train_dataloader`, '
                        '`Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies '
                        'checking validation every k training batches.')
            else:
                self._percent_range_check('val_check_interval')

                self.val_check_batch = int(self.num_training_batches *
                                           self.val_check_interval)
                self.val_check_batch = max(1, self.val_check_batch)
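This later revision replaces the explicit IterableDataset checks with a _has_len(dataloader) helper, treating loaders without a length as infinite. A minimal sketch of such a helper, assuming it only needs to detect whether len() is supported:

def _has_len(dataloader) -> bool:
    """Sketch: True if the dataloader supports len(), False for iterable-style
    loaders whose dataset does not implement __len__."""
    try:
        len(dataloader)
        return True
    except TypeError:
        return False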
Example #21
def sanitize_gpu_ids(gpus):
    """
    :param gpus: list of ints corresponding to GPU indices
        Checks that each of the GPUs in the list is actually available.
        Throws if any of the GPUs is not available.
    :return: unmodified gpus variable
    """
    all_available_gpus = get_all_available_gpus()
    for gpu in gpus:
        if gpu not in all_available_gpus:
            raise MisconfigurationException(f"""
                You requested GPUs: {gpus}
                But your machine only has: {all_available_gpus}
            """)
    return gpus
Example #22
def sanitize_gpu_ids(gpus):
    """
    :param gpus: list of ints corresponding to GPU indices
        Checks that each of the GPUs in the list is actually available.
        Throws if any of the GPUs is not available.
    :return: unmodified gpus variable
    """
    all_available_gpus = get_all_available_gpus()
    for gpu in gpus:
        if gpu not in all_available_gpus:
            message = f"""
            Non-available gpu index {gpu} specified:
            Available gpu indices are: {all_available_gpus}
            """
            raise MisconfigurationException(message)
    return gpus
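Both variants of sanitize_gpu_ids validate the requested indices against get_all_available_gpus(), which is not shown. A minimal sketch, assuming it simply enumerates the CUDA devices visible to torch:

import torch


def get_all_available_gpus():
    """Sketch: indices of every GPU torch can see on this machine."""
    return list(range(torch.cuda.device_count()))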
Example #23
    def set_distributed_mode(self, distributed_backend, num_gpu_nodes):
        # skip for CPU
        if self.num_gpus == 0:
            return

        # single GPU case
        # in single gpu case we allow ddp so we can train on multiple
        # nodes, 1 gpu per node
        if self.num_gpus == 1:
            self.single_gpu = True

            if distributed_backend is not None:
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'
                self.use_ddp2 = distributed_backend == 'ddp2'

                # disable single gpu when using ddp2
                if self.use_ddp2:
                    self.single_gpu = False

        # multiple GPU case
        elif self.num_gpus > 1:
            if distributed_backend is not None:
                # DP, DDP case
                self.use_dp = distributed_backend == 'dp'
                self.use_ddp = distributed_backend == 'ddp'
                self.use_ddp2 = distributed_backend == 'ddp2'

            elif distributed_backend is None:
                warnings.warn(
                    'You requested multiple GPUs but did not specify a backend, e.g.'
                    ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                    ' Setting distributed_backend=dp for you.')
                self.use_dp = True
                self.use_ddp = False
                self.use_ddp2 = False

        # throw error to force user ddp or ddp2 choice
        if num_gpu_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
            w = 'DataParallel does not support num_nodes > 1. ' \
                'To use multiple nodes set distributed_backend=ddp ' \
                'or distributed_backend=ddp2'
            raise MisconfigurationException(w)

        log.info(
            f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')
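Matched against the branches above, the backend is selected through the Trainer's gpus and distributed_backend arguments (values below are illustrative; the multi-node argument is omitted here):

from pytorch_lightning import Trainer

# illustrative configurations and the flags they would set above
Trainer(gpus=1)                             # single_gpu=True, no backend required
Trainer(gpus=4, distributed_backend='dp')   # use_dp=True (DataParallel on one node)
Trainer(gpus=4, distributed_backend='ddp')  # use_ddp=True (DistributedDataParallel)
Trainer(gpus=4)                             # warns and falls back to dp, per the branch above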
Example #24
    def _reset_eval_dataloader(self, model: LightningModule,
                               mode: str) -> Tuple[int, List[DataLoader]]:
        """Generic method to reset a dataloader for evaluation.

        Args:
            model: The current `LightningModule`
            mode: Either `'val'` or `'test'`

        Returns:
            Tuple (num_batches, dataloaders)
        """
        dataloaders = self.request_dataloader(
            getattr(model, f'{mode}_dataloader'))

        if not isinstance(dataloaders, list):
            dataloaders = [dataloaders]

        # add samplers
        dataloaders = [
            self.auto_add_sampler(dl, train=False) for dl in dataloaders if dl
        ]

        num_batches = 0

        # determine number of batches
        # datasets could be none, 1 or 2+
        if len(dataloaders) != 0:
            for dataloader in dataloaders:
                if not _has_len(dataloader):
                    num_batches = float('inf')
                    break

            percent_check = getattr(self, f'{mode}_percent_check')

            if num_batches != float('inf'):
                self._percent_range_check(f'{mode}_percent_check')

                num_batches = sum(
                    len(dataloader) for dataloader in dataloaders)
                num_batches = int(num_batches * percent_check)
            elif percent_check not in (0.0, 1.0):
                raise MisconfigurationException(
                    'When using an infinite DataLoader (e.g. with an IterableDataset or when '
                    f'DataLoader does not implement `__len__`) for `{mode}_dataloader`, '
                    f'`Trainer({mode}_percent_check)` must be `0.0` or `1.0`.')
        return num_batches, dataloaders
Example #25
    def fit(self, model):
        # when using multi-node or DDP within a node start each module in a separate process
        if self.use_ddp:

            if self.is_slurm_managing_tasks:
                task = int(os.environ['SLURM_LOCALID'])
                self.ddp_train(task, model)
            else:
                nb_gpus = self.nb_requested_gpus
                nb_tasks = self.nb_slurm_tasks
                msg = f"""
                You requested {nb_gpus} GPUs but launched {nb_tasks} slurm tasks.
                We will launch {nb_gpus} processes for you.
                We recommend you let slurm manage the processes by setting:
                --ntasks-per-node={nb_gpus}
                If you're not using SLURM, ignore this message!
                """
                warnings.warn(msg)
                mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model, ))

        # 1 gpu or dp option triggers training using DP module
        # easier to avoid NCCL issues
        elif self.use_dp:
            self.__dp_train(model)

        elif self.single_gpu:
            self.__single_gpu_train(model)

        # ON CPU
        else:
            # run through amp wrapper
            if self.use_amp:
                raise MisconfigurationException('amp + cpu is not supported.'
                                                ' Please use a GPU option')

            # CHOOSE OPTIMIZER
            # allow for lr schedulers as well
            self.optimizers, self.lr_schedulers = self.init_optimizers(
                model.configure_optimizers())

            self.__run_pretrain_routine(model)

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        return 1
Example #26
    def get_dataloaders(self, model):
        """
        Dataloaders are provided by the model
        :param model:
        :return:
        """

        self.init_train_dataloader(model)
        self.init_test_dataloader(model)
        self.init_val_dataloader(model)

        if self.use_ddp or self.use_ddp2:
            # wait for all processes to catch up
            dist.barrier()

            # load each dataloader
            self.get_train_dataloader()
            self.get_test_dataloaders()
            self.get_val_dataloaders()

        # on TPUs load each dataloader only on process 0
        # this will trigger the data downloads
        if self.use_tpu and XLA_AVAILABLE:
            if self.tpu_local_core_rank == 0:
                self.get_train_dataloader()
                self.get_test_dataloaders()
                self.get_val_dataloaders()

            # wait for all processes to catch up
            torch_xla.core.xla_model.rendezvous(
                "pl.TrainerDataLoadingMixin.get_dataloaders")

        # support IterableDataset for train data
        self.is_iterable_train_dataloader = (EXIST_ITER_DATASET and isinstance(
            self.get_train_dataloader().dataset, IterableDataset))
        if self.is_iterable_train_dataloader and not isinstance(
                self.val_check_interval, int):
            m = '''
            When using an IterableDataset for `train_dataloader`,
            `Trainer(val_check_interval)` must be an int.
            An int k specifies checking validation every k training batches
            '''
            raise MisconfigurationException(m)
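The TPU branch is guarded by use_tpu and XLA_AVAILABLE and synchronises processes through torch_xla's rendezvous. A plausible definition of the XLA_AVAILABLE flag, assuming the usual optional-import guard:

try:
    import torch_xla
    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False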
Example #27
    def fit(self, model):
        # when using multi-node or DDP within a node start each module in a separate process
        if self.use_ddp2:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)

        elif self.use_ddp:
            if self.is_slurm_managing_tasks:
                task = int(os.environ['SLURM_LOCALID'])
                self.ddp_train(task, model)
            else:
                mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model, ))

        # 1 gpu or dp option triggers training using DP module
        # easier to avoid NCCL issues
        elif self.use_dp:
            self.dp_train(model)

        elif self.single_gpu:
            self.single_gpu_train(model)

        # ON CPU
        else:
            # run through amp wrapper
            if self.use_amp:
                raise MisconfigurationException(
                    'amp + cpu is not supported.  Please use a GPU option')

            # CHOOSE OPTIMIZER
            # allow for lr schedulers as well
            self.optimizers, self.lr_schedulers = self.init_optimizers(
                model.configure_optimizers())

            self.run_pretrain_routine(model)

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        return 1
Example #28
    def run_pretrain_routine(self, model):
        """Sanity check a few things before starting actual training.

        :param model:
        """
        ref_model = model
        if self.data_parallel:
            ref_model = model.module

        # give model convenience properties
        ref_model.trainer = self

        # set local properties on the model
        self.copy_trainer_model_properties(ref_model)

        # link up experiment object
        if self.logger is not None:
            ref_model.logger = self.logger

            # save exp to get started
            if hasattr(ref_model, "hparams"):
                self.logger.log_hyperparams(ref_model.hparams)

            self.logger.save()

        if self.use_ddp or self.use_ddp2:
            dist.barrier()

        # set up checkpoint callback
        self.configure_checkpoint_callback()

        # register auto-resubmit when on SLURM
        self.register_slurm_signal_handlers()

        # transfer data loaders from model
        self.get_dataloaders(ref_model)

        # print model summary
        if self.proc_rank == 0 and self.weights_summary is not None:
            if self.weights_summary in ['full', 'top']:
                ref_model.summarize(mode=self.weights_summary)
            else:
                m = "weights_summary can be None, 'full' or 'top'"
                raise MisconfigurationException(m)

        # track model now.
        # if cluster resets state, the model will update with the saved weights
        self.model = model

        # restore training and model before hpc call
        self.restore_weights(model)

        # when testing requested only run test and return
        if self.testing:
            self.run_evaluation(test=True)
            return

        # run tiny validation (if validation defined)
        # to make sure program won't crash during val
        ref_model.on_sanity_check_start()
        if (self.get_val_dataloaders() is not None
                and self.num_sanity_val_steps > 0):
            # init progress bars for validation sanity check
            pbar = tqdm.tqdm(desc='Validation sanity check',
                             total=self.num_sanity_val_steps,
                             leave=False,
                             position=2 * self.process_position,
                             disable=not self.show_progress_bar,
                             dynamic_ncols=True,
                             unit='batch')
            self.main_progress_bar = pbar
            # dummy validation progress bar
            self.val_progress_bar = tqdm.tqdm(disable=True)

            self.evaluate(model, self.get_val_dataloaders(),
                          self.num_sanity_val_steps, self.testing)

            # close progress bars
            self.main_progress_bar.close()
            self.val_progress_bar.close()

        # init progress bar
        pbar = tqdm.tqdm(leave=True,
                         position=2 * self.process_position,
                         disable=not self.show_progress_bar,
                         dynamic_ncols=True,
                         unit='batch',
                         file=sys.stdout)
        self.main_progress_bar = pbar

        # clear cache before training
        if self.on_gpu:
            torch.cuda.empty_cache()

        # CORE TRAINING LOOP
        self.train()
Example #29
    def run_evaluation(self, test_mode: bool = False):
        # when testing make sure user defined a test step
        if test_mode and not self.is_overriden('test_step'):
            m = "You called `.test()` without defining model's `.test_step()`." \
                " Please define and try again"
            raise MisconfigurationException(m)

        # Validation/Test begin callbacks
        if test_mode:
            self.on_test_start()
        else:
            self.on_validation_start()

        # hook
        model = self.get_model()
        model.on_pre_performance_check()

        # select dataloaders
        if test_mode:
            if self.reload_dataloaders_every_epoch or self.test_dataloaders is None:
                self.reset_test_dataloader(model)

            dataloaders = self.test_dataloaders
            max_batches = self.num_test_batches
        else:
            # val
            if self.reload_dataloaders_every_epoch or self.val_dataloaders is None:
                self.reset_val_dataloader(model)

            dataloaders = self.val_dataloaders
            max_batches = self.num_val_batches

        # cap max batches to 1 when using fast_dev_run
        if self.fast_dev_run:
            max_batches = 1

        # init validation or test progress bar
        # main progress bar will already be closed when testing so initial position is free
        position = 2 * self.process_position + (not test_mode)
        desc = 'Testing' if test_mode else 'Validating'
        pbar = tqdm(desc=desc,
                    total=max_batches,
                    leave=test_mode,
                    position=position,
                    disable=not self.show_progress_bar,
                    dynamic_ncols=True,
                    file=sys.stdout)
        setattr(self, f'{"test" if test_mode else "val"}_progress_bar', pbar)

        # run evaluation
        eval_results = self.evaluate(self.model, dataloaders, max_batches,
                                     test_mode)
        _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(
            eval_results)

        # add metrics to prog bar
        self.add_tqdm_metrics(prog_bar_metrics)

        # log metrics
        self.log_metrics(log_metrics, {})

        # track metrics for callbacks
        self.callback_metrics.update(callback_metrics)

        # hook
        model.on_post_performance_check()

        # add model specific metrics
        if not test_mode:
            self.main_progress_bar.set_postfix(**self.training_tqdm_dict)

        # close progress bar
        if test_mode:
            self.test_progress_bar.close()
        else:
            self.val_progress_bar.close()

        # model checkpointing
        if self.proc_rank == 0 and self.checkpoint_callback is not None and not test_mode:
            self.checkpoint_callback.on_validation_end(self, self.get_model())

        # Validation/Test end callbacks
        if test_mode:
            self.on_test_end()
        else:
            self.on_validation_end()
Example #30
    def fit(
            self,
            model: LightningModule,
            train_dataloader: Optional[DataLoader] = None,
            val_dataloaders: Optional[DataLoader] = None,
            test_dataloaders: Optional[DataLoader] = None
    ):
        r"""
        Runs the full optimization routine.

        Args:
            model: Model to fit.

            train_dataloader: A Pytorch
                DataLoader with training samples. If the model has
                a predefined train_dataloader method this will be skipped.

            val_dataloaders: Either a single
                Pytorch Dataloader or a list of them, specifying validation samples.
                If the model has a predefined val_dataloaders method this will be skipped

            test_dataloaders: Either a single
                Pytorch Dataloader or a list of them, specifying test samples.
                If the model has a predefined test_dataloaders method this will be skipped

        Example::

            # Option 1,
            # Define the train_dataloader(), test_dataloader() and val_dataloader() functions
            # in the LightningModule
            # RECOMMENDED FOR MOST RESEARCH AND APPLICATIONS TO MAINTAIN READABILITY
            trainer = Trainer()
            model = LightningModule()
            trainer.fit(model)

            # Option 2
            # in production cases we might want to pass different datasets to the same model
            # Recommended for PRODUCTION SYSTEMS
            train, val, test = DataLoader(...), DataLoader(...), DataLoader(...)
            trainer = Trainer()
            model = LightningModule()
            trainer.fit(model, train_dataloader=train,
                        val_dataloaders=val, test_dataloaders=test)

            # Option 1 & 2 can be mixed, for example the training set can be
            # defined as part of the model, and validation/test can then be
            # fed to .fit()

        """
        # Fit begin callbacks
        self.on_fit_start()

        # set up the passed in dataloaders (if needed)
        self.__set_fit_dataloaders(model, train_dataloader, val_dataloaders, test_dataloaders)

        # route to appropriate start method
        # when using multi-node or DDP within a node start each module in a separate process
        if self.use_ddp2:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)

        elif self.use_ddp:
            if self.is_slurm_managing_tasks:
                task = int(os.environ['SLURM_LOCALID'])
                self.ddp_train(task, model)
            else:
                mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

        # 1 gpu or dp option triggers training using DP module
        # easier to avoid NCCL issues
        elif self.use_dp:
            self.dp_train(model)

        elif self.single_gpu:
            self.single_gpu_train(model)

        elif self.use_tpu:
            log.info(f'training on {self.num_tpu_cores} TPU cores')

            #  COLAB_GPU is an env var available by default in Colab environments.
            start_method = 'fork' if os.getenv('COLAB_GPU') else 'spawn'
            xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)

        # ON CPU
        else:
            # run through amp wrapper
            if self.use_amp:
                raise MisconfigurationException('amp + cpu is not supported.  Please use a GPU option')

            # CHOOSE OPTIMIZER
            # allow for lr schedulers as well
            self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

            self.run_pretrain_routine(model)

        # Fit end callbacks
        self.on_fit_end()

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        return 1