    def __dp_train(self, model):

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well (a sketch of the expected
        # configure_optimizers return value follows after this method)
        self.optimizers, self.lr_schedulers = self.init_optimizers(
            model.configure_optimizers())

        # move the model to the root GPU before wrapping it in DataParallel
        root_gpu = 0
        if isinstance(self.data_parallel_device_ids, list):
            root_gpu = self.data_parallel_device_ids[0]
        model.cuda(root_gpu)

        # check for this bug (amp + DataParallel at opt levels other than O1 doesn't work)
        # https://github.com/NVIDIA/apex/issues/227
        if self.use_dp and self.use_amp:
            m = f"""
            Amp level {self.amp_level} with DataParallel is not supported.
            See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
            We recommend you switch to ddp if you want to use amp
            """
            raise MisconfigurationException(m)

        model = LightningDataParallel(model,
                                      device_ids=self.data_parallel_device_ids)

        self.__run_pretrain_routine(model)
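
# A minimal sketch (an assumption for illustration, not code from this repo) of the
# kind of return value `model.configure_optimizers()` is expected to hand back so
# that `init_optimizers` above can unpack both optimizers and lr schedulers.
# `_SketchModule` and its layer are illustrative names only.
import torch


class _SketchModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 2)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20])
        # returning two lists lets the trainer track optimizers and schedulers separately
        return [optimizer], [scheduler]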
# Example 2
    def fit(self, model):

        # give model convenience properties
        model.trainer = self
        model.experiment = self.experiment

        # transfer data loaders from model
        self.__get_dataloaders(model)

        # init training constants
        self.__layout_bookeeping(model)

        # CHOOSE OPTIMIZER
        # ask the LightningModule for its optimizers
        self.optimizers = model.configure_optimizers()

        if self.use_amp:
            # wrap the model and the first optimizer with NVIDIA apex for mixed
            # precision (apex opt levels are "O0"-"O3"; a standalone sketch of this
            # pattern follows after this method)
            model, optimizer = amp.initialize(
                model,
                self.optimizers[0],
                opt_level=self.amp_level,
            )
            self.optimizers[0] = optimizer
            model.trainer = self

        # add lr schedulers
        if self.lr_scheduler_milestones is not None:
            for optimizer in self.optimizers:
                scheduler = MultiStepLR(optimizer,
                                        self.lr_scheduler_milestones)
                self.lr_schedulers.append(scheduler)

        # print model summary
        model.summarize()

        # put on gpu if needed
        if self.on_gpu:
            model = LightningDataParallel(
                model, device_ids=self.data_parallel_device_ids)

        # run a quick sanity-check validation pass to make sure the program won't crash during validation
        _ = self.validate(model,
                          self.val_dataloader,
                          max_batches=self.nb_sanity_val_steps)

        # save exp to get started
        self.experiment.save()

        # enable cluster checkpointing
        if self.cluster is not None:
            self.enable_auto_hpc_walltime_manager()

        # ---------------------------
        # CORE TRAINING LOOP
        # ---------------------------
        self.model = model
        self.__train()
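
# A standalone sketch of the apex mixed-precision pattern that the `use_amp`
# branch above follows (assumes NVIDIA apex is installed and a GPU is available;
# the model and optimizer here are illustrative, not taken from this repo):
import torch
from apex import amp

model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# amp.initialize returns patched versions of both; opt_level "O1" is the usual
# mixed-precision setting (letter O, not the digit zero)
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

loss = model(torch.randn(4, 10).cuda()).sum()
# the loss must be scaled through amp so gradients are handled in mixed precision
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()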
# Example 3
    def __dp_train(self, model):

        # CHOOSE OPTIMIZER
        # ask the LightningModule for its optimizers
        self.optimizers = model.configure_optimizers()

        # move the model to the root GPU before wrapping it in DataParallel
        model.cuda(self.data_parallel_device_ids[0])

        # check for this bug (amp + DataParallel at opt levels other than O1 doesn't work)
        # https://github.com/NVIDIA/apex/issues/227
        if self.use_dp and self.use_amp:
            m = f'Amp level {self.amp_level} with DataParallel is not supported. ' \
                'See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. ' \
                'We recommend you switch to DDP if you want to use amp.'
            raise MisconfigurationException(m)

        model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)

        self.__run_pretrain_routine(model)
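
# A rough sketch (an assumption about the general idea, not the library's actual
# implementation) of what a wrapper like `LightningDataParallel` needs to do:
# behave like `torch.nn.DataParallel`, but have each replica run the module's
# `training_step` instead of plain `forward`.
import torch.nn as nn


class SketchLightningDataParallel(nn.DataParallel):
    def forward(self, *inputs, **kwargs):
        # scatter the inputs across the configured devices
        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
        if len(self.device_ids) == 1:
            return self.module.training_step(*inputs[0], **kwargs[0])
        # replicate the module and run training_step on each replica (sequentially
        # here for clarity; DataParallel itself runs replicas in parallel threads),
        # then gather the per-replica outputs onto the output device
        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
        outputs = [r.training_step(*i, **k) for r, i, k in zip(replicas, inputs, kwargs)]
        return self.gather(outputs, self.output_device)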