Example #1
    def run(self):
        ## init distributed
        self.cfg = init_distributed(self.cfg)

        cfg = self.cfg
        # cfg.print()

        ## parser_dict
        self.dictionary = self._parser_dict()

        ## parser_datasets
        datasets, dataloaders, data_samplers = self._parser_datasets()
        # dataset_sizes = {x: len(datasets[x]) for x in ['train', 'val']}
        # class_names = datasets['train'].classes

        ## parser_model
        model_ft = self._parser_model()

        ## parser_optimizer
        # Scale learning rate based on global batch size
        # cfg.INIT_LR = cfg.INIT_LR * float(self.batch_size_all) / 256
        optimizer_ft = parser_optimizer(cfg, model_ft)

        ## parser_lr_scheduler
        lr_scheduler_ft = parser_lr_scheduler(cfg, optimizer_ft)
        '''
        # Scheduler https://arxiv.org/pdf/1812.01187.pdf
        # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
        lf = lambda x: (((1 + math.cos(x * math.pi / self.cfg.N_MAX_EPOCHS)) / 2) ** 1.0) * 0.8 + 0.2  # cosine
        lr_scheduler_ft = lr_scheduler.LambdaLR(optimizer_ft, lr_lambda=lf)
        '''

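        # wrap the model in DistributedDataParallel so each process drives one GPU (cfg.local_rank)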
        if cfg.distributed:
            model_ft = DDP(model_ft,
                           device_ids=[cfg.local_rank],
                           output_device=(cfg.local_rank))

        # Freeze
        freeze = [
            '',
        ]  # parameter names to freeze, matched as full or partial substrings (e.g. 'backbone.'); with only '' here nothing is frozen
        if any(freeze):
            for k, v in model_ft.named_parameters():
                if any(x in k for x in freeze):
                    print('freezing %s' % k)
                    v.requires_grad = False

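        # warm-start from PRETRAIN_MODEL; with RESUME also restore optimizer/scheduler state and the last epoch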
        if self.cfg.PRETRAIN_MODEL is not None:
            if self.cfg.RESUME:
                self.start_epoch = self.ckpts.load_checkpoint(
                    self.cfg.PRETRAIN_MODEL, model_ft, optimizer_ft,
                    lr_scheduler_ft)
            else:
                self.ckpts.load_checkpoint(self.cfg.PRETRAIN_MODEL, model_ft)

        ## vis net graph
        if self.cfg.TENSORBOARD_MODEL and False:  # note: 'and False' keeps graph logging disabled
            self.tb_writer.add_graph(model_ft, (model_ft.dummy_input.cuda(), ))

        self.n_steps_per_epoch = int(
            ceil(sum(len(t) for t in datasets['train'])))

        best_acc = 0.0
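        # gradient scaler for automatic mixed precision; it is passed into train_epoch (see the sketch after this example)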
        scaler = amp.GradScaler(enabled=True)
        for epoch in range(self.start_epoch + 1, self.cfg.N_MAX_EPOCHS):
            if cfg.distributed:
                dataloaders['train'].sampler.set_epoch(epoch)
            self.train_epoch(scaler, epoch, model_ft, dataloaders['train'],
                             optimizer_ft)
            lr_scheduler_ft.step()

            if self.cfg.DATASET.VAL:
                acc = self.val_epoch(epoch, model_ft, dataloaders['val'])

                if cfg.local_rank == 0:
                    # start saving the best-performing model after the learning rate has decayed to 1e-6
                    if best_acc < acc:
                        self.ckpts.autosave_checkpoint(model_ft, epoch, 'best',
                                                       optimizer_ft,
                                                       lr_scheduler_ft)
                        best_acc = acc
                        # continue

            if not epoch % cfg.N_EPOCHS_TO_SAVE_MODEL:
                if cfg.local_rank == 0:
                    self.ckpts.autosave_checkpoint(model_ft, epoch, 'autosave',
                                                   optimizer_ft,
                                                   lr_scheduler_ft)

        if cfg.local_rank == 0:
            self.tb_writer.close()

        if cfg.local_rank != 0:
            dist.destroy_process_group()
        torch.cuda.empty_cache()
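
train_epoch receives the GradScaler but its body is not shown above; the following is a minimal sketch, under the assumption that it performs the standard PyTorch automatic mixed precision step (amp_train_step, model, images, targets, criterion and optimizer are illustrative names, not taken from the source):

    # Sketch of the assumed AMP step inside train_epoch (illustrative only).
    from torch.cuda import amp

    def amp_train_step(model, images, targets, criterion, optimizer, scaler):
        optimizer.zero_grad()
        with amp.autocast(enabled=True):   # forward pass in mixed precision
            outputs = model(images.cuda(non_blocking=True))
            loss = criterion(outputs, targets.cuda(non_blocking=True))
        scaler.scale(loss).backward()      # scale the loss so fp16 gradients do not underflow
        scaler.step(optimizer)             # unscale and apply the update unless inf/nan gradients were found
        scaler.update()                    # adjust the loss scale for the next iteration
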
Example #2
    def run(self):
        cfg = self.cfg
        # cfg.print()

        ## parser_dict
        dictionary = self._parser_dict()

        ## parser_datasets
        datasets, dataloaders = self._parser_datasets()
        # dataset_sizes = {x: len(datasets[x]) for x in ['train', 'val']}
        # class_names = datasets['train'].classes

        ## parser_model
        model_ft = self._parser_model(dictionary)

        ## parser_optimizer
        optimizer_ft = parser_optimizer(cfg, model_ft)

        ## parser_lr_scheduler
        lr_scheduler_ft = parser_lr_scheduler(cfg, optimizer_ft)

        if self.cfg.PRETRAIN_MODEL is not None:
            if self.cfg.RESUME:
                self.start_epoch = self.checkpoints.load_checkpoint(
                    self.cfg.PRETRAIN_MODEL, model_ft, optimizer_ft,
                    lr_scheduler_ft)
            else:
                self.checkpoints.load_checkpoint(self.cfg.PRETRAIN_MODEL,
                                                 model_ft)

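        # move the model and any checkpoint-restored optimizer state tensors onto the GPU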
        if torch.cuda.is_available():
            model_ft = model_ft.cuda()
            cudnn.benchmark = True
            for state in optimizer_ft.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        ## vis net graph
        if self.cfg.TENSORBOARD_MODEL and False:
            self.tb_writer.add_graph(model_ft, (model_ft.dummy_input.cuda(), ))

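        # optionally cast the model to fp16; inputs must then be fed as fp16 as well (see the note after this example)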
        if self.cfg.HALF:
            model_ft.half()

        self.n_steps_per_epoch = int(
            ceil(sum(len(t) for t in datasets['train'])))

        best_acc = 0.0
        for epoch in range(self.start_epoch + 1, self.cfg.N_MAX_EPOCHS):
            self.train_epoch(epoch, model_ft, dataloaders['train'],
                             optimizer_ft, lr_scheduler_ft, None)
            if self.cfg.DATASET.VAL:
                acc = self.val_epoch(epoch,
                                     model_ft,
                                     dataloaders['val'],
                                     optimizer=optimizer_ft,
                                     lr_scheduler=lr_scheduler_ft)
                # start saving the best-performing model after the learning rate has decayed to 1e-6
                if best_acc < acc:
                    self.checkpoints.autosave_checkpoint(
                        model_ft, epoch, 'best', optimizer_ft, lr_scheduler_ft)
                    best_acc = acc
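                    # a new best checkpoint was just written, so skip this epoch's periodic autosave below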
                    continue

            if not epoch % cfg.N_EPOCHS_TO_SAVE_MODEL:
                self.checkpoints.autosave_checkpoint(model_ft, epoch,
                                                     'autosave', optimizer_ft,
                                                     lr_scheduler_ft)

        self.tb_writer.close()
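
A caveat on the HALF branch above: after model_ft.half() the weights are fp16, so the batches fed to the model must be cast to fp16 as well. A minimal sketch of the assumed input handling (the batch variable names are illustrative, not from the source):

    # Assumed input preparation when cfg.HALF is enabled (illustrative only).
    images = images.cuda(non_blocking=True)
    if cfg.HALF:
        images = images.half()   # match the fp16 weights produced by model_ft.half()
    outputs = model_ft(images)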