Example #1
    def fit_model(self):
        """
        Fits the model using the AdamW optimizer, stochastic weight averaging (SWA), and a cosine annealing warm-restarts learning rate schedule.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.001)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, 100, 2
        )

        self.swa_model = AveragedModel(self.model)
        swa_start = 750
        swa_scheduler = SWALR(
            optimizer, swa_lr=0.001, anneal_epochs=10, anneal_strategy="cos"
        )

        self.model.train()
        self.swa_model.train()
        for epoch in range(1000):
            optimizer.zero_grad()
            output = self.model(self.x)

            loss = -output.log_prob(self.y.view(-1, 1)).sum()

            loss.backward()
            optimizer.step()

            if epoch > swa_start:
                self.swa_model.update_parameters(self.model)
                swa_scheduler.step()
            else:
                scheduler.step()

            if epoch % 10 == 0:
                print(f"Epoch {epoch} complete. Loss: {loss}")
Example #2
    def __init__(self, cfg_dir: str):
        # load config file and initialize the logger and the device
        self.cfg = get_conf(cfg_dir)
        self.logger = self.init_logger(self.cfg.logger)
        self.device = self.init_device()
        # create the dataset interface and dataloaders for training and validation data
        self.data, self.val_data = self.init_dataloader()
        # create model and initialize its weights and move them to the device
        self.model = self.init_model()
        # initialize the optimizer
        self.optimizer, self.lr_scheduler = self.init_optimizer()
        # define loss function
        self.criterion = torch.nn.CrossEntropyLoss()
        # if resuming, load the checkpoint
        self.if_resume()

        # initialize the early_stopping object
        self.early_stopping = EarlyStopping(
            patience=self.cfg.train_params.patience,
            verbose=True,
            delta=self.cfg.train_params.early_stopping_delta,
        )

        # stochastic weight averaging
        if self.cfg.train_params.epochs > self.cfg.train_params.swa_start:
            self.swa_model = AveragedModel(self.model)
            self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)
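The `**self.cfg.SWA` expansion above assumes the config section holds keyword arguments accepted by SWALR. A hedged, self-contained illustration of such a mapping (the field names follow torch.optim.swa_utils.SWALR's signature; the values and the stand-in model are placeholders, not taken from the original project):

import torch
from torch.optim.swa_utils import SWALR

model = torch.nn.Linear(4, 2)                      # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
swa_cfg = {"swa_lr": 0.05, "anneal_epochs": 10, "anneal_strategy": "cos"}
swa_scheduler = SWALR(optimizer, **swa_cfg)        # mirrors SWALR(self.optimizer, **self.cfg.SWA)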
Example #3
 def training_epoch_end(self, outputs):
     self.log('epoch_now',
              self.current_epoch,
              on_step=False,
              on_epoch=True,
              logger=True)
     oppp = self.optimizers(use_pl_optimizer=True)
     self.log('lr_now',
              self.get_lr_inside(oppp),
              on_step=False,
              on_epoch=True,
              logger=True)
     # https://github.com/PyTorchLightning/pytorch-lightning/issues/3095
     if self.learning_params["swa"] and (
             self.current_epoch >= self.learning_params["swa_start_epoch"]):
         if self.swa_model is None:
             optimizer = self.optimizers(use_pl_optimizer=True)
             print("creating_swa")
             self.swa_model = AveragedModel(self.network)
             self.new_scheduler = SWALR(
                 optimizer,
                 anneal_strategy="linear",
                 anneal_epochs=5,
                 swa_lr=self.learning_params["swa_lr"])
         # https://pytorch.org/blog/pytorch-1.6-now-includes-stochastic-weight-averaging/
         self.swa_model.update_parameters(self.network)
         self.new_scheduler.step()
Example #4
    def __init__(self, cfg_dir: str, data_loader: DataLoader, model,
                 labels_definition):
        self.cfg = get_conf(cfg_dir)
        self._labels_definition = labels_definition
        #TODO
        self.logger = self.init_logger(self.cfg.logger)
        #self.dataset = CustomDataset(**self.cfg.dataset)
        self.data = data_loader
        #self.val_dataset = CustomDatasetVal(**self.cfg.val_dataset)
        #self.val_data = DataLoader(self.val_dataset, **self.cfg.dataloader)
        # self.logger.log_parameters({"tr_len": len(self.dataset),
        #                             "val_len": len(self.val_dataset)})
        self.model = model
        #self.model._resnet.conv1.apply(init_weights_normal)
        self.device = self.cfg.train_params.device
        self.model = self.model.to(device=self.device)
        if self.cfg.train_params.optimizer.lower() == "adam":
            self.optimizer = optim.Adam(self.model.parameters(),
                                        **self.cfg.adam)
        elif self.cfg.train_params.optimizer.lower() == "rmsprop":
            self.optimizer = optim.RMSprop(self.model.parameters(),
                                           **self.cfg.rmsprop)
        else:
            raise ValueError(
                f"Unknown optimizer {self.cfg.train_params.optimizer}")

        self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, T_max=100)
        self.criterion = nn.BCELoss()

        if self.cfg.logger.resume:
            # load checkpoint
            print("Loading checkpoint")
            save_dir = self.cfg.directory.load
            checkpoint = load_checkpoint(save_dir, self.device)
            self.model.load_state_dict(checkpoint["model"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])
            self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
            self.epoch = checkpoint["epoch"]
            self.e_loss = checkpoint["e_loss"]
            self.best = checkpoint["best"]
            print(
                f"{datetime.now():%Y-%m-%d %H:%M:%S} "
                f"Loading checkpoint was successful, start from epoch {self.epoch}"
                f" and loss {self.best}")
        else:
            self.epoch = 1
            self.best = np.inf
            self.e_loss = []

        # initialize the early_stopping object
        self.early_stopping = EarlyStopping(
            patience=self.cfg.train_params.patience,
            verbose=True,
            delta=self.cfg.train_params.early_stopping_delta,
        )

        # stochastic weight averaging
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)
Example #5
def train(num_epochs, model, data_loader, val_loader, val_every, device, file_name):
    learning_rate = 0.0001
    from torch.optim.swa_utils import AveragedModel, SWALR
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    criterion = [SoftCrossEntropyLoss(smooth_factor=0.1), JaccardLoss('multiclass', classes=12)]
    optimizer = AdamP(params=model.parameters(), lr=learning_rate, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=learning_rate)
    swa_model = AveragedModel(model)
    look = Lookahead(optimizer, la_alpha=0.5)

    print('Start training..')
    best_miou = 0
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))
        model.train()
        for step, (images, masks, _) in enumerate(data_loader):
            loss = 0
            images = torch.stack(images)  # (batch, channel, height, width)
            masks = torch.stack(masks).long()  # (batch, channel, height, width)

            # move tensors to the device for GPU computation
            images, masks = images.to(device), masks.to(device)

            # inference
            outputs = model(images)
            for i in criterion:
                loss += i(outputs, masks)
            # compute the loss (cross entropy loss)

            look.zero_grad()
            loss.backward()
            look.step()

            outputs = torch.argmax(outputs.squeeze(), dim=1).detach().cpu().numpy()
            hist = add_hist(hist, masks.detach().cpu().numpy(), outputs, n_class=12)
            acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
            # print loss and mIoU every fixed number of steps
            if (step + 1) % 25 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU: {:.4f}'.format(
                    epoch + 1, num_epochs, step + 1, len(data_loader), loss.item(), mIoU))

        # run validation at the given interval, print the loss, and save the best model
        if (epoch + 1) % val_every == 0:
            avrg_loss, val_miou = validation(epoch + 1, model, val_loader, criterion, device)
            if val_miou > best_miou:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_miou = val_miou
                save_model(model, file_name = file_name)

        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
Example #6
    def test_fit_swa_cuda(self):
        for model_name in supported_tv_models:
            model = cnn.create_cnn(model_name, 10, pretrained=None)
            opt = torch.optim.Adam(model.parameters(), lr=1e-3)
            loss = nn.CrossEntropyLoss()
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt,
                                                                   T_max=300)
            swa_scheduler = SWALR(opt,
                                  anneal_strategy="linear",
                                  anneal_epochs=20,
                                  swa_lr=0.05)
            swa_start = 2
            history = cnn.fit(model,
                              3,
                              train_loader,
                              val_loader,
                              loss,
                              device="cpu",
                              optimizer=opt,
                              scheduler=scheduler,
                              num_batches=10,
                              swa_start=swa_start,
                              swa_scheduler=swa_scheduler)
            self.assertIsInstance(history, Dict)
            exp_keys = ("train", "val")
            for exp_k in exp_keys:
                self.assertTrue(exp_k in history.keys())

            exp_keys2 = ("top1_acc", "top5_acc", "loss")
            for exp_k2 in exp_keys2:
                self.assertTrue(exp_k2 in history["train"].keys())
                self.assertTrue(exp_k2 in history["val"].keys())
Example #7
    def on_train_epoch_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'):
        if trainer.current_epoch == self.swa_start:
            # move the average model to the requested device.
            self._average_model = self._average_model.to(self._device or pl_module.device)

            optimizers = trainer.optimizers

            for param_group in optimizers[0].param_groups:
                if self._swa_lrs is None:
                    initial_lr = param_group["lr"]

                elif isinstance(self._swa_lrs, float):
                    initial_lr = self._swa_lrs

                else:
                    initial_lr = self._swa_lrs[0]

                param_group["initial_lr"] = initial_lr

            self._swa_lrs = initial_lr

            self._swa_scheduler = SWALR(
                optimizers[0],
                swa_lr=initial_lr,
                anneal_epochs=self._annealing_epochs,
                anneal_strategy=self._annealing_strategy,
                last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1
            )
            _scheduler_config = _get_default_scheduler_config()
            assert _scheduler_config["interval"] == "epoch" and _scheduler_config["frequency"] == 1
            _scheduler_config["scheduler"] = self._swa_scheduler

            if trainer.lr_schedulers:
                lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
                rank_zero_warn(f"Swapping lr_scheduler {lr_scheduler} for {self._swa_scheduler}")
                trainer.lr_schedulers[0] = _scheduler_config
            else:
                trainer.lr_schedulers.append(_scheduler_config)

            self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device)

        if self.swa_start <= trainer.current_epoch <= self.swa_end:
            self.update_parameters(self._average_model, pl_module, self.n_averaged, self.avg_fn)

        # Note: No > here in case the callback is saved with the model and training continues
        if trainer.current_epoch == self.swa_end + 1:

            # Transfer weights from average model to pl_module
            self.transfer_weights(self._average_model, pl_module)

            # Reset BatchNorm for update
            self.reset_batch_norm_and_save_state(pl_module)

            # There is no need to perform either backward or optimizer.step as we are
            # performing only one pass over the train data-loader to compute activation statistics
            # Therefore, we will virtually increase `num_training_batches` by 1 and skip backward.
            trainer.num_training_batches += 1
            trainer.train_loop._skip_backward = True
            self._accumulate_grad_batches = trainer.accumulate_grad_batches
            trainer.accumulate_grad_batches = len(trainer.train_dataloader)
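This example (like Example #11 further down) is taken from the internals of Lightning's built-in StochasticWeightAveraging callback. From user code the same machinery is normally enabled by passing the callback to the Trainer; a minimal, hedged sketch in which the learning rate and start point are illustrative:

import pytorch_lightning as pl
from pytorch_lightning.callbacks import StochasticWeightAveraging

# Average weights over roughly the last quarter of training.
trainer = pl.Trainer(
    max_epochs=20,
    callbacks=[StochasticWeightAveraging(swa_lrs=1e-2, swa_epoch_start=0.75)],
)
# trainer.fit(lightning_module, datamodule)  # assuming a LightningModule and DataModule exist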
Example #8
    def _configure_optimizers(self, ) -> None:
        """Loads the optimizers."""
        if self._optimizer is not None:
            self._optimizer = self._optimizer(self._network.parameters(),
                                              **self.optimizer_args)
        else:
            self._optimizer = None

        if self._optimizer and self._lr_scheduler is not None:
            if "steps_per_epoch" in self.lr_scheduler_args:
                self.lr_scheduler_args["steps_per_epoch"] = len(
                    self.train_dataloader())

            # Assume lr scheduler should update at each epoch if not specified.
            if "interval" not in self.lr_scheduler_args:
                interval = "epoch"
            else:
                interval = self.lr_scheduler_args.pop("interval")
            self._lr_scheduler = {
                "lr_scheduler":
                self._lr_scheduler(self._optimizer, **self.lr_scheduler_args),
                "interval":
                interval,
            }

        if self.swa_args is not None:
            self._swa_scheduler = {
                "swa_scheduler": SWALR(self._optimizer,
                                       swa_lr=self.swa_args["lr"]),
                "swa_start": self.swa_args["start"],
            }
            self._swa_network = AveragedModel(self._network).to(self.device)
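The swa_args mapping above is read for exactly two keys. A hedged example of the structure this _configure_optimizers expects (the values are placeholders):

# Hypothetical swa_args: "lr" feeds SWALR(swa_lr=...), and "start" marks the
# epoch at which weight averaging is meant to begin.
swa_args = {"lr": 0.05, "start": 20}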
Example #9
def get_swa(optimizer,
            model,
            swa_lr=0.005,
            anneal_epochs=10,
            anneal_strategy="cos"):
    '''
    SWALR Arguments:
        optimizer (torch.optim.Optimizer): wrapped optimizer
        swa_lr (float or list): the learning rate value for all param groups
            together or separately for each group.
        anneal_epochs (int): number of epochs in the annealing phase 
            (default: 10)
        anneal_strategy (str): "cos" or "linear"; specifies the annealing 
            strategy: "cos" for cosine annealing, "linear" for linear annealing
            (default: "cos")
        last_epoch (int): the index of the last epoch (default: -1)
    
    '''
    swa_model = AveragedModel(model)
    # swa_scheduler = SWALR(optimizer, swa_lr=swa_lr)
    # swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, anneal_strategy="linear", anneal_epochs=5, swa_lr=swa_lr)
    swa_scheduler = SWALR(optimizer,
                          swa_lr=swa_lr,
                          anneal_epochs=anneal_epochs,
                          anneal_strategy=anneal_strategy)

    return swa_scheduler, swa_model
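A hedged usage sketch of get_swa inside the standard SWA recipe; the stand-in model, the epoch counts, and the commented update_bn call are assumptions rather than part of the original snippet:

import torch

model = torch.nn.Linear(10, 1)                       # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
swa_scheduler, swa_model = get_swa(optimizer, model, swa_lr=0.005)
swa_start, num_epochs = 75, 100                      # illustrative

for epoch in range(num_epochs):
    # ... usual forward/backward/optimizer.step() for one epoch ...
    if epoch > swa_start:
        swa_model.update_parameters(model)           # fold current weights into the average
        swa_scheduler.step()                         # anneal the lr toward swa_lr
# torch.optim.swa_utils.update_bn(train_loader, swa_model)  # if the model uses BatchNorm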
Example #10
def train_model(indep_vars, dep_var, verbose=True):
    """
    Trains the MDNVol network using the AdamW optimizer with a cosine annealing
    warm-restarts learning rate schedule. Outputs a model averaged over the last
    25% of training epochs.

    indep_vars: n x m torch tensor containing independent variables
        n = number of data points
        m = number of input variables
    dep_var: n x 1 torch tensor containing single dependent variable
        n = number of data points
        1 = single output variable
    """
    model = MDN(indep_vars.shape[1], 1, 250, 5)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, 100, 2)

    swa_model = AveragedModel(model)
    swa_start = 750
    swa_scheduler = SWALR(optimizer,
                          swa_lr=0.001,
                          anneal_epochs=10,
                          anneal_strategy="cos")

    model.train()
    swa_model.train()
    for epoch in range(1000):
        optimizer.zero_grad()
        output = model(indep_vars)

        loss = -output.log_prob(dep_var).sum()

        loss.backward()
        optimizer.step()

        if epoch > swa_start:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step()

        if epoch % 10 == 0:
            if verbose:
                print(f"Epoch {epoch} complete. Loss: {loss}")

    swa_model.eval()
    return swa_model
Example #11
    def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
        if trainer.current_epoch == self.swa_start:
            # move the average model to the requested device.
            self._average_model = self._average_model.to(self._device or pl_module.device)

            optimizer = trainer.optimizers[0]
            if self._swa_lrs is None:
                self._swa_lrs = [param_group["lr"] for param_group in optimizer.param_groups]
            if isinstance(self._swa_lrs, float):
                self._swa_lrs = [self._swa_lrs] * len(optimizer.param_groups)

            for lr, group in zip(self._swa_lrs, optimizer.param_groups):
                group["initial_lr"] = lr

            self._swa_scheduler = SWALR(
                optimizer,
                swa_lr=self._swa_lrs,
                anneal_epochs=self._annealing_epochs,
                anneal_strategy=self._annealing_strategy,
                last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1,
            )
            default_scheduler_cfg = _get_default_scheduler_config()
            assert default_scheduler_cfg["interval"] == "epoch" and default_scheduler_cfg["frequency"] == 1
            default_scheduler_cfg["scheduler"] = self._swa_scheduler

            if trainer.lr_schedulers:
                scheduler_cfg = trainer.lr_schedulers[0]
                if scheduler_cfg["interval"] != "epoch" or scheduler_cfg["frequency"] != 1:
                    rank_zero_warn(f"SWA is currently only supported every epoch. Found {scheduler_cfg}")
                rank_zero_info(
                    f"Swapping scheduler `{scheduler_cfg['scheduler'].__class__.__name__}`"
                    f" for `{self._swa_scheduler.__class__.__name__}`"
                )
                trainer.lr_schedulers[0] = default_scheduler_cfg
            else:
                trainer.lr_schedulers.append(default_scheduler_cfg)

            self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device)

        if self.swa_start <= trainer.current_epoch <= self.swa_end:
            self.update_parameters(self._average_model, pl_module, self.n_averaged, self.avg_fn)

        # Note: No > here in case the callback is saved with the model and training continues
        if trainer.current_epoch == self.swa_end + 1:

            # Transfer weights from average model to pl_module
            self.transfer_weights(self._average_model, pl_module)

            # Reset BatchNorm for update
            self.reset_batch_norm_and_save_state(pl_module)

            # There is no need to perform either backward or optimizer.step as we are
            # performing only one pass over the train data-loader to compute activation statistics
            # Therefore, we will virtually increase `num_training_batches` by 1 and skip backward.
            trainer.num_training_batches += 1
            trainer.fit_loop._skip_backward = True
            self._accumulate_grad_batches = trainer.accumulate_grad_batches

            trainer.accumulate_grad_batches = trainer.num_training_batches
Example #12
 def configure_optimizers(self):
     optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr)
     if self.hparams.use_swa:
         self.swa_scheduler = SWALR(
             optimizer, swa_lr=self.hparams.swa_lr,
             anneal_strategy='linear', anneal_epochs=10
         )
     return optimizer
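Because the SWALR instance above is stored on self rather than returned to Lightning, it has to be stepped manually once the SWA phase begins, paired with an AveragedModel (Example #3 follows the same pattern). A hedged sketch of a matching epoch-end hook; swa_start_epoch and self.swa_model are assumptions, not attributes of the original module:

 def on_train_epoch_end(self):
     # Manual SWA phase: accumulate the running average and step the SWA scheduler.
     if self.hparams.use_swa and self.current_epoch >= self.hparams.swa_start_epoch:
         if getattr(self, "swa_model", None) is None:
             self.swa_model = AveragedModel(self)  # average the whole LightningModule
         self.swa_model.update_parameters(self)
         self.swa_scheduler.step()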
Example #13
class SWALRRunner(ClassificationRunner):
    def __init__(self, *args, **kwargs):
        super(SWALRRunner, self).__init__(*args, **kwargs)
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, swa_lr=0.05)
        self.swa_start = 5

    def update_scheduler(self, epoch: int) -> None:
        if epoch > self.swa_start:
            self.swa_model.update_parameters(self.model)
            self.swa_scheduler.step()

        else:
            super(SWALRRunner, self).update_scheduler(epoch)

    def train_end(self, outputs):
        update_bn(self.loaders["train"], self.swa_model)
        return super(SWALRRunner, self).train_end(outputs)
Example #14
def build_swa_model(cfg: CfgNode, model: torch.nn.Module,
                    optimizer: torch.optim.Optimizer):
    # Instead of copying weights during initialization, the SWA model copies
    # the model weights when self.update_parameters is first called.
    # https://github.com/pytorch/pytorch/blob/1.7/torch/optim/swa_utils.py#L107

    # The SWA model needs to be constructed for all processes in distributed
    # training, otherwise the training can get stuck.
    swa_model = AveragedModel(model)
    lr = cfg.SOLVER.BASE_LR
    lr *= cfg.SOLVER.SWA.LR_FACTOR
    swa_scheduler = SWALR(optimizer, swa_lr=lr)
    return swa_model, swa_scheduler
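A hedged sketch of how the pair returned by build_swa_model is typically consumed; the epoch count, swa_start, and train_loader are placeholders rather than values from the original config:

swa_model, swa_scheduler = build_swa_model(cfg, model, optimizer)
swa_start, num_epochs = 100, 120                    # illustrative

for epoch in range(num_epochs):
    # ... regular forward/backward/optimizer.step() for this epoch ...
    if epoch >= swa_start:
        swa_model.update_parameters(model)          # the first call copies the weights (see comment above)
        swa_scheduler.step()

# torch.optim.swa_utils.update_bn(train_loader, swa_model)  # refresh BN statistics before evaluation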
Example #15
    def __init__(self, config: DNNConfig):
        self.config = config
        self.epochs = config.epoch_num
        self.device = config.device

        self.model = tmp_model
        #self.criterion = CustomLoss()

        self.criterion = nn.MSELoss()

        optimizer_kwargs = {
            'lr': config.lr,
            'weight_decay': config.weight_decay
        }
        self.sam = config.issam
        self.optimizer = make_optimizer(self.model,
                                        optimizer_kwargs,
                                        optimizer_name=config.optimizer_name,
                                        sam=config.issam)
        self.scheduler_name = config.scheduler_name
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer=self.optimizer, T_max=config.T_max)

        self.isswa = getattr(config, 'isswa', False)
        self.swa_start = getattr(config, 'swa_start', 0)

        if config.isswa:
            self.swa_model = AveragedModel(self.model)
            self.swa_scheduler = SWALR(self.optimizer, swa_lr=0.025)

        #self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=self.optimizer,
        #                                                      mode=config.mode, factor=config.factor)

        self.loss_log = {
            'train_loss': [],
            'train_score': [],
            'valid_loss': [],
            'valid_score': []
        }
Example #16
    def __init__(self, config):
        self.config = config
        self.device =  'cuda' if cuda.is_available() else 'cpu'
        
        self.model = MLP(config)
        self.swa_model = AveragedModel(self.model)

        self.optimizer = make_optimizer(self.model, optimizer_name=self.config.optimizer, sam=self.config.sam)
        self.scheduler = make_scheduler(self.optimizer, decay_name=self.config.scheduler,
                                        num_training_steps=self.config.num_training_steps,
                                        num_warmup_steps=self.config.num_warmup_steps)
        self.swa_start = self.config.swa_start
        self.swa_scheduler = SWALR(self.optimizer, swa_lr=self.config.swa_lr)
        self.epoch_num = 0
        self.criterion = self.config.criterion
Example #17
 def configure_optimizers(self):
     if not hasattr(self, 'train_data'):
         self.prepare_data()
     if type(self.lr) is float:
         optimizer = torch.optim.AdamW([{'params': self.model.parameters(), 'lr': self.lr}])
     else:
         param_lr_mappings, self.lr = self.get_param_lr_maps(self.lr)
         optimizer = torch.optim.AdamW(param_lr_mappings)
     if self.use_one_cycle_lr_scheduler:
         return (
             [optimizer],
             [
                 torch.optim.lr_scheduler.OneCycleLR(
                     optimizer, self.lr, epochs=self.max_epochs,
                     steps_per_epoch=int(np.ceil(len(self.trainer.datamodule.train_dataset) / self.batch_size)),
                     div_factor=1e2
                 )
             ]
         )
     elif self.lr_decay:
         return (
             [optimizer],
             [
                 torch.optim.lr_scheduler.StepLR(
                     optimizer, step_size=self.lr_decay_period, gamma=self.lr_decay_gamma
                 )
             ]
         )
     elif self.swa:
         if type(self.lr) is float:
             optimizer = torch.optim.SGD([{'params': self.model.parameters(), 'lr': self.lr}])
         else:
             param_lr_mappings, self.lr = self.get_param_lr_maps(self.lr)
             optimizer = torch.optim.SGD(param_lr_mappings)
         return [optimizer], [SWALR(optimizer, swa_lr=self.swa_lr)]
     else:
         return optimizer
Example #18
    def __init__(self):

        if args.train is not None:
            self.train_tuple = get_tuple(args.train,
                                         bs=args.batch_size,
                                         shuffle=True,
                                         drop_last=False)

        if args.valid is not None:
            valid_bsize = 2048 if args.multiGPU else 50
            self.valid_tuple = get_tuple(args.valid,
                                         bs=valid_bsize,
                                         shuffle=False,
                                         drop_last=False)
        else:
            self.valid_tuple = None

        # Select Model, X is default
        if args.model == "X":
            self.model = ModelX(args)
        elif args.model == "V":
            self.model = ModelV(args)
        elif args.model == "U":
            self.model = ModelU(args)
        elif args.model == "D":
            self.model = ModelD(args)
        elif args.model == 'O':
            self.model = ModelO(args)
        else:
            print(args.model, " is not implemented.")

        # Load pre-trained weights from paths
        if args.loadpre is not None:
            self.model.load(args.loadpre)

        # GPU options
        if args.multiGPU:
            self.model.lxrt_encoder.multi_gpu()

        self.model = self.model.cuda()

        # Losses and optimizer
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.nllloss = nn.NLLLoss()

        if args.train is not None:
            batch_per_epoch = len(self.train_tuple.loader)
            self.t_total = int(batch_per_epoch * args.epochs // args.acc)
            print("Total Iters: %d" % self.t_total)

        def is_backbone(n):
            if "encoder" in n:
                return True
            elif "embeddings" in n:
                return True
            elif "pooler" in n:
                return True
            print("F: ", n)
            return False

        no_decay = ['bias', 'LayerNorm.weight']

        params = list(self.model.named_parameters())
        if args.reg:
            optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in params if is_backbone(n)],
                    "lr": args.lr
                },
                {
                    "params": [p for n, p in params if not is_backbone(n)],
                    "lr": args.lr * 500
                },
            ]

            for n, p in self.model.named_parameters():
                print(n)

            self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
        else:
            optimizer_grouped_parameters = [{
                'params':
                [p for n, p in params if not any(nd in n for nd in no_decay)],
                'weight_decay':
                args.wd
            }, {
                'params':
                [p for n, p in params if any(nd in n for nd in no_decay)],
                'weight_decay':
                0.0
            }]

            self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)

        if args.train is not None:
            self.scheduler = get_linear_schedule_with_warmup(
                self.optim, self.t_total * 0.1, self.t_total)

        self.output = args.output
        os.makedirs(self.output, exist_ok=True)

        # SWA Method:
        if args.contrib:
            self.optim = SWA(self.optim,
                             swa_start=self.t_total * 0.75,
                             swa_freq=5,
                             swa_lr=args.lr)

        if args.swa:
            self.swa_model = AveragedModel(self.model)
            self.swa_start = self.t_total * 0.75
            self.swa_scheduler = SWALR(self.optim, swa_lr=args.lr)
Example #19
class HM:
    def __init__(self):

        if args.train is not None:
            self.train_tuple = get_tuple(args.train,
                                         bs=args.batch_size,
                                         shuffle=True,
                                         drop_last=False)

        if args.valid is not None:
            valid_bsize = 2048 if args.multiGPU else 50
            self.valid_tuple = get_tuple(args.valid,
                                         bs=valid_bsize,
                                         shuffle=False,
                                         drop_last=False)
        else:
            self.valid_tuple = None

        # Select Model, X is default
        if args.model == "X":
            self.model = ModelX(args)
        elif args.model == "V":
            self.model = ModelV(args)
        elif args.model == "U":
            self.model = ModelU(args)
        elif args.model == "D":
            self.model = ModelD(args)
        elif args.model == 'O':
            self.model = ModelO(args)
        else:
            print(args.model, " is not implemented.")

        # Load pre-trained weights from paths
        if args.loadpre is not None:
            self.model.load(args.loadpre)

        # GPU options
        if args.multiGPU:
            self.model.lxrt_encoder.multi_gpu()

        self.model = self.model.cuda()

        # Losses and optimizer
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.nllloss = nn.NLLLoss()

        if args.train is not None:
            batch_per_epoch = len(self.train_tuple.loader)
            self.t_total = int(batch_per_epoch * args.epochs // args.acc)
            print("Total Iters: %d" % self.t_total)

        def is_backbone(n):
            if "encoder" in n:
                return True
            elif "embeddings" in n:
                return True
            elif "pooler" in n:
                return True
            print("F: ", n)
            return False

        no_decay = ['bias', 'LayerNorm.weight']

        params = list(self.model.named_parameters())
        if args.reg:
            optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in params if is_backbone(n)],
                    "lr": args.lr
                },
                {
                    "params": [p for n, p in params if not is_backbone(n)],
                    "lr": args.lr * 500
                },
            ]

            for n, p in self.model.named_parameters():
                print(n)

            self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
        else:
            optimizer_grouped_parameters = [{
                'params':
                [p for n, p in params if not any(nd in n for nd in no_decay)],
                'weight_decay':
                args.wd
            }, {
                'params':
                [p for n, p in params if any(nd in n for nd in no_decay)],
                'weight_decay':
                0.0
            }]

            self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)

        if args.train is not None:
            self.scheduler = get_linear_schedule_with_warmup(
                self.optim, self.t_total * 0.1, self.t_total)

        self.output = args.output
        os.makedirs(self.output, exist_ok=True)

        # SWA Method:
        if args.contrib:
            self.optim = SWA(self.optim,
                             swa_start=self.t_total * 0.75,
                             swa_freq=5,
                             swa_lr=args.lr)

        if args.swa:
            self.swa_model = AveragedModel(self.model)
            self.swa_start = self.t_total * 0.75
            self.swa_scheduler = SWALR(self.optim, swa_lr=args.lr)

    def train(self, train_tuple, eval_tuple):

        dset, loader, evaluator = train_tuple
        iter_wrapper = (lambda x: tqdm(x, total=len(loader))
                        ) if args.tqdm else (lambda x: x)

        print("Batches:", len(loader))

        self.optim.zero_grad()

        best_roc = 0.
        ups = 0

        total_loss = 0.

        for epoch in range(args.epochs):

            if args.reg:
                if args.model != "X":
                    print(self.model.model.layer_weights)

            id2ans = {}
            id2prob = {}

            for i, (ids, feats, boxes, sent,
                    target) in iter_wrapper(enumerate(loader)):

                if ups == args.midsave:
                    self.save("MID")

                self.model.train()

                if args.swa:
                    self.swa_model.train()

                feats, boxes, target = feats.cuda(), boxes.cuda(), target.long(
                ).cuda()

                # Model expects visual feats as tuple of feats & boxes
                logit = self.model(sent, (feats, boxes))

                # Note: LogSoftmax does not change order, hence there should be nothing wrong with taking it as our prediction
                # In fact ROC AUC stays the exact same for logsoftmax / normal softmax, but logsoftmax is better for loss calculation
                # due to stronger penalization & decomplexifying properties (log(a/b) = log(a) - log(b))
                logit = self.logsoftmax(logit)
                score = logit[:, 1]

                if i < 1:
                    print(logit[0, :].detach())

                # Note: This loss is the same as CrossEntropy (we split it up into logsoftmax & negative log likelihood loss)
                loss = self.nllloss(logit.view(-1, 2), target.view(-1))

                # Scale the loss by batch size (batches can differ in size since we do not "drop_last") and divide by acc for gradient accumulation
                # Not scaling the loss will worsen performance by ~2abs%
                loss = loss * logit.size(0) / args.acc
                loss.backward()

                total_loss += loss.detach().item()

                # Acts as argmax - extracting the higher score & the corresponding index (0 or 1)
                _, predict = logit.detach().max(1)
                # Getting labels for accuracy
                for qid, l in zip(ids, predict.cpu().numpy()):
                    id2ans[qid] = l
                # Getting probabilities for Roc auc
                for qid, l in zip(ids, score.detach().cpu().numpy()):
                    id2prob[qid] = l

                if (i + 1) % args.acc == 0:

                    nn.utils.clip_grad_norm_(self.model.parameters(),
                                             args.clip)

                    self.optim.step()

                    if (args.swa) and (ups > self.swa_start):
                        self.swa_model.update_parameters(self.model)
                        self.swa_scheduler.step()
                    else:
                        self.scheduler.step()
                    self.optim.zero_grad()

                    ups += 1

                    # Do Validation in between
                    if ups % 250 == 0:

                        log_str = "\nEpoch(U) %d(%d): Train AC %0.2f RA %0.4f LOSS %0.4f\n" % (
                            epoch, ups, evaluator.evaluate(id2ans) * 100,
                            evaluator.roc_auc(id2prob) * 100, total_loss)

                        # Set loss back to 0 after printing it
                        total_loss = 0.

                        if self.valid_tuple is not None:  # Do Validation
                            acc, roc_auc = self.evaluate(eval_tuple)
                            if roc_auc > best_roc:
                                best_roc = roc_auc
                                best_acc = acc
                                # Only save BEST when no midsave is specified to save space
                                #if args.midsave < 0:
                                #    self.save("BEST")

                            log_str += "\nEpoch(U) %d(%d): DEV AC %0.2f RA %0.4f \n" % (
                                epoch, ups, acc * 100., roc_auc * 100)
                            log_str += "Epoch(U) %d(%d): BEST AC %0.2f RA %0.4f \n" % (
                                epoch, ups, best_acc * 100., best_roc * 100.)

                        print(log_str, end='')

                        with open(self.output + "/log.log", 'a') as f:
                            f.write(log_str)
                            f.flush()

        if (epoch + 1) == args.epochs:
            if args.contrib:
                self.optim.swap_swa_sgd()

        self.save("LAST" + args.train)

    def predict(self, eval_tuple: DataTuple, dump=None, out_csv=True):

        dset, loader, evaluator = eval_tuple
        id2ans = {}
        id2prob = {}

        for i, datum_tuple in enumerate(loader):

            ids, feats, boxes, sent = datum_tuple[:4]

            self.model.eval()

            if args.swa:
                self.swa_model.eval()

            with torch.no_grad():

                feats, boxes = feats.cuda(), boxes.cuda()
                logit = self.model(sent, (feats, boxes))

                # Note: LogSoftmax does not change order, hence there should be nothing wrong with taking it as our prediction
                logit = self.logsoftmax(logit)
                score = logit[:, 1]

                if args.swa:
                    logit = self.swa_model(sent, (feats, boxes))
                    logit = self.logsoftmax(logit)

                _, predict = logit.max(1)

                for qid, l in zip(ids, predict.cpu().numpy()):
                    id2ans[qid] = l

                # Getting probas for Roc Auc
                for qid, l in zip(ids, score.cpu().numpy()):
                    id2prob[qid] = l

        if dump is not None:
            if out_csv == True:
                evaluator.dump_csv(id2ans, id2prob, dump)
            else:
                evaluator.dump_result(id2ans, dump)

        return id2ans, id2prob

    def evaluate(self, eval_tuple: DataTuple, dump=None):
        """Evaluate all data in data_tuple."""
        id2ans, id2prob = self.predict(eval_tuple, dump=dump)

        acc = eval_tuple.evaluator.evaluate(id2ans)
        roc_auc = eval_tuple.evaluator.roc_auc(id2prob)

        return acc, roc_auc

    def save(self, name):
        if args.swa:
            torch.save(self.swa_model.state_dict(),
                       os.path.join(self.output, "%s.pth" % name))
        else:
            torch.save(self.model.state_dict(),
                       os.path.join(self.output, "%s.pth" % name))

    def load(self, path):
        print("Load model from %s" % path)

        state_dict = torch.load("%s" % path)
        new_state_dict = {}
        for key, value in state_dict.items():
            # N_averaged is a key in SWA models we cannot load, so we skip it
            if key.startswith("n_averaged"):
                print("n_averaged:", value)
                continue
            # SWA Models will start with module
            if key.startswith("module."):
                new_state_dict[key[len("module."):]] = value
            else:
                new_state_dict[key] = value
        state_dict = new_state_dict
        self.model.load_state_dict(state_dict)
Example #20
def test_models():
    # print("Setting Seed for the run, seed = {}".format(SEED))
    # utils.seed_everything(SEED)
    # We don't need seeds for tests

    print("Creating Train and Validation Dataset")
    train_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))])
    valid_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))])

    train_set, valid_set = dataset.create_cifar10_dataset(
        train_transforms, valid_transforms)
    print("Train and Validation Datasets Created")

    print("Creating DataLoaders")
    train_loader, valid_loader = dataset.create_loaders(train_set, train_set)

    print("Train and Validation Dataloaders Created")
    print("Creating Model")

    all_supported_models = [
        "resnet18",
        # "resnet34",
        # "resnet50",
        # "resnet101",
        # "resnet152",
        # "resnext50_32x4d",
        # "resnext101_32x8d",
        # "vgg11",
        # "vgg13",
        # "vgg16",
        # "vgg19",
        # "mobilenet",
        # "mnasnet0_5",
        # "mnasnet1_0",
    ]

    for model_name in all_supported_models:
        model = model_factory.create_torchvision_model(
            model_name, num_classes=10, pretrained=False
        )  # We don't need pretrained True, we just need a forward pass

        if torch.cuda.is_available():
            print("Model Created. Moving it to CUDA")
        else:
            print("Model Created. Training on CPU only")
        model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=1e-3)

        criterion = (
            nn.CrossEntropyLoss()
        )  # Cross entropy loss, as for all classification problems here

        # early_stopper = utils.EarlyStopping(
        #     patience=7, verbose=True, path=SAVE_PATH
        # )
        # We do not need early stopping too

        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=300)

        swa_scheduler = SWALR(optimizer,
                              anneal_strategy="linear",
                              anneal_epochs=20,
                              swa_lr=0.05)
        swa_start = 2

        if torch.cuda.is_available():
            scaler = amp.GradScaler()

            train_metrics = engine.train_step(
                model,
                train_loader,
                criterion,
                device,
                optimizer,
                num_batches=10,
                fp16_scaler=scaler,
            )

            history2 = engine.fit(
                1,
                model,
                train_loader,
                valid_loader,
                criterion,
                device,
                optimizer,
                num_batches=10,
                grad_penalty=True,
                use_fp16=True,
            )

        train_metrics = engine.train_step(
            model,
            train_loader,
            criterion,
            device,
            optimizer,
            num_batches=10,
        )

        history = engine.sanity_fit(
            model,
            train_loader,
            valid_loader,
            criterion,
            device,
            num_batches=10,
            grad_penalty=True,
        )

        history2 = engine.fit(
            1,
            model,
            train_loader,
            valid_loader,
            criterion,
            device,
            optimizer,
            num_batches=10,
            grad_penalty=True,
        )

        history3 = engine.fit(
            3,
            model,
            train_loader,
            valid_loader,
            criterion,
            device,
            optimizer,
            scheduler=scheduler,
            num_batches=10,
            grad_penalty=True,
            swa_start=swa_start,
            swa_scheduler=swa_scheduler,
        )

    print("Done !!")
    return 1
Example #21
def training(model,
             train_dataloader,
             valid_dataloader,
             test_dataloader,
             model_cfg,
             fold_idx=1):

    print("--------  ", str(fold_idx), "  --------")
    global model_config
    model_config = model_cfg

    device = get_device()
    model.to(device)

    if fold_idx == 1: print('CONFIG: ')
    if fold_idx == 1:
        print([(v, getattr(model_config, v)) for v in dir(model_config)
               if v[:2] != "__"])
    if fold_idx == 1: print('MODEL: ', model)

    epochs = model_config.epochs

    if model_config.optimizer == 'AdamW':
        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=float(model_config.lr),
                                      eps=float(model_config.eps),
                                      weight_decay=float(
                                          model_config.weight_decay))
    elif model_config.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=float(model_config.lr))

    if model_config.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(model_config.warmup_steps),
            num_training_steps=len(train_dataloader) * epochs)
    else:
        scheduler = None

    criterion = nn.BCEWithLogitsLoss()  #nn.CrossEntropyLoss()

    swa_model = AveragedModel(model)
    if model_config.swa_scheduler == 'linear':
        swa_scheduler = SWALR(optimizer, swa_lr=float(model_config.lr))
    else:
        swa_scheduler = CosineAnnealingLR(optimizer, T_max=100)

    print('TRAINING...')

    training_stats = []

    best_dev_auc = float('-inf')

    with tqdm(total=epochs, leave=False) as pbar:
        for epoch_i in range(0, epochs):

            if epoch_i >= int(model_config.swa_start):
                update_bn(train_dataloader, swa_model)
                train_auc, train_acc, avg_train_loss = train(
                    model, train_dataloader, device, criterion, optimizer)
                swa_model.update_parameters(model)
                swa_scheduler.step()
                update_bn(valid_dataloader, swa_model)
                valid_auc, valid_acc, avg_dev_loss, dev_d = valid(
                    swa_model, valid_dataloader, device, criterion)
            else:
                train_auc, train_acc, avg_train_loss = train(
                    model,
                    train_dataloader,
                    device,
                    criterion,
                    optimizer,
                    scheduler=scheduler)
                valid_auc, valid_acc, avg_dev_loss, dev_d = valid(
                    model, valid_dataloader, device, criterion)
            if cfg.final_train:
                valid_auc = 0
                valid_acc = 0
                avg_dev_loss = 0

            add_stats(training_stats, avg_train_loss, avg_dev_loss, train_acc,
                      train_auc, valid_acc, valid_auc)

            if (cfg.final_train and epoch_i == epochs - 1) or (
                    not cfg.final_train and valid_auc > best_dev_auc):
                best_dev_auc = valid_auc
                if epoch_i >= int(model_config.swa_start):
                    update_bn(test_dataloader, swa_model)
                    test_d = gen_test(swa_model, test_dataloader, device)
                    save(fold_idx, swa_model, optimizer, dev_d, test_d,
                         valid_auc)
                else:
                    test_d = gen_test(model, test_dataloader, device)
                    save(fold_idx, model, optimizer, dev_d, test_d, valid_auc)

            pbar.update(1)

    print('TRAINING COMPLETED')

    # Show training results
    col_names = [
        'train_loss', 'train_acc', 'train_auc', 'dev_loss', 'dev_acc',
        'dev_auc'
    ]
    training_stats = pd.DataFrame(training_stats, columns=col_names)
    print(training_stats.head(epochs))
    plot_training_results(training_stats, fold_idx)

    # If config, get best model and make submission
    if cfg.run['submission'] == True:
        make_submission(model, test_dataloader)
Example #22
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--batch_size",
        default=8,
        type=int,
        help="batch size of both segmentation and classification training")
    parser.add_argument(
        "--seg_epoch",
        default=100,
        type=int,
        help="the number of epoch in the segmentation training")
    parser.add_argument(
        "--cls_epoch",
        default=20,
        type=int,
        help="the number of epoch in the classification training")
    parser.add_argument("--lr",
                        default=0.01,
                        type=float,
                        help="the learning rate of training")
    parser.add_argument("--swa_lr",
                        default=0.005,
                        type=float,
                        help="the stochastic learning rate of training")
    parser.add_argument(
        "--seg_weight",
        default=[0.1, 1],
        type=list,
        nargs='+',
        help="the weight of Binary Cross Entropy in the segmentation learning")
    parser.add_argument(
        "--cls_weight",
        default=[1, 1],
        type=list,
        nargs='+',
        help="the weight of Binary Cross Entropy in the classification learning"
    )
    parser.add_argument("--seed",
                        default=2021,
                        type=int,
                        help="the random seed")
    parser.add_argument(
        "--train_dir",
        default="/train_dir",
        type=str,
        help=
        "the train data directory. it consists of the both ng and ok directorys, and they have img and mask folders."
    )
    parser.add_argument(
        "--val_dir",
        default="/val_dir",
        type=str,
        help=
        "the validation data directory. it consists of the both ng and ok directorys, and they have img and mask folders."
    )

    args = parser.parse_args()

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    segmentation_train = True
    classification_train = True

    train_dir = Path(args.train_dir)
    val_dir = Path(args.val_dir)

    train_ok_dir = str(train_dir / "ok")
    train_mask_dir = str(train_dir / "mask")
    train_ng_dir = str(train_dir / "ng")

    val_ok_dir = str(val_dir / "ok")
    val_mask_dir = str(val_dir / "mask")
    val_ng_dir = str(val_dir / "ng")

    seg_train_dataset = SegmentationDataset(img_dir=train_ng_dir,
                                            mask_dir=train_mask_dir,
                                            n_channels=3,
                                            classes=1,
                                            train=True)
    seg_val_dataset = SegmentationDataset(img_dir=val_ng_dir,
                                          mask_dir=val_mask_dir,
                                          n_channels=3,
                                          classes=1,
                                          train=False)

    cls_train_dataset = ClassificationDataset(ok_dir=train_ok_dir,
                                              ng_dir=train_ng_dir,
                                              n_channels=3,
                                              classes=1,
                                              train=True)
    cls_val_dataset = ClassificationDataset(ok_dir=val_ok_dir,
                                            ng_dir=val_ng_dir,
                                            n_channels=3,
                                            classes=1,
                                            train=False)

    seg_train_loader = DataLoader(seg_train_dataset,
                                  batch_size=8,
                                  shuffle=True)
    seg_val_loader = DataLoader(seg_val_dataset, batch_size=8, shuffle=True)
    cls_train_loader = DataLoader(cls_train_dataset,
                                  batch_size=8,
                                  shuffle=True)
    cls_val_loader = DataLoader(cls_val_dataset, batch_size=8, shuffle=True)

    my_model = DownconvUnet(in_channel=3, seg_classes=1, cls_classes=2)
    avg_model = AveragedModel(my_model)

    my_model.to(device)
    avg_model.to(device)

    with mlflow.start_run() as run:
        seg_args = Params(args.batch_size, args.seg_epoch, args.lr, args.seed,
                          args.seg_weight)
        cls_args = Params(args.batch_size, args.cls_epoch, args.lr, args.seed,
                          args.cls_weight)
        mode_list = ["seg", "cls"]
        for mode in mode_list:
            for key, value in vars(seg_args).items():
                mlflow.log_param(f"{mode}_{key}", value)

        # Segmentation train

        if segmentation_train:
            print("-" * 5 + "Segmentation training start" + "-" * 5)

            my_model.mode = 1
            train_metrics = Metrics()
            train_loss = 0.
            train_iou = 0.
            train_acc = 0.

            val_metrics = Metrics()
            val_loss = 0.
            val_iou = 0.
            val_acc = 0.

            my_model.train()

            optimizer = torch.optim.Adam(my_model.parameters(), lr=seg_args.lr)
            scheduler = CosineAnnealingLR(optimizer, T_max=100)
            bce = WeightedBCELoss(weight=seg_args.weight)
            swa_start = int(seg_args.num_epoch * 0.75)
            swa_scheduler = SWALR(optimizer,
                                  anneal_strategy='linear',
                                  anneal_epochs=swa_start,
                                  swa_lr=seg_args.swa_lr)

            for epoch in range(seg_args.num_epoch):
                for batch_idx, batch in enumerate(seg_train_loader):
                    batch = tuple(t.to(device) for t in batch)
                    seg_x, seg_y = batch

                    optimizer.zero_grad()

                    pred_y = my_model(seg_x)
                    loss = bce(pred_y, seg_y)
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()
                    train_metrics.update(pred_y, seg_y, loss.item())
                    train_iou += train_metrics.iou
                    train_acc += train_metrics.acc

                    step = epoch * len(seg_train_loader) + batch_idx
                    for metric, value in vars(train_metrics).items():
                        mlflow.log_metric(f"seg_train_{metric}",
                                          value,
                                          step=step)

                train_loss /= len(seg_train_loader)
                train_iou /= len(seg_train_loader)
                train_acc /= len(seg_train_loader)

                my_model.eval()

                for batch_idx, batch in enumerate(seg_val_loader):
                    batch = tuple(t.to(device) for t in batch)
                    seg_x, seg_y = batch
                    pred_y = my_model(seg_x)

                    loss = bce(pred_y, seg_y)

                    val_loss += loss.item()
                    val_metrics.update(pred_y, seg_y, val_loss)
                    val_iou += val_metrics.iou
                    val_acc += val_metrics.acc

                    step = epoch * len(seg_val_loader) + batch_idx
                    for metric, value in vars(val_metrics).items():
                        mlflow.log_metric(f"seg_val_{metric}",
                                          value,
                                          step=step)

                val_loss /= len(seg_val_loader)
                val_iou /= len(seg_val_loader)
                val_acc /= len(seg_val_loader)

                print(f"Epoch {epoch + 1}:")
                print("-" * 10)
                print(
                    f"train_loss {train_loss:.3f}, train_iou: {train_iou:.3f}, "
                    f"train_accuracy: {train_acc:.3f}")
                print(f"val_loss {val_loss:.3f}, val_iou: {val_iou:.3f}, "
                      f"val_accuracy: {val_acc:.3f}")

                if epoch > swa_start:
                    print("Stochastic average start")
                    avg_model.update_parameters(my_model)
                    swa_scheduler.step()
                else:
                    scheduler.step()

            print("Segmentation train completed")

            # Classification train

            if classification_train:
                print("-" * 5 + "Classification training start" + "-" * 5)

                my_model.mode = 2

                train_metrics = Metrics()
                train_loss = 0.
                train_iou = 0.
                train_acc = 0.

                val_metrics = Metrics()
                val_loss = 0.
                val_iou = 0.
                val_acc = 0.

                my_model.train()

                optimizer = torch.optim.Adam(my_model.parameters(),
                                             lr=cls_args.lr)
                scheduler = CosineAnnealingLR(optimizer, T_max=100)
                bce = WeightedBCELoss(weight=cls_args.weight)
                swa_start = int(cls_args.num_epoch * 0.75)
                swa_scheduler = SWALR(optimizer,
                                      anneal_strategy='linear',
                                      anneal_epochs=swa_start,
                                      swa_lr=cls_args.swa_lr)

                for epoch in range(cls_args.num_epoch):
                    # reset running metrics and restore train mode at the start of each epoch
                    train_loss, train_acc = 0., 0.
                    val_loss, val_acc = 0., 0.
                    my_model.train()

                    for batch_idx, batch in enumerate(cls_train_loader):
                        batch = tuple(t.to(device) for t in batch)
                        cls_x, cls_y = batch

                        optimizer.zero_grad()

                        pred_y = my_model(cls_x)
                        loss = bce(pred_y, cls_y)
                        loss.backward()
                        optimizer.step()

                        train_loss += loss.item()
                        train_metrics.update(pred_y, cls_y, loss.item())
                        train_acc += train_metrics.acc

                        step = epoch * len(cls_train_loader) + batch_idx
                        for metric, value in vars(train_metrics).items():
                            mlflow.log_metric(f"cls_train_{metric}",
                                              value,
                                              step=step)

                    train_loss /= len(cls_train_loader)
                    train_acc /= len(cls_train_loader)

                    my_model.eval()

                    with torch.no_grad():
                        for batch_idx, batch in enumerate(cls_val_loader):
                            batch = tuple(t.to(device) for t in batch)
                            cls_x, cls_y = batch
                            pred_y = my_model(cls_x)

                            loss = bce(pred_y, cls_y)

                            val_loss += loss.item()
                            val_metrics.update(pred_y, cls_y, loss.item())
                            val_acc += val_metrics.acc

                            step = epoch * len(cls_val_loader) + batch_idx
                            for metric, value in vars(val_metrics).items():
                                mlflow.log_metric(f"cls_val_{metric}",
                                                  value,
                                                  step=step)

                    val_loss /= len(cls_val_loader)
                    val_acc /= len(cls_val_loader)

                    print(f"Epoch {epoch + 1}:")
                    print("-" * 10)
                    print(
                        f"train_loss {train_loss:.3f}, train_iou: {train_iou:.3f}, "
                        f"train_accuracy: {train_acc:.3f}")
                    print(f"val_loss {val_loss:.3f}, val_iou: {val_iou:.3f}, "
                          f"val_accuracy: {val_acc:.3f}")

                print("Classification train completed")

                if epoch > swa_start:
                    print("Stochastic average start")
                    avg_model.update_parameters(my_model)
                    swa_scheduler.step()
                else:
                    scheduler.step()
    weight_path = "weights/donwconv_swa_weights.pth"
    torch.save(my_model.state_dict(), weight_path)
    print(f"model weight saved to {weight_path}")
    def fit(
        self,
        train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
        evaluator: SentenceEvaluator = None,
        epochs: int = 1,
        steps_per_epoch=None,
        scheduler: str = 'WarmupLinear',
        warmup_steps: int = 10000,
        optimizer_class: Type[Optimizer] = transformers.AdamW,
        optimizer_params: Dict[str, object] = {
            'lr': 2e-5,
            'eps': 1e-6,
            'correct_bias': False
        },
        weight_decay: float = 0.01,
        evaluation_steps: int = 0,
        output_path: str = None,
        save_best_model: bool = True,
        max_grad_norm: float = 1,
        use_amp: bool = False,
        callback: Callable[[float, int, int], None] = None,
        show_progress_bar: bool = True,
        log_every: int = 100,
        wandb_project_name: str = None,
        wandb_config: Dict[str, object] = {},
        use_swa: bool = False,
        swa_epochs_start: int = 5,
        swa_anneal_epochs: int = 10,
        swa_lr: float = 0.05,
    ):
        """
        Train the model with the given training objective
        Each training objective is sampled in turn for one batch.
        We sample only as many batches from each objective as there are in the smallest one
        to make sure of equal training with each dataset.

        :param train_objectives: Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning
        :param evaluator: An evaluator (sentence_transformers.evaluation) evaluates the model performance during training on held-out dev data. It is used to determine the best model that is saved to disc.
        :param epochs: Number of epochs for training
        :param steps_per_epoch: Number of training steps per epoch. If set to None (default), one epoch is equal the DataLoader size from train_objectives.
        :param scheduler: Learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts
        :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning rate is increased from 0 up to the maximal learning rate. After these many training steps, the learning rate is decreased linearly back to zero.
        :param optimizer_class: Optimizer
        :param optimizer_params: Optimizer parameters
        :param weight_decay: Weight decay for model parameters
        :param evaluation_steps: If > 0, evaluate the model using evaluator after each number of training steps
        :param output_path: Storage path for the model and evaluation files
        :param save_best_model: If true, the best model (according to evaluator) is stored at output_path
        :param max_grad_norm: Used for gradient clipping (maximum gradient norm).
        :param use_amp: Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0
        :param callback: Callback function that is invoked after each evaluation.
                It must accept the following three parameters in this order:
                `score`, `epoch`, `steps`
        :param show_progress_bar: If True, output a tqdm progress bar
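        :param log_every: If wandb logging is enabled, log the training loss and learning rate every this many steps
        :param wandb_project_name: If set (and wandb is installed), metrics are logged to this Weights & Biases project
        :param wandb_config: Additional keyword arguments passed to wandb.init
        :param use_swa: If True, maintain a Stochastic Weight Averaging copy of this model (AveragedModel) and return it
        :param swa_epochs_start: Epoch after which the SWA model starts averaging parameters
        :param swa_anneal_epochs: anneal_epochs passed to the SWALR scheduler
        :param swa_lr: Learning rate used by SWALR during the SWA phase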
        """

        if use_amp:
            from torch.cuda.amp import autocast
            scaler = torch.cuda.amp.GradScaler()

        self.to(self._target_device)

        if output_path is not None:
            os.makedirs(output_path, exist_ok=True)

        dataloaders = [dataloader for dataloader, _ in train_objectives]

        # Use smart batching
        for dataloader in dataloaders:
            dataloader.collate_fn = self.smart_batching_collate

        loss_models = [loss for _, loss in train_objectives]
        for loss_model in loss_models:
            loss_model.to(self._target_device)

        self.best_score = -9999999

        if steps_per_epoch is None or steps_per_epoch == 0:
            steps_per_epoch = min(
                [len(dataloader) for dataloader in dataloaders])

        num_train_steps = int(steps_per_epoch * epochs)

        # Prepare logger
        if wandb_available and wandb_project_name:
            if not wandb.setup().settings.sweep_id:
                config = {
                    'epochs': epochs,
                    'steps_per_epoch': steps_per_epoch,
                    'scheduler': scheduler,
                    'warmup_steps': warmup_steps,
                    'weight_decay': weight_decay,
                    'evaluation_steps': evaluation_steps,
                    'output_path': output_path,
                    'save_best_model': save_best_model,
                    'max_grad_norm': max_grad_norm,
                    'use_amp': use_amp,
                }
                wandb.init(project=wandb_project_name,
                           config=config,
                           **wandb_config)
            wandb.watch(self)

        # SWA
        if use_swa:
            swa_model = AveragedModel(self)

        # Prepare optimizers
        optimizers = []
        schedulers = []
        for loss_model in loss_models:
            param_optimizer = list(loss_model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                weight_decay
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            optimizer = optimizer_class(optimizer_grouped_parameters,
                                        **optimizer_params)
            scheduler_obj = self._get_scheduler(optimizer,
                                                scheduler=scheduler,
                                                warmup_steps=warmup_steps,
                                                t_total=num_train_steps)

            optimizers.append(optimizer)
            schedulers.append(scheduler_obj)
        if use_swa:
            swa_scheduler = SWALR(optimizers[0],
                                  swa_lr=swa_lr,
                                  anneal_epochs=swa_anneal_epochs,
                                  anneal_strategy='linear')

        global_step = 0
        data_iterators = [iter(dataloader) for dataloader in dataloaders]

        num_train_objectives = len(train_objectives)

        skip_scheduler = False
        for epoch in trange(epochs,
                            desc="Epoch",
                            disable=not show_progress_bar):
            training_steps = 0

            for loss_model in loss_models:
                loss_model.zero_grad()
                loss_model.train()

            for _ in trange(steps_per_epoch,
                            desc="Iteration",
                            smoothing=0.05,
                            disable=not show_progress_bar):
                for train_idx in range(num_train_objectives):
                    loss_model = loss_models[train_idx]
                    optimizer = optimizers[train_idx]
                    scheduler = schedulers[train_idx]
                    data_iterator = data_iterators[train_idx]

                    try:
                        data = next(data_iterator)
                    except StopIteration:
                        data_iterator = iter(dataloaders[train_idx])
                        data_iterators[train_idx] = data_iterator
                        data = next(data_iterator)

                    features, labels = data

                    if use_amp:
                        with autocast():
                            loss_value = loss_model(features, labels)

                        scale_before_step = scaler.get_scale()
                        scaler.scale(loss_value).backward()
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                       max_grad_norm)
                        scaler.step(optimizer)
                        scaler.update()

                        skip_scheduler = scaler.get_scale(
                        ) != scale_before_step
                    else:
                        loss_value = loss_model(features, labels)
                        loss_value.backward()
                        torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                       max_grad_norm)
                        optimizer.step()

                    optimizer.zero_grad()

                    # if wandb init is called
                    if wandb_available and wandb.run is not None and (
                            training_steps + 1) % log_every == 0:
                        wandb.log(
                            {
                                loss_model.__class__.__name__:
                                loss_value.item(),
                                "lr": scheduler.get_last_lr()[0],
                            },
                            step=global_step)

                    if not skip_scheduler:
                        scheduler.step()

                training_steps += 1
                global_step += 1

                if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                    self._eval_during_training(evaluator, output_path,
                                               save_best_model, epoch,
                                               training_steps, global_step,
                                               callback)
                    for loss_model in loss_models:
                        loss_model.zero_grad()
                        loss_model.train()

            if use_swa and epoch > swa_epochs_start:
                swa_model.update_parameters(self)
                swa_scheduler.step()

            self._eval_during_training(evaluator, output_path, save_best_model,
                                       epoch, -1, global_step, callback)
        if use_swa:
            return swa_model
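`fit` returns the `AveragedModel` when `use_swa` is set, but it never recomputes BatchNorm running statistics for the averaged weights. If the wrapped network contains BatchNorm layers, the usual SWA recipe finishes with `torch.optim.swa_utils.update_bn` before the averaged model is evaluated. Below is a minimal, self-contained sketch of that finalization step; the `net` and `loader` names are placeholders, not part of this codebase.

import torch
from torch.optim.swa_utils import AveragedModel, update_bn
from torch.utils.data import DataLoader, TensorDataset

# Placeholder network and data; in a real run these come from the surrounding script.
net = torch.nn.Sequential(torch.nn.Linear(8, 32), torch.nn.BatchNorm1d(32),
                          torch.nn.ReLU(), torch.nn.Linear(32, 2))
loader = DataLoader(TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,))),
                    batch_size=16)

swa_model = AveragedModel(net)
swa_model.update_parameters(net)  # normally called once per epoch after swa_start

# Recompute BatchNorm running statistics with the averaged weights,
# then evaluate swa_model instead of net.
update_bn(loader, swa_model)
swa_model.eval()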
Ejemplo n.º 24
0
def test_plotting():
    print("Creating Train and Validation Dataset")
    train_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))])
    valid_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))])

    train_set, valid_set = dataset.create_cifar10_dataset(
        train_transforms, valid_transforms)
    print("Train and Validation Datasets Created")

    print("Creating DataLoaders")
    train_loader, valid_loader = dataset.create_loaders(train_set, valid_set)

    print("Train and Validation Dataloaders Created")
    print("Creating Model")

    all_supported_models = [
        "resnet18",
        # "resnet34",
        # "resnet50",
        # "resnet101",
        # "resnet152",
        # "resnext50_32x4d",
        # "resnext101_32x8d",
        # "vgg11",
        # "vgg13",
        # "vgg16",
        # "vgg19",
        # "mobilenet",
        # "mnasnet0_5",
        # "mnasnet1_0",
    ]

    for model_name in all_supported_models:
        model = model_factory.create_torchvision_model(
            model_name, num_classes=10, pretrained=False
        )  # We don't need pretrained True, we just need a forward pass

        if torch.cuda.is_available():
            print("Model Created. Moving it to CUDA")
        else:
            print("Model Created. Training on CPU only")
        model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=1e-3)

        criterion = (
            nn.CrossEntropyLoss()
        )  # Cross-entropy loss for these classification problems

        # early_stopper = utils.EarlyStopping(
        #     patience=7, verbose=True, path=SAVE_PATH
        # )
        # We do not need early stopping too

        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=300)

        swa_scheduler = SWALR(optimizer,
                              anneal_strategy="linear",
                              anneal_epochs=20,
                              swa_lr=0.05)
        swa_start = 2
        epoch = 5
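        # Note: scheduler, swa_scheduler and swa_start are only constructed here;
        # this plotting test does not pass them to engine.fit below.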

    history = engine.fit(epoch, model, train_loader, valid_loader, criterion,
                         device, optimizer)
    plotter.plot_results(history, train_metric='loss', val_metric='top5_acc')
    return 1
Ejemplo n.º 25
0
                                          lr=args.lr,
                                          weight_decay=0.001)
            # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
        else:
            #optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            optimizer = torch.optim.SGD(model.parameters(),
                                        lr=args.lr,
                                        momentum=0.9,
                                        weight_decay=5e-4)
            #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=2)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                optimizer, T_0=3, T_mult=2, eta_min=args.min_lr, last_epoch=-1)
            #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=25,
            #                                                max_lr=args.lr, epochs=args.epochs, steps_per_epoch=len(train_loader))

        swa_scheduler = SWALR(optimizer, swa_lr=args.swa_lr)

        if args.smooth_ratio > 0.:
            criterion = BiTemperedLoss(
                t1=args.t1, t2=args.t2,
                label_smoothing=args.smooth_ratio).to(device)
        else:
            criterion = nn.CrossEntropyLoss().to(device)

        for epoch in range(args.epochs):
            train_one_epoch(epoch,
                            model,
                            swa_model,
                            args.swa_start,
                            criterion,
                            optimizer,
Ejemplo n.º 26
0
def pseudo_labeling(num_epochs, model, data_loader, val_loader,
                    unlabeled_loader, device, val_every, file_name):
    # Instead of using current epoch we use a "step" variable to calculate alpha_weight
    # This helps the model converge faster
    from torch.optim.swa_utils import AveragedModel, SWALR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    criterion = [
        SoftCrossEntropyLoss(smooth_factor=0.1),
        JaccardLoss('multiclass', classes=12)
    ]
    optimizer = AdamP(params=model.parameters(), lr=0.0001, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=0.0001)
    swa_model = AveragedModel(model)
    optimizer = Lookahead(optimizer, la_alpha=0.5)

    step = 100
    size = 256
    best_mIoU = 0
    model.train()
    print('Start Pseudo-Labeling..')
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))
        for batch_idx, (imgs, image_infos) in enumerate(unlabeled_loader):

            # Forward pass to get the pseudo labels
            # --------------------------------------------- run the unlabeled batch through the model
            model.eval()
            with torch.no_grad():
                outs = model(torch.stack(imgs).to(device))
            # hard pseudo labels: argmax over classes (already long dtype, on device)
            oms = torch.argmax(outs.squeeze(), dim=1)

            # --------------------------------------------- training step

            model.train()
            # Now calculate the unlabeled loss using the pseudo label
            imgs = torch.stack(imgs)
            imgs = imgs.to(device)
            # preds_array = preds_array.to(device)

            output = model(imgs)
            loss = 0
            for each in criterion:
                loss += each(output, oms)

            unlabeled_loss = alpha_weight(step) * loss

            # Backpropogate
            optimizer.zero_grad()
            unlabeled_loss.backward()
            optimizer.step()
            output = torch.argmax(output.squeeze(),
                                  dim=1).detach().cpu().numpy()
            hist = add_hist(hist,
                            oms.detach().cpu().numpy(),
                            output,
                            n_class=12)

            if (batch_idx + 1) % 25 == 0:
                acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU:{:.4f}'.
                      format(epoch + 1, num_epochs, batch_idx + 1,
                             len(unlabeled_loader), unlabeled_loss.item(),
                             mIoU))
            # For every 50 batches, train one epoch on the labeled data
            if batch_idx % 50 == 0:

                # Normal training procedure
                for images, masks, _ in data_loader:
                    labeled_loss = 0
                    images = torch.stack(images)
                    # (batch, channel, height, width)
                    masks = torch.stack(masks).long()

                    # move tensors to the device for GPU computation
                    images, masks = images.to(device), masks.to(device)

                    output = model(images)

                    for each in criterion:
                        labeled_loss += each(output, masks)

                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()

                # Now we increment step by 1
                step += 1

        if (epoch + 1) % val_every == 0:
            avrg_loss, val_mIoU = validation(epoch + 1, model, val_loader,
                                             criterion, device)
            if val_mIoU > best_mIoU:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_mIoU = val_mIoU
                save_model(model, file_name=file_name)

        model.train()

        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
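The comment at the top of pseudo_labeling notes that a running `step` counter, rather than the current epoch, drives `alpha_weight`, but the ramp itself is defined elsewhere in the codebase. For reference, the unlabeled-loss weight in pseudo-labeling is typically ramped up linearly over training steps; the sketch below shows one common form, where T1, T2 and ALPHA_MAX are illustrative assumptions, not values taken from this script.

# Hypothetical ramp for the unlabeled-loss weight; the real alpha_weight used above
# is defined elsewhere, so these constants are assumptions.
T1, T2, ALPHA_MAX = 100, 700, 3.0

def alpha_weight(step: int) -> float:
    if step < T1:
        return 0.0
    if step > T2:
        return ALPHA_MAX
    # linear ramp from 0 to ALPHA_MAX between steps T1 and T2
    return (step - T1) / (T2 - T1) * ALPHA_MAX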
Ejemplo n.º 27
0
def main():
    os.makedirs(SAVEPATH, exist_ok=True)
    print('save path:', SAVEPATH)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('device:', device)

    print('weight_decay:', WEIGHTDECAY)
    print('momentum:', MOMENTUM)
    print('batch_size:', BATCHSIZE)
    print('lr:', LR)
    print('epoch:', EPOCHS)
    print('Label smoothing:', LABELSMOOTH)
    print('Stochastic Weight Averaging:', SWA)
    if SWA:
        print('Swa lr:', SWA_LR)
        print('Swa start epoch:', SWA_START)
    print('Cutout augmentation:', CUTOUT)
    if CUTOUT:
        print('Cutout size:', CUTOUTSIZE)
    print('Activation:', ACTIVATION)

    # get model
    model = get_seresnet_cifar(activation=ACTIVATION)

    # get loss function
    if LABELSMOOTH:
        criterion = LabelSmoothingLoss(classes=10, smoothing=0.1)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=LR,
                                momentum=MOMENTUM,
                                weight_decay=WEIGHTDECAY,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                           T_max=EPOCHS,
                                                           eta_min=0)

    model = model.to(device)
    criterion = criterion.to(device)

    # Check number of parameters your model
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {pytorch_total_params}")
    if int(pytorch_total_params) > 2000000:
        print('Your model has more than 2 million parameters.')
        return

    if SWA:
        # apply swa
        swa_model = AveragedModel(model)
        swa_scheduler = SWALR(optimizer, swa_lr=SWA_LR)
        swa_total_params = sum(p.numel() for p in swa_model.parameters())
        print(f"Swa parameters: {swa_total_params}")

    # cinic mean, std
    normalize = transforms.Normalize(mean=[0.47889522, 0.47227842, 0.43047404],
                                     std=[0.24205776, 0.23828046, 0.25874835])

    if CUTOUT:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(), normalize,
            Cutout(size=CUTOUTSIZE)
        ])
    else:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(), normalize
        ])

    train_dataset = torchvision.datasets.ImageFolder('/content/train',
                                                     transform=train_transform)
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCHSIZE,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)

    # colab reload
    start_epoch = 0
    if os.path.isfile(os.path.join(SAVEPATH, 'latest_checkpoint.pth')):
        checkpoint = torch.load(os.path.join(SAVEPATH, 'latest_checkpoint.pth'),
                                map_location=device)
        start_epoch = checkpoint['epoch']
        scheduler.load_state_dict(checkpoint['scheduler'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if SWA:
            swa_scheduler.load_state_dict(checkpoint['swa_scheduler'])
            swa_model.load_state_dict(checkpoint['swa_model'])
        print(f'Loaded checkpoint, resuming from epoch {start_epoch}')

    for epoch in range(start_epoch, EPOCHS):
        print("\n----- epoch: {}, lr: {} -----".format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        start_time = time.time()
        train(train_loader, epoch, model, optimizer, criterion, device)
        elapsed_time = time.time() - start_time
        print('==> {:.2f} seconds to train this epoch\n'.format(elapsed_time))

        # learning rate scheduling
        if SWA and epoch > SWA_START:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step()

        if SWA:
            checkpoint = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'swa_model': swa_model.state_dict(),
                'swa_scheduler': swa_scheduler.state_dict()
            }
        else:
            checkpoint = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }
        torch.save(checkpoint, os.path.join(SAVEPATH, 'latest_checkpoint.pth'))
        if epoch % 10 == 0:
            torch.save(checkpoint,
                       os.path.join(SAVEPATH, '%d_checkpoint.pth' % epoch))
Ejemplo n.º 28
0
    def __init__(self, *args, **kwargs):
        super(SWALRRunner, self).__init__(*args, **kwargs)
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, swa_lr=0.05)
        self.swa_start = 5
Ejemplo n.º 29
0
    model = tv.models.resnet50(num_classes=3).to(device)
    swa_model = AveragedModel(model)

    if trainOrTest.lower() == "train":
        ### Optimizer
        optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-5)
        cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                epoch,
                                                                eta_min=0,
                                                                last_epoch=-1)
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=multiplier,
                                           total_epoch=warmEpoch,
                                           after_scheduler=cosine_scheduler)
        swa_scheduler = SWALR(optimizer,
                              swa_lr=LR,
                              anneal_epochs=15,
                              anneal_strategy="cos")
        ### Loss criterion
        lossCri = nn.CrossEntropyLoss(reduction="sum")
        model = model.train()
        ### Data set
        fishDataset = MetaLearningDataset(trainSamples,
                                          intTrainLabels,
                                          trainTransforms,
                                          imgPath=imagePath)
        fishDataLoader = DataLoader(fishDataset,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=4,
                                    pin_memory=True,
                                    drop_last=True)
Ejemplo n.º 30
0
class Learner:
    def __init__(self, cfg_dir: str, data_loader: DataLoader, model,
                 labels_definition):
        self.cfg = get_conf(cfg_dir)
        self._labels_definition = labels_definition
        #TODO
        self.logger = self.init_logger(self.cfg.logger)
        #self.dataset = CustomDataset(**self.cfg.dataset)
        self.data = data_loader
        #self.val_dataset = CustomDatasetVal(**self.cfg.val_dataset)
        #self.val_data = DataLoader(self.val_dataset, **self.cfg.dataloader)
        # self.logger.log_parameters({"tr_len": len(self.dataset),
        #                             "val_len": len(self.val_dataset)})
        self.model = model
        #self.model._resnet.conv1.apply(init_weights_normal)
        self.device = self.cfg.train_params.device
        self.model = self.model.to(device=self.device)
        if self.cfg.train_params.optimizer.lower() == "adam":
            self.optimizer = optim.Adam(self.model.parameters(),
                                        **self.cfg.adam)
        elif self.cfg.train_params.optimizer.lower() == "rmsprop":
            self.optimizer = optim.RMSprop(self.model.parameters(),
                                           **self.cfg.rmsprop)
        else:
            raise ValueError(
                f"Unknown optimizer {self.cfg.train_params.optimizer}")

        self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, T_max=100)
        self.criterion = nn.BCELoss()
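        # Note: the training loops below apply nn.Sigmoid() before this loss; folding both
        # into nn.BCEWithLogitsLoss() would be numerically more stable, but the original
        # Sigmoid + BCELoss pairing is kept as-is.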

        if self.cfg.logger.resume:
            # load checkpoint
            print("Loading checkpoint")
            save_dir = self.cfg.directory.load
            checkpoint = load_checkpoint(save_dir, self.device)
            self.model.load_state_dict(checkpoint["model"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])
            self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
            self.epoch = checkpoint["epoch"]
            self.e_loss = checkpoint["e_loss"]
            self.best = checkpoint["best"]
            print(
                f"{datetime.now():%Y-%m-%d %H:%M:%S} "
                f"Loading checkpoint was successful, start from epoch {self.epoch}"
                f" and loss {self.best}")
        else:
            self.epoch = 1
            self.best = np.inf
            self.e_loss = []

        # initialize the early_stopping object
        self.early_stopping = EarlyStopping(
            patience=self.cfg.train_params.patience,
            verbose=True,
            delta=self.cfg.train_params.early_stopping_delta,
        )

        # stochastic weight averaging
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)

    def train(self, task: VisionTask):
        task.go_to_gpu(self.device)

        visualize_idx = np.random.randint(0, len(self.data), 50)

        while self.epoch <= self.cfg.train_params.epochs:
            running_loss = []
            self.model.train()

            for internel_iter, (images, gt_boxes, gt_labels, ego_labels,
                                counts, img_indexs,
                                wh) in enumerate(self.data):
                self.optimizer.zero_grad()

                # fl = task.get_flat_label(gt_labels)

                m = nn.Sigmoid()
                y = task.get_flat_label(gt_labels)
                x = images

                # move data to device
                x = x.to(device=self.device)
                y = y.to(device=self.device)

                # forward, backward
                encoded_vector = self.model(x)
                out = task.decode(encoded_vector)
                loss = self.criterion(m(out), y)
                loss.backward()
                # check grad norm for debugging
                grad_norm = check_grad_norm(self.model)
                # update
                self.optimizer.step()

                running_loss.append(loss.item())

                #print("Loss:", loss.item())
                #print("grad_norm", grad_norm)

                self.logger.log_metrics(
                    {
                        #"epoch": self.epoch,
                        "batch": internel_iter,
                        "loss": loss.item(),
                        "GradNorm": grad_norm,
                    },
                    epoch=self.epoch)

                # periodically log sample predictions for a quick visual check
                if internel_iter % 1000 == 0 and self.epoch % 5 == 0:
                    print("Internel iter: ", internel_iter)
                    out = m(out[-1])
                    definitions = []
                    l = task.boundary[1] - task.boundary[0]
                    n_boxes = len(gt_boxes[-1][-1])
                    print("Number of Boxes:", n_boxes)
                    name = "img_" + str(self.epoch) + "_" + str(
                        internel_iter / 1000)
                    for i in range(n_boxes):
                        prediction = out[i * l + 1 + i:i * l + l + 1 + i]
                        prediction = prediction.argmax()
                        definitions.append(name + ": " +
                                           self._labels_definition[
                                               task.get_name()][prediction])

                    print("list", definitions)
                    sz = wh[0][0].item()
                    img = torch.zeros([3, sz, sz])
                    img[0] = images[-1][self.cfg.dataloader.seq_len - 1]
                    img[1] = images[-1][2 * self.cfg.dataloader.seq_len - 1]
                    img[2] = images[-1][3 * self.cfg.dataloader.seq_len - 1]
                    self.logger.log_image(img,
                                          name=name,
                                          image_channels='first')

                #if internel_iter < 10:
                #    sz = wh[0][0].item()
                #    img = torch.zeros([3, sz, sz])
                #    print(img.shape)
                #    print(images.shape)
                #    img[0] = images[-1][self.cfg.dataloader.seq_len -1]
                #    img[1] = images[-1][2*self.cfg.dataloader.seq_len - 1]
                #    img[2] = images[-1][3*self.cfg.dataloader.seq_len - 1]
                #    self.log_image_with_text_on_it(img, gt_labels[-1][-1], task)
                #self.logger.log_image(img, name="v", image_channels='first')

            #bar.close()

            # Visualize
            # self.predict_visualize(index_list=visualize_idx, task=task)

            if self.epoch > self.cfg.train_params.swa_start:
                self.swa_model.update_parameters(self.model)
                self.swa_scheduler.step()
            else:
                self.lr_scheduler.step()

            # validate on val set
            # val_loss, t = self.validate()
            # t /= len(self.val_dataset)

            # average loss for an epoch
            self.e_loss.append(np.mean(running_loss))  # epoch loss
            # print(
            #     f"{datetime.now():%Y-%m-%d %H:%M:%S} Epoch {self.epoch} summary: train Loss: {self.e_loss[-1]:.2f} \t| Val loss: {val_loss:.2f}"
            #     f"\t| time: {t:.3f} seconds"
            # )

            self.logger.log_metrics({
                "epoch": self.epoch,
                "epoch_loss": self.e_loss[-1],
            })

            # early_stopping needs the validation loss to check if it has decreased,
            # and if it has, it will make a checkpoint of the current model
            #self.early_stopping(val_loss, self.model)

            if self.early_stopping.early_stop:
                print("Early stopping")
                self.save()
                break

            if self.epoch % self.cfg.train_params.save_every == 0:
                self.save()

            gc.collect()
            print("Task: " + task.get_name() + " epoch[" + str(self.epoch) +
                  "] finished.")
            self.epoch += 1

        # Update bn statistics for the swa_model at the end
        # if self.epoch >= self.cfg.train_params.swa_start:
        #     torch.optim.swa_utils.update_bn(self.data.to(self.device), self.swa_model)
        # self.save(name=self.cfg.directory.model_name + "-final" + str(self.epoch) + "-swa")

        # macs, params = op_counter(self.model, sample=x)
        # print(macs, params)
        # self.logger.log_metrics({"GFLOPS": macs[:-1], "#Params": params[:-1], "task name": task.get_name(), "total_loss": self.e_loss[-1]})
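        # NOTE (sketch, not part of the original script): torch.optim.swa_utils.update_bn
        # takes the DataLoader itself plus an optional `device` keyword; a DataLoader has
        # no .to() method, so a working finalization call would look like:
        #     torch.optim.swa_utils.update_bn(self.data, self.swa_model, device=self.device)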
        print("Training Finished!")
        return loss

    def train_multi(self, primary_task, auxiliary_tasks):

        # 1. move all tasks (auxiliary and primary) to the GPU
        for auxiliary_task in auxiliary_tasks:
            auxiliary_task.go_to_gpu(self.device)
        primary_task.go_to_gpu(self.device)

        activation_function = nn.Sigmoid()

        while self.epoch <= self.cfg.train_params.epochs:
            running_loss = []
            self.model.train()

            for internel_iter, (images, gt_boxes, gt_labels, ego_labels,
                                counts, img_indexs,
                                wh) in enumerate(self.data):
                self.optimizer.zero_grad()

                x = images
                x = x.to(device=self.device)
                encoded_vector = self.model(x)

                total_loss = None
                # for auxiliary tasks
                for auxiliary_task in auxiliary_tasks:
                    y = auxiliary_task.get_flat_label(gt_labels)
                    # move data to device
                    y = y.to(device=self.device)
                    # forward
                    out = auxiliary_task.decode(encoded_vector)
                    auxiliary_loss = self.criterion(activation_function(out),
                                                    y)
                    if total_loss is None:
                        total_loss = auxiliary_loss
                    else:
                        total_loss += auxiliary_loss

                # for primary task
                y = primary_task.get_flat_label(gt_labels)
                # move data to device
                y = y.to(device=self.device)
                # forward
                out = primary_task.decode(encoded_vector)
                primary_loss = self.criterion(activation_function(out), y)
                total_loss += primary_loss

                total_loss.backward()
                # check grad norm for debugging
                grad_norm = check_grad_norm(self.model)
                # update
                self.optimizer.step()

                running_loss.append(primary_loss.item())

                self.logger.log_metrics(
                    {
                        # "epoch": self.epoch,
                        "batch": internel_iter,
                        "primary_loss": primary_loss.item(),
                        "GradNorm": grad_norm,
                    },
                    epoch=self.epoch)

                # periodically log sample predictions for a quick visual check
                if internel_iter % 1000 == 0 and self.epoch % 5 == 0:
                    print("Internel iter: ", internel_iter)
                    out = activation_function(out[-1])
                    definitions = []
                    l = primary_task.boundary[1] - primary_task.boundary[0]
                    n_boxes = len(gt_boxes[-1][-1])
                    print("Number of Boxes:", n_boxes)
                    name = "img_" + str(self.epoch) + "_" + str(
                        internel_iter / 1000)
                    for i in range(n_boxes):
                        prediction = out[i * l + 1 + i:i * l + l + 1 + i]
                        prediction = prediction.argmax()
                        definitions.append(
                            name + ": " + self._labels_definition[
                                primary_task.get_name()][prediction])

                    print("list", definitions)
                    sz = wh[0][0].item()
                    img = torch.zeros([3, sz, sz])
                    img[0] = images[-1][self.cfg.dataloader.seq_len - 1]
                    img[1] = images[-1][2 * self.cfg.dataloader.seq_len - 1]
                    img[2] = images[-1][3 * self.cfg.dataloader.seq_len - 1]

                    img_with_text = draw_text(img, definitions)
                    self.logger.log_image(img_with_text,
                                          name=name,
                                          image_channels='first')

            # Visualize
            # self.predict_visualize(index_list=visualize_idx, task=task)

            if self.epoch > self.cfg.train_params.swa_start:
                self.swa_model.update_parameters(self.model)
                self.swa_scheduler.step()
            else:
                self.lr_scheduler.step()

            # validate on val set
            # val_loss, t = self.validate()
            # t /= len(self.val_dataset)

            # average loss for an epoch
            self.e_loss.append(np.mean(running_loss))  # epoch loss
            # print(
            #     f"{datetime.now():%Y-%m-%d %H:%M:%S} Epoch {self.epoch} summary: train Loss: {self.e_loss[-1]:.2f} \t| Val loss: {val_loss:.2f}"
            #     f"\t| time: {t:.3f} seconds"
            # )

            self.logger.log_metrics({
                "epoch": self.epoch,
                "epoch_loss": self.e_loss[-1],
            })

            # early_stopping needs the validation loss to check if it has decreased,
            # and if it has, it will make a checkpoint of the current model
            # self.early_stopping(val_loss, self.model)

            if self.early_stopping.early_stop:
                print("Early stopping")
                self.save()
                break

            if self.epoch % self.cfg.train_params.save_every == 0:
                self.save()

            gc.collect()
            print("Task: " + primary_task.get_name() + " epoch[" +
                  str(self.epoch) + "] finished.")
            self.epoch += 1

        # Update bn statistics for the swa_model at the end
        # if self.epoch >= self.cfg.train_params.swa_start:
        #            torch.optim.swa_utils.update_bn(self.data.to(self.device), self.swa_model)
        # self.save(name=self.cfg.directory.model_name + "-final" + str(self.epoch) + "-swa")

        # macs, params = op_counter(self.model, sample=x)
        # print(macs, params)
        # self.logger.log_metrics({"GFLOPS": macs[:-1], "#Params": params[:-1], "task name": task.get_name(), "total_loss": self.e_loss[-1]})
        print("Training Finished!")
        return primary_loss

    def predict_visualize(self, index_list, task):
        print("===================================================")
        for i in index_list:
            images, gt_boxes, gt_labels, ego_labels, counts, img_indexs, wh = \
                self.data.dataset[i]
            sz = img_indexs[0]

            y = task.get_flat_label(gt_labels)
            x = images

            # move data to device
            x = x.to(device=self.device)
            y = y.to(device=self.device)

            encoded_vector = self.model(x)
            out = task.decode(encoded_vector)
            self.log_image_with_text(img_tensor=images,
                                     out_vector=out,
                                     index=i,
                                     task=task)
        print("===================================================")

    def log_image_with_text(self, img_tensor, out_vector, index, task):
        definitions = []
        label_len = task.boundary[1] - task.boundary[0]
        name = "img_" + str(index)
        i = 0
        while True:
            finished = out_vector[i]
            if finished == True:
                break
            i += 1

            l = out_vector[i, label_len]
            i += label_len
            if len(np.nonzero(l)) > 0:
                definition_idx = np.nonzero(l)[0][0]
                definitions.append(
                    name + ": " +
                    self._labels_definition[task.get_name()][definition_idx])

        print(definitions)
        self.logger.log_image(img_tensor, name=name, image_channels='first')

    def log_image_with_text_on_it(self, img_tensor, labels, task):
        definitions = []
        box_count = len(labels)
        for j in range(min(box_count, VisionTask._max_box_count)):
            l = labels[j]  # len(l) = 149
            l = l[task.boundary[0]:task.boundary[1]]
            if len(np.nonzero(l)) > 0:
                definition_idx = np.nonzero(l)[0][0]
                definitions.append(
                    self._labels_definition[task.get_name()][definition_idx])

        img = draw_text(img_tensor, definitions)
        print(definitions)
        # print(images.shape)
        self.logger.log_image(img, name="v", image_channels='first')

    # @timeit
    # @torch.no_grad()
    # def validate(self):
    #
    #     self.model.eval()
    #
    #     running_loss = []
    #
    #     for idx, (x, y) in tqdm(enumerate(self.val_data), desc="Validation"):
    #         # move data to device
    #         x = x.to(device=self.device)
    #         y = y.to(device=self.device)
    #
    #         # forward, backward
    #         if self.epoch > self.cfg.train_params.swa_start:
    #             # Update bn statistics for the swa_model
    #             torch.optim.swa_utils.update_bn(self.data, self.swa_model)
    #             out = self.swa_model(x)
    #         else:
    #             out = self.model(x)
    #
    #         loss = self.criterion(out, y)
    #         running_loss.append(loss.item())
    #
    #     # average loss
    #     loss = np.mean(running_loss)
    #
    #     return loss

    def init_logger(self, cfg):
        logger = None
        # Check to see if there is a key in environment:
        EXPERIMENT_KEY = cfg.experiment_key

        # First, let's see if we continue or start fresh:
        CONTINUE_RUN = cfg.resume
        if (EXPERIMENT_KEY is not None):
            # There is one, but the experiment might not exist yet:
            api = comet_ml.API()  # Assumes API key is set in config/env
            try:
                api_experiment = api.get_experiment_by_id(EXPERIMENT_KEY)
            except Exception:
                api_experiment = None
            if api_experiment is not None:
                CONTINUE_RUN = True
                # We can get the last details logged here, if logged:
                # step = int(api_experiment.get_parameters_summary("batch")["valueCurrent"])
                # epoch = int(api_experiment.get_parameters_summary("epochs")["valueCurrent"])

        if CONTINUE_RUN:
            # 1. Recreate the state of ML system before creating experiment
            # otherwise it could try to log params, graph, etc. again
            # ...
            # 2. Setup the existing experiment to carry on:
            logger = comet_ml.ExistingExperiment(
                previous_experiment=EXPERIMENT_KEY,
                log_env_details=True,  # to continue env logging
                log_env_gpu=True,  # to continue GPU logging
                log_env_cpu=True,  # to continue CPU logging
                auto_histogram_weight_logging=True,
                auto_histogram_gradient_logging=True,
                auto_histogram_activation_logging=True)
            # Retrieved from above APIExperiment
            # self.logger.set_epoch(epoch)

        else:
            # 1. Create the experiment first
            #    This will use the COMET_EXPERIMENT_KEY if defined in env.
            #    Otherwise, you could manually set it here. If you don't
            #    set COMET_EXPERIMENT_KEY, the experiment will get a
            #    random key!
            logger = comet_ml.Experiment(
                disabled=cfg.disabled,
                project_name=cfg.project,
                auto_histogram_weight_logging=True,
                auto_histogram_gradient_logging=True,
                auto_histogram_activation_logging=True)
            logger.add_tags(cfg.tags.split())
            logger.log_parameters(self.cfg)

        return logger

    def save(self, name=None):
        checkpoint = {
            "epoch": self.epoch,
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "lr_scheduler": self.lr_scheduler.state_dict(),
            "best": self.best,
            "e_loss": self.e_loss
        }

        if name is None and self.epoch >= self.cfg.train_params.swa_start:
            save_name = self.cfg.directory.model_name + str(
                self.epoch) + "-swa"
            checkpoint['model-swa'] = self.swa_model.state_dict()

        elif name is None:
            save_name = self.cfg.directory.model_name + str(self.epoch)

        else:
            save_name = name

        if self.e_loss[-1] < self.best:
            self.best = self.e_loss[-1]
            checkpoint["best"] = self.best
            save_checkpoint(checkpoint, True, self.cfg.directory.save,
                            save_name)
        else:
            save_checkpoint(checkpoint, False, self.cfg.directory.save,
                            save_name)