def fit_model(self):
    """Fit the model in place.

    Training runs AdamW under a cosine-annealing warm-restart schedule for
    the first phase; once ``swa_start`` epochs have passed, stochastic
    weight averaging takes over: the averaged copy in ``self.swa_model``
    is updated every epoch and the SWA learning-rate schedule is stepped
    instead. Expects ``self.model``, ``self.x`` and ``self.y`` to be set;
    ``self.model`` must return a distribution supporting ``log_prob``.
    """
    opt = torch.optim.AdamW(self.model.parameters(), lr=0.001)
    warm_restart_sched = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        opt, 100, 2
    )
    self.swa_model = AveragedModel(self.model)
    swa_start = 750
    swa_sched = SWALR(
        opt, swa_lr=0.001, anneal_epochs=10, anneal_strategy="cos"
    )
    self.model.train()
    self.swa_model.train()
    for epoch in range(1000):
        opt.zero_grad()
        # Minimise the negative log-likelihood of the targets.
        output = self.model(self.x)
        loss = -output.log_prob(self.y.view(-1, 1)).sum()
        loss.backward()
        opt.step()
        if epoch > swa_start:
            # SWA phase: fold current weights into the running average.
            self.swa_model.update_parameters(self.model)
            swa_sched.step()
        else:
            warm_restart_sched.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch} complete. Loss: {loss}")
def __init__(self, cfg_dir: str):
    """Set up all training state from the config file at ``cfg_dir``.

    Builds the logger, device, dataloaders, model, optimizer/scheduler,
    loss, optional checkpoint resume, early stopping, and — when the
    configured epoch count exceeds ``swa_start`` — the SWA averaged model
    and its SWALR scheduler.
    """
    # load config file and initialize the logger and the device
    self.cfg = get_conf(cfg_dir)
    self.logger = self.init_logger(self.cfg.logger)
    self.device = self.init_device()
    # creating dataset interface and dataloader for trained data
    self.data, self.val_data = self.init_dataloader()
    # create model and initialize its weights and move them to the device
    self.model = self.init_model()
    # initialize the optimizer
    self.optimizer, self.lr_scheduler = self.init_optimizer()
    # define loss function
    self.criterion = torch.nn.CrossEntropyLoss()
    # if resuming, load the checkpoint
    self.if_resume()
    # initialize the early_stopping object
    self.early_stopping = EarlyStopping(
        patience=self.cfg.train_params.patience,
        verbose=True,
        delta=self.cfg.train_params.early_stopping_delta,
    )
    # stochastic weight averaging: only built when training lasts long
    # enough to reach the configured SWA start epoch
    if self.cfg.train_params.epochs > self.cfg.train_params.swa_start:
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)
def training_epoch_end(self, outputs):
    """Lightning hook run at the end of each training epoch.

    Logs the current epoch index and learning rate, and — once the
    configured SWA start epoch is reached — lazily creates the SWA
    averaged model and SWALR scheduler, then folds the current network
    weights into the running average.
    """
    self.log('epoch_now', self.current_epoch, on_step=False, on_epoch=True, logger=True)
    (oppp) = self.optimizers(use_pl_optimizer=True)
    self.log('lr_now', self.get_lr_inside(oppp), on_step=False, on_epoch=True, logger=True)
    # https://github.com/PyTorchLightning/pytorch-lightning/issues/3095
    if self.learning_params["swa"] and (
            self.current_epoch >= self.learning_params["swa_start_epoch"]):
        if self.swa_model is None:
            # First SWA epoch: wrap the network and build the SWA LR schedule.
            (optimizer) = self.optimizers(use_pl_optimizer=True)
            print("creating_swa")
            self.swa_model = AveragedModel(self.network)
            self.new_scheduler = SWALR(
                optimizer,
                anneal_strategy="linear",
                anneal_epochs=5,
                swa_lr=self.learning_params["swa_lr"])
        # https://pytorch.org/blog/pytorch-1.6-now-includes-stochastic-weight-averaging/
        self.swa_model.update_parameters(self.network)
        self.new_scheduler.step()
def __init__(self, cfg_dir: str, data_loader: DataLoader, model, labels_definition):
    """Set up the trainer: config, logger, model, optimizer, scheduler,
    loss, optional checkpoint resume, early stopping, and SWA.

    Args:
        cfg_dir: path to the config file read by ``get_conf``.
        data_loader: training dataloader.
        model: network to train; moved onto the configured device.
        labels_definition: label metadata kept for later use.

    Raises:
        ValueError: if the configured optimizer name is not adam/rmsprop.
    """
    self.cfg = get_conf(cfg_dir)
    self._labels_definition = labels_definition  #TODO
    self.logger = self.init_logger(self.cfg.logger)
    #self.dataset = CustomDataset(**self.cfg.dataset)
    self.data = data_loader
    #self.val_dataset = CustomDatasetVal(**self.cfg.val_dataset)
    #self.val_data = DataLoader(self.val_dataset, **self.cfg.dataloader)
    # self.logger.log_parameters({"tr_len": len(self.dataset),
    #                             "val_len": len(self.val_dataset)})
    self.model = model
    #self.model._resnet.conv1.apply(init_weights_normal)
    self.device = self.cfg.train_params.device
    self.model = self.model.to(device=self.device)
    # pick the optimizer named in the config (adam or rmsprop only)
    if self.cfg.train_params.optimizer.lower() == "adam":
        self.optimizer = optim.Adam(self.model.parameters(), **self.cfg.adam)
    elif self.cfg.train_params.optimizer.lower() == "rmsprop":
        self.optimizer = optim.RMSprop(self.model.parameters(), **self.cfg.rmsprop)
    else:
        raise ValueError(
            f"Unknown optimizer {self.cfg.train_params.optimizer}")
    self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        self.optimizer, T_max=100)
    self.criterion = nn.BCELoss()
    if self.cfg.logger.resume:
        # load checkpoint and restore model/optimizer/scheduler state
        print("Loading checkpoint")
        save_dir = self.cfg.directory.load
        checkpoint = load_checkpoint(save_dir, self.device)
        self.model.load_state_dict(checkpoint["model"])
        self.optimizer.load_state_dict(checkpoint["optimizer"])
        self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        self.epoch = checkpoint["epoch"]
        self.e_loss = checkpoint["e_loss"]
        self.best = checkpoint["best"]
        print(
            f"{datetime.now():%Y-%m-%d %H:%M:%S} "
            f"Loading checkpoint was successful, start from epoch {self.epoch}"
            f" and loss {self.best}")
    else:
        # fresh run: start at epoch 1 with no best loss recorded yet
        self.epoch = 1
        self.best = np.inf
        self.e_loss = []
    # initialize the early_stopping object
    self.early_stopping = EarlyStopping(
        patience=self.cfg.train_params.patience,
        verbose=True,
        delta=self.cfg.train_params.early_stopping_delta,
    )
    # stochastic weight averaging
    self.swa_model = AveragedModel(self.model)
    self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)
def train(num_epochs, model, data_loader, val_loader, val_every, device, file_name):
    """Train a 12-class segmentation model with AdamP + Lookahead and SWA.

    Combines a smoothed cross-entropy loss with a Jaccard loss, tracks mIoU
    on training batches, validates every ``val_every`` epochs (saving the
    best model by validation mIoU), and after epoch 3 updates the SWA
    averaged model and steps the SWA scheduler once per epoch.
    Relies on module-level ``Lookahead``, ``add_hist``,
    ``label_accuracy_score``, ``validation``, ``save_model`` and
    ``saved_dir``.
    """
    learning_rate = 0.0001
    from torch.optim.swa_utils import AveragedModel, SWALR
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP
    criterion = [SoftCrossEntropyLoss(smooth_factor=0.1), JaccardLoss('multiclass', classes=12)]
    optimizer = AdamP(params=model.parameters(), lr=learning_rate, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=learning_rate)
    swa_model = AveragedModel(model)
    # Lookahead wraps the base optimizer; zero_grad/step go through it.
    look = Lookahead(optimizer, la_alpha=0.5)
    print('Start training..')
    best_miou = 0
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))  # confusion-matrix accumulator (12 classes)
        model.train()
        for step, (images, masks, _) in enumerate(data_loader):
            loss = 0
            images = torch.stack(images)  # (batch, channel, height, width)
            masks = torch.stack(masks).long()  # (batch, channel, height, width)
            # move tensors to the device for GPU computation
            images, masks = images.to(device), masks.to(device)
            # inference
            outputs = model(images)
            # loss computation: sum of both criteria (cross entropy + Jaccard)
            for i in criterion:
                loss += i(outputs, masks)
            look.zero_grad()
            loss.backward()
            look.step()
            outputs = torch.argmax(outputs.squeeze(), dim=1).detach().cpu().numpy()
            hist = add_hist(hist, masks.detach().cpu().numpy(), outputs, n_class=12)
            acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
            # print loss and mIoU every 25 steps
            if (step + 1) % 25 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU: {:.4f}'.format(
                    epoch + 1, num_epochs, step + 1, len(data_loader), loss.item(), mIoU))
        # run validation every val_every epochs and keep the best model by mIoU
        if (epoch + 1) % val_every == 0:
            avrg_loss, val_miou = validation(epoch + 1, model, val_loader, criterion, device)
            if val_miou > best_miou:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_miou = val_miou
                save_model(model, file_name = file_name)
        # after a short warm-up, fold current weights into the SWA average
        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
def test_fit_swa_cuda(self):
    """Smoke-test ``cnn.fit`` with an SWA schedule over all supported models.

    NOTE(review): despite the name, this runs with device="cpu" — confirm
    whether a CUDA device was intended here.
    """
    for model_name in supported_tv_models:
        model = cnn.create_cnn(model_name, 10, pretrained=None)
        opt = torch.optim.Adam(model.parameters(), lr=1e-3)
        loss = nn.CrossEntropyLoss()
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=300)
        swa_scheduler = SWALR(opt, anneal_strategy="linear", anneal_epochs=20, swa_lr=0.05)
        swa_start = 2
        history = cnn.fit(model, 3, train_loader, val_loader, loss,
                          device="cpu", optimizer=opt, scheduler=scheduler,
                          num_batches=10, swa_start=swa_start,
                          swa_scheduler=swa_scheduler)
        # fit must return a history dict with train/val metric sub-dicts
        self.assertIsInstance(history, Dict)
        exp_keys = ("train", "val")
        for exp_k in exp_keys:
            self.assertTrue(exp_k in history.keys())
        exp_keys2 = ("top1_acc", "top5_acc", "loss")
        for exp_k2 in exp_keys2:
            self.assertTrue(exp_k2 in history["train"].keys())
            self.assertTrue(exp_k2 in history["val"].keys())
def on_train_epoch_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'):
    """Drive stochastic weight averaging from a Lightning callback.

    On the SWA start epoch this swaps the trainer's LR scheduler for an
    SWALR schedule; while current_epoch is within [swa_start, swa_end] it
    folds the module's weights into the running average; one epoch after
    swa_end it copies the averaged weights back into the module and sets
    up a single extra pass over the train loader (backward skipped) to
    recompute BatchNorm statistics.
    """
    if trainer.current_epoch == self.swa_start:
        # move average model to request device.
        self._average_model = self._average_model.to(self._device or pl_module.device)
        optimizers = trainer.optimizers
        for param_group in optimizers[0].param_groups:
            if self._swa_lrs is None:
                initial_lr = param_group["lr"]
            elif isinstance(self._swa_lrs, float):
                initial_lr = self._swa_lrs
            else:
                initial_lr = self._swa_lrs[0]
            param_group["initial_lr"] = initial_lr
        # NOTE(review): this overwrites self._swa_lrs with the last group's
        # scalar LR, discarding any per-group list — confirm intended.
        self._swa_lrs = initial_lr
        self._swa_scheduler = SWALR(
            optimizers[0],
            swa_lr=initial_lr,
            anneal_epochs=self._annealing_epochs,
            anneal_strategy=self._annealing_strategy,
            last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1
        )
        _scheduler_config = _get_default_scheduler_config()
        assert _scheduler_config["interval"] == "epoch" and _scheduler_config["frequency"] == 1
        _scheduler_config["scheduler"] = self._swa_scheduler
        if trainer.lr_schedulers:
            # replace any existing scheduler with the SWA one
            lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
            rank_zero_warn(f"Swapping lr_scheduler {lr_scheduler} for {self._swa_scheduler}")
            trainer.lr_schedulers[0] = _scheduler_config
        else:
            trainer.lr_schedulers.append(_scheduler_config)
        # counter used by update_parameters to weight the running average
        self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device)
    if self.swa_start <= trainer.current_epoch <= self.swa_end:
        self.update_parameters(self._average_model, pl_module, self.n_averaged, self.avg_fn)
    # Note: No > here in case the callback is saved with the model and training continues
    if trainer.current_epoch == self.swa_end + 1:
        # Transfer weights from average model to pl_module
        self.transfer_weights(self._average_model, pl_module)
        # Reset BatchNorm for update
        self.reset_batch_norm_and_save_state(pl_module)
        # There is no need to perform either backward or optimizer.step as we are
        # performing only one pass over the train data-loader to compute activation statistics
        # Therefore, we will virtually increase `num_training_batches` by 1 and skip backward.
        trainer.num_training_batches += 1
        trainer.train_loop._skip_backward = True
        self._accumulate_grad_batches = trainer.accumulate_grad_batches
        trainer.accumulate_grad_batches = len(trainer.train_dataloader)
def _configure_optimizers(self, ) -> None:
    """Instantiate the optimizer, LR scheduler, and optional SWA objects.

    ``self._optimizer`` / ``self._lr_scheduler`` start out holding classes
    and are replaced here with concrete instances (the scheduler becomes a
    dict carrying its update interval). When ``self.swa_args`` is given,
    an SWA scheduler dict and an averaged copy of the network are created
    as well.
    """
    if self._optimizer is not None:
        self._optimizer = self._optimizer(
            self._network.parameters(), **self.optimizer_args)
    else:
        self._optimizer = None
    if self._optimizer and self._lr_scheduler is not None:
        sched_args = self.lr_scheduler_args
        if "steps_per_epoch" in sched_args:
            # per-step schedulers (e.g. OneCycleLR) need the true batch count
            sched_args["steps_per_epoch"] = len(self.train_dataloader())
        # Assume lr scheduler should update at each epoch if not specified.
        interval = sched_args.pop("interval") if "interval" in sched_args else "epoch"
        self._lr_scheduler = {
            "lr_scheduler": self._lr_scheduler(self._optimizer, **sched_args),
            "interval": interval,
        }
    if self.swa_args is not None:
        self._swa_scheduler = {
            "swa_scheduler": SWALR(self._optimizer, swa_lr=self.swa_args["lr"]),
            "swa_start": self.swa_args["start"],
        }
        self._swa_network = AveragedModel(self._network).to(self.device)
def get_swa(optimizer, model, swa_lr=0.005, anneal_epochs=10, anneal_strategy="cos"):
    '''Build the SWA helpers for a model/optimizer pair.

    Arguments:
        optimizer (torch.optim.Optimizer): wrapped optimizer
        model (torch.nn.Module): model whose weights will be averaged
        swa_lr (float or list): the learning rate value for all param
            groups together or separately for each group (default: 0.005)
        anneal_epochs (int): number of epochs in the annealing phase
            (default: 10)
        anneal_strategy (str): "cos" or "linear"; specifies the annealing
            strategy: "cos" for cosine annealing, "linear" for linear
            annealing (default: "cos")

    Returns:
        tuple: ``(swa_scheduler, swa_model)`` — the SWALR scheduler and
        the AveragedModel wrapper around ``model``.
    '''
    # Bug fix: the old docstring documented a non-existent `last_epoch`
    # parameter (with a bogus default of 'cos'); the signature never took it.
    swa_model = AveragedModel(model)
    swa_scheduler = SWALR(optimizer, swa_lr=swa_lr,
                          anneal_epochs=anneal_epochs,
                          anneal_strategy=anneal_strategy)
    return swa_scheduler, swa_model
def train_model(indep_vars, dep_var, verbose=True):
    """
    Trains MDNVol network. Uses AdamW optimizer with cosine annealing
    learning rate schedule. Outputs averaged model over the last 25% of
    training epochs.

    indep_vars: n x m torch tensor containing independent variables
                n = number of data points
                m = number of input variables
    dep_var: n x 1 torch tensor containing single dependent variable
             n = number of data points
             1 = single output variable
    verbose: when True, print the loss every 10 epochs
    """
    # assumes MDN's arguments are (n_inputs, n_outputs, hidden, components)
    # — TODO confirm against the MDN definition
    model = MDN(indep_vars.shape[1], 1, 250, 5)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, 100, 2)
    swa_model = AveragedModel(model)
    # averaging begins after epoch 750, i.e. the last 25% of 1000 epochs
    swa_start = 750
    swa_scheduler = SWALR(optimizer, swa_lr=0.001, anneal_epochs=10,
                          anneal_strategy="cos")
    model.train()
    swa_model.train()
    for epoch in range(1000):
        optimizer.zero_grad()
        output = model(indep_vars)
        # negative log-likelihood of targets under the predicted distribution
        loss = -output.log_prob(dep_var).sum()
        loss.backward()
        optimizer.step()
        if epoch > swa_start:
            # SWA phase: update the running average, anneal toward swa_lr
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step()
        if epoch % 10 == 0:
            if verbose:
                print(f"Epoch {epoch} complete. Loss: {loss}")
    swa_model.eval()
    return swa_model
def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
    """Drive stochastic weight averaging from a Lightning callback.

    On the SWA start epoch, swaps the trainer's LR scheduler for an SWALR
    schedule (one swa_lr per param group). While current_epoch is within
    [swa_start, swa_end], folds the module's weights into the running
    average. One epoch after swa_end, copies the averaged weights back
    into the module and prepares a single extra pass over the train
    loader (backward skipped) to recompute BatchNorm statistics.
    """
    if trainer.current_epoch == self.swa_start:
        # move average model to request device.
        self._average_model = self._average_model.to(self._device or pl_module.device)
        optimizer = trainer.optimizers[0]
        # normalize swa_lrs into one learning rate per param group
        if self._swa_lrs is None:
            self._swa_lrs = [param_group["lr"] for param_group in optimizer.param_groups]
        if isinstance(self._swa_lrs, float):
            self._swa_lrs = [self._swa_lrs] * len(optimizer.param_groups)
        for lr, group in zip(self._swa_lrs, optimizer.param_groups):
            group["initial_lr"] = lr
        self._swa_scheduler = SWALR(
            optimizer,
            swa_lr=self._swa_lrs,
            anneal_epochs=self._annealing_epochs,
            anneal_strategy=self._annealing_strategy,
            last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1,
        )
        default_scheduler_cfg = _get_default_scheduler_config()
        assert default_scheduler_cfg["interval"] == "epoch" and default_scheduler_cfg["frequency"] == 1
        default_scheduler_cfg["scheduler"] = self._swa_scheduler
        if trainer.lr_schedulers:
            # replace any existing (epoch-interval) scheduler with SWALR
            scheduler_cfg = trainer.lr_schedulers[0]
            if scheduler_cfg["interval"] != "epoch" or scheduler_cfg["frequency"] != 1:
                rank_zero_warn(f"SWA is currently only supported every epoch. Found {scheduler_cfg}")
            rank_zero_info(
                f"Swapping scheduler `{scheduler_cfg['scheduler'].__class__.__name__}`"
                f" for `{self._swa_scheduler.__class__.__name__}`"
            )
            trainer.lr_schedulers[0] = default_scheduler_cfg
        else:
            trainer.lr_schedulers.append(default_scheduler_cfg)
        # counter used by update_parameters to weight the running average
        self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device)
    if self.swa_start <= trainer.current_epoch <= self.swa_end:
        self.update_parameters(self._average_model, pl_module, self.n_averaged, self.avg_fn)
    # Note: No > here in case the callback is saved with the model and training continues
    if trainer.current_epoch == self.swa_end + 1:
        # Transfer weights from average model to pl_module
        self.transfer_weights(self._average_model, pl_module)
        # Reset BatchNorm for update
        self.reset_batch_norm_and_save_state(pl_module)
        # There is no need to perform either backward or optimizer.step as we are
        # performing only one pass over the train data-loader to compute activation statistics
        # Therefore, we will virtually increase `num_training_batches` by 1 and skip backward.
        trainer.num_training_batches += 1
        trainer.fit_loop._skip_backward = True
        self._accumulate_grad_batches = trainer.accumulate_grad_batches
        trainer.accumulate_grad_batches = trainer.num_training_batches
def configure_optimizers(self):
    """Create the SGD optimizer; when SWA is enabled via hparams, also
    attach a linear-annealing SWALR scheduler as ``self.swa_scheduler``."""
    hp = self.hparams
    opt = torch.optim.SGD(self.parameters(), lr=hp.lr)
    if hp.use_swa:
        self.swa_scheduler = SWALR(
            opt,
            swa_lr=hp.swa_lr,
            anneal_strategy='linear',
            anneal_epochs=10,
        )
    return opt
class SWALRRunner(ClassificationRunner):
    """Classification runner variant that adds stochastic weight averaging.

    Before ``swa_start`` the parent scheduler logic runs; after it, each
    epoch folds the current weights into ``swa_model`` and steps the SWALR
    scheduler. At the end of training the averaged model's BatchNorm
    statistics are recomputed over the training loader.
    """

    def __init__(self, *args, **kwargs):
        super(SWALRRunner, self).__init__(*args, **kwargs)
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, swa_lr=0.05)
        # epoch after which SWA updates begin
        self.swa_start = 5

    def update_scheduler(self, epoch: int) -> None:
        """Step the SWA schedule after ``swa_start``; else defer to parent."""
        if epoch > self.swa_start:
            self.swa_model.update_parameters(self.model)
            self.swa_scheduler.step()
        else:
            super(SWALRRunner, self).update_scheduler(epoch)

    def train_end(self, outputs):
        """Recompute BatchNorm running stats for the averaged weights."""
        update_bn(self.loaders["train"], self.swa_model)
        return super(SWALRRunner, self).train_end(outputs)
def build_swa_model(cfg: CfgNode, model: torch.nn.Module, optimizer: torch.optim.Optimizer):
    """Construct the SWA pair (averaged model, SWALR scheduler) from config.

    The SWA learning rate is the solver base LR scaled by the configured
    SWA LR factor.
    """
    # Instead of copying weights during initialization, the SWA model copys
    # the model weights when self.update_parameters is first called.
    # https://github.com/pytorch/pytorch/blob/1.7/torch/optim/swa_utils.py#L107
    # The SWA model needs to be constructed for all processes in distributed
    # training, otherwise the training can get stuck.
    averaged = AveragedModel(model)
    swa_lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.SWA.LR_FACTOR
    return averaged, SWALR(optimizer, swa_lr=swa_lr)
def __init__(self, config: DNNConfig):
    """Initialize training state from ``config``.

    Sets up the MSE loss, the (optionally SAM-wrapped) optimizer, a
    cosine-annealing scheduler and, when ``config.isswa`` is truthy, the
    SWA averaged model and SWALR scheduler.

    Args:
        config: DNN configuration carrying epoch_num, device, lr,
            weight_decay, issam, optimizer_name, scheduler_name, T_max,
            and the optional isswa / swa_start flags.
    """
    self.config = config
    self.epochs = config.epoch_num
    self.device = config.device
    # NOTE(review): `tmp_model` comes from module scope — confirm this is
    # intentional and not a leftover from debugging.
    self.model = tmp_model
    self.criterion = nn.MSELoss()
    optimizer_kwargs = {
        'lr': config.lr,
        'weight_decay': config.weight_decay
    }
    self.sam = config.issam
    self.optimizer = make_optimizer(self.model,
                                    optimizer_kwargs,
                                    optimizer_name=config.optimizer_name,
                                    sam=config.issam)
    self.scheduler_name = config.scheduler_name
    self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer=self.optimizer, T_max=config.T_max)
    # Bug fix: `config.getattr('isswa', False)` called a (non-existent)
    # method on the config object; use the builtin getattr with a default.
    self.isswa = getattr(config, 'isswa', False)
    self.swa_start = getattr(config, 'swa_start', 0)
    # Bug fix: test the defaulted flag rather than re-reading the raw
    # attribute, which would raise AttributeError when `isswa` is absent.
    if self.isswa:
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, swa_lr=0.025)
    # per-epoch metric history, filled in by the training loop
    self.loss_log = {
        'train_loss': [],
        'train_score': [],
        'valid_loss': [],
        'valid_score': []
    }
def __init__(self, config):
    """Build model, optimizer, schedulers and SWA state from ``config``.

    Reads from config: optimizer, sam, scheduler, num_training_steps,
    num_warmup_steps, swa_start, swa_lr, criterion.
    """
    self.config = config
    # prefer GPU when one is available
    self.device = 'cuda' if cuda.is_available() else 'cpu'
    self.model = MLP(config)
    self.swa_model = AveragedModel(self.model)
    self.optimizer = make_optimizer(self.model,
                                    optimizer_name=self.config.optimizer,
                                    sam=self.config.sam)
    self.scheduler = make_scheduler(self.optimizer,
                                    decay_name=self.config.scheduler,
                                    num_training_steps=self.config.num_training_steps,
                                    num_warmup_steps=self.config.num_warmup_steps)
    self.swa_start = self.config.swa_start
    self.swa_scheduler = SWALR(self.optimizer, swa_lr=self.config.swa_lr)
    # epoch counter advanced by the training loop
    self.epoch_num = 0
    self.criterion = self.config.criterion
def configure_optimizers(self):
    """Lightning hook: build the optimizer and, depending on flags,
    a matching scheduler.

    Returns one of:
      * AdamW + OneCycleLR when ``use_one_cycle_lr_scheduler`` is set,
      * AdamW + StepLR when ``lr_decay`` is set,
      * a fresh SGD optimizer + SWALR when ``swa`` is set,
      * otherwise the bare AdamW optimizer.
    """
    if not hasattr(self, 'train_data'):
        self.prepare_data()
    # self.lr is either a single float or a spec expanded into per-param
    # learning-rate groups by get_param_lr_maps
    if type(self.lr) is float:
        optimizer = torch.optim.AdamW([{'params': self.model.parameters(), 'lr': self.lr}])
    else:
        param_lr_mappings, self.lr = self.get_param_lr_maps(self.lr)
        optimizer = torch.optim.AdamW(param_lr_mappings)
    if self.use_one_cycle_lr_scheduler:
        return (
            [optimizer],
            [
                torch.optim.lr_scheduler.OneCycleLR(
                    optimizer,
                    self.lr,
                    epochs=self.max_epochs,
                    steps_per_epoch=int(np.ceil(len(self.trainer.datamodule.train_dataset) / self.batch_size)),
                    div_factor=1e2
                )
            ]
        )
    elif self.lr_decay:
        return (
            [optimizer],
            [
                torch.optim.lr_scheduler.StepLR(
                    optimizer,
                    step_size=self.lr_decay_period,
                    gamma=self.lr_decay_gamma
                )
            ]
        )
    elif self.swa:
        # SWA path: discard the AdamW built above and use SGD instead
        if type(self.lr) is float:
            optimizer = torch.optim.SGD([{'params': self.model.parameters(), 'lr': self.lr}])
        else:
            param_lr_mappings, self.lr = self.get_param_lr_maps(self.lr)
            optimizer = torch.optim.SGD(param_lr_mappings)
        return [optimizer], [SWALR(optimizer, swa_lr=self.swa_lr)]
    else:
        return optimizer
def __init__(self):
    """Build the full training harness from the module-level ``args``.

    Creates train/valid data tuples, selects and loads the model variant,
    moves it to GPU, builds the AdamW optimizer (with either backbone/head
    LR groups or weight-decay groups), a warmup scheduler, and the
    optional SWA objects (contrib SWA optimizer wrapper and/or torch
    AveragedModel + SWALR).
    """
    if args.train is not None:
        self.train_tuple = get_tuple(args.train, bs=args.batch_size,
                                     shuffle=True, drop_last=False)
    if args.valid is not None:
        # large batch only when spread over multiple GPUs
        valid_bsize = 2048 if args.multiGPU else 50
        self.valid_tuple = get_tuple(args.valid, bs=valid_bsize,
                                     shuffle=False, drop_last=False)
    else:
        self.valid_tuple = None
    # Select Model, X is default
    if args.model == "X":
        self.model = ModelX(args)
    elif args.model == "V":
        self.model = ModelV(args)
    elif args.model == "U":
        self.model = ModelU(args)
    elif args.model == "D":
        self.model = ModelD(args)
    elif args.model == 'O':
        self.model = ModelO(args)
    else:
        print(args.model, " is not implemented.")
    # Load pre-trained weights from paths
    if args.loadpre is not None:
        self.model.load(args.loadpre)
    # GPU options
    if args.multiGPU:
        self.model.lxrt_encoder.multi_gpu()
    self.model = self.model.cuda()
    # Losses and optimizer
    self.logsoftmax = nn.LogSoftmax(dim=1)
    self.nllloss = nn.NLLLoss()
    if args.train is not None:
        batch_per_epoch = len(self.train_tuple.loader)
        # total optimizer updates, accounting for gradient accumulation
        self.t_total = int(batch_per_epoch * args.epochs // args.acc)
        print("Total Iters: %d" % self.t_total)

    def is_backbone(n):
        # parameter-name heuristic: encoder/embeddings/pooler = backbone
        if "encoder" in n:
            return True
        elif "embeddings" in n:
            return True
        elif "pooler" in n:
            return True
        print("F: ", n)
        return False

    no_decay = ['bias', 'LayerNorm.weight']
    params = list(self.model.named_parameters())
    if args.reg:
        # backbone at base LR, the rest (e.g. classifier head) at 500x
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in params if is_backbone(n)],
                "lr": args.lr
            },
            {
                "params": [p for n, p in params if not is_backbone(n)],
                "lr": args.lr * 500
            },
        ]
        for n, p in self.model.named_parameters():
            print(n)
        self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
    else:
        # standard split: weight decay everywhere except bias/LayerNorm
        optimizer_grouped_parameters = [{
            'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
            'weight_decay': args.wd
        }, {
            'params': [p for n, p in params if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
    if args.train is not None:
        # linear warmup over the first 10% of updates
        self.scheduler = get_linear_schedule_with_warmup(
            self.optim, self.t_total * 0.1, self.t_total)
    self.output = args.output
    os.makedirs(self.output, exist_ok=True)
    # SWA Method:
    if args.contrib:
        self.optim = SWA(self.optim, swa_start=self.t_total * 0.75,
                         swa_freq=5, swa_lr=args.lr)
    if args.swa:
        self.swa_model = AveragedModel(self.model)
        # averaging kicks in over the last 25% of updates
        self.swa_start = self.t_total * 0.75
        self.swa_scheduler = SWALR(self.optim, swa_lr=args.lr)
class HM:
    """Hateful-memes style trainer driven by the module-level ``args``.

    Owns the model, optimizer, schedulers and optional SWA state, and
    provides train / predict / evaluate / save / load.
    """

    def __init__(self):
        """Build data tuples, model, optimizer, scheduler and SWA objects."""
        if args.train is not None:
            self.train_tuple = get_tuple(args.train, bs=args.batch_size,
                                         shuffle=True, drop_last=False)
        if args.valid is not None:
            # large batch only when spread over multiple GPUs
            valid_bsize = 2048 if args.multiGPU else 50
            self.valid_tuple = get_tuple(args.valid, bs=valid_bsize,
                                         shuffle=False, drop_last=False)
        else:
            self.valid_tuple = None
        # Select Model, X is default
        if args.model == "X":
            self.model = ModelX(args)
        elif args.model == "V":
            self.model = ModelV(args)
        elif args.model == "U":
            self.model = ModelU(args)
        elif args.model == "D":
            self.model = ModelD(args)
        elif args.model == 'O':
            self.model = ModelO(args)
        else:
            print(args.model, " is not implemented.")
        # Load pre-trained weights from paths
        if args.loadpre is not None:
            self.model.load(args.loadpre)
        # GPU options
        if args.multiGPU:
            self.model.lxrt_encoder.multi_gpu()
        self.model = self.model.cuda()
        # Losses and optimizer
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.nllloss = nn.NLLLoss()
        if args.train is not None:
            batch_per_epoch = len(self.train_tuple.loader)
            # total optimizer updates, accounting for gradient accumulation
            self.t_total = int(batch_per_epoch * args.epochs // args.acc)
            print("Total Iters: %d" % self.t_total)

        def is_backbone(n):
            # parameter-name heuristic: encoder/embeddings/pooler = backbone
            if "encoder" in n:
                return True
            elif "embeddings" in n:
                return True
            elif "pooler" in n:
                return True
            print("F: ", n)
            return False

        no_decay = ['bias', 'LayerNorm.weight']
        params = list(self.model.named_parameters())
        if args.reg:
            # backbone at base LR, the rest (e.g. classifier head) at 500x
            optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in params if is_backbone(n)],
                    "lr": args.lr
                },
                {
                    "params": [p for n, p in params if not is_backbone(n)],
                    "lr": args.lr * 500
                },
            ]
            for n, p in self.model.named_parameters():
                print(n)
            self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
        else:
            # standard split: weight decay everywhere except bias/LayerNorm
            optimizer_grouped_parameters = [{
                'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
                'weight_decay': args.wd
            }, {
                'params': [p for n, p in params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }]
            self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
        if args.train is not None:
            # linear warmup over the first 10% of updates
            self.scheduler = get_linear_schedule_with_warmup(
                self.optim, self.t_total * 0.1, self.t_total)
        self.output = args.output
        os.makedirs(self.output, exist_ok=True)
        # SWA Method:
        if args.contrib:
            self.optim = SWA(self.optim, swa_start=self.t_total * 0.75,
                             swa_freq=5, swa_lr=args.lr)
        if args.swa:
            self.swa_model = AveragedModel(self.model)
            # averaging kicks in over the last 25% of updates
            self.swa_start = self.t_total * 0.75
            self.swa_scheduler = SWALR(self.optim, swa_lr=args.lr)

    def train(self, train_tuple, eval_tuple):
        """Train with gradient accumulation, periodic validation, and SWA.

        Validates every 250 optimizer updates (when a valid tuple exists),
        tracking the best ROC-AUC, and saves the final model as
        ``LAST<train-name>``.
        """
        dset, loader, evaluator = train_tuple
        iter_wrapper = (lambda x: tqdm(x, total=len(loader))
                        ) if args.tqdm else (lambda x: x)
        print("Batches:", len(loader))
        self.optim.zero_grad()
        best_roc = 0.
        ups = 0  # optimizer updates performed so far
        total_loss = 0.
        for epoch in range(args.epochs):
            if args.reg:
                if args.model != "X":
                    print(self.model.model.layer_weights)
            id2ans = {}
            id2prob = {}
            for i, (ids, feats, boxes, sent,
                    target) in iter_wrapper(enumerate(loader)):
                if ups == args.midsave:
                    self.save("MID")
                self.model.train()
                if args.swa:
                    self.swa_model.train()
                feats, boxes, target = feats.cuda(), boxes.cuda(), target.long(
                ).cuda()
                # Model expects visual feats as tuple of feats & boxes
                logit = self.model(sent, (feats, boxes))
                # Note: LogSoftmax does not change order, hence there should be nothing wrong with taking it as our prediction
                # In fact ROC AUC stays the exact same for logsoftmax / normal softmax, but logsoftmax is better for loss calculation
                # due to stronger penalization & decomplexifying properties (log(a/b) = log(a) - log(b))
                logit = self.logsoftmax(logit)
                score = logit[:, 1]
                if i < 1:
                    print(logit[0, :].detach())
                # Note: This loss is the same as CrossEntropy (We splitted it up in logsoftmax & neg. log likelihood loss)
                loss = self.nllloss(logit.view(-1, 2), target.view(-1))
                # Scaling loss by batch size, as we have batches with different sizes, since we do not "drop_last" & dividing by acc for accumulation
                # Not scaling the loss will worsen performance by ~2abs%
                loss = loss * logit.size(0) / args.acc
                loss.backward()
                total_loss += loss.detach().item()
                # Acts as argmax - extracting the higher score & the corresponding index (0 or 1)
                _, predict = logit.detach().max(1)
                # Getting labels for accuracy
                for qid, l in zip(ids, predict.cpu().numpy()):
                    id2ans[qid] = l
                # Getting probabilities for Roc auc
                for qid, l in zip(ids, score.detach().cpu().numpy()):
                    id2prob[qid] = l
                if (i + 1) % args.acc == 0:
                    # accumulated enough gradients: clip, step, maybe SWA
                    nn.utils.clip_grad_norm_(self.model.parameters(),
                                             args.clip)
                    self.optim.step()
                    if (args.swa) and (ups > self.swa_start):
                        self.swa_model.update_parameters(self.model)
                        self.swa_scheduler.step()
                    else:
                        self.scheduler.step()
                    self.optim.zero_grad()
                    ups += 1
                    # Do Validation in between
                    if ups % 250 == 0:
                        log_str = "\nEpoch(U) %d(%d): Train AC %0.2f RA %0.4f LOSS %0.4f\n" % (
                            epoch, ups, evaluator.evaluate(id2ans) * 100,
                            evaluator.roc_auc(id2prob) * 100, total_loss)
                        # Set loss back to 0 after printing it
                        total_loss = 0.
                        if self.valid_tuple is not None:  # Do Validation
                            acc, roc_auc = self.evaluate(eval_tuple)
                            if roc_auc > best_roc:
                                best_roc = roc_auc
                                best_acc = acc
                                # Only save BEST when no midsave is specified to save space
                                #if args.midsave < 0:
                                #    self.save("BEST")
                            log_str += "\nEpoch(U) %d(%d): DEV AC %0.2f RA %0.4f \n" % (
                                epoch, ups, acc * 100., roc_auc * 100)
                            log_str += "Epoch(U) %d(%d): BEST AC %0.2f RA %0.4f \n" % (
                                epoch, ups, best_acc * 100., best_roc * 100.)
                        print(log_str, end='')
                        # append the same summary to the run's log file
                        with open(self.output + "/log.log", 'a') as f:
                            f.write(log_str)
                            f.flush()
            # end of final epoch: finalize contrib-SWA weights and save
            if (epoch + 1) == args.epochs:
                if args.contrib:
                    self.optim.swap_swa_sgd()
                self.save("LAST" + args.train)

    def predict(self, eval_tuple: DataTuple, dump=None, out_csv=True):
        """Run inference over ``eval_tuple``.

        Returns (id2ans, id2prob); optionally dumps results to ``dump`` as
        CSV or the evaluator's native format. When SWA is enabled, class
        predictions come from the averaged model while probabilities come
        from the raw model's logits.
        """
        dset, loader, evaluator = eval_tuple
        id2ans = {}
        id2prob = {}
        for i, datum_tuple in enumerate(loader):
            ids, feats, boxes, sent = datum_tuple[:4]
            self.model.eval()
            if args.swa:
                self.swa_model.eval()
            with torch.no_grad():
                feats, boxes = feats.cuda(), boxes.cuda()
                logit = self.model(sent, (feats, boxes))
                # Note: LogSoftmax does not change order, hence there should be nothing wrong with taking it as our prediction
                logit = self.logsoftmax(logit)
                score = logit[:, 1]
                if args.swa:
                    logit = self.swa_model(sent, (feats, boxes))
                    logit = self.logsoftmax(logit)
                _, predict = logit.max(1)
                for qid, l in zip(ids, predict.cpu().numpy()):
                    id2ans[qid] = l
                # Getting probas for Roc Auc
                for qid, l in zip(ids, score.cpu().numpy()):
                    id2prob[qid] = l
        if dump is not None:
            if out_csv == True:
                evaluator.dump_csv(id2ans, id2prob, dump)
            else:
                evaluator.dump_result(id2ans, dump)
        return id2ans, id2prob

    def evaluate(self, eval_tuple: DataTuple, dump=None):
        """Evaluate all data in data_tuple."""
        id2ans, id2prob = self.predict(eval_tuple, dump=dump)
        acc = eval_tuple.evaluator.evaluate(id2ans)
        roc_auc = eval_tuple.evaluator.roc_auc(id2prob)
        return acc, roc_auc

    def save(self, name):
        """Save the (SWA-averaged, if enabled) model weights as <name>.pth."""
        if args.swa:
            torch.save(self.swa_model.state_dict(),
                       os.path.join(self.output, "%s.pth" % name))
        else:
            torch.save(self.model.state_dict(),
                       os.path.join(self.output, "%s.pth" % name))

    def load(self, path):
        """Load weights from ``path``, stripping SWA artifacts.

        Drops the ``n_averaged`` buffer and the ``module.`` prefix that
        AveragedModel checkpoints carry, so they load into the raw model.
        """
        print("Load model from %s" % path)
        state_dict = torch.load("%s" % path)
        new_state_dict = {}
        for key, value in state_dict.items():
            # N_averaged is a key in SWA models we cannot load, so we skip it
            if key.startswith("n_averaged"):
                print("n_averaged:", value)
                continue
            # SWA Models will start with module
            if key.startswith("module."):
                new_state_dict[key[len("module."):]] = value
            else:
                new_state_dict[key] = value
        state_dict = new_state_dict
        self.model.load_state_dict(state_dict)
def test_models():
    """Smoke-test the CIFAR-10 training pipeline end to end.

    Builds datasets/loaders, then for each listed torchvision model runs
    train_step / fit / sanity_fit combinations (with AMP when CUDA is
    available), finishing with a 3-epoch fit that exercises the SWA
    schedule. Returns 1 on completion.
    """
    # print("Setting Seed for the run, seed = {}".format(SEED))
    # utils.seed_everything(SEED)  # We don't need seeds for tests
    print("Creating Train and Validation Dataset")
    train_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))])
    valid_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))])
    train_set, valid_set = dataset.create_cifar10_dataset(
        train_transforms, valid_transforms)
    print("Train and Validation Datasets Created")
    print("Creating DataLoaders")
    # NOTE(review): train_set is passed twice — valid_set looks intended
    # for the second argument; confirm against create_loaders' signature.
    train_loader, valid_loader = dataset.create_loaders(train_set, train_set)
    print("Train and Validation Dataloaders Created")
    print("Creating Model")
    # only resnet18 is enabled to keep the test fast; others left commented
    all_supported_models = [
        "resnet18",
        # "resnet34",
        # "resnet50",
        # "resnet101",
        # "resnet152",
        # "resnext50_32x4d",
        # "resnext101_32x8d",
        # "vgg11",
        # "vgg13",
        # "vgg16",
        # "vgg19",
        # "mobilenet",
        # "mnasnet0_5",
        # "mnasnet1_0",
    ]
    for model_name in all_supported_models:
        model = model_factory.create_torchvision_model(
            model_name, num_classes=10, pretrained=False
        )  # We don't need pretrained True, we just need a forward pass
        if torch.cuda.is_available():
            print("Model Created. Moving it to CUDA")
        else:
            print("Model Created. Training on CPU only")
        model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=1e-3)
        criterion = (
            nn.CrossEntropyLoss()
        )  # All classification problems we need Cross entropy loss
        # early_stopper = utils.EarlyStopping(
        #     patience=7, verbose=True, path=SAVE_PATH
        # )  # We do not need early stopping too
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=300)
        swa_scheduler = SWALR(optimizer, anneal_strategy="linear",
                              anneal_epochs=20, swa_lr=0.05)
        swa_start = 2
        if torch.cuda.is_available():
            # AMP path: mixed-precision train step plus a short fp16 fit
            scaler = amp.GradScaler()
            train_metrics = engine.train_step(
                model,
                train_loader,
                criterion,
                device,
                optimizer,
                num_batches=10,
                fp16_scaler=scaler,
            )
            history2 = engine.fit(
                1,
                model,
                train_loader,
                valid_loader,
                criterion,
                device,
                optimizer,
                num_batches=10,
                grad_penalty=True,
                use_fp16=True,
            )
        train_metrics = engine.train_step(
            model,
            train_loader,
            criterion,
            device,
            optimizer,
            num_batches=10,
        )
        history = engine.sanity_fit(
            model,
            train_loader,
            valid_loader,
            criterion,
            device,
            num_batches=10,
            grad_penalty=True,
        )
        history2 = engine.fit(
            1,
            model,
            train_loader,
            valid_loader,
            criterion,
            device,
            optimizer,
            num_batches=10,
            grad_penalty=True,
        )
        # 3-epoch run exercising the SWA handoff at swa_start
        history3 = engine.fit(
            3,
            model,
            train_loader,
            valid_loader,
            criterion,
            device,
            optimizer,
            scheduler=scheduler,
            num_batches=10,
            grad_penalty=True,
            swa_start=swa_start,
            swa_scheduler=swa_scheduler,
        )
    print("Done !!")
    return 1
def training(model, train_dataloader, valid_dataloader, test_dataloader,
             model_cfg, fold_idx=1):
    """Train one cross-validation fold, optionally with SWA.

    After ``model_cfg.swa_start`` epochs the loop switches to SWA: BN
    statistics are refreshed with ``update_bn`` and validation/test use the
    averaged model. The best model (by dev AUC, or the last epoch when
    ``cfg.final_train``) is saved via ``save``.

    :param model: network to train (moved to the detected device here)
    :param train_dataloader/valid_dataloader/test_dataloader: fold loaders
    :param model_cfg: config object (lr, epochs, optimizer, scheduler, ...)
    :param fold_idx: 1-based fold number; fold 1 also prints the config
    """
    print("-------- ", str(fold_idx), " --------")
    global model_config
    model_config = model_cfg
    device = get_device()
    model.to(device)

    # Print the configuration only once (first fold).
    if fold_idx == 1:
        print('CONFIG: ')
        print([(v, getattr(model_config, v)) for v in dir(model_config)
               if v[:2] != "__"])
        print('MODEL: ', model)

    epochs = model_config.epochs
    if model_config.optimizer == 'AdamW':
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=float(model_config.lr),
            eps=float(model_config.eps),
            weight_decay=float(model_config.weight_decay))
    elif model_config.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=float(model_config.lr))

    if model_config.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(model_config.warmup_steps),
            num_training_steps=len(train_dataloader) * epochs)
    else:
        scheduler = None

    criterion = nn.BCEWithLogitsLoss()  # nn.CrossEntropyLoss()

    swa_model = AveragedModel(model)
    if model_config.swa_scheduler == 'linear':
        swa_scheduler = SWALR(optimizer, swa_lr=float(model_config.lr))
    else:
        # NOTE(review): falls back to a cosine schedule under the
        # "swa_scheduler" name — kept as-is for config compatibility.
        swa_scheduler = CosineAnnealingLR(optimizer, T_max=100)

    print('TRAINING...')
    training_stats = []
    best_dev_auc = float('-inf')
    with tqdm(total=epochs, leave=False) as pbar:
        for epoch_i in range(0, epochs):
            if epoch_i >= int(model_config.swa_start):
                # SWA phase: refresh BN stats, train, then average weights.
                update_bn(train_dataloader, swa_model)
                train_auc, train_acc, avg_train_loss = train(
                    model, train_dataloader, device, criterion, optimizer)
                swa_model.update_parameters(model)
                swa_scheduler.step()
                update_bn(valid_dataloader, swa_model)
                valid_auc, valid_acc, avg_dev_loss, dev_d = valid(
                    swa_model, valid_dataloader, device, criterion)
            else:
                train_auc, train_acc, avg_train_loss = train(
                    model, train_dataloader, device, criterion, optimizer,
                    scheduler=scheduler)
                valid_auc, valid_acc, avg_dev_loss, dev_d = valid(
                    model, valid_dataloader, device, criterion)

            if cfg.final_train:
                # Final training uses all data: no meaningful dev metrics.
                valid_auc = 0
                valid_acc = 0
                avg_dev_loss = 0

            add_stats(training_stats, avg_train_loss, avg_dev_loss,
                      train_acc, train_auc, valid_acc, valid_auc)

            # BUG FIX: the original used bitwise `&`/`|` with `not`, which
            # parsed as `not (cfg.final_train & ...)` due to precedence and
            # could trigger saves on every epoch. Use boolean `and`/`or`.
            if (cfg.final_train and epoch_i == epochs - 1) or \
                    (not cfg.final_train and valid_auc > best_dev_auc):
                best_dev_auc = valid_auc
                if epoch_i >= int(model_config.swa_start):
                    update_bn(test_dataloader, swa_model)
                    test_d = gen_test(swa_model, test_dataloader, device)
                    save(fold_idx, swa_model, optimizer, dev_d, test_d,
                         valid_auc)
                else:
                    test_d = gen_test(model, test_dataloader, device)
                    save(fold_idx, model, optimizer, dev_d, test_d, valid_auc)
            pbar.update(1)

    print('TRAINING COMPLETED')

    # Show training results
    col_names = [
        'train_loss', 'train_acc', 'train_auc', 'dev_loss', 'dev_acc',
        'dev_auc'
    ]
    training_stats = pd.DataFrame(training_stats, columns=col_names)
    print(training_stats.head(epochs))
    plot_training_results(training_stats, fold_idx)

    # If config, get best model and make submission
    if cfg.run['submission']:
        make_submission(model, test_dataloader)
def main():
    """Train a DownconvUnet in two phases: segmentation, then classification.

    Each phase runs its own optimizer/scheduler pair with SWA kicking in at
    75% of the phase's epochs, logging metrics to mlflow. The final weights
    are written to ``weights/donwconv_swa_weights.pth``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_size",
        default=8,
        type=int,
        help="batch size of both segmentation and classification training")
    parser.add_argument(
        "--seg_epoch",
        default=100,
        type=int,
        help="the number of epoch in the segmentation training")
    parser.add_argument(
        "--cls_epoch",
        default=20,
        type=int,
        help="the number of epoch in the classification training")
    parser.add_argument("--lr",
                        default=0.01,
                        type=float,
                        help="the learning rate of training")
    parser.add_argument("--swa_lr",
                        default=0.005,
                        type=float,
                        help="the stochastic learning rate of training")
    parser.add_argument(
        "--seg_weight",
        default=[0.1, 1],
        type=list,
        nargs='+',
        help="the weight of Binary Cross Entropy in the segmentation learning")
    parser.add_argument(
        "--cls_weight",
        default=[1, 1],
        type=list,
        nargs='+',
        help="the weight of Binary Cross Entropy in the classification learning"
    )
    parser.add_argument("--seed",
                        default=2021,
                        type=int,
                        help="the random seed")
    parser.add_argument(
        "--train_dir",
        default="/train_dir",
        type=str,
        help=
        "the train data directory. it consists of the both ng and ok directorys, and they have img and mask folders."
    )
    parser.add_argument(
        "--val_dir",
        default="/val_dir",
        type=str,
        help=
        "the validation data directory. it consists of the both ng and ok directorys, and they have img and mask folders."
    )
    args = parser.parse_args()

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    segmentation_train = True
    classification_train = True

    # Resolve the expected directory layout under train/val roots.
    train_dir = Path(args.train_dir)
    val_dir = Path(args.val_dir)
    train_ok_dir = str(train_dir / "ok")
    train_mask_dir = str(train_dir / "mask")
    train_ng_dir = str(train_dir / "ng")
    val_ok_dir = str(val_dir / "ok")
    val_mask_dir = str(val_dir / "mask")
    val_ng_dir = str(val_dir / "ng")

    seg_train_dataset = SegmentationDataset(img_dir=train_ng_dir,
                                            mask_dir=train_mask_dir,
                                            n_channels=3,
                                            classes=1,
                                            train=True)
    seg_val_dataset = SegmentationDataset(img_dir=val_ng_dir,
                                          mask_dir=val_mask_dir,
                                          n_channels=3,
                                          classes=1,
                                          train=False)
    cls_train_dataset = ClassificationDataset(ok_dir=train_ok_dir,
                                              ng_dir=train_ng_dir,
                                              n_channels=3,
                                              classes=1,
                                              train=True)
    cls_val_dataset = ClassificationDataset(ok_dir=val_ok_dir,
                                            ng_dir=val_ng_dir,
                                            n_channels=3,
                                            classes=1,
                                            train=False)

    # FIX: honor --batch_size (was hard-coded to 8, ignoring the argument).
    seg_train_loader = DataLoader(seg_train_dataset,
                                  batch_size=args.batch_size, shuffle=True)
    seg_val_loader = DataLoader(seg_val_dataset,
                                batch_size=args.batch_size, shuffle=True)
    cls_train_loader = DataLoader(cls_train_dataset,
                                  batch_size=args.batch_size, shuffle=True)
    cls_val_loader = DataLoader(cls_val_dataset,
                                batch_size=args.batch_size, shuffle=True)

    my_model = DownconvUnet(in_channel=3, seg_classes=1, cls_classes=2)
    avg_model = AveragedModel(my_model)  # SWA running average of my_model
    my_model.to(device)
    avg_model.to(device)

    with mlflow.start_run() as run:
        seg_args = Params(args.batch_size, args.seg_epoch, args.lr, args.seed,
                          args.seg_weight)
        cls_args = Params(args.batch_size, args.cls_epoch, args.lr, args.seed,
                          args.cls_weight)
        # FIX: log each mode's own parameters (previously seg_args was
        # logged under both the "seg" and "cls" prefixes).
        for mode, mode_args in (("seg", seg_args), ("cls", cls_args)):
            for key, value in vars(mode_args).items():
                mlflow.log_param(f"{mode}_{key}", value)

        # ---------- Segmentation phase ----------
        if segmentation_train:
            print("-" * 5 + "Segmentation training start" + "-" * 5)
            my_model.mode = 1  # model forward switches on .mode
            train_metrics = Metrics()
            val_metrics = Metrics()
            optimizer = torch.optim.Adam(my_model.parameters(),
                                         lr=seg_args.lr)
            scheduler = CosineAnnealingLR(optimizer, T_max=100)
            bce = WeightedBCELoss(weight=seg_args.weight)
            swa_start = int(seg_args.num_epoch * 0.75)
            # NOTE(review): anneal_epochs=swa_start mirrors the original
            # configuration — confirm this long anneal is intended.
            swa_scheduler = SWALR(optimizer,
                                  anneal_strategy='linear',
                                  anneal_epochs=swa_start,
                                  swa_lr=seg_args.swa_lr)
            for epoch in range(seg_args.num_epoch):
                # FIX: reset running sums each epoch (they were carried over
                # and re-divided, corrupting reported averages) and restore
                # train mode after the previous epoch's eval pass.
                my_model.train()
                train_loss, train_iou, train_acc = 0., 0., 0.
                val_loss, val_iou, val_acc = 0., 0., 0.
                for batch_idx, batch in enumerate(seg_train_loader):
                    batch = tuple(t.to(device) for t in batch)
                    seg_x, seg_y = batch
                    optimizer.zero_grad()
                    pred_y = my_model(seg_x)
                    loss = bce(pred_y, seg_y)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()
                    train_metrics.update(pred_y, seg_y, loss.item())
                    train_iou += train_metrics.iou
                    train_acc += train_metrics.acc
                    step = epoch * len(seg_train_loader) + batch_idx
                    for metric, value in vars(train_metrics).items():
                        mlflow.log_metric(f"seg_train_{metric}", value,
                                          step=step)
                train_loss /= len(seg_train_loader)
                train_iou /= len(seg_train_loader)
                train_acc /= len(seg_train_loader)

                my_model.eval()
                with torch.no_grad():  # no gradients needed for validation
                    for batch_idx, batch in enumerate(seg_val_loader):
                        batch = tuple(t.to(device) for t in batch)
                        seg_x, seg_y = batch
                        pred_y = my_model(seg_x)
                        loss = bce(pred_y, seg_y)
                        val_loss += loss.item()
                        # FIX: pass this batch's loss, not the running sum.
                        val_metrics.update(pred_y, seg_y, loss.item())
                        val_iou += val_metrics.iou
                        val_acc += val_metrics.acc
                        step = epoch * len(seg_val_loader) + batch_idx
                        for metric, value in vars(val_metrics).items():
                            mlflow.log_metric(f"seg_val_{metric}", value,
                                              step=step)
                val_loss /= len(seg_val_loader)
                val_iou /= len(seg_val_loader)
                val_acc /= len(seg_val_loader)

                print(f"Epoch {epoch + 1}:")
                print("-" * 10)
                print(
                    f"train_loss {train_loss:.3f}, train_iou: {train_iou:.3f}, "
                    f"train_accuracy: {train_acc:.3f}")
                print(f"val_loss {val_loss:.3f}, val_iou: {val_iou:.3f}, "
                      f"val_accuracy: {val_acc:.3f}")

                if epoch > swa_start:
                    print("Stochastic average start")
                    avg_model.update_parameters(my_model)
                    swa_scheduler.step()
                else:
                    scheduler.step()
            print("Segmentation train completed")

        # ---------- Classification phase ----------
        if classification_train:
            print("-" * 5 + "Classification training start" + "-" * 5)
            my_model.mode = 2
            train_metrics = Metrics()
            val_metrics = Metrics()
            optimizer = torch.optim.Adam(my_model.parameters(),
                                         lr=cls_args.lr)
            scheduler = CosineAnnealingLR(optimizer, T_max=100)
            bce = WeightedBCELoss(weight=cls_args.weight)
            swa_start = int(cls_args.num_epoch * 0.75)
            swa_scheduler = SWALR(optimizer,
                                  anneal_strategy='linear',
                                  anneal_epochs=swa_start,
                                  swa_lr=cls_args.swa_lr)
            for epoch in range(cls_args.num_epoch):
                my_model.train()
                # IoU is not computed for classification; kept at 0 so the
                # shared epoch summary format below still works.
                train_loss, train_iou, train_acc = 0., 0., 0.
                val_loss, val_iou, val_acc = 0., 0., 0.
                for batch_idx, batch in enumerate(cls_train_loader):
                    batch = tuple(t.to(device) for t in batch)
                    cls_x, cls_y = batch
                    optimizer.zero_grad()
                    pred_y = my_model(cls_x)
                    loss = bce(pred_y, cls_y)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()
                    # FIX: pass this batch's loss, not the running sum.
                    train_metrics.update(pred_y, cls_y, loss.item())
                    train_acc += train_metrics.acc
                    # FIX: use the classification loaders here (the original
                    # copy-pasted len(seg_*_loader) for steps and averages).
                    step = epoch * len(cls_train_loader) + batch_idx
                    for metric, value in vars(train_metrics).items():
                        mlflow.log_metric(f"cls_train_{metric}", value,
                                          step=step)
                train_loss /= len(cls_train_loader)
                train_acc /= len(cls_train_loader)

                my_model.eval()
                with torch.no_grad():
                    for batch_idx, batch in enumerate(cls_val_loader):
                        batch = tuple(t.to(device) for t in batch)
                        cls_x, cls_y = batch
                        pred_y = my_model(cls_x)
                        loss = bce(pred_y, cls_y)
                        val_loss += loss.item()
                        val_metrics.update(pred_y, cls_y, loss.item())
                        val_acc += val_metrics.acc
                        step = epoch * len(cls_val_loader) + batch_idx
                        # FIX: validation metrics were logged under the
                        # "cls_train_" tag, overwriting the training series.
                        for metric, value in vars(val_metrics).items():
                            mlflow.log_metric(f"cls_val_{metric}", value,
                                              step=step)
                val_loss /= len(cls_val_loader)
                val_acc /= len(cls_val_loader)

                print(f"Epoch {epoch + 1}:")
                print("-" * 10)
                print(
                    f"train_loss {train_loss:.3f}, train_iou: {train_iou:.3f}, "
                    f"train_accuracy: {train_acc:.3f}")
                print(f"val_loss {val_loss:.3f}, val_iou: {val_iou:.3f}, "
                      f"val_accuracy: {val_acc:.3f}")

                # FIX: the SWA update must run every epoch; it had drifted
                # outside the epoch loop in the classification branch.
                if epoch > swa_start:
                    print("Stochastic average start")
                    avg_model.update_parameters(my_model)
                    swa_scheduler.step()
                else:
                    scheduler.step()
            print("Classification train completed")

        # Path spelling ("donwconv") kept for compatibility with consumers.
        weight_path = "weights/donwconv_swa_weights.pth"
        torch.save(my_model.state_dict(), weight_path)
        print(f"model weight saved to {weight_path}")
def fit(
    self,
    train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
    evaluator: SentenceEvaluator = None,
    epochs: int = 1,
    steps_per_epoch=None,
    scheduler: str = 'WarmupLinear',
    warmup_steps: int = 10000,
    optimizer_class: Type[Optimizer] = transformers.AdamW,
    # NOTE(review): mutable default arguments below — safe only because the
    # body never mutates them; consider None-sentinels in a future change.
    optimizer_params: Dict[str, object] = {
        'lr': 2e-5,
        'eps': 1e-6,
        'correct_bias': False
    },
    weight_decay: float = 0.01,
    evaluation_steps: int = 0,
    output_path: str = None,
    save_best_model: bool = True,
    max_grad_norm: float = 1,
    use_amp: bool = False,
    callback: Callable[[float, int, int], None] = None,
    show_progress_bar: bool = True,
    log_every: int = 100,
    wandb_project_name: str = None,
    wandb_config: Dict[str, object] = {},
    use_swa: bool = False,
    swa_epochs_start: int = 5,
    swa_anneal_epochs: int = 10,
    swa_lr: float = 0.05,
):
    """
    Train the model with the given training objective.

    Each training objective is sampled in turn for one batch.
    We sample only as many batches from each objective as there are in the
    smallest one to make sure of equal training with each dataset.

    :param train_objectives: Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning
    :param evaluator: An evaluator (sentence_transformers.evaluation) evaluates the model performance during training on held-out dev data. It is used to determine the best model that is saved to disc.
    :param epochs: Number of epochs for training
    :param steps_per_epoch: Number of training steps per epoch. If set to None (default), one epoch is equal the DataLoader size from train_objectives.
    :param scheduler: Learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts
    :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning rate is increased from 0 up to the maximal learning rate. After these many training steps, the learning rate is decreased linearly back to zero.
    :param optimizer_class: Optimizer
    :param optimizer_params: Optimizer parameters
    :param weight_decay: Weight decay for model parameters
    :param evaluation_steps: If > 0, evaluate the model using evaluator after each number of training steps
    :param output_path: Storage path for the model and evaluation files
    :param save_best_model: If true, the best model (according to evaluator) is stored at output_path
    :param max_grad_norm: Used for gradient normalization.
    :param use_amp: Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0
    :param callback: Callback function that is invoked after each evaluation.
        It must accept the following three parameters in this order:
        `score`, `epoch`, `steps`
    :param show_progress_bar: If True, output a tqdm progress bar
    :param log_every: Log loss/lr to wandb every this many training steps
    :param wandb_project_name: If set (and wandb is available), metrics are logged to this wandb project
    :param wandb_config: Extra keyword arguments forwarded to ``wandb.init``
    :param use_swa: If True, maintain a Stochastic Weight Averaging copy of the model and return it at the end
    :param swa_epochs_start: Epoch index after which SWA parameter averaging begins
    :param swa_anneal_epochs: ``anneal_epochs`` passed to SWALR
    :param swa_lr: Constant learning rate used by SWALR once annealing finishes
    :return: the ``AveragedModel`` when ``use_swa`` is True, otherwise None
    """
    if use_amp:
        # Imported lazily so the method works on torch builds without AMP.
        from torch.cuda.amp import autocast
        scaler = torch.cuda.amp.GradScaler()

    self.to(self._target_device)

    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)

    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = self.smart_batching_collate

    loss_models = [loss for _, loss in train_objectives]
    for loss_model in loss_models:
        loss_model.to(self._target_device)

    self.best_score = -9999999

    if steps_per_epoch is None or steps_per_epoch == 0:
        # Equal sampling: cap every objective at the smallest dataloader.
        steps_per_epoch = min(
            [len(dataloader) for dataloader in dataloaders])

    num_train_steps = int(steps_per_epoch * epochs)

    # Prepare logger
    if wandb_available and wandb_project_name:
        # During a sweep, wandb.init was already called by the agent.
        if not wandb.setup().settings.sweep_id:
            config = {
                'epochs': epochs,
                'steps_per_epoch': steps_per_epoch,
                'scheduler': scheduler,
                'warmup_steps': warmup_steps,
                'weight_decay': weight_decay,
                'evaluation_steps': evaluation_steps,
                'output_path': output_path,
                'save_best_model': save_best_model,
                'max_grad_norm': max_grad_norm,
                'use_amp': use_amp,
            }
            wandb.init(project=wandb_project_name,
                       config=config,
                       **wandb_config)
            wandb.watch(self)

    # SWA: running average of this model's weights.
    if use_swa:
        swa_model = AveragedModel(self)

    # Prepare optimizers: one optimizer + scheduler per training objective.
    optimizers = []
    schedulers = []
    for loss_model in loss_models:
        param_optimizer = list(loss_model.named_parameters())
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': weight_decay
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    **optimizer_params)
        scheduler_obj = self._get_scheduler(optimizer,
                                            scheduler=scheduler,
                                            warmup_steps=warmup_steps,
                                            t_total=num_train_steps)
        optimizers.append(optimizer)
        schedulers.append(scheduler_obj)

    if use_swa:
        # SWALR drives only the first objective's optimizer.
        swa_scheduler = SWALR(optimizers[0],
                              swa_lr=swa_lr,
                              anneal_epochs=swa_anneal_epochs,
                              anneal_strategy='linear')

    global_step = 0
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)
    skip_scheduler = False
    for epoch in trange(epochs, desc="Epoch", disable=not show_progress_bar):
        training_steps = 0
        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()

        for _ in trange(steps_per_epoch,
                        desc="Iteration",
                        smoothing=0.05,
                        disable=not show_progress_bar):
            # Round-robin over the objectives: one batch each per step.
            for train_idx in range(num_train_objectives):
                loss_model = loss_models[train_idx]
                optimizer = optimizers[train_idx]
                scheduler = schedulers[train_idx]
                data_iterator = data_iterators[train_idx]
                try:
                    data = next(data_iterator)
                except StopIteration:
                    # Restart an exhausted dataloader (smaller objectives
                    # cycle more than once per epoch).
                    data_iterator = iter(dataloaders[train_idx])
                    data_iterators[train_idx] = data_iterator
                    data = next(data_iterator)

                features, labels = data
                if use_amp:
                    with autocast():
                        loss_value = loss_model(features, labels)
                    scale_before_step = scaler.get_scale()
                    scaler.scale(loss_value).backward()
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                   max_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()
                    # If the scale changed, the optimizer step was skipped,
                    # so the LR scheduler must be skipped as well.
                    skip_scheduler = scaler.get_scale(
                    ) != scale_before_step
                else:
                    loss_value = loss_model(features, labels)
                    loss_value.backward()
                    torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                   max_grad_norm)
                    optimizer.step()
                optimizer.zero_grad()
                # if wandb init is called
                if wandb_available and wandb.run is not None and (
                        training_steps + 1) % log_every == 0:
                    wandb.log(
                        {
                            loss_model.__class__.__name__:
                            loss_value.item(),
                            "lr": scheduler.get_last_lr()[0],
                        },
                        step=global_step)
                if not skip_scheduler:
                    scheduler.step()

            training_steps += 1
            global_step += 1

            if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                self._eval_during_training(evaluator, output_path,
                                           save_best_model, epoch,
                                           training_steps, global_step,
                                           callback)
                # Evaluation may have changed mode/grads; reset for training.
                for loss_model in loss_models:
                    loss_model.zero_grad()
                    loss_model.train()

        # SWA phase: fold current weights into the running average.
        if use_swa and epoch > swa_epochs_start:
            swa_model.update_parameters(self)
            swa_scheduler.step()

        self._eval_during_training(evaluator, output_path, save_best_model,
                                   epoch, -1, global_step, callback)

    if use_swa:
        return swa_model
def test_plotting(): print("Creating Train and Validation Dataset") train_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))]) valid_transforms = T.Compose([T.ToTensor(), T.Normalize((0.5, ), (0.5, ))]) train_set, valid_set = dataset.create_cifar10_dataset( train_transforms, valid_transforms) print("Train and Validation Datasets Created") print("Creating DataLoaders") train_loader, valid_loader = dataset.create_loaders(train_set, train_set) print("Train and Validation Dataloaders Created") print("Creating Model") all_supported_models = [ "resnet18", # "resnet34", # "resnet50", # "resnet101", # "resnet152", # "resnext50_32x4d", # "resnext101_32x8d", # "vgg11", # "vgg13", # "vgg16", # "vgg19", # "mobilenet", # "mnasnet0_5", # "mnasnet1_0", ] for model_name in all_supported_models: model = model_factory.create_torchvision_model( model_name, num_classes=10, pretrained=False ) # We don't need pretrained True, we just need a forward pass if torch.cuda.is_available(): print("Model Created. Moving it to CUDA") else: print("Model Created. Training on CPU only") model.to(device) optimizer = optim.Adam(model.parameters(), lr=1e-3) criterion = ( nn.CrossEntropyLoss() ) # All classification problems we need Cross entropy loss # early_stopper = utils.EarlyStopping( # patience=7, verbose=True, path=SAVE_PATH # ) # We do not need early stopping too scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=300) swa_scheduler = SWALR(optimizer, anneal_strategy="linear", anneal_epochs=20, swa_lr=0.05) swa_start = 2 epoch = 5 history = engine.fit(epoch, model, train_loader, valid_loader, criterion, device, optimizer) plotter.plot_results(history, train_metric='loss', val_metric='top5_acc') return 1
lr=args.lr, weight_decay=0.001) # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1) else: #optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=2) scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts( optimizer, T_0=3, T_mult=2, eta_min=args.min_lr, last_epoch=-1) #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=25, # max_lr=args.lr, epochs=args.epochs, steps_per_epoch=len(train_loader)) swa_scheduler = SWALR(optimizer, swa_lr=args.swa_lr) if args.smooth_ratio > 0.: criterion = BiTemperedLoss( t1=args.t1, t2=args.t2, label_smoothing=args.smooth_ratio).to(device) else: criterion = nn.CrossEntropyLoss().to(device) for epoch in range(args.epochs): train_one_epoch(epoch, model, swa_model, args.swa_start, criterion, optimizer,
def pseudo_labeling(num_epochs, model, data_loader, val_loader,
                    unlabeled_loader, device, val_every, file_name):
    """Semi-supervised training loop using model-generated pseudo labels.

    For each unlabeled batch, predictions from the current model are used
    as hard targets; every 50 unlabeled batches one pass over the labeled
    ``data_loader`` is interleaved. The pseudo-label loss is weighted by
    ``alpha_weight(step)``, where ``step`` grows per unlabeled batch
    (instead of per epoch) to speed up convergence. Periodic validation
    saves the best model by mIoU, and SWA averaging starts after epoch 3.

    :param num_epochs: number of passes over the unlabeled data
    :param model: segmentation model (12 classes), trained in place
    :param data_loader: labeled (images, masks, _) loader
    :param val_loader: validation loader for ``validation``
    :param unlabeled_loader: loader of unlabeled images
    :param device: torch device to run on
    :param val_every: validate every this many epochs
    :param file_name: file name passed to ``save_model`` for the best model
    """
    from torch.optim.swa_utils import AveragedModel, SWALR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    criterion = [
        SoftCrossEntropyLoss(smooth_factor=0.1),
        JaccardLoss('multiclass', classes=12)
    ]
    optimizer = AdamP(params=model.parameters(), lr=0.0001, weight_decay=1e-6)
    # SWALR must wrap the inner optimizer, before Lookahead is applied.
    swa_scheduler = SWALR(optimizer, swa_lr=0.0001)
    swa_model = AveragedModel(model)
    optimizer = Lookahead(optimizer, la_alpha=0.5)

    step = 100  # alpha_weight ramp position; starts warm on purpose
    best_mIoU = 0
    model.train()
    print('Start Pseudo-Labeling..')
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))  # 12x12 confusion matrix accumulator
        for batch_idx, (imgs, image_infos) in enumerate(unlabeled_loader):
            # Forward pass on the unlabeled batch to obtain pseudo labels.
            model.eval()
            outs = model(torch.stack(imgs).to(device))
            oms = torch.argmax(outs.squeeze(), dim=1).detach().cpu().numpy()
            oms = torch.Tensor(oms)
            oms = oms.long()
            oms = oms.to(device)

            # Train on the same batch against its pseudo labels.
            model.train()
            imgs = torch.stack(imgs)
            imgs = imgs.to(device)
            output = model(imgs)
            loss = 0
            for each in criterion:
                loss += each(output, oms)
            # Down/up-weight the pseudo-label loss as training progresses.
            unlabeled_loss = alpha_weight(step) * loss

            optimizer.zero_grad()
            unlabeled_loss.backward()
            optimizer.step()

            output = torch.argmax(output.squeeze(),
                                  dim=1).detach().cpu().numpy()
            hist = add_hist(hist, oms.detach().cpu().numpy(), output,
                            n_class=12)

            if (batch_idx + 1) % 25 == 0:
                acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU:{:.4f}'.
                      format(epoch + 1, num_epochs, batch_idx + 1,
                             len(unlabeled_loader), unlabeled_loss.item(),
                             mIoU))

            # Every 50 unlabeled batches, train one pass on labeled data.
            if batch_idx % 50 == 0:
                # FIX: use a distinct loop variable — the original reused
                # `batch_idx`, shadowing the outer loop's index.
                for labeled_idx, (images, masks,
                                  _) in enumerate(data_loader):
                    labeled_loss = 0
                    images = torch.stack(images)  # (batch, C, H, W)
                    masks = torch.stack(masks).long()
                    images, masks = images.to(device), masks.to(device)
                    output = model(images)
                    for each in criterion:
                        labeled_loss += each(output, masks)
                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()

            # Advance the alpha_weight schedule once per unlabeled batch.
            step += 1

        if (epoch + 1) % val_every == 0:
            avrg_loss, val_mIoU = validation(epoch + 1, model, val_loader,
                                             criterion, device)
            if val_mIoU > best_mIoU:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_mIoU = val_mIoU
                save_model(model, file_name=file_name)
        model.train()

        # SWA averaging only after a short warm-up (first 4 epochs skipped).
        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
def main():
    """Train SE-ResNet on CINIC with optional SWA, cutout and resuming.

    Configuration comes from module-level constants (SAVEPATH, LR, EPOCHS,
    SWA, CUTOUT, ...). A 'latest_checkpoint.pth' in SAVEPATH is resumed
    automatically; checkpoints are rewritten every epoch and archived every
    10 epochs. Aborts if the model exceeds 2M parameters.
    """
    os.makedirs(SAVEPATH, exist_ok=True)
    print('save path:', SAVEPATH)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('device:', device)
    print('weight_decay:', WEIGHTDECAY)
    print('momentum:', MOMENTUM)
    print('batch_size:', BATCHSIZE)
    print('lr:', LR)
    print('epoch:', EPOCHS)
    print('Label smoothing:', LABELSMOOTH)
    print('Stochastic Weight Averaging:', SWA)
    if SWA:
        print('Swa lr:', SWA_LR)
        print('Swa start epoch:', SWA_START)
    print('Cutout augmentation:', CUTOUT)
    if CUTOUT:
        print('Cutout size:', CUTOUTSIZE)
    print('Activation:', ACTIVATION)

    # get model
    model = get_seresnet_cifar(activation=ACTIVATION)

    # get loss function
    if LABELSMOOTH:
        criterion = LabelSmoothingLoss(classes=10, smoothing=0.1)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=LR,
                                momentum=MOMENTUM,
                                weight_decay=WEIGHTDECAY,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                           T_max=EPOCHS,
                                                           eta_min=0)
    model = model.to(device)
    criterion = criterion.to(device)

    # Enforce the competition's 2M-parameter budget.
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {pytorch_total_params}")
    if int(pytorch_total_params) > 2000000:
        print('Your model has the number of parameters more than 2 millions..')
        return

    if SWA:
        # apply swa
        swa_model = AveragedModel(model)
        swa_scheduler = SWALR(optimizer, swa_lr=SWA_LR)
        swa_total_params = sum(p.numel() for p in swa_model.parameters())
        print(f"Swa parameters: {swa_total_params}")

    # cinic mean, std
    normalize = transforms.Normalize(mean=[0.47889522, 0.47227842, 0.43047404],
                                     std=[0.24205776, 0.23828046, 0.25874835])
    if CUTOUT:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(), normalize,
            Cutout(size=CUTOUTSIZE)
        ])
    else:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(), normalize
        ])

    train_dataset = torchvision.datasets.ImageFolder('/content/train',
                                                     transform=train_transform)
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCHSIZE,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)

    # colab reload: resume from the latest checkpoint if one exists.
    start_epoch = 0
    if os.path.isfile(os.path.join(SAVEPATH, 'latest_checkpoint.pth')):
        checkpoint = torch.load(os.path.join(SAVEPATH,
                                             'latest_checkpoint.pth'))
        start_epoch = checkpoint['epoch']
        scheduler.load_state_dict(checkpoint['scheduler'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # FIX: a checkpoint written with SWA disabled has no SWA entries;
        # guard the keys so resuming with SWA=True does not raise KeyError.
        if SWA and 'swa_scheduler' in checkpoint:
            swa_scheduler.load_state_dict(checkpoint['swa_scheduler'])
            swa_model.load_state_dict(checkpoint['swa_model'])
        print(start_epoch, 'load parameter')

    for epoch in range(start_epoch, EPOCHS):
        print("\n----- epoch: {}, lr: {} -----".format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        start_time = time.time()
        train(train_loader, epoch, model, optimizer, criterion, device)
        elapsed_time = time.time() - start_time
        print('==> {:.2f} seconds to train this epoch\n'.format(elapsed_time))

        # learning rate scheduling: switch to SWA after SWA_START.
        if SWA and epoch > SWA_START:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step()

        if SWA:
            checkpoint = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'swa_model': swa_model.state_dict(),
                'swa_scheduler': swa_scheduler.state_dict()
            }
        else:
            checkpoint = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }
        torch.save(checkpoint, os.path.join(SAVEPATH,
                                            'latest_checkpoint.pth'))
        if epoch % 10 == 0:
            torch.save(checkpoint,
                       os.path.join(SAVEPATH, '%d_checkpoint.pth' % epoch))
def __init__(self, *args, **kwargs): super(SWALRRunner, self).__init__(*args, **kwargs) self.swa_model = AveragedModel(self.model) self.swa_scheduler = SWALR(self.optimizer, swa_lr=0.05) self.swa_start = 5
model = tv.models.resnet50(num_classes=3).to(device) swa_model = AveragedModel(model) if trainOrTest.lower() == "train": ### Optimizer optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-5) cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, epoch, eta_min=0, last_epoch=-1) scheduler = GradualWarmupScheduler(optimizer, multiplier=multiplier, total_epoch=warmEpoch, after_scheduler=cosine_scheduler) swa_scheduler = SWALR(optimizer, swa_lr=LR, anneal_epochs=15, anneal_strategy="cos") ### Loss loss lossCri = nn.CrossEntropyLoss(reduction="sum") model = model.train() ### Data set fishDataset = MetaLearningDataset(trainSamples, intTrainLabels, trainTransforms, imgPath=imagePath) fishDataLoader = DataLoader(fishDataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
class Learner:
    """Training harness for a vision model driven by a config file.

    Owns the model, optimizer, cosine LR schedule, SWA state, early
    stopping, and a comet_ml logger. Provides single-task (``train``) and
    multi-task (``train_multi``) training loops plus visualization and
    checkpointing helpers.
    """

    def __init__(self, cfg_dir: str, data_loader: DataLoader, model,
                 labels_definition):
        """Build the learner from a config directory.

        Args:
            cfg_dir: path handed to ``get_conf`` to load the config object.
            data_loader: training DataLoader; each batch yields
                (images, gt_boxes, gt_labels, ego_labels, counts,
                img_indexs, wh) — see the loops below.
            model: the network to train (moved to the configured device).
            labels_definition: mapping from task name to a list of
                human-readable class names, used for visualization.
        """
        self.cfg = get_conf(cfg_dir)
        self._labels_definition = labels_definition
        #TODO
        self.logger = self.init_logger(self.cfg.logger)
        #self.dataset = CustomDataset(**self.cfg.dataset)
        self.data = data_loader
        #self.val_dataset = CustomDatasetVal(**self.cfg.val_dataset)
        #self.val_data = DataLoader(self.val_dataset, **self.cfg.dataloader)
        # self.logger.log_parameters({"tr_len": len(self.dataset),
        #                             "val_len": len(self.val_dataset)})
        self.model = model
        #self.model._resnet.conv1.apply(init_weights_normal)
        self.device = self.cfg.train_params.device
        self.model = self.model.to(device=self.device)
        # Optimizer choice comes from the config; anything other than
        # adam/rmsprop is rejected up front.
        if self.cfg.train_params.optimizer.lower() == "adam":
            self.optimizer = optim.Adam(self.model.parameters(),
                                        **self.cfg.adam)
        elif self.cfg.train_params.optimizer.lower() == "rmsprop":
            self.optimizer = optim.RMSprop(self.model.parameters(),
                                           **self.cfg.rmsprop)
        else:
            raise ValueError(
                f"Unknown optimizer {self.cfg.train_params.optimizer}")
        # Cosine annealing with a hard-coded 100-epoch period; NOTE(review):
        # consider deriving T_max from cfg.train_params.epochs — confirm intent.
        self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, T_max=100)
        # BCELoss expects sigmoid-activated outputs; the training loops apply
        # nn.Sigmoid() before calling the criterion.
        self.criterion = nn.BCELoss()
        if self.cfg.logger.resume:
            # load checkpoint and restore model/optimizer/scheduler state
            # plus bookkeeping (epoch counter, loss history, best loss).
            print("Loading checkpoint")
            save_dir = self.cfg.directory.load
            checkpoint = load_checkpoint(save_dir, self.device)
            self.model.load_state_dict(checkpoint["model"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])
            self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
            self.epoch = checkpoint["epoch"]
            self.e_loss = checkpoint["e_loss"]
            self.best = checkpoint["best"]
            print(
                f"{datetime.now():%Y-%m-%d %H:%M:%S} "
                f"Loading checkpoint was successful, start from epoch {self.epoch}"
                f" and loss {self.best}")
        else:
            # Fresh run: epochs are 1-indexed and "best" starts at +inf so the
            # first epoch loss always becomes the new best.
            self.epoch = 1
            self.best = np.inf
            self.e_loss = []
        # initialize the early_stopping object
        self.early_stopping = EarlyStopping(
            patience=self.cfg.train_params.patience,
            verbose=True,
            delta=self.cfg.train_params.early_stopping_delta,
        )
        # stochastic weight averaging: running parameter average plus the
        # constant-LR scheduler used after cfg.train_params.swa_start.
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)

    def train(self, task: VisionTask):
        """Run the single-task training loop until cfg.train_params.epochs.

        Trains ``self.model`` on ``self.data`` with sigmoid+BCE on the
        task's flattened labels, logging per-batch metrics and periodic
        prediction visualizations. Switches from the cosine LR schedule to
        SWA updates once ``self.epoch`` exceeds ``swa_start``.

        Returns:
            The loss tensor of the last processed batch.
        """
        task.go_to_gpu(self.device)
        # NOTE(review): visualize_idx is computed but only used by the
        # commented-out predict_visualize call below.
        visualize_idx = np.random.randint(0, len(self.data), 50)
        while self.epoch <= self.cfg.train_params.epochs:
            running_loss = []
            self.model.train()
            for internel_iter, (images, gt_boxes, gt_labels, ego_labels,
                                counts, img_indexs,
                                wh) in enumerate(self.data):
                self.optimizer.zero_grad()
                # fl = task.get_flat_label(gt_labels)
                m = nn.Sigmoid()
                y = task.get_flat_label(gt_labels)
                x = images
                # move data to device
                x = x.to(device=self.device)
                y = y.to(device=self.device)
                # forward, backward
                encoded_vector = self.model(x)
                out = task.decode(encoded_vector)
                loss = self.criterion(m(out), y)
                loss.backward()
                # check grad norm for debugging
                grad_norm = check_grad_norm(self.model)
                # update
                self.optimizer.step()
                running_loss.append(loss.item())
                #print("Loss:", loss.item())
                #print("grad_norm", grad_norm)
                self.logger.log_metrics(
                    {
                        #"epoch": self.epoch,
                        "batch": internel_iter,
                        "loss": loss.item(),
                        "GradNorm": grad_norm,
                    },
                    epoch=self.epoch)
                # validation: every 1000th batch of every 5th epoch, decode
                # the last sample's predictions into label names and log an
                # image built from three frames of the input sequence.
                if internel_iter % 1000 == 0 and self.epoch % 5 == 0:
                    print("Internel iter: ", internel_iter)
                    out = m(out[-1])
                    definitions = []
                    # l = per-box label width for this task's slice of the
                    # output vector.
                    l = task.boundary[1] - task.boundary[0]
                    n_boxes = len(gt_boxes[-1][-1])
                    print("Number of Boxes:", n_boxes)
                    name = "img_" + str(self.epoch) + "_" + str(
                        internel_iter / 1000)
                    for i in range(n_boxes):
                        # the +i offsets skip what is presumably a per-box
                        # "finished" flag interleaved with the labels —
                        # TODO confirm the output vector layout.
                        prediction = out[i * l + 1 + i:i * l + l + 1 + i]
                        prediction = prediction.argmax()
                        definitions.append(name + ": " + self._labels_definition[
                            task.get_name()][prediction])
                    print("list", definitions)
                    sz = wh[0][0].item()
                    # Pack three temporal frames into the RGB channels of one
                    # image for logging; assumes images[-1] is laid out as
                    # seq_len-sized channel groups — TODO confirm.
                    img = torch.zeros([3, sz, sz])
                    img[0] = images[-1][self.cfg.dataloader.seq_len - 1]
                    img[1] = images[-1][2 * self.cfg.dataloader.seq_len - 1]
                    img[2] = images[-1][3 * self.cfg.dataloader.seq_len - 1]
                    self.logger.log_image(img, name=name,
                                          image_channels='first')
                #if internel_iter < 10:
                #    sz = wh[0][0].item()
                #    img = torch.zeros([3, sz, sz])
                #    print(img.shape)
                #    print(images.shape)
                #    img[0] = images[-1][self.cfg.dataloader.seq_len -1]
                #    img[1] = images[-1][2*self.cfg.dataloader.seq_len - 1]
                #    img[2] = images[-1][3*self.cfg.dataloader.seq_len - 1]
                #    self.log_image_with_text_on_it(img, gt_labels[-1][-1], task)
                #    self.logger.log_image(img, name="v", image_channels='first')
            #bar.close()
            # Visualize
            # self.predict_visualize(index_list=visualize_idx, task=task)
            # LR schedule: cosine until swa_start, then SWA averaging + SWALR.
            if self.epoch > self.cfg.train_params.swa_start:
                self.swa_model.update_parameters(self.model)
                self.swa_scheduler.step()
            else:
                self.lr_scheduler.step()
            # validate on val set
            # val_loss, t = self.validate()
            # t /= len(self.val_dataset)
            # average loss for an epoch
            self.e_loss.append(np.mean(running_loss))  # epoch loss
            # print(
            #     f"{datetime.now():%Y-%m-%d %H:%M:%S} Epoch {self.epoch} summary: train Loss: {self.e_loss[-1]:.2f} \t| Val loss: {val_loss:.2f}"
            #     f"\t| time: {t:.3f} seconds"
            # )
            self.logger.log_metrics({
                "epoch": self.epoch,
                "epoch_loss": self.e_loss[-1],
            })
            # early_stopping needs the validation loss to check if it has decreased,
            # and if it has, it will make a checkpoint of the current model
            # NOTE(review): the feeding call below is commented out, so
            # early_stop can never become True — early stopping is
            # effectively disabled here.
            #self.early_stopping(val_loss, self.model)
            if self.early_stopping.early_stop:
                print("Early stopping")
                self.save()
                break
            if self.epoch % self.cfg.train_params.save_every == 0:
                self.save()
            gc.collect()
            print("Task: " + task.get_name() + " epoch[" + str(self.epoch) +
                  "] finished.")
            self.epoch += 1
        # Update bn statistics for the swa_model at the end
        #if self.epoch >= self.cfg.train_params.swa_start:
        #    torch.optim.swa_utils.update_bn(self.data.to(self.device), self.swa_model)
        #self.save(name=self.cfg.directory.model_name + "-final" + str(self.epoch) + "-swa")
        #macs, params = op_counter(self.model, sample=x)
        #print(macs, params)
        #self.logger.log_metrics({"GFLOPS": macs[:-1], "#Params": params[:-1], "task name": task.get_name(), "total_loss": self.e_loss[-1]})
        print("Training Finished!")
        return loss

    def train_multi(self, primary_task, auxiliary_tasks):
        """Run the multi-task training loop.

        Sums the BCE losses of the primary task and every auxiliary task
        on a shared encoded vector and backpropagates the total, but only
        the primary task's loss is logged and tracked per epoch.

        Args:
            primary_task: the task whose loss drives logging/bookkeeping.
            auxiliary_tasks: iterable of additional tasks contributing to
                the total loss.

        Returns:
            The primary-task loss tensor of the last processed batch.
        """
        # 1- got to gpu fo all tasks
        for auxilary_task in auxiliary_tasks:
            auxilary_task.go_to_gpu(self.device)
        primary_task.go_to_gpu(self.device)
        activation_function = nn.Sigmoid()
        while self.epoch <= self.cfg.train_params.epochs:
            running_loss = []
            self.model.train()
            for internel_iter, (images, gt_boxes, gt_labels, ego_labels,
                                counts, img_indexs,
                                wh) in enumerate(self.data):
                self.optimizer.zero_grad()
                x = images
                x = x.to(device=self.device)
                # One shared forward pass; each task decodes its own slice.
                encoded_vector = self.model(x)
                total_loss = None
                # for auxiliary tasks
                for auxiliary_task in auxiliary_tasks:
                    y = auxiliary_task.get_flat_label(gt_labels)
                    # move data to device
                    y = y.to(device=self.device)
                    # forward
                    out = auxiliary_task.decode(encoded_vector)
                    auxiliary_loss = self.criterion(activation_function(out),
                                                    y)
                    if total_loss is None:
                        total_loss = auxiliary_loss
                    else:
                        total_loss += auxiliary_loss
                # for primary task
                y = primary_task.get_flat_label(gt_labels)
                # move data to device
                y = y.to(device=self.device)
                # forward
                out = primary_task.decode(encoded_vector)
                primary_loss = self.criterion(activation_function(out), y)
                # NOTE(review): raises TypeError if auxiliary_tasks is empty
                # (total_loss is still None here) — confirm callers always
                # pass at least one auxiliary task.
                total_loss += primary_loss
                total_loss.backward()
                # check grad norm for debugging
                grad_norm = check_grad_norm(self.model)
                # update
                self.optimizer.step()
                # Only the primary loss feeds the epoch average.
                running_loss.append(primary_loss.item())
                self.logger.log_metrics(
                    {
                        # "epoch": self.epoch,
                        "batch": internel_iter,
                        "primary_loss": primary_loss.item(),
                        "GradNorm": grad_norm,
                    },
                    epoch=self.epoch)
                # validation: same periodic visualization as train(), but
                # decoding the primary task and overlaying the label text.
                if internel_iter % 1000 == 0 and self.epoch % 5 == 0:
                    print("Internel iter: ", internel_iter)
                    out = activation_function(out[-1])
                    definitions = []
                    l = primary_task.boundary[1] - primary_task.boundary[0]
                    n_boxes = len(gt_boxes[-1][-1])
                    print("Number of Boxes:", n_boxes)
                    name = "img_" + str(self.epoch) + "_" + str(
                        internel_iter / 1000)
                    for i in range(n_boxes):
                        prediction = out[i * l + 1 + i:i * l + l + 1 + i]
                        prediction = prediction.argmax()
                        definitions.append(
                            name + ": " + self._labels_definition[
                                primary_task.get_name()][prediction])
                    print("list", definitions)
                    sz = wh[0][0].item()
                    img = torch.zeros([3, sz, sz])
                    img[0] = images[-1][self.cfg.dataloader.seq_len - 1]
                    img[1] = images[-1][2 * self.cfg.dataloader.seq_len - 1]
                    img[2] = images[-1][3 * self.cfg.dataloader.seq_len - 1]
                    img_with_text = draw_text(img, definitions)
                    self.logger.log_image(img_with_text, name=name,
                                          image_channels='first')
            # Visualize
            # self.predict_visualize(index_list=visualize_idx, task=task)
            # LR schedule: cosine until swa_start, then SWA averaging + SWALR.
            if self.epoch > self.cfg.train_params.swa_start:
                self.swa_model.update_parameters(self.model)
                self.swa_scheduler.step()
            else:
                self.lr_scheduler.step()
            # validate on val set
            # val_loss, t = self.validate()
            # t /= len(self.val_dataset)
            # average loss for an epoch
            self.e_loss.append(np.mean(running_loss))  # epoch loss
            # print(
            #     f"{datetime.now():%Y-%m-%d %H:%M:%S} Epoch {self.epoch} summary: train Loss: {self.e_loss[-1]:.2f} \t| Val loss: {val_loss:.2f}"
            #     f"\t| time: {t:.3f} seconds"
            # )
            self.logger.log_metrics({
                "epoch": self.epoch,
                "epoch_loss": self.e_loss[-1],
            })
            # early_stopping needs the validation loss to check if it has decreased,
            # and if it has, it will make a checkpoint of the current model
            # NOTE(review): as in train(), early stopping is never fed and
            # therefore never triggers.
            # self.early_stopping(val_loss, self.model)
            if self.early_stopping.early_stop:
                print("Early stopping")
                self.save()
                break
            if self.epoch % self.cfg.train_params.save_every == 0:
                self.save()
            gc.collect()
            print("Task: " + primary_task.get_name() + " epoch[" +
                  str(self.epoch) + "] finished.")
            self.epoch += 1
        # Update bn statistics for the swa_model at the end
        # if self.epoch >= self.cfg.train_params.swa_start:
        #     torch.optim.swa_utils.update_bn(self.data.to(self.device), self.swa_model)
        # self.save(name=self.cfg.directory.model_name + "-final" + str(self.epoch) + "-swa")
        # macs, params = op_counter(self.model, sample=x)
        # print(macs, params)
        # self.logger.log_metrics({"GFLOPS": macs[:-1], "#Params": params[:-1], "task name": task.get_name(), "total_loss": self.e_loss[-1]})
        print("Training Finished!")
        return primary_loss

    def predict_visualize(self, index_list, task):
        """Run the model on selected dataset samples and log each image.

        Args:
            index_list: dataset indices to fetch directly from
                ``self.data.dataset``.
            task: the VisionTask used to flatten labels and decode output.
        """
        print("===================================================")
        for i in index_list:
            images, gt_boxes, gt_labels, ego_labels, counts, img_indexs, wh = self.data.dataset.__getitem__(
                i)
            sz = img_indexs[0]
            y = task.get_flat_label(gt_labels)
            x = images
            # move data to device
            x = x.to(device=self.device)
            y = y.to(device=self.device)
            encoded_vector = self.model(x)
            out = task.decode(encoded_vector)
            self.log_image_with_text(img_tensor=images, out_vector=out,
                                     index=i, task=task)
        print("===================================================")

    def log_image_with_text(self, img_tensor, out_vector, index, task):
        """Decode label names from a flat output vector and log the image.

        Walks ``out_vector`` as (finished-flag, label-block) pairs until a
        truthy finished flag is seen, collecting the first nonzero label of
        each block as a human-readable definition.

        NOTE(review): `l = out_vector[i, label_len]` indexes a single
        element rather than slicing a label block (compare the slicing in
        train()); this looks like a bug — confirm the intended layout.
        """
        definitions = []
        label_len = task.boundary[1] - task.boundary[0]
        name = "img_" + str(index)
        i = 0
        while True:
            finished = out_vector[i]
            if finished == True:
                break
            i += 1
            l = out_vector[i, label_len]
            i += label_len
            if len(np.nonzero(l)) > 0:
                definition_idx = np.nonzero(l)[0][0]
                definitions.append(
                    name + ": " +
                    self._labels_definition[task.get_name()][definition_idx])
        print(definitions)
        self.logger.log_image(img_tensor, name=name, image_channels='first')

    def log_image_with_text_on_it(self, img_tensor, labels, task):
        """Draw ground-truth label names onto an image and log it.

        For up to ``VisionTask._max_box_count`` boxes, slices each label
        row to this task's boundary and takes its first nonzero entry as
        the class index.

        NOTE(review): the annotated image ``img`` from draw_text is
        computed but the ORIGINAL ``img_tensor`` is what gets logged —
        likely the log call should use ``img``; confirm before changing.
        """
        definitions = []
        box_count = len(labels)
        for j in range(min(box_count, VisionTask._max_box_count)):
            l = labels[j]
            # len(l) = 149
            l = l[task.boundary[0]:task.boundary[1]]
            if len(np.nonzero(l)) > 0:
                definition_idx = np.nonzero(l)[0][0]
                definitions.append(
                    self._labels_definition[task.get_name()][definition_idx])
        img = draw_text(img_tensor, definitions)
        print(definitions)
        # print(images.shape)
        self.logger.log_image(img_tensor, name="v", image_channels='first')

    # @timeit
    # @torch.no_grad()
    # def validate(self):
    #
    #     self.model.eval()
    #
    #     running_loss = []
    #
    #     for idx, (x, y) in tqdm(enumerate(self.val_data), desc="Validation"):
    #         # move data to device
    #         x = x.to(device=self.device)
    #         y = y.to(device=self.device)
    #
    #         # forward, backward
    #         if self.epoch > self.cfg.train_params.swa_start:
    #             # Update bn statistics for the swa_model
    #             torch.optim.swa_utils.update_bn(self.data, self.swa_model)
    #             out = self.swa_model(x)
    #         else:
    #             out = self.model(x)
    #
    #         loss = self.criterion(out, y)
    #         running_loss.append(loss.item())
    #
    #     # average loss
    #     loss = np.mean(running_loss)
    #
    #     return loss

    def init_logger(self, cfg):
        """Create (or resume) a comet_ml experiment logger.

        If the config carries an experiment key and that experiment exists
        on the Comet server, the run is continued via ExistingExperiment;
        otherwise a fresh Experiment is created and tagged. All config
        parameters are logged either way.

        Args:
            cfg: logger section of the config (experiment_key, resume,
                disabled, project, tags).

        Returns:
            A comet_ml Experiment or ExistingExperiment instance.
        """
        logger = None
        # Check to see if there is a key in environment:
        EXPERIMENT_KEY = cfg.experiment_key
        # First, let's see if we continue or start fresh:
        CONTINUE_RUN = cfg.resume
        if (EXPERIMENT_KEY is not None):
            # There is one, but the experiment might not exist yet:
            api = comet_ml.API()  # Assumes API key is set in config/env
            try:
                api_experiment = api.get_experiment_by_id(EXPERIMENT_KEY)
            except Exception:
                api_experiment = None
            if api_experiment is not None:
                CONTINUE_RUN = True
                # We can get the last details logged here, if logged:
                # step = int(api_experiment.get_parameters_summary("batch")["valueCurrent"])
                # epoch = int(api_experiment.get_parameters_summary("epochs")["valueCurrent"])
        if CONTINUE_RUN:
            # 1. Recreate the state of ML system before creating experiment
            # otherwise it could try to log params, graph, etc. again
            # ...
            # 2. Setup the existing experiment to carry on:
            logger = comet_ml.ExistingExperiment(
                previous_experiment=EXPERIMENT_KEY,
                log_env_details=True,  # to continue env logging
                log_env_gpu=True,  # to continue GPU logging
                log_env_cpu=True,  # to continue CPU logging
                auto_histogram_weight_logging=True,
                auto_histogram_gradient_logging=True,
                auto_histogram_activation_logging=True)
            # Retrieved from above APIExperiment
            # self.logger.set_epoch(epoch)
        else:
            # 1. Create the experiment first
            # This will use the COMET_EXPERIMENT_KEY if defined in env.
            # Otherwise, you could manually set it here. If you don't
            # set COMET_EXPERIMENT_KEY, the experiment will get a
            # random key!
            logger = comet_ml.Experiment(
                disabled=cfg.disabled,
                project_name=cfg.project,
                auto_histogram_weight_logging=True,
                auto_histogram_gradient_logging=True,
                auto_histogram_activation_logging=True)
            logger.add_tags(cfg.tags.split())
        logger.log_parameters(self.cfg)
        return logger

    def save(self, name=None):
        """Checkpoint model/optimizer/scheduler state and bookkeeping.

        When ``name`` is None the save name is derived from the config's
        model_name and the current epoch, with a "-swa" suffix (and the
        SWA weights included) once the SWA phase has started. The
        checkpoint is flagged as "best" when the latest epoch loss beats
        the best seen so far.

        Args:
            name: optional explicit checkpoint name overriding the
                derived one.
        """
        checkpoint = {
            "epoch": self.epoch,
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "lr_scheduler": self.lr_scheduler.state_dict(),
            "best": self.best,
            "e_loss": self.e_loss
        }
        if name is None and self.epoch >= self.cfg.train_params.swa_start:
            save_name = self.cfg.directory.model_name + str(
                self.epoch) + "-swa"
            checkpoint['model-swa'] = self.swa_model.state_dict()
        elif name is None:
            save_name = self.cfg.directory.model_name + str(self.epoch)
        else:
            save_name = name
        if self.e_loss[-1] < self.best:
            self.best = self.e_loss[-1]
            checkpoint["best"] = self.best
            save_checkpoint(checkpoint, True, self.cfg.directory.save,
                            save_name)
        else:
            save_checkpoint(checkpoint, False, self.cfg.directory.save,
                            save_name)