def test(model_path_effi7, model_path_resnest, output_dir, test_loader, addNDVI):
    """Run two-model, 8-way test-time-augmented segmentation inference.

    Ensembles a ResNeSt-101 and an EfficientNet-B7 UNet++ over the raw and
    stretched image plus horizontal/vertical flips (8 forward passes total),
    averages the logits, and writes per-image uint8 class maps as PNGs.

    Args:
        model_path_effi7: checkpoint path for the EfficientNet-B7 model.
        model_path_resnest: checkpoint path for the ResNeSt-101 model.
        output_dir: directory to write the predicted PNG masks into.
        test_loader: yields (image, image_stretch, image_path, ndvi) batches.
        addNDVI: if truthy, the models were trained with an extra NDVI channel.
    """
    in_channels = 4
    if(addNDVI):
        in_channels += 1
    model_resnest = smp.UnetPlusPlus(
        encoder_name="timm-resnest101e",
        encoder_weights="imagenet",
        in_channels=in_channels,
        classes=10,
    )
    model_effi7 = smp.UnetPlusPlus(
        encoder_name="efficientnet-b7",
        encoder_weights="imagenet",
        in_channels=in_channels,
        classes=10,
    )
    # If the checkpoint was produced by SWA training, the state dict was saved
    # through AveragedModel, so wrap before loading.
    if("swa" in model_path_resnest):
        model_resnest = AveragedModel(model_resnest)
    if("swa" in model_path_effi7):
        model_effi7 = AveragedModel(model_effi7)
    model_resnest.to(DEVICE)
    model_resnest.load_state_dict(torch.load(model_path_resnest))
    model_resnest.eval()
    model_effi7.to(DEVICE)
    model_effi7.load_state_dict(torch.load(model_path_effi7))
    model_effi7.eval()
    for image, image_stretch, image_path, ndvi in test_loader:
        with torch.no_grad():
            # image.shape: 16,4,256,256
            image_flip2 = torch.flip(image,[2])
            image_flip2 = image_flip2.cuda()
            image_flip3 = torch.flip(image,[3])
            image_flip3 = image_flip3.cuda()
            image = image.cuda()
            image_stretch = image_stretch.cuda()
            output1 = model_resnest(image).cpu().data.numpy()
            output2 = model_resnest(image_stretch).cpu().data.numpy()
            output3 = model_effi7(image).cpu().data.numpy()
            output4 = model_effi7(image_stretch).cpu().data.numpy()
            # Flip-augmented predictions are flipped back before averaging so
            # all 8 outputs are spatially aligned.
            output5 = torch.flip(model_resnest(image_flip2),[2]).cpu().data.numpy()
            output6 = torch.flip(model_effi7(image_flip2),[2]).cpu().data.numpy()
            output7 = torch.flip(model_resnest(image_flip3),[3]).cpu().data.numpy()
            output8 = torch.flip(model_effi7(image_flip3),[3]).cpu().data.numpy()
            output = (output1 + output2 + output3 + output4 + output5 + output6 + output7 + output8) / 8.0
            # output.shape: 16,10,256,256
            for i in range(output.shape[0]):
                pred = output[i]
                # for low_ndvi in range(3,8):
                #     pred[low_ndvi][ndvi[i]>35] = 0
                # for high_ndvi in range(3):
                #     pred[high_ndvi][ndvi[i]<0.02] = 0
                # Class ids are shifted to start at 1 before saving.
                pred = np.argmax(pred, axis = 0) + 1
                pred = np.uint8(pred)
                save_path = os.path.join(output_dir, image_path[i][-10:].replace('.tif', '.png'))
                print(save_path)
                cv2.imwrite(save_path, pred)
def training_epoch_end(self, outputs):
    """Log epoch/lr metrics; once past swa_start_epoch, maintain the SWA average.

    Lazily creates the AveragedModel and SWALR scheduler the first time the
    SWA phase is entered, then updates the running average every epoch.
    """
    self.log('epoch_now', self.current_epoch, on_step=False, on_epoch=True, logger=True)
    (oppp) = self.optimizers(use_pl_optimizer=True)
    self.log('lr_now', self.get_lr_inside(oppp), on_step=False, on_epoch=True, logger=True)
    # https://github.com/PyTorchLightning/pytorch-lightning/issues/3095
    if self.learning_params["swa"] and (
            self.current_epoch >= self.learning_params["swa_start_epoch"]):
        if self.swa_model is None:
            # First SWA epoch: build the averaged model and swap the lr
            # schedule over to SWALR's anneal-to-swa_lr behaviour.
            (optimizer) = self.optimizers(use_pl_optimizer=True)
            print("creating_swa")
            self.swa_model = AveragedModel(self.network)
            self.new_scheduler = SWALR(
                optimizer,
                anneal_strategy="linear",
                anneal_epochs=5,
                swa_lr=self.learning_params["swa_lr"])
        # https://pytorch.org/blog/pytorch-1.6-now-includes-stochastic-weight-averaging/
        self.swa_model.update_parameters(self.network)
        self.new_scheduler.step()
def __init__(self, cfg_dir: str):
    """Build the training harness from a config directory.

    Sets up logger, device, dataloaders, model, optimizer/scheduler,
    loss, optional checkpoint resume, early stopping, and — when the run
    is long enough to reach it — the SWA model and scheduler.

    Args:
        cfg_dir: path handed to get_conf to load the run configuration.
    """
    # load config file and initialize the logger and the device
    self.cfg = get_conf(cfg_dir)
    self.logger = self.init_logger(self.cfg.logger)
    self.device = self.init_device()
    # creating dataset interface and dataloader for trained data
    self.data, self.val_data = self.init_dataloader()
    # create model and initialize its weights and move them to the device
    self.model = self.init_model()
    # initialize the optimizer
    self.optimizer, self.lr_scheduler = self.init_optimizer()
    # define loss function
    self.criterion = torch.nn.CrossEntropyLoss()
    # if resuming, load the checkpoint
    self.if_resume()
    # initialize the early_stopping object
    self.early_stopping = EarlyStopping(
        patience=self.cfg.train_params.patience,
        verbose=True,
        delta=self.cfg.train_params.early_stopping_delta,
    )
    # stochastic weight averaging — only constructed when training will
    # actually run past the configured swa_start epoch
    if self.cfg.train_params.epochs > self.cfg.train_params.swa_start:
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)
def fit_model(self):
    """
    Fits model. Uses AdamW optimizer, model averaging, and a cosine
    annealing learning rate schedule.

    Trains for 1000 epochs on (self.x, self.y); after epoch 750 the SWA
    average is updated and SWALR drives the learning rate instead of the
    cosine warm-restart schedule.
    """
    optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, 100, 2
    )
    self.swa_model = AveragedModel(self.model)
    swa_start = 750  # epoch after which SWA takes over scheduling
    swa_scheduler = SWALR(
        optimizer, swa_lr=0.001, anneal_epochs=10, anneal_strategy="cos"
    )
    self.model.train()
    self.swa_model.train()
    for epoch in range(1000):
        optimizer.zero_grad()
        output = self.model(self.x)
        # assumes the model returns a distribution object with log_prob
        # (negative log-likelihood loss) — TODO confirm against model class
        loss = -output.log_prob(self.y.view(-1, 1)).sum()
        loss.backward()
        optimizer.step()
        if epoch > swa_start:
            self.swa_model.update_parameters(self.model)
            swa_scheduler.step()
        else:
            scheduler.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch} complete. Loss: {loss}")
def _configure_optimizers(self, ) -> None:
    """Loads the optimizers.

    Instantiates the optimizer factory stored in self._optimizer with the
    network's parameters, then (if configured) the lr scheduler dict and the
    optional SWA scheduler/averaged-network pair.
    """
    if self._optimizer is not None:
        # self._optimizer starts as a factory/class; replace it with the
        # constructed optimizer instance.
        self._optimizer = self._optimizer(self._network.parameters(),
                                          **self.optimizer_args)
    else:
        self._optimizer = None
    if self._optimizer and self._lr_scheduler is not None:
        if "steps_per_epoch" in self.lr_scheduler_args:
            # Schedulers like OneCycleLR need the real step count per epoch.
            self.lr_scheduler_args["steps_per_epoch"] = len(
                self.train_dataloader())
        # Assume lr scheduler should update at each epoch if not specified.
        if "interval" not in self.lr_scheduler_args:
            interval = "epoch"
        else:
            interval = self.lr_scheduler_args.pop("interval")
        self._lr_scheduler = {
            "lr_scheduler": self._lr_scheduler(self._optimizer,
                                               **self.lr_scheduler_args),
            "interval": interval,
        }
    if self.swa_args is not None:
        # NOTE(review): assumes self._optimizer is set whenever swa_args is —
        # SWALR(None, ...) would fail; confirm config invariants upstream.
        self._swa_scheduler = {
            "swa_scheduler": SWALR(self._optimizer, swa_lr=self.swa_args["lr"]),
            "swa_start": self.swa_args["start"],
        }
        self._swa_network = AveragedModel(self._network).to(self.device)
def get_swa(optimizer, model, swa_lr=0.005, anneal_epochs=10, anneal_strategy="cos"):
    '''Build the SWA companions for an existing optimizer/model pair.

    Arguments:
        optimizer (torch.optim.Optimizer): wrapped optimizer
        model (torch.nn.Module): model whose weights will be averaged
        swa_lr (float or list): the learning rate value for all param groups
            together or separately for each group.
        anneal_epochs (int): number of epochs in the annealing phase
            (default: 10)
        anneal_strategy (str): "cos" or "linear"; specifies the annealing
            strategy: "cos" for cosine annealing, "linear" for linear
            annealing (default: "cos")

    Returns:
        (SWALR, AveragedModel): the SWA learning-rate scheduler and the
        running-average model wrapper, in that order.
    '''
    # Fixed: the old docstring documented a last_epoch parameter that does not
    # exist in this signature (and gave it the bogus default 'cos'); the dead
    # commented-out SWALR variants were removed as well.
    swa_model = AveragedModel(model)
    swa_scheduler = SWALR(optimizer,
                          swa_lr=swa_lr,
                          anneal_epochs=anneal_epochs,
                          anneal_strategy=anneal_strategy)
    return swa_scheduler, swa_model
def __init__(self, cfg_dir: str, data_loader: DataLoader, model, labels_definition):
    """Training harness constructor.

    Loads config, wires up the supplied dataloader and model, builds the
    optimizer (Adam or RMSprop per config), cosine lr schedule, BCE loss,
    optionally resumes from a checkpoint, and prepares early stopping plus
    the SWA model/scheduler.

    Args:
        cfg_dir: path handed to get_conf to load the run configuration.
        data_loader: training dataloader, used as-is.
        model: network to train; moved to the configured device.
        labels_definition: label metadata kept for later use.
    """
    self.cfg = get_conf(cfg_dir)
    self._labels_definition = labels_definition
    #TODO
    self.logger = self.init_logger(self.cfg.logger)
    #self.dataset = CustomDataset(**self.cfg.dataset)
    self.data = data_loader
    #self.val_dataset = CustomDatasetVal(**self.cfg.val_dataset)
    #self.val_data = DataLoader(self.val_dataset, **self.cfg.dataloader)
    # self.logger.log_parameters({"tr_len": len(self.dataset),
    #                             "val_len": len(self.val_dataset)})
    self.model = model
    #self.model._resnet.conv1.apply(init_weights_normal)
    self.device = self.cfg.train_params.device
    self.model = self.model.to(device=self.device)
    if self.cfg.train_params.optimizer.lower() == "adam":
        self.optimizer = optim.Adam(self.model.parameters(), **self.cfg.adam)
    elif self.cfg.train_params.optimizer.lower() == "rmsprop":
        self.optimizer = optim.RMSprop(self.model.parameters(), **self.cfg.rmsprop)
    else:
        raise ValueError(
            f"Unknown optimizer {self.cfg.train_params.optimizer}")
    self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        self.optimizer, T_max=100)
    self.criterion = nn.BCELoss()
    if self.cfg.logger.resume:
        # load checkpoint: restores model/optimizer/scheduler state plus the
        # bookkeeping needed to continue (epoch counter, loss history, best).
        print("Loading checkpoint")
        save_dir = self.cfg.directory.load
        checkpoint = load_checkpoint(save_dir, self.device)
        self.model.load_state_dict(checkpoint["model"])
        self.optimizer.load_state_dict(checkpoint["optimizer"])
        self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        self.epoch = checkpoint["epoch"]
        self.e_loss = checkpoint["e_loss"]
        self.best = checkpoint["best"]
        print(
            f"{datetime.now():%Y-%m-%d %H:%M:%S} "
            f"Loading checkpoint was successful, start from epoch {self.epoch}"
            f" and loss {self.best}")
    else:
        self.epoch = 1
        self.best = np.inf
        self.e_loss = []
    # initialize the early_stopping object
    self.early_stopping = EarlyStopping(
        patience=self.cfg.train_params.patience,
        verbose=True,
        delta=self.cfg.train_params.early_stopping_delta,
    )
    # stochastic weight averaging
    self.swa_model = AveragedModel(self.model)
    self.swa_scheduler = SWALR(self.optimizer, **self.cfg.SWA)
def train(num_epochs, model, data_loader, val_loader, val_every, device, file_name):
    """Train a 12-class segmentation model with AdamP+Lookahead and SWA.

    Loss is the sum of soft cross-entropy and Jaccard losses. Validation
    runs every `val_every` epochs and the best-mIoU model is saved; from
    epoch 4 onward the SWA average and SWALR schedule are updated.
    """
    learning_rate = 0.0001
    from torch.optim.swa_utils import AveragedModel, SWALR
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP
    criterion = [SoftCrossEntropyLoss(smooth_factor=0.1), JaccardLoss('multiclass', classes=12)]
    optimizer = AdamP(params=model.parameters(), lr=learning_rate, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=learning_rate)
    swa_model = AveragedModel(model)
    look = Lookahead(optimizer, la_alpha=0.5)
    print('Start training..')
    best_miou = 0
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))  # running 12x12 confusion matrix
        model.train()
        for step, (images, masks, _) in enumerate(data_loader):
            loss = 0
            images = torch.stack(images)  # (batch, channel, height, width)
            masks = torch.stack(masks).long()  # (batch, channel, height, width)
            # move tensors to the target device for GPU computation
            images, masks = images.to(device), masks.to(device)
            # inference
            outputs = model(images)
            # accumulate both loss terms (soft CE + Jaccard)
            for i in criterion:
                loss += i(outputs, masks)
            # optimization is driven through the Lookahead wrapper
            look.zero_grad()
            loss.backward()
            look.step()
            outputs = torch.argmax(outputs.squeeze(), dim=1).detach().cpu().numpy()
            hist = add_hist(hist, masks.detach().cpu().numpy(), outputs, n_class=12)
            acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
            # print loss and mIoU every 25 steps
            if (step + 1) % 25 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU: {:.4f}'.format(
                    epoch + 1, num_epochs, step + 1, len(data_loader), loss.item(), mIoU))
        # run validation every val_every epochs and keep the best-mIoU model
        if (epoch + 1) % val_every == 0:
            avrg_loss, val_miou = validation(epoch + 1, model, val_loader, criterion, device)
            if val_miou > best_miou:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_miou = val_miou
                save_model(model, file_name = file_name)
        # SWA kicks in after the first few warm-up epochs
        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
def weight_averaging(model_class, checkpoint_paths, data_loader, device):
    """Average the weights of several checkpoints into one model.

    Each checkpoint is loaded via `model_class.load_from_checkpoint` and
    folded into a running equal-weight average; batch-norm statistics are
    then recomputed over `data_loader` on `device`.

    Returns:
        The AveragedModel holding the averaged weights, moved to `device`.
    """
    from torch.optim.swa_utils import AveragedModel, update_bn

    # Seed the wrapper with the first checkpoint's architecture/weights.
    averaged = AveragedModel(model_class.load_from_checkpoint(checkpoint_paths[0]))

    # Fold every checkpoint (including the first) into the running average.
    for ckpt_path in checkpoint_paths:
        averaged.update_parameters(model_class.load_from_checkpoint(ckpt_path))

    averaged = averaged.to(device)
    # Recompute BN running stats for the averaged weights.
    update_bn(data_loader, averaged, device)
    return averaged
def build_swa_model(cfg: CfgNode, model: torch.nn.Module, optimizer: torch.optim.Optimizer):
    """Create the SWA averaged model and its SWALR scheduler.

    The SWA learning rate is the solver base lr scaled by the configured
    SWA lr factor.

    Returns:
        (AveragedModel, SWALR) pair for the given model/optimizer.
    """
    # Instead of copying weights during initialization, the SWA model copies
    # the model weights when update_parameters is first called (torch 1.7):
    # https://github.com/pytorch/pytorch/blob/1.7/torch/optim/swa_utils.py#L107
    # It must still be constructed on every rank in distributed training,
    # otherwise training can get stuck.
    averaged = AveragedModel(model)
    swa_lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.SWA.LR_FACTOR
    return averaged, SWALR(optimizer, swa_lr=swa_lr)
def average_model_weights(checkpoint_path, average_fn, checkpoint_N):
    """Average the last `checkpoint_N` checkpoints in a directory and save the result.

    Checkpoints (*.pt) are sorted numerically by the trailing step number in
    their filename, the last N are averaged via AveragedModel with `average_fn`,
    and the averaged state dict is written back next to the inputs.

    Args:
        checkpoint_path: directory containing the *.pt checkpoint files.
        average_fn: avg_fn passed to AveragedModel (None for equal-weight mean).
        checkpoint_N: number of most-recent checkpoints to average; <=0 or
            larger than available means "use all".

    Returns:
        The AveragedModel containing the averaged weights.
    """
    checkpoint_files = [
        os.path.join(checkpoint_path, file_name)
        for file_name in os.listdir(checkpoint_path)
        if file_name.endswith(".pt")
    ]

    def ckpt_key(ckpt):
        # Sort numerically by the trailing "<step>.pt" filename component.
        return int(ckpt.split('_')[-1].split('.')[0])

    try:
        checkpoint_files = sorted(checkpoint_files, key=ckpt_key)
    except (ValueError, IndexError):
        # Fixed: was a bare `except:` (which also swallowed KeyboardInterrupt)
        # and the deprecated logging.warn alias.
        logging.warning(
            "Checkpoint names are changed, which may cause inconsistent order."
        )

    # Select the last N checkpoints only.
    if checkpoint_N > 0 and checkpoint_N <= len(checkpoint_files):
        checkpoint_files = checkpoint_files[-checkpoint_N:]

    # initialize averaged model with first checkpoint
    model = load_model(checkpoint_files[0])
    averaged_model = AveragedModel(model, avg_fn=average_fn)

    # loop through the checkpoints and update the averaged model
    for checkpoint in checkpoint_files:
        model = load_model(checkpoint)
        averaged_model.update_parameters(model)

    last_checkpoint = torch.load(checkpoint_files[-1])
    opts = last_checkpoint['opts']
    filename = f'{opts.model}_{opts.data}_{last_checkpoint["epoch"]}_averaged.pt'
    save_path = os.path.join(checkpoint_path, filename)

    # BUG FIX: the precision cast used to be applied to `model` (the last
    # checkpoint loaded), but the state dict that is saved below comes from
    # `averaged_model.module`, so the cast never affected the output file.
    if opts.precision[-3:] == ".16":
        averaged_model.half()
    else:
        averaged_model.float()

    torch.save(
        {
            'epoch': last_checkpoint['epoch'] + 1,
            'model_state_dict': averaged_model.module.state_dict(),
            'loss': 0,  # dummy just to work with validate script
            'train_accuracy': 0,  # dummy just to work with validate script
            'opts': opts
        }, save_path)
    return averaged_model
def __init__(self, model, device, config, fold_num):
    """Fitter setup: result directories, SWA model, AMP scaler, grouped
    weight-decay optimizer, criterion, early stopping, and wandb logging.

    Args:
        model: network to train.
        device: torch device string/object used for training.
        config: experiment config (dirs, mix ratios, optimizer/scheduler
            names and params, amp flag, etc.).
        fold_num: cross-validation fold index, used for wandb setup.
    """
    self.config = config
    self.epoch = 0
    self.start_epoch = 0
    self.fold_num = fold_num
    # stage2 runs get their own result subtree
    if self.config.stage2:
        self.base_dir = f'./result/stage2/{config.dir}/{config.dir}_fold_{config.fold_num}'
    else:
        self.base_dir = f'./result/{config.dir}/{config.dir}_fold_{config.fold_num}'
    os.makedirs(self.base_dir, exist_ok=True)
    self.log_path = f'{self.base_dir}/log.txt'
    self.best_summary_loss = 10**5  # sentinel "worst" loss
    self.model = model
    self.swa_model = AveragedModel(self.model)
    self.device = device
    self.wandb = True
    # augmentation mix ratios
    self.cutmix = self.config.cutmix_ratio
    self.fmix = self.config.fmix_ratio
    self.smix = self.config.smix_ratio
    self.es = EarlyStopping(patience=8)
    self.scaler = GradScaler()
    self.amp = self.config.amp
    # Exclude biases and LayerNorm weights from weight decay.
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    self.optimizer, self.scheduler = get_optimizer(
        self.model, self.config.optimizer_name, self.config.optimizer_params,
        self.config.scheduler_name, self.config.scheduler_params,
        self.config.n_epochs)
    self.criterion = get_criterion(self.config.criterion_name,
                                   self.config.criterion_params)
    self.log(f'Fitter prepared. Device is {self.device}')
    set_wandb(self.config, fold_num)
def before_run(self, runner):
    """Construct the averaged model which will keep track of the running
    averages of the parameters of the model, and ensure runner meta has a
    'hook_msgs' entry.

    Args:
        runner: training runner exposing `.model` and `.meta`.
    """
    model = runner.model
    self.model = AveragedModel(model)
    self.meta = runner.meta
    if self.meta is None:
        self.meta = dict()
    # setdefault already guarantees the key exists afterwards; the original
    # repeated the same guard a second time (with the non-idiomatic
    # `not ... in`) — the duplicate was removed.
    self.meta.setdefault('hook_msgs', dict())
def __init__(self, config):
    """Trainer setup for an MLP: optimizer (optionally SAM), lr scheduler
    with warmup, and the SWA model/scheduler pair.

    Args:
        config: experiment config providing optimizer/scheduler names,
            step counts, swa_start epoch, swa_lr, and the loss criterion.
    """
    self.config = config
    self.device = 'cuda' if cuda.is_available() else 'cpu'
    self.model = MLP(config)
    self.swa_model = AveragedModel(self.model)
    self.optimizer = make_optimizer(self.model,
                                    optimizer_name=self.config.optimizer,
                                    sam=self.config.sam)
    self.scheduler = make_scheduler(self.optimizer,
                                    decay_name=self.config.scheduler,
                                    num_training_steps=self.config.num_training_steps,
                                    num_warmup_steps=self.config.num_warmup_steps)
    # epoch at which SWA averaging should begin
    self.swa_start = self.config.swa_start
    self.swa_scheduler = SWALR(self.optimizer, swa_lr=self.config.swa_lr)
    self.epoch_num = 0
    self.criterion = self.config.criterion
def train_model(indep_vars, dep_var, verbose=True):
    """
    Trains MDNVol network. Uses AdamW optimizer with cosine annealing
    learning rate schedule. Ouputs averaged model over the last 25% of
    training epochs.

    indep_vars: n x m torch tensor containing independent variables
        n = number of data points
        m = number of input variables
    dep_var: n x 1 torch tensor containing single dependent variable
        n = number of data points
        1 = single output variable
    verbose: print loss every 10 epochs when True.

    Returns the SWA-averaged MDN in eval mode.
    """
    model = MDN(indep_vars.shape[1], 1, 250, 5)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, 100, 2)
    swa_model = AveragedModel(model)
    swa_start = 750  # last 25% of the 1000 epochs are averaged
    swa_scheduler = SWALR(optimizer, swa_lr=0.001, anneal_epochs=10,
                          anneal_strategy="cos")
    model.train()
    swa_model.train()
    for epoch in range(1000):
        optimizer.zero_grad()
        output = model(indep_vars)
        # MDN output is a distribution; minimize negative log-likelihood.
        loss = -output.log_prob(dep_var).sum()
        loss.backward()
        optimizer.step()
        if epoch > swa_start:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step()
        if epoch % 10 == 0:
            if verbose:
                print(f"Epoch {epoch} complete. Loss: {loss}")
    swa_model.eval()
    return swa_model
def predict(model_class, test_set, checkpoint_path, device, robust):
    """Restore a classifier checkpoint and predict over a test set.

    Args:
        model_class: class to instantiate with checkpoint model_params.
        test_set: dataset handed to model.predict.
        checkpoint_path: path to a torch checkpoint file.
        device: map_location / target device.
        robust: whether the checkpoint was trained with the robust
            (mean + log-std) head; must match the checkpoint.

    Returns:
        (df, y_test, logits, pre_logits): results DataFrame with per-class
        columns, targets, softmax probabilities, and raw outputs.
    """
    assert isfile(
        checkpoint_path), f"no checkpoint found at '{checkpoint_path}'"
    checkpoint = torch.load(checkpoint_path, map_location=device)
    chk_robust = checkpoint["model_params"]["robust"]
    assert (chk_robust == robust
            ), f"checkpoint['robust'] != robust ({chk_robust} vs {robust})"
    model = model_class(**checkpoint["model_params"], device=device)
    model.to(device)
    model.load_state_dict(checkpoint["state_dict"])
    if "swa" in checkpoint.keys():
        # Restore the SWA-averaged weights alongside the raw model.
        model.swa = checkpoint["swa"]
        model_dict = model.swa["model_state_dict"]
        model.swa["model"] = AveragedModel(model)
        model.swa["model"].load_state_dict(model_dict)
    idx, comp, y_test, output = model.predict(test_set)
    df = pd.DataFrame({"idx": idx, "comp": comp, "y_test": y_test})
    if model.robust:
        # Robust head emits (mean, log_std) per class; sample the softmax.
        mean, log_std = output.chunk(2, dim=1)
        pre_logits_std = torch.exp(log_std).cpu().numpy()
        logits = sampled_softmax(mean, log_std, samples=10).cpu().numpy()
        pre_logits = mean.cpu().numpy()
        # NOTE(review): this loop index shadows the earlier `idx` array from
        # model.predict — harmless for the returned df, but confusing.
        for idx, std_al in enumerate(pre_logits_std.T):
            df[f"class_{idx}_std_al"] = std_al
    else:
        pre_logits = output.cpu().numpy()
        logits = softmax(pre_logits, axis=1)
    for idx, (logit, pre_logit) in enumerate(zip(logits.T, pre_logits.T)):
        df[f"class_{idx}_logit"] = logit
        df[f"class_{idx}_pred"] = pre_logit
    return df, y_test, logits, pre_logits
def __init__(self, blocks, channels, features, pre_act=False, radix=1,
             groups=1, bottleneck_width=64, activation=nn.SiLU,
             squeeze_excitation=False, bottleneck=False,
             bottleneck_expansion=4, beta=0, val_lambda=0.333, lr=1e-2,
             use_swa=False, swa_lr=1e-2, swa_freq=250):
    """Module wrapper around PolicyValueNetwork with optional SWA.

    All hyperparameters are captured via save_hyperparameters; `beta`,
    `val_lambda`, `lr`, `swa_lr`, and `swa_freq` are stored there for use
    by the training logic elsewhere in the class.
    """
    super(Network, self).__init__()
    self.save_hyperparameters()
    self.net = PolicyValueNetwork(
        blocks=blocks,
        channels=channels,
        features=features,
        pre_act=pre_act,
        activation=activation,
        squeeze_excitation=squeeze_excitation,
        bottleneck=bottleneck,
        bottleneck_expansion=bottleneck_expansion,
        radix=radix,
        groups=groups,
        bottleneck_width=bottleneck_width
    )
    if use_swa:
        # running weight average over self.net
        self.swa_model = AveragedModel(self.net)
    # per-sample CE so losses can be weighted/reduced downstream
    self.ce = nn.CrossEntropyLoss(reduction='none')
    self.bce = nn.BCEWithLogitsLoss()
def test_1(model_path, output_dir, test_loader, addNDVI):
    """Single-model segmentation inference with 2-way TTA (raw + stretched).

    Averages the logits of the raw and stretched inputs and writes per-image
    uint8 class maps (classes shifted to start at 1) as PNGs.

    Args:
        model_path: checkpoint path; if it contains "swa" the state dict is
            loaded through an AveragedModel wrapper.
        output_dir: directory to write predicted PNG masks into.
        test_loader: yields (image, image_stretch, image_path, ndvi) batches.
        addNDVI: if truthy, the model was trained with an extra NDVI channel.
    """
    in_channels = 4
    if (addNDVI):
        in_channels += 1
    model = smp.UnetPlusPlus(
        encoder_name="resnet101",
        encoder_weights="imagenet",
        in_channels=in_channels,
        classes=10,
    )
    # model = smp.DeepLabV3Plus(
    #     encoder_name="timm-regnety_320",  #resnet101
    #     encoder_weights="imagenet",
    #     in_channels=4,
    #     classes=8,
    # )
    # If the checkpoint was produced by SWA training, wrap before loading.
    if ("swa" in model_path):
        model = AveragedModel(model)
    model.to(DEVICE)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    for image, image_stretch, image_path, ndvi in test_loader:
        with torch.no_grad():
            image = image.cuda()
            image_stretch = image_stretch.cuda()
            output1 = model(image).cpu().data.numpy()
            output2 = model(image_stretch).cpu().data.numpy()
            output = (output1 + output2) / 2.0
        for i in range(output.shape[0]):
            pred = output[i]
            pred = np.argmax(pred, axis=0) + 1
            pred = np.uint8(pred)
            save_path = os.path.join(
                output_dir,
                image_path[i].split('\\')[-1].replace('.tif', '.png'))
            #print(image_path[i][-10:])
            print(save_path)
            cv2.imwrite(save_path, pred)
def __init__(self, config: DNNConfig):
    """DNN trainer setup: MSE criterion, optimizer (optionally SAM),
    cosine-annealing scheduler, and optional SWA model/scheduler.

    Args:
        config: DNNConfig with epoch_num, device, lr, weight_decay, issam,
            optimizer/scheduler names, T_max, and optional isswa/swa_start.
    """
    self.config = config
    self.epochs = config.epoch_num
    self.device = config.device
    # NOTE(review): tmp_model comes from module scope — confirm intent.
    self.model = tmp_model
    #self.criterion = CustomLoss()
    self.criterion = nn.MSELoss()
    optimizer_kwargs = {
        'lr': config.lr,
        'weight_decay': config.weight_decay
    }
    self.sam = config.issam
    self.optimizer = make_optimizer(self.model,
                                    optimizer_kwargs,
                                    optimizer_name=config.optimizer_name,
                                    sam=config.issam)
    self.scheduler_name = config.scheduler_name
    self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer=self.optimizer, T_max=config.T_max)
    # BUG FIX: `config.getattr('isswa', False)` called a (non-existent)
    # method on the config object; the builtin getattr(config, name,
    # default) is what was intended.
    self.isswa = getattr(config, 'isswa', False)
    self.swa_start = getattr(config, 'swa_start', 0)
    # BUG FIX: guard on self.isswa — reading config.isswa directly would
    # raise AttributeError for configs that omit the flag, defeating the
    # default fetched above.
    if self.isswa:
        self.swa_model = AveragedModel(self.model)
        self.swa_scheduler = SWALR(self.optimizer, swa_lr=0.025)
    #self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=self.optimizer,
    #                                                      mode=config.mode, factor=config.factor)
    self.loss_log = {
        'train_loss': [],
        'train_score': [],
        'valid_loss': [],
        'valid_score': []
    }
def predict(model_class, test_set, checkpoint_path, device, robust):
    """Restore a regression checkpoint and predict (denormalized) targets.

    Args:
        model_class: class to instantiate with checkpoint model_params.
        test_set: dataset handed to model.predict.
        checkpoint_path: path to a torch checkpoint file.
        device: map_location / target device.
        robust: whether the checkpoint was trained with the robust
            (mean + log-std) head; must match the checkpoint.

    Returns:
        DataFrame with idx/comp/y_test, denormalized "pred", and — for
        robust models — aleatoric "std_al".
    """
    assert isfile(
        checkpoint_path), f"no checkpoint found at '{checkpoint_path}'"
    checkpoint = torch.load(checkpoint_path, map_location=device)
    chk_robust = checkpoint["model_params"]["robust"]
    assert (chk_robust == robust
            ), f"checkpoint['robust'] != robust ({chk_robust} vs {robust})"
    model = model_class(**checkpoint["model_params"], device=device)
    model.to(device)
    model.load_state_dict(checkpoint["state_dict"])
    # Targets were normalized at train time; restore the same normalizer.
    normalizer = Normalizer()
    normalizer.load_state_dict(checkpoint["normalizer"])
    if "swa" in checkpoint.keys():
        # Restore the SWA-averaged weights alongside the raw model.
        model.swa = checkpoint["swa"]
        model_dict = model.swa["model_state_dict"]
        model.swa["model"] = AveragedModel(model)
        model.swa["model"].load_state_dict(model_dict)
    idx, comp, y_test, output = model.predict(test_set)
    df = pd.DataFrame({"idx": idx, "comp": comp, "y_test": y_test})
    output = output.cpu().squeeze(
    )  # move preds to CPU in case model ran on GPU
    if robust:
        # Robust head emits (mean, log_std_al); denormalize both.
        mean, log_std_al = (x.squeeze() for x in output.chunk(2, dim=1))
        df["pred"] = normalizer.denorm(mean).numpy()
        df["std_al"] = (log_std_al.exp() * normalizer.std).numpy()
    else:
        df["pred"] = normalizer.denorm(output).numpy()
    return df
def load_weights(self, network_fn: Optional[Type[nn.Module]] = None) -> None:
    """Load the network weights.

    Globs self.weights_filename, loads the first match onto self._device,
    optionally re-instantiates the network from the stored network_args,
    and restores the SWA network if the checkpoint contains one.

    Args:
        network_fn: optional network factory; when given, a fresh network is
            built from the checkpoint's network_args before loading weights.

    Raises:
        FileNotFoundError: if no file matches self.weights_filename.
    """
    logger.debug("Loading network with pretrained weights.")
    # BUG FIX: the original did `glob(...)[0]` first, which raises
    # IndexError on an empty match list — the FileNotFoundError guard below
    # could never fire. Check the match list before indexing.
    matches = glob(self.weights_filename)
    if not matches:
        raise FileNotFoundError(
            f"Could not find any pretrained weights at {self.weights_filename}"
        )
    filename = matches[0]

    # Loading state directory.
    state_dict = torch.load(filename, map_location=torch.device(self._device))
    self._network_args = state_dict["network_args"]
    weights = state_dict["model_state"]

    # Initializes the network with trained weights.
    if network_fn is not None:
        self._network = network_fn(**self._network_args)
    self._network.load_state_dict(weights)

    if "swa_network" in state_dict:
        self._swa_network = AveragedModel(self._network).to(self.device)
        self._swa_network.load_state_dict(state_dict["swa_network"])
def DecoderTensorWriting(model_weight_path, decoder_img_output_path,
                         image_root_path, imageNames, if_swa=True):
    """Run the encoder-decoder net over images and save the decoder output
    (after sigmoid) as image files.

    Args:
        model_weight_path: checkpoint to load (SWA-wrapped when if_swa).
        decoder_img_output_path: directory for the reconstructed images.
        image_root_path: directory containing the input images.
        imageNames: filenames (relative to image_root_path) to process.
        if_swa: whether the checkpoint was saved through AveragedModel.
    """
    device = "cuda:1"
    # Network hyperparameters come from module-level globals.
    model = EncoderDecoderNet(inChannels=3,
                              encodedDimension=encodedDimension,
                              drop_ratio=0,
                              layersExpandRatio=layersExpandRatio,
                              channelsExpandRatio=channelsExpandRatio,
                              blockExpandRatio=blockExpandRatio,
                              encoderImgHeight=12,
                              encoderImgWidth=52,
                              ch=12,
                              if_add_plate_infor=True).to(device)
    if if_swa:
        # SWA checkpoints were saved through AveragedModel; wrap before loading.
        model = AveragedModel(model)
    model.load_state_dict(torch.load(model_weight_path))
    model = model.eval()
    transformer = tv.transforms.Compose([tv.transforms.ToTensor()])
    for i, nameD in enumerate(imageNames):
        imgD = Image.open(os.path.join(image_root_path, nameD)).convert("RGB")
        print("Decoder : ", i)
        print(nameD)
        tImg = transformer(imgD).unsqueeze(dim=0).to(device)
        if if_add_plate_information:
            # Feed the per-image plate vector as the auxiliary input.
            decoderTensor, encoderT = model(
                tImg,
                torch.from_numpy(np.array([img2Plates[nameD]
                                           ])).float().to(device))
        else:
            decoderTensor, encoderT = model(tImg, None)
        #print(encoderT)
        decoder = torch.sigmoid(decoderTensor).detach().cpu().squeeze(dim=0)
        decoderImg = tv.transforms.ToPILImage()(decoder)
        decoderImg.save(os.path.join(decoder_img_output_path, nameD))
def average_model_weights(checkpoint_path, average_fn):
    """Average all *.pt checkpoints in a directory and save the result.

    Every checkpoint is folded into an AveragedModel (using `average_fn`)
    and the averaged state dict is written back next to the inputs.

    Args:
        checkpoint_path: directory containing the *.pt checkpoint files.
        average_fn: avg_fn passed to AveragedModel (None for equal-weight mean).

    Returns:
        The AveragedModel containing the averaged weights.
    """
    # NOTE(review): os.listdir order is arbitrary, so which file acts as the
    # "last" checkpoint (for epoch metadata) is not deterministic — the
    # sibling N-checkpoint variant sorts by step number; confirm if that is
    # wanted here too.
    checkpoint_files = [
        os.path.join(checkpoint_path, file_name)
        for file_name in os.listdir(checkpoint_path)
        if file_name.endswith(".pt")
    ]

    # initialize averaged model with first checkpoint
    model = load_model(checkpoint_files[0])
    averaged_model = AveragedModel(model, avg_fn=average_fn)

    # loop through the checkpoints and update the averaged model
    for checkpoint in checkpoint_files:
        model = load_model(checkpoint)
        averaged_model.update_parameters(model)

    last_checkpoint = torch.load(checkpoint_files[-1])
    opts = last_checkpoint['opts']
    filename = f'{opts.model}_{opts.data}_{last_checkpoint["epoch"]}_averaged.pt'
    save_path = os.path.join(checkpoint_path, filename)

    # BUG FIX: the precision cast used to be applied to `model` (the last
    # checkpoint loaded), but the state dict that is saved below comes from
    # `averaged_model.module`, so the cast never affected the output file.
    if opts.precision[-3:] == ".16":
        averaged_model.half()
    else:
        averaged_model.float()

    torch.save(
        {
            'epoch': last_checkpoint['epoch'] + 1,
            'model_state_dict': averaged_model.module.state_dict(),
            'loss': 0,  # dummy just to work with validate script
            'train_accuracy': 0,  # dummy just to work with validate script
            'opts': opts
        }, save_path)
    return averaged_model
def EncoderTensorWriting(model_weight_path, write_path, image_root_path,
                         imageNames, if_swa=True):
    """Run the encoder-decoder net over images and save the encoder latent
    tensors as .npy files.

    Args:
        model_weight_path: checkpoint to load (SWA-wrapped when if_swa).
        write_path: directory for the saved latent arrays.
        image_root_path: directory containing the input images.
        imageNames: filenames (relative to image_root_path) to process.
        if_swa: whether the checkpoint was saved through AveragedModel.
    """
    device = "cuda:1"
    # Network hyperparameters come from module-level globals.
    model = EncoderDecoderNet(inChannels=3,
                              encodedDimension=encodedDimension,
                              drop_ratio=0,
                              layersExpandRatio=layersExpandRatio,
                              channelsExpandRatio=channelsExpandRatio,
                              blockExpandRatio=blockExpandRatio,
                              encoderImgHeight=12,
                              encoderImgWidth=52,
                              ch=12,
                              if_add_plate_infor=True).to(device)
    if if_swa:
        # SWA checkpoints were saved through AveragedModel; wrap before loading.
        model = AveragedModel(model)
    model.load_state_dict(torch.load(model_weight_path))
    model = model.eval()
    transformer = tv.transforms.Compose([tv.transforms.ToTensor()])
    for i, nameE in enumerate(imageNames):
        imgE = Image.open(os.path.join(image_root_path, nameE)).convert("RGB")
        print("Encoder : ", i)
        print(nameE)
        tImg = transformer(imgE).unsqueeze(dim=0).to(device)
        if if_add_plate_information:
            # Feed the per-image plate vector as the auxiliary input.
            _, encoderTensor = model(
                tImg,
                torch.from_numpy(np.array([img2Plates[nameE]
                                           ])).float().to(device))
        else:
            _, encoderTensor = model(tImg, None)
        encoderTensor = encoderTensor.detach().cpu().numpy()
        # Drop the batch dimension before saving.
        encoderTensor = np.squeeze(encoderTensor, axis=0)
        np.save(os.path.join(write_path, nameE), encoderTensor)
trainTransforms = tv.transforms.Compose([ tv.transforms.RandomHorizontalFlip(p=0.5), tv.transforms.RandomVerticalFlip(p=0.5), tv.transforms.RandomApply( [tv.transforms.RandomCrop(size=randomCropSize)], p=0.5), tv.transforms.RandomApply([tv.transforms.RandomRotation(degrees=60)], p=0.5), tv.transforms.Resize(size=inputImageSize), tv.transforms.ToTensor(), #tv.transforms.RandomErasing(p=0.2, scale=(0.1, 0.15), ratio=(0.1, 1.)) ]) testTransforms = tv.transforms.Compose( [tv.transforms.Resize(size=inputImageSize), tv.transforms.ToTensor()]) model = tv.models.resnet50(num_classes=3).to(device) swa_model = AveragedModel(model) if trainOrTest.lower() == "train": ### Optimizer optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-5) cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, epoch, eta_min=0, last_epoch=-1) scheduler = GradualWarmupScheduler(optimizer, multiplier=multiplier, total_epoch=warmEpoch, after_scheduler=cosine_scheduler) swa_scheduler = SWALR(optimizer, swa_lr=LR, anneal_epochs=15,
def main():
    """Train SE-ResNet on CINIC-style data with optional SWA and Cutout.

    Reads hyperparameters from module-level constants, enforces a 2M
    parameter budget, resumes from SAVEPATH/latest_checkpoint.pth when
    present, and checkpoints every epoch (plus every 10th epoch).
    """
    os.makedirs(SAVEPATH, exist_ok=True)
    print('save path:', SAVEPATH)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('device:', device)
    print('weight_decay:', WEIGHTDECAY)
    print('momentum:', MOMENTUM)
    print('batch_size:', BATCHSIZE)
    print('lr:', LR)
    print('epoch:', EPOCHS)
    print('Label smoothing:', LABELSMOOTH)
    print('Stochastic Weight Averaging:', SWA)
    if SWA:
        print('Swa lr:', SWA_LR)
        print('Swa start epoch:', SWA_START)
    print('Cutout augmentation:', CUTOUT)
    if CUTOUT:
        print('Cutout size:', CUTOUTSIZE)
    print('Activation:', ACTIVATION)
    # get model
    model = get_seresnet_cifar(activation=ACTIVATION)
    # get loss function
    if LABELSMOOTH:
        criterion = LabelSmoothingLoss(classes=10, smoothing=0.1)
    else:
        criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=LR,
                                momentum=MOMENTUM,
                                weight_decay=WEIGHTDECAY,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                           T_max=EPOCHS,
                                                           eta_min=0)
    model = model.to(device)
    criterion = criterion.to(device)
    # Check number of parameters your model
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {pytorch_total_params}")
    if int(pytorch_total_params) > 2000000:
        print('Your model has the number of parameters more than 2 millions..')
        return
    if SWA:
        # apply swa
        swa_model = AveragedModel(model)
        swa_scheduler = SWALR(optimizer, swa_lr=SWA_LR)
        swa_total_params = sum(p.numel() for p in swa_model.parameters())
        print(f"Swa parameters: {swa_total_params}")
    # cinic mean, std
    normalize = transforms.Normalize(mean=[0.47889522, 0.47227842, 0.43047404],
                                     std=[0.24205776, 0.23828046, 0.25874835])
    if CUTOUT:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
            Cutout(size=CUTOUTSIZE)
        ])
    else:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize
        ])
    train_dataset = torchvision.datasets.ImageFolder('/content/train',
                                                     transform=train_transform)
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCHSIZE,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    # colab reload: resume from the latest checkpoint if one exists
    start_epoch = 0
    if os.path.isfile(os.path.join(SAVEPATH, 'latest_checkpoint.pth')):
        checkpoint = torch.load(os.path.join(SAVEPATH, 'latest_checkpoint.pth'))
        start_epoch = checkpoint['epoch']
        scheduler.load_state_dict(checkpoint['scheduler'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if SWA:
            swa_scheduler.load_state_dict(checkpoint['swa_scheduler'])
            swa_model.load_state_dict(checkpoint['swa_model'])
        print(start_epoch, 'load parameter')
    for epoch in range(start_epoch, EPOCHS):
        print("\n----- epoch: {}, lr: {} -----".format(
            epoch, optimizer.param_groups[0]["lr"]))
        # train for one epoch
        start_time = time.time()
        train(train_loader, epoch, model, optimizer, criterion, device)
        elapsed_time = time.time() - start_time
        print('==> {:.2f} seconds to train this epoch\n'.format(elapsed_time))
        # learning rate scheduling: SWALR after SWA_START, cosine before
        if SWA and epoch > SWA_START:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step()
        if SWA:
            checkpoint = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'swa_model': swa_model.state_dict(),
                'swa_scheduler': swa_scheduler.state_dict()
            }
        else:
            checkpoint = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }
        torch.save(checkpoint, os.path.join(SAVEPATH, 'latest_checkpoint.pth'))
        if epoch % 10 == 0:
            torch.save(checkpoint,
                       os.path.join(SAVEPATH, '%d_checkpoint.pth' % epoch))
def __init__(self):
    """Build the trainer: data tuples, model variant, optimizer, schedulers.

    Reads everything from the module-level ``args`` namespace. Side effects:
    moves the model to CUDA and creates ``args.output`` on disk.
    """
    if args.train is not None:
        self.train_tuple = get_tuple(args.train,
                                     bs=args.batch_size,
                                     shuffle=True,
                                     drop_last=False)
    if args.valid is not None:
        # Large eval batch on multi-GPU, small one otherwise.
        valid_bsize = 2048 if args.multiGPU else 50
        self.valid_tuple = get_tuple(args.valid,
                                     bs=valid_bsize,
                                     shuffle=False,
                                     drop_last=False)
    else:
        self.valid_tuple = None
    # Select Model, X is default
    if args.model == "X":
        self.model = ModelX(args)
    elif args.model == "V":
        self.model = ModelV(args)
    elif args.model == "U":
        self.model = ModelU(args)
    elif args.model == "D":
        self.model = ModelD(args)
    elif args.model == 'O':
        self.model = ModelO(args)
    else:
        # NOTE(review): unknown model names only print a warning;
        # self.model stays unset and later lines will raise AttributeError.
        print(args.model, " is not implemented.")
    # Load pre-trained weights from paths
    if args.loadpre is not None:
        self.model.load(args.loadpre)
    # GPU options
    if args.multiGPU:
        self.model.lxrt_encoder.multi_gpu()
    self.model = self.model.cuda()
    # Losses and optimizer
    self.logsoftmax = nn.LogSoftmax(dim=1)
    self.nllloss = nn.NLLLoss()
    if args.train is not None:
        # Total optimizer steps = batches * epochs / gradient-accumulation.
        batch_per_epoch = len(self.train_tuple.loader)
        self.t_total = int(batch_per_epoch * args.epochs // args.acc)
        print("Total Iters: %d" % self.t_total)

    def is_backbone(n):
        # Parameter-name heuristic: encoder/embeddings/pooler weights are
        # treated as pretrained backbone; everything else is the new head
        # (printed with "F: " for inspection).
        if "encoder" in n:
            return True
        elif "embeddings" in n:
            return True
        elif "pooler" in n:
            return True
        print("F: ", n)
        return False

    no_decay = ['bias', 'LayerNorm.weight']
    params = list(self.model.named_parameters())
    if args.reg:
        # Discriminative LRs: backbone at args.lr, head at 500x that.
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in params if is_backbone(n)],
                "lr": args.lr
            },
            {
                "params": [p for n, p in params if not is_backbone(n)],
                "lr": args.lr * 500
            },
        ]
        for n, p in self.model.named_parameters():
            print(n)
        self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
    else:
        # Standard BERT-style split: no weight decay on biases / LayerNorm.
        optimizer_grouped_parameters = [{
            'params': [p for n, p in params
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.wd
        }, {
            'params': [p for n, p in params
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
    if args.train is not None:
        # Linear warmup over the first 10% of total steps, then linear decay.
        self.scheduler = get_linear_schedule_with_warmup(
            self.optim, self.t_total * 0.1, self.t_total)
    self.output = args.output
    os.makedirs(self.output, exist_ok=True)
    # SWA Method:
    # NOTE(review): both branches use self.t_total, which is only defined when
    # args.train is not None — confirm SWA flags are never set for eval-only runs.
    if args.contrib:
        # torchcontrib-style SWA wraps (and replaces) the optimizer itself.
        self.optim = SWA(self.optim,
                         swa_start=self.t_total * 0.75,
                         swa_freq=5,
                         swa_lr=args.lr)
    if args.swa:
        # torch.optim.swa_utils style: separate averaged model + LR schedule,
        # averaging over the last 25% of training.
        self.swa_model = AveragedModel(self.model)
        self.swa_start = self.t_total * 0.75
        self.swa_scheduler = SWALR(self.optim, swa_lr=args.lr)
def training(model, train_dataloader, valid_dataloader, test_dataloader,
             model_cfg, fold_idx=1):
    """Train one cross-validation fold, optionally finishing with SWA.

    Before ``model_cfg.swa_start`` the raw model is trained/validated; from
    that epoch on, weights are folded into an ``AveragedModel`` whose BN stats
    are refreshed with ``update_bn`` before every evaluation. The best model
    (by validation AUC, or the last epoch when ``cfg.final_train``) is saved
    via ``save`` together with dev/test predictions.

    Args:
        model: torch.nn.Module to train (moved to the detected device).
        train_dataloader / valid_dataloader / test_dataloader: fold loaders.
        model_cfg: config object (lr, eps, weight_decay, epochs, optimizer,
            scheduler, swa_scheduler, swa_start, warmup_steps).
        fold_idx: 1-based fold number; config/model are printed for fold 1 only.
    """
    print("-------- ", str(fold_idx), " --------")
    global model_config
    model_config = model_cfg
    device = get_device()
    model.to(device)
    if fold_idx == 1:
        print('CONFIG: ')
        print([(v, getattr(model_config, v)) for v in dir(model_config)
               if v[:2] != "__"])
        print('MODEL: ', model)
    epochs = model_config.epochs
    if model_config.optimizer == 'AdamW':
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=float(model_config.lr),
            eps=float(model_config.eps),
            weight_decay=float(model_config.weight_decay))
    elif model_config.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=float(model_config.lr))
    if model_config.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(model_config.warmup_steps),
            num_training_steps=len(train_dataloader) * epochs)
    else:
        scheduler = None
    criterion = nn.BCEWithLogitsLoss()  # binary target; not CrossEntropyLoss
    swa_model = AveragedModel(model)
    if model_config.swa_scheduler == 'linear':
        swa_scheduler = SWALR(optimizer, swa_lr=float(model_config.lr))
    else:
        swa_scheduler = CosineAnnealingLR(optimizer, T_max=100)
    print('TRAINING...')
    training_stats = []
    best_dev_auc = float('-inf')
    with tqdm(total=epochs, leave=False) as pbar:
        for epoch_i in range(0, epochs):
            in_swa_phase = epoch_i >= int(model_config.swa_start)
            if in_swa_phase:
                # Refresh BN running stats of the averaged model before use.
                update_bn(train_dataloader, swa_model)
                train_auc, train_acc, avg_train_loss = train(
                    model, train_dataloader, device, criterion, optimizer)
                swa_model.update_parameters(model)
                swa_scheduler.step()
                update_bn(valid_dataloader, swa_model)
                valid_auc, valid_acc, avg_dev_loss, dev_d = valid(
                    swa_model, valid_dataloader, device, criterion)
            else:
                train_auc, train_acc, avg_train_loss = train(
                    model, train_dataloader, device, criterion, optimizer,
                    scheduler=scheduler)
                valid_auc, valid_acc, avg_dev_loss, dev_d = valid(
                    model, valid_dataloader, device, criterion)
            if cfg.final_train:
                # Final (full-data) training has no held-out set.
                valid_auc = 0
                valid_acc = 0
                avg_dev_loss = 0
            add_stats(training_stats, avg_train_loss, avg_dev_loss, train_acc,
                      train_auc, valid_acc, valid_auc)
            # BUG FIX: the original condition used bitwise &/| with `not`:
            #   (cfg.final_train & (epoch_i == epochs-1)) |
            #   (not cfg.final_train & (valid_auc > best_dev_auc))
            # `not` binds looser than `&`, so the second term parsed as
            # `not (cfg.final_train & improved)` and the model was saved on
            # every epoch whenever final_train was False. Use short-circuit
            # boolean logic with explicit grouping instead.
            should_save = ((cfg.final_train and epoch_i == epochs - 1)
                           or (not cfg.final_train
                               and valid_auc > best_dev_auc))
            if should_save:
                best_dev_auc = valid_auc
                if in_swa_phase:
                    update_bn(test_dataloader, swa_model)
                    test_d = gen_test(swa_model, test_dataloader, device)
                    save(fold_idx, swa_model, optimizer, dev_d, test_d,
                         valid_auc)
                else:
                    test_d = gen_test(model, test_dataloader, device)
                    save(fold_idx, model, optimizer, dev_d, test_d, valid_auc)
            pbar.update(1)
    print('TRAINING COMPLETED')
    # Show training results
    col_names = [
        'train_loss', 'train_acc', 'train_auc', 'dev_loss', 'dev_acc',
        'dev_auc'
    ]
    training_stats = pd.DataFrame(training_stats, columns=col_names)
    print(training_stats.head(epochs))
    plot_training_results(training_stats, fold_idx)
    # If config, get best model and make submission
    if cfg.run['submission'] == True:
        make_submission(model, test_dataloader)
def main():
    """Two-phase training of a DownconvUnet: segmentation, then classification.

    Phase 1 trains the segmentation head on NG images + masks; phase 2 trains
    the classification head on OK/NG images. Both phases use Adam with cosine
    annealing, switching to SWA (AveragedModel + SWALR) for the last 25% of
    epochs. Metrics are logged to MLflow; final weights are saved to disk.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_size",
        default=8,
        type=int,
        help="batch size of both segmentation and classification training")
    parser.add_argument(
        "--seg_epoch",
        default=100,
        type=int,
        help="the number of epoch in the segmentation training")
    parser.add_argument(
        "--cls_epoch",
        default=20,
        type=int,
        help="the number of epoch in the classification training")
    parser.add_argument("--lr",
                        default=0.01,
                        type=float,
                        help="the learning rate of training")
    parser.add_argument("--swa_lr",
                        default=0.005,
                        type=float,
                        help="the stochastic learning rate of training")
    # BUG FIX: type=list with nargs='+' split each CLI token into a list of
    # characters; type=float parses each token as a number (defaults unchanged).
    parser.add_argument(
        "--seg_weight",
        default=[0.1, 1],
        type=float,
        nargs='+',
        help="the weight of Binary Cross Entropy in the segmentation learning")
    parser.add_argument(
        "--cls_weight",
        default=[1, 1],
        type=float,
        nargs='+',
        help="the weight of Binary Cross Entropy in the classification learning"
    )
    parser.add_argument("--seed",
                        default=2021,
                        type=int,
                        help="the random seed")
    parser.add_argument(
        "--train_dir",
        default="/train_dir",
        type=str,
        help=
        "the train data directory. it consists of the both ng and ok directorys, and they have img and mask folders."
    )
    parser.add_argument(
        "--val_dir",
        default="/val_dir",
        type=str,
        help=
        "the validation data directory. it consists of the both ng and ok directorys, and they have img and mask folders."
    )
    args = parser.parse_args()
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    segmentation_train = True
    classification_train = True
    train_dir = Path(args.train_dir)
    val_dir = Path(args.val_dir)
    train_ok_dir = str(train_dir / "ok")
    train_mask_dir = str(train_dir / "mask")
    train_ng_dir = str(train_dir / "ng")
    val_ok_dir = str(val_dir / "ok")
    val_mask_dir = str(val_dir / "mask")
    val_ng_dir = str(val_dir / "ng")
    seg_train_dataset = SegmentationDataset(img_dir=train_ng_dir,
                                            mask_dir=train_mask_dir,
                                            n_channels=3,
                                            classes=1,
                                            train=True)
    seg_val_dataset = SegmentationDataset(img_dir=val_ng_dir,
                                          mask_dir=val_mask_dir,
                                          n_channels=3,
                                          classes=1,
                                          train=False)
    cls_train_dataset = ClassificationDataset(ok_dir=train_ok_dir,
                                              ng_dir=train_ng_dir,
                                              n_channels=3,
                                              classes=1,
                                              train=True)
    cls_val_dataset = ClassificationDataset(ok_dir=val_ok_dir,
                                            ng_dir=val_ng_dir,
                                            n_channels=3,
                                            classes=1,
                                            train=False)
    # BUG FIX: loaders previously hard-coded batch_size=8, ignoring --batch_size.
    seg_train_loader = DataLoader(seg_train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True)
    seg_val_loader = DataLoader(seg_val_dataset,
                                batch_size=args.batch_size,
                                shuffle=True)
    cls_train_loader = DataLoader(cls_train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True)
    cls_val_loader = DataLoader(cls_val_dataset,
                                batch_size=args.batch_size,
                                shuffle=True)
    my_model = DownconvUnet(in_channel=3, seg_classes=1, cls_classes=2)
    avg_model = AveragedModel(my_model)
    my_model.to(device)
    avg_model.to(device)
    with mlflow.start_run() as run:
        seg_args = Params(args.batch_size, args.seg_epoch, args.lr, args.seed,
                          args.seg_weight)
        cls_args = Params(args.batch_size, args.cls_epoch, args.lr, args.seed,
                          args.cls_weight)
        # BUG FIX: the original logged vars(seg_args) under both the "seg" and
        # "cls" prefixes; log each phase's own parameter set.
        for mode, phase_args in (("seg", seg_args), ("cls", cls_args)):
            for key, value in vars(phase_args).items():
                mlflow.log_param(f"{mode}_{key}", value)
        # Segmentation train
        if segmentation_train:
            print("-" * 5 + "Segmentation training start" + "-" * 5)
            my_model.mode = 1
            train_metrics = Metrics()
            val_metrics = Metrics()
            optimizer = torch.optim.Adam(my_model.parameters(),
                                         lr=seg_args.lr)
            scheduler = CosineAnnealingLR(optimizer, T_max=100)
            bce = WeightedBCELoss(weight=seg_args.weight)
            swa_start = int(seg_args.num_epoch * 0.75)
            # NOTE(review): seg_args.swa_lr assumed to be provided by Params
            # even though it is not passed to the constructor here — verify.
            swa_scheduler = SWALR(optimizer,
                                  anneal_strategy='linear',
                                  anneal_epochs=swa_start,
                                  swa_lr=seg_args.swa_lr)
            for epoch in range(seg_args.num_epoch):
                # BUG FIX: running stats were initialized once outside the
                # epoch loop, so per-epoch "averages" accumulated across
                # epochs; reset them every epoch.
                train_loss = train_iou = train_acc = 0.
                val_loss = val_iou = val_acc = 0.
                # BUG FIX: .train() was called once before the loop while
                # .eval() ran inside it, leaving epochs >= 2 in eval mode.
                my_model.train()
                for batch_idx, batch in enumerate(seg_train_loader):
                    batch = tuple(t.to(device) for t in batch)
                    seg_x, seg_y = batch
                    optimizer.zero_grad()
                    pred_y = my_model(seg_x)
                    loss = bce(pred_y, seg_y)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()
                    train_metrics.update(pred_y, seg_y, loss.item())
                    train_iou += train_metrics.iou
                    train_acc += train_metrics.acc
                    step = epoch * len(seg_train_loader) + batch_idx
                    for metric, value in vars(train_metrics).items():
                        mlflow.log_metric(f"seg_train_{metric}",
                                          value,
                                          step=step)
                train_loss /= len(seg_train_loader)
                train_iou /= len(seg_train_loader)
                train_acc /= len(seg_train_loader)
                my_model.eval()
                # Validation needs no gradients.
                with torch.no_grad():
                    for batch_idx, batch in enumerate(seg_val_loader):
                        batch = tuple(t.to(device) for t in batch)
                        seg_x, seg_y = batch
                        pred_y = my_model(seg_x)
                        loss = bce(pred_y, seg_y)
                        val_loss += loss.item()
                        # BUG FIX: pass this batch's loss, not the running sum.
                        val_metrics.update(pred_y, seg_y, loss.item())
                        val_iou += val_metrics.iou
                        val_acc += val_metrics.acc
                        step = epoch * len(seg_val_loader) + batch_idx
                        for metric, value in vars(val_metrics).items():
                            mlflow.log_metric(f"seg_val_{metric}",
                                              value,
                                              step=step)
                val_loss /= len(seg_val_loader)
                val_iou /= len(seg_val_loader)
                val_acc /= len(seg_val_loader)
                print(f"Epoch {epoch + 1}:")
                print("-" * 10)
                print(
                    f"train_loss {train_loss:.3f}, train_iou: {train_iou:.3f}, "
                    f"train_accuracy: {train_acc:.3f}")
                print(f"val_loss {val_loss:.3f}, val_iou: {val_iou:.3f}, "
                      f"val_accuracy: {val_acc:.3f}")
                # SWA phase: last 25% of epochs update the averaged model.
                if epoch > swa_start:
                    print("Stochastic average start")
                    avg_model.update_parameters(my_model)
                    swa_scheduler.step()
                else:
                    scheduler.step()
            print("Segmentation train completed")
        # Classification train
        if classification_train:
            print("-" * 5 + "Classification training start" + "-" * 5)
            my_model.mode = 2
            train_metrics = Metrics()
            val_metrics = Metrics()
            optimizer = torch.optim.Adam(my_model.parameters(),
                                         lr=cls_args.lr)
            scheduler = CosineAnnealingLR(optimizer, T_max=100)
            bce = WeightedBCELoss(weight=cls_args.weight)
            swa_start = int(cls_args.num_epoch * 0.75)
            swa_scheduler = SWALR(optimizer,
                                  anneal_strategy='linear',
                                  anneal_epochs=swa_start,
                                  swa_lr=cls_args.swa_lr)
            for epoch in range(cls_args.num_epoch):
                # Reset per-epoch stats (iou is not tracked in this phase and
                # stays 0; it is printed only to mirror the seg-phase output).
                train_loss = train_iou = train_acc = 0.
                val_loss = val_iou = val_acc = 0.
                my_model.train()
                for batch_idx, batch in enumerate(cls_train_loader):
                    batch = tuple(t.to(device) for t in batch)
                    cls_x, cls_y = batch
                    optimizer.zero_grad()
                    pred_y = my_model(cls_x)
                    loss = bce(pred_y, cls_y)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()
                    # BUG FIX: update() was given the accumulated train_loss;
                    # pass the batch loss like the other loops do.
                    train_metrics.update(pred_y, cls_y, loss.item())
                    train_acc += train_metrics.acc
                    # BUG FIX: step/normalization used the *segmentation*
                    # loaders' lengths; use the classification loaders.
                    step = epoch * len(cls_train_loader) + batch_idx
                    for metric, value in vars(train_metrics).items():
                        mlflow.log_metric(f"cls_train_{metric}",
                                          value,
                                          step=step)
                train_loss /= len(cls_train_loader)
                train_acc /= len(cls_train_loader)
                my_model.eval()
                with torch.no_grad():
                    for batch_idx, batch in enumerate(cls_val_loader):
                        batch = tuple(t.to(device) for t in batch)
                        cls_x, cls_y = batch
                        pred_y = my_model(cls_x)
                        loss = bce(pred_y, cls_y)
                        val_loss += loss.item()
                        val_metrics.update(pred_y, cls_y, loss.item())
                        val_acc += val_metrics.acc
                        step = epoch * len(cls_val_loader) + batch_idx
                        for metric, value in vars(val_metrics).items():
                            # BUG FIX: validation metrics were logged under
                            # the "cls_train_" prefix, clobbering train stats.
                            mlflow.log_metric(f"cls_val_{metric}",
                                              value,
                                              step=step)
                val_loss /= len(cls_val_loader)
                val_acc /= len(cls_val_loader)
                print(f"Epoch {epoch + 1}:")
                print("-" * 10)
                print(
                    f"train_loss {train_loss:.3f}, train_iou: {train_iou:.3f}, "
                    f"train_accuracy: {train_acc:.3f}")
                print(f"val_loss {val_loss:.3f}, val_iou: {val_iou:.3f}, "
                      f"val_accuracy: {val_acc:.3f}")
                # BUG FIX: the SWA/scheduler step sat after the epoch loop
                # (ran once); it belongs inside the loop, as in the seg phase.
                if epoch > swa_start:
                    print("Stochastic average start")
                    avg_model.update_parameters(my_model)
                    swa_scheduler.step()
                else:
                    scheduler.step()
            print("Classification train completed")
    # NOTE(review): the filename says "swa" but my_model (not avg_model) is
    # saved, matching the original behavior — confirm which is intended.
    weight_path = "weights/donwconv_swa_weights.pth"
    torch.save(my_model.state_dict(), weight_path)
    print(f"model weight saved to {weight_path}")
if fold > args.nfold - 1: break max_acc = 0. min_loss = 1e10 print('Training with {} started'.format(fold)) print(len(trn_idx), len(val_idx)) train_loader, val_loader = prepare_dataloader(train, trn_idx, val_idx) device = torch.device('cuda') model = CassvaImgClassifier(args.model, train.label.nunique(), pretrained=True).to(device) swa_model = AveragedModel(model).to(device) model = torch.nn.DataParallel(model) scaler = GradScaler() if 'vit' in args.model: optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=0.001) # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1) else: #optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,