def __init__(self, batch_size, strategy, checkpoint_path, num_epochs, model,
             train_num_datasets, test_num_datasets, train_len=None, num_gpu=1,
             save_epoch=1, checkpoint_dir="training", num_classes=2,
             learning_rate=5e-4):
    self.num_epochs = num_epochs
    self.save_tensorboard_image = int(num_gpu) == 1
    self.checkpoint_path = checkpoint_path
    self.train_len = train_len
    self.batch_size = batch_size
    self.strategy = strategy
    self.num_gpu = int(num_gpu)
    self.train_epoch_step = (train_num_datasets // self.batch_size) - 1
    self.test_epoch_step = (test_num_datasets // self.batch_size) - 1
    self.save_epoch = int(save_epoch)
    self.model = model
    self.train_writer = tf.summary.create_file_writer('training')

    self.lr = self.multi_step_lr(initial_learning_rate=learning_rate, epochs=num_epochs)
    self.optimizer = AdamP(learning_rate=self.lr, weight_decay=1e-2)

    self.ckpt = tf.train.Checkpoint(model=self.model, optimizer=self.optimizer)
    self.ckpt_manager = tf.train.CheckpointManager(self.ckpt, checkpoint_dir, max_to_keep=5)
    if self.ckpt_manager.latest_checkpoint:
        self.ckpt.restore(self.ckpt_manager.latest_checkpoint)
        self.epoch = int(self.ckpt_manager.latest_checkpoint.split('-')[-1])
        tf.get_logger().info("Latest checkpoint restored: {}".format(self.ckpt_manager.latest_checkpoint))
    else:
        self.epoch = 0
        tf.get_logger().info('Not restoring from saved checkpoint')

    self.train_acc_metric = tf.keras.metrics.MeanIoU(
        num_classes=num_classes + 1 if num_classes == 1 else num_classes,
        name='train_accuracy')
    self.test_acc_metric = tf.keras.metrics.MeanIoU(
        num_classes=num_classes + 1 if num_classes == 1 else num_classes,
        name='test_accuracy')
    self.train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    self.test_loss_metric = tf.keras.metrics.Mean(name='test_loss')
def __init__(self, image_size, latent_dim=512, fmap_max=512, style_depth=8,
             network_capacity=16, transparent=False, fp16=False, cl_reg=False,
             steps=1, lr=1e-4, fq_layers=[], fq_dict_size=256, attn_layers=[],
             no_const=False):
    super().__init__()
    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.995)

    self.S = StyleVectorizer(latent_dim, style_depth)
    self.G = Generator(image_size, latent_dim, network_capacity, transparent=transparent,
                       attn_layers=attn_layers, no_const=no_const, fmap_max=fmap_max)
    self.D = Discriminator(image_size, network_capacity, fq_layers=fq_layers,
                           fq_dict_size=fq_dict_size, attn_layers=attn_layers,
                           transparent=transparent, fmap_max=fmap_max)
    self.SE = StyleVectorizer(latent_dim, style_depth)
    self.GE = Generator(image_size, latent_dim, network_capacity, transparent=transparent,
                        attn_layers=attn_layers, no_const=no_const)

    # experimental contrastive loss discriminator regularization
    assert not (transparent and cl_reg), 'contrastive loss regularization does not work with transparent images yet'
    self.D_cl = ContrastiveLearner(self.D, image_size, hidden_layer='flatten') if cl_reg else None

    # wrapper for augmenting all images going into the discriminator
    self.D_aug = AugWrapper(self.D, image_size)

    set_requires_grad(self.SE, False)
    set_requires_grad(self.GE, False)

    generator_params = list(self.G.parameters()) + list(self.S.parameters())
    self.G_opt = AdamP(generator_params, lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = AdamP(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()

    self.cuda()

    if fp16:
        (self.S, self.G, self.D, self.SE, self.GE), (self.G_opt, self.D_opt) = amp.initialize(
            [self.S, self.G, self.D, self.SE, self.GE],
            [self.G_opt, self.D_opt],
            opt_level='O2')
def configure_optimizers(self):
    if cfg['optimizer'] == 'adam':
        opt = torch.optim.Adam(self.predictor.parameters(), lr=5e-3, weight_decay=5e-4)
    elif cfg['optimizer'] == 'adamp':
        opt = AdamP(self.predictor.parameters(), lr=0.0001, betas=(0.9, 0.999), weight_decay=1e-2)

    def lr_foo(epoch):
        if epoch < self.hparams.warm_up_step:
            # warm up lr
            lr_scale = 0.1 ** (self.hparams.warm_up_step - epoch)
        else:
            lr_scale = 0.95 ** epoch
        return lr_scale

    scheduler = LambdaLR(opt, lr_lambda=lr_foo)
    self.sched = scheduler
    self.opt = opt
    return [opt], [scheduler]
def select_optimizer(param, opt_name: str, lr: float, weight_decay: float):
    if opt_name == 'SGD':
        optimizer = SGDP(param, lr=lr, momentum=0.9, weight_decay=weight_decay, nesterov=True)
    elif opt_name == 'SGDP':
        optimizer = SGDP(param, lr=lr, momentum=0.9, weight_decay=weight_decay, nesterov=True)
    elif opt_name == 'Adam':
        optimizer = torch.optim.Adam(param, lr=lr, betas=(0.9, 0.999), eps=1e-08,
                                     weight_decay=weight_decay, amsgrad=False)
    elif opt_name == 'AdamP':
        optimizer = AdamP(param, lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay, nesterov=True)
    else:
        raise NotImplementedError("The optimizer should be one of ['SGD', 'SGDP', 'Adam', 'AdamP']")
    return optimizer
def configure_optimizers(self):
    lr = float(cfg["train_params"]["lr"])
    if cfg['optimizer'] == 'adam':
        opt = torch.optim.Adam(self.predictor.parameters(), lr=lr, weight_decay=5e-4)
    elif cfg['optimizer'] == 'adamw':
        opt = torch.optim.AdamW(self.predictor.parameters(), lr=lr, weight_decay=5e-4)
    elif cfg['optimizer'] == 'adamp':
        opt = AdamP(self.predictor.parameters(), lr=0.0005, betas=(0.9, 0.999), weight_decay=1e-2)

    def lr_foo(epoch):
        if epoch < self.hparams.warm_up_step:
            # warm up lr
            lr_scale = 0.1 ** (self.hparams.warm_up_step - epoch)
        else:
            lr_scale = 0.98 ** epoch
            lr_scale = max(1e-6, lr_scale)
        return lr_scale

    scheduler = LambdaLR(opt, lr_lambda=lr_foo)
    self.opt = opt
    return opt  # , [scheduler]
def get_optimizer(optimizer_name, model, lr, weight_decay=0.0,
                  filter=lambda x: True, sparse_embedding=False):
    parameters = [p for name, p in model.named_parameters() if filter(name)]
    if not parameters:
        return None

    if optimizer_name == "sgd":
        return torch.optim.SGD(parameters, lr=lr, weight_decay=weight_decay)
    elif optimizer_name == "sgdm":
        assert not sparse_embedding
        return torch.optim.SGD(parameters, lr=lr, weight_decay=weight_decay, momentum=0.9)
    elif optimizer_name == "adam":
        if sparse_embedding:
            sparse_parameters = []
            dense_parameters = []
            for name, p in model.named_parameters():
                if name.endswith("embedding.weight"):
                    sparse_parameters.append(p)
                else:
                    dense_parameters.append(p)
            sparse_adam = torch.optim.SparseAdam(sparse_parameters, lr=lr)
            dense_adam = torch.optim.Adam(dense_parameters, lr=lr, weight_decay=weight_decay)
            optimizer = MultipleOptimizer(sparse_adam, dense_adam)
            return optimizer
        else:
            return torch.optim.Adam(parameters, lr=lr, weight_decay=weight_decay)
    elif optimizer_name == "adame":
        assert not sparse_embedding
        return torch.optim.Adam(parameters, lr=lr, weight_decay=weight_decay, eps=1e-3)
    elif optimizer_name == "adamw":
        assert not sparse_embedding
        return torch.optim.AdamW(parameters, lr=lr, weight_decay=weight_decay)
    elif optimizer_name == "adamp":
        assert not sparse_embedding
        return AdamP(parameters, lr=lr, weight_decay=weight_decay)
    else:
        raise NotImplementedError()
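# MultipleOptimizer is referenced above but not defined in this excerpt. A minimal
# hypothetical sketch, assuming it only needs to fan out zero_grad()/step() (and state
# save/load) to the wrapped optimizers such as the SparseAdam + Adam pair above:
class MultipleOptimizer:
    def __init__(self, *optimizers):
        self.optimizers = optimizers

    def zero_grad(self):
        for opt in self.optimizers:
            opt.zero_grad()

    def step(self):
        for opt in self.optimizers:
            opt.step()

    def state_dict(self):
        return [opt.state_dict() for opt in self.optimizers]

    def load_state_dict(self, state_dicts):
        for opt, state in zip(self.optimizers, state_dicts):
            opt.load_state_dict(state)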
def init_optimizer(optimizer_name, model, lr, wd, lr_restart_step=1, lr_decay_gamma=0.9,
                   scheduler="step", nesterov=False, num_epochs=None, steps_per_epoch=None):
    if optimizer_name == "sgd":
        optimizer_ft = optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                                 weight_decay=wd, nesterov=nesterov)
    elif optimizer_name == "adam":
        optimizer_ft = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    elif optimizer_name == "adamp":
        from adamp import AdamP
        optimizer_ft = AdamP(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=wd)  # 1e-2)
    elif optimizer_name == "sgdp":
        from adamp import SGDP
        optimizer_ft = SGDP(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9, nesterov=nesterov)
    # else:
    #     opt_attr = getattr(toptim, optimizer_name)
    #     if opt_attr:
    #         optimizer_ft = opt_attr(model.parameters())
    #     else:
    #         raise Exception("unknown optimizer name", optimizer_name)

    if scheduler == "cosine":
        exp_lr_scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer_ft, lr_restart_step)
        use_lr_schedule_steps = True
    elif scheduler == "cycle":
        exp_lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer_ft, max_lr=lr, steps_per_epoch=steps_per_epoch,
            epochs=num_epochs, pct_start=0.1)
        use_lr_schedule_steps = False
    elif scheduler == "step":
        exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=lr_restart_step,
                                               gamma=lr_decay_gamma)
        use_lr_schedule_steps = False

    return optimizer_ft, exp_lr_scheduler, use_lr_schedule_steps
def train(num_epochs, model, data_loader, val_loader, val_every, device, file_name):
    learning_rate = 0.0001
    from torch.optim.swa_utils import AveragedModel, SWALR
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    criterion = [SoftCrossEntropyLoss(smooth_factor=0.1), JaccardLoss('multiclass', classes=12)]
    optimizer = AdamP(params=model.parameters(), lr=learning_rate, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=learning_rate)
    swa_model = AveragedModel(model)
    look = Lookahead(optimizer, la_alpha=0.5)

    print('Start training..')
    best_miou = 0
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))
        model.train()
        for step, (images, masks, _) in enumerate(data_loader):
            loss = 0
            images = torch.stack(images)        # (batch, channel, height, width)
            masks = torch.stack(masks).long()   # (batch, channel, height, width)

            # move tensors to the device for GPU computation
            images, masks = images.to(device), masks.to(device)

            # inference
            outputs = model(images)

            # loss computation (soft cross entropy + Jaccard)
            for i in criterion:
                loss += i(outputs, masks)

            look.zero_grad()
            loss.backward()
            look.step()

            outputs = torch.argmax(outputs.squeeze(), dim=1).detach().cpu().numpy()
            hist = add_hist(hist, masks.detach().cpu().numpy(), outputs, n_class=12)
            acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)

            # print loss and mIoU at regular step intervals
            if (step + 1) % 25 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU: {:.4f}'.format(
                    epoch + 1, num_epochs, step + 1, len(data_loader), loss.item(), mIoU))

        # run validation periodically and save the best model
        if (epoch + 1) % val_every == 0:
            avrg_loss, val_miou = validation(epoch + 1, model, val_loader, criterion, device)
            if val_miou > best_miou:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_miou = val_miou
                save_model(model, file_name=file_name)

        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
def get_optim(model: nn.Module, optim_type: str, lr: float):
    if optim_type == Config.Adam:
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif optim_type == Config.SGD:
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif optim_type == Config.Momentum:
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    elif optim_type == Config.AdamP:
        optimizer = AdamP(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=1e-2)
    return optimizer
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if weight_decay and filter_bias_and_bn:
        parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum,
                              weight_decay=weight_decay, nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps,
                              momentum=args.momentum, weight_decay=weight_decay)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters, lr=args.lr, betas=(0.9, 0.999), weight_decay=weight_decay)
    else:
        # the original `assert False and "..."` never shows its message; raise directly instead
        raise ValueError("Invalid optimizer: {}".format(args.opt))
    return optimizer
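# add_weight_decay is not shown in this excerpt. A minimal sketch of the usual
# timm-style helper, assuming it simply places biases and 1-D (norm) parameters in a
# no-decay group; the skip_list parameter is illustrative:
def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param.ndim <= 1 or name.endswith(".bias") or name in skip_list:
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {"params": no_decay, "weight_decay": 0.0},
        {"params": decay, "weight_decay": weight_decay},
    ]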
def configure_optimizers(self):
    if self.args.optimizer == 'AdamW':
        optimizer = AdamW(self.parameters(), lr=self.args.lr)
    elif self.args.optimizer == 'AdamP':
        from adamp import AdamP
        optimizer = AdamP(self.parameters(), lr=self.args.lr)
    else:
        raise NotImplementedError('Only AdamW and AdamP are supported!')

    if self.args.lr_scheduler == 'cos':
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
    elif self.args.lr_scheduler == 'exp':
        scheduler = ExponentialLR(optimizer, gamma=0.5)
    else:
        raise NotImplementedError('Only cos and exp lr schedulers are supported!')

    return {
        'optimizer': optimizer,
        'scheduler': scheduler,
    }
def get_optimizer(opt, model, momentum=0.9, wd=5e-4, nesterov=False):
    optimizer = None
    if opt.optim == 'sgd':
        optimizer = optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=opt.lr, momentum=momentum, weight_decay=wd, nesterov=nesterov
        )
    elif opt.optim == 'adam':
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=opt.lr
        )
    elif opt.optim == 'adamp':
        optimizer = AdamP(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=opt.lr, betas=(0.9, 0.999), weight_decay=1e-2
        )
    return optimizer
def configure_optimizers(self):
    if cfg['optimizer'] == "Adam":
        optimizer = torch.optim.Adam(self.netD.parameters(), lr=cfg['lr'])
    elif cfg['optimizer'] == "AdamP":
        optimizer = AdamP(self.netD.parameters(), lr=cfg['lr'], betas=(0.9, 0.999), weight_decay=1e-2)
    elif cfg['optimizer'] == "SGDP":
        optimizer = SGDP(self.netD.parameters(), lr=cfg['lr'], weight_decay=1e-5, momentum=0.9, nesterov=True)
    elif cfg['optimizer'] == "MADGRAD":
        from madgrad import MADGRAD
        optimizer = MADGRAD(self.netD.parameters(), lr=cfg['lr'], momentum=0.9, weight_decay=0.01, eps=1e-6)
    return optimizer
def get_optimizer(model, optimizer_name, scheduler_name):
    # NOTE: learning_rate is taken from the enclosing module scope rather than passed in
    if optimizer_name == 'Adam':
        optimizer = Adam(model.parameters(), lr=learning_rate)
    elif optimizer_name == 'AdamW':
        optimizer = AdamW(model.parameters(), lr=learning_rate)
    elif optimizer_name == 'AdamP':
        optimizer = AdamP(model.parameters(), lr=learning_rate)
    elif optimizer_name == 'MADGRAD':
        optimizer = madgrad.MADGRAD(model.parameters(), lr=learning_rate)
    else:
        optimizer = optim.Ranger(model.parameters(), lr=learning_rate, alpha=0.6, k=10)

    if scheduler_name == 'step':
        scheduler = StepLR(optimizer, 10, gamma=0.5)
    elif scheduler_name == 'reduce':
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=10)
    else:
        scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=0.)

    return optimizer, scheduler
def load_optimizer(args, param_group):
    if args.optim.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(
            param_group,
            args.optim.lr.init,
            momentum=args.optim.momentum,
            weight_decay=args.optim.wd.base,
            nesterov=args.optim.nesterov,
        )
    elif args.optim.optimizer.lower() == 'adamp':
        optimizer = AdamP(
            param_group,
            args.optim.lr.init,
            betas=(args.optim.momentum, 0.999),
            weight_decay=args.optim.wd.base,
            nesterov=args.optim.nesterov,
        )
    else:
        raise ValueError("Unknown optimizer : {}".format(args.optim.optimizer))

    set_init_lr(optimizer.param_groups)
    return optimizer
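# set_init_lr is not defined in this excerpt. A plausible minimal sketch, assuming it
# only records each param group's starting learning rate for later warm-up/decay
# bookkeeping (the 'init_lr' key name is an assumption):
def set_init_lr(param_groups):
    for group in param_groups:
        group.setdefault('init_lr', group['lr'])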
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)  # MaskBaseDataset
    dataset = dataset_module(data_dir=data_dir, val_ratio=args.val_ratio)
    num_classes = dataset.num_classes  # 18

    # -- augmentation
    transform_module = getattr(import_module("dataset"), args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # -- data_loader
    train_set, val_set = dataset.split_dataset()
    train_loader = DataLoader(train_set, batch_size=args.batch_size, num_workers=8,
                              shuffle=True, pin_memory=use_cuda, drop_last=True)
    val_loader = DataLoader(val_set, batch_size=args.batch_size, num_workers=8,
                            shuffle=False, pin_memory=use_cuda, drop_last=True)

    # -- model
    models = []
    model_module_gender = getattr(import_module("model"), args.model_gender)  # default: BaseModel
    model_gender = model_module_gender(num_classes=args.num_classes_gender,
                                       grad_point=args.grad_point).to(device)
    model_gender = torch.nn.DataParallel(model_gender)

    # -- loss & metric
    criterion_gender = create_criterion(args.criterion_gender,
                                        classes=args.num_classes_gender)  # default: f1
    if args.optimizer == "AdamP":
        optimizer_gender = AdamP(filter(lambda p: p.requires_grad, model_gender.parameters()),
                                 lr=args.lr, weight_decay=5e-4)
    else:
        opt_module = getattr(import_module('torch.optim'), args.optimizer)  # default: Adam
        optimizer_gender = opt_module(filter(lambda p: p.requires_grad, model_gender.parameters()),
                                      lr=args.lr, weight_decay=5e-4)
    scheduler_gender = StepLR(optimizer_gender, args.lr_decay_step, gamma=0.5)

    # -- logging
    logger_gender = SummaryWriter(log_dir=os.path.join(save_dir, 'gender'))
    with open(Path(save_dir) / 'gender' / 'config.json', 'w', encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc_gender = 0
    best_val_loss_gender = np.inf
    for epoch in range(args.epochs):
        # train loop
        model_gender.train()
        loss_value_gender = 0
        matches_gender = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels_mask, labels_gender, labels_age = train_batch
            inputs = inputs.to(device)
            labels_gender = labels_gender.to(device)

            optimizer_gender.zero_grad()

            outs_gender = model_gender(inputs)
            preds_gender = torch.argmax(outs_gender, dim=-1)
            loss_gender = criterion_gender(outs_gender, labels_gender)

            loss_gender.backward()
            optimizer_gender.step()

            loss_value_gender += loss_gender.item()
            matches_gender += (preds_gender == labels_gender).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss_gender = loss_value_gender / args.log_interval
                train_acc_gender = matches_gender / args.batch_size / args.log_interval
                current_lr_gender = get_lr(optimizer_gender)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss_gender:4.4} || training accuracy {train_acc_gender:4.2%} || lr {current_lr_gender}"
                )
                logger_gender.add_scalar("Train/loss", train_loss_gender, epoch * len(train_loader) + idx)
                logger_gender.add_scalar("Train/accuracy", train_acc_gender, epoch * len(train_loader) + idx)

                loss_value_gender = 0
                matches_gender = 0

        scheduler_gender.step()

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model_gender.eval()
            val_loss_items_gender = []
            val_acc_items_gender = []
            figure = None
            for val_batch in val_loader:
                inputs, labels_mask, labels_gender, labels_age = val_batch
                inputs = inputs.to(device)
                labels_gender = labels_gender.to(device)

                outs_gender = model_gender(inputs)
                preds_gender = torch.argmax(outs_gender, dim=-1)

                loss_item_gender = criterion_gender(outs_gender, labels_gender).item()
                acc_item_gender = (labels_gender == preds_gender).sum().item()
                val_loss_items_gender.append(loss_item_gender)
                val_acc_items_gender.append(acc_item_gender)

                if figure is None:
                    # inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy()
                    inputs_np = torch.clone(inputs).detach().cpu()
                    inputs_np = inputs_np.permute(0, 2, 3, 1).numpy()
                    inputs_np = dataset_module.denormalize_image(inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(inputs_np, labels_mask, preds_gender,
                                        args.dataset != "MaskSplitByProfileDataset")
                    plt.show()

            val_loss_gender = np.sum(val_loss_items_gender) / len(val_loader)
            val_acc_gender = np.sum(val_acc_items_gender) / len(val_set)

            if val_loss_gender < best_val_loss_gender or val_acc_gender > best_val_acc_gender:
                save_model(model_gender, epoch, val_loss_gender, val_acc_gender,
                           os.path.join(save_dir, "gender"), args.model_gender)
                if val_loss_gender < best_val_loss_gender and val_acc_gender > best_val_acc_gender:
                    print(
                        f"New best model_gender for val acc and val loss : {val_acc_gender:4.2%} {val_loss_gender:4.2}! saving the best model_gender.."
                    )
                    best_val_loss_gender = val_loss_gender
                    best_val_acc_gender = val_acc_gender
                elif val_loss_gender < best_val_loss_gender:
                    print(f"New best model_gender for val loss : {val_loss_gender:4.2}! saving the best model_gender..")
                    best_val_loss_gender = val_loss_gender
                elif val_acc_gender > best_val_acc_gender:
                    print(f"New best model_gender for val accuracy : {val_acc_gender:4.2%}! saving the best model_gender..")
                    best_val_acc_gender = val_acc_gender

            print(
                f"[Val] acc: {val_acc_gender:4.2%}, loss: {val_loss_gender:4.2} || "
                f"best acc: {best_val_acc_gender:4.2%}, best loss: {best_val_loss_gender:4.2}"
            )
            logger_gender.add_scalar("Val/loss", val_loss_gender, epoch)
            logger_gender.add_scalar("Val/accuracy", val_acc_gender, epoch)
            logger_gender.add_figure("results", figure, epoch)
            print()
def train(data_dir, model_dir, args):
    seed_everything(args.seed)  # args.__dict__ == vars(args)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)  # MaskBaseDataset
    dataset = dataset_module(data_dir=data_dir, val_ratio=args.val_ratio)
    num_classes = dataset.num_classes  # 18

    # -- augmentation
    transform_module = getattr(import_module("dataset"), args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # -- data_loader
    train_set, val_set = dataset.split_dataset()
    train_loader = DataLoader(train_set, batch_size=args.batch_size, num_workers=8,
                              shuffle=True, pin_memory=use_cuda, drop_last=True)
    val_loader = DataLoader(val_set, batch_size=args.batch_size, num_workers=8,
                            shuffle=False, pin_memory=use_cuda, drop_last=True)

    # -- model
    model_module = getattr(import_module("model"), args.model)  # default: BaseModel
    model = model_module(num_classes=num_classes, grad_point=args.grad_point).to(device)
    model = torch.nn.DataParallel(model)

    # resume training from the args.continue_epoch checkpoint if requested
    if args.continue_train:
        try_dir = find_dir_try(args.continue_try_num, model_dir, args.continue_name)
        epoch_dir = find_dir_epoch(args.continue_epoch, try_dir)
        model.load_state_dict(torch.load(epoch_dir))

    # -- loss & metric
    if args.criterion == "cross_entropy":
        criterion = create_criterion(args.criterion)  # default: cross_entropy
    else:
        criterion = create_criterion(args.criterion, classes=num_classes)
    if args.optimizer == "AdamP":
        optimizer = AdamP(filter(lambda p: p.requires_grad, model.parameters()),
                          lr=args.lr, weight_decay=5e-4)
    else:
        opt_module = getattr(import_module('torch.optim'), args.optimizer)  # default: Adam
        optimizer = opt_module(filter(lambda p: p.requires_grad, model.parameters()),
                               lr=args.lr, weight_decay=5e-4)
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    # -- logging
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    with open(Path(save_dir) / 'config.json', 'w', encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        # train loop
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)

            loss.backward()
            optimizer.step()

            loss_value += loss.item()
            matches += (preds == labels).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )

                loss_value = 0
                matches = 0

        scheduler.step()

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                acc_item = (labels == preds).sum().item()
                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)

                if figure is None:
                    # inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy()
                    inputs_np = torch.clone(inputs).detach().cpu()
                    inputs_np = inputs_np.permute(0, 2, 3, 1).numpy()
                    inputs_np = dataset_module.denormalize_image(inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(inputs_np, labels, preds,
                                        args.dataset != "MaskSplitByProfileDataset")
                    plt.show()

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)

            if val_loss < best_val_loss or val_acc > best_val_acc:
                save_model(model, epoch, val_loss, val_acc, save_dir, args.model)
                if val_loss < best_val_loss and val_acc > best_val_acc:
                    print(
                        f"New best model for val acc and val loss : {val_acc:4.2%} {val_loss:4.2}! saving the best model.."
                    )
                    best_val_loss = val_loss
                    best_val_acc = val_acc
                elif val_loss < best_val_loss:
                    print(f"New best model for val loss : {val_loss:4.2}! saving the best model..")
                    save_model(model, epoch, val_loss, val_acc, save_dir, args.model)
                    best_val_loss = val_loss
                elif val_acc > best_val_acc:
                    print(f"New best model for val accuracy : {val_acc:4.2%}! saving the best model..")
                    save_model(model, epoch, val_loss, val_acc, save_dir, args.model)
                    best_val_acc = val_acc

            print(
                f"[Val] acc: {val_acc:4.2%}, loss: {val_loss:4.2} || "
                f"best acc: {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
            )
            print()
def train(img_dir, model_dir, args):
    seed_everything(args.seed)

    start = time.time()
    get_current_time()

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # settings
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)
    dataset = dataset_module(
        img_dir=img_dir,
        val_ratio=args.val_ratio,
    )
    num_classes = dataset.num_classes

    transform_module = getattr(import_module("dataset"), args.augmentation)
    transform = transform_module(mean=dataset.mean, std=dataset.std)

    train_dataset, val_dataset = dataset.split_dataset()
    train_dataset.dataset.set_transform(transform["train"])
    val_dataset.dataset.set_transform(transform["val"])

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        num_workers=2,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=args.valid_batch_size,
        num_workers=2,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )

    model_module = getattr(import_module("model"), args.model)
    model = model_module(num_classes=num_classes).to(device)
    model = torch.nn.DataParallel(model)

    criterion = create_criterion(args.criterion)

    optimizer = None
    if args.optimizer == "AdamP":
        optimizer = AdamP(model.parameters(), lr=args.lr)
    else:
        opt_module = getattr(import_module("torch.optim"), args.optimizer)
        optimizer = opt_module(
            model.parameters(),
            # filter(lambda p: p.requires_grad, model.parameters()),
            lr=args.lr,
            # weight_decay=5e-4,
        )
    # scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    logger = SummaryWriter(log_dir=save_dir)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0

    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        train_f1 = 0
        for i, data in enumerate(tqdm(train_loader)):
            imgs, labels = data
            imgs = imgs.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()

            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            preds = torch.argmax(outputs, 1)
            acc = (preds == labels).sum().item() / len(imgs)
            t_f1_score = f1_score(
                labels.cpu().detach().numpy(),
                preds.cpu().detach().numpy(),
                average="macro",
            )

            train_loss += loss
            train_acc += acc
            train_f1 += t_f1_score

            if (i + 1) % args.log_interval == 0:
                train_loss /= args.log_interval
                train_acc /= args.log_interval
                train_f1 /= args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({i + 1}/{len(train_loader)}) || "
                    f"training_loss {train_loss:.4f} || training acc {train_acc:.4f} || "
                    f"train f1_score {train_f1:.4f} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + i)
                logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + i)
                logger.add_scalar("Train/F1-score", train_f1, epoch * len(train_loader) + i)

                train_loss = 0
                train_acc = 0
                train_f1 = 0

        # scheduler.step()

        # training counts as complete only after the full epoch;
        # after each epoch, keep the checkpoint with the best score so far
        with torch.no_grad():
            print("Validation step---------------------")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            val_f1_items = []
            for data in tqdm(val_loader):
                imgs, labels = data
                imgs = imgs.float().to(device)
                labels = labels.long().to(device)

                outputs = model(imgs)
                preds = torch.argmax(outputs, 1)

                loss = criterion(outputs, labels).item()
                acc = (labels == preds).sum().item()
                val_f1 = f1_score(
                    labels.cpu().detach().numpy(),
                    preds.cpu().detach().numpy(),
                    average="macro",
                )

                val_loss_items.append(loss)
                val_acc_items.append(acc)
                val_f1_items.append(val_f1)

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_dataset)
            val_f1 = np.sum(val_f1_items) / len(val_loader)
            print(f"val_loader: {len(val_loader)} | val_dataset: {len(val_dataset)}")

            best_val_loss = min(best_val_loss, val_loss)
            best_val_f1 = max(val_f1, best_val_f1)
            best_val_acc = max(val_acc, best_val_acc)

            # if val_acc > best_val_acc:
            #     print(f"New best model for val acc: {val_acc:4.2%}! saving the best model...")
            #     torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
            #     best_val_acc = val_acc

            if val_f1 > best_val_f1:
                print(f"New best model for val f1: {val_f1:.4f}! saving the best model...")
                torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
                best_val_f1 = val_f1

            # TODO: is this the right place to save the last model?
            # torch.save(model.module.state_dict(), f"{save_dir}/last.pth")

            print(
                f"[Val] acc: {val_acc:.4f}, loss: {val_loss:.4f} || "
                f"best acc: {best_val_acc:.4f}, best loss: {best_val_loss:.4f}"
            )
            logger.add_scalar("Val/loss", val_loss, epoch)
            logger.add_scalar("Val/accuracy", val_acc, epoch)
            logger.add_scalar("Val/f1-score", val_f1, epoch)
            print()

    torch.save(model.module.state_dict(), f"{save_dir}/last.pth")

    # how much time training took
    times = time.time() - start
    minute, sec = divmod(times, 60)
    print(f"Finish Training! Taken time is {minute} minutes {sec} seconds")
    return model_ft, input_size


model, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)
# model = MobileNetV3(num_classes).to(device)
# model = SuperLightMobileNet(num_classes)
# model = EfficientNet(1, 1, num_classes)
# model = Xception(num_classes).to(device)

# self.criterion = nn.CrossEntropyLoss().to(self.device)
CEloss = nn.CrossEntropyLoss()
optimizer = AdamP(model.parameters(), lr=0.01, betas=(0.9, 0.999), weight_decay=1e-2)
# optimizer = torch.optim.Adam(, lr=lr, weight_decay=)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, min_lr=0.0001)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 100], gamma=0.1)

model = nn.DataParallel(model)
model.to(device)

import numpy as np

total_epoch = 50
total_iteration_per_epoch = int(np.ceil(len(train_dataset) / batch_size))

for epoch in range(1, total_epoch + 1):
    model.train()
def pseudo_labeling(num_epochs, model, data_loader, val_loader, unlabeled_loader,
                    device, val_every, file_name):
    # Instead of using current epoch we use a "step" variable to calculate alpha_weight
    # This helps the model converge faster
    from torch.optim.swa_utils import AveragedModel, SWALR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    criterion = [
        SoftCrossEntropyLoss(smooth_factor=0.1),
        JaccardLoss('multiclass', classes=12)
    ]
    optimizer = AdamP(params=model.parameters(), lr=0.0001, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=0.0001)
    swa_model = AveragedModel(model)
    optimizer = Lookahead(optimizer, la_alpha=0.5)

    step = 100
    size = 256
    best_mIoU = 0
    model.train()
    print('Start Pseudo-Labeling..')
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))
        for batch_idx, (imgs, image_infos) in enumerate(unlabeled_loader):
            # Forward pass to get the pseudo labels:
            # run the unlabeled batch through the model in eval mode
            model.eval()
            outs = model(torch.stack(imgs).to(device))
            oms = torch.argmax(outs.squeeze(), dim=1).detach().cpu().numpy()
            oms = torch.Tensor(oms)
            oms = oms.long()
            oms = oms.to(device)

            # training step
            model.train()
            # Now calculate the unlabeled loss using the pseudo label
            imgs = torch.stack(imgs)
            imgs = imgs.to(device)
            # preds_array = preds_array.to(device)
            output = model(imgs)
            loss = 0
            for each in criterion:
                loss += each(output, oms)
            unlabeled_loss = alpha_weight(step) * loss

            # Backpropagate
            optimizer.zero_grad()
            unlabeled_loss.backward()
            optimizer.step()

            output = torch.argmax(output.squeeze(), dim=1).detach().cpu().numpy()
            hist = add_hist(hist, oms.detach().cpu().numpy(), output, n_class=12)

            if (batch_idx + 1) % 25 == 0:
                acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU:{:.4f}'.format(
                    epoch + 1, num_epochs, batch_idx + 1, len(unlabeled_loader),
                    unlabeled_loss.item(), mIoU))

            # For every 50 batches, train one pass on the labeled data
            if batch_idx % 50 == 0:
                # Normal training procedure
                for batch_idx, (images, masks, _) in enumerate(data_loader):
                    labeled_loss = 0
                    images = torch.stack(images)        # (batch, channel, height, width)
                    masks = torch.stack(masks).long()

                    # move tensors to the device for GPU computation
                    images, masks = images.to(device), masks.to(device)

                    output = model(images)
                    for each in criterion:
                        labeled_loss += each(output, masks)

                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()

                # Now we increment step by 1
                step += 1

        if (epoch + 1) % val_every == 0:
            avrg_loss, val_mIoU = validation(epoch + 1, model, val_loader, criterion, device)
            if val_mIoU > best_mIoU:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_mIoU = val_mIoU
                save_model(model, file_name=file_name)
            model.train()

        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
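# alpha_weight is used above but not defined in this excerpt. A hypothetical sketch of
# the commonly used piecewise-linear pseudo-label ramp (the T1/T2/alpha_f constants are
# illustrative assumptions, not taken from the source):
def alpha_weight(step, T1=100, T2=700, alpha_f=3.0):
    # 0 before T1, linear ramp between T1 and T2, constant alpha_f afterwards
    if step < T1:
        return 0.0
    if step > T2:
        return alpha_f
    return (step - T1) / (T2 - T1) * alpha_f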
margin.to(device)
nomargin = ArcMarginForTest(in_feature=512, out_feature=num_classes, easy_margin=True)
nomargin.to(device)

# TensorBoard: create the network graph
# writer.add_graph(margin, (model(images.to(device)), labels.to(device)))
# writer.close()

classes = tuple([x for x in range(0, num_classes)])

# criterion = FocalLoss(gamma=2, alpha=0.25).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = AdamP([{
    'params': model.parameters(),
    'weight_decay': 5e-6
}, {
    'params': margin.parameters(),
    'weight_decay': 5e-6
}], lr=learning_rate)

# optimizer = torch.optim.Adam([
#     {'params': model.parameters(), 'weight_decay': 5e-6},
#     {'params': margin.parameters(), 'weight_decay': 5e-6}
# ], lr=learning_rate)

# scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
m_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15, 30], gamma=0.33)
# co_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
#                                                           T_max=10,
#                                                           eta_min=0)
args.add_argument("--iteration", type=str, default='0') args.add_argument("--pause", type=int, default=0) config = args.parse_args() model = ResNet50(block=models.resnet.BasicBlock, layers=[2, 2, 2, 2], num_classes=config.num_classes) load_weight(model) criterion = nn.BCEWithLogitsLoss() model = model.cuda() criterion = criterion.cuda() optimizer = AdamP( [param for param in model.parameters() if param.requires_grad], lr=config.base_lr, weight_decay=1e-4) scheduler = StepLR(optimizer, step_size=20, gamma=0.1) if IS_ON_NSML: # This NSML block is mandatory. Do not change. bind_nsml(model) nsml.save('checkpoint') if config.pause: nsml.paused(scope=locals()) if config.mode == 'train': # Local debugging block. This module is not mandatory. # But this would be quite useful for troubleshooting. train_loader = data_loader(root=DATASET_PATH, split='train') val_loader = data_loader(root=DATASET_PATH, split='val')
def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")

    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'), cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,                          # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,              # number of total saved models
        save_steps=cfg.values.train_args.save_steps,                          # model saving step
        num_train_epochs=cfg.values.train_args.num_epochs,                    # total number of training epochs
        learning_rate=cfg.values.train_args.lr,                               # learning rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,   # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,     # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,                      # number of warmup steps for learning rate scheduler
        weight_decay=cfg.values.train_args.weight_decay,                      # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,                        # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,                    # log saving step
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,        # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate at the end of every epoch.
        eval_steps=cfg.values.train_args.eval_steps,                          # evaluation step
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)
        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.parameters
            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )
            k += 1

            # train model
            trainer.train()
    else:
        cpprint('=' * 20 + f'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(whole_df,
                                                test_size=cfg.values.val_args.test_size,
                                                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.parameters
            model.to(device)

            optimizer = transformers.AdamW(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()
        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.parameters
            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()
elif args.optimizer == 'SDG' and args.sam:  # 'SDG' / 'SDGP' follow the (likely misspelled) CLI values for SGD / SGDP
    base_optimizer = optim.SGD
    optimizer = SAM(net.parameters(), base_optimizer, lr=initial_lr,
                    momentum=momentum, weight_decay=weight_decay)
elif args.optimizer == 'SDGP':
    optimizer = SGDP(net.parameters(), lr=0.1, weight_decay=1e-5, momentum=0.9, nesterov=True)
elif args.optimizer == 'ADAMP':
    optimizer = AdamP(net.parameters(), lr=0.01, betas=(0.9, 0.999), weight_decay=1e-2)
else:
    optimizer = optim.SGD(net.parameters(), lr=initial_lr, momentum=momentum,
                          weight_decay=weight_decay)

steps = 15
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)

criterion = MultiBoxLoss(num_classes, 0.35, True, 0, True, 7, 0.35, False)
priorbox = PriorBox(cfg, image_size=(img_dim, img_dim))
with torch.no_grad():
    priors = priorbox.forward()
def get_adamp(lr=0.001, model=None, weight_decay=1e-6):
    params = [p for p in model.parameters() if p.requires_grad]
    # pass weight_decay through; the original dropped it, so AdamP silently used its default of 0
    return AdamP(params, lr=lr, weight_decay=weight_decay)
def adamp(params, lr):
    return AdamP(params, lr=lr, betas=(0.9, 0.999), weight_decay=1e-2)
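# Minimal usage sketch for a factory like the one above; the toy model and
# hyperparameters are illustrative only, not from the source.
import torch
import torch.nn as nn

model = nn.Linear(16, 2)
optimizer = adamp(model.parameters(), lr=1e-3)

x = torch.randn(8, 16)
y = torch.randint(0, 2, (8,))
loss = nn.CrossEntropyLoss()(model(x), y)

optimizer.zero_grad()
loss.backward()
optimizer.step()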
def train_no_val(img_dir, model_dir, args):
    seed_everything(args.seed)

    start = time.time()
    get_current_time()

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # settings
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)
    dataset = dataset_module(
        img_dir=img_dir,
        val_ratio=args.val_ratio,
    )
    num_classes = dataset.num_classes

    transform_module = getattr(import_module("dataset"), args.augmentation)
    transform = transform_module(mean=dataset.mean, std=dataset.std)
    dataset.set_transform(transform["train"])

    train_loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=2,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )

    model_module = getattr(import_module("model"), args.model)
    model = model_module(num_classes=num_classes).to(device)
    model = torch.nn.DataParallel(model)

    criterion = create_criterion(args.criterion)

    optimizer = None
    if args.optimizer == "AdamP":
        # NOTE: no lr is passed here, so AdamP falls back to its default (1e-3) and args.lr is ignored
        optimizer = AdamP(model.parameters())
    else:
        opt_module = getattr(import_module("torch.optim"), args.optimizer)
        optimizer = opt_module(
            model.parameters(),
            # filter(lambda p: p.requires_grad, model.parameters()),
            lr=args.lr,
            # weight_decay=5e-4,
        )
    # scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    logger = SummaryWriter(log_dir=save_dir)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0

    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        train_f1 = 0
        for i, data in enumerate(tqdm(train_loader)):
            imgs, labels = data
            imgs = imgs.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()

            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            preds = torch.argmax(outputs, 1)
            acc = (preds == labels).sum().item() / len(imgs)
            t_f1_score = f1_score(
                labels.cpu().detach().numpy(),
                preds.cpu().detach().numpy(),
                average="macro",
            )

            train_loss += loss
            train_acc += acc
            train_f1 += t_f1_score

            if (i + 1) % args.log_interval == 0:
                train_loss /= args.log_interval
                train_acc /= args.log_interval
                train_f1 /= args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({i + 1}/{len(train_loader)}) || "
                    f"training_loss {train_loss:.4f} || training acc {train_acc:.4f} || "
                    f"train f1_score {train_f1:.4f} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + i)
                logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + i)
                logger.add_scalar("Train/F1-score", train_f1, epoch * len(train_loader) + i)

                train_loss = 0
                train_acc = 0
                train_f1 = 0

    torch.save(model.module.state_dict(), f"{save_dir}/last.pth")

    # how much time training took
    times = time.time() - start
    minute, sec = divmod(times, 60)
    print(f"Finish Training! Taken time is {minute} minutes {sec} seconds")
                                      unsup_val=unsup_val,
                                      BATCH=args.batch)

    ########## model load ##########
    print(args.pre_train)
    model = two_head_net(args.model, num_class, args.p_weight_path, args.pre_train)

    ########## sup train ##########
    if args.sup_train:
        if args.pre_train:
            print("using pre-trained model, no need to train again!")
            pass
        else:
            # optimizer = optim.SGD(model.parameters(), lr=args.sup_lr, momentum=args.sup_momentum,
            #                       weight_decay=args.sup_wdecay, nesterov=True)
            optimizer = AdamP(model.parameters(), lr=args.sup_lr)
            scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES)
            sup_train_exp(model.cuda(), criterions, optimizer, scheduler,
                          dataset_loader, args.sup_epoch, args.sup_path)
            print("======== sup train finished! =============")

    ########## unsup train ##########
    if args.unsup_train:
        checkpoint = torch.load(args.sup_path, map_location='cuda:0')
        model.load_state_dict(checkpoint)
        # NOTE: AdamP takes betas rather than momentum; the original passed momentum=,
        # which AdamP's constructor does not accept.
        optimizer = AdamP(model.parameters(), lr=args.unsup_lr,
                          weight_decay=args.unsup_wdecay)
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES)
def main():
    ##############################################################################
    if args.server == 'server_A':
        work_dir = os.path.join('/data1/JM/lung-seg-back-up', args.exp)
        print(work_dir)
    elif args.server == 'server_B':
        work_dir = os.path.join('/data1/workspace/JM_gen/lung-seg-back-up', args.exp)
        print(work_dir)
    elif args.server == 'server_D':
        work_dir = os.path.join('/daintlab/home/woans0104/workspace/'
                                'lung-seg-back-up', args.exp)
        print(work_dir)
    ##############################################################################

    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # copy this file to work dir to keep training configuration
    shutil.copy(__file__, os.path.join(work_dir, 'main.py'))
    with open(os.path.join(work_dir, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    source_dataset, target_dataset1, target_dataset2 \
        = loader.dataset_condition(args.source_dataset)

    # 1. load dataset
    train_loader_source, test_loader_source \
        = loader.get_loader(server=args.server,
                            dataset=source_dataset,
                            train_size=args.train_size,
                            aug_mode=args.aug_mode,
                            aug_range=args.aug_range,
                            batch_size=args.batch_size,
                            work_dir=work_dir)

    train_loader_target1, _ = loader.get_loader(server=args.server,
                                                dataset=target_dataset1,
                                                train_size=1,
                                                aug_mode=False,
                                                aug_range=args.aug_range,
                                                batch_size=1,
                                                work_dir=work_dir)

    train_loader_target2, _ = loader.get_loader(server=args.server,
                                                dataset=target_dataset2,
                                                train_size=1,
                                                aug_mode=False,
                                                aug_range=args.aug_range,
                                                batch_size=1,
                                                work_dir=work_dir)

    test_data_li = [test_loader_source, train_loader_target1, train_loader_target2]

    trn_logger = Logger(os.path.join(work_dir, 'train.log'))
    trn_raw_logger = Logger(os.path.join(work_dir, 'train_raw.log'))
    val_logger = Logger(os.path.join(work_dir, 'validation.log'))

    trn_logger_ae = Logger(os.path.join(work_dir, 'ae_train.log'))
    val_logger_ae = Logger(os.path.join(work_dir, 'ae_validation.log'))

    # 2. model select
    model_seg = Unet2D(in_shape=(1, 256, 256))
    model_seg = model_seg.cuda()

    model_ae = ae_lung(in_shape=(1, 256, 256))
    model_ae = model_ae.cuda()

    cudnn.benchmark = True

    # 3. gpu select
    model_seg = nn.DataParallel(model_seg)
    model_ae = nn.DataParallel(model_ae)

    # 4. optim
    if args.optim == 'adam':
        optimizer_seg = torch.optim.Adam(model_seg.parameters(),
                                         betas=(args.adam_beta1, 0.999),
                                         eps=args.eps,
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)
        optimizer_ae = torch.optim.Adam(model_ae.parameters(),
                                        betas=(args.adam_beta1, 0.999),
                                        eps=args.eps,
                                        lr=args.lr,
                                        weight_decay=args.weight_decay)
    elif args.optim == 'adamp':
        optimizer_seg = AdamP(model_seg.parameters(),
                              betas=(args.adam_beta1, 0.999),
                              eps=args.eps,
                              lr=args.lr,
                              weight_decay=args.weight_decay)
        optimizer_ae = AdamP(model_ae.parameters(),
                             betas=(args.adam_beta1, 0.999),
                             eps=args.eps,
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    elif args.optim == 'sgd':
        optimizer_seg = torch.optim.SGD(model_seg.parameters(),
                                        lr=args.lr,
                                        weight_decay=args.weight_decay)
        optimizer_ae = torch.optim.SGD(model_ae.parameters(),
                                       lr=args.lr,
                                       weight_decay=args.weight_decay)

    # lr decay
    lr_schedule = args.lr_schedule
    lr_scheduler_seg = optim.lr_scheduler.MultiStepLR(optimizer_seg,
                                                      milestones=lr_schedule[:-1],
                                                      gamma=0.1)
    lr_scheduler_ae = optim.lr_scheduler.MultiStepLR(optimizer_ae,
                                                     milestones=lr_schedule[:-1],
                                                     gamma=0.1)

    # 5. loss
    criterion_seg = select_loss(args.seg_loss_function)
    criterion_ae = select_loss(args.ae_loss_function)
    criterion_embedding = select_loss(args.embedding_loss_function)

    ############################################################################
    # train
    best_iou = 0
    try:
        if args.train_mode:
            for epoch in range(lr_schedule[-1]):
                train(model_seg=model_seg,
                      model_ae=model_ae,
                      train_loader=train_loader_source,
                      epoch=epoch,
                      criterion_seg=criterion_seg,
                      criterion_ae=criterion_ae,
                      criterion_embedding=criterion_embedding,
                      optimizer_seg=optimizer_seg,
                      optimizer_ae=optimizer_ae,
                      logger=trn_logger,
                      sublogger=trn_raw_logger,
                      logger_ae=trn_logger_ae)

                iou = validate(model_seg=model_seg,
                               model_ae=model_ae,
                               val_loader=test_loader_source,
                               epoch=epoch,
                               criterion_seg=criterion_seg,
                               criterion_ae=criterion_ae,
                               logger=val_logger,
                               logger_ae=val_logger_ae)
                print('validation result ************************************')

                lr_scheduler_seg.step()
                lr_scheduler_ae.step()

                if args.val_size == 0:
                    is_best = 1
                else:
                    is_best = iou > best_iou
                best_iou = max(iou, best_iou)
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model_seg.state_dict(),
                        'optimizer': criterion_seg.state_dict()
                    },
                    is_best, work_dir, filename='checkpoint.pth')

        print("train end")
    except RuntimeError as e:
        print('#jm_private',
              '----------------------------------- error train : '
              'send to message JM '
              '& Please send a kakao talk -------------------------- '
              '\n error message : {}'.format(e))
        import ipdb
        ipdb.set_trace()

    draw_curve(work_dir, trn_logger, val_logger)
    draw_curve(work_dir, trn_logger_ae, val_logger_ae, labelname='ae')

    # load the last saved pth here and check it
    check_best_pth(work_dir)

    # validation
    if args.test_mode:
        print('Test mode ...')
        main_test(model=model_seg, test_loader=test_data_li, args=args)
save_every_iters = len(train_loader)
num_epochs = 100
criterion = nn.CrossEntropyLoss()
lr = 0.001
weight_decay = 1e-2

optimizer = AdamP(
    [
        {"params": model.backbone.parameters()},
        {"params": model.classifier.parameters()},
    ],
    lr=1.0,
    betas=(0.9, 0.999),
    weight_decay=weight_decay,
)
le = len(train_loader)


def lambda_lr_scheduler(iteration, lr0, n, a):
    return lr0 * pow((1.0 - 1.0 * iteration / n), a)


lr_scheduler = lrs.LambdaLR(
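# The snippet above cuts off mid-call. A hedged sketch of how a polynomial-decay lambda
# like lambda_lr_scheduler is typically wired into torch.optim.lr_scheduler.LambdaLR;
# the functools.partial binding (n = num_epochs * le, a = 0.9) is an assumption, not
# taken from the source. With the optimizer lr set to 1.0 above, LambdaLR's multiplier
# becomes the effective learning rate.
from functools import partial
import torch.optim.lr_scheduler as lrs

poly_decay = partial(lambda_lr_scheduler, lr0=lr, n=num_epochs * le, a=0.9)
scheduler = lrs.LambdaLR(optimizer, lr_lambda=[poly_decay, poly_decay])  # one lambda per param group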