def train(train_instances, dev_instances, model, config, logger):
    train_data = CustomDataset(data=train_instances)
    sampler = RandomSampler(train_data)
    batch_size = config.batch_size
    iterator = trange(config.num_epochs, desc='Epoch', disable=False)
    data_loader = DataLoader(dataset=train_data,
                             sampler=sampler,
                             batch_size=batch_size,
                             collate_fn=CustomCollate.collate,
                             pin_memory=True,
                             num_workers=1)
    optimizer = RAdam(model.parameters(), lr=config.learning_rate)

    logger.info('***** Start Training *****')
    torch.autograd.set_detect_anomaly(True)
    model.to(config.device)
    model.train()

    losses = []
    best_eval_loss = float('inf')
    best_epoch = -1
    best_model = None
    for epoch in iterator:
        logger.info('***** Epoch: {} *****'.format(epoch))
        total_loss = 0.0
        total_items = 0
        for _, batch in enumerate(data_loader):
            batch = batch_to_device(batch, config.device)
            model.train()
            model.zero_grad()
            output = model(batch)
            logliks = output[LOG_LIKELIHOOD]
            # Negative log-likelihood averaged over the batch
            loss = -logliks.sum() / output[BATCH_SIZE]
            loss.backward()
            optimizer.step()
            total_loss += -logliks.sum().item()
            total_items += output[BATCH_SIZE]
        total_loss /= total_items
        losses.append(total_loss)
        logger.info('Train-Loss:{}'.format(total_loss))

        # eval
        eval_result = Evaluator.evaluate(dev_instances, model, config, logger)
        eval_loss = eval_result[TOTAL_LOSS]
        if eval_loss < best_eval_loss:
            logger.info('Update model')
            best_eval_loss = eval_loss
            best_epoch = epoch
            best_model = copy.deepcopy(model)
        elif config.patience < epoch - best_epoch:
            logger.info('Early stopping, Best Epoch: {}'.format(best_epoch))
            break

    logger.info('End Training, Best Epoch: {}'.format(best_epoch))
    model_filename = Trainer.save_model(config, best_model, best_epoch)
    return best_model, model_filename
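The loop above calls a batch_to_device helper that is not shown. A minimal sketch, assuming batches are (possibly nested) dicts of tensors; the body here is hypothetical, only the name comes from the source:

import torch

def batch_to_device(batch, device):
    # Recursively move every tensor in the batch to `device`;
    # non-tensor entries (lengths, strings, ...) pass through unchanged.
    if torch.is_tensor(batch):
        return batch.to(device)
    if isinstance(batch, dict):
        return {k: batch_to_device(v, device) for k, v in batch.items()}
    if isinstance(batch, (list, tuple)):
        return type(batch)(batch_to_device(v, device) for v in batch)
    return batch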
def configure_optimizers(self):
    opt = RAdam(
        self.model.parameters(),
        lr=self.hp.lr,
        weight_decay=self.hp.weight_decay,
    )
    return [opt]
def configure_optimizers(self):
    self.optimizer = RAdam(self.parameters(),
                           lr=self.cfg.train.lr,
                           weight_decay=2e-5)
    warmup_epo = 1
    warmup_factor = 10
    scheduler_cos = CosineAnnealingLR(self.optimizer,
                                      T_max=self.cfg.train.epoch - warmup_epo,
                                      eta_min=0)
    self.scheduler = GradualWarmupScheduler(self.optimizer,
                                            multiplier=warmup_factor,
                                            total_epoch=warmup_epo,
                                            after_scheduler=scheduler_cos)
    return [self.optimizer], [self.scheduler]
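A quick way to sanity-check the warmup-then-cosine combination above is to step it against a dummy parameter and print the learning rate per epoch. A minimal sketch, assuming GradualWarmupScheduler comes from the warmup_scheduler package (the class used above; the import path is an assumption):

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch_optimizer import RAdam
from warmup_scheduler import GradualWarmupScheduler  # assumed source of the class

param = torch.nn.Parameter(torch.zeros(1))
optimizer = RAdam([param], lr=1e-4)
epochs, warmup_epo, warmup_factor = 10, 1, 10
cosine = CosineAnnealingLR(optimizer, T_max=epochs - warmup_epo, eta_min=0)
scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor,
                                   total_epoch=warmup_epo, after_scheduler=cosine)
for epoch in range(epochs):
    optimizer.step()   # step the optimizer before the scheduler (PyTorch >= 1.1)
    scheduler.step()
    print(epoch, optimizer.param_groups[0]['lr'])  # ramps to 1e-3, then decays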
def configure_optimizers(self):
    optimizer = {
        "sgd": FusedSGD(self.parameters(), lr=self.lr, momentum=self.args.momentum),
        "adam": FusedAdam(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adamw": torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "radam": RAdam(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adabelief": AdaBelief(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adabound": AdaBound(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adamp": AdamP(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "novograd": FusedNovoGrad(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
    }[self.args.optimizer.lower()]

    if not self.args.use_scheduler:
        return optimizer

    scheduler = {
        "scheduler": NoamLR(
            optimizer=optimizer,
            warmup_epochs=self.args.warmup,
            total_epochs=self.args.epochs,
            steps_per_epoch=len(self.train_dataloader()) // self.args.gpus,
            init_lr=self.args.init_lr,
            max_lr=self.args.lr,
            final_lr=self.args.final_lr,
        ),
        "interval": "step",
        "frequency": 1,
    }
    return {"optimizer": optimizer, "lr_scheduler": scheduler}
def get(self, params, optimizer_name):
    """
    Creates the torch optimizer specified by `optimizer_name` for the given `params`.

    params: list of torch.nn.parameter.Parameter
    optimizer_name: str
    """
    if optimizer_name == "sgd":
        optimizer = SGD(
            params,
            lr=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
        )
    elif optimizer_name == "adam":
        optimizer = Adam(
            params,
            lr=self.learning_rate,
            betas=tuple(self.betas),
            eps=self.eps,
            weight_decay=self.weight_decay,
            amsgrad=self.amsgrad,
        )
    elif optimizer_name == "adabound":
        optimizer = AdaBound(
            params,
            lr=self.learning_rate,
            betas=tuple(self.betas),
            final_lr=self.adabound_final_lr,
            gamma=self.adabound_gamma,
            eps=self.eps,
            weight_decay=self.weight_decay,
            amsbound=self.amsgrad,
        )
    elif optimizer_name == "lamb":
        optimizer = Lamb(
            params,
            lr=self.learning_rate,
            betas=tuple(self.betas),
            eps=self.eps,
            weight_decay=self.weight_decay,
        )
    elif optimizer_name == "radam":
        optimizer = RAdam(
            params,
            lr=self.learning_rate,
            betas=tuple(self.betas),
            eps=self.eps,
            weight_decay=self.weight_decay,
        )
    else:
        # The original built the Exception without raising it, so an unknown
        # name fell through to an UnboundLocalError on return.
        raise Exception(
            "Invalid OPTIMIZER, try: 'adam', 'sgd', 'adabound', 'lamb' or 'radam'"
        )
    return optimizer
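A minimal usage sketch for the factory method above; the OptimizerFactory wrapper and its attribute values are hypothetical stand-ins for whatever class actually carries these hyperparameters:

import torch.nn as nn

class OptimizerFactory:
    # Hypothetical holder for the attributes the method above reads.
    learning_rate = 1e-3
    momentum = 0.9
    weight_decay = 1e-5
    betas = (0.9, 0.999)
    eps = 1e-8
    amsgrad = False
    adabound_final_lr = 0.1
    adabound_gamma = 1e-3

OptimizerFactory.get = get  # attach the factory method defined above

model = nn.Linear(16, 4)
optimizer = OptimizerFactory().get(list(model.parameters()), "radam")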
def configure_optimizers(self):
    if self.args.hpus:
        self.model = self.model.to(get_device(self.args))
        permute_params(self.model, True, self.args.run_lazy_mode)

    # Avoid instantiating optimizers unless we have to,
    # since some might not be supported on this device.
    if self.args.optimizer.lower() == 'sgd':
        optimizer = SGD(self.parameters(),
                        lr=self.learning_rate,
                        momentum=self.args.momentum)
    elif self.args.optimizer.lower() == 'adam':
        optimizer = Adam(self.parameters(),
                         lr=self.learning_rate,
                         weight_decay=self.args.weight_decay)
    elif self.args.optimizer.lower() == 'radam':
        optimizer = RAdam(self.parameters(),
                          lr=self.learning_rate,
                          weight_decay=self.args.weight_decay)
    elif self.args.optimizer.lower() == 'adamw':
        optimizer = torch.optim.AdamW(self.parameters(),
                                      lr=self.learning_rate,
                                      weight_decay=self.args.weight_decay)
    elif self.args.optimizer.lower() == 'fusedadamw':
        from habana_frameworks.torch.hpex.optimizers import FusedAdamW
        optimizer = FusedAdamW(self.parameters(),
                               lr=self.learning_rate,
                               eps=1e-08,
                               weight_decay=self.args.weight_decay)
    else:
        assert False, "optimizer {} not supported".format(self.args.optimizer.lower())

    scheduler = {
        "none": None,
        "multistep": torch.optim.lr_scheduler.MultiStepLR(optimizer, self.args.steps, gamma=self.args.factor),
        "cosine": torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, self.args.max_epochs),
        "plateau": torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=self.args.factor, patience=self.args.lr_patience),
    }[self.args.scheduler.lower()]

    opt_dict = {"optimizer": optimizer, "monitor": "val_loss"}
    if scheduler is not None:
        opt_dict.update({"lr_scheduler": scheduler})
    return opt_dict
def configure_optimizers(self):
    params = self.model.parameters()
    if self.hp.optim == 'fused_adam':
        from apex.optimizers import FusedAdam
        opt = FusedAdam(
            params,
            lr=self.hp.lr,
            weight_decay=self.hp.weight_decay,
        )
        sched = {
            'scheduler': OneCycleLR(
                opt,
                max_lr=self.hp.lr,
                total_steps=self.total_steps,
            ),
            'interval': 'step',
        }
    elif self.hp.optim == 'radam':
        opt = RAdam(
            params,
            lr=self.hp.lr,
            weight_decay=self.hp.weight_decay,
        )
        # noinspection PyTypeChecker
        sched = {
            'scheduler': LambdaLR(
                opt,
                lr_lambda=partial(
                    flat_cos,
                    total_steps=self.total_steps,
                ),
            ),
            'interval': 'step',
        }
    else:
        raise Exception(f"Unsupported optimizer: {self.hp.optim}")
    return [opt], [sched]
def configure_optimizers(self):
    optimizer = {
        "sgd": FusedSGD(self.parameters(), lr=self.learning_rate, momentum=self.args.momentum),
        "adam": FusedAdam(self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
        "radam": RAdam(self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
    }[self.args.optimizer.lower()]
    scheduler = {
        "none": None,
        "multistep": torch.optim.lr_scheduler.MultiStepLR(optimizer, self.args.steps, gamma=self.args.factor),
        "cosine": torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, self.args.max_epochs),
        "plateau": torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=self.args.factor, patience=self.args.lr_patience),
    }[self.args.scheduler.lower()]
    opt_dict = {"optimizer": optimizer, "monitor": "val_loss"}
    if scheduler is not None:
        opt_dict.update({"lr_scheduler": scheduler})
    return opt_dict
def configure_optimizers(self):
    optimizer = Lookahead(
        RAdam(self.parameters(), lr=0.001, weight_decay=WEIGHT_DECAY, eps=1e-5))
    schedule = {
        'scheduler': OneCycleLRLookahead(
            optimizer,
            max_lr=MAX_LR,
            epochs=EPOCHS,
            steps_per_epoch=int(len(self._trainval[b'filenames']) / BATCH_SIZE),
            verbose=False),
        'name': 'learning_rate',
        'interval': 'step',
        'frequency': 1
    }
    return [optimizer], [schedule]
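Lookahead wrapping RAdam (the "Ranger" combination) behaves as a drop-in optimizer. A minimal sketch using torch_optimizer's implementations; the k and alpha values are illustrative defaults, not taken from the source:

import torch
import torch.nn as nn
from torch_optimizer import Lookahead, RAdam

model = nn.Linear(8, 1)
base = RAdam(model.parameters(), lr=1e-3, weight_decay=1e-2, eps=1e-5)
optimizer = Lookahead(base, k=5, alpha=0.5)  # sync slow weights every k steps

x, y = torch.randn(4, 8), torch.randn(4, 1)
loss = nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()       # Lookahead exposes the standard Optimizer interface
optimizer.zero_grad()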
def main(args: Dict[str, Any]):
    start = time.time()

    # Initialize config
    config_path: str = args["config"]
    with open(config_path, "r", encoding="utf-8") as f:
        config: Dict[str, Any] = yaml.safe_load(f)
    logger.info(f"Loaded config at: {config_path}")
    logger.info(f"{pformat(config)}")

    # Initialize device
    if args["use_gpu"] and torch.cuda.is_available():
        device: torch.device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    # Initialize model
    model = nn.DataParallel(Resnet50(
        embedding_size=config["embedding_size"],
        pretrained=config["pretrained"]
    ))
    model = model.to(device)
    logger.info(f"Initialized model: {model}")

    # Initialize optimizer
    optimizer = RAdam(model.parameters(), lr=config["lr"])
    logger.info(f"Initialized optimizer: {optimizer}")

    # Initialize train transforms
    transform_train = T.Compose([
        T.Resize((config["image_size"], config["image_size"])),
        T.RandomHorizontalFlip(),
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        T.RandomAffine(degrees=5, scale=(0.8, 1.2), translate=(0.2, 0.2)),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        ),
    ])
    logger.info(f"Initialized training transforms: {transform_train}")

    # Initialize training set
    train_set = Dataset(args["train_dir"], transform=transform_train)

    if args["loss"] == "tripletloss":
        # Initialize train loader for triplet loss
        batch_size: int = config["classes_per_batch"] * config["samples_per_class"]
        train_loader = DataLoader(
            train_set,
            batch_size,
            sampler=PKSampler(
                train_set.targets,
                config["classes_per_batch"],
                config["samples_per_class"]
            ),
            shuffle=False,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")

        # Initialize loss function
        loss_function = TripletMarginLoss(
            margin=config["margin"],
            sampling_type=config["sampling_type"]
        )
        logger.info(f"Initialized training loss: {loss_function}")
    elif args["loss"] == "proxy_nca":
        # Initialize train loader for proxy-nca loss
        batch_size: int = config["batch_size"]
        train_loader = DataLoader(
            train_set,
            config["batch_size"],
            shuffle=True,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")
        loss_function = ProxyNCALoss(
            n_classes=len(train_set.classes),
            embedding_size=config["embedding_size"],
            embedding_scale=config["embedding_scale"],
            proxy_scale=config["proxy_scale"],
            smoothing_factor=config["smoothing_factor"],
            device=device
        )
    elif args["loss"] == "proxy_anchor":
        # Initialize train loader for proxy-anchor loss
        batch_size: int = config["batch_size"]
        train_loader = DataLoader(
            train_set,
            config["batch_size"],
            shuffle=True,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")
        loss_function = ProxyAnchorLoss(
            n_classes=len(train_set.classes),
            embedding_size=config["embedding_size"],
            margin=config["margin"],
            alpha=config["alpha"],
            device=device
        )
    elif args["loss"] == "soft_triple":
        # Initialize train loader for soft-triple loss
        batch_size: int = config["batch_size"]
        train_loader = DataLoader(
            train_set,
            config["batch_size"],
            shuffle=True,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")
        loss_function = SoftTripleLoss(
            n_classes=len(train_set.classes),
            embedding_size=config["embedding_size"],
            n_centers_per_class=config["n_centers_per_class"],
            lambda_=config["lambda"],
            gamma=config["gamma"],
            tau=config["tau"],
            margin=config["margin"],
            device=device
        )
    else:
        raise Exception(
            "Only the following losses are supported: "
            "['tripletloss', 'proxy_nca', 'proxy_anchor', 'soft_triple']. "
            f"Got {args['loss']}"
        )

    # Initialize test transforms
    transform_test = T.Compose([
        T.Resize((config["image_size"], config["image_size"])),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        ),
    ])
    logger.info(f"Initialized test transforms: {transform_test}")

    # Initialize test set and test loader
    test_dataset = Dataset(args["test_dir"], transform=transform_test)
    test_loader = DataLoader(
        test_dataset,
        batch_size,
        shuffle=False,
        num_workers=args["n_workers"],
    )
    logger.info(f"Initialized test_loader: {test_loader.dataset}")

    # Initialize reference set and reference loader.
    # If a reference set is not given, use the train set as the reference set,
    # but without random sampling.
    if not args["reference_dir"]:
        reference_set = Dataset(args["train_dir"], transform=transform_test)
    else:
        reference_set = Dataset(args["reference_dir"], transform=transform_test)
    # Sometimes the reference set is too large to fit into memory,
    # therefore we only sample a subset of it.
    n_samples_per_reference_class: int = args["n_samples_per_reference_class"]
    if n_samples_per_reference_class > 0:
        reference_set = get_subset_from_dataset(reference_set, n_samples_per_reference_class)
    reference_loader = DataLoader(
        reference_set,
        batch_size,
        shuffle=False,
        num_workers=args["n_workers"],
    )
    logger.info(f"Initialized reference set: {reference_loader.dataset}")

    # Initialize checkpointing directory
    checkpoint_dir: str = os.path.join(args["checkpoint_root_dir"], CURRENT_TIME)
    writer = SummaryWriter(log_dir=checkpoint_dir)
    logger.info(f"Created checkpoint directory at: {checkpoint_dir}")

    # Dictionary containing all metrics
    output_dict: Dict[str, Any] = {
        "total_epoch": args["n_epochs"],
        "current_epoch": 0,
        "current_iter": 0,
        "metrics": {
            "mean_average_precision": 0.0,
            "average_precision_at_1": 0.0,
            "average_precision_at_5": 0.0,
            "average_precision_at_10": 0.0,
            "top_1_accuracy": 0.0,
            "top_5_accuracy": 0.0,
            "normalized_mutual_information": 0.0,
        }
    }

    # Start training and testing
    logger.info("Start training...")
    for _ in range(1, args["n_epochs"] + 1):
        output_dict = train_one_epoch(
            model, optimizer, loss_function,
            train_loader, test_loader, reference_loader,
            writer, device, config, checkpoint_dir,
            args['log_frequency'], args['validate_frequency'],
            output_dict
        )
    logger.info(f"DONE TRAINING {args['n_epochs']} epochs")

    # Visualize embeddings
    logger.info("Calculating train embeddings for visualization...")
    log_embeddings_to_tensorboard(train_loader, model, device, writer, tag="train")
    logger.info("Calculating reference embeddings for visualization...")
    log_embeddings_to_tensorboard(reference_loader, model, device, writer, tag="reference")
    logger.info("Calculating test embeddings for visualization...")
    log_embeddings_to_tensorboard(test_loader, model, device, writer, tag="test")

    # Visualize the model's graph
    logger.info("Adding graph for visualization")
    with torch.no_grad():
        dummy_input = torch.zeros(1, 3, config["image_size"], config["image_size"]).to(device)
        writer.add_graph(model.module.features, dummy_input)

    # Save all hyper-parameters and corresponding metrics
    logger.info("Saving all hyper-parameters")
    writer.add_hparams(
        config,
        metric_dict={f"hyperparams/{key}": value
                     for key, value in output_dict["metrics"].items()}
    )
    with open(os.path.join(checkpoint_dir, "output_dict.json"), "w") as f:
        json.dump(output_dict, f, indent=4)
    logger.info(f"Dumped output_dict.json at {checkpoint_dir}")

    end = time.time()
    logger.info(f"EVERYTHING IS DONE. Training time: {round(end - start, 2)} seconds")
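The get_subset_from_dataset helper used above is not shown. A minimal sketch, assuming an ImageFolder-style dataset exposing `targets` (which the PKSampler call above also relies on); the body is hypothetical:

from collections import defaultdict
from torch.utils.data import Subset

def get_subset_from_dataset(dataset, n_samples_per_class):
    # Keep at most n_samples_per_class indices per class label.
    per_class = defaultdict(list)
    for idx, label in enumerate(dataset.targets):
        if len(per_class[label]) < n_samples_per_class:
            per_class[label].append(idx)
    indices = [i for idxs in per_class.values() for i in idxs]
    return Subset(dataset, indices)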
def main():
    anchors = [30, 54, 95]
    shuffle = not args.no_shuffle
    exp = args.exp
    warm_up_epoch = 3

    # Load and process data
    if args.fold:
        df_train = pd.read_csv(args.data_path + 'k_fold/official_train_fold%d.csv' % (args.fold))
        df_val = pd.read_csv(args.data_path + 'k_fold/official_val_fold%d.csv' % (args.fold))
    else:
        df_train = pd.read_csv(args.data_path + 'official_train.csv')
        df_val = pd.read_csv(args.data_path + 'official_val.csv')

    train = df_train.image_path.to_list()
    val = df_val.image_path.to_list()
    if exp:
        y_train = df_train.anchor.to_list()
        y_val = df_val.anchor.to_list()
        reg_train_gt = df_train.exp_wind.to_list()
        reg_val_gt = df_val.exp_wind.to_list()
    else:
        y_train = df_train.wind_speed.to_list()
        y_val = df_val.wind_speed.to_list()

    train_transform, val_transform = get_transform(args.image_size)
    train_dataset = WindDataset(image_list=train,
                                target=y_train,
                                exp_target=reg_train_gt if exp else None,
                                transform=train_transform)
    val_dataset = WindDataset(image_list=val,
                              target=y_val,
                              exp_target=reg_val_gt if exp else None,
                              transform=val_transform)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=shuffle,
                              num_workers=args.num_workers,
                              drop_last=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers,
                            drop_last=True)
    warm_loader = DataLoader(dataset=train_dataset,
                             batch_size=args.batch_size * 14,
                             shuffle=shuffle,
                             num_workers=args.num_workers,
                             drop_last=True)

    # Load model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    last_epoch = 0
    # model = ResNet50_BN_idea()
    if not exp:
        model = Effnet_Wind_B7()
        # model = Effnet_Wind_B5()
    else:
        model = Effnet_Wind_B5_exp_6()
    # model = ResNetExample()
    # if not exp:
    #     model = Seresnext_Wind()
    # else:
    #     model = Seresnext_Wind_Exp()

    # Optimizer
    if args.opt == 'radam':
        optimizer = RAdam(
            model.parameters(),
            lr=args.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=args.weight_decay,
        )
    elif args.opt == 'adamw':
        optimizer = AdamW(model.parameters(), args.lr)
    elif args.opt == 'adam':
        optimizer = Adam(model.parameters(), args.lr, weight_decay=args.weight_decay)
    else:
        optimizer = SGD(model.parameters(), args.lr,
                        momentum=0.9, nesterov=True,
                        weight_decay=args.weight_decay)

    if args.weights:
        # model.load_state_dict(torch.load(args.weights))
        last_epoch = extract_number(args.weights)
        try:
            checkpoint = torch.load(args.weights)
            model.load_state_dict(checkpoint['model_state_dict'])
            if checkpoint['pre_opt'] == args.opt:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                print(optimizer)
        except Exception:
            # Fall back to a plain state-dict checkpoint
            model.load_state_dict(torch.load(args.weights))
    else:
        model.apply(reset_m_batchnorm)
    model.to(device)

    # Loss function
    if exp:
        criterion = JointLoss2()
    else:
        criterion = RMSELoss()

    # Generate log and visualization
    save_path = args.save_path
    log_cache = (args.batch_size, args.image_size, shuffle, exp)
    write_log(args.save_path, model, optimizer, criterion, log_cache)
    plot_dict = {'train': list(), 'val': list()}
    log_train_path = save_path + 'training_log.txt'
    plot_train_path = save_path + 'log.json'
    write_mode = 'w'
    if os.path.exists(log_train_path) and os.path.exists(plot_train_path):
        write_mode = 'a'
        with open(plot_train_path, 'r') as j:
            plot_dict = json.load(j)
        plot_dict['train'] = plot_dict['train'][:last_epoch]
        plot_dict['val'] = plot_dict['val'][:last_epoch]

    # Training
    print('Start warm up')
    model.freeze_except_last()
    for epoch in range(warm_up_epoch):
        warm_up(
            model=model,
            dataloader=warm_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
        )
    model.unfreeze()

    with open(log_train_path, write_mode) as f:
        for epoch in range(1, args.epoch + 1):
            print('Epoch:', epoch + last_epoch)
            f.write('Epoch: %d\n' % (epoch + last_epoch))
            loss = train_epoch(model=model,
                               dataloader=train_loader,
                               optimizer=optimizer,
                               criterion=criterion,
                               device=device,
                               exp=exp)
            RMSE = val_epoch(model=model,
                             dataloader=val_loader,
                             device=device,
                             exp=exp,
                             anchors=anchors)
            if not exp:
                f.write('Training loss: %.4f\n' % (loss))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('RMSE loss: %.4f' % (loss))
                print('RMSE val: %.4f' % (RMSE))
            else:
                loss, classify, regress = loss
                RMSE, accuracy = RMSE
                f.write('Training loss: %.4f\n' % (loss))
                f.write('Classification loss: %.4f\n' % (classify))
                f.write('Regression loss: %.4f\n' % (regress))
                f.write('Accuracy val: %.4f\n' % (accuracy))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('Training loss: %.4f' % (loss))
                print('Classification loss: %.4f' % (classify))
                print('Regression loss: %.4f' % (regress))
                print('Accuracy val: %.4f' % (accuracy))
                print('RMSE val: %.4f' % (RMSE))
            # torch.save(model.state_dict(), save_path + 'epoch%d.pth' % (epoch + last_epoch))
            save_name = save_path + 'epoch%d.pth' % (epoch + last_epoch)
            save_pth(save_name, epoch + last_epoch, model, optimizer, args.opt)
            plot_dict['train'].append(loss)
            plot_dict['val'].append(RMSE)
            with open(plot_train_path, 'w') as j:
                json.dump(plot_dict, j)
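The save_pth helper is not shown, but the resume branch above reads 'model_state_dict', 'optimizer_state_dict', and 'pre_opt' from the checkpoint, so a sketch consistent with those keys (the original implementation may differ in detail):

import torch

def save_pth(save_name, epoch, model, optimizer, opt_name):
    # Checkpoint layout matching what the resume code above expects.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'pre_opt': opt_name,
    }, save_name)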
def train(args, cfg):
    device = torch.device('cuda')
    model = ModelWithLoss(cfg).to(device)

    print('------------Model Architecture-------------')
    print(model)

    print('Loading Datasets...')
    data_loader = {}
    if cfg.SOLVER.AUGMENTATION:
        train_transforms = SyntheticTransforms()
    else:
        train_transforms = ToTensor()

    if cfg.DATASET.TRACK == 'synthetic':
        train_dataset = SyntheticBurst(ZurichRAW2RGB(cfg.DATASET.TRAIN_SYNTHETIC),
                                       crop_sz=cfg.SOLVER.PATCH_SIZE,
                                       burst_size=cfg.MODEL.BURST_SIZE,
                                       transform=train_transforms)
    elif cfg.DATASET.TRACK == 'real':
        train_dataset = BurstSRDataset(cfg.DATASET.REAL,
                                       split='train',
                                       crop_sz=cfg.SOLVER.PATCH_SIZE // 8,
                                       burst_size=cfg.MODEL.BURST_SIZE)

    sampler = RandomSampler(train_dataset)
    batch_sampler = BatchSampler(sampler=sampler, batch_size=cfg.SOLVER.BATCH_SIZE, drop_last=True)
    batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations=cfg.SOLVER.MAX_ITER)
    train_loader = DataLoader(train_dataset,
                              num_workers=args.num_workers,
                              batch_sampler=batch_sampler,
                              pin_memory=True)
    data_loader['train'] = train_loader

    # if args.eval_step != 0:
    #     val_transforms =
    #     val_dataset =
    #     sampler = SequentialSampler(val_dataset)
    #     batch_sampler = BatchSampler(sampler=sampler, batch_size=args.batch_size, drop_last=False)
    #     val_loader = DataLoader(val_dataset, num_workers=args.num_workers, batch_sampler=batch_sampler)
    #     data_loader['val'] = val_loader

    if cfg.SOLVER.OPTIMIZER == 'radam':
        optimizer = RAdam(filter(lambda p: p.requires_grad, model.parameters()),
                          lr=cfg.SOLVER.LR)
    elif cfg.SOLVER.OPTIMIZER == 'adabound':
        optimizer = AdaBound(filter(lambda p: p.requires_grad, model.parameters()),
                             lr=cfg.SOLVER.LR,
                             final_lr=cfg.SOLVER.FINAL_LR)
    # optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.SOLVER.LR)

    # scheduler = MultiStepLR(optimizer, cfg.SOLVER.LR_STEP, gamma=0.1)
    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.LR, cfg.SOLVER.LR_STEP,
                                  warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
                                  warmup_iters=cfg.SOLVER.WARMUP_ITER)

    if args.resume_iter != 0:
        model_path = os.path.join(cfg.OUTPUT_DIR, 'model', 'iteration_{}.pth'.format(args.resume_iter))
        print(f'Resume from {model_path}')
        model.model.load_state_dict(fix_model_state_dict(torch.load(model_path)))
        if model.flow_refine:
            FR_model_path = os.path.dirname(model_path)[:-5] + "FR_model/" + 'iteration_{}.pth'.format(args.resume_iter)
            model.FR_model.load_state_dict(torch.load(FR_model_path))
        if model.denoise_burst:
            denoise_model_path = os.path.dirname(model_path)[:-5] + "denoise_model/" + 'iteration_{}.pth'.format(args.resume_iter)
            model.denoise_model.load_state_dict(torch.load(denoise_model_path))
        optimizer.load_state_dict(torch.load(os.path.join(cfg.OUTPUT_DIR, 'optimizer', 'iteration_{}.pth'.format(args.resume_iter))))
        scheduler.load_state_dict(torch.load(os.path.join(cfg.OUTPUT_DIR, 'scheduler', 'iteration_{}.pth'.format(args.resume_iter))))
    elif cfg.SOLVER.PRETRAIN_MODEL != '':
        model_path = cfg.SOLVER.PRETRAIN_MODEL
        print(f'load pretrain model from {model_path}')
        model.model.load_state_dict(fix_model_state_dict(torch.load(model_path)))
        if model.flow_refine:
            FR_model_path = os.path.dirname(model_path)[:-5] + "FR_model/" + os.path.basename(cfg.SOLVER.PRETRAIN_MODEL)
            model.FR_model.load_state_dict(torch.load(FR_model_path))
        if model.denoise_burst:
            denoise_model_path = os.path.dirname(model_path)[:-5] + "denoise_model/" + os.path.basename(cfg.SOLVER.PRETRAIN_MODEL)
            model.denoise_model.load_state_dict(torch.load(denoise_model_path))

    if cfg.SOLVER.SYNC_BATCHNORM:
        model = convert_model(model).to(device)

    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpus)))

    if not args.debug:
        summary_writer = SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    else:
        summary_writer = None

    do_train(args, cfg, model, optimizer, scheduler, data_loader, device, summary_writer)
def train_loop(folds, fold):
    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")
    elif CFG.device == 'TPU':
        if CFG.nprocs == 1:
            LOGGER.info(f"========== fold: {fold} training ==========")
        elif CFG.nprocs == 8:
            xm.master_print(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index
    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    train_folds = train_folds[train_folds['StudyInstanceUID'].isin(
        train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds,
                                 train_annotations,
                                 use_annot=True,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 train_annotations,
                                 use_annot=False,
                                 transform=get_transforms(data='valid'))

    if CFG.device == 'GPU':
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=False)
    elif CFG.device == 'TPU':
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=CFG.batch_size,
                                                   sampler=train_sampler,
                                                   drop_last=True,
                                                   num_workers=CFG.num_workers)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=CFG.batch_size * 2,
                                                   sampler=valid_sampler,
                                                   drop_last=False,
                                                   num_workers=CFG.num_workers)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer,
                                          mode='min',
                                          factor=CFG.factor,
                                          patience=CFG.patience,
                                          verbose=True,
                                          eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer,
                                          T_max=CFG.T_max,
                                          eta_min=CFG.min_lr,
                                          last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                    T_0=CFG.T_0,
                                                    T_mult=1,
                                                    eta_min=CFG.min_lr,
                                                    last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.device == 'TPU':
        device = xm.xla_device()
    elif CFG.device == 'GPU':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    teacher_model = CustomSeResNet152D(CFG.model_name, pretrained=False)
    teacher_model.to(device)
    state = torch.load(CFG.teacher)
    teacher_model.load_state_dict(state['model'])
    for param in teacher_model.parameters():
        param.requires_grad = False
    teacher_model.eval()

    model = CustomSeResNet152D_WLF(CFG.model_name, pretrained=True)
    model.to(device)
    # state = torch.load(CFG.student)
    # model.load_state_dict(state['model'])

    optimizer = RAdam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    train_criterion = CustomLoss(weights=CFG.weights)
    valid_criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()

        # train
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_loss = train_fn(train_loader, teacher_model, model,
                                    train_criterion, optimizer, epoch,
                                    scheduler, device)
            elif CFG.nprocs == 8:
                para_train_loader = pl.ParallelLoader(train_loader, [device])
                avg_loss = train_fn(
                    para_train_loader.per_device_loader(device), teacher_model,
                    model, train_criterion, optimizer, epoch, scheduler, device)
        elif CFG.device == 'GPU':
            avg_loss = train_fn(train_loader, teacher_model, model,
                                train_criterion, optimizer, epoch, scheduler,
                                device)

        # eval
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_val_loss, preds, _ = valid_fn(valid_loader, model,
                                                  valid_criterion, device)
            elif CFG.nprocs == 8:
                para_valid_loader = pl.ParallelLoader(valid_loader, [device])
                avg_val_loss, preds, valid_labels = valid_fn(
                    para_valid_loader.per_device_loader(device), model,
                    valid_criterion, device)
                preds = idist.all_gather(torch.tensor(preds)).to('cpu').numpy()
                valid_labels = idist.all_gather(
                    torch.tensor(valid_labels)).to('cpu').numpy()
        elif CFG.device == 'GPU':
            avg_val_loss, preds, _ = valid_fn(valid_loader, model,
                                              valid_criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, (CosineAnnealingLR, CosineAnnealingWarmRestarts)):
            scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        if CFG.device == 'GPU':
            LOGGER.info(
                f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s'
            )
            LOGGER.info(
                f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}'
            )
        elif CFG.device == 'TPU':
            if CFG.nprocs == 1:
                LOGGER.info(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s'
                )
                LOGGER.info(
                    f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}'
                )
            elif CFG.nprocs == 8:
                xm.master_print(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s'
                )
                xm.master_print(
                    f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}'
                )

        if score > best_score:
            best_score = score
            if CFG.device == 'GPU':
                LOGGER.info(
                    f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                torch.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(
                        f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(
                        f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                xm.save({
                    'model': model,
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            if CFG.device == 'GPU':
                LOGGER.info(
                    f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                torch.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(
                        f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(
                        f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                xm.save({
                    'model': model,
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')

        # # Save every epoch's weights for inference
        # if CFG.device == 'TPU':
        #     xm.save({'model': model.state_dict()}, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')
        # elif CFG.device == 'GPU':
        #     torch.save({'model': model.state_dict()}, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    if CFG.nprocs != 8:
        check_point = torch.load(
            OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
        for c in [f'pred_{c}' for c in CFG.target_cols]:
            valid_folds[c] = np.nan
        valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']

    return valid_folds
def get_optimizer(
    model: nn.Module,
    optimizer_name: str,
    learning_rate: float,
    weight_decay: float = 1e-5,
    no_weight_decay_on_bias: bool = False,
    eps: float = 1e-5,
    **kwargs,
) -> Optimizer:
    """
    Construct an Optimizer for a given model.

    Args:
        model: Model to optimize. Only parameters that require_grad will be used
        optimizer_name: Name of the optimizer. Case-insensitive
        learning_rate: Target learning rate (regardless of the scheduler)
        weight_decay: Target weight decay
        no_weight_decay_on_bias: Whether to disable weight decay on bias parameters
        eps: Default epsilon for Adam-like optimizers
        **kwargs: Additional parameters for the optimizer

    Returns:
        The constructed Optimizer instance
    """
    from torch.optim import ASGD, SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    # Optimizer parameter groups
    default_pg, biases_pg = [], []
    for k, v in model.named_parameters():
        if v.requires_grad:
            if str.endswith(k, ".bias"):
                biases_pg.append(v)  # biases
            else:
                default_pg.append(v)  # all else

    if no_weight_decay_on_bias:
        parameters = default_pg
    else:
        parameters = default_pg + biases_pg

    optimizer: Optimizer = None
    if optimizer_name.lower() == "sgd":
        optimizer = SGD(
            parameters,
            lr=learning_rate,
            momentum=0.9,
            nesterov=True,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "asgd":
        optimizer = ASGD(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "adam":
        optimizer = Adam(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "rms":
        optimizer = RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)
    elif optimizer_name.lower() == "adamw":
        optimizer = AdamW(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "radam":
        optimizer = RAdam(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "ranger":
        optimizer = Ranger(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "lamb":
        optimizer = Lamb(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "diffgrad":
        optimizer = DiffGrad(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "novograd":
        optimizer = NovoGrad(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB
        optimizer = FusedLAMB(parameters, learning_rate, eps=eps,
                              weight_decay=weight_decay, **kwargs)
    elif optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD
        optimizer = FusedSGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                             weight_decay=weight_decay, **kwargs)
    elif optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam
        optimizer = FusedAdam(parameters, learning_rate, eps=eps,
                              weight_decay=weight_decay, adam_w_mode=True, **kwargs)
    else:
        raise KeyError(f"Cannot get optimizer by name {optimizer_name}")

    # Currently either no_wd or per-group lr
    if no_weight_decay_on_bias:
        optimizer.add_param_group({"params": biases_pg, "weight_decay": 0})

    return optimizer
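A usage sketch for the helper above; the toy model and hyperparameter values are illustrative:

import torch.nn as nn

model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 10))
optimizer = get_optimizer(model, "radam", learning_rate=3e-4,
                          weight_decay=1e-2, no_weight_decay_on_bias=True)
# Two param groups: decayed weights, plus a bias group with weight_decay=0.
print([g.get("weight_decay") for g in optimizer.param_groups])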
def main(cfg: DictConfig):
    # Config
    ################################################################
    IMAGE_NUM = cfg.data.image_num
    IMAGE_SIZE = cfg.data.image_size
    exp_name = cfg.data.exp
    model_name = f'efficientnet-{cfg.data.model_name}'
    BATCH_SIZE = cfg.training.batch_size
    lr = cfg.training.lr
    NUM_EPOCHS = cfg.training.num_epoch
    FOLD = cfg.training.fold
    OPTIMIZER = cfg.training.optimizer
    SCHEDULER = cfg.training.scheduler

    # Change Current Dir
    ################################################################
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    # Data Loading
    ################################################################
    # Background_rate = 0.7
    # img_path = glob.glob('./data/grid_256_level_1/img/*.jpg')
    img_path = glob.glob('./data/grid_128_level_1/img/*.jpg')

    # Load the label data
    # meta = pd.read_csv('./data/input/train.csv')
    # meta = pd.read_csv('./data/input/modified_train.csv')  # fixed ver. 1
    meta = pd.read_csv('./data/input/modified_train_v2.csv')  # fixed ver. 2 (accounts for the proportions of scores 3, 4, and 5)

    # Data Augmentation
    transform = ImageTransform(img_size=IMAGE_SIZE)  # specify the image size

    # StratifiedKFold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    meta['fold'] = -1
    for i, (trn_idx, val_idx) in enumerate(cv.split(meta, meta['isup_grade'])):
        meta.loc[val_idx, 'fold'] = i

    # Dataset, DataLoader
    ################################################################
    # If multi is true, use all folds; if false (the default), use only a single fold
    dataloaders = get_dataloaders(meta, FOLD, img_path, transform, IMAGE_NUM,
                                  BATCH_SIZE, multi=cfg.training.multi_fold,
                                  binning=cfg.training.binning)

    # Model
    ################################################################
    if cfg.training.binning:
        OUTPUTSIZE = 5
    else:
        OUTPUTSIZE = 6
    net = ModelEFN_2(model_name=model_name, output_size=OUTPUTSIZE)

    # Set Weight
    # model_path = './weights/efn_b0_fromjpg_augtile_04_epoch_18_loss_1.191_kappa_0.716.pth'
    # net.load_state_dict(torch.load(model_path, map_location=device))

    # criterion = nn.CrossEntropyLoss(reduction='mean')
    criterion = nn.BCEWithLogitsLoss()
    # criterion = QWKLoss()

    opt_dict = {
        'adam': optim.Adam(net.parameters(), lr=lr),
        'radam': RAdam(net.parameters(), lr=lr),
        'sgd': optim.SGD(net.parameters(), lr=lr, weight_decay=0.0001, momentum=0.9)
    }
    optimizer = opt_dict[OPTIMIZER]

    sch_dict = {
        'step': StepLR(optimizer, step_size=4, gamma=0.5),
        'cos': CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2, eta_min=lr * 0.1),
        'cos_2': CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=0),
        'none': None,
        'warmup': None
    }
    scheduler = sch_dict[SCHEDULER]

    if SCHEDULER == 'warmup':
        del optimizer, scheduler
        warmup_factor = 10
        warmup_epo = 1
        if OPTIMIZER == 'adam':
            optimizer = optim.Adam(net.parameters(), lr=lr / warmup_factor)
        elif OPTIMIZER == 'radam':
            optimizer = RAdam(net.parameters(), lr=lr / warmup_factor)
        else:
            optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=0.0001, momentum=0.9)
        scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, NUM_EPOCHS - warmup_epo, eta_min=0)
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=warmup_factor,
                                           total_epoch=warmup_epo,
                                           after_scheduler=scheduler_cosine)

    # ML Flow
    ###########################################################################
    experient_name = f'PANDA_{cfg.data.model_name}'
    mlflow.set_experiment(experient_name)
    with mlflow.start_run():
        # Log the parameters
        for k, v in cfg.data.items():
            mlflow.log_param('data/' + str(k), v)
        for k, v in cfg.training.items():
            mlflow.log_param('training/' + str(k), v)

        # Train
        ################################################################
        writer = SummaryWriter(f'./tensorboard/{exp_name}')
        if cfg.training.multi_fold:
            trainer = Trainer_multifold(dataloaders, net, device, NUM_EPOCHS,
                                        criterion, optimizer, scheduler,
                                        exp=exp_name, writer=writer,
                                        save_weight_path='./weights',
                                        binning=cfg.training.binning)
        else:
            trainer = Trainer(dataloaders, net, device, NUM_EPOCHS,
                              criterion, optimizer, scheduler,
                              exp=exp_name, writer=writer,
                              save_weight_path='./weights',
                              binning=cfg.training.binning)
        trainer.train()
def get_optimizer(optimizer_name: str, parameters, learning_rate: float,
                  weight_decay=1e-5, eps=1e-5, **kwargs) -> Optimizer:
    from torch.optim import SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    if optimizer_name.lower() == "sgd":
        return SGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                   weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "adam":
        return Adam(parameters, learning_rate, weight_decay=weight_decay,
                    eps=eps, **kwargs)  # As Jeremy suggests
    if optimizer_name.lower() == "rms":
        return RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "adamw":
        return AdamW(parameters, learning_rate, weight_decay=weight_decay,
                     eps=eps, **kwargs)
    if optimizer_name.lower() == "radam":
        return RAdam(parameters, learning_rate, weight_decay=weight_decay,
                     eps=eps, **kwargs)  # As Jeremy suggests

    # Optimizers from torch-optimizer
    if optimizer_name.lower() == "ranger":
        return Ranger(parameters, learning_rate, eps=eps,
                      weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "lamb":
        return Lamb(parameters, learning_rate, eps=eps,
                    weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "diffgrad":
        return DiffGrad(parameters, learning_rate, eps=eps,
                        weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "novograd":
        return NovoGrad(parameters, learning_rate, eps=eps,
                        weight_decay=weight_decay, **kwargs)

    # Optimizers from Apex (the fused versions are faster on GPUs with tensor cores)
    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB
        return FusedLAMB(parameters, learning_rate, eps=eps,
                         weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD
        return FusedSGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                        weight_decay=weight_decay, **kwargs)
    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam
        return FusedAdam(parameters, learning_rate, eps=eps,
                         weight_decay=weight_decay, adam_w_mode=True, **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
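This parameters-first variant can be fed just the trainable parameters, e.g. after freezing part of a backbone. A short usage sketch with a toy model and illustrative values:

import torch.nn as nn

model = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))
for p in model[0].parameters():
    p.requires_grad = False  # freeze the first layer

trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = get_optimizer("radam", trainable, learning_rate=1e-3)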
def main(
    data_dir,
    save_dir,
    total_steps,
    warmup_steps,
    valid_steps,
    log_steps,
    save_steps,
    milestones,
    exclusive_rate,
    n_samples,
    accu_steps,
    batch_size,
    n_workers,
    preload,
    comment,
    ckpt,
    grad_norm_clip,
    use_target_features,
    **kwargs,
):
    """Main function."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    metadata_path = Path(data_dir) / "metadata.json"
    dataset = IntraSpeakerDataset(data_dir, metadata_path, n_samples, preload,
                                  ref_feat=use_target_features)
    trainlen = int(0.9 * len(dataset))
    lengths = [trainlen, len(dataset) - trainlen]
    trainset, validset = random_split(dataset, lengths)
    train_loader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    valid_loader = DataLoader(
        validset,
        batch_size=batch_size * accu_steps,
        num_workers=n_workers,
        drop_last=True,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    train_iterator = iter(train_loader)

    if comment is not None:
        log_dir = "logs/"
        log_dir += datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        log_dir += "_" + comment
        writer = SummaryWriter(log_dir)

    save_dir_path = Path(save_dir)
    save_dir_path.mkdir(parents=True, exist_ok=True)

    if ckpt is not None:
        try:
            start_step = int(ckpt.split('-')[1][4:])
            ref_included = True
        except Exception:
            start_step = 0
            ref_included = False
        model = torch.jit.load(ckpt).to(device)
        optimizer = RAdam(
            [
                {"params": model.unet.parameters(), "lr": 1e-6},
                {"params": model.smoothers.parameters()},
                {"params": model.mel_linear.parameters()},
                {"params": model.post_net.parameters()},
            ],
            lr=1e-4,
        )
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps,
                                                    total_steps - start_step)
        print("Optimizer and scheduler restarted.")
        print(f"Model loaded from {ckpt}, iteration: {start_step}")
    else:
        ref_included = False
        start_step = 0
        model = FragmentVC().to(device)
        model = torch.jit.script(model)
        optimizer = RAdam(model.parameters(), lr=1e-4)
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    criterion = nn.L1Loss()

    best_loss = float("inf")
    best_state_dict = None
    self_exclude = 0.0

    pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

    for step in range(start_step, total_steps):
        batch_loss = 0.0

        # Gradient accumulation: accumulate accu_steps mini-batches per update
        for _ in range(accu_steps):
            try:
                batch = next(train_iterator)
            except StopIteration:
                train_iterator = iter(train_loader)
                batch = next(train_iterator)

            loss = model_fn(batch, model, criterion, self_exclude, ref_included, device)
            loss = loss / accu_steps
            batch_loss += loss.item()
            loss.backward()

        # Clip before stepping so the clipped gradients are the ones applied
        # (the original clipped after optimizer.step(), which has no effect).
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        pbar.update()
        pbar.set_postfix(loss=f"{batch_loss:.2f}", excl=self_exclude, step=step + 1)

        if step % log_steps == 0 and comment is not None:
            writer.add_scalar("Loss/train", batch_loss, step)
            writer.add_scalar("Self-exclusive Rate", self_exclude, step)

        if (step + 1) % valid_steps == 0:
            pbar.close()
            valid_loss = valid(valid_loader, model, criterion, device)
            if comment is not None:
                writer.add_scalar("Loss/valid", valid_loss, step + 1)
            if valid_loss < best_loss:
                best_loss = valid_loss
                best_state_dict = model.state_dict()
            pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

        if (step + 1) % save_steps == 0 and best_state_dict is not None:
            loss_str = f"{best_loss:.4f}".replace(".", "dot")
            best_ckpt_name = f"retriever-best-loss{loss_str}.pt"
            loss_str = f"{valid_loss:.4f}".replace(".", "dot")
            curr_ckpt_name = f"retriever-step{step+1}-loss{loss_str}.pt"

            current_state_dict = model.state_dict()
            model.cpu()
            model.load_state_dict(best_state_dict)
            model.save(str(save_dir_path / best_ckpt_name))
            model.load_state_dict(current_state_dict)
            model.save(str(save_dir_path / curr_ckpt_name))
            model.to(device)
            pbar.write(f"Step {step + 1}, best model saved. (loss={best_loss:.4f})")

        if (step + 1) >= milestones[1]:
            self_exclude = exclusive_rate
        elif (step + 1) == milestones[0]:
            ref_included = True
            optimizer = RAdam(
                [
                    {"params": model.unet.parameters(), "lr": 1e-6},
                    {"params": model.smoothers.parameters()},
                    {"params": model.mel_linear.parameters()},
                    {"params": model.post_net.parameters()},
                ],
                lr=1e-4,
            )
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, warmup_steps, total_steps - milestones[0])
            pbar.write("Optimizer and scheduler restarted.")
        elif (step + 1) > milestones[0]:
            self_exclude = (step + 1 - milestones[0]) / (milestones[1] - milestones[0])
            self_exclude *= exclusive_rate

    pbar.close()
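For reference, the accumulate-clip-step ordering used above in isolation; a minimal sketch with a toy model, all names illustrative:

import torch
import torch.nn as nn
from torch_optimizer import RAdam

model = nn.Linear(10, 1)
optimizer = RAdam(model.parameters(), lr=1e-4)
accu_steps, grad_norm_clip = 4, 1.0

for _ in range(accu_steps):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = nn.functional.l1_loss(model(x), y) / accu_steps  # scale for accumulation
    loss.backward()  # gradients accumulate across the inner loop

torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)  # clip once, before step
optimizer.step()
optimizer.zero_grad()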
def get_optimizer(net, opt_conf, tasks=None, is_disc=False, iterations=-1):
    """Returns a tuple (optimizer, scheduler, lr_names) according to opt_conf,
    which should come from the trainer's opts as: trainer.opts.<model>.opt

    Args:
        net (nn.Module): Network to update
        opt_conf (addict.Dict): optimizer and scheduler options
        tasks: list of tasks
        is_disc (bool): whether `net` is a (dict of) discriminator(s)
        iterations (int, optional): Last epoch number. Defaults to -1,
            meaning start with base lr.

    Returns:
        Tuple: (torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler, list of lr names)
    """
    opt = scheduler = None
    lr_names = []
    if tasks is None:
        lr_default = opt_conf.lr
        params = net.parameters()
        lr_names.append("full")
    elif isinstance(opt_conf.lr, float):
        # Use the default lr for all tasks
        lr_default = opt_conf.lr
        params = net.parameters()
        lr_names.append("full")
    elif len(opt_conf.lr) == 1:
        # Use the default lr for all tasks
        lr_default = opt_conf.lr.default
        params = net.parameters()
        lr_names.append("full")
    else:
        lr_default = opt_conf.lr.default
        params = list()
        for task in tasks:
            lr = opt_conf.lr.get(task, lr_default)
            parameters = None
            if not is_disc:
                # Parameters for the encoder
                if task == "m":
                    parameters = net.encoder.parameters()
                    params.append({"params": parameters, "lr": lr})
                    lr_names.append("encoder")
                # Parameters for the decoders
                if task == "p":
                    if hasattr(net, "painter"):
                        parameters = net.painter.parameters()
                        lr_names.append("painter")
                else:
                    parameters = net.decoders[task].parameters()
                    lr_names.append(f"decoder_{task}")
            else:
                if task in net:
                    parameters = net[task].parameters()
                    lr_names.append(f"disc_{task}")
            if parameters is not None:
                params.append({"params": parameters, "lr": lr})

    if opt_conf.optimizer.lower() == "extraadam":
        opt = ExtraAdam(params, lr=lr_default, betas=(opt_conf.beta1, 0.999))
    elif opt_conf.optimizer.lower() == "novograd":
        opt = NovoGrad(params, lr=lr_default, betas=(opt_conf.beta1, 0))  # default for beta2 is 0
    elif opt_conf.optimizer.lower() == "radam":
        opt = RAdam(params, lr=lr_default, betas=(opt_conf.beta1, 0.999))
    elif opt_conf.optimizer.lower() == "rmsprop":
        opt = RMSprop(params, lr=lr_default)
    else:
        opt = Adam(params, lr=lr_default, betas=(opt_conf.beta1, 0.999))
    scheduler = get_scheduler(opt, opt_conf, iterations)
    return opt, scheduler, lr_names
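A usage sketch for the per-task learning-rate path above. The Net stand-in, config keys, and printed lr_names are assumptions; the real model and get_scheduler come from the surrounding codebase:

import torch.nn as nn
from addict import Dict

class Net(nn.Module):
    # Minimal stand-in exposing the attributes the function above reads.
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 8)
        self.decoders = nn.ModuleDict({"m": nn.Linear(8, 1)})
        self.painter = nn.Linear(8, 3)

opt_conf = Dict({
    "optimizer": "radam",
    "beta1": 0.9,
    "lr": {"default": 1e-4, "m": 5e-5, "p": 2e-4},
    # ...plus whatever scheduler options get_scheduler(...) consumes
})

opt, scheduler, lr_names = get_optimizer(Net(), opt_conf, tasks=["m", "p"])
print(lr_names)  # expected: ['encoder', 'decoder_m', 'painter']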
def train(rank: int, cfg: DictConfig):
    print(OmegaConf.to_yaml(cfg))

    if cfg.train.n_gpu > 1:
        init_process_group(backend=cfg.train.dist_config['dist_backend'],
                           init_method=cfg.train.dist_config['dist_url'],
                           world_size=cfg.train.dist_config['world_size'] * cfg.train.n_gpu,
                           rank=rank)

    device = torch.device('cuda:{:d}'.format(rank) if torch.cuda.is_available() else 'cpu')

    generator = Generator(sum(cfg.model.feature_dims), *cfg.model.cond_dims,
                          **cfg.model.generator).to(device)
    discriminator = Discriminator(**cfg.model.discriminator).to(device)

    if rank == 0:
        print(generator)
        os.makedirs(cfg.train.ckpt_dir, exist_ok=True)
        print("checkpoints directory : ", cfg.train.ckpt_dir)

    cp_g = cp_do = None
    if os.path.isdir(cfg.train.ckpt_dir):
        cp_g = scan_checkpoint(cfg.train.ckpt_dir, 'g_')
        cp_do = scan_checkpoint(cfg.train.ckpt_dir, 'd_')

    steps = 1
    if cp_g is None or cp_do is None:
        state_dict_do = None
        last_epoch = -1
    else:
        state_dict_g = load_checkpoint(cp_g, device)
        state_dict_do = load_checkpoint(cp_do, device)
        generator.load_state_dict(state_dict_g['generator'])
        discriminator.load_state_dict(state_dict_do['discriminator'])
        steps = state_dict_do['steps'] + 1
        last_epoch = state_dict_do['epoch']

    if cfg.train.n_gpu > 1:
        generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
        discriminator = DistributedDataParallel(discriminator, device_ids=[rank]).to(device)

    optim_g = RAdam(generator.parameters(), cfg.opt.lr, betas=cfg.opt.betas)
    optim_d = RAdam(discriminator.parameters(), cfg.opt.lr, betas=cfg.opt.betas)

    if state_dict_do is not None:
        optim_g.load_state_dict(state_dict_do['optim_g'])
        optim_d.load_state_dict(state_dict_do['optim_d'])

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
        optim_g, gamma=cfg.opt.lr_decay, last_epoch=last_epoch)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
        optim_d, gamma=cfg.opt.lr_decay, last_epoch=last_epoch)

    train_filelist = load_dataset_filelist(cfg.dataset.train_list)
    trainset = FeatureDataset(cfg.dataset, train_filelist, cfg.data)
    train_sampler = DistributedSampler(trainset) if cfg.train.n_gpu > 1 else None
    # shuffle and sampler are mutually exclusive in DataLoader;
    # shuffle only when no distributed sampler is used.
    train_loader = DataLoader(trainset,
                              batch_size=cfg.train.batch_size,
                              num_workers=cfg.train.num_workers,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              pin_memory=True,
                              drop_last=True)

    if rank == 0:
        val_filelist = load_dataset_filelist(cfg.dataset.test_list)
        valset = FeatureDataset(cfg.dataset, val_filelist, cfg.data, segmented=False)
        # Validation runs on rank 0 only, so no distributed sampler here
        # (the original erroneously reused train_sampler).
        val_loader = DataLoader(valset,
                                batch_size=1,
                                num_workers=cfg.train.num_workers,
                                shuffle=False,
                                sampler=None,
                                pin_memory=True)
        sw = SummaryWriter(os.path.join(cfg.train.ckpt_dir, 'logs'))

    generator.train()
    discriminator.train()
    for epoch in range(max(0, last_epoch), cfg.train.epochs):
        if rank == 0:
            start = time.time()
            print("Epoch: {}".format(epoch + 1))

        if cfg.train.n_gpu > 1:
            train_sampler.set_epoch(epoch)

        for y, x_noised_features, x_noised_cond in train_loader:
            if rank == 0:
                start_b = time.time()
            y = y.to(device, non_blocking=True)
            x_noised_features = x_noised_features.transpose(1, 2).to(device, non_blocking=True)
            x_noised_cond = x_noised_cond.to(device, non_blocking=True)
            z1 = torch.randn(cfg.train.batch_size, cfg.model.cond_dims[1], device=device)
            z2 = torch.randn(cfg.train.batch_size, cfg.model.cond_dims[1], device=device)

            y_hat1 = generator(x_noised_features, x_noised_cond, z=z1)
            y_hat2 = generator(x_noised_features, x_noised_cond, z=z2)

            # Discriminator
            real_scores, fake_scores = discriminator(y), discriminator(y_hat1.detach())
            d_loss = discriminator_loss(real_scores, fake_scores)

            optim_d.zero_grad()
            d_loss.backward()
            optim_d.step()

            # Generator
            # Re-score the (non-detached) fake so the adversarial loss actually
            # back-propagates into the generator; the original reused the
            # detached scores, which carry no generator gradient.
            fake_scores = discriminator(y_hat1)
            # `criterion` is assumed to be defined elsewhere; the logging below
            # suggests an STFT-based reconstruction loss.
            g_stft_loss = criterion(y, y_hat1) + criterion(y, y_hat2) - criterion(y_hat1, y_hat2)
            g_adv_loss = adversarial_loss(fake_scores)
            g_loss = g_adv_loss + g_stft_loss

            optim_g.zero_grad()
            g_loss.backward()
            optim_g.step()

            if rank == 0:
                # STDOUT logging
                if steps % cfg.train.stdout_interval == 0:
                    with torch.no_grad():
                        print('Steps : {:d}, Gen Loss Total : {:4.3f}, STFT Error : {:4.3f}, s/b : {:4.3f}'
                              .format(steps, g_loss, g_stft_loss, time.time() - start_b))

                # checkpointing
                if steps % cfg.train.checkpoint_interval == 0:
                    ckpt_dir = "{}/g_{:08d}".format(cfg.train.ckpt_dir, steps)
                    save_checkpoint(ckpt_dir, {
                        'generator': (generator.module if cfg.train.n_gpu > 1
                                      else generator).state_dict()
                    })
                    ckpt_dir = "{}/do_{:08d}".format(cfg.train.ckpt_dir, steps)
                    save_checkpoint(ckpt_dir, {
                        'discriminator': (discriminator.module if cfg.train.n_gpu > 1
                                          else discriminator).state_dict(),
                        'optim_g': optim_g.state_dict(),
                        'optim_d': optim_d.state_dict(),
                        'steps': steps,
                        'epoch': epoch
                    })

                # Tensorboard summary logging
                if steps % cfg.train.summary_interval == 0:
                    sw.add_scalar("training/gen_loss_total", g_loss, steps)
                    sw.add_scalar("training/gen_stft_error", g_stft_loss, steps)

                # Validation
                if steps % cfg.train.validation_interval == 0:
                    generator.eval()
                    torch.cuda.empty_cache()
                    val_err_tot = 0
                    with torch.no_grad():
                        for j, (y, x_noised_features, x_noised_cond) in enumerate(val_loader):
                            y = y.to(device)  # keep the target on the same device as the output
                            y_hat = generator(x_noised_features.transpose(1, 2).to(device),
                                              x_noised_cond.to(device))
                            val_err_tot += criterion(y, y_hat).item()
                            if j <= 4:
                                # sw.add_audio('noised/y_noised_{}'.format(j), y_noised[0], steps, cfg.data.target_sample_rate)
                                sw.add_audio('generated/y_hat_{}'.format(j), y_hat[0], steps, cfg.data.sample_rate)
                                sw.add_audio('gt/y_{}'.format(j), y[0], steps, cfg.data.sample_rate)
                        val_err = val_err_tot / (j + 1)
                        sw.add_scalar("validation/stft_error", val_err, steps)
                    generator.train()

            steps += 1

        scheduler_g.step()
        scheduler_d.step()

        if rank == 0:
            print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time() - start)))
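The loop above calls discriminator_loss, adversarial_loss, and criterion without defining them. A minimal sketch of plausible stand-ins: LSGAN-style GAN objectives and a single-resolution STFT distance in place of the (likely multi-resolution) loss the logs suggest. These exact formulations are assumptions, not confirmed by the source:

import torch
import torch.nn.functional as F

def discriminator_loss(real_scores, fake_scores):
    # LSGAN-style objective: push real scores toward 1 and fake scores toward 0.
    real_loss = F.mse_loss(real_scores, torch.ones_like(real_scores))
    fake_loss = F.mse_loss(fake_scores, torch.zeros_like(fake_scores))
    return real_loss + fake_loss

def adversarial_loss(fake_scores):
    # Generator side: push fake scores toward the "real" label.
    return F.mse_loss(fake_scores, torch.ones_like(fake_scores))

def criterion(y, y_hat, n_fft=1024, hop_length=256):
    # L1 distance between magnitude spectrograms; a single-resolution stand-in
    # for a multi-resolution STFT loss.
    window = torch.hann_window(n_fft, device=y.device)
    spec_y = torch.stft(y.squeeze(1), n_fft, hop_length,
                        window=window, return_complex=True).abs()
    spec_y_hat = torch.stft(y_hat.squeeze(1), n_fft, hop_length,
                            window=window, return_complex=True).abs()
    return F.l1_loss(spec_y_hat, spec_y)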