def main():
    display_config()
    dataset_root = get_full_path(args.scale, args.train_set)
    print('Constructing dataset...')

    # Build the dataset and model from their respective factories.
    dataset_factory = DatasetFactory()
    train_dataset = dataset_factory.create_dataset(args.model, dataset_root)
    model_factory = ModelFactory()
    model = model_factory.create_model(args.model)

    # Pick the loss function matching the model by name.
    loss_fn = get_loss_fn(model.name)
    check_point = os.path.join('check_point', model.name, str(args.scale) + 'x')
    solver = Solver(model, check_point,
                    loss_fn=loss_fn,
                    batch_size=args.batch_size,
                    num_epochs=args.num_epochs,
                    learning_rate=args.learning_rate,
                    fine_tune=args.fine_tune,
                    verbose=args.verbose)

    print('Training...')
    solver.train(train_dataset)

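# Note: the snippets in this file call get_loss_fn() but never define it.
# Below is a minimal, hypothetical sketch of such a name-keyed dispatcher;
# the loss names and choices are illustrative assumptions, not the original
# implementation used by any of these projects.
import torch.nn as nn


def get_loss_fn(name='default'):
    """Return a loss callable for the given model/loss name (illustrative only)."""
    losses = {
        'default': nn.MSELoss(),
        'l1': nn.L1Loss(),
        'bce': nn.BCEWithLogitsLoss(),
    }
    if name not in losses:
        raise ValueError(f'Unknown loss name: {name}')
    return losses[name]
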
def train(model, optimizer, train_data, params):
    for step, batch in enumerate(tqdm(train_data), 1):
        batch_x = batch[0]
        batch_y = batch[1]
        targets = {"label": batch_y}
        optim_step(step, model, batch_x, targets, optimizer,
                   get_loss_fn(params["loss"]), params)

def train(models, optimizer, train_data, params):
    if params["loss"] == "none_loss":
        return
    loss_fn = get_loss_fn(params["loss"])
    for step, batch in enumerate(tqdm(train_data), 1):
        batch_x = batch[0]
        batch_y = batch[1]
        targets = {"label": batch_y}
        with tf.GradientTape() as g:
            outputs = models["model"](batch_x, training=True)
            # Auxiliary models supply the extra logits required by the
            # FSIW and ES-DFM losses; they are not updated here.
            if params["method"] == "FSIW":
                logits0 = models["fsiw0"](batch_x, training=False)["logits"]
                logits1 = models["fsiw1"](batch_x, training=False)["logits"]
                outputs = {
                    "logits": outputs["logits"],
                    "logits0": logits0,
                    "logits1": logits1
                }
            elif params["method"] == "ES-DFM":
                logitsx = models["esdfm"](batch_x, training=False)
                outputs = {
                    "logits": outputs["logits"],
                    "tn_logits": logitsx["tn_logits"],
                    "dp_logits": logitsx["dp_logits"]
                }
            reg_loss = tf.add_n(models["model"].losses)
            loss_dict = loss_fn(targets, outputs, params)
            loss = loss_dict["loss"] + reg_loss
        trainable_variables = models["model"].trainable_variables
        gradients = g.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))

def main(cfg):
    torch.cuda.empty_cache()
    torch.manual_seed(cfg.param.seed)

    # Training settings
    cwd = Path(hydra.utils.get_original_cwd())
    wsi_dir = cwd / cfg.dir.wsi
    patch_dir = cwd / cfg.dir.patch
    ckpt = Checkpoint(
        cwd, cfg.gpus, cfg.dir.resume, cfg.dir.save_to, cfg.log.save_model)
    device = torch.device(
        f"cuda:{cfg.gpus[0]}" if cfg.gpus[0] != -1 else "cpu")

    model = build_model(gpus=cfg.gpus)
    optimizer = RAdam(model.parameters(), lr=cfg.param.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=cfg.param.gamma)
    if cfg.dir.resume:
        model, optimizer, scheduler = ckpt.load_state(
            model, optimizer, scheduler)
    criterion = get_loss_fn()

    train_wsi, test_wsi = split_wsi(
        wsi_dir, ckpt.save_to, cwd,
        ratio=cfg.data.ratio,
        projects=cfg.data.projects,
        strategies=cfg.data.strategies,
        limit=cfg.data.limit)

    for epoch in range(ckpt.start_epoch, cfg.param.epochs + 1):
        split_data(
            patch_dir, ckpt.save_to, train_wsi, test_wsi,
            cfg.data.chunks, epoch, cfg.dir.resume)
        for chunk in range(ckpt.start_chunk, cfg.data.chunks):
            data_loader = get_loaders(
                cfg.param.batch_size, ckpt.save_to, chunk, cfg.gpus)
            train(
                model, device, data_loader, optimizer, scheduler, criterion,
                epoch, cfg.param.epochs, chunk, cfg.data.chunks, ckpt)
        ckpt.start_chunk = 0
        scheduler.step()
        ckpt.save(model, optimizer, scheduler, epoch, chunk, loss=False)
    ckpt.close_writer()

def main():
    display_config()
    print('Constructing dataset...')
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset = VSR_Dataset(dir=args.train_set,
                                trans=transforms.Compose([
                                    RandomCrop(48, args.scale),
                                    DataAug(),
                                    ToTensor()]))
    model_factory = ModelFactory()
    model = model_factory.create_model(args.model)

    # Report the number of trainable parameters, in millions.
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(1.0 * params / (1000 * 1000))

    loss_fn = get_loss_fn(model.name)
    check_point = os.path.join(args.checkpoint, model.name, str(args.scale) + 'x')
    if not os.path.exists(check_point):
        os.makedirs(check_point)
    solver = Solver(model, check_point, model.name,
                    loss_fn=loss_fn,
                    batch_size=args.batch_size,
                    num_epochs=args.num_epochs,
                    learning_rate=args.learning_rate,
                    fine_tune=args.fine_tune,
                    verbose=args.verbose)

    print('Training...')
    val_dataset = VSR_Dataset(dir=args.test_set,
                              trans=transforms.Compose([ToTensor()]))
    solver.train(train_dataset, val_dataset)

def run_test_single_dataset(net, test_dataloader, params, data_config,
                            train_index, start_time, use_wandb=True,
                            wandb_suffix=''):
    net.eval()
    ponder_weight = params['loss']['ponder_weight']
    loss_fn = get_loss_fn(params['loss'])
    fs = data_config['fs']

    total_losses = []
    ponder_losses = []
    enhance_losses = []
    sdrs = []
    per_db_results = {}
    test_index = 0
    audio_objs = []

    for (clean, noise, mix, file_db) in tqdm.tqdm(test_dataloader):
        clean, mix = clean.cuda(), mix.cuda()
        db = file_db[0][:-4]

        # Forward pass and loss computation only; no parameter update here.
        pred, ponder = net(mix)
        loss_enhance, loss_ponder = loss_fn(clean, pred, ponder)
        total_loss = loss_enhance + ponder_weight * loss_ponder

        if db not in per_db_results:
            per_db_results[db] = {'enhance': [], 'ponder': []}
        per_db_results[db]['enhance'].append(loss_enhance.item())
        per_db_results[db]['ponder'].append(loss_ponder.item())

        total_losses.append(total_loss.item())
        ponder_losses.append(loss_ponder.item())
        enhance_losses.append(loss_enhance.item())

        sdr = test_sisdr(pred, clean)
        sdrs.append(sdr.item())

        if test_index < params['log']['num_audio_save'] and use_wandb:
            audio_objs += make_wandb_audio(clean, 'clean_{}'.format(test_index), fs)
            audio_objs += make_wandb_audio(mix, 'mix_{}'.format(test_index), fs, is_multi=True)
            audio_objs += make_wandb_audio(pred, 'pred_{}'.format(test_index), fs)
        test_index += 1

    if use_wandb:
        wandb.log({'Test Outputs' + wandb_suffix: audio_objs})
        wandb.log({'Total Test Loss' + wandb_suffix: np.array(total_losses).mean()})
        wandb.log({'Ponder Test Loss' + wandb_suffix: np.array(ponder_losses).mean()})
        wandb.log({'Enhance Test Loss' + wandb_suffix: np.array(enhance_losses).mean()})
        wandb.log({'Test SDR' + wandb_suffix: np.array(sdrs).mean()})

        fig_ponder, _, ponder_stats = generate_plot(per_db_results, train_index)
        wandb.log({'per_db_ponder' + wandb_suffix: wandb.Image(fig_ponder)})
        fig_enhance, _, enhance_stats = generate_plot(per_db_results, train_index, metric='enhance')
        wandb.log({'per_db_enhance' + wandb_suffix: wandb.Image(fig_enhance)})

        save_dict(per_db_results, 'per_db_data' + wandb_suffix, train_index, start_time, params)
        save_dict(ponder_stats, 'per_db_ponder_stats' + wandb_suffix, train_index, start_time, params)
        save_dict(enhance_stats, 'per_db_enhance_stats' + wandb_suffix, train_index, start_time, params)

    net.train()
    return np.array(total_losses).mean(), np.array(ponder_losses).mean()

def main(params, gpu, start_time, str_params_function, use_wandb=True):
    # get the network ready
    lr = params['opt']['lr']
    grad_clip = params['opt']['grad_clip']
    ponder_weight = 0 if params['loss']['ponder_warmup'] else params['loss']['ponder_weight']

    net = PonderEnhancer(params['model'])
    torch.autograd.set_detect_anomaly(True)
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    loss_fn = get_loss_fn(params['loss'])

    if use_wandb:
        setup_wandb(params, net, start_time, str_params_function)

    net.cuda()
    net.train()

    stop_early = False
    running_test_losses = []
    index = 0
    print('------- Starting Training -------')
    try:
        for _ in tqdm.tqdm(range(params['epochs'])):
            train_dset = get_dataset(params['train_data_config'])
            train_dataloader = get_dataloader(params, train_dset)
            pbar_cur_loader = tqdm.tqdm(train_dataloader)
            for (clean, _, mix, _) in pbar_cur_loader:
                clean, mix = clean.cuda(), mix.cuda()

                # Train step
                opt.zero_grad()
                pred, ponder = net(mix, verbose=False)
                loss_enhance, loss_ponder = loss_fn(clean, pred, ponder)
                total_loss = loss_enhance + ponder_weight * loss_ponder
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), grad_clip)
                opt.step()
                index += 1

                if use_wandb:
                    wandb.log({"Total Train Loss": total_loss.item()})
                    wandb.log({"Ponder Train Loss": loss_ponder.item()})
                    wandb.log({"Enhance Train Loss": loss_enhance.item()})

                if index % params['log']['test_pd'] == 0:
                    # regular test that matches training
                    test_loss, test_ponder_loss = run_test(net, params, index, start_time, use_wandb)
                    running_test_losses.append([test_loss, test_ponder_loss])
                    stop_early = check_early_stopping_criterion(running_test_losses)

                if index % params['log']['ckpt_pd'] == 0:
                    save_model_ckpt(net, params, start_time, index, use_wandb)

                if stop_early:
                    save_model_ckpt(net, params, start_time, index, use_wandb)
                    exit()

                # Linearly ramp the ponder weight up to its target value during warm-up.
                if params['loss']['ponder_warmup']:
                    ponder_weight = params['loss']['ponder_weight']
                    ponder_weight *= min(1, index / (params['loss']['ponder_warmup'] * len(train_dataloader)))

                pbar_cur_loader.set_description("loss : {:10.8f}".format(total_loss.item()))
    except KeyboardInterrupt:
        pass

data_root = options.data_root
input_size = options.input_size
train_csv_path = data_root + "train.csv"
test_csv_path = data_root + "test.csv"
images_dir = data_root + "images/"
submission_df_path = data_root + "sample_submission.csv"

num_classes = 4
num_cv_folds = 5

device = get_device()
model, _ = init_model(num_classes, use_pretrained=options.pre_train)
feature_center = torch.zeros(4, 32 * model.num_features).to(device)
criterion = get_loss_fn()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001,
                            momentum=0.9, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.95)

tr_df_all = pd.read_csv(train_csv_path)
tr_df, val_df = train_test_split(tr_df_all, test_size=0.2)
val_df = val_df.reset_index(drop=True)
tr_df = tr_df.reset_index(drop=True)
te_df = pd.read_csv(test_csv_path)

def get_trainer(args, model, train_loader, val_loader, metrics):
    # setup optimizer
    # filter for trainable parameters (https://github.com/pytorch/pytorch/issues/679)
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = Adam(trainable_params, lr=args.lr, weight_decay=args.l2reg)

    # setup hook and logging
    hooks = [spk.train.MaxEpochHook(args.max_epochs)]
    if args.max_steps:
        hooks.append(spk.train.MaxStepHook(max_steps=args.max_steps))

    schedule = spk.train.ReduceLROnPlateauHook(
        optimizer=optimizer,
        patience=args.lr_patience,
        factor=args.lr_decay,
        min_lr=args.lr_min,
        window_length=1,
        stop_after_min=True,
    )
    hooks.append(schedule)
    hooks.append(PatchingHook())

    if args.logger == "csv":
        logger = spk.train.CSVHook(
            os.path.join(args.modelpath, "log"),
            metrics,
            every_n_epochs=args.log_every_n_epochs,
        )
        hooks.append(logger)
    elif args.logger == "tensorboard":
        logger = spk.train.TensorboardHook(
            os.path.join(args.modelpath, "log"),
            metrics,
            every_n_epochs=args.log_every_n_epochs,
        )
        hooks.append(logger)

    if args.save_n_steps > 0:
        saving_hook = SavingHook(args.save_n_steps)
        hooks.append(saving_hook)

    # setup loss function
    loss_fn = get_loss_fn(args)

    # setup trainer
    if args.loss == "smooth":
        trainer = SmoothTrainer(
            args.modelpath,
            model,
            loss_fn,
            optimizer,
            train_loader,
            val_loader,
            checkpoint_interval=args.checkpoint_interval,
            keep_n_checkpoints=args.keep_n_checkpoints,
            hooks=hooks,
        )
    else:
        trainer = spk.train.Trainer(
            args.modelpath,
            model,
            loss_fn,
            optimizer,
            train_loader,
            val_loader,
            checkpoint_interval=args.checkpoint_interval,
            keep_n_checkpoints=args.keep_n_checkpoints,
            hooks=hooks,
        )
    return trainer

def main(args):
    with open(args.config, 'r') as f:
        y = yaml.load(f, Loader=yaml.Loader)
    cfg = addict.Dict(y)
    cfg.general.config = args.config

    # misc
    device = cfg.general.device
    random.seed(cfg.general.random_state)
    os.environ['PYTHONHASHSEED'] = str(cfg.general.random_state)
    np.random.seed(cfg.general.random_state)
    torch.manual_seed(cfg.general.random_state)

    # log
    if cfg.general.expid == '':
        expid = dt.datetime.now().strftime('%Y%m%d%H%M%S')
        cfg.general.expid = expid
    else:
        expid = cfg.general.expid
    cfg.general.logdir = str(LOGDIR / expid)
    if not os.path.exists(cfg.general.logdir):
        os.makedirs(cfg.general.logdir)
    os.chmod(cfg.general.logdir, 0o777)
    logger = utils.get_logger(os.path.join(cfg.general.logdir, 'main.log'))
    logger.info(f'Logging at {cfg.general.logdir}')
    logger.info(cfg)
    shutil.copyfile(str(args.config), cfg.general.logdir + '/config.yaml')
    writer = SummaryWriter(cfg.general.logdir)

    # data
    X_train = np.load(cfg.data.X_train, allow_pickle=True)
    y_train = np.load(cfg.data.y_train, allow_pickle=True)
    logger.info('Loaded X_train, y_train')

    # CV
    kf = model_selection.__dict__[cfg.training.split](
        n_splits=cfg.training.n_splits, shuffle=True,
        random_state=cfg.general.random_state)  # noqa
    score_list = {'loss': [], 'score': []}
    for fold_i, (train_idx, valid_idx) in enumerate(
            kf.split(X=np.zeros(len(y_train)), y=y_train[:, 0])):
        if fold_i + 1 not in cfg.training.target_folds:
            continue
        X_train_ = X_train[train_idx]
        y_train_ = y_train[train_idx]
        X_valid_ = X_train[valid_idx]
        y_valid_ = y_train[valid_idx]

        _ratio = cfg.training.get('with_x_percent_fold_1_of_5', 0.)
        if _ratio > 0.:
            assert cfg.training.n_splits == 5 and fold_i + 1 == 1
            from sklearn.model_selection import train_test_split
            if _ratio == 0.95:
                _test_size = 0.25
            elif _ratio == 0.9:
                _test_size = 0.5
            else:
                raise NotImplementedError
            _X_train, X_valid_, _y_train, y_valid_ = train_test_split(
                X_valid_, y_valid_, test_size=_test_size,
                random_state=cfg.general.random_state)
            X_train_ = np.concatenate([X_train_, _X_train], axis=0)
            y_train_ = np.concatenate([y_train_, _y_train], axis=0)

        train_set = Dataset(X_train_, y_train_, cfg, mode='train')
        valid_set = Dataset(X_valid_, y_valid_, cfg, mode='valid')
        if fold_i == 0:
            logger.info(train_set.transform)
            logger.info(valid_set.transform)
        train_loader = DataLoader(train_set,
                                  batch_size=cfg.training.batch_size,
                                  shuffle=True,
                                  num_workers=cfg.training.n_worker,
                                  pin_memory=True)
        valid_loader = DataLoader(valid_set,
                                  batch_size=cfg.training.batch_size,
                                  shuffle=False,
                                  num_workers=cfg.training.n_worker,
                                  pin_memory=True)

        # model
        model = models.get_model(cfg=cfg)
        model = model.to(device)
        criterion = loss.get_loss_fn(cfg)
        optimizer = utils.get_optimizer(model.parameters(), config=cfg)
        scheduler = utils.get_lr_scheduler(optimizer, config=cfg)

        start_epoch = 1
        best = {'loss': 1e+9, 'score': -1.}
        is_best = {'loss': False, 'score': False}

        # resume
        if cfg.model.resume:
            if os.path.isfile(cfg.model.resume):
                checkpoint = torch.load(cfg.model.resume)
                start_epoch = checkpoint['epoch'] + 1
                best['loss'] = checkpoint['loss/best']
                best['score'] = checkpoint['score/best']
                if cfg.general.multi_gpu:
                    model.load_state_dict(
                        utils.fix_model_state_dict(checkpoint['state_dict']))
                else:
                    model.load_state_dict(checkpoint['state_dict'])
                if cfg.model.get('load_optimizer', True):
                    optimizer.load_state_dict(checkpoint['optimizer'])
                logger.info('Loaded checkpoint {} (epoch {})'.format(
                    cfg.model.resume, start_epoch - 1))
            else:
                raise IOError('No such file {}'.format(cfg.model.resume))

        if cfg.general.multi_gpu:
            model = nn.DataParallel(model)
        for epoch_i in range(start_epoch, cfg.training.epochs + 1):
            if scheduler is not None:
                if cfg.training.lr_scheduler.name == 'MultiStepLR':
                    optimizer.zero_grad()
                    optimizer.step()
                    scheduler.step()
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']

            _ohem_loss = (cfg.training.ohem_loss
                          and cfg.training.ohem_epoch < epoch_i)
            train = training(train_loader, model, criterion, optimizer,
                             config=cfg, using_ohem_loss=_ohem_loss,
                             lr=current_lr)
            valid = training(valid_loader, model, criterion, optimizer,
                             is_training=False, config=cfg, lr=current_lr)

            if scheduler is not None and cfg.training.lr_scheduler.name != 'MultiStepLR':
                if cfg.training.lr_scheduler.name == 'ReduceLROnPlateau':
                    if scheduler.mode == 'min':
                        value = valid['loss']
                    elif scheduler.mode == 'max':
                        value = valid['score']
                    else:
                        raise NotImplementedError
                    scheduler.step(value)
                else:
                    scheduler.step()

            is_best['loss'] = valid['loss'] < best['loss']
            is_best['score'] = valid['score'] > best['score']
            if is_best['loss']:
                best['loss'] = valid['loss']
            if is_best['score']:
                best['score'] = valid['score']

            model_state_dict = model.module.state_dict() \
                if cfg.general.multi_gpu else model.state_dict()  # noqa
            state_dict = {
                'epoch': epoch_i,
                'state_dict': model_state_dict,
                'optimizer': optimizer.state_dict(),
                'loss/valid': valid['loss'],
                'score/valid': valid['score'],
                'loss/best': best['loss'],
                'score/best': best['score'],
            }
            utils.save_checkpoint(
                state_dict,
                is_best,
                epoch_i,
                valid['loss'],
                valid['score'],
                Path(cfg.general.logdir) / f'fold_{fold_i}',
            )

            # tensorboard
            writer.add_scalar('Loss/Train', train['loss'], epoch_i)
            writer.add_scalar('Loss/Valid', valid['loss'], epoch_i)
            writer.add_scalar('Loss/Best', best['loss'], epoch_i)
            writer.add_scalar('Metrics/Train', train['score'], epoch_i)
            writer.add_scalar('Metrics/Valid', valid['score'], epoch_i)
            writer.add_scalar('Metrics/Best', best['score'], epoch_i)

            log = f'[{expid}] Fold {fold_i+1} Epoch {epoch_i}/{cfg.training.epochs} '
            log += f'[loss] {train["loss"]:.6f}/{valid["loss"]:.6f} '
            log += f'[score] {train["score"]:.6f}/{valid["score"]:.6f} '
            log += f'({best["score"]:.6f}) '
            log += f'lr {current_lr:.6f}'
            logger.info(log)

        score_list['loss'].append(best['loss'])
        score_list['score'].append(best['score'])
        if cfg.training.single_fold:
            break  # noqa

    log = f'[{expid}] '
    log += f'[loss] {cfg.training.n_splits}-fold/mean {np.mean(score_list["loss"]):.4f} '
    log += f'[score] {cfg.training.n_splits}-fold/mean {np.mean(score_list["score"]):.4f} '  # noqa
    logger.info(log)