def main():
    display_config()

    dataset_root = get_full_path(args.scale, args.train_set)

    print('Constructing dataset...')
    dataset_factory = DatasetFactory()
    train_dataset = dataset_factory.create_dataset(args.model, dataset_root)

    model_factory = ModelFactory()
    model = model_factory.create_model(args.model)

    loss_fn = get_loss_fn(model.name)

    check_point = os.path.join('check_point', model.name,
                               str(args.scale) + 'x')

    solver = Solver(model,
                    check_point,
                    loss_fn=loss_fn,
                    batch_size=args.batch_size,
                    num_epochs=args.num_epochs,
                    learning_rate=args.learning_rate,
                    fine_tune=args.fine_tune,
                    verbose=args.verbose)

    print('Training...')
    solver.train(train_dataset)
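
Every example on this page obtains its training objective from a get_loss_fn helper that maps a model or configuration name to a loss callable. The real implementations are project-specific; the snippet below is only a minimal sketch of such a factory, assuming PyTorch and using an illustrative model name ('LapSRN') and a Charbonnier penalty as stand-ins rather than the actual project code.

# Hypothetical sketch of a loss-function factory in the spirit of get_loss_fn.
import torch
import torch.nn as nn


def get_loss_fn(model_name):
    """Return a loss callable for the given model name (illustrative only)."""
    if model_name == 'LapSRN':
        # Charbonnier penalty: a smooth, differentiable approximation of L1
        def charbonnier(pred, target, eps=1e-6):
            return torch.mean(torch.sqrt((pred - target) ** 2 + eps))
        return charbonnier
    # default: pixel-wise mean squared error
    return nn.MSELoss()
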
Example #2
def train(model, optimizer, train_data, params):
    for step, batch in enumerate(tqdm(train_data), 1):
        batch_x = batch[0]
        batch_y = batch[1]
        targets = {"label": batch_y}
        optim_step(step, model, batch_x, targets, optimizer,
                   get_loss_fn(params["loss"]), params)


def train(models, optimizer, train_data, params):
    if params["loss"] == "none_loss":
        return
    loss_fn = get_loss_fn(params["loss"])
    for step, batch in enumerate(tqdm(train_data), 1):
        batch_x = batch[0]
        batch_y = batch[1]
        targets = {"label": batch_y}

        with tf.GradientTape() as g:
            outputs = models["model"](batch_x, training=True)
            if params["method"] == "FSIW":
                logits0 = models["fsiw0"](batch_x, training=False)["logits"]
                logits1 = models["fsiw1"](batch_x, training=False)["logits"]
                outputs = {
                    "logits": outputs["logits"],
                    "logits0": logits0,
                    "logits1": logits1
                }
            elif params["method"] == "ES-DFM":
                logitsx = models["esdfm"](batch_x, training=False)
                outputs = {
                    "logits": outputs["logits"],
                    "tn_logits": logitsx["tn_logits"],
                    "dp_logits": logitsx["dp_logits"]
                }
            reg_loss = tf.add_n(models["model"].losses)
            loss_dict = loss_fn(targets, outputs, params)
            loss = loss_dict["loss"] + reg_loss

        trainable_variables = models["model"].trainable_variables
        gradients = g.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))
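
The TensorFlow loop above assumes that get_loss_fn(params["loss"]) returns a callable of the form loss_fn(targets, outputs, params) -> {"loss": ...}. The snippet below is a minimal sketch of a compatible loss under that assumption; it applies plain sigmoid cross-entropy to outputs["logits"] and omits the project-specific FSIW and ES-DFM corrections.

# Hypothetical loss compatible with the training loop above:
# takes (targets, outputs, params) and returns a dict containing "loss".
import tensorflow as tf


def cross_entropy_loss(targets, outputs, params):
    labels = tf.reshape(tf.cast(targets["label"], tf.float32), (-1, 1))
    logits = tf.reshape(outputs["logits"], (-1, 1))
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))
    return {"loss": loss}
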
Example #4
def main(cfg):
    torch.cuda.empty_cache()
    torch.manual_seed(cfg.param.seed)

    # Training settings
    cwd = Path(hydra.utils.get_original_cwd())
    wsi_dir = cwd/cfg.dir.wsi
    patch_dir = cwd/cfg.dir.patch
    ckpt = Checkpoint(
        cwd, cfg.gpus, cfg.dir.resume, cfg.dir.save_to, cfg.log.save_model)

    device = torch.device(
        f"cuda:{cfg.gpus[0]}" if cfg.gpus[0] != -1 else "cpu")

    model = build_model(gpus=cfg.gpus)
    optimizer = RAdam(model.parameters(), lr=cfg.param.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=cfg.param.gamma)
    if cfg.dir.resume:
        model, optimizer, scheduler = ckpt.load_state(
            model, optimizer, scheduler)
    criterion = get_loss_fn()

    train_wsi, test_wsi = split_wsi(
        wsi_dir, ckpt.save_to, cwd, ratio=cfg.data.ratio,
        projects=cfg.data.projects, strategies=cfg.data.strategies,
        limit=cfg.data.limit)
    for epoch in range(ckpt.start_epoch, cfg.param.epochs + 1):
        split_data(
            patch_dir, ckpt.save_to, train_wsi, test_wsi, cfg.data.chunks,
            epoch, cfg.dir.resume)
        for chunk in range(ckpt.start_chunk, cfg.data.chunks):
            data_loader = get_loaders(
                cfg.param.batch_size, ckpt.save_to, chunk, cfg.gpus)
            train(
                model, device, data_loader, optimizer, scheduler, criterion,
                epoch, cfg.param.epochs, chunk, cfg.data.chunks, ckpt)

        ckpt.start_chunk = 0
        scheduler.step()
        ckpt.save(model, optimizer, scheduler, epoch, chunk, loss=False)

    ckpt.close_writer()
Example #5
def main():
    display_config()
    print('Constructing dataset...')
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_dataset = VSR_Dataset(dir=args.train_set,
                                trans=transforms.Compose([
                                    RandomCrop(48, args.scale),
                                    DataAug(),
                                    ToTensor()
                                ]))
    model_factory = ModelFactory()
    model = model_factory.create_model(args.model)
    # report the number of trainable parameters, in millions
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(1.0 * params / (1000 * 1000))

    loss_fn = get_loss_fn(model.name)

    check_point = os.path.join(args.checkpoint, model.name,
                               str(args.scale) + 'x')
    if not os.path.exists(check_point):
        os.makedirs(check_point)

    solver = Solver(model,
                    check_point,
                    model.name,
                    loss_fn=loss_fn,
                    batch_size=args.batch_size,
                    num_epochs=args.num_epochs,
                    learning_rate=args.learning_rate,
                    fine_tune=args.fine_tune,
                    verbose=args.verbose)

    print('Training...')
    val_dataset = VSR_Dataset(dir=args.test_set,
                              trans=transforms.Compose([ToTensor()]))
    solver.train(train_dataset, val_dataset)


def run_test_single_dataset(net, test_dataloader, params, data_config, train_index, start_time,
                            use_wandb=True, wandb_suffix=''):
    net.eval()
    ponder_weight = params['loss']['ponder_weight']
    loss_fn = get_loss_fn(params['loss'])
    fs = data_config['fs']

    total_losses = []
    ponder_losses = []
    enhance_losses = []
    sdrs = []
    per_db_results = {}

    test_index = 0
    audio_objs = []
    for (clean, noise, mix, file_db) in tqdm.tqdm(test_dataloader):
        clean, mix = clean.cuda(), mix.cuda()
        db = file_db[0][:-4]

        # Forward pass (evaluation only, no parameter updates)
        pred, ponder = net(mix)

        loss_enhance, loss_ponder = loss_fn(clean, pred, ponder)
        total_loss = loss_enhance + ponder_weight * loss_ponder

        if db not in per_db_results:
            per_db_results[db] = {'enhance': [], 'ponder': []}

        per_db_results[db]['enhance'].append(loss_enhance.item())
        per_db_results[db]['ponder'].append(loss_ponder.item())

        total_losses.append(total_loss.item())
        ponder_losses.append(loss_ponder.item())
        enhance_losses.append(loss_enhance.item())

        sdr = test_sisdr(pred, clean)
        sdrs.append(sdr.item())

        if test_index < params['log']['num_audio_save'] and use_wandb:
            audio_objs += make_wandb_audio(clean,
                                           'clean_{}'.format(test_index), fs)
            audio_objs += make_wandb_audio(mix,
                                           'mix_{}'.format(test_index), fs, is_multi=True)
            audio_objs += make_wandb_audio(pred,
                                           'pred_{}'.format(test_index), fs)

        test_index += 1

    if use_wandb:
        wandb.log({'Test Outputs' + wandb_suffix: audio_objs})
        wandb.log({'Total Test Loss' + wandb_suffix: np.array(total_losses).mean()})
        wandb.log({'Ponder Test Loss' + wandb_suffix: np.array(ponder_losses).mean()})
        wandb.log({'Enhance Test Loss' + wandb_suffix: np.array(enhance_losses).mean()})
        wandb.log({'Test SDR' + wandb_suffix: np.array(sdrs).mean()})

        fig_ponder, _, ponder_stats = generate_plot(
            per_db_results, train_index)
        wandb.log({'per_db_ponder' + wandb_suffix: wandb.Image(fig_ponder)})

        fig_enhance, _, enhance_stats = generate_plot(per_db_results, train_index,
                                                      metric='enhance')
        wandb.log({'per_db_enhance' + wandb_suffix: wandb.Image(fig_enhance)})

        save_dict(per_db_results, 'per_db_data' + wandb_suffix, train_index, start_time, params)
        save_dict(ponder_stats, 'per_db_ponder_stats' + wandb_suffix, train_index, start_time, params)
        save_dict(enhance_stats, 'per_db_enhance_stats' + wandb_suffix, train_index, start_time, params)

    net.train()

    return np.array(total_losses).mean(), np.array(ponder_losses).mean()
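
In this example, get_loss_fn(params['loss']) is expected to return a callable of the form loss_fn(clean, pred, ponder) -> (loss_enhance, loss_ponder), with the caller applying ponder_weight itself. The sketch below is a hypothetical instance of that interface, pairing an L1 reconstruction term with the mean ponder cost; the actual project loss may differ.

# Hypothetical ponder-style loss matching the interface used above.
import torch.nn.functional as F


def ponder_l1_loss(clean, pred, ponder):
    # Reconstruction term plus average ponder cost; the caller combines
    # them as loss_enhance + ponder_weight * loss_ponder.
    loss_enhance = F.l1_loss(pred, clean)
    loss_ponder = ponder.mean()
    return loss_enhance, loss_ponder
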
def main(params, gpu, start_time, str_params_function, use_wandb=True):
    # get the network ready
    lr = params['opt']['lr']
    grad_clip = params['opt']['grad_clip']
    ponder_weight = 0 if params['loss']['ponder_warmup'] else params['loss']['ponder_weight']

    net = PonderEnhancer(params['model'])

    torch.autograd.set_detect_anomaly(True)
    opt = torch.optim.Adam(net.parameters(), lr=lr)

    loss_fn = get_loss_fn(params['loss'])

    if use_wandb:
        setup_wandb(params, net, start_time, str_params_function)

    net.cuda()
    net.train()

    stop_early = False
    running_test_losses = []

    index = 0
    print('------- Starting Training -------')
    try:
        for _ in tqdm.tqdm(range(params['epochs'])):
            train_dset = get_dataset(params['train_data_config'])
            train_dataloader = get_dataloader(params, train_dset)

            pbar_cur_loader = tqdm.tqdm(train_dataloader)
            for (clean, _, mix, _) in pbar_cur_loader:
                clean, mix = clean.cuda(), mix.cuda()

                # Train
                opt.zero_grad()
                pred, ponder = net(mix, verbose=False)
                loss_enhance, loss_ponder = loss_fn(clean, pred, ponder)
                total_loss = loss_enhance + ponder_weight * loss_ponder

                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), grad_clip)
                opt.step()

                index += 1

                if use_wandb:
                    wandb.log({"Total Train Loss": total_loss.item()})
                    wandb.log({"Ponder Train Loss": loss_ponder.item()})
                    wandb.log({"Enhance Train Loss": loss_enhance.item()})

                if index % params['log']['test_pd'] == 0:
                    # regular test that matches training
                    test_loss, test_ponder_loss = run_test(net, params, index, start_time, use_wandb)
                    running_test_losses.append([test_loss, test_ponder_loss])
                    stop_early = check_early_stopping_criterion(running_test_losses)

                if index % params['log']['ckpt_pd'] == 0:
                    save_model_ckpt(net, params, start_time, index, use_wandb)

                if stop_early:
                    save_model_ckpt(net, params, start_time, index, use_wandb)
                    exit()

                if params['loss']['ponder_warmup']:
                    ponder_weight = params['loss']['ponder_weight']
                    ponder_weight *= min(1, index /
                                         (params['loss']['ponder_warmup'] * len(train_dataloader)))

                pbar_cur_loader.set_description("loss : {:10.8f}".format(total_loss.item()))


    except KeyboardInterrupt:
        pass
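
The warmup branch near the end of the loop above ramps ponder_weight linearly from 0 to its configured value over ponder_warmup epochs' worth of steps. The same schedule, isolated as a small hypothetical helper (names are illustrative):

def warmed_up_ponder_weight(base_weight, step, warmup_epochs, steps_per_epoch):
    # Linearly ramps the weight from 0 to base_weight over the first
    # warmup_epochs * steps_per_epoch optimisation steps.
    if not warmup_epochs:
        return base_weight
    progress = step / (warmup_epochs * steps_per_epoch)
    return base_weight * min(1.0, progress)

# e.g. warmed_up_ponder_weight(0.01, step=500, warmup_epochs=2, steps_per_epoch=1000) == 0.0025
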
Example #8
    data_root = options.data_root
    input_size = options.input_size

    train_csv_path = data_root + "train.csv"
    test_csv_path = data_root + "test.csv"
    images_dir = data_root + "images/"
    submission_df_path = data_root + "sample_submission.csv"

    num_classes = 4
    num_cv_folds = 5

    device = get_device()
    model, _ = init_model(num_classes, use_pretrained=options.pre_train)

    feature_center = torch.zeros(4, 32 * model.num_features).to(device)
    criterion = get_loss_fn()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.001,
                                momentum=0.9,
                                weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=2,
                                                gamma=0.95)

    tr_df_all = pd.read_csv(train_csv_path)
    tr_df, val_df = train_test_split(tr_df_all, test_size=0.2)
    val_df = val_df.reset_index(drop=True)
    tr_df = tr_df.reset_index(drop=True)
    te_df = pd.read_csv(test_csv_path)


def get_trainer(args, model, train_loader, val_loader, metrics):
    # setup optimizer
    # filter for trainable parameters (https://github.com/pytorch/pytorch/issues/679)
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = Adam(trainable_params, lr=args.lr, weight_decay=args.l2reg)

    # setup hook and logging
    hooks = [spk.train.MaxEpochHook(args.max_epochs)]
    if args.max_steps:
        hooks.append(spk.train.MaxStepHook(max_steps=args.max_steps))

    schedule = spk.train.ReduceLROnPlateauHook(
        optimizer=optimizer,
        patience=args.lr_patience,
        factor=args.lr_decay,
        min_lr=args.lr_min,
        window_length=1,
        stop_after_min=True,
    )
    hooks.append(schedule)
    hooks.append(PatchingHook())

    if args.logger == "csv":
        logger = spk.train.CSVHook(
            os.path.join(args.modelpath, "log"),
            metrics,
            every_n_epochs=args.log_every_n_epochs,
        )
        hooks.append(logger)
    elif args.logger == "tensorboard":
        logger = spk.train.TensorboardHook(
            os.path.join(args.modelpath, "log"),
            metrics,
            every_n_epochs=args.log_every_n_epochs,
        )
        hooks.append(logger)

    if args.save_n_steps > 0:
        saving_hook = SavingHook(args.save_n_steps)
        hooks.append(saving_hook)

    # setup loss function
    loss_fn = get_loss_fn(args)

    # setup trainer
    if args.loss == "smooth":
        trainer = SmoothTrainer(
            args.modelpath,
            model,
            loss_fn,
            optimizer,
            train_loader,
            val_loader,
            checkpoint_interval=args.checkpoint_interval,
            keep_n_checkpoints=args.keep_n_checkpoints,
            hooks=hooks,
        )
    else:
        trainer = spk.train.Trainer(
            args.modelpath,
            model,
            loss_fn,
            optimizer,
            train_loader,
            val_loader,
            checkpoint_interval=args.checkpoint_interval,
            keep_n_checkpoints=args.keep_n_checkpoints,
            hooks=hooks,
        )
    return trainer


def main(args):
    with open(args.config, 'r') as f:
        y = yaml.load(f, Loader=yaml.Loader)
    cfg = addict.Dict(y)
    cfg.general.config = args.config

    # misc
    device = cfg.general.device
    random.seed(cfg.general.random_state)
    os.environ['PYTHONHASHSEED'] = str(cfg.general.random_state)
    np.random.seed(cfg.general.random_state)
    torch.manual_seed(cfg.general.random_state)

    # log
    if cfg.general.expid == '':
        expid = dt.datetime.now().strftime('%Y%m%d%H%M%S')
        cfg.general.expid = expid
    else:
        expid = cfg.general.expid
    cfg.general.logdir = str(LOGDIR / expid)
    if not os.path.exists(cfg.general.logdir):
        os.makedirs(cfg.general.logdir)
    os.chmod(cfg.general.logdir, 0o777)
    logger = utils.get_logger(os.path.join(cfg.general.logdir, 'main.log'))
    logger.info(f'Logging at {cfg.general.logdir}')
    logger.info(cfg)
    shutil.copyfile(str(args.config), cfg.general.logdir + '/config.yaml')
    writer = SummaryWriter(cfg.general.logdir)

    # data
    X_train = np.load(cfg.data.X_train, allow_pickle=True)
    y_train = np.load(cfg.data.y_train, allow_pickle=True)
    logger.info('Loaded X_train, y_train')
    # CV
    kf = model_selection.__dict__[cfg.training.split](
        n_splits=cfg.training.n_splits,
        shuffle=True,
        random_state=cfg.general.random_state)  # noqa
    score_list = {'loss': [], 'score': []}
    for fold_i, (train_idx, valid_idx) in enumerate(
            kf.split(X=np.zeros(len(y_train)), y=y_train[:, 0])):
        if fold_i + 1 not in cfg.training.target_folds:
            continue
        X_train_ = X_train[train_idx]
        y_train_ = y_train[train_idx]
        X_valid_ = X_train[valid_idx]
        y_valid_ = y_train[valid_idx]
        _ratio = cfg.training.get('with_x_percent_fold_1_of_5', 0.)
        if _ratio > 0.:
            # Move part of the validation fold back into training so the overall
            # training share becomes _ratio: 0.8 + 0.2 * (1 - _test_size).
            assert cfg.training.n_splits == 5 and fold_i + 1 == 1
            from sklearn.model_selection import train_test_split
            if _ratio == 0.95:
                _test_size = 0.25
            elif _ratio == 0.9:
                _test_size = 0.5
            else:
                raise NotImplementedError
            _X_train, X_valid_, _y_train, y_valid_ = train_test_split(
                X_valid_,
                y_valid_,
                test_size=_test_size,
                random_state=cfg.general.random_state)
            X_train_ = np.concatenate([X_train_, _X_train], axis=0)
            y_train_ = np.concatenate([y_train_, _y_train], axis=0)
        train_set = Dataset(X_train_, y_train_, cfg, mode='train')
        valid_set = Dataset(X_valid_, y_valid_, cfg, mode='valid')
        if fold_i == 0:
            logger.info(train_set.transform)
            logger.info(valid_set.transform)
        train_loader = DataLoader(train_set,
                                  batch_size=cfg.training.batch_size,
                                  shuffle=True,
                                  num_workers=cfg.training.n_worker,
                                  pin_memory=True)
        valid_loader = DataLoader(valid_set,
                                  batch_size=cfg.training.batch_size,
                                  shuffle=False,
                                  num_workers=cfg.training.n_worker,
                                  pin_memory=True)

        # model
        model = models.get_model(cfg=cfg)
        model = model.to(device)
        criterion = loss.get_loss_fn(cfg)
        optimizer = utils.get_optimizer(model.parameters(), config=cfg)
        scheduler = utils.get_lr_scheduler(optimizer, config=cfg)

        start_epoch = 1
        best = {'loss': 1e+9, 'score': -1.}
        is_best = {'loss': False, 'score': False}

        # resume
        if cfg.model.resume:
            if os.path.isfile(cfg.model.resume):
                checkpoint = torch.load(cfg.model.resume)
                start_epoch = checkpoint['epoch'] + 1
                best['loss'] = checkpoint['loss/best']
                best['score'] = checkpoint['score/best']
                if cfg.general.multi_gpu:
                    model.load_state_dict(
                        utils.fix_model_state_dict(checkpoint['state_dict']))
                else:
                    model.load_state_dict(checkpoint['state_dict'])
                if cfg.model.get('load_optimizer', True):
                    optimizer.load_state_dict(checkpoint['optimizer'])
                logger.info('Loaded checkpoint {} (epoch {})'.format(
                    cfg.model.resume, start_epoch - 1))
            else:
                raise IOError('No such file {}'.format(cfg.model.resume))

        if cfg.general.multi_gpu:
            model = nn.DataParallel(model)

        for epoch_i in range(start_epoch, cfg.training.epochs + 1):
            if scheduler is not None:
                if cfg.training.lr_scheduler.name == 'MultiStepLR':
                    # MultiStepLR is advanced at the start of every epoch; the dummy
                    # optimizer step avoids PyTorch's warning about calling
                    # scheduler.step() before optimizer.step().
                    optimizer.zero_grad()
                    optimizer.step()
                    scheduler.step()
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']
            _ohem_loss = (cfg.training.ohem_loss
                          and cfg.training.ohem_epoch < epoch_i)
            train = training(train_loader,
                             model,
                             criterion,
                             optimizer,
                             config=cfg,
                             using_ohem_loss=_ohem_loss,
                             lr=current_lr)
            valid = training(valid_loader,
                             model,
                             criterion,
                             optimizer,
                             is_training=False,
                             config=cfg,
                             lr=current_lr)

            if scheduler is not None and cfg.training.lr_scheduler.name != 'MultiStepLR':
                if cfg.training.lr_scheduler.name == 'ReduceLROnPlateau':
                    if scheduler.mode == 'min':
                        value = valid['loss']
                    elif scheduler.mode == 'max':
                        value = valid['score']
                    else:
                        raise NotImplementedError
                    scheduler.step(value)
                else:
                    scheduler.step()

            is_best['loss'] = valid['loss'] < best['loss']
            is_best['score'] = valid['score'] > best['score']
            if is_best['loss']:
                best['loss'] = valid['loss']
            if is_best['score']:
                best['score'] = valid['score']
            model_state_dict = (model.module.state_dict()
                                if cfg.general.multi_gpu else model.state_dict())
            state_dict = {
                'epoch': epoch_i,
                'state_dict': model_state_dict,
                'optimizer': optimizer.state_dict(),
                'loss/valid': valid['loss'],
                'score/valid': valid['score'],
                'loss/best': best['loss'],
                'score/best': best['score'],
            }
            utils.save_checkpoint(
                state_dict,
                is_best,
                epoch_i,
                valid['loss'],
                valid['score'],
                Path(cfg.general.logdir) / f'fold_{fold_i}',
            )

            # tensorboard
            writer.add_scalar('Loss/Train', train['loss'], epoch_i)
            writer.add_scalar('Loss/Valid', valid['loss'], epoch_i)
            writer.add_scalar('Loss/Best', best['loss'], epoch_i)
            writer.add_scalar('Metrics/Train', train['score'], epoch_i)
            writer.add_scalar('Metrics/Valid', valid['score'], epoch_i)
            writer.add_scalar('Metrics/Best', best['score'], epoch_i)

            log = f'[{expid}] Fold {fold_i+1} Epoch {epoch_i}/{cfg.training.epochs} '
            log += f'[loss] {train["loss"]:.6f}/{valid["loss"]:.6f} '
            log += f'[score] {train["score"]:.6f}/{valid["score"]:.6f} '
            log += f'({best["score"]:.6f}) '
            log += f'lr {current_lr:.6f}'
            logger.info(log)

        score_list['loss'].append(best['loss'])
        score_list['score'].append(best['score'])
        if cfg.training.single_fold: break  # noqa

    log = f'[{expid}] '
    log += f'[loss] {cfg.training.n_splits}-fold/mean {np.mean(score_list["loss"]):.4f} '
    log += f'[score] {cfg.training.n_splits}-fold/mean {np.mean(score_list["score"]):.4f} '  # noqa
    logger.info(log)