Code example #1
File: train.py  Project: yuzhongshanyue/asteroid
import os
import json

import yaml
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# make_dataloaders, make_model_and_optimizer, ChimeraLoss and ChimeraSystem
# are project-local helpers defined elsewhere in this recipe.


def main(conf):
    exp_dir = conf['main_args']['exp_dir']
    # Define Dataloader
    train_loader, val_loader = make_dataloaders(**conf['data'],
                                                **conf['training'])
    conf['masknet'].update({'n_src': conf['data']['n_src']})
    # Define model, optimizer + scheduler
    model, optimizer = make_model_and_optimizer(conf)
    scheduler = None
    if conf['training']['half_lr']:
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5,
                                      patience=5)

    # Save config
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, 'conf.yml')
    with open(conf_path, 'w') as outfile:
        yaml.safe_dump(conf, outfile)

    # Define loss function
    loss_func = ChimeraLoss(alpha=conf['training']['loss_alpha'])
    # Put together in System
    system = ChimeraSystem(model=model, loss_func=loss_func,
                           optimizer=optimizer, train_loader=train_loader,
                           val_loader=val_loader, scheduler=scheduler,
                           config=conf)

    # Callbacks
    checkpoint_dir = os.path.join(exp_dir, 'checkpoints/')
    checkpoint = ModelCheckpoint(checkpoint_dir, monitor='val_loss',
                                 mode='min', save_top_k=5, verbose=True)
    early_stopping = False
    if conf['training']['early_stop']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                                       verbose=True)
    gpus = -1
    # Don't ask for GPUs if none are available.
    if not torch.cuda.is_available():
        print('No GPU available, setting gpus to None')
        gpus = None

    # Train model (argument names follow the older pytorch-lightning API
    # this snippet was written against; example #2 uses the renamed ones).
    trainer = pl.Trainer(max_nb_epochs=conf['training']['epochs'],
                         checkpoint_callback=checkpoint,
                         early_stop_callback=early_stopping,
                         default_save_path=exp_dir,
                         gpus=gpus,
                         distributed_backend='dp',
                         train_percent_check=1.0,  # Set below 1.0 for quick runs
                         gradient_clip_val=200,)
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)
    # Save last model for convenience
    torch.save(system.model.state_dict(),
               os.path.join(exp_dir, 'checkpoints/final.pth'))
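
For context, here is a minimal, hypothetical driver for this function. The 'local/conf.yml' path, the --exp_dir flag and its default are assumptions for illustration; they are not taken from the original train.py.

import argparse
import yaml

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Flag name and default are illustrative assumptions.
    parser.add_argument('--exp_dir', default='exp/tmp',
                        help='Directory for checkpoints, logs and conf.yml')
    args = parser.parse_args()

    # Load the recipe config and record where results should be written.
    with open('local/conf.yml') as f:
        conf = yaml.safe_load(f)
    conf['main_args'] = {'exp_dir': args.exp_dir}
    main(conf)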
Code example #2
# Same imports as example #1.
def main(conf):
    exp_dir = conf["main_args"]["exp_dir"]
    # Define Dataloader
    train_loader, val_loader = make_dataloaders(**conf["data"], **conf["training"])
    conf["masknet"].update({"n_src": conf["data"]["n_src"]})
    # Define model, optimizer + scheduler
    model, optimizer = make_model_and_optimizer(conf)
    scheduler = None
    if conf["training"]["half_lr"]:
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5)

    # Save config
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)

    # Define loss function
    loss_func = ChimeraLoss(alpha=conf["training"]["loss_alpha"])
    # Put together in System
    system = ChimeraSystem(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        config=conf,
    )

    # Callbacks
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(
        checkpoint_dir, monitor="val_loss", mode="min", save_top_k=5, verbose=True
    )
    early_stopping = False
    if conf["training"]["early_stop"]:
        early_stopping = EarlyStopping(monitor="val_loss", patience=30, verbose=True)
    gpus = -1
    # Don't ask for GPUs if none are available.
    if not torch.cuda.is_available():
        print("No GPU available, setting gpus to None")
        gpus = None

    # Train model
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stopping,
        default_root_dir=exp_dir,
        gpus=gpus,
        distributed_backend="dp",
        train_percent_check=1.0,  # Set below 1.0 for quick debugging runs
        gradient_clip_val=200,
    )
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)
    # Save last model for convenience
    torch.save(system.model.state_dict(), os.path.join(exp_dir, "checkpoints/final.pth"))
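
For reference, the keys that main() reads imply a config shaped roughly as below. All concrete values are placeholders, and a real recipe would carry additional dataloader and optimizer keys consumed by make_dataloaders and make_model_and_optimizer.

# Sketch of the config structure implied by the reads in main().
# Values are illustrative placeholders, not taken from the project.
conf = {
    "main_args": {"exp_dir": "exp/tmp"},  # usually filled in by the CLI driver
    "data": {"n_src": 2},                 # n_src is also copied into "masknet"
    "training": {
        "epochs": 200,
        "half_lr": True,      # enables the ReduceLROnPlateau scheduler
        "early_stop": True,   # enables the EarlyStopping callback
        "loss_alpha": 0.1,    # weight passed to ChimeraLoss
    },
    "masknet": {},            # updated with n_src at runtime
}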