def main(conf): exp_dir = conf["main_args"]["exp_dir"] # Define Dataloader train_loader, val_loader = make_dataloaders(**conf["data"], **conf["training"]) conf["masknet"].update({"n_src": conf["data"]["n_src"]}) # Define model, optimizer + scheduler model, optimizer = make_model_and_optimizer(conf) scheduler = None if conf["training"]["half_lr"]: scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5) # Save config os.makedirs(exp_dir, exist_ok=True) conf_path = os.path.join(exp_dir, "conf.yml") with open(conf_path, "w") as outfile: yaml.safe_dump(conf, outfile) # Define loss function loss_func = ChimeraLoss(alpha=conf["training"]["loss_alpha"]) # Put together in System system = ChimeraSystem( model=model, loss_func=loss_func, optimizer=optimizer, train_loader=train_loader, val_loader=val_loader, scheduler=scheduler, config=conf, ) # Callbacks checkpoint_dir = os.path.join(exp_dir, "checkpoints/") checkpoint = ModelCheckpoint( checkpoint_dir, monitor="val_loss", mode="min", save_top_k=5, verbose=True ) early_stopping = False if conf["training"]["early_stop"]: early_stopping = EarlyStopping(monitor="val_loss", patience=30, verbose=True) gpus = -1 # Don't ask GPU if they are not available. if not torch.cuda.is_available(): print("No available GPU were found, set gpus to None") gpus = None # Train model trainer = pl.Trainer( max_epochs=conf["training"]["epochs"], checkpoint_callback=checkpoint, early_stop_callback=early_stopping, default_root_dir=exp_dir, gpus=gpus, distributed_backend="dp", train_percent_check=1.0, # Useful for fast experiment gradient_clip_val=200, ) trainer.fit(system) best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: json.dump(best_k, f, indent=0) # Save last model for convenience torch.save(system.model.state_dict(), os.path.join(exp_dir, "checkpoints/final.pth"))