def main(hparams):
    """Main training routine specific for this project.

    :param hparams: parsed argparse namespace carrying model/experiment settings
        (must provide ``log_dir``)
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # init experiment
    # BUGFIX: use the `hparams` argument, not the module-level `hyperparams`
    # global — relying on the global breaks reuse/import of this function.
    exp = Experiment(name='test_exp',
                     save_dir=hparams.log_dir,
                     autosave=False,
                     description='test demo')

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    # NOTE(review): gpus/nodes are hard-coded here (8 GPUs x 2 nodes) —
    # presumably intentional for this demo; confirm before generalizing.
    trainer = Trainer(experiment=exp, gpus=8, nb_gpu_nodes=2)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams, cluster, results_dict):
    """Main training routine for multi-node SLURM grid-search runs.

    :param hparams: parsed argparse namespace carrying model/experiment settings
    :param cluster: test-tube cluster handle for this SLURM job
    :param results_dict: dict test-tube uses to collect trial results (unused here)
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version; stagger startup per node
    # so each process claims a distinct version.
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    # BUGFIX: use the `hparams` argument, not the module-level `hyperparams`
    # global — the function previously ignored part of its own argument.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    # val_acc is maximized; val_loss is minimized — modes must match.
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=hparams.gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes)

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """Main training routine for a single-node AMP (mixed-precision) run.

    :param hparams: parsed argparse namespace carrying model/experiment settings
        (must provide ``experiment_name``, ``test_tube_save_path``, ``gpus``)
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # init experiment
    # BUGFIX: use the `hparams` argument, not the module-level `hyperparams`
    # global — the function previously ignored part of its own argument.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    # use_amp=True enables mixed-precision training.
    trainer = Trainer(experiment=exp,
                      gpus=hparams.gpus,
                      use_amp=True)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """Main training routine using all-default Trainer settings.

    :param hparams: parsed argparse namespace carrying model/experiment settings
        (must provide ``experiment_name``, ``test_tube_save_path``)
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT EXP
    # ------------------------
    # init experiment
    # BUGFIX: use the `hparams` argument, not the module-level `hyperparams`
    # global — the function previously ignored part of its own argument.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """Main training routine with early-stopping and checkpoint callbacks.

    :param hparams: parsed argparse namespace carrying model/experiment settings
        (must provide ``experiment_name``, ``test_tube_save_path``,
        ``model_save_path``)
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # init experiment
    # BUGFIX: use the `hparams` argument, not the module-level `hyperparams`
    # global — the function previously ignored part of its own argument.
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='test demo'
    )
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    # val_acc is maximized; val_loss is minimized — modes must match.
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        verbose=True,
        mode='max'
    )
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """Main training routine specific for this project.

    :param hparams: parsed argparse namespace carrying experiment settings
    :return: None
    """
    # Configure the test-tube experiment and persist the CLI arguments.
    exp_settings = dict(
        name=hparams.tt_name,
        debug=hparams.debug,
        save_dir=hparams.tt_save_path,
        version=hparams.hpc_exp_number,
        autosave=False,
        description=hparams.tt_description,
    )
    exp = Experiment(**exp_settings)
    exp.argparse(hparams)
    exp.save()

    # Instantiate the model under training.
    model = LightningTemplateModel(hparams)

    # Hand everything to the trainer and run the fit loop.
    trainer = Trainer(experiment=exp)
    trainer.fit(model)
def main(hparams):
    """Main training routine with early-stopping and checkpointing on val_acc.

    :param hparams: parsed argparse namespace carrying experiment settings
        (must provide ``tt_name``, ``debug``, ``tt_save_path``,
        ``hpc_exp_number``, ``tt_description``, ``model_save_path``)
    :return: None
    """
    # init experiment
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     save_dir=hparams.tt_save_path,
                     version=hparams.hpc_exp_number,
                     autosave=False,
                     description=hparams.tt_description)
    exp.argparse(hparams)
    exp.save()

    # build model
    model = LightningTemplateModel(hparams)

    # callbacks
    # BUGFIX: val_acc must be maximized — mode='min' would stop early on the
    # WORST accuracy and checkpoint the worst model.
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        mode='max',
        verbose=True,
    )

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_acc',
                                 mode='max')

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # train model
    trainer.fit(model)
def main(hparams, cluster):
    """Main training routine for multi-node SLURM grid-search runs.

    :param hparams: parsed argparse namespace carrying model/experiment settings
        (must provide ``experiment_name``, ``test_tube_save_path``,
        ``hpc_exp_number``, ``per_experiment_nb_gpus``, ``nb_gpu_nodes``,
        ``distributed_backend``)
    :param cluster: test-tube cluster handle for this SLURM job (unused here)
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version; stagger startup per node
    # so each process claims a distinct version.
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    # BUGFIX: use the `hparams` argument, not the module-level `hyperparams`
    # global — the function previously ignored part of its own argument.
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        version=hparams.hpc_exp_number,  # match the slurm job version number
        description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      gpus=hparams.per_experiment_nb_gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes,
                      distributed_backend=hparams.distributed_backend)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
default=test_tube_dir, help='where to save logs') parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, help='where to save slurm meta') parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, help='how many grid search trials to run') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args( parent_parser, root_dir) hyperparams = parser.parse_args() # --------------------- # RUN TRAINING # --------------------- # run on HPC cluster print('RUNNING ON SLURM CLUSTER') optimize_on_cluster(hyperparams)