def main():
    """Parse CLI/model arguments, build the VAMPIRE model, and run training."""
    # Shared parser; VAMPIRE's model-specific args are layered on top below.
    base_parser = HyperOptArgumentParser(strategy="grid_search", add_help=False)

    log_root = "logs"
    # All three paths live under the same log root.
    for flag, leaf in (
        ("--test_tube_save_path", "test_tube_data"),
        ("--model_save_path", "model_weights"),
        ("--experiment_name", "vampire"),
    ):
        base_parser.add_argument(flag, default=os.path.join(log_root, leaf))

    full_parser = VAMPIRE.add_model_specific_args(base_parser, ".")
    hparams = full_parser.parse_args()

    model = VAMPIRE(hparams)

    # test-tube experiment for logging; autosave is off, so save explicitly.
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
    )
    exp.argparse(hparams)
    exp.save()

    trainer = Trainer(experiment=exp, fast_dev_run=False)
    trainer.fit(model)
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project.

    :param hparams: argparse-style namespace with experiment/trainer settings
    :param cluster: SLURM cluster handle forwarded to the Trainer
    :param results_dict: unused; kept for the hyperopt callback signature
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version, so stagger each node's
    # start by its SLURM node id
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    # BUGFIX: the original referenced the undefined name `hyperparams`;
    # the parameter is `hparams`, so this raised NameError at runtime.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    # BUGFIX: nb_gpu_nodes also referenced the undefined `hyperparams`.
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=hparams.gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes)

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """
    Main training routine specific for this project.

    :param hparams: argparse-style namespace with experiment/trainer settings
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # init experiment
    # BUGFIX: the original referenced the undefined name `hyperparams`;
    # the parameter is `hparams`, so this raised NameError at runtime.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """Train DSANet on the configured dataset and report where logs live."""
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    net = DSANet(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # Experiment name encodes the dataset/window/horizon configuration.
    run_name = 'dsanet_exp_{}_window={}_horizon={}'.format(
        hparams.data_name, hparams.window, hparams.horizon)
    exp = Experiment(name=run_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(monitor='val_loss',
                               patience=5,
                               verbose=True,
                               mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        experiment=exp,
        early_stop_callback=early_stop,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(net)

    print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd())
    print('and going to http://localhost:6006 on your browser')
def main(hparams):
    """
    Main training routine specific for this project.

    :param hparams: argparse-style namespace with experiment/trainer settings
    :return:
    """
    # init experiment
    exp = Experiment(
        name=hparams.tt_name,
        debug=hparams.debug,
        save_dir=hparams.tt_save_path,
        version=hparams.hpc_exp_number,
        autosave=False,
        description=hparams.tt_description
    )
    exp.argparse(hparams)
    exp.save()

    # build model
    model = ExampleModel(hparams)

    # callbacks
    # BUGFIX: 'val_acc' is a higher-is-better metric, but mode was 'min',
    # which stopped training when accuracy *improved* — use mode='max'.
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        mode='max',
        verbose=True,
    )

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    # BUGFIX: same inversion here — with mode='min' the checkpoint kept the
    # *worst*-accuracy weights; 'max' keeps the best.
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_function=None,
        save_best_only=True,
        verbose=True,
        monitor='val_acc',
        mode='max'
    )

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # train model
    trainer.fit(model)
def main(hparams):
    """Build the model, wire up logging and callbacks, and start training."""
    # load model
    model = MyModel(hparams)

    # init experiment; autosave is off, so persist it explicitly
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='baseline attn interval'
    )
    exp.argparse(hparams)
    exp.save()

    # define callbacks
    weights_dir = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=True,
        mode='min'
    )
    checkpoint = ModelCheckpoint(
        filepath=weights_dir,
        save_best_only=True,
        verbose=True,
        monitor='pr',
        mode='max'
    )

    # init trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
        val_check_interval=1
    )

    # start training
    trainer.fit(model)
def run_test():
    """Evaluate a previously trained checkpoint on the test set.

    Restores the model from saved weights plus the meta_tags.csv that
    test-tube wrote alongside them, then runs Trainer.test().
    """
    # NOTE: earlier experiment paths (other seeds/datasets/versions) that were
    # kept here as commented-out code have been removed; use version control
    # history to recover them if needed.
    model = MyModel.load_from_metrics(
        weights_path=
        'saved_seed1/rel_interval_layer_varient/model_weights/ad/0/_ckpt_epoch_8.ckpt',
        tags_csv=
        'saved_seed1/rel_interval_layer_varient/test_tube_data/ad/version_0/meta_tags.csv',
        on_gpu=True,
        map_location=torch.device('cuda'))

    trainer = Trainer()
    trainer.test(model)
def main():
    """Parse super-resolution CLI options, build the chosen model, and train it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', choices=['srcnn', 'srgan'], required=True)
    parser.add_argument('--scale_factor', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--patch_size', type=int, default=96)
    parser.add_argument('--gpus', type=str, default='0')
    opt = parser.parse_args()

    # load model class via a dispatch table (argparse `choices` guarantees the key)
    model_classes = {
        'srcnn': models.SRCNNModel,
        'srgan': models.SRGANModel,
    }
    Model = model_classes[opt.model]

    # add model specific arguments to original parser, then re-parse
    parser = Model.add_model_specific_args(parser)
    opt = parser.parse_args()

    # instantiate experiment
    exp = Experiment(save_dir=f'./logs/{opt.model}')
    exp.argparse(opt)

    model = Model(opt)

    # define callbacks
    checkpoint_callback = ModelCheckpoint(
        filepath=exp.get_media_path(exp.name, exp.version),
    )

    # instantiate trainer; '--gpus' is a comma-separated list of device ids
    trainer = Trainer(
        experiment=exp,
        max_nb_epochs=4000,
        add_log_row_interval=50,
        check_val_every_n_epoch=10,
        checkpoint_callback=checkpoint_callback,
        gpus=list(map(int, opt.gpus.split(',')))
    )

    # start training!
    trainer.fit(model)
def run_test():
    """Evaluate a previously trained RETAIN checkpoint on the test set.

    Restores the model from saved weights plus the meta_tags.csv that
    test-tube wrote alongside them, then runs Trainer.test().
    """
    # NOTE: earlier experiment paths (other seeds/datasets/versions) that were
    # kept here as commented-out code have been removed; use version control
    # history to recover them if needed.
    model = MyModel.load_from_metrics(
        weights_path='saved_seed1/retain/model_weights/ad/1/_ckpt_epoch_8.ckpt',
        tags_csv='saved_seed1/retain/test_tube_data/ad/version_1/meta_tags.csv',
        on_gpu=True,
        map_location=torch.device('cuda'),
    )

    trainer = Trainer()
    trainer.test(model)
def run_test():
    """Evaluate a previously trained TLSTM checkpoint on the test set.

    Restores the model from saved weights plus the meta_tags.csv that
    test-tube wrote alongside them, then runs Trainer.test().
    """
    # NOTE: earlier experiment paths (other seeds/datasets/versions) that were
    # kept here as commented-out code have been removed; use version control
    # history to recover them if needed.
    model = MyModel.load_from_metrics(
        weights_path='saved_seed1/tlstm/model_weights/ad/2/_ckpt_epoch_7.ckpt',
        tags_csv='saved_seed1/tlstm/test_tube_data/ad/version_2/meta_tags.csv',
        on_gpu=True,
        map_location=torch.device('cuda'))

    trainer = Trainer()
    trainer.test(model)
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project.

    :param hparams: argparse-style namespace with experiment/trainer settings
    :param cluster: SLURM cluster handle forwarded to the Trainer
    :param results_dict: unused; kept for the hyperopt callback signature
    :return:
    """
    on_gpu = torch.cuda.is_available()
    if hparams.disable_cuda:
        on_gpu = False

    device = 'cuda' if on_gpu else 'cpu'
    # IDIOM FIX: plain attribute assignment instead of explicit
    # hparams.__setattr__(...) calls — identical effect, standard style.
    hparams.device = device
    hparams.on_gpu = on_gpu
    hparams.nb_gpus = torch.cuda.device_count()
    hparams.inference_mode = hparams.model_load_weights_path is not None

    # delay each training start to not overwrite logs
    process_position, current_gpu = TRAINING_MODEL.get_process_position(
        hparams.gpus)
    sleep(process_position + 1)

    # init experiment
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     save_dir=hparams.tt_save_path,
                     version=hparams.hpc_exp_number,
                     autosave=False,
                     description=hparams.tt_description)
    exp.argparse(hparams)
    exp.save()

    # build model
    print('loading model...')
    model = TRAINING_MODEL(hparams)
    print('model built')

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_function=None,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        cluster=cluster,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # train model
    trainer.fit(model)
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project.

    :param hparams: argparse-style namespace with experiment/trainer settings
    :param cluster: SLURM cluster handle forwarded to the Trainer
    :param results_dict: unused; kept for the hyperopt callback signature
    :return:
    """
    on_gpu = torch.cuda.is_available()
    if hparams.disable_cuda:
        on_gpu = False

    device = 'cuda' if on_gpu else 'cpu'
    # IDIOM FIX: plain attribute assignment instead of explicit
    # hparams.__setattr__(...) calls — identical effect, standard style.
    hparams.device = device
    hparams.on_gpu = on_gpu
    hparams.nb_gpus = torch.cuda.device_count()
    hparams.inference_mode = hparams.model_load_weights_path is not None

    # delay each training start to not overwrite logs
    process_position, current_gpu = TRAINING_MODEL.get_process_position(
        hparams.gpus)
    sleep(process_position + 1)

    # init experiment
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     save_dir=hparams.tt_save_path,
                     version=hparams.hpc_exp_number,
                     autosave=False,
                     description=hparams.tt_description)
    exp.argparse(hparams)
    exp.save()

    # build model
    print('loading model...')
    model = TRAINING_MODEL(hparams)
    print('model built')

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_function=None,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # configure trainer: everything is driven off hparams so runs are
    # fully reproducible from the saved argparse snapshot
    trainer = Trainer(experiment=exp,
                      on_gpu=on_gpu,
                      cluster=cluster,
                      enable_tqdm=hparams.enable_tqdm,
                      overfit_pct=hparams.overfit,
                      track_grad_norm=hparams.track_grad_norm,
                      fast_dev_run=hparams.fast_dev_run,
                      check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                      accumulate_grad_batches=hparams.accumulate_grad_batches,
                      process_position=process_position,
                      current_gpu_name=current_gpu,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      enable_early_stop=hparams.enable_early_stop,
                      max_nb_epochs=hparams.max_nb_epochs,
                      min_nb_epochs=hparams.min_nb_epochs,
                      train_percent_check=hparams.train_percent_check,
                      val_percent_check=hparams.val_percent_check,
                      test_percent_check=hparams.test_percent_check,
                      val_check_interval=hparams.val_check_interval,
                      log_save_interval=hparams.log_save_interval,
                      add_log_row_interval=hparams.add_log_row_interval,
                      lr_scheduler_milestones=hparams.lr_scheduler_milestones)

    # train model
    trainer.fit(model)
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project.

    :param hparams: argparse-style namespace with experiment/trainer settings
    :param cluster: SLURM cluster handle forwarded to the Trainer
    :param results_dict: unused; kept for the hyperopt callback signature
    :return:
    """
    on_gpu = hparams.gpus is not None and torch.cuda.is_available()
    device = 'cuda' if on_gpu else 'cpu'
    # IDIOM FIX: plain attribute assignment instead of explicit
    # hparams.__setattr__(...) calls — identical effect, standard style.
    hparams.device = device
    hparams.on_gpu = on_gpu
    hparams.nb_gpus = torch.cuda.device_count()
    hparams.inference_mode = hparams.model_load_weights_path is not None

    # delay each training start to not overwrite logs
    process_position, current_gpu = TRAINING_MODEL.get_process_position(
        hparams.gpus)
    sleep(process_position + 1)

    # init experiment next to this script
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(name='test_tube_exp',
                     debug=True,
                     save_dir=log_dir,
                     version=0,
                     autosave=False,
                     description='test demo')
    exp.argparse(hparams)
    exp.save()

    # build model
    print('loading model...')
    model = TRAINING_MODEL(hparams)
    print('model built')

    # callbacks
    early_stop = EarlyStopping(monitor=hparams.early_stop_metric,
                               patience=hparams.early_stop_patience,
                               verbose=True,
                               mode=hparams.early_stop_mode)

    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_function=None,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor=hparams.model_save_monitor_value,
                                 mode=hparams.model_save_monitor_mode)

    # gpus are ; separated for inside a node and , within nodes
    gpu_list = None
    if hparams.gpus is not None:
        gpu_list = [int(x) for x in hparams.gpus.split(';')]

    # configure trainer
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=gpu_list)

    # train model
    trainer.fit(model)
patience=5, verbose=True, mode='auto') checkpoint = ModelCheckpoint( filepath=model_save_path, # save_best_only=True, # save_weights_only=True, verbose=True, monitor='val_loss', mode='auto', period=100, ) #----------------------------------------------------------------------- # 4 INIT TRAINER #----------------------------------------------------------------------- trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, # early_stop_callback=early_stop, max_nb_epochs=EPOCH, gpus=args.gpu #map(int, args.gpu.split(',')), #hparams.gpus, # distributed_backend='ddp' ) #----------------------------------------------------------------------- # 5 START TRAINING #----------------------------------------------------------------------- trainer.fit(model) sys.exit()
# Notebook-style cell: wires up callbacks and the Trainer, then starts training.
# NOTE(review): `hyperparams`, `exp`, and `model` are assumed to be defined in
# earlier notebook cells not visible here — verify against the full notebook.

# Checkpoints are grouped by experiment name and version.
model_save_path = '{}/{}/{}'.format(hyperparams.model_save_path, exp.name, exp.version)
# Stop after 3 epochs without val_acc improvement (higher is better).
early_stop = EarlyStopping(monitor='val_acc',
                           patience=3,
                           verbose=True,
                           mode='max')

# Keep only the weights with the lowest validation loss.
checkpoint = ModelCheckpoint(filepath=model_save_path,
                             save_best_only=True,
                             verbose=True,
                             monitor='val_loss',
                             mode='min')

# # Initialize the Trainer
# In[9]:


trainer = Trainer(experiment=exp,
                  checkpoint_callback=checkpoint,
                  early_stop_callback=early_stop,
                  gpus=hyperparams.gpus)

# # Start training!
# In[10]:


trainer.fit(model)

# In[ ]: