def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # init experiment exp = Experiment( name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description ) exp.argparse(hparams) exp.save() # build model model = ExampleModel(hparams) # callbacks early_stop = EarlyStopping( monitor='val_acc', patience=3, mode='min', verbose=True, ) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint( filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor='val_acc', mode='min' ) # configure trainer trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, ) # train model trainer.fit(model)
def main(hparams): """ Main training routine specific for this project """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = DSANet(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment(name='dsanet_exp_{}_window={}_horizon={}'.format( hparams.data_name, hparams.window, hparams.horizon), save_dir=hparams.test_tube_save_path, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=True, mode='min') # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( experiment=exp, early_stop_callback=early_stop, ) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model) print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd()) print('and going to http://localhost:6006 on your browser')
def train(hparams):
    # init exp and track all the parameters from the HyperOptArgumentParser
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # pretend to train
    x = torch.rand((1, hparams.x_val))
    for train_step in range(0, 100):
        y = torch.rand((hparams.x_val, 1))
        out = x.mm(y)
        exp.log({'fake_err': out.item()})

    # save exp when we're done
    exp.save()
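# A minimal driver sketch (assumed, not part of the original snippet) showing
# how a `train` function like the one above is typically invoked with
# test_tube's HyperOptArgumentParser, so each sampled trial is logged to its
# own Experiment version. Flag names mirror the attributes `train` reads;
# the option values and trial count are illustrative.
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.add_argument('--test_tube_exp_name', default='fake_training_demo')
parser.add_argument('--log_path', default='/tmp/test_tube_logs')
# opt_list marks --x_val as tunable and lists the values to sample from
parser.opt_list('--x_val', default=12, type=int, options=[12, 24, 48], tunable=True)
hparams = parser.parse_args()

# draw 9 random trials and train each one sequentially
for trial_hparams in hparams.trials(9):
    train(trial_hparams)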
def main(hparams, cluster): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = LightningTemplateModel(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # when using grid search, it's possible for all models to start at once # and use the same test tube experiment version relative_node_id = int(os.environ['SLURM_NODEID']) sleep(relative_node_id + 1) # init experiment exp = Experiment( name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, version=hparams.hpc_exp_number, # match the slurm job version number description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer(experiment=exp, gpus=hparams.per_experiment_nb_gpus, nb_gpu_nodes=hyperparams.nb_gpu_nodes, distributed_backend=hyperparams.distributed_backend) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model)
def main_trainer(hparams):
    print_params(hparams)
    full_exp = Experiment(name=hparams.tt_name + '_overall',
                          debug=hparams.debug,
                          autosave=False,
                          description=hparams.tt_description,
                          save_dir=hparams.tt_save_path)
    full_exp.add_argparse_meta(hparams)

    # fit model
    val_scores = []
    best_acc = 0
    best_loss = 0
    best_trial_nb = 0
    for trial_nb in range(hparams.nb_trials):
        exp = Experiment(name=hparams.tt_name,
                         debug=hparams.debug,
                         autosave=False,
                         description=hparams.tt_description,
                         save_dir=hparams.tt_save_path)
        exp.add_argparse_meta(hparams)

        data = SequentialReadingsData(window_size=hparams.time_steps,
                                      data_path=hparams.data_path,
                                      flatten_x=True)

        val_loss, val_acc, history = fit_feedforward(hparams, exp,
                                                     data.train_x, data.train_y,
                                                     data.val_x, data.val_y,
                                                     trial_nb)
        log_history(history.history, exp)
        exp.add_metric_row({'final_val_acc': val_acc, 'final_val_loss': val_loss})
        exp.save()

        full_exp.add_metric_row({'val_acc': val_acc,
                                 'val_loss': val_loss,
                                 'trial_nb': trial_nb})

        # track the best trial so far
        if val_acc > best_acc:
            best_acc = val_acc
            best_loss = val_loss
            best_trial_nb = trial_nb

        val_scores.append(val_acc)

    mean_val_acc = np.mean(val_scores)
    full_exp.add_metric_row({'final_val_acc': mean_val_acc,
                             'best_val_loss': best_loss,
                             'best_val_acc': best_acc,
                             'best_trial_nb': best_trial_nb})
    full_exp.save()
def main(hparams):
    # load model
    model = MyModel(hparams)

    # init experiment
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='baseline attn interval'
    )
    exp.argparse(hparams)
    exp.save()

    # define callbacks
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=True,
        mode='min'
    )
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor='pr',
        mode='max'
    )

    # init trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
        val_check_interval=1
    )

    # start training
    trainer.fit(model)
def train(hparams):
    # init exp and track all the parameters from the HyperOptArgumentParser
    exp = Experiment(name='dense_model', save_dir='/some/path', autosave=False)
    exp.add_argparse_meta(hparams)

    # define tensorflow graph (TF 1.x API)
    x = tf.placeholder(dtype=tf.int32, name='x')
    y = tf.placeholder(dtype=tf.int32, name='y')
    out = x * y

    sess = tf.Session()

    # run the tf op
    for train_step in range(0, 100):
        output = sess.run(out, feed_dict={x: hparams.x_val, y: hparams.y_val})
        exp.add_metric_row({'fake_err': output})

    # save exp when we're done
    exp.save()
def main(hparams, data):
    # init experiment
    log_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(name=hparams.exp_name,
                     debug=False,
                     save_dir=log_dir,
                     version=0,
                     autosave=True,
                     description='P2R codebase')

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = P2rSystem(hparams, data)

    model_save_path = '{}/{}/version_{}/checkpoints'.format(
        exp.save_dir, exp.name, exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 verbose=True,
                                 monitor='tng_loss',
                                 mode='min',
                                 save_best_only=True)

    # configure trainer
    trainer = Trainer(experiment=exp,
                      checkpoint_callback=checkpoint,
                      min_nb_epochs=1,
                      max_nb_epochs=hparams.max_nb_epochs,
                      track_grad_norm=2,
                      accumulate_grad_batches=1,
                      row_log_interval=1,
                      amp_level='O2',
                      use_amp=True,
                      gpus=1)

    # train model
    trainer.fit(model)
    trainer.test()

    # save a final checkpoint after testing
    filepath = '{}/_ckpt_epoch_final.ckpt'.format(model_save_path)
    checkpoint.save_model(filepath, False)
def main(hparams):
    exp = Experiment(
        name=hparams.tt_name,
        debug=hparams.debug,
        save_dir=hparams.tt_save_path,
        version=hparams.hpc_exp_number,
        autosave=False,
        description=hparams.tt_description,
    )
    exp.argparse(hparams)
    exp.save()

    model = AutoregressiveFaceVAE(hparams)

    early_stop = EarlyStopping(monitor="avg_val_loss",
                               patience=3,
                               verbose=True,
                               mode="min")

    model_save_path = "{}/{}/{}".format(hparams.model_save_path, exp.name, exp.version)
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor="avg_val_loss",
        mode="min",
    )

    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
        distributed_backend=hparams.dist_backend,
        # val_check_interval=0.5,
        # distributed_backend="dp",
        # overfit_pct=0.01
    )
    trainer.fit(model)
def main(hparams):
    # init experiment
    experiment_args = parse_argdict_for_method(Experiment.__init__, hparams)
    exp = Experiment(**experiment_args)

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = Network(hparams)

    # callbacks
    if hparams.enable_early_stop:
        early_stop = EarlyStopping(monitor=hparams.monitor_value,
                                   patience=hparams.patience,
                                   verbose=True,
                                   mode=hparams.monitor_mode)
    else:
        early_stop = None

    if hparams.enable_model_checkpoint:
        model_save_path = pathlib.Path(exp.log_dir).parent / 'model_weights'
        checkpoint = ModelCheckpoint(
            filepath=model_save_path,
            save_best_only=hparams.save_best_only,
            save_weights_only=hparams.save_weights_only,
            verbose=True,
            monitor=hparams.monitor_value,
            mode=hparams.monitor_mode)
    else:
        checkpoint = None

    # configure trainer
    trainer_args = parse_argdict_for_method(Trainer.__init__, hparams)
    trainer = Trainer(experiment=exp,
                      early_stop_callback=early_stop,
                      checkpoint_callback=checkpoint,
                      **trainer_args)

    # train model
    trainer.fit(model)
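# `parse_argdict_for_method` is a project helper not shown above. A plausible
# sketch is below, inferred from how it is used (filter an argparse Namespace
# down to the keyword arguments a callable actually accepts); the name and
# behavior are assumptions, not taken from the original source.
import inspect


def parse_argdict_for_method(method, hparams):
    """Return the subset of vars(hparams) matching `method`'s parameters."""
    params = inspect.signature(method).parameters
    argdict = vars(hparams)
    return {name: argdict[name]
            for name in params
            if name != 'self' and name in argdict}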
def main_trainer(hparams):
    print_params(hparams)
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     autosave=False,
                     description=hparams.tt_description,
                     save_dir=hparams.tt_save_path)
    exp.add_argparse_meta(hparams)

    # fit model
    val_scores = []
    best_score = 0
    for trial_nb in range(hparams.nb_trials):
        data = dataset_loader.IndividualSequencesData(
            hparams.data_path, y_labels=hparams.y_labels.split(','))
        X, Y, lengths = flatten_data(data.train_x_y)

        # fit
        model = hmm.GaussianHMM(n_components=hparams.nb_components,
                                n_iter=hparams.nb_hmm_iters)
        model.fit(X, lengths)

        val_X, val_Y, lengths = flatten_data(data.val_x_y)
        Y_hat = model.predict(val_X, lengths)
        val_score = np.equal(Y_hat, val_Y).sum() / float(len(Y_hat))

        # save model when we have a better one
        if val_score > best_score:
            best_score = val_score
            save_model(model, hparams, exp, trial_nb)

        val_scores.append(val_score)
        exp.add_metric_row({'val_acc': val_score, 'trial_nb': trial_nb})

    mean_val_acc = np.mean(val_scores)
    exp.add_metric_row({'final_val_acc': mean_val_acc})
    exp.save()
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = LightningTemplateModel(hparams) print('model built') # ------------------------ # 2 INIT Logger # ------------------------ # init experiment exp = Experiment( name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, description='test demo' ) exp.argparse(hparams) exp.save() # ------------------------ # 3 INIT TRAINER # ------------------------ trainer = Trainer( experiment=exp, gpus=hparams.gpus, distributed_backend=hparams.dist_backend, ) # ------------------------ # 4 START TRAINING # ------------------------ trainer.fit(model)
def create_tt_experiment(hparams): """Create test-tube experiment for logging training and storing models. Parameters ---------- hparams : :obj:`dict` dictionary of hyperparameters defining experiment that will be saved as a csv file Returns ------- :obj:`tuple` - if experiment defined by hparams already exists, returns :obj:`(None, None, None)` - if experiment does not exist, returns :obj:`(hparams, sess_ids, exp)` """ from test_tube import Experiment # get session_dir hparams['session_dir'], sess_ids = get_session_dir( hparams, session_source=hparams.get('all_source', 'save')) if not os.path.isdir(hparams['session_dir']): os.makedirs(hparams['session_dir']) export_session_info_to_csv(hparams['session_dir'], sess_ids) hparams['expt_dir'] = get_expt_dir(hparams) if not os.path.isdir(hparams['expt_dir']): os.makedirs(hparams['expt_dir']) # check to see if experiment already exists if experiment_exists(hparams): return None, None, None exp = Experiment( name=hparams['experiment_name'], debug=False, save_dir=os.path.dirname(hparams['expt_dir'])) exp.save() hparams['version'] = exp.version return hparams, sess_ids, exp
def main_trainer(hparams):
    print_params(hparams)
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     autosave=False,
                     description=hparams.tt_description,
                     save_dir=hparams.tt_save_path)
    exp.add_argparse_meta(hparams)

    # fit model
    val_scores, train_scores = [], []
    best_score = 0
    for trial_nb in range(hparams.nb_trials):
        # init data loader
        data = SequentialReadingsData(window_size=hparams.time_steps,
                                      data_path=hparams.data_path,
                                      flatten_x=True)

        clf = RandomForestClassifier(n_estimators=hparams.nb_estimators)
        clf.fit(data.train_x, data.train_y)
        train_score = clf.score(data.train_x, data.train_y)
        val_score = clf.score(data.val_x, data.val_y)

        # save model when we have a better one
        if val_score > best_score:
            best_score = val_score
            save_model(clf, hparams, exp, trial_nb)

        train_scores.append(train_score)
        val_scores.append(val_score)
        exp.add_metric_row({'val_acc': val_score,
                            'train_acc': train_score,
                            'trial_nb': trial_nb})

    mean_val_acc = np.mean(val_scores)
    mean_train_acc = np.mean(train_scores)
    exp.add_metric_row({'final_val_acc': mean_val_acc,
                        'final_train_acc': mean_train_acc})
    exp.save()
def train(hparams, *args): """Train your awesome model. :param hparams: The arguments to run the model with. """ # Initialize experiments and track all the hyperparameters exp = Experiment( name=hparams.test_tube_exp_name, # Location to save the metrics. save_dir=hparams.log_path, autosave=False, ) exp.argparse(hparams) # Pretend to train. x = torch.rand((1, hparams.x_val)) for train_step in range(0, 100): y = torch.rand((hparams.x_val, 1)) out = x.mm(y) exp.log({'fake_err': out.item()}) # Save exp when . exp.save()
def train(hparams):
    # init exp and track all the parameters from the HyperOptArgumentParser
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # define tensorflow graph (TF 1.x API)
    x = tf.placeholder(dtype=tf.int32, name='x')
    y = tf.placeholder(dtype=tf.int32, name='y')
    out = x * y

    sess = tf.Session()

    # Run the tf op
    for train_step in range(0, 100):
        output = sess.run(out, feed_dict={x: hparams.x_val, y: hparams.y_val})
        exp.log({'fake_err': output})

    # save exp when we're done
    exp.save()
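# Hedged driver sketch (assumed, not from the original source): the same
# `train` function can be fanned out over CPU workers with test_tube's
# HyperOptArgumentParser. Verify the exact optimize_parallel_cpu signature
# against your installed test_tube version; some versions pass extra
# positional args, which is why an earlier snippet declares train(hparams, *args).
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='grid_search')
parser.add_argument('--test_tube_exp_name', default='tf_demo')
parser.add_argument('--log_path', default='/tmp/test_tube_logs')
parser.opt_list('--x_val', default=2, type=int, options=[2, 4, 8], tunable=True)
parser.opt_list('--y_val', default=3, type=int, options=[3, 9], tunable=True)
hparams = parser.parse_args()

# one trial per hyperparameter combination, two worker processes
hparams.optimize_parallel_cpu(train, nb_trials=6, nb_workers=2)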
def main():
    model = CoolSystem()

    # PyTorch SummaryWriter with a few bells and whistles
    exp = Experiment(save_dir='../output/tmp')
    print(f"exp.save_dir: {exp.save_dir}")
    exp.save()
    print("saved!")

    # train on cpu using only 10% of the data (for demo purposes)
    # pass in experiment for automatic tensorboard logging.
    trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1)

    # train on 4 gpus (lightning chooses GPUs for you)
    # trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=4)

    # train on 4 gpus (you choose GPUs)
    # trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=[0, 1, 3, 7])

    # train on 32 gpus across 4 nodes (make sure to submit appropriate SLURM job)
    # trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=8, nb_gpu_nodes=4)

    # train (1 epoch only here for demo)
    trainer.fit(model)
def optimize(optimizer_params): """ Main training routine specific for this project """ logging.basicConfig(level=logging.INFO) # dirs root_dir = os.path.dirname(os.path.realpath(__file__)) demo_log_dir = os.path.join(root_dir, 'dsanet_logs') checkpoint_dir = os.path.join(demo_log_dir, 'model_weights') test_tube_dir = os.path.join(demo_log_dir, 'test_tube_data') # although we user hyperOptParser, we are using it only as argparse right now parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') # allow model to overwrite or extend args parser = DSANet.add_model_specific_args(parent_parser, root_dir) hyperparams = parser.parse_args() print(hyperparams) setattr(hyperparams, 'batch_size', int(optimizer_params['batch_size'])) setattr(hyperparams, 'drop_prob', optimizer_params['dropout']) setattr(hyperparams, 'learning_rate', optimizer_params['learning_rate']) setattr(hyperparams, 'd_model', int(optimizer_params['units'])) # hyperparams['batch_size'] = optimizer_params['batch_size'] # hyperparams['drop_prob'] = optimizer_params['dropout'] # hyperparams['learning_rate'] = optimizer_params['learning_rate'] # hyperparams['d_model'] = optimizer_params['units'] print(hyperparams) hparams = hyperparams # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = DSANet(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment(name='dsanet_exp_{}_window={}_horizon={}'.format( hparams.data_name, hparams.window, hparams.horizon), save_dir=hparams.test_tube_save_path, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint_callback = ModelCheckpoint(filepath=model_save_path, save_best_only=True, verbose=True, monitor='val_loss', mode='auto') early_stop = EarlyStopping(monitor='val_loss', patience=25, verbose=True, mode='min') # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( gpus="0,1", distributed_backend='ddp', experiment=exp, early_stop_callback=early_stop, checkpoint_callback=checkpoint_callback, ) # ------------------------ # 5 START TRAINING # ------------------------ st_time = datetime.now() trainer.fit(model) eval_time = str(datetime.now() - st_time) print("Iteration %d: Getting results ... 
" % ITERATION) csv_load_path = '{}/{}/{}{}'.format(hparams.test_tube_save_path, exp.name, 'version_', exp.version) df = pd.read_csv('{}/{}'.format( csv_load_path, 'metrics.csv')) # change to experiment save dir min_idx = df['val_nd'].idxmin() of_connection = open(out_file, 'a') writer = csv.writer(of_connection) writer.writerow([ optimizer_params, hparams, df['tng_loss'].iloc[min_idx], df['val_loss'].iloc[min_idx], df['val_nd'].iloc[min_idx], df['NRMSE'].iloc[min_idx], df['val_rho10'].iloc[min_idx], df['val_rho50'].iloc[min_idx], df['val_rho90'].iloc[min_idx], eval_time, STATUS_OK ]) of_connection.close() return { 'loss': df['val_nd'].iloc[min_idx], 'ND': df['val_nd'].iloc[min_idx], 'NRMSE': df['NRMSE'].iloc[min_idx], 'val_loss': df['val_loss'].iloc[min_idx], 'params': optimizer_params, 'rho_metric': { 'rho10': df['val_rho10'].iloc[min_idx], 'rho50': df['val_rho50'].iloc[min_idx], 'rho90': df['val_rho90'].iloc[min_idx] }, 'iteration': ITERATION, 'eval_time': eval_time, 'status': STATUS_OK }
def main(hparams, cluster, results_dict): """ Main training routine specific for this project :param hparams: :return: """ on_gpu = torch.cuda.is_available() if hparams.disable_cuda: on_gpu = False device = 'cuda' if on_gpu else 'cpu' hparams.__setattr__('device', device) hparams.__setattr__('on_gpu', on_gpu) hparams.__setattr__('nb_gpus', torch.cuda.device_count()) hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) # delay each training start to not overwrite logs process_position, current_gpu = TRAINING_MODEL.get_process_position( hparams.gpus) sleep(process_position + 1) # init experiment exp = Experiment(name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description) exp.argparse(hparams) exp.save() # build model print('loading model...') model = TRAINING_MODEL(hparams) print('model built') # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer(experiment=exp, on_gpu=on_gpu, cluster=cluster, enable_tqdm=hparams.enable_tqdm, overfit_pct=hparams.overfit, track_grad_norm=hparams.track_grad_norm, fast_dev_run=hparams.fast_dev_run, check_val_every_n_epoch=hparams.check_val_every_n_epoch, accumulate_grad_batches=hparams.accumulate_grad_batches, process_position=process_position, current_gpu_name=current_gpu, checkpoint_callback=checkpoint, early_stop_callback=early_stop, enable_early_stop=hparams.enable_early_stop, max_nb_epochs=hparams.max_nb_epochs, min_nb_epochs=hparams.min_nb_epochs, train_percent_check=hparams.train_percent_check, val_percent_check=hparams.val_percent_check, test_percent_check=hparams.test_percent_check, val_check_interval=hparams.val_check_interval, log_save_interval=hparams.log_save_interval, add_log_row_interval=hparams.add_log_row_interval, lr_scheduler_milestones=hparams.lr_scheduler_milestones) # train model trainer.fit(model)
def main(hparams, cluster=None, results_dict=None): """ Main training routine specific for this project :param hparams: :return: """ # init experiment log_dir = os.path.dirname(os.path.realpath(__file__)) exp = Experiment(name='test_tube_exp', debug=True, save_dir=log_dir, version=0, autosave=False, description='test demo') hparams.training_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/training' hparams.validation_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/validation' hparams.test_task_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/test_task' hparams.dummy_datasets = False hparams.audio_noise = 3e-3 hparams.cqt_fmin = 40. hparams.cqt_bins_per_octave = 24 hparams.cqt_n_bins = 216 hparams.cqt_hop_length = 512 hparams.cqt_filter_scale = 0.43 hparams.enc_channels = (1, 8, 16, 32, 64, 128, 256, 512, 512) hparams.enc_kernel_1_w = (3, 3, 3, 3, 3, 3, 3, 3) hparams.enc_kernel_1_h = (3, 3, 3, 3, 3, 3, 3, 3) hparams.enc_kernel_2_w = (1, 3, 1, 3, 1, 3, 1, 3) hparams.enc_kernel_2_h = (25, 3, 25, 3, 25, 3, 4, 3) hparams.enc_padding_1 = (1, 1, 1, 1, 1, 1, 1, 1) hparams.enc_padding_2 = (0, 1, 0, 1, 0, 1, 0, 0) hparams.enc_stride_1 = (1, 1, 1, 1, 1, 1, 1, 1) hparams.enc_stride_2 = (1, 1, 1, 1, 1, 1, 1, 1) hparams.enc_pooling_1 = (2, 1, 1, 1, 2, 1, 1, 1) hparams.ar_kernel_sizes = (5, 4, 1, 3, 3, 1, 3, 1, 6) hparams.ar_self_attention = (False, False, False, False, False, False, False, False, False) hparams.batch_size = 4 hparams.learning_rate = 3e-4 hparams.warmup_steps = 1000 hparams.annealing_steps = 100000 hparams.score_over_all_timesteps = False hparams.visible_steps = 60 # set the hparams for the experiment exp.argparse(hparams) exp.save() # build model model = ContrastivePredictiveSystem(hparams) # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, #early_stop_callback=early_stop, # distributed_backend='dp', #gpus=[0], nb_sanity_val_steps=2, gradient_clip=0.5) # train model trainer.fit(model)
def main(hparams, cluster): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = LightningTemplateModel(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # when using grid search, it's possible for all models to start at once # and use the same test tube experiment version relative_node_id = int(os.environ['SLURM_NODEID']) sleep(relative_node_id + 1) # init experiment exp = Experiment( name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, version=hparams.hpc_exp_number, # match the slurm job version number description='test demo' ) exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) early_stop = EarlyStopping( monitor='val_acc', patience=3, verbose=True, mode='max' ) checkpoint = ModelCheckpoint( filepath=model_save_path, save_best_only=True, verbose=True, monitor='val_loss', mode='min' ) # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, gpus=hparams.per_experiment_nb_gpus, nb_gpu_nodes=hyperparams.nb_gpu_nodes ) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model)
def main(hparams): """ Main training routine specific for this project """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = DSANet(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment(name='dsanet_exp_{}_window={}_horizon={}'.format( hparams.data_name, hparams.window, hparams.horizon), save_dir=hparams.test_tube_save_path, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint_callback = ModelCheckpoint(filepath=model_save_path, save_best_only=True, verbose=True, monitor='val_loss', mode='auto') early_stop = EarlyStopping(monitor='val_loss', patience=25, verbose=True, mode='min') # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( gpus="0", distributed_backend='dp', experiment=exp, early_stop_callback=early_stop, checkpoint_callback=checkpoint_callback, ) # ------------------------ # 5 START TRAINING # ------------------------ if hparams.test_only: model_load_path = '{}/{}'.format(hparams.model_save_path, exp.name) # metrics_load_path = '{}/{}'.format(hparams.test_tube_save_path, exp.name) path_list = [ os.path.join(dirpath, filename) for dirpath, _, filenames in os.walk(model_load_path) for filename in filenames if filename.endswith('.ckpt') ] # for dirpath, dirnames, filenames in os.walk(model_load_path): # if filename in [f for f in filenames if f.endswith(".ckpt")]: for filename in path_list: print(filename) data = filename.split("/") version_number = data[len(data) - 2] metrics_load_path = '{}/{}'.format(hparams.test_tube_save_path, exp.name) metrics_load_path = '{}/{}{}/{}'.format(metrics_load_path, 'version_', version_number, 'meta_tags.csv') print(metrics_load_path) hparams.metrics_load_path = metrics_load_path model = DSANet(hparams) model = DSANet.load_from_metrics(weights_path=filename, tags_csv=metrics_load_path, on_gpu=True) # model = LightningModule.load_from_checkpoint(filename) # test (pass in the model) hparams.metrics_load_path = metrics_load_path result = trainer.test(model) print(result) else: result = trainer.fit(model) print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd()) print('and going to http://localhost:6006 on your browser')
def main(hparams): """ Main training routine specific for this project :param hparams: :return: """ # ------------------------ # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') model = LightningTemplateModel(hparams) print('model built') # ------------------------ # 2 INIT TEST TUBE EXP # ------------------------ # init experiment exp = Experiment( name=hyperparams.experiment_name, save_dir=hyperparams.test_tube_save_path, autosave=False, description='test demo' ) exp.argparse(hparams) exp.save() # ------------------------ # 3 DEFINE CALLBACKS # ------------------------ model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) early_stop = EarlyStopping( monitor='val_acc', patience=3, verbose=True, mode='max' ) checkpoint = ModelCheckpoint( filepath=model_save_path, save_best_only=True, verbose=True, monitor='val_loss', mode='min' ) # ------------------------ # 4 INIT TRAINER # ------------------------ trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, gpus=hparams.gpus, use_amp=True ) # ------------------------ # 5 START TRAINING # ------------------------ trainer.fit(model)
def main(hparams, cluster, results_dict): """ Main training routine specific for this project :param hparams: :return: """ on_gpu = hparams.gpus is not None and torch.cuda.is_available() device = 'cuda' if on_gpu else 'cpu' hparams.__setattr__('device', device) hparams.__setattr__('on_gpu', on_gpu) hparams.__setattr__('nb_gpus', torch.cuda.device_count()) hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) # delay each training start to not overwrite logs process_position, current_gpu = TRAINING_MODEL.get_process_position( hparams.gpus) sleep(process_position + 1) # init experiment log_dir = os.path.dirname(os.path.realpath(__file__)) exp = Experiment(name='test_tube_exp', debug=True, save_dir=log_dir, version=0, autosave=False, description='test demo') exp.argparse(hparams) exp.save() # build model print('loading model...') model = TRAINING_MODEL(hparams) print('model built') # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # gpus are ; separated for inside a node and , within nodes gpu_list = None if hparams.gpus is not None: gpu_list = [int(x) for x in hparams.gpus.split(';')] # configure trainer trainer = Trainer(experiment=exp, cluster=cluster, checkpoint_callback=checkpoint, early_stop_callback=early_stop, gpus=gpu_list) # train model trainer.fit(model)
def main(hparams, cluster=None, results_dict=None): """ Main training routine specific for this project :param hparams: :return: """ name = 'immersions_scalogram_resnet_house_smaller' version = 1 hparams.log_dir = '/home/idivinci3005/experiments/logs' hparams.checkpoint_dir = '/home/idivinci3005/experiments/checkpoints/' + name + '/' + str( version) hparams.training_set_path = '/home/idivinci3005/data/immersions/training' hparams.validation_set_path = '/home/idivinci3005/data/immersions/validation' hparams.test_task_set_path = '/home/idivinci3005/data/immersions/test_task' hparams.dummy_datasets = False hparams.audio_noise = 3e-3 hparams.cqt_fmin = 40. hparams.cqt_bins_per_octave = 24 hparams.cqt_n_bins = 216 hparams.cqt_hop_length = 512 hparams.cqt_filter_scale = 0.43 hparams.enc_channels = (1, 8, 16, 32, 64, 128, 256, 512, 512) hparams.enc_kernel_1_w = (3, 3, 3, 3, 3, 3, 3, 3) hparams.enc_kernel_1_h = (3, 3, 3, 3, 3, 3, 3, 3) hparams.enc_kernel_2_w = (1, 3, 1, 3, 1, 3, 1, 3) hparams.enc_kernel_2_h = (25, 3, 25, 3, 25, 3, 4, 3) hparams.enc_padding_1 = (1, 1, 1, 1, 1, 1, 1, 1) hparams.enc_padding_2 = (0, 1, 0, 1, 0, 1, 0, 0) hparams.enc_stride_1 = (1, 1, 1, 1, 1, 1, 1, 1) hparams.enc_stride_2 = (1, 1, 1, 1, 1, 1, 1, 1) hparams.enc_pooling_1 = (2, 1, 1, 1, 2, 1, 1, 1) hparams.ar_kernel_sizes = (5, 4, 1, 3, 3, 1, 3, 1, 6) hparams.ar_self_attention = (False, False, False, False, False, False, False, False, False) hparams.batch_size = 4 hparams.learning_rate = 3e-4 hparams.warmup_steps = 1000 hparams.annealing_steps = 100000 hparams.score_over_all_timesteps = False hparams.visible_steps = 60 hparams.batch_size = 32 hparams.learning_rate = 3e-4 hparams.warmup_steps = 1000 hparams.annealing_steps = 100000 hparams.score_over_all_timesteps = False hparams.visible_steps = 60 # init experiment exp = Experiment(name=name, debug=False, save_dir=hparams.log_dir, version=version, autosave=False, description='test demo') # set the hparams for the experiment exp.argparse(hparams) exp.save() # build model model = ContrastivePredictiveSystem(hparams) task_model = ClassificationTaskModel( model, task_dataset_path=hparams.test_task_set_path) # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) checkpoint = ModelCheckpoint(filepath=hparams.checkpoint_dir, save_best_only=False, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, #early_stop_callback=early_stop, #distributed_backend='dp', gpus=[0], nb_sanity_val_steps=5, val_check_interval=0.2, gradient_clip=0.5, track_grad_norm=2) # train model trainer.fit(model)
def main(hparams, cluster=None, results_dict=None): """ Main training routine specific for this project :param hparams: :return: """ # init experiment name = 'immersions_scalogram_resnet_maestro' version = 0 hparams.log_dir = '/home/idivinci3005/experiments/logs' hparams.checkpoint_dir = '/home/idivinci3005/experiments/checkpoints/' + name + '/' + str( version) hparams.training_set_path = '/home/idivinci3005/data/maestro-v2.0.0' hparams.validation_set_path = '/home/idivinci3005/data/maestro-v2.0.0' hparams.test_task_set_path = '/home/idivinci3005/data/maestro-v2.0.0' hparams.audio_noise = 3e-3 hparams.ar_kernel_sizes = (5, 4, 1, 3, 3, 1, 3, 1, 6) hparams.ar_self_attention = (False, False, False, False, False, False, False, False, False) hparams.batch_size = 32 hparams.learning_rate = 3e-4 hparams.warmup_steps = 1000 hparams.annealing_steps = 100000 hparams.score_over_all_timesteps = False hparams.visible_steps = 62 if not os.path.exists(hparams.checkpoint_dir): os.mkdir(hparams.checkpoint_dir) exp = Experiment(name=name, debug=False, save_dir=hparams.log_dir, version=version, autosave=False, description='maestro dataset experiment') # set the hparams for the experiment exp.argparse(hparams) exp.save() # build model model = ContrastivePredictiveSystemMaestro(hparams) task_model = MaestroClassificationTaskModel( model, task_dataset_path=hparams.test_task_set_path) model.test_task_model = task_model # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) checkpoint = ModelCheckpoint(filepath=hparams.checkpoint_dir, save_best_only=False, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, #early_stop_callback=early_stop, # distributed_backend='dp', gpus=[0], nb_sanity_val_steps=5, val_check_interval=0.1, val_percent_check=0.25, #train_percent_check=0.01 ) # train model trainer.fit(model)
def main(hparams, cluster, results_dict): """ Main training routine specific for this project :param hparams: :return: """ on_gpu = torch.cuda.is_available() if hparams.disable_cuda: on_gpu = False device = 'cuda' if on_gpu else 'cpu' hparams.__setattr__('device', device) hparams.__setattr__('on_gpu', on_gpu) hparams.__setattr__('nb_gpus', torch.cuda.device_count()) hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) # delay each training start to not overwrite logs process_position, current_gpu = TRAINING_MODEL.get_process_position( hparams.gpus) sleep(process_position + 1) # init experiment exp = Experiment(name=hparams.tt_name, debug=hparams.debug, save_dir=hparams.tt_save_path, version=hparams.hpc_exp_number, autosave=False, description=hparams.tt_description) exp.argparse(hparams) exp.save() # build model print('loading model...') model = TRAINING_MODEL(hparams) print('model built') # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_function=None, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer( experiment=exp, cluster=cluster, checkpoint_callback=checkpoint, early_stop_callback=early_stop, ) # train model trainer.fit(model)
        ds_valid=ds_valid,
        ds_test=ds_valid)

    # -----------------------------------------------------------------------
    # 2 INIT TEST TUBE EXP
    # -----------------------------------------------------------------------
    # init experiment
    exp = Experiment(
        name='voronoi',  # hyperparams.experiment_name,
        save_dir='runs',  # hyperparams.test_tube_save_path,
        # autosave=False,
        # description='experiment'
    )
    exp.save()

    # -----------------------------------------------------------------------
    # 3 DEFINE CALLBACKS
    # -----------------------------------------------------------------------
    model_save_path = 'pl_voronoi'  # '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(monitor='avg_val_loss',
                               patience=5,
                               verbose=True,
                               mode='auto')

    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        # save_best_only=True,
        # save_weights_only=True,
        verbose=True,
def main(hparams, cluster=None, results_dict=None): """ Main training routine specific for this project :param hparams: :return: """ # init experiment log_dir = os.path.dirname(os.path.realpath(__file__)) exp = Experiment(name='test_tube_exp', debug=True, save_dir=log_dir, version=0, autosave=False, description='maestro dataset experiment') #hparams.training_set_path = '/Volumes/Elements/Datasets/maestro-v2.0.0' #hparams.validation_set_path = '/Volumes/Elements/Datasets/maestro-v2.0.0' #hparams.test_task_set_path = '/Volumes/Elements/Datasets/maestro-v2.0.0' hparams.training_set_path = 'C:/Users/HEV7RNG/Documents/data/maestro-v2.0.0' hparams.validation_set_path = 'C:/Users/HEV7RNG/Documents/data/maestro-v2.0.0' hparams.test_task_set_path = 'C:/Users/HEV7RNG/Documents/data/maestro-v2.0.0' hparams.audio_noise = 3e-3 hparams.ar_kernel_sizes = (5, 4, 1, 3, 3, 1, 3, 1, 6) hparams.ar_self_attention = (False, False, False, False, False, False, False, False, False) hparams.batch_size = 4 hparams.learning_rate = 2e-4 hparams.warmup_steps = 1000 hparams.annealing_steps = 100000 hparams.score_over_all_timesteps = False hparams.visible_steps = 62 # set the hparams for the experiment exp.argparse(hparams) exp.save() # build model model = ContrastivePredictiveSystemMaestro(hparams) task_model = MaestroClassificationTaskModel( model, task_dataset_path=hparams.validation_set_path) model.test_task_model = task_model # callbacks early_stop = EarlyStopping(monitor=hparams.early_stop_metric, patience=hparams.early_stop_patience, verbose=True, mode=hparams.early_stop_mode) model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) checkpoint = ModelCheckpoint(filepath=model_save_path, save_best_only=True, verbose=True, monitor=hparams.model_save_monitor_value, mode=hparams.model_save_monitor_mode) # configure trainer trainer = Trainer( experiment=exp, checkpoint_callback=checkpoint, early_stop_callback=early_stop, # distributed_backend='dp', #gpus=[0], nb_sanity_val_steps=2) # train model trainer.fit(model)
def test_training(self):
    # use default args given by lightning
    root_dir = '/Volumes/Elements/Projekte/Immersions'
    parent_parser = HyperOptArgumentParser(strategy='random_search', add_help=False)
    add_default_args(parent_parser, root_dir)

    # allow model to overwrite or extend args
    parser = ContrastivePredictiveSystem.add_model_specific_args(parent_parser, root_dir)
    hparams = parser.parse_args()

    name = 'immersions_scalogram_resnet_test'
    version = 0
    hparams.log_dir = '/Volumes/Elements/Projekte/Immersions/logs'
    hparams.checkpoint_dir = '/Volumes/Elements/Projekte/Immersions/checkpoints'
    hparams.training_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/training'
    hparams.validation_set_path = '/Volumes/Elements/Datasets/Immersions/house_data_mp3/validation'
    hparams.dummy_datasets = False
    hparams.batch_size = 64
    hparams.learning_rate = 2e-4
    hparams.warmup_steps = 1000
    hparams.annealing_steps = 100000

    # init experiment
    exp = Experiment(
        name=name,
        debug=False,
        save_dir=hparams.log_dir,
        version=version,
        autosave=False,
        description='test demo'
    )

    # set the hparams for the experiment
    exp.argparse(hparams)
    exp.save()

    # build model
    model = ContrastivePredictiveSystem(hparams)

    # callbacks
    early_stop = EarlyStopping(
        monitor=hparams.early_stop_metric,
        patience=hparams.early_stop_patience,
        verbose=True,
        mode=hparams.early_stop_mode
    )
    checkpoint = ModelCheckpoint(
        filepath=hparams.checkpoint_dir,
        save_best_only=False,
        verbose=True,
        monitor=hparams.model_save_monitor_value,
        mode=hparams.model_save_monitor_mode
    )

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        # early_stop_callback=early_stop,
        # distributed_backend='dp',
        gpus=[0],
        nb_sanity_val_steps=5,
        val_check_interval=0.2,
        train_percent_check=0.01,
        max_nb_epochs=1
    )

    # train model
    trainer.fit(model)