def main(hparams):
    """
    Main training routine specific for this project.

    Builds the model, creates a test tube experiment and runs training
    on a fixed layout of 8 GPUs per node across 2 nodes.

    :param hparams: hyperparameter object handed to the model; it must
        also expose ``log_dir`` (used as the experiment save directory)
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # Read the log directory from the `hparams` argument rather than from a
    # module-level `hyperparams` name, which is not defined in this function.
    exp = Experiment(name='test_exp',
                     save_dir=hparams.log_dir,
                     autosave=False,
                     description='test demo')

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp, gpus=8, nb_gpu_nodes=2)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
Beispiel #2
0
def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project.

    Builds the model, creates and saves a test tube experiment, wires up
    early-stopping and checkpoint callbacks, then trains.

    :param hparams: hyperparameter object handed to the model; this
        function also reads ``experiment_name``, ``test_tube_save_path``,
        ``model_save_path``, ``gpus`` and ``nb_gpu_nodes`` from it
    :param cluster: cluster object forwarded unchanged to the Trainer
    :param results_dict: accepted for interface compatibility; not used
        inside this function
    :return: None
    :raises KeyError: if the ``SLURM_NODEID`` environment variable is unset
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version; stagger the start by
    # a node-dependent delay to avoid that
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    # All settings come from the `hparams` argument (not from a module-level
    # `hyperparams` name) so the function uses the values it was called with.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    # checkpoints are grouped by experiment name and version
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name,
                                        exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=hparams.gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes)

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """
    Main training routine specific for this project.

    Builds the model, creates and saves a test tube experiment, then
    trains with ``use_amp=True`` on the GPUs given by ``hparams.gpus``.

    :param hparams: hyperparameter object handed to the model; this
        function also reads ``experiment_name``, ``test_tube_save_path``
        and ``gpus`` from it
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------

    # init experiment
    # Use the `hparams` argument instead of a module-level `hyperparams`
    # name, which is not defined inside this function.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp, gpus=hparams.gpus, use_amp=True)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
Beispiel #4
0
def main(hparams):
    """
    Main training routine specific for this project.

    Builds the model, creates and saves a test tube experiment, then
    trains with a Trainer that is given only the experiment.

    :param hparams: hyperparameter object handed to the model; this
        function also reads ``experiment_name`` and
        ``test_tube_save_path`` from it
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT EXP
    # ------------------------
    # init experiment
    # Use the `hparams` argument instead of a module-level `hyperparams`
    # name, which is not defined inside this function.
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """
    Main training routine specific for this project.

    Builds the model, creates and saves a test tube experiment, wires up
    early-stopping and checkpoint callbacks, then trains.

    :param hparams: hyperparameter object handed to the model; this
        function also reads ``experiment_name``, ``test_tube_save_path``
        and ``model_save_path`` from it
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------

    # init experiment
    # Use the `hparams` argument instead of a module-level `hyperparams`
    # name, which is not defined inside this function.
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        description='test demo'
    )

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    # checkpoints are grouped by experiment name and version
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version)
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        verbose=True,
        mode='max'
    )

    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_best_only=True,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
Beispiel #6
0
def main(hparams):
    """
    Run one training job described by ``hparams``.

    Creates and saves a test tube experiment, builds the template model
    and fits it with a Trainer that is given only the experiment.

    :param hparams: hyperparameter object handed to the model; this
        function reads ``tt_name``, ``debug``, ``tt_save_path``,
        ``hpc_exp_number`` and ``tt_description`` from it
    :return: None
    """
    # gather the experiment settings first, then create the experiment
    experiment_settings = dict(
        name=hparams.tt_name,
        debug=hparams.debug,
        save_dir=hparams.tt_save_path,
        version=hparams.hpc_exp_number,
        autosave=False,
        description=hparams.tt_description,
    )
    experiment = Experiment(**experiment_settings)

    # record the hyperparameters with the experiment and persist it
    experiment.argparse(hparams)
    experiment.save()

    # model first, trainer second, then run training
    lightning_model = LightningTemplateModel(hparams)
    runner = Trainer(experiment=experiment)
    runner.fit(lightning_model)
Beispiel #7
0
def main(hparams):
    """
    Main training routine specific for this project.

    Creates and saves a test tube experiment, builds the model, wires up
    early-stopping and checkpoint callbacks on ``val_acc``, then trains.

    :param hparams: hyperparameter object handed to the model; this
        function also reads ``tt_name``, ``debug``, ``tt_save_path``,
        ``hpc_exp_number``, ``tt_description`` and ``model_save_path``
        from it
    :return: None
    """
    # init experiment
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     save_dir=hparams.tt_save_path,
                     version=hparams.hpc_exp_number,
                     autosave=False,
                     description=hparams.tt_description)

    exp.argparse(hparams)
    exp.save()

    # build model
    model = LightningTemplateModel(hparams)

    # callbacks
    # `val_acc` is an accuracy, so higher is better: use mode='max'.
    # With mode='min' training would stop on (and keep) the worst accuracy.
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        mode='max',
        verbose=True,
    )

    # checkpoints are grouped by experiment name and version
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name,
                                        exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_acc',
                                 mode='max')

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # train model
    trainer.fit(model)
Beispiel #8
0
def main(hparams, cluster):
    """
    Main training routine specific for this project.

    Builds the model, creates and saves a test tube experiment whose
    version is ``hparams.hpc_exp_number``, then trains across the
    configured GPUs and nodes.

    :param hparams: hyperparameter object handed to the model; this
        function also reads ``experiment_name``, ``test_tube_save_path``,
        ``hpc_exp_number``, ``per_experiment_nb_gpus``, ``nb_gpu_nodes``
        and ``distributed_backend`` from it
    :param cluster: accepted for interface compatibility; not used
        inside this function
    :return: None
    :raises KeyError: if the ``SLURM_NODEID`` environment variable is unset
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version; stagger the start by
    # a node-dependent delay to avoid that
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    # All settings come from the `hparams` argument (not from a module-level
    # `hyperparams` name) so the function uses the values it was called with.
    exp = Experiment(
        name=hparams.experiment_name,
        save_dir=hparams.test_tube_save_path,
        autosave=False,
        version=hparams.hpc_exp_number,  # match the slurm job version number
        description='test demo')

    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      gpus=hparams.per_experiment_nb_gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes,
                      distributed_backend=hparams.distributed_backend)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
Beispiel #9
0
                               default=test_tube_dir,
                               help='where to save logs')
    parent_parser.add_argument('--slurm_log_path',
                               type=str,
                               default=slurm_out_dir,
                               help='where to save slurm meta')
    parent_parser.add_argument('--model_save_path',
                               type=str,
                               default=checkpoint_dir,
                               help='where to save model')
    parent_parser.add_argument('--experiment_name',
                               type=str,
                               default='pt_lightning_exp_a',
                               help='test tube exp name')
    parent_parser.add_argument('--nb_hopt_trials',
                               type=int,
                               default=1,
                               help='how many grid search trials to run')

    # allow model to overwrite or extend args
    parser = LightningTemplateModel.add_model_specific_args(
        parent_parser, root_dir)
    hyperparams = parser.parse_args()

    # ---------------------
    # RUN TRAINING
    # ---------------------
    # run on HPC cluster
    print('RUNNING ON SLURM CLUSTER')
    optimize_on_cluster(hyperparams)