def main(hparams, cluster, results_dict):
    """
    Main training routine specific for this project (cluster variant).

    Builds the model, creates and saves a test tube experiment, sets up the
    early-stopping and checkpoint callbacks, then fits the model with a
    Trainer that is given the cluster object.

    All settings are read from the ``hparams`` argument rather than from a
    module-level global, so the function uses exactly the hyperparameters
    it was called with.

    :param hparams: parsed hyperparameters; this routine reads
        ``experiment_name``, ``test_tube_save_path``, ``model_save_path``,
        ``gpus`` and ``nb_gpu_nodes`` from it
    :param cluster: cluster object forwarded unchanged to the Trainer
    :param results_dict: not used by this routine; kept so the call
        signature stays the same for callers
    :return: None
    :raises KeyError: if ``SLURM_NODEID`` is not set in the environment
    :raises ValueError: if ``SLURM_NODEID`` is not an integer string
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # when using grid search, it's possible for all models to start at once
    # and use the same test tube experiment version.
    # Stagger start-up by a node-dependent delay (node id + 1 seconds).
    relative_node_id = int(os.environ['SLURM_NODEID'])
    sleep(relative_node_id + 1)

    # init experiment
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    # record the hyperparameters with the experiment and persist it
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    # checkpoints go under <model_save_path>/<experiment name>/<version>
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name,
                                        exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(experiment=exp,
                      cluster=cluster,
                      checkpoint_callback=checkpoint,
                      early_stop_callback=early_stop,
                      gpus=hparams.gpus,
                      nb_gpu_nodes=hparams.nb_gpu_nodes)

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
# Example #2
0
def main(hparams):
    """
    Main training routine specific for this project (single-node variant).

    Builds the model, creates and saves a test tube experiment, sets up the
    early-stopping and checkpoint callbacks, then fits the model.

    All settings are read from the ``hparams`` argument rather than from a
    module-level global, so the function uses exactly the hyperparameters
    it was called with.

    :param hparams: parsed hyperparameters; this routine reads
        ``experiment_name``, ``test_tube_save_path``, ``model_save_path``
        and ``gpus`` from it
    :return: None
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    print('loading model...')
    model = LightningTemplateModel(hparams)
    print('model built')

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------

    # init experiment
    exp = Experiment(name=hparams.experiment_name,
                     save_dir=hparams.test_tube_save_path,
                     autosave=False,
                     description='test demo')

    # record the hyperparameters with the experiment and persist it
    exp.argparse(hparams)
    exp.save()

    # ------------------------
    # 3 DEFINE CALLBACKS
    # ------------------------
    # checkpoints go under <model_save_path>/<experiment name>/<version>
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name,
                                        exp.version)
    early_stop = EarlyStopping(monitor='val_acc',
                               patience=3,
                               verbose=True,
                               mode='max')

    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_loss',
                                 mode='min')

    # ------------------------
    # 4 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
        gpus=hparams.gpus,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model)
def main(hparams):
    """
    Main training routine specific for this project.

    Creates and saves a test tube experiment, builds the model, sets up the
    early-stopping and checkpoint callbacks, then fits the model with a
    default-configured Trainer.

    :param hparams: parsed hyperparameters; this routine reads ``tt_name``,
        ``debug``, ``tt_save_path``, ``hpc_exp_number``, ``tt_description``
        and ``model_save_path`` from it
    :return: None
    """
    # init experiment
    exp = Experiment(name=hparams.tt_name,
                     debug=hparams.debug,
                     save_dir=hparams.tt_save_path,
                     version=hparams.hpc_exp_number,
                     autosave=False,
                     description=hparams.tt_description)

    # record the hyperparameters with the experiment and persist it
    exp.argparse(hparams)
    exp.save()

    # build model
    model = LightningTemplateModel(hparams)

    # callbacks
    # NOTE(review): 'val_acc' is presumably an accuracy, where higher is
    # better, so improvement must be tracked with mode='max'. With 'min',
    # a drop in accuracy would count as progress.
    early_stop = EarlyStopping(
        monitor='val_acc',
        patience=3,
        mode='max',
        verbose=True,
    )

    # checkpoints go under <model_save_path>/<experiment name>/<version>
    model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name,
                                        exp.version)
    checkpoint = ModelCheckpoint(filepath=model_save_path,
                                 save_best_only=True,
                                 verbose=True,
                                 monitor='val_acc',
                                 mode='max')

    # configure trainer
    trainer = Trainer(
        experiment=exp,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )

    # train model
    trainer.fit(model)
    # NOTE(review): this is a fragment of a script entry section; its
    # enclosing header and the definitions of parent_parser, root_dir,
    # test_tube_dir and checkpoint_dir are not visible here.

    # GPU selection is taken as a string; the default '-1' is described by
    # the help text as "use all the gpus on the node".
    parent_parser.add_argument(
        '--gpus',
        type=str,
        default='-1',
        help=
        'how many gpus to use in the node. -1 uses all the gpus on the node')
    # directory for test tube logs (default comes from test_tube_dir)
    parent_parser.add_argument('--test_tube_save_path',
                               type=str,
                               default=test_tube_dir,
                               help='where to save logs')
    # directory for model checkpoints (default comes from checkpoint_dir)
    parent_parser.add_argument('--model_save_path',
                               type=str,
                               default=checkpoint_dir,
                               help='where to save model')
    # name under which the test tube experiment is created
    parent_parser.add_argument('--experiment_name',
                               type=str,
                               default='pt_lightning_exp_a',
                               help='test tube exp name')

    # allow model to overwrite or extend args
    parser = LightningTemplateModel.add_model_specific_args(
        parent_parser, root_dir)
    hyperparams = parser.parse_args()

    # ---------------------
    # RUN TRAINING
    # ---------------------
    # run directly in this process; the message below reports interactive
    # mode, and main is called with the parsed arguments
    print(f'RUNNING INTERACTIVE MODE ON GPUS. gpu ids: {hyperparams.gpus}')
    main(hyperparams)