Example No. 1
def train_tune(hparams, rdm):

    model = get_model(hparams)

    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                               name="",
                               version=".",
                               default_hp_metric=False)
    logger.log_hyperparams(
        hparams, {
            'train_acc': 0,
            'train_f1': 0,
            'train_loss': 0,
            'valid_acc': 0,
            'valid_f1': 0,
            'valid_loss': 0,
        })

    trainer = pl.Trainer(max_epochs=hparams['n_epochs'],
                         gpus=1,
                         logger=logger,
                         progress_bar_refresh_rate=0,
                         callbacks=[
                             TuneReportCallback(
                                 ['valid_acc', 'valid_f1', 'valid_loss'],
                                 on="validation_end")
                         ])
    trainer.fit(model, rdm)
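A trainable like train_tune above is usually handed to tune.run. The following is a minimal launch sketch, assuming rdm is an existing LightningDataModule and that the search space covers the keys the function reads (all names below are illustrative, not from the original source):

from ray import tune

search_space = {"n_epochs": tune.choice([5, 10])}
analysis = tune.run(
    tune.with_parameters(train_tune, rdm=rdm),  # forwards the datamodule to every trial
    config=search_space,
    metric="valid_loss",  # reported by the TuneReportCallback above
    mode="min",
    num_samples=4,
    resources_per_trial={"cpu": 2, "gpu": 1},  # matches gpus=1 in the Trainer
)
print("Best hyperparameters:", analysis.best_config)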
Example No. 2
def train_model(config=None):
    trainer = pl.Trainer(
        default_root_dir="/home/nuoc/Documents/MEX/src/version_0.2/checkpoints",
        gpus=1, precision=16,
        min_epochs=20,
        max_epochs=MAX_EPOCHS,
        callbacks=[TuneReportCallback({"loss": "avg_val_loss", }, on="validation_end")],
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        stochastic_weight_avg=True
    )
    feature_dims2 = {
        "phase_dim": phase_dim,
        "pose_dim": pose_dim,
        "cost_dim": cost_dim,
        "g_input_dim": config["k"] + config["cost_hidden_dim"],
        "g_output_dim": phase_dim + config["k"] + cost_dim
    }

    in_slice = [phase_dim, pose_dim, cost_dim]
    out_slice = [phase_dim, config["k"], cost_dim]

    pose_encoder = MLP(config=config, dimensions=[pose_dim])
    model = MotionGenerationModel(config=config, Model=MoE, pose_autoencoder=pose_encoder,
                                  feature_dims=feature_dims2,
                                  input_slicers=in_slice, output_slicers=out_slice,
                                  train_set=train_set, val_set=val_set, test_set=test_set,
                                  name=model_name)

    trainer.fit(model)
Example No. 3
def train_model(config, gpus, w2v, num_epochs=10):
    early_stop_callback = EarlyStopping(monitor="val_Accuracy",
                                        min_delta=0.0,
                                        patience=5,
                                        verbose=True,
                                        mode="max")  # accuracy is monitored, so mode must be "max", not "min"
    checkpoint_callback = ModelCheckpoint(
        "models/wav2vec_kws/",
        save_top_k=1,
        verbose=True,
        monitor='val_Accuracy',
        mode='max',  # keep the checkpoint with the highest accuracy
    )
    tune_callback = TuneReportCallback({"acc": "val_Accuracy"},
                                       on="validation_end")
    logger = TensorBoardLogger("tb_logs", name="wav2vec_kws_tune")
    mlf_logger = MLFlowLogger(experiment_name="wav2vec_kws",
                              tracking_uri="http://192.168.0.32")
    mlflow.pytorch.autolog()
    trainer = pl.Trainer(
        gpus=gpus,
        callbacks=[checkpoint_callback, early_stop_callback, tune_callback],
        logger=[logger, mlf_logger],
        accumulate_grad_batches=4,
        amp_level="O0",
        max_epochs=num_epochs,
        progress_bar_refresh_rate=1,
        log_every_n_steps=1,
        flush_logs_every_n_steps=1)
    config = argparse.Namespace(**config)
    model = Wav2VecKWS(config, w2v)
    trainer.fit(model)
Example No. 4
def myTrain(config, num_epochs, sym01, sym02, period):
    p = dict(
        seq_len = config['seq_len'],
        batch_size = config['batch_size'],
        criterion = nn.MSELoss(),
        max_epochs = num_epochs,
        n_features = 3,
        hidden_size = config['hidden_size'],
        num_layers = config['num_layers'],
        dropout = config['dropout'],
        learning_rate = config['lr']
    )
    print("myTrain parameters:",sym01,sym02,period)

    seed_everything(1)

    csv_logger = CSVLogger('./', name='lstm', version='0')  # trailing comma removed: it turned the logger into a one-element tuple
    metrics = {"loss": "ptl/val_loss"}
    trainer = Trainer(
        max_epochs=p['max_epochs'],
        logger=csv_logger,
        callbacks=[TuneReportCallback(metrics, on="validation_end")]
        #gpus=1,
        #row_log_interval=1,
        #progress_bar_refresh_rate=2,
    )
    model = LSTMRegressor(
        n_features = p['n_features'],
        hidden_size = p['hidden_size'],
        seq_len = p['seq_len'],
        batch_size = p['batch_size'],
        criterion = p['criterion'],
        num_layers = p['num_layers'],
        dropout = p['dropout'],
        learning_rate = p['learning_rate']
    )

    dm = MyDataModule(
        sym01=sym01,
        sym02=sym02,
        period=period,
        seq_len = p['seq_len'],
        batch_size = p['batch_size']
    )
    dm.reset(
        sym01=sym01,
        sym02=sym02,
        period=period,
        seq_len = p['seq_len'],
        batch_size = p['batch_size']
    )
    dm.setup('test')
    trainer.fit(model, dm)
    testresult = trainer.test(model, datamodule=dm)
    trainer.save_checkpoint(sym01+"-lstm.ckpt")
    print(testresult)
    testresult = testresult[0]
    print(testresult['val_loss'])
    return model,testresult['val_loss']
Example No. 5
def tune_main(hparams, num_epochs=15, num_gpus=0):

    print(hparams)

    mean, std, traindir, valdir, num_classes = choose_dataset('cifar10')
    traindir = '/home/jovyan/work/cv_data/cifar10/train'
    valdir = '/home/jovyan/work/cv_data/cifar10/test'
    hparams['num_classes'] = num_classes

    train_logger.info('Training Directory: {0}'.format(traindir))

    model = LightningModel(hparams)

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        #distributed_backend=hparams.distributed_backend,
        precision=32,
        #early_stop_callback=early_stop_callback,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "val_loss_epoch",
                    "accuracy": "val_acc_epoch"
                },
                on="validation_end"
            )
        ])

    normal_pipe = BasicPipe(hparams, traindir, valdir, mean, std)
    trainer.fit(model, normal_pipe)
Example No. 6
def tunerun(config):
    RATIO = config['ratio']
    DAT_RAT, DAT_TOT = config['thresholds']
    TRAIN_BATCH_SIZE = config['train_batch_size']
    VAL_BATCH_SIZE = config['val_batch_size']
    TEST_BATCH_SIZE = config['test_batch_size']

    LEARNING_RATE = config['lr']
    MAX_EPOCHS = 30
    WEIGHT_DECAY = config['decay']

    #DATA_PATH = "/bigtemp/rm5tx/nlp_project/data_cache/"
    DATA_PATH = os.path.expanduser("~/data_cache/")
    # DATA_ADJACENT_PATH = os.path.expanduser("~/data_adjacent_cache/")

    MAX_LEN = config['max_len']
    ADJACENT = True

    data = ProjData(max_len=MAX_LEN,
                    ratio=RATIO,
                    adjacent=ADJACENT,
                    adjrat=DAT_RAT,
                    adjtot=DAT_TOT)
    if ADJACENT:
        MODEL_NAME = 'nlp_proj_adjacent' + str(MAX_LEN)
    else:
        MODEL_NAME = 'nlp_proj_norm' + str(MAX_LEN)

    try:
        data.load(DATA_PATH)
        print("Loaded Saved Data")
    except Exception as e:
        print(e)
        data.setup()
        data.save(DATA_PATH)

    model = ProjModel(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # logger = TensorBoardLogger(os.path.expanduser("~/tb_logs"), name=MODEL_NAME)#
    # checkpoint_callback = ModelCheckpoint(monitor='valid_loss',
    #                                         dirpath=os.path.expanduser("~/saved_models"),
    #                                         save_last=True,
    #                                         filename=MODEL_NAME + '-{epoch:02d}-{val_loss:.2f}',)
    tune_callback = TuneReportCallback({"avg_acc": "avg_acc"}, on="validation_end")  # renamed so it does not shadow the ray "tune" module
    # earlystopping = EarlyStopping(monitor='avg_acc', verbose=True, patience=0)

    trainer = pl.Trainer(
        progress_bar_refresh_rate=100,  # logger=logger,
        accelerator='ddp',  # jupyter can't use ddp, use dp instead
        # effective batch size is batch_size * num_gpus * num_nodes
        gpus=1,
        gradient_clip_val=1.0,
        max_epochs=MAX_EPOCHS,
        fast_dev_run=False,
        callbacks=[tune_callback])
    trainer.fit(model, data.train_dataloader(batch_size=TRAIN_BATCH_SIZE),
                data.val_dataloader(batch_size=VAL_BATCH_SIZE))
Example No. 7
def train(config):
    module = _MockModule(10, 20)
    trainer = pl.Trainer(max_epochs=1,
                         callbacks=[
                             TuneReportCallback(
                                 {"tune_loss": "avg_val_loss"},
                                 on="validation_end")
                         ])
    trainer.fit(module)
Example No. 8
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    model = LightningMNISTClassifier(config, data_dir)
    dm = MNISTDataModule(
        data_dir=data_dir, num_workers=1, batch_size=config["batch_size"])
    metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        progress_bar_refresh_rate=0,
        callbacks=[TuneReportCallback(metrics, on="validation_end")])
    trainer.fit(model, dm)
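The metric dict passed to TuneReportCallback maps report names to keys the LightningModule must actually log. A minimal sketch of the logging side, assuming validation_step returns dicts containing "val_loss" and "val_acc" (illustrative; it mirrors the standard Ray Tune + Lightning tutorial rather than this exact model):

import torch

def validation_epoch_end(self, outputs):  # method on the LightningModule
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    avg_acc = torch.stack([x["val_acc"] for x in outputs]).mean()
    # Log the exact source keys the mapping above looks up.
    self.log("ptl/val_loss", avg_loss)
    self.log("ptl/val_accuracy", avg_acc)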
Example No. 9
def train_TFT_tune(config, num_epochs=num_epochs):
    m = TFT(**config, epochs=num_epochs)
    train_loader, val_loader, model = m._hyperparameter_optimization(df, freq)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        progress_bar_refresh_rate=0,
        num_sanity_val_steps=0,
        callbacks=[TuneReportCallback({"loss": "val_loss"}, on="validation_end")],  # Trainer expects a list of callbacks
        checkpoint_callback=False,
        logger=False,
    )
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
Example No. 10
def train_model(config=None):
    trainer = pl.Trainer(
        default_root_dir="/home/nuoc/Documents/MEX/src/version_0.2/checkpoints",
        gpus=1, precision=16,
        min_epochs=20,
        max_epochs=MAX_EPOCHS,
        callbacks=[TuneReportCallback({"loss": "avg_val_loss", }, on="validation_end")],
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        stochastic_weight_avg=True
    )
    model = VAE(config=config, input_dims=[pose_dim], name=model_name,
                train_set=train_set, val_set=val_set, test_set=test_set)

    trainer.fit(model)
Example No. 11
def train_mnist_tune(config, num_epochs=10, num_gpus=0):
    data_dir = os.path.abspath("./data")
    model = LightningMNISTClassifier(config, data_dir)
    with FileLock(os.path.expanduser("~/.data.lock")):
        dm = MNISTDataModule(data_dir=data_dir,
                             num_workers=1,
                             batch_size=config["batch_size"])
    metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        progress_bar_refresh_rate=0,
        callbacks=[TuneReportCallback(metrics, on="validation_end")])
    trainer.fit(model, dm)
Example No. 12
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                on="validation_end")
        ])
    trainer.fit(model)
Example No. 13
def fit(model=None, model_name="model", num_epochs=300, num_gpus=1):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(save_dir="logs/",
                                 name=model_name,
                                 version="0.0"),
        progress_bar_refresh_rate=20,
        callbacks=[
            TuneReportCallback({
                "loss": "avg_val_loss",
            }, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
        precision=16,
    )
    trainer.fit(model)
Example No. 14
def tuning(config=None, MODEL=None, pose_autoencoder=None, cost_dim=None, phase_dim=None,
          input_slices=None, output_slices=None,
          train_set=None, val_set=None, num_epochs=300, model_name="model"):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=1,
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        progress_bar_refresh_rate=5,
        callbacks=[
            TuneReportCallback({"loss": "avg_val_loss", }, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
    )
    model = MODEL(config=config, pose_autoencoder=pose_autoencoder,
                  cost_input_dimension=cost_dim, phase_dim=phase_dim,
                  input_slicers=input_slices, output_slicers=output_slices,
                  train_set=train_set, val_set=val_set, name=model_name)

    trainer.fit(model)
Example No. 15
def train_model_tune(config, data_dir=None, num_epochs=10, num_gpus=1):
    model = CNN(config)
    image_module = MyImageModule(dataset_size=100, batch_size=32)
    image_module.setup()
    logger = TensorBoardLogger('tb_logs', name='Model_prueba_tuning')
    trainer = pl.Trainer(max_epochs=num_epochs,
                         gpus=num_gpus,
                         logger=logger,
                         deterministic=True,
                         callbacks=[
                             TuneReportCallback(
                                 {
                                     "loss": "ptl/val_loss",
                                     "mean_accuracy": "ptl/val_accuracy"
                                 },
                                 on="validation_end")
                         ])
    trainer.fit(model, datamodule=image_module)
Example No. 16
def train_mnist_tune(config, num_epochs=10, num_gpus=0, data_dir="~/data"):
    data_dir = os.path.expanduser(data_dir)
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=os.getcwd(), name="", version="."),
        enable_progress_bar=False,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                on="validation_end")
        ])
    trainer.fit(model)
Example No. 17
def _default_trainable(config, checkpoint_dir=None):
    """A default trainable function used by `tune.run`

    It performs the most straightforward training loop with the provided `config`:
    - Create the pipeline (optionally with a provided vocab)
    - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
    - Execute the training
    """
    if config["silence"]:
        logging.getLogger("biome.text").setLevel(logging.ERROR)

    pipeline = Pipeline.from_config(config["pipeline_config"])

    trainer_config = TrainerConfiguration(**config["trainer_config"])

    vocab_config = config["vocab_config"]
    if vocab_config:
        vocab_config = VocabularyConfiguration(**vocab_config)

    callbacks = trainer_config.callbacks
    if not isinstance(callbacks, list):
        callbacks = [callbacks]
    if not any(
        isinstance(callback, TuneReportCallback) for callback in callbacks
    ):
        tune_callback = TuneReportCallback(metrics=config["metrics"])
        if trainer_config.callbacks is None:
            trainer_config.callbacks = tune_callback
        else:
            trainer_config.callbacks = callbacks + [tune_callback]

    train_ds = Dataset.load_from_disk(config["train_dataset_path"])
    valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
    train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
    valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_instances,
        valid_dataset=valid_instances,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )
    trainer.fit()
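For reference, a hypothetical tune.run wiring for this trainable; the config keys mirror the ones read inside _default_trainable, and every value below is illustrative:

tune.run(
    _default_trainable,
    config={
        "silence": True,
        "pipeline_config": my_pipeline_config,  # assumed to exist in the caller
        "trainer_config": {"batch_size": 8},
        "vocab_config": None,
        "metrics": ["validation_loss"],  # forwarded to TuneReportCallback
        "train_dataset_path": "path/to/train",  # illustrative paths
        "valid_dataset_path": "path/to/valid",
    },
    num_samples=1,
)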
Example No. 18
def train_mnist_tune(tuning_config, data_dir=None, num_epochs=10, num_gpus=0):
    # Only Training
    model = LightningMNISTClassifier(tuning_config, data_dir)

    # ===============================================================================
    # Callback
    # ===============================================================================
    from pytorch_lightning.callbacks import ModelCheckpoint
    from pytorch_lightning.callbacks import EarlyStopping

    early_stop_cb = EarlyStopping(monitor='ptl/val_loss',
                                  patience=5,
                                  verbose=True,
                                  mode='min')

    ckpt_cb = ModelCheckpoint(tune.get_trial_dir() + '/checkpoints',
                              save_top_k=5,
                              verbose=True,
                              monitor='ptl/val_loss',
                              mode='min',
                              save_last=True,
                              filename='model_{epoch:03d}-{step}')

    tune_rp_cb = TuneReportCallback(
        {
            "val_loss": "ptl/val_loss",
            "val_accuracy": "ptl/val_accuracy"
        },
        on="validation_end")

    # ===============================================================================
    # Trainer
    # Note: must keep the default logger, with _default_hp_metric set to False (see below)
    # ===============================================================================
    trainer = pl.Trainer(
        progress_bar_refresh_rate=0,  # 0 means no print progress
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        callbacks=[ckpt_cb, tune_rp_cb, early_stop_cb])
    trainer.logger._default_hp_metric = False  # hp_metric must be False
    trainer.fit(model)
Example No. 19
def tune_train(tune_config: dict, base_arg):
    """
    Wrapper for ray.tune to adjust necessary params
    """
    # note tune config
    arg = deepcopy(base_arg)
    arg.learning_rate = tune_config['learning_rate']
    arg.weight_decay = tune_config['weight_decay']
    arg.batch_size = tune_config['batch_size']
    arg.latent_dim = tune_config['latent_dim']
    arg.does_ray_tuning = True

    cfg = Config(arg)

    main(cfg,
         callbacks=[
             TuneReportCallback(metrics=[
                 "val_lossR",
             ], on="validation_end")
         ])
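A hypothetical launch for this wrapper; the config keys are exactly the ones tune_train copies onto the args, and base_arg is assumed to be the project's parsed argument object:

tune.run(
    tune.with_parameters(tune_train, base_arg=base_arg),
    config={
        "learning_rate": tune.loguniform(1e-5, 1e-2),
        "weight_decay": tune.loguniform(1e-6, 1e-3),
        "batch_size": tune.choice([16, 32]),
        "latent_dim": tune.choice([64, 128]),
    },
    metric="val_lossR",  # reported by the TuneReportCallback above
    mode="min",
)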
Example No. 20
def train_lesion(config, data_dir=None, num_epochs=10, num_gpus=1):
    model = LesionModel(config)
    data_module = LesionDataModule(config)
    logger = TensorBoardLogger('tb_logs', name='my_model')
    metrics = {"iou": "ptl/val_iou"}
    checkpoint_callback = ModelCheckpoint(
        verbose=True,
        monitor='ptl/val_iou',
        mode='max',  ## for val_loss this should be min
        dirpath='check_point_path/',
        filename='lesion-{epoch:02d}-{val_iou:.2f}')

    trainer = pl.Trainer(max_epochs=num_epochs,
                         gpus=num_gpus,
                         progress_bar_refresh_rate=50,
                         logger=logger,
                         automatic_optimization=True,
                         callbacks=[
                             TuneReportCallback(metrics, on="validation_end"),
                             checkpoint_callback
                         ])
    trainer.fit(model, data_module)
Example No. 21
def tune_mnist(data_dir,
               num_samples=10,
               num_epochs=10,
               num_hosts=1,
               num_slots=4,
               use_gpu=False):
    config = {
        "layer_1": tune.choice([32, 64, 128]),
        "layer_2": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    # Add Tune callback.
    metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
    callbacks = [TuneReportCallback(metrics, on="validation_end")]
    trainable = tune.with_parameters(train_mnist,
                                     data_dir=data_dir,
                                     num_epochs=num_epochs,
                                     num_hosts=num_hosts,
                                     num_slots=num_slots,
                                     use_gpu=use_gpu,
                                     callbacks=callbacks)
    analysis = tune.run(
        trainable,
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        resources_per_trial={
            "cpu": 1,
            # Assume 1 cpu per slot.
            "extra_cpu": num_hosts * num_slots,
            # Assume 1 gpu per slot.
            "extra_gpu": num_hosts * num_slots * int(use_gpu)
        },
        name="tune_mnist")

    print("Best hyperparameters found were: ", analysis.best_config)
Example No. 22
def train_tune(config,
               dim1,
               dim2,
               dim3,
               dim4,
               train_set=None,
               val_set=None,
               test_set=None,
               num_epochs=300,
               num_cpus=24,
               num_gpus=1,
               model_name="model"):

    model = MIX(config=config,
                dim1=dim1,
                dim2=dim2,
                dim3=dim3,
                dim4=dim4,
                train_set=train_set,
                val_set=val_set,
                test_set=test_set,
                name=model_name)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(save_dir="logs/",
                                 name=model_name,
                                 version="0.0"),
        progress_bar_refresh_rate=20,
        callbacks=[
            TuneReportCallback({
                "loss": "avg_val_loss",
            }, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
        precision=16,
    )
    trainer.fit(model)
Example No. 23
def fn_tune(config, config_init, num_epochs=10, num_gpus=1):
    gc.collect()
    config_init["batch_size"] = config["batch_size"]
    #config_init["learning_rate"] = config["learning_rate"]
    config_init["hidden_size"] = config["hidden_size"]
    config_init["bidirectional"] = config["bidirectional"]
    config_init["embed_dim"] = config["embed_dim"]
    config_init["num_layers"] = config["num_layers"]
    args = parser.parse_args(serialize_config(config_init))

    fx_dm = RaceDataModule(args, collate_fn)
    fx_dm.setup()
    fx_model = RaceModule(args, batch_fn)
    metrics = {"loss": "val_loss"}
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        limit_train_batches=0.25,
        precision=16,
        gpus=num_gpus,
        progress_bar_refresh_rate=100,
        callbacks=[TuneReportCallback(metrics, on="validation_end")])

    trainer.fit(fx_model, fx_dm)
Example No. 24
def tune_model(config, ptl_model, dset, train_inds, n_workers, n_val = None, 
               val_inds = None, tune_metrics = None, mode = 'tune', **trainer_kwargs):
    ''' A generic function to hp-tuning and model training with ray and pytorch-lightning '''
    
    model = ptl_model(config = config)
    
    if val_inds is None:
        shuffle(train_inds)

    train_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[n_val:] if val_inds is None else train_inds),
        batch_size = config['batch_size'],
        num_workers = n_workers,
        drop_last = True,
        shuffle = True
    )
    val_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[:n_val] if val_inds is None else val_inds),
        num_workers = n_workers,
        batch_size = config['batch_size'],
        drop_last = True,
        shuffle = False
    )
    
    callbacks = model.callbacks
    if mode == 'tune':
        callbacks += [
            TuneReportCallback(
                tune_metrics, 
                on = 'validation_end'
            )
        ]

    trainer = PLTrainer(callbacks = callbacks, **trainer_kwargs)
    trainer.fit(model, train_dl, val_dl)
    
    return trainer
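Because tune_model receives the model class, dataset, and trainer arguments explicitly, it can be launched with tune.with_parameters. A sketch under the assumption that MyModel, my_dataset, and a logged "val_loss" metric exist in the surrounding project:

trainable = tune.with_parameters(
    tune_model,
    ptl_model=MyModel,  # assumed LightningModule class
    dset=my_dataset,  # assumed torch Dataset
    train_inds=list(range(len(my_dataset))),
    n_workers=4,
    n_val=1000,
    tune_metrics={"loss": "val_loss"},  # assumes the model logs val_loss
    mode="tune",
    max_epochs=10,  # forwarded to PLTrainer via **trainer_kwargs
)
tune.run(trainable, config={"batch_size": tune.choice([32, 64])},
         metric="loss", mode="min", num_samples=8)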
Example No. 25
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from runners.STL_runner import STLRunner, create_checkpoint_callback
from runners.MTL_runner import MTLRunner
from argparse import ArgumentParser
from functools import partial

parser = ArgumentParser(description="A multitask learner")
parser.add_argument("model_type", choices=["mtl", "stl"], help="")

tune_config = {
    "hidden_layers": tune.choice([[1, 2], [32, 64]]),
    "activations": tune.choice([["relu", "relu", "relu"],
                                ["sigmoid", "sigmoid", "sigmoid"]])
}
callback = TuneReportCallback({"loss": "loss_validate"}, on="validation_end")


def run(self, learner, max_epochs=10, callbacks=None):
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        logger=learner.logger,
        checkpoint_callback=create_checkpoint_callback(
            learner.checkpoints_prefix),
        callbacks=callbacks,
    )
    trainer.fit(learner.model, learner.data_module)
    trainer.test(learner.model, datamodule=learner.data_module)


def main():
Example No. 26
    # CREATE CHECKPOINTS DIR
    checkpoint_dir = f'checkpoints/{experiment_name}'
    os.makedirs(checkpoint_dir)

    # RUN TRAINER
    trainer = pl.Trainer(
        logger=logger,
        log_every_n_steps=1,
        max_epochs=10,
        val_check_interval=0.05,  # for dev
        progress_bar_refresh_rate=1,
        callbacks=[
            TuneReportCallback(metrics={
                "loss": "val_loss",
            },
                               on="validation_end"),
            TuneReportCheckpointCallback(
                metrics={"loss": "val_loss"},
                filename=f"{checkpoint_dir}/latest_checkpoint.ckpt",  # TODO edit callback so that it saves history of checkpoints and make PR to ray[tune]
                on="validation_end"),
            SampleReconstructionCallback(loader=val_loader)
        ])
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)
    test_result = trainer.test(model=model,
                               test_dataloaders=test_loader,
                               verbose=True)
Example No. 27
        # the following values are the results of fine-tuning
        "lr": tune.choice([1.57513e-05]),
        "batch_size": tune.choice([32]),
        "max_seq_length": tune.choice([48]),
        "hidden_dropout_prob": tune.choice([.1]),
        "hidden_act": tune.choice(["gelu"]),
        "architecture": {
            "tokenizer": BertTokenizer,
            "model": BertForMultipleChoice,
            "pretrained_model_name": "bert-base-uncased"
        }
    }

    ray_tune_callback = TuneReportCallback(
        {
            "loss": "val_epoch_loss",
            "acc": "val_epoch_acc"
        },
        on="validation_end")

    logger = TensorBoardLogger('tb_logs/', name='csqa')
    # tr = train_tune(config, logger, epochs=1, gpus=1)

    analysis = tune.run(
        partial(
            train_tune, logger=logger, epochs=3, gpus=config["use_gpu"],
        ),
        config=rt_config,
        num_samples=1,
        resources_per_trial={
            "cpu": 10,
            "gpu": config["use_gpu"],
Example No. 28
from functools import partial
from argparse import ArgumentParser

import torch
import pytorch_lightning as lit
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray import tune

from .lightningreapp import LightningReapp

parser = ArgumentParser()

callback_tuner = TuneReportCallback(
    {
        "loss": "val_loss",
        # "mean_accuracy": "val_accuracy"
    },
    on="validation_end",
)

default_tune_config = {
    "lr": tune.loguniform(1e-4, 1e-1),  # loguniform samples by magnitude
    "hidden_layer_size": tune.quniform(10, 50, 1)
}


### TUNING HYPERPARAMETERS
def train_tune(config, **tuner_kwargs):
    model = LightningReapp(config)

    max_epochs = tuner_kwargs.get('max_epochs', 10)
Example No. 29
def trainable(config, train_loader, val_loader, test_loader):

    input_size = 28
    ae_arch = architecture.get_ae_architecture(input_size=input_size,
                                               latent_dim=128)

    # model = ConvAutoencoder(**{**ae_arch, 'verbose': True})
    model = ConvAutoencoder(
        **{
            **ae_arch, 'optimizer_name': config['optimizer_name'],
            'lr': config['lr']
        })
    model.logdir = 'ConvAutoencoder'

    model.set_latent(input_size)
    # print('model latent dim:', model.latent_size)

    config_str = json.dumps({
        **config, 'channels': ae_arch['encoder_channels'],
        'stride': ae_arch['encoder_stride'],
        'kernel_size': ae_arch['encoder_kernel_size'],
        'latent_dim': model.latent_size
    })

    # SET UP LOGGER
    section_name = 'ConvAutoencoder'
    save_dir = f'{os.path.expanduser("~")}/ai-core/Embedder/runs/{section_name}/'
    # save_dir =f'{os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")}/runs/{section_name}/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # print(save_dir)
    # print(__name__)
    # print(__file__)
    experiment_name = f'ConvAutoencoder-{config_str}-{time()}'
    model.experiment_name = experiment_name
    logger = pl.loggers.TensorBoardLogger(
        save_dir=save_dir,
        name=experiment_name,
        default_hp_metric=False,
    )

    # CREATE CHECKPOINTS DIR
    checkpoint_dir = f'checkpoints/{experiment_name}'
    os.makedirs(checkpoint_dir)

    # RUN TRAINER
    trainer = pl.Trainer(
        logger=logger,
        log_every_n_steps=1,
        max_epochs=10,
        val_check_interval=0.05,  # for dev
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(metrics={
                "loss": "val_loss",
            },
                               on="validation_end"),
            TuneReportCheckpointCallback(
                metrics={"loss": "val_loss"},
                filename=f"{checkpoint_dir}/latest_checkpoint.ckpt",  # TODO edit callback so that it saves history of checkpoints and make PR to ray[tune]
                on="validation_end"),
            SampleReconstructionCallback(loader=val_loader)
        ])
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)
    test_result = trainer.test(model=model,
                               test_dataloaders=test_loader,
                               verbose=True)
Example No. 30
def get_trainer_from_cfg(cfg: DictConfig,
                         lightning_module,
                         stopper,
                         profiler: str = None) -> pl.Trainer:
    """Gets a PyTorch Lightning Trainer from a configuration

    Supports:
        automatic batch sizing
        Automatic learning rate finding (experimental)
        Callback instantiation
        Logging, both to disk and with TensorBoard

    Parameters
    ----------
    cfg : DictConfig
        configuration
    lightning_module : pl.LightningModule
        Lightning model to train
    stopper : callable
        Method to stop training. Must be passed so that figuring out batch size does not "count" towards stopping
    profiler : str, optional
        https://pytorch-lightning.readthedocs.io/en/latest/advanced/profiler.html, by default None

    Returns
    -------
    pl.Trainer
        https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html
    """
    steps_per_epoch = cfg.train.steps_per_epoch
    for split in ['train', 'val', 'test']:
        if steps_per_epoch[split] is None:
            steps_per_epoch[split] = 1.0

    # reload_dataloaders_every_epoch = True: a bit slower, but enables validation dataloader to get the new, automatic
    # learning rate schedule.

    if cfg.compute.batch_size == 'auto' or cfg.train.lr == 'auto':
        trainer = pl.Trainer(gpus=[cfg.compute.gpu_id],
                             precision=16 if cfg.compute.fp16 else 32,
                             limit_train_batches=1.0,
                             limit_val_batches=1.0,
                             limit_test_batches=1.0,
                             num_sanity_val_steps=0)
        # callbacks=[ExampleImagesCallback()])
        tmp_metrics = lightning_module.metrics
        tmp_workers = lightning_module.hparams.compute.num_workers
        # visualize_examples = lightning_module.visualize_examples

        if lightning_module.model_type != 'sequence':
            # there is a somewhat common error that VRAM will be maximized by the gpu-auto-tuner.
            # However, during training, we probabilistically sample colorspace transforms; in an "unlucky"
            # batch, perhaps all of the training samples are converted to HSV, hue and saturation changed, then changed
            # back. This is rare enough to not be encountered in "auto-tuning," so we'll get a train-time error. BAD!
            # so, we crank up the colorspace augmentation probability, then pick batch size, then change it back
            original_gpu_transforms = deepcopy(lightning_module.gpu_transforms)

            log.debug('orig: {}'.format(lightning_module.gpu_transforms))

            original_augs = cfg.augs
            new_augs = deepcopy(cfg.augs)
            new_augs.color_p = 1.0

            arch = lightning_module.hparams[lightning_module.model_type].arch
            mode = '3d' if '3d' in arch.lower() else '2d'
            gpu_transforms = get_gpu_transforms(new_augs, mode)
            lightning_module.gpu_transforms = gpu_transforms
            log.debug('new: {}'.format(lightning_module.gpu_transforms))

        tuner = pl.tuner.tuning.Tuner(trainer)
        # hack for lightning to find the batch size
        cfg.batch_size = 2  # to start

        empty_metrics = EmptyMetrics()
        # don't store metrics when batch size finding
        lightning_module.metrics = empty_metrics
        # don't visualize our model inputs when batch size finding
        # lightning_module.visualize_examples = False
        should_viz = cfg.train.viz_examples
        lightning_module.hparams.train.viz_examples = 0
        # dramatically reduces RAM usage by this process
        lightning_module.hparams.compute.num_workers = min(tmp_workers, 1)
        if cfg.compute.batch_size == 'auto':
            max_trials = int(math.log2(cfg.compute.max_batch_size)) - int(
                math.log2(cfg.compute.min_batch_size))
            log.info('max trials: {}'.format(max_trials))
            new_batch_size = trainer.tuner.scale_batch_size(
                lightning_module,
                mode='power',
                steps_per_trial=30,
                init_val=cfg.compute.min_batch_size,
                max_trials=max_trials)
            cfg.compute.batch_size = new_batch_size
            log.info('auto-tuned batch size: {}'.format(new_batch_size))
        if cfg.train.lr == 'auto':
            lr_finder = trainer.tuner.lr_find(lightning_module,
                                              early_stop_threshold=None,
                                              min_lr=1e-6,
                                              max_lr=10.0)
            # log.info(lr_finder.results)
            plt.style.use('seaborn')
            fig = lr_finder.plot(suggest=True, show=False)
            viz.save_figure(fig, 'auto_lr_finder', False, 0, overwrite=False)
            plt.close(fig)
            new_lr = lr_finder.suggestion()
            log.info('auto-tuned learning rate: {}'.format(new_lr))
            cfg.train.lr = new_lr
            lightning_module.lr = new_lr
            lightning_module.hparams.lr = new_lr
        del trainer, tuner
        #  restore lightning module to original state
        lightning_module.hparams.train.viz_examples = should_viz
        lightning_module.metrics = tmp_metrics
        lightning_module.hparams.compute.num_workers = tmp_workers
        if lightning_module.model_type != 'sequence':
            lightning_module.gpu_transforms = original_gpu_transforms
            log.debug('reverted: {}'.format(lightning_module.gpu_transforms))

    key_metric = lightning_module.metrics.key_metric
    mode = 'min' if 'loss' in key_metric else 'max'
    monitor = f'val/{key_metric}'
    dirpath = os.path.join(cfg.run.dir, 'lightning_checkpoints')
    callback_list = [
        FPSCallback(),
        MetricsCallback(),
        ExampleImagesCallback(),
        CheckpointCallback(),
        StopperCallback(stopper),
        pl.callbacks.ModelCheckpoint(dirpath=dirpath,
                                     save_top_k=1,
                                     save_last=True,
                                     mode=mode,
                                     monitor=monitor,
                                     save_weights_only=True)
    ]
    if 'tune' in cfg and cfg.tune.use and ray:
        callback_list.append(
            TuneReportCallback(OmegaConf.to_container(cfg.tune.metrics),
                               on='validation_end'))
        # https://docs.ray.io/en/master/tune/tutorials/tune-pytorch-lightning.html
        tensorboard_logger = pl.loggers.tensorboard.TensorBoardLogger(
            save_dir=get_trial_dir(),
            name="",
            version=".",
            default_hp_metric=False)
        refresh_rate = 0
    else:
        tensorboard_logger = pl.loggers.tensorboard.TensorBoardLogger(
            os.getcwd())
        refresh_rate = 1

    # tuning messes with the callbacks
    trainer = pl.Trainer(gpus=[cfg.compute.gpu_id],
                         precision=16 if cfg.compute.fp16 else 32,
                         limit_train_batches=steps_per_epoch['train'],
                         limit_val_batches=steps_per_epoch['val'],
                         limit_test_batches=steps_per_epoch['test'],
                         logger=tensorboard_logger,
                         max_epochs=cfg.train.num_epochs,
                         num_sanity_val_steps=0,
                         callbacks=callback_list,
                         reload_dataloaders_every_epoch=True,
                         progress_bar_refresh_rate=refresh_rate,
                         profiler=profiler)
    torch.cuda.empty_cache()
    # gc.collect()

    # import signal
    # signal.signal(signal.SIGTERM, signal.SIG_DFL)
    # log.info('trainer is_slurm_managing_tasks: {}'.format(trainer.is_slurm_managing_tasks))
    return trainer
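A hypothetical call, assuming an OmegaConf config that carries the fields read on the non-auto path (batch size and learning rate fixed so the auto-tuning branch is skipped) and a lightning_module and stopper supplied by the surrounding project:

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "train": {
        "steps_per_epoch": {"train": None, "val": None, "test": None},
        "lr": 1e-4,  # anything but 'auto' skips the lr finder
        "num_epochs": 20,
    },
    "compute": {"gpu_id": 0, "fp16": False, "batch_size": 32},  # not 'auto'
    "run": {"dir": "runs/exp0"},
})
trainer = get_trainer_from_cfg(cfg, lightning_module, stopper)
trainer.fit(lightning_module)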