def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                num_epochs=10,
                                num_gpus=0,
                                data_dir="~/data"):
    data_dir = os.path.expanduser(data_dir)
    kwargs = {
        "max_epochs": num_epochs,
        # If fractional GPUs passed in, convert to int.
        "gpus": math.ceil(num_gpus),
        "logger": TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        "progress_bar_refresh_rate": 0,
        "callbacks": [
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ]
    }

    if checkpoint_dir:
        kwargs["resume_from_checkpoint"] = os.path.join(
            checkpoint_dir, "checkpoint")

    model = LightningMNISTClassifier(config=config, data_dir=data_dir)
    trainer = pl.Trainer(**kwargs)

    trainer.fit(model)

def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                data_dir=None,
                                num_epochs=10,
                                num_gpus=0):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ])

    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LightningMNISTClassifier._load_model_state(
            ckpt, config=config, data_dir=data_dir)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = LightningMNISTClassifier(config=config, data_dir=data_dir)

    trainer.fit(model)

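Both MNIST trainables above are meant to be handed to `tune.run`, typically wrapped in `tune.with_parameters` so that fixed arguments such as `num_epochs` or `data_dir` travel alongside the sampled `config`. A minimal launch sketch, assuming a hypothetical search space for `LightningMNISTClassifier` and CPU-only trials:

from ray import tune
from ray.tune.schedulers import ASHAScheduler

# Hypothetical search space; the keys must match what the classifier reads from config.
config = {
    "layer_1_size": tune.choice([32, 64, 128]),
    "layer_2_size": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([32, 64, 128]),
}

# Early-stopping scheduler; trials report "loss" through the callback defined above.
scheduler = ASHAScheduler(max_t=10, grace_period=1, reduction_factor=2)

analysis = tune.run(
    tune.with_parameters(
        train_mnist_tune_checkpoint,
        num_epochs=10,
        num_gpus=0,
        data_dir="~/data"),
    resources_per_trial={"cpu": 1, "gpu": 0},
    metric="loss",
    mode="min",
    config=config,
    num_samples=10,
    scheduler=scheduler,
    name="tune_mnist_asha")

print("Best hyperparameters found:", analysis.best_config)
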
def train_tune(config, epochs, resources, checkpoint_dir=None):
    # viz logger
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name=model_name)

    # metric reporter + checkpoint callback
    callback = TuneReportCheckpointCallback(
        metrics=pbt_config['metrics_to_report'])

    # search trainer object
    trainer = pl.Trainer(
        max_epochs=epochs,
        gpus=resources['gpu'],
        logger=logger,
        callbacks=[callback],
        progress_bar_refresh_rate=50,
        precision=16,
    )

    # checkpointing system
    if checkpoint_dir:
        model = network.load_from_checkpoint(
            os.path.join(checkpoint_dir, 'checkpoint'))
    else:
        model = network(config)

    # fits model/data module with current hyperparameter set
    data_module = dm(config)
    trainer.fit(model, datamodule=data_module)

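Because `train_tune` reports the metrics listed in `pbt_config['metrics_to_report']` and restores models from Tune checkpoints, it pairs naturally with a population-based-training scheduler. A sketch under assumptions: the names `pbt_config`, `network`, `dm`, and `model_name` come from the surrounding module, `"val_loss"` is assumed to be among the reported metrics, and the mutation ranges and `pbt_config["search_space"]` key are illustrative only.

from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

resources = {"cpu": 4, "gpu": 1}  # illustrative per-trial resources

# Perturb learning rate and batch size whenever trials are exploited/explored.
pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="val_loss",              # assumed to be in metrics_to_report
    mode="min",
    perturbation_interval=2,
    hyperparam_mutations={
        "lr": tune.loguniform(1e-5, 1e-2),
        "batch_size": [32, 64, 128],
    })

analysis = tune.run(
    tune.with_parameters(train_tune, epochs=20, resources=resources),
    resources_per_trial=resources,
    config=pbt_config["search_space"],  # hypothetical key holding the initial space
    num_samples=8,
    scheduler=pbt,
    keep_checkpoints_num=2,
    checkpoint_score_attr="min-val_loss",
    name="pbt_search")
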
def train_mult(config, checkpoint_dir=None):
    hyp_params.attn_dropout = config["attn_dropout"]
    hyp_params.attn_dropout_a = config["attn_dropout_a"]
    hyp_params.attn_dropout_v = config["attn_dropout_v"]
    hyp_params.embed_dropout = config["embed_dropout"]
    hyp_params.out_dropout = config["out_dropout"]
    hyp_params.relu_dropout = config["relu_dropout"]
    hyp_params.res_dropout = config["res_dropout"]
    # hyp_params.layers = int(config["layers"])
    # hyp_params.num_heads = int(config["num_heads"])
    # hyp_params.project_dim = int(config["num_heads"]) * int(config["head_dim"])
    hyp_params.lr = config["lr"]
    hyp_params.weight_decay = config["weight_decay"]

    comet_logger = CometLogger(
        api_key="cgss7piePhyFPXRw1J2uUEjkQ",
        workspace="transformer",
        project_name=hyp_params.project_name,
        save_dir="logs/comet_ml",
    )
    experiment_key = comet_logger.experiment.get_key()
    csv_logger = CSVLogger("logs/csv", name=experiment_key)

    early_stopping = EarlyStopping(
        monitor="valid_1mae", patience=10, verbose=True, mode="max"
    )
    checkpoint = ModelCheckpoint(save_top_k=1, monitor="valid_1mae", mode="max")
    # tune_reporter = TuneReportCallback(["valid_loss", "valid_1mae"])
    tune_checkpoint_reporter = TuneReportCheckpointCallback(
        metrics=["valid_loss", "valid_1mae"]
    )

    model = MULTModelWarpedAll(hyp_params, early_stopping=early_stopping)
    trainer = pl.Trainer(
        gpus=1,
        max_epochs=hyp_params.num_epochs,
        log_every_n_steps=1,
        callbacks=[early_stopping, checkpoint, tune_checkpoint_reporter],
        logger=[csv_logger, comet_logger],
        limit_train_batches=hyp_params.limit,
        limit_val_batches=hyp_params.limit,
        weights_summary="full",
        weights_save_path="logs/weights",
        progress_bar_refresh_rate=0,
    )

    # Resume model weights and epoch counter from a Tune checkpoint, if given.
    if checkpoint_dir is not None:
        ck = th.load(os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(ck["state_dict"])
        trainer.current_epoch = ck["epoch"]

    trainer.fit(model)

    # Reload the best checkpoint before testing.
    ck = th.load(checkpoint.best_model_path)
    model.load_state_dict(ck["state_dict"])
    trainer.test(model)

def train(config):
    module = _MockModule(10, 20)
    trainer = pl.Trainer(
        max_epochs=1,
        callbacks=[
            TuneReportCheckpointCallback(
                ["avg_val_loss"], "trainer.ckpt", on="validation_end")
        ])
    trainer.fit(module)

def clip_fine_tune(
    config,
    num_epochs,
    num_gpus,
    dataset: pa.Table,
    init_config: CLIPConfig,
    init_state_dict: dict,
    processor: CLIPProcessor,
):
    if "SLURM_NTASKS" in os.environ:
        del os.environ["SLURM_NTASKS"]
    if "SLURM_JOB_NAME" in os.environ:
        del os.environ["SLURM_JOB_NAME"]

    bird_dataset = dataset

    data_mod = MultiModalDataModule(
        dataset=bird_dataset,
        processor=processor,
        test_size=config["test_size"],
        batch_size=config["batch_size"],
        val_batch_size=config["val_batch_size"],
        num_workers=config["num_workers"],
    )

    clip_model = CLIPModel(init_config)
    clip_model.load_state_dict(init_state_dict)
    model = CLIPFineTunedModel(clip_model, **config)

    tune_cbs = [
        TuneReportCheckpointCallback(["val_loss"], on="validation_end")
    ]
    logger = TensorBoardLogger(
        save_dir=tune.get_trial_dir(), name="", version=".")

    trainer = pl.Trainer(
        logger=logger,
        num_sanity_val_steps=0,
        max_epochs=num_epochs,
        gpus=math.ceil(num_gpus),
        progress_bar_refresh_rate=0,
        log_every_n_steps=1,
        callbacks=[LearningRateMonitor(logging_interval="step")] + tune_cbs,
    )

    trainer.validate(model, data_mod)
    trainer.fit(model, data_mod)

    return trainer

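`clip_fine_tune` takes large constant objects (the Arrow table, the pretrained `state_dict`, the processor) in addition to `config`. With Ray Tune these are best bound through `tune.with_parameters`, which places them in the object store once instead of re-serializing them into every trial. A sketch under assumptions: the type hints suggest the Hugging Face `transformers` CLIP classes, the `"openai/clip-vit-base-patch32"` checkpoint, the `bird_table` dataset, and the search-space values are all illustrative.

from ray import tune
from transformers import CLIPModel, CLIPProcessor

# Load the pretrained weights once on the driver (hypothetical checkpoint name).
pretrained = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

search_space = {
    "lr": tune.loguniform(1e-6, 1e-4),
    "batch_size": tune.choice([16, 32]),
    "val_batch_size": 64,
    "test_size": 0.1,
    "num_workers": 4,
}

analysis = tune.run(
    tune.with_parameters(
        clip_fine_tune,
        num_epochs=5,
        num_gpus=1,
        dataset=bird_table,              # a pyarrow.Table prepared elsewhere
        init_config=pretrained.config,
        init_state_dict=pretrained.state_dict(),
        processor=processor),
    resources_per_trial={"cpu": 4, "gpu": 1},
    metric="val_loss",
    mode="min",
    config=search_space,
    num_samples=4,
    name="clip_fine_tune")
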
def trainWithTune(config,
                  checkpoint_dir=None,
                  datamodule=None,
                  num_epochs=10,
                  num_gpus=0):
    trainer = Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "val_loss",
                    "mean_accuracy": "val_acc",
                    "mean_iou": "val_iou",
                },
                filename="checkpoint",
                on="validation_end")
        ])

    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = MMETrainingModule._load_model_state(
            ckpt,
            lr=10**config['log_lr'],
            lrRatio=10**config['log_lrRatio'],
            decay=10**config['log_decay'],
            num_cls=NUM_CLS)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = MMETrainingModule(
            lr=10**config['log_lr'],
            lrRatio=10**config['log_lrRatio'],
            decay=10**config['log_decay'],
            num_cls=NUM_CLS)

    trainer.fit(model, datamodule=datamodule)

def train(config, batch_size, num_epochs=20, num_gpus=0):
    training = dl.loader(55000, batch_size, 0)
    validation = dl.loader(8250, 1, 55000)
    cae = ContractiveAutoEncoder(
        training_dataloader=training,
        val_dataloader=validation,
        config=config)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        auto_select_gpus=True if num_gpus else False,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version='.'),
        stochastic_weight_avg=True,
        benchmark=True,
        callbacks=[
            TuneReportCheckpointCallback(
                {"loss": "val_loss"},
                filename="checkpoint",
                on="validation_end")
        ])
    trainer.fit(cae)

def trainable(config, train_loader, val_loader, test_loader):
    input_size = 28
    ae_arch = architecture.get_ae_architecture(
        input_size=input_size, latent_dim=128)
    # model = ConvAutoencoder(**{**ae_arch, 'verbose': True})
    model = ConvAutoencoder(
        **{
            **ae_arch,
            'optimizer_name': config['optimizer_name'],
            'lr': config['lr']
        })
    model.logdir = 'ConvAutoencoder'
    model.set_latent(input_size)
    # print('model latent dim:', model.latent_size)

    config_str = json.dumps({
        **config,
        'channels': ae_arch['encoder_channels'],
        'stride': ae_arch['encoder_stride'],
        'kernel_size': ae_arch['encoder_kernel_size'],
        'latent_dim': model.latent_size
    })

    # SET UP LOGGER
    section_name = 'ConvAutoencoder'
    save_dir = f'{os.path.expanduser("~")}/ai-core/Embedder/runs/{section_name}/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    experiment_name = f'ConvAutoencoder-{config_str}-{time()}'
    model.experiment_name = experiment_name
    logger = pl.loggers.TensorBoardLogger(
        save_dir=save_dir,
        name=experiment_name,
        default_hp_metric=False,
    )

    # CREATE CHECKPOINTS DIR
    checkpoint_dir = f'checkpoints/{experiment_name}'
    os.makedirs(checkpoint_dir)

    # RUN TRAINER
    trainer = pl.Trainer(
        logger=logger,
        log_every_n_steps=1,
        max_epochs=10,
        val_check_interval=0.05,  # for dev
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                metrics={"loss": "val_loss"},
                on="validation_end"),
            TuneReportCheckpointCallback(
                metrics={"loss": "val_loss"},
                # TODO edit callback so that it saves history of checkpoints
                # and make PR to ray[tune]
                filename=f"{checkpoint_dir}/latest_checkpoint.ckpt",
                on="validation_end"),
            SampleReconstructionCallback(loader=val_loader)
        ])
    trainer.fit(
        model,
        train_dataloader=train_loader,
        val_dataloaders=val_loader)
    # Evaluate on the held-out test loader with the fitted trainer instance.
    test_result = trainer.test(
        model=model, test_dataloaders=test_loader, verbose=True)

def _tune(tune_param_config,
          vl_bert_config=None,
          pl_ckpt_path=None,
          checkpoint_dir=None,
          num_gpus=1):
    pickle.DEFAULT_PROTOCOL = 4
    with logger.catch(reraise=True):
        config = copy.deepcopy(vl_bert_config)
        # config.TRAIN.LR = lr
        # config.TRAIN.WD = weight_decay
        # config.TRAIN.BATCH_IMAGES = batch_size
        # config.TRAIN.END_EPOCH = max_epoch
        # config.TRAIN.WARMUP_FACTOR = warmup_factor
        # config.TRAIN.WARMUP_STEPS = warmup_steps
        logger.warning(os.path.abspath('.'))

        checkpoint = ModelCheckpoint(
            filepath=pl_ckpt_path,
            save_last=False,
            save_top_k=3,
            monitor='val_accuracy',
        )
        tune_report = TuneReportCheckpointCallback(
            {
                # "loss": "val_checkpoint_on",
                "mean_accuracy": "val_checkpoint_on"
            },
            on="validation_end")
        adhoc_logger = TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version=".")

        trainer = pl.Trainer(
            # limit_train_batches=0.1,
            # limit_val_batches=0.1,
            accumulate_grad_batches=config.TRAIN.GRAD_ACCUMULATE_STEPS,
            checkpoint_callback=None,
            callbacks=[tune_report],
            logger=adhoc_logger,
            default_root_dir=pl_ckpt_path,
            gpus=num_gpus,
            num_nodes=1,
            distributed_backend='dp',
            precision=16,
            max_epochs=config.TRAIN.END_EPOCH,
            resume_from_checkpoint=None,
        )
        # vl_bert = LitVLBERT(config)
        hateful_meme = LitHatefulMeme(config)

        if checkpoint_dir:
            # Currently, this leads to errors:
            # model = LightningMNISTClassifier.load_from_checkpoint(
            #     os.path.join(checkpoint, "checkpoint"))
            # Workaround:
            ckpt = pl_load(
                os.path.join(checkpoint_dir, "checkpoint"),
                map_location=lambda storage, loc: storage)
            vl_bert = LitVLBERT._load_model_state(ckpt, config)
            trainer.current_epoch = ckpt["epoch"]
        else:
            logger.info(config)
            vl_bert = LitVLBERT(config)

        trainer.fit(vl_bert, datamodule=hateful_meme)

def run(self, args: AttributeDict):
    """Run hyperparameter search using the `tune.schedulers.ASHAScheduler`

    Args:
        args (AttributeDict): Arguments

    Side-effects:
        Saves logs to `TUNE_LOGS_PATH / args.id`
    """
    try:
        from ray import tune
        from ray.tune.integration.pytorch_lightning import (
            TuneReportCheckpointCallback,
        )
    except ModuleNotFoundError as e:  # pragma: no cover
        logger.error(
            "To use hyperparameter search, first install Ray Tune via "
            "`pip install 'ray[tune]'` or `pip install 'ride[extras]'`"
        )
        raise e

    if not hasattr(args, "id"):
        args.id = "hparamsearch"

    module_config = (
        Configs.from_file(args.from_hparam_space_file)
        if args.from_hparam_space_file
        else self.Module.configs()
    ).tune_config()

    config = {
        **dict(args),
        **module_config,
        # pl.Trainer args:
        "gpus": args.gpus_per_trial,
        "logger": False,
        "accumulate_grad_batches": (
            (8 // args.gpus_per_trial) * args.accumulate_grad_batches
            if args.gpus_per_trial
            else args.accumulate_grad_batches
        ),
    }

    scheduler = tune.schedulers.ASHAScheduler(
        metric=f"val/{args.optimization_metric}",
        mode=self.Module.metrics()[args.optimization_metric].value,
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2,
    )
    metric_names = [f"val/{m}" for m in self.Module.metrics().keys()]
    reporter = tune.CLIReporter(
        metric_columns=[*metric_names, "training_iteration"],
    )
    tune_callbacks = [
        TuneReportCheckpointCallback(
            metrics=metric_names,
            filename="checkpoint",
            on="validation_end",
        )
    ]
    cpus_per_trial = max(
        1,
        (
            min(10 * args.gpus_per_trial, NUM_CPU - 10)
            if args.gpus_per_trial
            else min(10, NUM_CPU - 2)
        ),
    )
    analysis = tune.run(
        partial(
            Runner.static_train_and_val,
            self.Module,
            trainer_callbacks=tune_callbacks,
        ),
        name=args.id,
        local_dir=str(TUNE_LOGS_PATH),
        resources_per_trial={"cpu": cpus_per_trial, "gpu": args.gpus_per_trial},
        config=config,
        num_samples=args.trials,
        scheduler=scheduler,
        progress_reporter=reporter,
        raise_on_failed_trial=False,
    )
    best_hparams = analysis.get_best_config(
        metric=f"val/{args.optimization_metric}",
        mode=self.Module.metrics()[args.optimization_metric].value,
        scope="all",
    )
    # Select only model parameters
    if best_hparams:
        best_hparams = {
            k: best_hparams[k]
            for k in [
                *self.Module.configs().names,
                # Trainer parameters that influence model hparams:
                "accumulate_grad_batches",
                "batch_size",
                "gpus",
            ]
        }

    return best_hparams