Example #1
def trainable(config, name_fmt, envname, trainingconfig, evaluate_mean_n):
    # Resolve the per-trial directory (fall back to the CWD when not run under Tune)
    trial_dir_str = tune.get_trial_dir()
    trial_dir = Path(trial_dir_str) if trial_dir_str is not None else Path.cwd()
    adv_force = config["adv_force"]
    name = name_fmt.format(adv_force=adv_force)
    # Build and parse the command-line arguments, including the adversarial force
    cmd_args = [
        '--name', name, '--env', envname, '--log', '--trainingconfig',
        str(trainingconfig), '--root',
        str(trial_dir), '--monitor-dir',
        str(monitor_dir_name(envname, adv_force))
    ]
    cmd_args += ['--adv_force', str(adv_force)]
    args = parse_args(cmd_args)
    logging.info(f'Running {name=} with {args=}')

    def evaluate(prot, ts):
        # reward = get_mean_reward_last_n_steps(evaluate_mean_n, args.monitor_dir)
        # logging.info(f'{name} {reward=:.2f} {ts=}')
        # tune.report(reward=reward)
        robustness = eval_robustness(args, prot, envname, trainingconfig, name)
        logging.info(f'{name} {robustness=:.2f} {ts=}')
        tune.report(robustness=robustness)

    run(args, evaluate_fn=evaluate)
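
A function trainable like the one above is normally handed to tune.run, with its fixed arguments bound via tune.with_parameters. The sketch below is only illustrative: the name_fmt string, environment name, trainingconfig path, and the adv_force grid are assumptions, not values from the original code.

# Hypothetical launch sketch for the trainable above (all concrete values assumed).
from pathlib import Path
from ray import tune

analysis = tune.run(
    tune.with_parameters(trainable,
                         name_fmt='adv_force_{adv_force}',
                         envname='CartPole-v0',
                         trainingconfig=Path('trainingconfig.yaml'),
                         evaluate_mean_n=100),
    config={"adv_force": tune.grid_search([0.0, 0.5, 1.0])},
    metric="robustness",
    mode="max",
)
print(analysis.best_config)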
Example #2
def train_submodel_diff(config):
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LSTM
    from keras.layers import Embedding
    from keras.callbacks import ModelCheckpoint
    from keras.optimizers import Adam
    from ray.tune.integration.keras import TuneReporterCallback
    import utils.definition_network as dn
    import pandas as pd
    from ray import tune

    x_train, y_train, x_valid, y_valid, num_words, embedding_matrix = config[
        "exp_sets"].pp_data.load_data()

    # Embeddings are trainable for the random and non-static embedding modes
    trainable_emb = config["exp_sets"].pp_data.use_embedding in (
        dn.UseEmbedding.RAND, dn.UseEmbedding.NON_STATIC)

    layers_model = [
        Embedding(config["exp_sets"].pp_data.vocabulary_size,
                  config["exp_sets"].pp_data.embedding_size,
                  trainable=trainable_emb,
                  name=config["name"] + '_rt_emb_1')
    ]

    # Stacked LSTM layers; all but the last return full sequences
    for id_hl in range(config["hidden_layers"] - 1):
        layers_model.append(
            LSTM(config["lstm_units"],
                 kernel_initializer='lecun_uniform',
                 activation='tanh',
                 dropout=config["dropout_lstm"],
                 recurrent_dropout=config["dropout_lstm"],
                 return_sequences=True,
                 name=config["name"] + '_rt_lstm_' + str(id_hl)))

    layers_model.append(
        LSTM(config["lstm_units"],
             kernel_initializer='lecun_uniform',
             activation='tanh',
             dropout=config["dropout_lstm"],
             recurrent_dropout=config["dropout_lstm"],
             name=config["name"] + '_rt_lstm_' + str(config["hidden_layers"] - 1)))

    layers_model.append(
        Dense(3, activation='sigmoid', name=config["name"] + '_rt_dense_1'))

    model = Sequential(layers_model)
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(lr=config["lr"]),
                  metrics=["accuracy"])

    history = model.fit(x_train,
                        y_train,
                        batch_size=config["batch_size"],
                        epochs=config["epochs"],
                        verbose=0,
                        validation_data=(x_valid, y_valid),
                        callbacks=[
                            TuneReporterCallback(freq="epoch"),
                            ModelCheckpoint(tune.get_trial_dir() + 'train_model.h5',
                                            monitor='val_acc',
                                            mode='max',
                                            save_best_only=True,
                                            save_weights_only=False,
                                            verbose=0)
                        ])

    hist_df = pd.DataFrame(history.history)
    with open(tune.get_trial_dir() + 'history_train_model.csv', mode='w') as file:
        hist_df.to_csv(file)
Example #3
    def tune_train_once(config,
                        checkpoint_dir=None,
                        args: argparse.Namespace = None,
                        model_class: type = None,
                        build_method=None,
                        task_info: TaskInfo = None,
                        model_kwargs: dict = None,
                        resume: str = None,
                        **kwargs):
        if resume is None:
            resume = 'all'
        args_vars = vars(args)
        args_vars.update(config)

        pl.seed_everything(args.seed)
        logger = [
            loggers.CSVLogger(save_dir=tune.get_trial_dir(),
                              name="",
                              version="."),
            loggers.TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                      name="",
                                      version=".",
                                      default_hp_metric=False)
        ]
        trainer_args = dict(logger=logger,
                            progress_bar_refresh_rate=0,
                            callbacks=[
                                TuneReportCheckpointCallback(
                                    metrics={
                                        f'tune_{task_info.metric_name}':
                                        f'val_{task_info.metric_name}'
                                    },
                                    filename="tune.ckpt",
                                    on="validation_end")
                            ])
        if checkpoint_dir and resume == 'all':
            trainer_args['resume_from_checkpoint'] = os.path.join(
                checkpoint_dir, "tune.ckpt")

        # fix slurm trainer
        os.environ["SLURM_JOB_NAME"] = "bash"
        model = model_class(args, **model_kwargs)
        build_method(model, task_info)
        trainer: Trainer = Trainer.from_argparse_args(args, **trainer_args)
        if checkpoint_dir and resume == 'model':
            ckpt = pl_load(os.path.join(checkpoint_dir, "tune.ckpt"),
                           map_location=lambda storage, loc: storage)
            model = model._load_model_state(ckpt)
            trainer.current_epoch = ckpt["epoch"]
        trainer.fit(model)
Example #4
def train_tune(hparams, rdm):

    model = get_model(hparams)

    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                               name="",
                               version=".",
                               default_hp_metric=False)
    logger.log_hyperparams(
        hparams, {
            'train_acc': 0,
            'train_f1': 0,
            'train_loss': 0,
            'valid_acc': 0,
            'valid_f1': 0,
            'valid_loss': 0,
        })

    trainer = pl.Trainer(max_epochs=hparams['n_epochs'],
                         gpus=1,
                         logger=logger,
                         progress_bar_refresh_rate=0,
                         callbacks=[
                             TuneReportCallback(
                                 ['valid_acc', 'valid_f1', 'valid_loss'],
                                 on="validation_end")
                         ])
    trainer.fit(model, rdm)
Example #5
def tune_main(hparams, num_epochs=15, num_gpus=0):

    print(hparams)

    mean, std, traindir, valdir, num_classes = choose_dataset('cifar10')
    traindir = '/home/jovyan/work/cv_data/cifar10/train'
    valdir = '/home/jovyan/work/cv_data/cifar10/test'
    hparams['num_classes'] = num_classes

    train_logger.info('Training Directory: {0}'.format(traindir))

    model = LightningModel(hparams)

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        #distributed_backend=hparams.distributed_backend,
        precision=32,
        #early_stop_callback=early_stop_callback,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "val_loss_epoch",
                    "accuracy": "val_acc_epoch"
                },
                on="validation_end"
            )
        ])

    normal_pipe = BasicPipe(hparams, traindir, valdir, mean, std)
    trainer.fit(model, normal_pipe)
Example #6
def experiment(config):
    iterations = config.pop("train-iterations")
    train_agent = ppo.PPOTrainer(config=config, env="CartPole-v0")
    checkpoint = None
    train_results = {}

    # Train
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPOTrainer(config=config, env="CartPole-v0")
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    while not done:
        action = eval_agent.compute_action(obs)
        next_obs, reward, done, info = env.step(action)
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
    results = {**train_results, **eval_results}
    tune.report(**results)
Example #7
def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                num_epochs=10,
                                num_gpus=0,
                                data_dir="~/data"):
    data_dir = os.path.expanduser(data_dir)
    kwargs = {
        "max_epochs": num_epochs,
        # If fractional GPUs passed in, convert to int.
        "gpus": math.ceil(num_gpus),
        "logger": TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        "progress_bar_refresh_rate": 0,
        "callbacks": [
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ]
    }

    if checkpoint_dir:
        kwargs["resume_from_checkpoint"] = os.path.join(
            checkpoint_dir, "checkpoint")

    model = LightningMNISTClassifier(config=config, data_dir=data_dir)
    trainer = pl.Trainer(**kwargs)

    trainer.fit(model)
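
A checkpointing trainable like train_mnist_tune_checkpoint above is typically driven by tune.run together with a scheduler. The sketch below is an assumption-laden illustration: the search space, the PopulationBasedTraining settings, and the resource values are not taken from the original example.

# Hypothetical driver sketch for the checkpointing trainable above.
from functools import partial
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=4,
    hyperparam_mutations={"lr": tune.loguniform(1e-4, 1e-1)},
)

analysis = tune.run(
    partial(train_mnist_tune_checkpoint,
            num_epochs=10, num_gpus=0, data_dir="~/data"),
    config={"lr": 1e-3, "batch_size": 64},
    resources_per_trial={"cpu": 1, "gpu": 0},
    num_samples=4,
    scheduler=scheduler,
    metric="loss",
    mode="min",
    name="tune_mnist_pbt",
)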
Example #8
def train_mnist_tune_checkpoint(
    config,
    checkpoint_dir=None,
    data_dir=None,
    num_epochs=10,
    num_gpus=0):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[CheckpointCallback(),
                   TuneReportCallback()])
    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LightningMNISTClassifier._load_model_state(ckpt, config=config)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = LightningMNISTClassifier(
            config=config, data_dir=data_dir)

    trainer.fit(model)
Example #9
        def report(progress_tracker):
            # The progress tracker's metrics are nested dictionaries of TrainerMetrics: feature_name -> metric_name ->
            # List[TrainerMetric], with one entry per training checkpoint, according to steps_per_checkpoint.
            # We reduce the dictionary of TrainerMetrics to a simple list of floats for interfacing with Ray Tune.
            train_stats = {
                TRAINING:
                metric_utils.reduce_trainer_metrics_dict(
                    progress_tracker.train_metrics),
                VALIDATION:
                metric_utils.reduce_trainer_metrics_dict(
                    progress_tracker.validation_metrics),
                TEST:
                metric_utils.reduce_trainer_metrics_dict(
                    progress_tracker.test_metrics),
            }

            metric_score = tune_executor.get_metric_score(train_stats)
            tune.report(
                parameters=json.dumps(config, cls=NumpyEncoder),
                metric_score=metric_score,
                training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                eval_stats="{}",
                trial_id=tune.get_trial_id(),
                trial_dir=tune.get_trial_dir(),
            )
Example #10
            def on_epoch_end(self, trainer, progress_tracker, save_path):
                with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                    checkpoint_model = os.path.join(checkpoint_dir, 'model')
                    # shutil.copytree(save_path, checkpoint_model)
                    # Note: A previous implementation used shutil.copytree()
                    # however, this copying method is non atomic
                    if not os.path.isdir(checkpoint_model):
                        copy_id = uuid.uuid4()
                        tmp_dst = "%s.%s.tmp" % (checkpoint_model, copy_id)
                        shutil.copytree(save_path, tmp_dst)
                        try:
                            os.rename(tmp_dst, checkpoint_model)
                        except Exception:
                            shutil.rmtree(tmp_dst)

                train_stats = {
                    TRAINING: progress_tracker.train_metrics,
                    VALIDATION: progress_tracker.vali_metrics,
                    TEST: progress_tracker.test_metrics,
                }

                metric_score = tune_executor.get_metric_score(
                    train_stats, eval_stats=None)
                tune.report(
                    parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(
                        train_stats[TRAINING], cls=NumpyEncoder),
                    eval_stats=json.dumps(
                        train_stats[VALIDATION], cls=NumpyEncoder),
                    trial_id=tune.get_trial_id(),
                    trial_dir=tune.get_trial_dir()
                )
Example #11
def train_t(config):
    seed = config.pop('seed')
    static_params = config.pop('static_params')

    torch.backends.cudnn.enabled = True
    if static_params['t_id'] == 0:
        torch.backends.cudnn.deterministic = True
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
    else:
        torch.backends.cudnn.deterministic = False

    if 'PSSN' in tune.get_trial_name() or static_params['t_id'] == 0:
        torch.backends.cudnn.benchmark = False
    else:
        torch.backends.cudnn.benchmark = True

    if 'learner' in config:
        learner = config.pop('learner')
    else:
        learner_path = config.pop('learner_path')
        learner = torch.load(learner_path)

    rescaled, t, metrics, b_state_dict, stats = train_single_task(config=config, learner=learner, **static_params)

    learner_save_path = os.path.join(tune.get_trial_dir(), 'learner.pth')
    # raise ValueError(learner_save_path)
    torch.save(learner, learner_save_path)
Example #12
    def train_tune(config, epochs, resources, checkpoint_dir=None):
        # viz logger
        logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                   name=model_name)

        # metric reporter + checkpoint callback
        callback = TuneReportCheckpointCallback(
            metrics=pbt_config['metrics_to_report'])

        # search trainer object
        trainer = pl.Trainer(
            max_epochs=epochs,
            gpus=resources['gpu'],
            logger=logger,
            callbacks=[callback],
            progress_bar_refresh_rate=50,
            precision=16,
        )

        # checkpointing system
        if checkpoint_dir:
            model = network.load_from_checkpoint(
                os.path.join(checkpoint_dir, 'checkpoint'))
        else:
            model = network(config)

        # fits model/data module with current hyperparameter set
        data_module = dm(config)
        trainer.fit(model, datamodule=data_module)
Example #13
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[TuneReportCallback()])

    trainer.fit(model)
Example #14
    def _train_fn(self,
                  config: Dict,
                  checkpoint_dir=None,
                  fast_dev_run=False,
                  include_gpus=False):
        utils.hprint('Starting train function with config:')
        utils.print_dict(config)
        print()

        utils.set_pandas_disp(width=200)

        hp = self._model_param_class.from_dict(config)
        assert isinstance(hp, self._model_param_class)
        print('  hp:', hp)

        if checkpoint_dir:
            # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
            raise NotImplementedError(
                f"Got checkpoint_dir in trian_fn: {checkpoint_dir}")

        utils.hprint("About to create net in TuneRunner")
        net = hp.build()
        # import torch.autograd.profiler as profiler
        # with profiler.profile(record_shapes=True, use_cuda=True, profile_memory=True) as prof:
        #     net = self._factored_lightning_module_class.from_hp(hp=hp)
        # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=1000))

        utils.set_seeds(hp.data.seed)

        # noinspection PyTypeChecker
        trainer = pl.Trainer(
            logger=logs_mod.get_pl_logger(hp=hp.exp, tune=tune),
            default_root_dir=tune.get_trial_dir(),
            callbacks=self.extra_pl_callbacks +
            self.get_pl_callbacks_for_tune(),
            max_epochs=hp.opt.num_epochs,
            gpus=hp.data.num_gpus if include_gpus else None,
            weights_summary='full',
            fast_dev_run=fast_dev_run,
            accumulate_grad_batches=1,
            profiler='simple',
            deterministic=True,
            log_every_n_steps=hp.logs.num_steps_per_metric_log,
            log_gpu_memory=hp.logs.log_gpu_memory,
        )
        utils.hprint('About to start tune_runner\'s trainer.fit...')
        fit_out = trainer.fit(net, datamodule=net.dm)
        utils.hprint('Done with tune_runner._train_fn')

        return fit_out
Example #15
def clip_fine_tune(
    config,
    num_epochs,
    num_gpus,
    dataset: pa.Table,
    init_config: CLIPConfig,
    init_state_dict: dict,
    processor: CLIPProcessor,
):
    if "SLURM_NTASKS" in os.environ:
        del os.environ["SLURM_NTASKS"]

    if "SLURM_JOB_NAME" in os.environ:
        del os.environ["SLURM_JOB_NAME"]

    bird_dataset = dataset
    data_mod = MultiModalDataModule(
        dataset=bird_dataset,
        processor=processor,
        test_size=config["test_size"],
        batch_size=config["batch_size"],
        val_batch_size=config["val_batch_size"],
        num_workers=config["num_workers"],
    )

    clip_model = CLIPModel(init_config)
    clip_model.load_state_dict(init_state_dict)
    model = CLIPFineTunedModel(clip_model, **config)

    tune_cbs = [
        TuneReportCheckpointCallback(["val_loss"], on="validation_end")
    ]
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                               name="",
                               version=".")

    trainer = pl.Trainer(
        logger=logger,
        num_sanity_val_steps=0,
        max_epochs=num_epochs,
        gpus=math.ceil(num_gpus),
        progress_bar_refresh_rate=0,
        log_every_n_steps=1,
        callbacks=[LearningRateMonitor(logging_interval="step")] + tune_cbs,
    )

    trainer.validate(model, data_mod)
    trainer.fit(model, data_mod)
    return trainer
Example #16
def train_transformer(config, checkpoint_dir=None):
    data_args = DataTrainingArguments(task_name=config["task_name"],
                                      data_dir=config["data_dir"])
    tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
    train_dataset = GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="train",
                                cache_dir=config["data_dir"])
    eval_dataset = GlueDataset(data_args,
                               tokenizer=tokenizer,
                               mode="dev",
                               cache_dir=config["data_dir"])
    eval_dataset = eval_dataset[:len(eval_dataset) // 2]
    training_args = TrainingArguments(
        output_dir=tune.get_trial_dir(),
        learning_rate=config["learning_rate"],
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=len(train_dataset) // config["per_gpu_train_batch_size"] + 1,
        # We explicitly set save to 0, and do saving in evaluate instead
        save_steps=0,
        num_train_epochs=config["num_epochs"],
        max_steps=config["max_steps"],
        per_device_train_batch_size=config["per_gpu_train_batch_size"],
        per_device_eval_batch_size=config["per_gpu_val_batch_size"],
        warmup_steps=0,
        weight_decay=config["weight_decay"],
        logging_dir="./logs",
    )

    # Arguments for W&B.
    name = tune.get_trial_name()
    wandb_args = {
        "project_name": "transformers_pbt",
        "watch": "false",  # Either set to gradient, false, or all
        "run_name": name,
    }

    tune_trainer = get_trainer(recover_checkpoint(checkpoint_dir,
                                                  config["model_name"]),
                               train_dataset,
                               eval_dataset,
                               config["task_name"],
                               training_args,
                               wandb_args=wandb_args)
    tune_trainer.train(recover_checkpoint(checkpoint_dir,
                                          config["model_name"]))
Example #17
def experiment(config):
    # global unused_shared_step
    # global unused_own_step
    # global unsatisfied_shared_step
    # global unsatisfied_own_step

    iterations = 2
    train_agent = ppo.PPOTrainer(config=config, env=ContentCaching)  # or env="ContentCaching-v0"
    checkpoint = None
    train_results = {}

    # Train
    #iterations = 20
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPOTrainer(config=config, env=ContentCaching)  # or env="ContentCaching-v0"
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    while not done:
        action = eval_agent.compute_action(obs)
        next_obs, reward, done, info = env.step(action)

        # unused_shared_step.append(info["unused_shared"])
        # unused_own_step.append(info["unused_own"])
        # unsatisfied_shared_step.append(info["unsatisfied_shared"])
        global unsatisfied_own_step
        unsatisfied_own_step = 99  # .append(info["unused_own"])

        print("info['unused_shared'] =", info["unused_shared"])

        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
    results = {**train_results, **eval_results}
    tune.report(**results)
Example #18
        def report(progress_tracker):
            train_stats = {
                TRAINING: progress_tracker.train_metrics,
                VALIDATION: progress_tracker.vali_metrics,
                TEST: progress_tracker.test_metrics,
            }

            metric_score = tune_executor.get_metric_score(train_stats)
            tune.report(
                parameters=json.dumps(config, cls=NumpyEncoder),
                metric_score=metric_score,
                training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                eval_stats="{}",
                trial_id=tune.get_trial_id(),
                trial_dir=tune.get_trial_dir(),
            )
Example #19
    def _setup_wandb(self):
        if self.is_world_master() and self.wandb_args is not None:
            wandb.init(project=self.wandb_args["project_name"],
                       name=self.wandb_args["run_name"],
                       id=self.wandb_args["run_name"],
                       dir=tune.get_trial_dir(),
                       config=vars(self.args),
                       reinit=True,
                       allow_val_change=True,
                       resume=self.wandb_args["run_name"])
            # keep track of model topology and gradients, unsupported on TPU
            if (not is_torch_tpu_available()
                    and self.wandb_args["watch"] != "false"):
                wandb.watch(self.model,
                            log=self.wandb_args["watch"],
                            log_freq=max(100, self.args.logging_steps))
Example #20
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                on="validation_end")
        ])
    trainer.fit(model)
Example #21
def trainWithTune(config,
                  checkpoint_dir=None,
                  datamodule=None,
                  num_epochs=10,
                  num_gpus=0):
    trainer = Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                 name="",
                                 version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(metrics={
                "loss": "val_loss",
                "mean_accuracy": "val_acc",
                "mean_iou": "val_iou",
            },
                                         filename="checkpoint",
                                         on="validation_end")
        ])

    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(os.path.join(checkpoint_dir, "checkpoint"),
                       map_location=lambda storage, loc: storage)
        model = MMETrainingModule._load_model_state(
            ckpt,
            lr=10**config['log_lr'],
            lrRatio=10**config['log_lrRatio'],
            decay=10**config['log_decay'],
            num_cls=NUM_CLS)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = MMETrainingModule(lr=10**config['log_lr'],
                                  lrRatio=10**config['log_lrRatio'],
                                  decay=10**config['log_decay'],
                                  num_cls=NUM_CLS)

    trainer.fit(model, datamodule=datamodule)
Example #22
    def _train_fn(self,
                  config: Dict,
                  checkpoint_dir=None,
                  fast_dev_run=False,
                  include_gpus=False):
        utils.hprint('Starting train function with config:')
        utils.print_dict(config)

        del config['tune']
        hp = self._param_class.from_dict(config)
        assert isinstance(hp, self._param_class)

        if checkpoint_dir:
            # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
            raise NotImplementedError(
                f"Got checkpoint_dir in trian_fn: {checkpoint_dir}")

        net = self._factored_lightning_module_class.from_hp(hp=hp)

        utils.set_seeds(hp.data.seed)

        # noinspection PyTypeChecker
        trainer = pl.Trainer(
            logger=torch_mod.get_pl_logger(hp=hp.exp,
                                           tune=tune,
                                           offline_mode=fast_dev_run),
            default_root_dir=tune.get_trial_dir(),
            callbacks=self.extra_pl_callbacks + self.get_tune_callbacks(),
            max_epochs=hp.opt.num_epochs,
            gpus=hp.data.num_gpus if include_gpus else None,
            weights_summary='full',
            fast_dev_run=fast_dev_run,
            accumulate_grad_batches=1,
            profiler='simple',
            deterministic=True,
            log_every_n_steps=hp.metrics.num_steps_per_metric_log,
        )
        fit_out = trainer.fit(net, datamodule=net.dm)

        utils.print_dict(config)
        utils.hprint('Done with tune_runner._train_fn')

        return fit_out
Example #23
def train_mnist_tune(tuning_config, data_dir=None, num_epochs=10, num_gpus=0):
    # Only Training
    model = LightningMNISTClassifier(tuning_config, data_dir)

    # ===============================================================================
    # Callback
    # ===============================================================================
    from pytorch_lightning.callbacks import ModelCheckpoint
    from pytorch_lightning.callbacks import EarlyStopping

    early_stop_cb = EarlyStopping(monitor='ptl/val_loss',
                                  patience=5,
                                  verbose=True,
                                  mode='min')

    ckpt_cb = ModelCheckpoint(tune.get_trial_dir() + '/checkpoints',
                              save_top_k=5,
                              verbose=True,
                              monitor='ptl/val_loss',
                              mode='min',
                              save_last=True,
                              filename='model_{epoch:03d}-{step}')

    tune_rp_cb = TuneReportCallback(
        {
            "val_loss": "ptl/val_loss",
            "val_accuracy": "ptl/val_accuracy"
        },
        on="validation_end")

    # ===============================================================================
    # Trainer
    # Note: Must use the default logger, with _default_hp_metric set to False (see below)
    # ===============================================================================
    trainer = pl.Trainer(
        progress_bar_refresh_rate=0,  # 0 means no print progress
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        callbacks=[ckpt_cb, tune_rp_cb, early_stop_cb])
    trainer.logger._default_hp_metric = False  # hp_metric must be False
    trainer.fit(model)
Example #24
def worker_function(inner_ex_config, config):
    """
    Combines experiment config and auto-generated Ray config, and runs an iteration of
    inner_ex on that combined config.

    :param inner_ex_config: The current values of inner experiment config, including
    any modifications we might have made in a macro_experiment config update
    :param config: Config generated by Ray tune
    :return:
    """
    from inner_experiment import inner_ex
    # Something that runs inner_ex by combining "base" config and ray experiment config
    inner_ex_dict = dict(inner_ex_config)
    merged_config = update(inner_ex_dict, config)

    # This will create an observer in the Tune trial directory, meaning that
    # inner experiment configs will be saved at <trial.log_dir>/1
    observer = FileStorageObserver.create(tune.get_trial_dir())
    inner_ex.observers.append(observer)
    ret_val = inner_ex.run(config_updates=merged_config)
    tune.report(accuracy=ret_val.result)
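
The update helper used by worker_function is not shown in the snippet. A common implementation, assumed here rather than taken from the original repository, is a recursive dictionary merge:

# Hypothetical recursive merge matching how `update` is used above.
import collections.abc

def update(base: dict, overrides: dict) -> dict:
    """Recursively merge `overrides` into `base` and return `base`."""
    for key, value in overrides.items():
        if isinstance(value, collections.abc.Mapping):
            base[key] = update(base.get(key, {}), value)
        else:
            base[key] = value
    return base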
Example #25
def train(config, batch_size, num_epochs=20, num_gpus=0):
    training = dl.loader(55000, batch_size, 0)
    validation = dl.loader(8250, 1, 55000)
    cae = ContractiveAutoEncoder(training_dataloader=training,
                                 val_dataloader=validation,
                                 config=config)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        auto_select_gpus=True if num_gpus else False,
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                 name="",
                                 version='.'),
        stochastic_weight_avg=True,
        benchmark=True,
        callbacks=[
            TuneReportCheckpointCallback({"loss": "val_loss"},
                                         filename="checkpoint",
                                         on="validation_end")
        ])

    trainer.fit(cae)
Example #26
def train_libmultilabel_tune(config, datasets, classes, word_dict):
    """The training function for ray tune.

    Args:
        config (AttributeDict): Config of the experiment.
        datasets (dict): A dictionary of datasets.
        classes(list): List of class names.
        word_dict(torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
    """
    set_seed(seed=config.seed)
    config.run_name = tune.get_trial_dir()
    logging.info(f'Run name: {config.run_name}')
    config.checkpoint_dir = os.path.join(config.result_dir, config.run_name)
    config.log_path = os.path.join(config.checkpoint_dir, 'logs.json')

    trainer = TorchTrainer(config=config,
                           datasets=datasets,
                           classes=classes,
                           word_dict=word_dict,
                           search_params=True,
                           save_checkpoints=False)
    trainer.train()
Example #27
def run_parameterised_experiment(config):
    # Hyperparameters
    trial_dir = tune.get_trial_dir()
    problem, method, other_config = config["main_params"]
    n_workers = config["n_workers"]

    experiment = CartpoleExperiment()
    experiment.nn_path = other_config[
        "folder"]  # nn_paths_cartpole[other_config["nn_path"]]
    experiment.tau = other_config["tau"]
    if other_config["template"] == 2:  # octagon
        experiment.analysis_template = Experiment.octagon(
            experiment.env_input_size)
    elif other_config["template"] == 0:  # box
        experiment.analysis_template = Experiment.box(
            experiment.env_input_size)
    else:
        _, template = experiment.get_template(1)
        experiment.analysis_template = template  # standard
    experiment.n_workers = n_workers
    experiment.show_progressbar = False
    experiment.show_progress_plot = False
    # experiment.use_rounding = False
    experiment.save_dir = trial_dir
    experiment.update_progress_fn = update_progress
    elapsed_seconds, safe, max_t = experiment.run_experiment()

    safe_value = 0
    if safe is None:
        safe_value = 0
    elif safe:
        safe_value = 1
    elif not safe:
        safe_value = -1
    tune.report(elapsed_seconds=elapsed_seconds,
                safe=safe_value,
                max_t=max_t,
                done=True)
Example #28
def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                data_dir=None,
                                num_epochs=10,
                                num_gpus=0):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ])
    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LightningMNISTClassifier._load_model_state(
            ckpt, config=config, data_dir=data_dir)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = LightningMNISTClassifier(config=config, data_dir=data_dir)

    trainer.fit(model)
Example #29
    def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx, is_using_ray_backend=False):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)
        # Some config values may be JSON encoded as strings, so decode them here
        config = RayTuneSampler.decode_values(config, decode_ctx)

        trial_id = tune.get_trial_id()
        modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

        trial_dir = Path(tune.get_trial_dir())
        trial_location = ray.util.get_node_ip_address()

        hyperopt_dict["config"] = modified_config
        hyperopt_dict["experiment_name "] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
        hyperopt_dict["output_directory"] = str(trial_dir)

        tune_executor = self
        if is_using_ray_backend:
            ray_queue = RayQueue(actor_options={"num_cpus": 0})
        else:
            ray_queue = None

        def checkpoint(progress_tracker, save_path):
            with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                checkpoint_model = os.path.join(checkpoint_dir, "model")
                # shutil.copytree(save_path, checkpoint_model)
                # Note: A previous implementation used shutil.copytree()
                # however, this copying method is non atomic
                if not os.path.isdir(checkpoint_model):
                    copy_id = uuid.uuid4()
                    tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                    assert os.path.exists(save_path)
                    shutil.copytree(save_path, tmp_dst)
                    try:
                        os.rename(tmp_dst, checkpoint_model)
                    except Exception:
                        shutil.rmtree(tmp_dst)

        def report(progress_tracker):
            train_stats = {
                TRAINING: progress_tracker.train_metrics,
                VALIDATION: progress_tracker.vali_metrics,
                TEST: progress_tracker.test_metrics,
            }

            metric_score = tune_executor.get_metric_score(train_stats)
            tune.report(
                parameters=json.dumps(config, cls=NumpyEncoder),
                metric_score=metric_score,
                training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                eval_stats="{}",
                trial_id=tune.get_trial_id(),
                trial_dir=tune.get_trial_dir(),
            )

        class RayTuneReportCallback(Callback):
            def _get_sync_client_and_remote_checkpoint_dir(self) -> Optional[Tuple["CommandBasedClient", str]]:
                # sync client has to be recreated to avoid issues with serialization
                return tune_executor._get_sync_client_and_remote_checkpoint_dir(trial_dir)

            def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
                if is_using_ray_backend and checkpoint_dir and trial_location != ray.util.get_node_ip_address():
                    save_path = Path(save_path)

                    for path in trial_dir.glob("checkpoint*"):
                        if path not in (save_path.parent, checkpoint_dir):
                            shutil.rmtree(path, ignore_errors=True)

                    sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                    if sync_info is not None:
                        sync_client, remote_checkpoint_dir = sync_info
                        sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                        sync_client.wait()

            def on_epoch_end(self, trainer, progress_tracker, save_path):
                if is_using_ray_backend:
                    save_path = Path(save_path)
                    if trial_location != ray.util.get_node_ip_address():
                        sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                        if sync_info is not None:
                            sync_client, remote_checkpoint_dir = sync_info
                            sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                            sync_client.wait()
                    ray_queue.put((progress_tracker, str(save_path)))
                    return

                checkpoint(progress_tracker, save_path)
                report(progress_tracker)

        callbacks = hyperopt_dict.get("callbacks") or []
        hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

        # set tune resources
        if is_using_ray_backend:
            resources = tune.get_trial_resources()
            # check if we are using at least 1 gpu per trial
            use_gpu = bool(self._gpu_resources_per_trial_non_none)
            # get the resources assigned to the current trial
            current_resources = resources.required_resources["GPU" if use_gpu else "CPU"]

            hvd_kwargs = {
                "num_workers": int(current_resources),
                "use_gpu": use_gpu,
            }
            hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

            logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

        stats = []

        def _run():
            train_stats, eval_stats = run_experiment(
                **hyperopt_dict,
                model_resume_path=checkpoint_dir,
                parameters=config,
            )
            stats.append((train_stats, eval_stats))

        sync_info = self._get_sync_client_and_remote_checkpoint_dir(trial_dir)
        if is_using_ray_backend and sync_info is not None:
            # We have to pull the results to the trial actor
            # from worker actors, as the Tune session is running
            # only on the trial actor
            thread = threading.Thread(target=_run)
            thread.daemon = True
            thread.start()

            sync_client, remote_checkpoint_dir = sync_info

            def check_queue():
                qsize = ray_queue.qsize()
                if qsize:
                    results = ray_queue.get_nowait_batch(qsize)
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait()
                    for progress_tracker, save_path in results:
                        checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                        report(progress_tracker)

            while thread.is_alive():
                thread.join(timeout=0)
                check_queue()
                time.sleep(0.1)
            thread.join()
            check_queue()
        else:
            # remove threading overhead
            _run()

        if not stats:
            raise RuntimeError("Experiment did not complete.")
        train_stats, eval_stats = stats.pop()

        metric_score = self.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )
Example #30
def train_submodel_diff(config):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.layers import Conv1D, MaxPooling1D, LSTM
    from keras.layers import Embedding
    from keras.callbacks import ModelCheckpoint
    from keras.optimizers import adadelta
    from ray.tune.integration.keras import TuneReporterCallback
    import utils.definition_network as dn
    import pandas as pd
    from ray import tune

    x_train, y_train, x_valid, y_valid, num_words, embedding_matrix = config[
        "exp_sets"].pp_data.load_data()

    trainable_emb = config["exp_sets"].pp_data.use_embedding in (
        dn.UseEmbedding.RAND, dn.UseEmbedding.NON_STATIC)

    model = Sequential([
        Embedding(config["exp_sets"].pp_data.vocabulary_size,
                  config["exp_sets"].pp_data.embedding_size,
                  trainable=trainable_emb,
                  name=config["name"] + '_rt_emb_1'),
        Dropout(config["dropout"], name=config["name"] + '_rt_dropout_1'),
        Conv1D(filters=config["filters_by_layer"],
               kernel_size=config["kernels_size"],
               kernel_initializer='glorot_uniform',
               padding='valid',
               activation='relu',
               name=config["name"] + '_rt_conv_1'),
        MaxPooling1D(name=config["name"] + '_rt_max_pool_1'),
        LSTM(config["lstm_units"],
             kernel_initializer='glorot_uniform',
             activation='tanh',
             dropout=config["dropout_lstm"],
             recurrent_dropout=config["dropout_lstm"],
             return_sequences=True,
             name=config["name"] + '_rt_lstm_1'),
        LSTM(config["lstm_units"],
             kernel_initializer='glorot_uniform',
             activation='tanh',
             dropout=config["dropout_lstm"],
             recurrent_dropout=config["dropout_lstm"],
             return_sequences=True,
             name=config["name"] + '_rt_lstm_2'),
        LSTM(config["lstm_units"],
             kernel_initializer='glorot_uniform',
             activation='tanh',
             dropout=config["dropout_lstm"],
             recurrent_dropout=config["dropout_lstm"],
             name=config["name"] + '_rt_lstm_3'),
        Dense(3, activation='sigmoid', name=config["name"] + '_rt_dense_1')
    ])

    model.compile(loss="binary_crossentropy",
                  optimizer=adadelta(lr=config["lr"]),
                  metrics=["accuracy"])

    history = model.fit(x_train,
                        y_train,
                        batch_size=config["batch_size"],
                        epochs=config["epochs"],
                        verbose=0,
                        validation_data=(x_valid, y_valid),
                        callbacks=[
                            TuneReporterCallback(freq="epoch"),
                            ModelCheckpoint(tune.get_trial_dir() +
                                            'train_model.h5',
                                            monitor='val_acc',
                                            mode='max',
                                            save_best_only=True,
                                            save_weights_only=False,
                                            verbose=0)
                        ])

    hist_df = pd.DataFrame(history.history)
    with open(tune.get_trial_dir() + 'history_train_model.csv',
              mode='w') as file:
        hist_df.to_csv(file)