def tune_mnist_mxnet(num_samples=10, num_epochs=10):
    logger.info("Downloading MNIST data...")
    mnist_data = mx.test_utils.get_mnist()
    logger.info("Got MNIST data, starting Ray Tune.")

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-3, 1e-1),
        "batch_size": tune.choice([32, 64, 128])
    }

    scheduler = ASHAScheduler(
        metric="mean_accuracy",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(
        partial(train_mnist_mxnet, mnist=mnist_data, num_epochs=num_epochs),
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_mxnet")
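# The tuning functions in this collection assume a trainable shaped roughly like the
# sketch below: it receives the sampled config (plus any keyword arguments bound via
# functools.partial or tune.with_parameters) and reports the metrics that CLIReporter
# lists in metric_columns. This is an illustrative sketch, not code from the original
# project; the body and metric values are placeholders.
from ray import tune  # assumed import, matching the surrounding examples

def train_mnist_mxnet_sketch(config, mnist=None, num_epochs=10):
    for epoch in range(num_epochs):
        # build/fit the model using config["lr"], config["batch_size"], ...
        val_loss, val_acc = 0.0, 0.0  # placeholder validation metrics
        # keys reported here become columns in the CLIReporter table;
        # "training_iteration" is added automatically by Tune.
        tune.report(loss=val_loss, mean_accuracy=val_acc)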
def grid_search(hparams):
    scheduler = ASHAScheduler(
        max_t=hparams['n_epochs'],
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=hparams['param_cols'],
        metric_columns=['valid_acc', 'valid_f1', 'valid_loss'])
    rdm = RetinalDataModule()

    analysis = tune.run(
        tune.with_parameters(train_tune, rdm=rdm),
        resources_per_trial={"cpu": 1, "gpu": 1},
        metric="valid_loss",
        mode="min",
        config=hparams,
        local_dir=Path(hparams['output_dir'], 'ray_tune'),
        num_samples=5,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=f"tune_{hparams['model']}_DRIVE")

    print("Best hyperparameters found were: ", analysis.best_config)
def tune_mnist_pbt():
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": 1e-3,
        "batch_size": 64,
        "data_dir": data_dir
    }

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="loss",
        mode="min",
        perturbation_interval=4,
        hyperparam_mutations={
            "lr": lambda: tune.loguniform(1e-4, 1e-1).func(None),
            "batch_size": [32, 64, 128]
        })

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(
        train_mnist_tune_checkpoint,
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter)

    shutil.rmtree(data_dir)
def tune_mnist_asha():
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
        "data_dir": data_dir
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=10,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(
        train_mnist_tune,
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter)

    shutil.rmtree(data_dir)
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)

    # Configure the search space (random search).
    config = {
        "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }

    # ASHAScheduler terminates poorly performing trials early.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    # Rebuild the model with the best trial's hyperparameters.
    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    # Load the trained weights.
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
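# Sketch only (an assumption, not from the original): main() above restores the best
# model from best_trial.checkpoint.value, which relies on train_cifar saving a
# checkpoint each epoch through Tune's checkpoint API and reporting "loss" and
# "accuracy". A per-epoch ending like the following would satisfy that contract;
# the function and argument names here are illustrative.
def _train_cifar_epoch_end_sketch(net, optimizer, epoch, val_loss, val_acc):
    with tune.checkpoint_dir(epoch) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save((net.state_dict(), optimizer.state_dict()), path)
    tune.report(loss=val_loss, accuracy=val_acc)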
def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0, data_dir="~/data"):
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    train_fn_with_parameters = tune.with_parameters(
        train_mnist_tune,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
        data_dir=data_dir)
    resources_per_trial = {"cpu": 1, "gpu": gpus_per_trial}

    analysis = tune.run(
        train_fn_with_parameters,
        resources_per_trial=resources_per_trial,
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)

    # add parameters to tune using grid or random search
    config["lr"] = tune.loguniform(0.0001, 0.01)

    # define scheduler
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            "lr": tune.loguniform(0.000001, 0.01),
        },
    )

    # ray init
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )

    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "act_lr": "act_lr",
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )

    # define run parameters
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={"cpu": 8, "gpu": 1},
        config=config,
        stop={"epochs": 12},
        # time_budget_s=28200,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=8,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(metric="val_forces_mae", mode="min", scope="last"),
    )
def main(loss_function="L1", num_samples=25, max_num_epochs=25, gpus_per_trial=1, cpus_per_trial=10): experiment_name = loss_function + "_shuffle_validation" save_dir = '/data/results/vcpujol/transformers/single_deployment/predict_maxmax/pytorch_transformer/' config = { "lr": tune.loguniform(1e-4, 5e-1), "lr_step": tune.randint(1,10), "gamma": tune.loguniform(0.85,0.9999), "epochs": tune.choice([5, 10, 15, 20, 25]), "n_heads": tune.randint(2,10), "dim_val": tune.choice([2,4,6]), # FIXME requires numero parell... "dim_att": tune.randint(2,12), "encoder_layers": tune.randint(1,7), "decoder_layers": tune.randint(1,7), "batch_size": tune.randint(1,10), "input_feat_enc": tune.choice([94]), "input_feat_dec": tune.choice([1]), "seq_len": tune.choice([16, 32, 64, 96, 128, 180, 220, 256, 312, 350, 420, 470, 512]), #[16, 32, 64, 128, 256, 512, 1024, 2048] "prediction_step": tune.choice([1]) } scheduler = ASHAScheduler( metric="loss", mode="min", max_t=max_num_epochs, grace_period=4, reduction_factor=2) reporter = CLIReporter( parameter_columns=["lr", "lr_step", "gamma", "epochs", "n_heads", "dim_val", "dim_att", "encoder_layers", "decoder_layers", "batch_size", "seq_len"], metric_columns=["loss", "training_iteration"]) result = tune.run( partial(transformer_train, save_dir=save_dir, loss_function=loss_function), resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial}, config=config, num_samples=num_samples, scheduler=scheduler, progress_reporter=reporter, local_dir=save_dir, name=experiment_name) best_trial = result.get_best_trial("loss", "min", "last") print("Best trial config: {}".format(best_trial.config)) print("Best trial final validation loss: {}".format(best_trial.last_result["loss"])) # print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"])) best_trained_model = Transformer(best_trial.config["dim_val"], best_trial.config["dim_att"], best_trial.config["input_feat_enc"], best_trial.config["input_feat_dec"], best_trial.config["seq_len"], best_trial.config["decoder_layers"], best_trial.config["encoder_layers"], best_trial.config["n_heads"]) best_checkpoint_dir = best_trial.checkpoint.value model_state, optimizer_state = torch.load(os.path.join( best_checkpoint_dir, "checkpoint")) best_trained_model.load_state_dict(model_state) local_dir = save_dir exp_name = experiment_name test_acc = test_transformer(best_trained_model, best_trial.config, local_dir, exp_name, loss_function) print("Best trial test set accuracy: {}".format(test_acc))
def main_tune(base_args):
    # ray.init(log_to_driver=False)
    tune_config = {
        "learning_rate": tune.loguniform(5e-6, 1e-3),
        "weight_decay": tune.choice([0.0, 1e-3, 1e-2, 0.1]),
        "batch_size": tune.choice([16, 32, 64, 128]),
        "latent_dim": tune.choice([2, 3, 8, 16, 32, 128, 256, 512])
    }

    scheduler = ASHAScheduler(
        max_t=base_args.max_tune_epoches,
        grace_period=3,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["learning_rate", "weight_decay", "batch_size", "latent_dim"],
        metric_columns=["val_lossR", "loss", "Reconstruction_Loss", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(tune_train, base_arg=base_args),
        resources_per_trial={"cpu": 12, "gpu": 1.0},
        metric="val_lossR",
        mode="min",
        config=tune_config,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_vae_chol")

    print("Best hyperparameters found were: ", analysis.best_config)
def run_best_params(opt):
    best_params_dir = get_best_params_dir(opt)
    with open(best_params_dir + '/params.json') as f:
        best_params = json.loads(f.read())
    # allow params specified at the cmd line to override
    best_params_ret = {**best_params, **opt}
    # the exception is number of epochs as we want to use more here than we would for
    # hyperparameter tuning.
    best_params_ret['epoch'] = opt['epoch']
    print("Running with parameters {}".format(best_params_ret))

    data_dir = os.path.abspath("../data")
    reporter = CLIReporter(metric_columns=[
        "accuracy", "loss", "test_acc", "train_acc", "best_time", "best_epoch",
        "training_iteration"
    ])

    if opt['name'] is None:
        name = opt['folder'] + '_test'
    else:
        name = opt['name']

    result = tune.run(
        partial(train_ray_int, data_dir=data_dir),
        name=name,
        resources_per_trial={"cpu": opt['cpus'], "gpu": opt['gpus']},
        search_alg=None,
        keep_checkpoints_num=3,
        checkpoint_score_attr='accuracy',
        config=best_params_ret,
        num_samples=opt['reps'] if opt["num_splits"] == 0 else opt["num_splits"] * opt["reps"],
        scheduler=None,
        max_failures=1,  # early stop solver can't recover from failure as it doesn't own m2.
        local_dir='../ray_tune',
        progress_reporter=reporter,
        raise_on_failed_trial=False)

    df = result.dataframe(metric=opt['metric'], mode="max").sort_values(
        opt['metric'], ascending=False)
    try:
        df.to_csv('../ray_results/{}_{}.csv'.format(
            name, time.strftime("%Y%m%d-%H%M%S")))
    except:
        pass

    print(df[['accuracy', 'test_acc', 'train_acc', 'best_time', 'best_epoch']])

    test_accs = df['test_acc'].values
    print("test accuracy {}".format(test_accs))
    log = "mean test {:04f}, test std {:04f}, test sem {:04f}, test 95% conf {:04f}"
    print(
        log.format(test_accs.mean(), np.std(test_accs), get_sem(test_accs),
                   mean_confidence_interval(test_accs)))
def tune_from_existing(start_model, start_config, num_samples=10, num_epochs=10,
                       gpus_per_trial=0.0, day=0):
    data_interface = MNISTDataInterface("/tmp/mnist_data", max_days=10)
    num_examples = (data_interface._get_day_slice(day) -
                    data_interface._get_day_slice(day - 1))

    config = start_config.copy()
    config.update({
        "batch_size": tune.choice([16, 32, 64]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
    })

    scheduler = ASHAScheduler(
        metric="mean_accuracy",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    reporter = CLIReporter(
        parameter_columns=["lr", "momentum", "batch_size"],
        metric_columns=["mean_accuracy", "training_iteration"],
    )

    analysis = tune.run(
        partial(
            train_mnist,
            start_model=start_model,
            data_fn=data_interface.get_incremental_data,
            num_epochs=num_epochs,
            use_gpus=True if gpus_per_trial > 0 else False,
            day=day,
        ),
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose=0,
        name="tune_serve_mnist_fromsexisting",
    )

    best_trial = analysis.get_best_trial("mean_accuracy", "max", "last")
    best_accuracy = best_trial.metric_analysis["mean_accuracy"]["last"]
    best_trial_config = best_trial.config
    best_checkpoint = best_trial.checkpoint.value

    return best_accuracy, best_trial_config, best_checkpoint, num_examples
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--X_dir", type=str)
    parser.add_argument("--y_dir", type=str)
    parser.add_argument("--epoch", type=int)
    parser.add_argument("--config_dir", type=str)
    parser.add_argument("--model", type=str)
    parser.add_argument("--n_sample", type=int)
    args = parser.parse_args()

    X_train = torch.load(args.X_dir)
    y_train = torch.load(args.y_dir)

    config = {
        "n_hidden": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([16, 32, 64, 128]),
    }

    CL = CustomLoss(1, 2)

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=args.epoch,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    def train_func(config):
        train_model(
            X=X_train,
            y=y_train,
            num_epochs=args.epoch,
            loss_func=CL.custom_loss_1,
            model_name=args.model,
            config=config,
        )

    result = tune.run(
        train_func,
        resources_per_trial={"cpu": 2, "gpu": 2},
        config=config,
        num_samples=args.n_sample,
        scheduler=scheduler,
        progress_reporter=reporter,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    with open(args.config_dir, "w") as json_file:
        json.dump(best_trial.last_result["config"], json_file)

    last_loss = best_trial.last_result["loss"]
    print(f"Validation Loss of best model was {last_loss}.")
def tune_vl_bert(config_path, pl_ckpt_path, num_samples=10, num_epochs=10, gpus_per_trial=2):
    # scheduler = ASHAScheduler(
    #     metric="loss",
    #     mode="min",
    #     max_t=num_epochs,
    #     grace_period=1,
    #     reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["lr", "weight_decay", "warmup_factor", "max_epoch", "batch_size"],
        metric_columns=["mean_accuracy", "training_iteration"])

    param_config = {
        "lr": 6.25e-7,
        "weight_decay": tune.loguniform(1e-5, 1e-2),
        "batch_size": 4,
        "max_epoch": tune.choice([4, 6, 8, 10]),
        "warmup_factor": tune.uniform(0, 1),
        "warmup_steps": tune.uniform(100, 800),
    }

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="mean_accuracy",
        mode="max",
        perturbation_interval=2,
        hyperparam_mutations={
            "lr": tune.loguniform(6.25e-8, 6.25e-6),  # bounds given in ascending order
            "batch_size": [1, 2, 3, 4],
        })

    update_config(config_path)
    model_base_cfg = copy.deepcopy(config)

    tune.run(
        partial(
            _tune,
            vl_bert_config=model_base_cfg,
            pl_ckpt_path=pl_ckpt_path,
            num_gpus=gpus_per_trial,
        ),
        resources_per_trial={"cpu": 4, "gpu": gpus_per_trial},
        config=param_config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_vl_bert")
def tune4_withLabel(
        model,
        train_set: Dataset,
        val_set: Dataset,
        dims: list,
        config: dict,
        EPOCHS: int = 300,
        extra_feature_len: int = 0,
        extra_feature_len2: int = 0,
        n_gpu=1,
        n_samples=20,
        model_name="model",
):
    dim1, dim2, dim3, dim4 = dims[0], dims[1], dims[2], dims[3]

    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "hidden_dim"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)

    analysis = tune.run(
        tune.with_parameters(
            train4_withLabel,
            model=model,
            dim1=dim1,
            dim2=dim2,
            dim3=dim3,
            dim4=dim4,
            extra_feature_len=extra_feature_len,
            extra_feature_len2=extra_feature_len2,
            train_set=train_set,
            val_set=val_set,
            num_epochs=EPOCHS,
            num_gpus=n_gpu,
            model_name=model_name),
        resources_per_trial={"cpu": 1, "gpu": n_gpu},
        metric="loss",
        mode="min",
        config=config,
        num_samples=n_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=model_name,
        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
def start_training(name):
    Epochs = 1000
    Samples = 50
    ModelName = name

    pose_autoencoder = MLP_withLabel.load_checkpoint(
        "/home/nuoc/Documents/MEX/models/MLP4_withLabel_best/M3/0.00324857.512.pbz2")
    # pose_autoencoder = MLP_withLabel.load_checkpoint("/home/nuoc/Documents/MEX/models/MLP_withLabel/0.0013522337.512.pbz2")

    pose_encoder_out_dim = pose_autoencoder.dimensions[-1]

    scheduler = ASHAScheduler(max_t=Epochs, grace_period=15, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=1)

    analysis = tune.run(
        tune.with_parameters(
            tuning,
            MODEL=MotionGenerationModel,
            pose_autoencoder=pose_autoencoder,
            cost_dim=cost_dim,
            phase_dim=phase_dim,
            input_slices=[phase_dim, pose_dim, cost_dim],
            output_slices=[phase_dim, phase_dim, pose_encoder_out_dim],
            train_set=train_set,
            val_set=val_set,
            num_epochs=Epochs,
            model_name=ModelName),
        resources_per_trial={"cpu": 2, "gpu": 1},
        metric="loss",
        mode="min",
        config=config,
        num_samples=Samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=ModelName,
        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)

    ray.shutdown()
def main(args):
    cfg = setup(args)

    search_space = CS.ConfigurationSpace()
    search_space.add_hyperparameters([
        CS.UniformFloatHyperparameter(name="lr", lower=1e-6, upper=1e-3),
        CS.UniformFloatHyperparameter(name="wd", lower=0, upper=1e-3),
        CS.UniformFloatHyperparameter(name="wd_bias", lower=0, upper=1e-3),
        CS.CategoricalHyperparameter(name="bsz", choices=[64, 96, 128, 160, 224, 256]),
        CS.CategoricalHyperparameter(name="num_inst", choices=[2, 4, 8, 16, 32]),
        CS.UniformIntegerHyperparameter(name="delay_iters", lower=20, upper=60),
        CS.UniformFloatHyperparameter(name="ce_scale", lower=0.1, upper=1.0),
        CS.UniformIntegerHyperparameter(name="circle_scale", lower=8, upper=256),
        CS.UniformFloatHyperparameter(name="circle_margin", lower=0.1, upper=0.5),
        CS.CategoricalHyperparameter(name="autoaug_enabled", choices=[True, False]),
        CS.CategoricalHyperparameter(name="cj_enabled", choices=[True, False]),
    ])

    exp_metrics = dict(metric="score", mode="max")

    bohb_hyperband = HyperBandForBOHB(
        time_attr="training_iteration",
        max_t=7,
        **exp_metrics,
    )
    bohb_search = TuneBOHB(search_space, max_concurrent=4, **exp_metrics)

    reporter = CLIReporter(
        parameter_columns=["bsz", "num_inst", "lr"],
        metric_columns=["r1", "map", "training_iteration"])

    analysis = tune.run(
        partial(train_reid_tune, cfg),
        resources_per_trial={"cpu": 10, "gpu": 1},
        search_alg=bohb_search,
        num_samples=args.num_samples,
        scheduler=bohb_hyperband,
        progress_reporter=reporter,
        local_dir=cfg.OUTPUT_DIR,
        keep_checkpoints_num=4,
        name="bohb")

    best_trial = analysis.get_best_trial("map", "max", "last")
    logger.info("Best trial config: {}".format(best_trial.config))
    logger.info("Best trial final validation mAP: {}, Rank-1: {}".format(
        best_trial.last_result["map"], best_trial.last_result["r1"]))
def hyperparameter_tuning_initializer(loss_type='SL', learning_rate_scheduler='CARM'):
    # defining the hyperparameters
    if loss_type == 'FL':
        config = {
            'gamma': tune.choice([0.5, 1, 2]),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDIL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'inverse_dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'SL':
        config = {
            'lambda': tune.uniform(0, 1),
            'tau': tune.uniform(0.02, 0.04),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'VCE':
        config = {
            'var_loss': tune.uniform(0.5, 5.5),
            'lr': tune.loguniform(1e-4, 1e-3)
        }

    # hyperparameters for learning rate scheduler
    if learning_rate_scheduler == 'CARM':
        config['T_0'] = tune.choice([5, 10, 20, 40, 50])
        config['eta_min_factor'] = tune.loguniform(1e2, 1e4)
    if learning_rate_scheduler == 'SLR':
        config['step_size'] = tune.choice([5, 10, 20, 40, 50])
    if learning_rate_scheduler == 'MLR':
        config['lr_lambda'] = tune.uniform(0.8, 0.99)

    # defining the scheduler
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=conf['max_epochs'] // 20,
        grace_period=1,
        reduction_factor=2)

    # defining the reporter
    reporter = CLIReporter(metric_columns=['loss', 'avg_dice_coefficient', 'epoch'])

    return config, scheduler, reporter
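# Illustrative usage (an assumption, not from the original project): the initializer
# above only builds the pieces, so a caller would pass them on to tune.run. The
# trainable name train_segmentation and the resource numbers are placeholders; the
# trainable is expected to report 'loss', 'avg_dice_coefficient' and 'epoch'.
def run_tuning_sketch(num_samples=10):
    config, scheduler, reporter = hyperparameter_tuning_initializer(
        loss_type='SL', learning_rate_scheduler='CARM')
    analysis = tune.run(
        train_segmentation,  # hypothetical trainable
        config=config,
        scheduler=scheduler,
        progress_reporter=reporter,
        num_samples=num_samples,
        resources_per_trial={"cpu": 2, "gpu": 1})
    print("Best config:", analysis.get_best_config(metric="loss", mode="min"))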
def run_ray_single_instance(trainer, logger, **kwargs):
    """Run with ray in a single instance. Tested."""
    # adapted from HF integrations
    prefix_checkpoint_dir = "checkpoint"

    def _objective(trial, checkpoint_dir=None):
        model_path = None
        if checkpoint_dir:
            for subdir in os.listdir(checkpoint_dir):
                if subdir.startswith(prefix_checkpoint_dir):
                    model_path = os.path.join(checkpoint_dir, subdir)
        trainer.objective = None
        trainer.train(model_path=model_path, trial=trial)
        # If there hasn't been any evaluation during the training loop.
        if getattr(trainer, "objective", None) is None:
            metrics = trainer.evaluate()
            # Logs at the end of the objective function, useful for hp search only
            trainer._tune_save_checkpoint()
            # Q: what else is reporting, when not reporting hp search metrics?
            tune.report(**metrics, done=True)

    _tb_writer = trainer.pop_callback(TensorBoardCallback)
    if _tb_writer is not None:
        trainer.add_callback(_tb_writer)

    # Setup default `resources_per_trial` and `reporter`.
    if "resources_per_trial" not in kwargs and trainer.args.n_gpu > 0:
        # `args.n_gpu` is considered the total number of GPUs that will be split
        # among the `n_jobs`
        n_jobs = int(kwargs.pop("n_jobs", 1))
        num_gpus_per_trial = trainer.args.n_gpu
        if num_gpus_per_trial / n_jobs >= 1:
            num_gpus_per_trial = int(math.ceil(num_gpus_per_trial / n_jobs))
        kwargs["resources_per_trial"] = {"gpu": num_gpus_per_trial}

    if "progress_reporter" not in kwargs:
        from ray.tune import CLIReporter
        kwargs["progress_reporter"] = CLIReporter(metric_columns=["objective"])

    if "keep_checkpoints_num" in kwargs and kwargs["keep_checkpoints_num"] > 0:
        # `keep_checkpoints_num=0` would disable checkpointing
        trainer.use_tune_checkpoints = True
        if kwargs["keep_checkpoints_num"] > 1:
            logger.warning(
                "Currently keeping {} checkpoints for each trial. "
                "Checkpoints are large, consider setting `keep_checkpoints_num=1`."
                .format(kwargs["keep_checkpoints_num"]))

    # run tune
    tune.run(_objective, **kwargs)
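# Hypothetical launcher (an assumption, not from the original): everything in **kwargs
# except n_jobs is forwarded straight to tune.run, so the search space, sample count
# and checkpoint policy are supplied there. The config keys are placeholders.
def launch_hp_search_sketch(trainer, logger):
    run_ray_single_instance(
        trainer,
        logger,
        config={"learning_rate": tune.loguniform(1e-5, 1e-3)},
        num_samples=4,
        n_jobs=2,  # splits trainer.args.n_gpu across two concurrent trials
        keep_checkpoints_num=1,
    )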
def main(num_samples=10, max_num_epochs=50, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    checkpoint_dir = os.path.abspath("./checkpoints")

    config = {
        "l1": tune.choice([32, 64, 128, 256]),
        "l2": tune.choice([32, 64, 128, 256]),
        "lr": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([64, 128, 256])
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=5,
        reduction_factor=2)

    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    st_time = time.time()
    result = tune.run(
        partial(train_regression, checkpoint_dir=checkpoint_dir, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
    print("Total Time", time.time() - st_time)  # elapsed seconds
def main(args):
    device = config_cuda(args.use_cuda)

    data_pth = Path(Path.cwd() / args.data_path)
    out_pth = Path(Path.cwd() / args.out_path)

    train_set, valid_set, test_set = get_datasets(data_pth)

    config = {
        'eta': tune.loguniform(1e-5, 1e-1),
        'batch_size': tune.choice([2, 4, 8, 16])
    }

    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=['eta', 'batch_size'],
        metric_columns=['loss', 'accuracy', 'training_iteration'])

    num_gpus = 1 if device == 'cuda' else 0

    result = tune.run(
        partial(train_model,
                model_name=args.model,
                device=device,
                max_epochs=args.max_epochs,
                num_workers=args.num_workers,
                data_pth=data_pth),
        resources_per_trial={'cpu': args.num_workers, 'gpu': num_gpus},
        config=config,
        num_samples=args.num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial('loss', 'min', 'last')
    print(f'Best trial config {best_trial.config}')
    print(f'Best trial final validation loss: {best_trial.last_result["loss"]}')

    model = load_model(args.model, len(train_set.classes))
    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, 'checkpoint'))
    model.load_state_dict(model_state)

    test_acc = test_model(model, device, test_set, out_pth, args.num_workers)
    print(f'Best trial test set accuracy: {test_acc}')
def main(num_samples=10, max_num_epochs=10):
    data_dir = os.path.abspath("./data")
    trainset, _ = load_data(data_dir)
    class_names = trainset.classes

    config = {
        "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-2),
        "batch_size": tune.choice([8, 16, 32])
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    result = tune.run(
        partial(train_net, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": 1},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(len(class_names), best_trial.config["l1"],
                             best_trial.config["l2"])
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
def run_learning(env_config):
    config['env_config'] = env_config

    stop = {
        # "episodes_total": 10
    }

    tune.run(
        "PPO",
        name="Herding",
        stop=stop,
        config=config,
        trial_name_creator=lambda trial: 'Herding',
        trial_dirname_creator=lambda trial: 'Herding',
        keep_checkpoints_num=1,
        checkpoint_freq=5,
        progress_reporter=CLIReporter(max_report_frequency=60)
        # resume=True
    )
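# Hypothetical invocation (assumption): run_learning expects whatever env_config the
# Herding environment was registered with; the keys below are placeholders only.
run_learning({"num_agents": 1, "max_episode_steps": 500})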
def tuning(args):
    activation = nn.PReLU if args.actv == 'prelu' else nn.SELU

    config = {
        "l1_units": tune.choice([480, 512, 544]),
        "l2_units": tune.choice([224, 256, 288]),
        "l3_units": tune.choice([96, 128, 160]),
        "lambda": tune.choice([1e-3, 1e-4, 1e-5]),
        "actv": tune.choice([activation])
    }

    scheduler = PopulationBasedTraining(
        time_attr='training_iteration',
        perturbation_interval=4,
        hyperparam_mutations={
            "l1_units": [464, 496, 528, 560, 576],
            "l2_units": [208, 240, 272, 304, 328],
            "l3_units": [80, 112, 144, 176, 208]
        })

    reporter = CLIReporter(
        parameter_columns=["l1_units", "l2_units", "l3_units", "lambda"],
        metric_columns=["loss", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(train,
                             batch_size=args.batch_size,
                             num_epochs=args.num_epochs,
                             num_gpus=args.num_gpus),
        resources_per_trial={"cpu": args.num_cpus, "gpu": args.num_gpus},
        metric="loss",
        mode="min",
        config=config,
        num_samples=args.num_trials,
        scheduler=scheduler,
        progress_reporter=reporter,
        max_failures=3,
        stop={"training_iteration": 10},
        name="tune_cae")

    print(f"Found best hyperparameters: {analysis.best_config}")
def tune_ecg(data_dir, num_epochs=1, normalised=True, num_samples=10, gpus_per_trial=1):
    config = {
        "lstm_size": tune.choice([2, 3, 4, 5, 32, 64, 128]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128, 256, 512])
    }

    # scheduler = PopulationBasedTraining(
    #     time_attr="training_iteration",
    #     perturbation_interval=5,
    #     hyperparam_mutations={
    #         # distribution for resampling
    #         "lr": lambda: np.random.uniform(0.0001, 1),
    #         # allow perturbations within this set of categorical values
    #         "momentum": [0.8, 0.9, 0.99],
    #     })

    reporter = CLIReporter(
        parameter_columns=["lstm_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    trainable = tune.with_parameters(
        train_ecg,
        data_dir=data_dir,
        normalised=normalised,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial)

    analysis = tune.run(
        trainable,
        resources_per_trial={"cpu": 16, "gpu": gpus_per_trial},
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        local_dir="./results",
        # scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_ecg")

    print("Best hyperparameters found were: ", analysis.best_config)
def main(args, reproducible: bool):
    if reproducible:
        seed_everything(42)

    datamodule = TwoDomainMMEDM(
        dataPath=args.dataPath,
        augment=True,
        batch_size=32,
        num_workers=8)

    config = {
        "log_lr": tune.uniform(-4, -2),
        "log_lrRatio": tune.uniform(-3, 0),
        "log_decay": tune.uniform(-8, -1),
    }

    search_alg = BayesOptSearch(metric='mean_iou', mode='max')
    scheduler = ASHAScheduler(grace_period=25)

    reporter = CLIReporter(
        parameter_columns=["log_lr", "log_lrRatio", "log_decay"],
        metric_columns=["loss", "mean_iou", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            trainWithTune,
            datamodule=datamodule,
            num_epochs=175,
            num_gpus=1,
        ),
        resources_per_trial={"cpu": 5, "gpu": 0.5},
        metric="mean_iou",
        mode="max",
        config=config,
        num_samples=20,
        scheduler=scheduler,
        search_alg=search_alg,
        progress_reporter=reporter,
        name="tune_minimax_segmenter")

    print("Best hyperparameters found were: ", analysis.best_config)
def optimize():
    name_dir = os.path.join('saved', 'hyper-lstm')
    hyperparam_config = {
        'name': 'lstm',
        'num_hidden': tune.sample_from(lambda _: np.random.randint(1, 10)),
        'num_layers': tune.sample_from(lambda _: np.random.randint(1, 5)),
        'opt': tune.choice(['adam', 'sgd', 'adamw', 'lbfgs']),
        'lr': tune.loguniform(1e-10, 1),
        'epoch': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'beta_1': tune.loguniform(1e-8, 1e-2),
        'beta_2': tune.loguniform(1e-8, 1e-2),
        'weight_decay': tune.loguniform(1e-8, 1e-2),
        'max_iter': tune.sample_from(lambda _: np.random.randint(10, 100)),
        'momentum': tune.uniform(0.5, 0.9),
        'patience': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'batch_size': 16
    }

    if not os.path.isdir(name_dir):
        os.mkdir(name_dir)

    scheduler = ASHAScheduler(
        metric='accuracy',
        mode='max',
        max_t=25,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "accuracy"])

    result = tune.run(
        partial(train, checkpoint_dir=name_dir, cwd=os.getcwd(), tuning=True),
        resources_per_trial={"cpu": 1, "gpu": 0.5},
        config=hyperparam_config,
        num_samples=200,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("accuracy", "max", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
    print("Best Checkpoint Dir: " + str(best_trial.checkpoint.value))

    return best_trial.config
def _tune(
        model,
        train_set: Dataset,
        val_set: Dataset,
        dim: int,
        config: dict,
        EPOCHS: int = 300,
        n_gpu=1,
        n_samples=20,
        model_name="model",
):
    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)

    analysis = tune.run(
        tune.with_parameters(
            train,
            model=model,
            dim=dim,
            train_set=train_set,
            val_set=val_set,
            num_epochs=EPOCHS,
            num_gpus=n_gpu,
            model_name=model_name),
        resources_per_trial={"cpu": 1, "gpu": n_gpu},
        metric="loss",
        mode="min",
        config=config,
        num_samples=n_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=model_name,
        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
def run(smoke_test=False):
    stop = {"training_iteration": 1 if smoke_test else 50}
    num_workers = 1 if smoke_test else 20
    num_gpus = 0 if smoke_test else 1

    config = {
        "env": "PongNoFrameskip-v4",
        "framework": tune.grid_search(["tf", "torch"]),
        "num_gpus": num_gpus,
        "rollout_fragment_length": 50,
        "train_batch_size": 750,
        "num_workers": num_workers,
        "num_envs_per_worker": 1,
        "clip_rewards": True,
        "num_sgd_iter": 2,
        "vf_loss_coeff": 1.0,
        "clip_param": 0.3,
        "grad_clip": 10,
        "vtrace": True,
        "use_kl_loss": False,
    }

    logger.info("Configuration: \n %s", pformat(config))

    # Run the experiment.
    # TODO(jungong) : maybe add checkpointing.
    return tune.run(
        "APPO",
        config=config,
        stop=stop,
        verbose=1,
        num_samples=1,
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "snapshots": "snapshots",
                "episodes_this_iter": "train_episodes",
                "episode_reward_mean": "reward_mean",
            },
            sort_by_metric=True,
            max_report_frequency=30,
        ),
    )
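# Quick local check (assumption, not from the original): the smoke_test flag above
# shrinks the experiment to one worker and a single training iteration, so a dry run
# of both framework variants can be kicked off like this.
run(smoke_test=True)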
def tune_mnist_asha(num_samples=10, num_epochs=50, gpus_per_trial=0, cpus_per_trial=4):
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "layer_3_size": tune.choice([128, 256, 512]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "layer_1_size", "layer_2_size", "layer_3_size", "lr", "batch_size"
        ],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune,
            data_dir=data_dir,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial},
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree(data_dir)
def tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0):
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": 1e-3,
        "batch_size": 64,
    }

    scheduler = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations={
            "lr": tune.loguniform(1e-4, 1e-1),
            "batch_size": [32, 64, 128]
        })

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune_checkpoint,
            data_dir=data_dir,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_pbt")

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree(data_dir)
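# Possible entry point (assumption, not from the original): choosing between the ASHA
# and PBT variants above from the command line; the flag name is illustrative.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--pbt", action="store_true",
                        help="use PopulationBasedTraining instead of ASHA")
    cli_args = parser.parse_args()
    if cli_args.pbt:
        tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0)
    else:
        tune_mnist_asha(num_samples=10, num_epochs=50, gpus_per_trial=0, cpus_per_trial=4)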