parser.add_argument("--smoke-test", action="store_true", help="Finish quickly for testing") args, _ = parser.parse_known_args() if not args.smoke_test and repo.is_dirty(): raise RepositoryDirtyError( repo, "Have you forgotten to commit the changes?") corpus_size = 100 config = { # A trick to log the SHA of the git HEAD. "SHA": tune.grid_search([sha]), "corpus_size": tune.grid_search([corpus_size]), "margin": tune.loguniform(0.0001, 0.2), "lr": tune.loguniform(0.001, 0.1), "batch_size": tune.grid_search([300]), "num_epochs": max(1000000 // corpus_size, 1), "test_freq": max(10000 // corpus_size, 1), "seed": 0, } analysis = tune.run( TrainBigramNN, name=experiment_name, config=config, num_samples=1 if args.smoke_test else 1000, # trial_name_creator=trial_str_creator, resources_per_trial={ "cpu": 4,
parser.add_argument( "--server-address", type=str, default=None, required=False, help="The address of server to connect to if using Ray Client.", ) args, _ = parser.parse_known_args() if args.server_address: import ray ray.init(f"ray://{args.server_address}") analysis = tune.run( easy_objective, name="hyperband_test", metric="mean_loss", mode="min", num_samples=5, trial_name_creator=trial_str_creator, callbacks=[TestLoggerCallback()], stop={"training_iteration": 1 if args.smoke_test else 100}, config={ "steps": 100, "width": tune.randint(10, 100), "height": tune.loguniform(10, 100), }, ) print("Best hyperparameters: ", analysis.best_config)
"prioritized_replay_beta_annealing_timesteps": 20000, # Epsilon to add to the TD errors when updating priorities. "prioritized_replay_eps": 0.0, # Whether to LZ4 compress observations "compress_observations": True, # Callback to run before learning on a multi-agent batch of experiences. # "before_learn_on_batch": debug_before_learn_on_batch, # If set, this will fix the ratio of replayed from a buffer and learned on # timesteps to sampled from an environment and stored in the replay buffer # timesteps. Otherwise, the replay will proceed at the native ratio # determined by (train_batch_size / rollout_fragment_length). "training_intensity": None, # === Optimization === # Learning rate for adam optimizer "lr": loguniform(0.0001, 0.1), # Learning rate schedule "lr_schedule": None, # Adam epsilon hyper parameter "adam_epsilon": choice([1e-8, 1e-6, 1e-4, 1e-2]), # If not None, clip gradients during optimization at this value "grad_clip": None, # How many steps of the model to sample before learning starts. "learning_starts": 16000, # Update the replay buffer with this many samples at once. Note that # this setting applies per-worker if num_workers > 1.q "rollout_fragment_length": choice([4, 8, 16, 32]), "batch_mode": "truncate_episodes", # Size of a batch sampled from replay buffer for training. Note that # if async_updates is set, then each worker returns gradients for a
def optimize(train_x, train_y, test_x, test_y):
    import sklearn.datasets
    import sklearn.metrics
    import os
    from ray.tune.schedulers import ASHAScheduler
    from sklearn.model_selection import train_test_split
    import xgboost as xgb
    from ray import tune
    from ray.tune.integration.xgboost import TuneReportCheckpointCallback

    def train_breast_cancer(config: dict):
        # This is a simple training function to be passed into Tune.
        # The dataset is already loaded and split into train and test sets,
        # so only the XGBoost input matrices need to be built here.
        train_set = xgb.DMatrix(train_x, label=train_y)
        test_set = xgb.DMatrix(test_x, label=test_y)
        # Train the classifier, using the Tune callback.
        xgb.train(
            config,
            train_set,
            evals=[(test_set, "eval")],
            verbose_eval=False,
            callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])


if __name__ == "__main__":
    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1)
    }
    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t=10,  # 10 training iterations
        grace_period=1,
        reduction_factor=2)
    analysis = tune.run(
        train_breast_cancer,
        metric="eval-logloss",
        mode="min",
        # You can add "gpu": 0.1 to allocate GPUs
        resources_per_trial={"cpu": 1},
        config=search_space,
        num_samples=10,
        scheduler=scheduler)

    # Load the best model checkpoint.
    best_bst = xgb.Booster()
    best_bst.load_model(os.path.join(analysis.best_checkpoint, "model.xgb"))
    accuracy = 1. - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
    # You could now do further predictions with
    # best_bst.predict(...)


log(action_logging_enum=INFO,
    logging_text="[EXTREME GRADIENT BOOSTING]: Starting to search for best "
                 "number of trees by cross validating different values.")

# Read data.
train = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE)
train = transform_data(train)
x_train = train.drop(['Label'], axis=1)
y_train = train['Label'].copy()

# Candidate values for the hyperparameter grid.
n_estimators = [40, 50, 60, 80, 100, 120, 140, 160]
min_samples_leaf = [1, 2, 3, 4, 5]
min_samples_split = [3, 4, 5, 6, 7, 8, 9, 10]
max_features = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35]

model = XGBClassifier()

# Create hyperparameter options.
hyperparameters = dict(min_samples_split=min_samples_split,
                       min_samples_leaf=min_samples_leaf,
                       n_estimators=n_estimators,
                       max_features=max_features)

# Create grid search using 10-fold cross validation.
clf = GridSearchCV(model, hyperparameters, cv=10, verbose=0)
best_model = clf.fit(x_train, y_train)

# View best hyperparameters
# Best estimators: 60
# Best samples leaf: 1
# Best samples split: 3
# Best features: 5
log(action_logging_enum=INFO,
    logging_text="[EXTREME GRADIENT BOOSTING]: Optimization completed.")
log(INFO, "Best estimators: {}".format(
    best_model.best_estimator_.get_params()['n_estimators']))
log(INFO, "Best samples leaf: {}".format(
    best_model.best_estimator_.get_params()['min_samples_leaf']))
log(INFO, "Best samples split: {}".format(
    best_model.best_estimator_.get_params()['min_samples_split']))
log(INFO, "Best features: {}".format(
    best_model.best_estimator_.get_params()['max_features']))
    name=expname)

print("Best Parameters:", analysis.best_config)

analysis.best_result_df.to_csv("best_parameters_exp%s_trials%d.csv" %
                               (expname, ntrials))
analysis.results_df.to_csv("all_results_exp%s_trials%d.csv" %
                           (expname, ntrials))
print("Best 5 results")
print(analysis.results_df.sort_values(by="mcc", ascending=False).head(5))

# +
default_mpl = {
    "structure": "mpl",
    "learning_rate": tune.loguniform(1e-6, 1e-1),
    "batch_size": tune.choice([32, 64, 128, 256]),
    "monitor": tune.choice(["loss", "mcc"]),
    "shared_output_size": tune.randint(2, 256),
    "opt_step_size": tune.randint(1, 20),
    "weight_decay": tune.loguniform(1e-5, 1e-2),
    "dropout_input_layers": tune.uniform(0, 1),
    "dropout_inner_layers": tune.uniform(0, 1),
}

default_lstm = {
    "structure": "lstm",
    "learning_rate": tune.loguniform(1e-6, 1e-1),
    "batch_size": tune.choice([32, 64, 128, 256]),
    "bidirectional": tune.choice([True, False]),
    "num_layers": tune.choice([1, 2]),
def train_wrapper(config, ray_params):
    train_ray(
        path="/data/classification.parquet",
        num_workers=4,
        num_boost_rounds=100,
        num_files=64,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=config,
    )


if __name__ == "__main__":
    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }
    ray.init(address="auto")
    ray_params = RayParams(elastic_training=False,
                           max_actor_restarts=2,
                           num_actors=4,
                           cpus_per_actor=1,
                           gpus_per_actor=0)
    analysis = tune.run(tune.with_parameters(train_wrapper,
                                             ray_params=ray_params),
                        config=search_space,
def hp_space(trial):
    return dict(learning_rate=tune.loguniform(1e-4, 1e-2))
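# The function above only defines the search space. A minimal usage sketch
# follows, assuming `trainer` is a Hugging Face transformers.Trainer built
# elsewhere in this file and that Ray Tune is the intended backend; the trial
# budget below is a hypothetical value, not taken from the original code.
best_run = trainer.hyperparameter_search(
    hp_space=hp_space,      # search space defined above
    backend="ray",          # run the search through Ray Tune
    n_trials=10,            # hypothetical trial budget
    direction="minimize",
)
print(best_run.hyperparameters)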
def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
    # So that we have some global checkpointing happening.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
        ignore_errors=True,
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
        # is resolved.
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        # "datasets": {
        #     "train": tune.choice(
        #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
        #     ),
        # },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }

    class FailureInjectionCallback(Callback):
        """Inject failure at the configured iteration number."""

        def __init__(self, num_iters=10):
            self.num_iters = num_iters

        def on_step_end(self, iteration, trials, **kwargs):
            if iteration == self.num_iters:
                print(f"Failing after {self.num_iters} iters.")
                raise RuntimeError

    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(
            name="test_tuner_driver_fail", callbacks=[FailureInjectionCallback()]
        ),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
    )
    with self.assertRaises(TuneError):
        tuner.fit()

    # Test resume.
    restore_path = os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail")
    tuner = Tuner.restore(restore_path)
    # A hack before we figure out RunConfig semantics across resumes.
    tuner._local_tuner._run_config.callbacks = None
    results = tuner.fit()
    assert len(results) == 2
def cifar10_main(method="BlendSearch",
                 num_samples=10,
                 max_num_epochs=100,
                 gpus_per_trial=1):
    data_dir = os.path.abspath("test/data")
    load_data(data_dir)  # Download data for all trials before starting the run
    if method == "BlendSearch":
        from flaml import tune
    else:
        from ray import tune
    if method in ["BOHB"]:
        config = {
            "l1": tune.randint(2, 8),
            "l2": tune.randint(2, 8),
            "lr": tune.loguniform(1e-4, 1e-1),
            "num_epochs": tune.qloguniform(1, max_num_epochs, q=1),
            "batch_size": tune.randint(1, 4),
        }
    else:
        config = {
            "l1": tune.randint(2, 9),
            "l2": tune.randint(2, 9),
            "lr": tune.loguniform(1e-4, 1e-1),
            "num_epochs": tune.loguniform(1, max_num_epochs),
            "batch_size": tune.randint(1, 5),
        }
    import ray
    time_budget_s = 600
    np.random.seed(7654321)
    start_time = time.time()
    if method == "BlendSearch":
        result = tune.run(
            ray.tune.with_parameters(train_cifar, data_dir=data_dir),
            config=config,
            metric="loss",
            mode="min",
            low_cost_partial_config={"num_epochs": 1},
            max_resource=max_num_epochs,
            min_resource=1,
            scheduler="asha",
            resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
            local_dir="logs/",
            num_samples=num_samples,
            time_budget_s=time_budget_s,
            use_ray=True,
        )
    else:
        if "ASHA" == method:
            algo = None
        elif "BOHB" == method:
            from ray.tune.schedulers import HyperBandForBOHB
            from ray.tune.suggest.bohb import TuneBOHB
            algo = TuneBOHB()
            scheduler = HyperBandForBOHB(max_t=max_num_epochs)
        elif "Optuna" == method:
            from ray.tune.suggest.optuna import OptunaSearch
            algo = OptunaSearch(seed=10)
        elif "CFO" == method:
            from flaml import CFO
            algo = CFO(low_cost_partial_config={
                "num_epochs": 1,
            })
        elif "Nevergrad" == method:
            from ray.tune.suggest.nevergrad import NevergradSearch
            import nevergrad as ng
            algo = NevergradSearch(optimizer=ng.optimizers.OnePlusOne)
        if method != "BOHB":
            from ray.tune.schedulers import ASHAScheduler
            scheduler = ASHAScheduler(max_t=max_num_epochs, grace_period=1)
        result = tune.run(
            tune.with_parameters(train_cifar, data_dir=data_dir),
            resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
            config=config,
            metric="loss",
            mode="min",
            num_samples=num_samples,
            time_budget_s=time_budget_s,
            scheduler=scheduler,
            search_alg=algo,
        )
    ray.shutdown()

    logger.info(f"method={method}")
    logger.info(f"#trials={len(result.trials)}")
    logger.info(f"time={time.time() - start_time}")
    best_trial = result.get_best_trial("loss", "min", "all")
    logger.info("Best trial config: {}".format(best_trial.config))
    logger.info("Best trial final validation loss: {}".format(
        best_trial.metric_analysis["loss"]["min"]))
    logger.info("Best trial final validation accuracy: {}".format(
        best_trial.metric_analysis["accuracy"]["max"]))

    best_trained_model = Net(2**best_trial.config["l1"],
                             2**best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")
    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_acc = _test_accuracy(best_trained_model, device)
    logger.info("Best trial test set accuracy: {}".format(test_acc))
def testTuneSampleAPI(self):
    config = {
        "func": tune.sample_from(lambda spec: spec.config.uniform * 0.01),
        "uniform": tune.uniform(-5, -1),
        "quniform": tune.quniform(3.2, 5.4, 0.2),
        "loguniform": tune.loguniform(1e-4, 1e-2),
        "qloguniform": tune.qloguniform(1e-4, 1e-1, 5e-5),
        "choice": tune.choice([2, 3, 4]),
        "randint": tune.randint(-9, 15),
        "lograndint": tune.lograndint(1, 10),
        "qrandint": tune.qrandint(-21, 12, 3),
        "qlograndint": tune.qlograndint(2, 20, 2),
        "randn": tune.randn(10, 2),
        "qrandn": tune.qrandn(10, 2, 0.2),
    }
    for _, (_, generated) in zip(range(1000),
                                 generate_variants({"config": config})):
        out = generated["config"]

        self.assertAlmostEqual(out["func"], out["uniform"] * 0.01)

        self.assertGreaterEqual(out["uniform"], -5)
        self.assertLess(out["uniform"], -1)

        self.assertGreaterEqual(out["quniform"], 3.2)
        self.assertLessEqual(out["quniform"], 5.4)
        self.assertAlmostEqual(out["quniform"] / 0.2,
                               round(out["quniform"] / 0.2))

        self.assertGreaterEqual(out["loguniform"], 1e-4)
        self.assertLess(out["loguniform"], 1e-2)

        self.assertGreaterEqual(out["qloguniform"], 1e-4)
        self.assertLessEqual(out["qloguniform"], 1e-1)
        self.assertAlmostEqual(out["qloguniform"] / 5e-5,
                               round(out["qloguniform"] / 5e-5))

        self.assertIn(out["choice"], [2, 3, 4])

        self.assertGreaterEqual(out["randint"], -9)
        self.assertLess(out["randint"], 15)
        self.assertTrue(isinstance(out["randint"], int))

        self.assertGreaterEqual(out["lograndint"], 1)
        self.assertLess(out["lograndint"], 10)
        self.assertTrue(isinstance(out["lograndint"], int))

        self.assertGreaterEqual(out["qrandint"], -21)
        self.assertLessEqual(out["qrandint"], 12)
        self.assertEqual(out["qrandint"] % 3, 0)
        self.assertTrue(isinstance(out["qrandint"], int))

        self.assertGreaterEqual(out["qlograndint"], 2)
        self.assertLessEqual(out["qlograndint"], 20)
        self.assertEqual(out["qlograndint"] % 2, 0)
        self.assertTrue(isinstance(out["qlograndint"], int))

        # Very improbable
        self.assertGreater(out["randn"], 0)
        self.assertLess(out["randn"], 20)

        self.assertGreater(out["qrandn"], 0)
        self.assertLess(out["qrandn"], 20)
        self.assertAlmostEqual(out["qrandn"] / 0.2, round(out["qrandn"] / 0.2))
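# Aside (not part of the test above): each tune.* call returns a search space
# Domain object that can also be sampled directly, which is a quick way to
# sanity-check the bounds asserted in the test. A minimal sketch, assuming a
# Ray Tune version where Domain.sample() is available:
from ray import tune

lr_domain = tune.loguniform(1e-4, 1e-2)
samples = [lr_domain.sample() for _ in range(5)]
assert all(1e-4 <= s < 1e-2 for s in samples)
print(samples)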
def tune_train(args,
               model_class,
               task_info: TaskInfo,
               build_method=default_build_method,
               model_kwargs: dict = None,
               tune_config=None):
    if model_kwargs is None:
        model_kwargs = {}
    this_time = time.strftime("%m-%d_%H:%M:%S", time.localtime())
    experiment_name = f'{task_info.task_name}_{this_time}'

    if tune_config is None:
        config = {
            # 3e-4 for Small, 1e-4 for Base, 5e-5 for Large
            "lr": tune.loguniform(args.tune_min_lr, args.tune_max_lr),
            # -1 for disable, 0.8 for Base/Small, 0.9 for Large
            "layerwise_lr_decay_power": tune.choice([0.8, 0.9]),
            # lr scheduler
            "lr_scheduler": tune.choice([
                'linear_schedule_with_warmup',
                'polynomial_decay_schedule_with_warmup'
            ]),
        }
    else:
        config = tune_config

    if torch.cuda.is_available():
        resources_per_trial = {
            "cpu": args.tune_cpus_per_trial,
            "gpu": args.tune_gpus_per_trial
        }
    else:
        resources_per_trial = {"cpu": args.tune_cpus_per_trial}
    print("resources_per_trial", resources_per_trial)

    tune_dir = os.path.abspath('tune_lightning_logs')

    analysis = tune.run(
        tune.with_parameters(
            tune_train_once,
            args=args,
            task_info=task_info,
            model_class=model_class,
            build_method=build_method,
            model_kwargs=model_kwargs,
            resume=args.tune_resume,
            group=experiment_name,
            log_dir=tune_dir,
        ),
        mode="max",
        config=config,
        num_samples=args.tune_num_samples,
        metric=f'tune_{task_info.metric_name}',
        name=experiment_name,
        progress_reporter=CLIReporter(
            parameter_columns=list(config.keys()),
            metric_columns=[
                "loss", f'tune_{task_info.metric_name}', "training_iteration"
            ]),
        callbacks=[TBXLoggerCallback(), CSVLoggerCallback()],
        resources_per_trial=resources_per_trial,
        scheduler=ASHAScheduler(
            max_t=args.max_epochs + 1,  # for test
            grace_period=args.min_epochs),
        queue_trials=True,
        keep_checkpoints_num=args.tune_keep_checkpoints_num,
        checkpoint_score_attr=f'tune_{task_info.metric_name}',
        local_dir=tune_dir,
    )

    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best checkpoint: ", analysis.best_checkpoint)

    args_vars = vars(args)
    args_vars.update(analysis.best_config)
    model = model_class.load_from_checkpoint(
        os.path.join(analysis.best_checkpoint, "tune.ckpt"),
        hparams=args,
        **model_kwargs)

    pl_loggers = [
        loggers.CSVLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        loggers.TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                  name="",
                                  version=".",
                                  default_hp_metric=False),
    ]

    try:
        import wandb
        pl_loggers.append(
            loggers.WandbLogger(save_dir=tune_dir,
                                project=args.project,
                                name=tune.get_trial_name(),
                                id=tune.get_trial_id(),
                                offline=args.offline,
                                group=experiment_name))
    except Exception:
        pass

    trainer: Trainer = Trainer.from_argparse_args(args, logger=pl_loggers)
    build_method(model, task_info)
    trainer.test(model)
# Get the empty gpu
gpu.get_empty_gpu()

# /tmp is not accessible on GABA; use the following dir:
ray.init(temp_dir='/tmpscratch/alik/runlogs/ray/')


# Log uniform function
def lognuniform(low=0, high=1, base=np.e):
    size = 1
    return int(np.power(base, np.random.uniform(low, high, size)))


# Random search space definition; the loaders are added since I'm not sure how
# the trainable is called inside tune.  # TODO
space = {
    "lr": tune.loguniform(1e-6, 0.1),
    "momentum": tune.loguniform(0.8, 0.9999),
    # Alternative: tune.sample_from(lambda _: lognuniform(2, 4, 10))
    "n_latent": tune.choice(list(range(100, 10000))),
    "n_fmaps": tune.choice(list(range(4, 16))),
    "validation_loader": validation_loader,
    "train_loader": train_loader
}

analysis = tune.run(trainable,
                    config=space,
                    num_samples=100,
                    resources_per_trial={
                        'gpu': 1,
                        'cpu': 4
                    },
        use_gpu=False,
        trainer_resources={"CPU": 0},  # so that the example works on Colab.
    ),
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)
# Execute training.
result = trainer.fit()
print(f"Last result: {result.metrics}")
# Last result: {'loss': 0.6559339960416158, ...}
# __air_pytorch_train_end__

# __air_pytorch_tuner_start__
from ray import tune

param_space = {"train_loop_config": {"lr": tune.loguniform(0.0001, 0.01)}}
metric = "loss"
# __air_pytorch_tuner_end__

# __air_tune_generic_start__
from ray.tune.tuner import Tuner, TuneConfig
from ray.air.config import RunConfig

tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=TuneConfig(num_samples=5, metric=metric, mode="min"),
)
# Execute tuning.
result_grid = tuner.fit()
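# Follow-up sketch (not in the original snippet): once tuning finishes, the
# ResultGrid can be queried for the best trial. This assumes the metric and
# mode set in TuneConfig above and the Ray AIR ResultGrid API.
best_result = result_grid.get_best_result(metric="loss", mode="min")
print("Best config:", best_result.config)
print("Best metrics:", best_result.metrics)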
    bst = xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        callbacks=[XGBCallback])
    preds = bst.predict(test_set)
    pred_labels = np.rint(preds)
    tune.report(
        mean_accuracy=sklearn.metrics.accuracy_score(test_y, pred_labels),
        done=True)


if __name__ == "__main__":
    num_threads = 2
    config = {
        "verbosity": 0,
        "num_threads": num_threads,
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": ["auc", "ams@0", "logloss"],
        "max_depth": tune.randint(1, 9),
        "eta": tune.loguniform(1e-4, 1e-1),
        "gamma": tune.loguniform(1e-8, 1.0),
        "grow_policy": tune.choice(["depthwise", "lossguide"])
    }
    from ray.tune.schedulers import ASHAScheduler
    tune.run(
        train_breast_cancer,
        resources_per_trial={"cpu": num_threads},
        config=config,
        num_samples=2,
        scheduler=ASHAScheduler(metric="eval-logloss", mode="min"))
data_paths = {
    'train': './data/train/',
    'val': './data/val/'
}

NUM_CPU_PER_TRIAL = os.cpu_count()
NUM_GPU_PER_TRIAL = 1
resources_per_trial = {"gpu": NUM_GPU_PER_TRIAL, "cpu": NUM_CPU_PER_TRIAL}

MAX_TRAINING_EPOCH_PER_TRIAL = 15
SCHEDULER_GAMMA = 0.3

param_priority = ['lr', 'step', 'momentum', 'weight_decay', 'batch_size']
param_space = {
    'lr': tune.loguniform(1e-5, 1e-1),
    'momentum': tune.uniform(0.5, 0.99),
    'step': tune.choice([1, 2, 3]),
    'weight_decay': tune.loguniform(1e-8, 1e-5),
    'batch_size': tune.choice([2**k for k in range(7, 10)])
}
param_defaults = {
    'lr': 1e-2,
    'momentum': 0.9,
    'step': 2,
    'weight_decay': 1e-7,
    'batch_size': 128
}
def searchBestHypers(num_samples=10,
                     max_num_epochs=15,
                     n_epochs_stop=2,
                     grace_period=5,
                     gpus_per_trial=0,
                     data_obj=None):
    import os
    os.chdir('/content/drive/My Drive/DL project/')

    assert data_obj is not None
    experiment_id = 'no_name_yet'

    config_schedule = {
        "batch_size": tune.choice([4, 8, 16, 32]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "h1": tune.sample_from(lambda: 2**np.random.randint(3, 8)),
        "h2": tune.sample_from(lambda: 2**np.random.randint(3, 8)),
        "wd": tune.loguniform(1e-4, 1e-1),
    }

    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=max_num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    pbt = PopulationBasedTraining(time_attr="training_iteration",
                                  metric="loss",
                                  mode="min",
                                  perturbation_interval=4,
                                  hyperparam_mutations={
                                      "batch_size": [8, 16, 32, 64, 128],
                                      "lr": tune.loguniform(1e-4, 1e-1),
                                      "h1": [4, 8, 16, 32, 64],
                                      "h2": [4, 8, 16, 32, 64],
                                      "wd": tune.loguniform(1e-4, 1e-1),
                                  })

    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    result = tune.run(partial(train_cgm,
                              data_obj=data_obj,
                              n_epochs_stop=n_epochs_stop,
                              max_epochs=max_num_epochs,
                              grace_period=grace_period),
                      resources_per_trial={
                          "cpu": 1,
                          "gpu": gpus_per_trial
                      },
                      config=config_schedule,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    # Build best network
    best_trained_model = DilatedNet(h1=best_trial.config["h1"],
                                    h2=best_trial.config["h2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    print("BEST MODEL DIR: ", best_checkpoint_dir)
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    # Call load to fit the scaler. There should be a better solution.
    trainset, valset = data_obj.load_train_and_val()

    test_rmse_val = test_rmse(best_trained_model, data_obj)
    print("Best trial test set rmse: {}".format(test_rmse_val))

    # Save the results
    experiment = {
        'name': str(experiment_id),
        'best_trial_dir': str(best_checkpoint_dir),
        'train_data': str(data_obj.train_data),
        'test_data': str(data_obj.test_data),
        'start_date_train': str(data_obj.start_date_train),
        'start_date_test': str(data_obj.start_date_test),
        'end_date_train': str(data_obj.end_date_train),
        'end_date_test': str(data_obj.end_date_test)
    }

    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    user = getpass.getuser()
    experiment_id = f'id_{current_time}_{user}'
    experiment_path = code_path / 'hyper_experiments'  # / model_id
    experiment_path.mkdir(exist_ok=True, parents=True)

    with open(experiment_path / (experiment_id + '.json'), 'w') as outfile:
        json.dump(experiment, outfile, indent=4)

    # Optionally print information on where the optimal model is saved:
    # print("\n Experiment details are saved in:\n", experiment_path / (experiment_id + '.json'))
    # print("\n Checkpoint for best configuration is saved in:\n", best_checkpoint_dir)

    return experiment_id
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    train_ds, valid_ds, test_ds = datasets.load()

    # model
    model = DenseNet121(IMG_SIZE=256)

    trainer = Trainer(model=model,
                      ds_train=train_ds,
                      ds_val=test_ds,
                      run_paths=run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)


# some question about tune api
analysis = tune.run(
    train_func,
    num_samples=100,
    resources_per_trial={'gpu': 1, 'cpu': 2},
    config={
        "Trainer.total_steps": tune.grid_search([5000]),
        "Trainer.total_steps_ft": tune.randint(300, 1500),
        "Trainer.lr": tune.loguniform(1e-3, 1e-2),
        "Trainer.lr_ft": tune.loguniform(1e-6, 1e-4),
        "Trainer.ft_layer_idx": tune.randint(100, 300),
        "DenseNet121.dense_units": tune.randint(2, 64),
        "DenseNet121.dropout_rate": tune.uniform(0, 0.9),
        "DenseNet121.idx_layer": tune.randint(200, 400)
    })

print("Best config: ",
      analysis.get_best_config(metric="val_accuracy", mode="max"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()
def trainable(config):
    # config (dict): A dict of hyperparameters.
    for x in range(20):
        score = objective(x, config["a"], config["b"])
        tune.track.log(score=score)  # This sends the score to Tune.


# class-based API example
class Trainable(tune.Trainable):
    def _setup(self, config):
        # config (dict): A dict of hyperparameters.
        self.x = 0
        self.a = config["a"]
        self.b = config["b"]

    def _train(self):
        # This is called iteratively.
        score = objective(self.x, self.a, self.b)
        self.x += 1
        return {"score": score}


# Run with "a" sampled log-uniformly and "b" sampled uniformly from (1e-8, 1).
space = {"a": tune.loguniform(1e-8, 1), "b": tune.uniform(1e-8, 1)}
analysis = tune.run(trainable,
                    config=space,
                    num_samples=100,
                    resources_per_trial={'gpu': 1, 'cpu': 4})

# Use the analysis object.
df = analysis.dataframe(metric="score", mode="max")
print(df)
# resources per trial (kp_search_kwargs)
debug_finetuning_hp_search = deepcopy(finetuning_bert100k_glue)
debug_finetuning_hp_search.update(
    model_name_or_path="/mnt/efs/results/pretrained-models/transformers-local/bert_100k",  # noqa: E501
    task_name=None,
    task_names=["cola", "rte"],
    num_runs=1,
    max_steps=200,
    save_steps=1,
    warmup_ratio=0.1,
    hp_validation_dataset_pct=1.0,
    report_to="none",
    task_hyperparams=dict(
        cola=dict(
            hp_space=lambda trial: dict(
                learning_rate=tune.loguniform(1e-5, 1e-2)),
            hp_num_trials=3,
            hp_compute_objective=("maximize", "eval_matthews_correlation")),
        rte=dict(
            hp_space=lambda trial: dict(
                learning_rate=tune.loguniform(1e-5, 1e-2)),
            hp_num_trials=3,
            hp_compute_objective=("maximize", "eval_accuracy")),
    ),
)

debug_finetuning_sparse_hp_search = deepcopy(
    finetuning_bert_sparse_85_trifecta_100k_glue_get_info)
debug_finetuning_sparse_hp_search.update(
    task_name="cola",
    task_names=None,
    gbm = lgb.train(config,
                    train_set,
                    valid_sets=[test_set],
                    verbose_eval=False,
                    callbacks=[LightGBMCallback])
    preds = gbm.predict(test_x)
    pred_labels = np.rint(preds)
    tune.report(mean_accuracy=sklearn.metrics.accuracy_score(
        test_y, pred_labels),
                done=True)


if __name__ == "__main__":
    config = {
        "objective": "binary",
        "metric": "binary_error",
        "verbose": -1,
        "boosting_type": tune.grid_search(["gbdt", "dart"]),
        "num_leaves": tune.randint(10, 1000),
        "learning_rate": tune.loguniform(1e-8, 1e-1)
    }

    analysis = tune.run(train_breast_cancer,
                        metric="binary_error",
                        mode="min",
                        config=config,
                        num_samples=2,
                        scheduler=ASHAScheduler())

    print("Best hyperparameters found were: ", analysis.best_config)
        dataset=dataset))

# Run the training
analysis = tune.run(Training,
                    stop={
                        'training_iteration': args.epochs,
                        'stop_early': True
                    },
                    checkpoint_at_end=True,
                    metric="valid_rmse",
                    mode="min",
                    local_dir=args.logdir,
                    verbose=1,
                    config={
                        "learning_rate": tune.loguniform(
                            args.learning_rate_low, args.learning_rate_high),
                        "l1": tune.loguniform(args.l1_low, args.l1_high),
                    },
                    num_samples=args.num_samples,
                    resources_per_trial={
                        "cpu": 1,
                        "gpu": 0
                    })

ray.shutdown()

# Save args
with open(os.path.join(args.logdir, "args.pickle"), 'wb') as f:
    pickle.dump(vars(args), f)
import torch
import mlflow
from ray import tune
from src.scripts.train_wav2vec_kws import Wav2VecKWS
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.loggers import MLFlowLogger
from ray.tune.integration.mlflow import mlflow_mixin
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.ax import AxSearch

config = {
    "w2v_lr": tune.loguniform(1e-5, 1e-1),
    "decoder_lr": tune.loguniform(1e-4, 1e-1),
    "weight_decay": tune.loguniform(1e-6, 1e-3),
    "batch_size": tune.choice([32, 64, 128, 256]),
    "mlflow": {
        "experiment_name": "wav2vec_kws",
        "tracking_uri": "http://192.168.0.32"
    },
}


@mlflow_mixin
def train_model(config, gpus, w2v, num_epochs=10):
    early_stop_callback = EarlyStopping(monitor="val_Accuracy",
                                        min_delta=0.0,
                                        patience=5,
import pytest

from autogluon.core.hpo.space_converter import RaySpaceConverterFactory
from autogluon.core.space import Space, Categorical, Real, Int, Bool

from ray import tune


@pytest.mark.parametrize('space, expected_space', [
    (Categorical([1, 2]), tune.choice([1, 2])),
    (Real(1, 2, log=True), tune.loguniform(1, 2)),
    (Real(1, 2, log=False), tune.uniform(1, 2)),
    (Int(1, 2), tune.randint(1, 3)),
    (Bool(), tune.randint(0, 2)),
])
def test_space_converter(space, expected_space):
    ray_space = RaySpaceConverterFactory.get_space_converter(
        space.__class__.__name__).convert(space)
    assert type(ray_space) == type(expected_space)
    agent.epsilon_decay = config["epsilon_decay"]
    agent.eta = config["eta"]
    for i in range(10):
        agent.play()
    agent_test = AgentTest(agent, RandomPlayer(), 0)
    reward = agent_test.play()
    tune.report(steps=agent.steps, reward=reward)


bayesopt = BayesOptSearch(metric="reward", mode="max")
analysis = tune.run(
    training_function,
    config={
        "learning_rate": tune.loguniform(1e-3, 1e-1),
        "alpha": tune.loguniform(1e-3, 1e-1),
        "gamma": tune.loguniform(1e-3, 1e-1),
        "delta": tune.loguniform(0.1, 0.5),
        "epsilon_decay": tune.uniform(0.9999, 0.999999),
        "eta": tune.loguniform(1e-3, 1e-1)
    },
    local_dir="/ray_results/DQN",
    search_alg=bayesopt,
    num_samples=100)

print("Best config: ",
      analysis.get_best_config(metric="reward", mode="max"))

# Get a dataframe for analyzing trial results.
df = analysis.results_df
def main(name=None,
         num_samples=64,
         gpus_per_trial=1,
         metric="sotl",
         time_budget=None,
         batch_size=100,
         steps_per_epoch=100,
         max_num_epochs=150,
         total_budget_multiplier=10,
         seed=None):
    data_dir = os.path.abspath("../playground/data")
    load_data(data_dir)  # Download data for all trials before starting the run
    if seed is None:
        seed = random.randint(0, 1000)

    config = {
        "lr": tune.loguniform(5e-5, 5),
        "conv1_l2": tune.loguniform(5e-5, 5),
        "conv2_l2": tune.loguniform(5e-5, 5),
        "conv3_l2": tune.loguniform(5e-5, 5),
        "fc1_l2": tune.loguniform(5e-3, 500),
        "lr_reductions": tune.choice([0, 1, 2, 3]),
        "rnorm_scale": tune.loguniform(5e-6, 5),
        "rnorm_power": tune.uniform(0.01, 3),
        "max_num_epochs": max_num_epochs,
        "batch_size": batch_size,
        "steps_per_epoch": steps_per_epoch,
        "data_dir": data_dir,
        "seed": seed,
        "metric": metric,
        "time_budget": time_budget,
        "total_budget_multiplier": total_budget_multiplier
    }

    scheduler = ASHAScheduler(max_t=config["max_num_epochs"],
                              grace_period=1,
                              reduction_factor=4)
    result = tune.run(train_cifar,
                      name=name,
                      resources_per_trial={
                          "cpu": 2,
                          "gpu": gpus_per_trial
                      },
                      config={
                          **config,
                          "wandb": {
                              "project": "SoTL_Cifar",
                              "api_key_file": "~" + os.sep + ".wandb" +
                                              os.sep + "nas_key.txt"
                          }
                      },
                      metric=config["metric"],
                      mode="min",
                      num_samples=num_samples,
                      scheduler=scheduler,
                      stop=TotalBudgetStopper(
                          config["max_num_epochs"] *
                          config["total_budget_multiplier"]),
                      loggers=DEFAULT_LOGGERS + (WandbLogger, ),
                      time_budget_s=config["time_budget"])

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(rnorm_scale=best_trial.config["rnorm_scale"],
                             rnorm_power=best_trial.config["rnorm_power"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")
    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))

    if os.path.exists("~" + os.sep + ".wandb" + os.sep + "nas_key.txt"):
        f = open("~" + os.sep + ".wandb" + os.sep + "nas_key.txt", "r")
        key = f.read()
        os.environ["WANDB_API_KEY"] = key
                },
            },
            "goal": "maximize",
            "num_samples": 4,
        },
    }

if RAY_AVAILABLE:
    EXPECTED_SEARCH_SPACE = {
        "test_1": {
            "training.learning_rate": tune.uniform(0.001, 0.1),
            "combiner.num_fc_layers": tune.qrandint(3, 6, 3),
            "utterance.cell_type": tune.grid_search(["rnn", "gru", "lstm"]),
        },
        "test_2": {
            "training.learning_rate": tune.loguniform(0.001, 0.1),
            "combiner.num_fc_layers": tune.randint(2, 6),
            "utterance.cell_type": tune.choice(["rnn", "gru", "lstm"]),
        },
    }


@pytest.mark.skipif(not RAY_AVAILABLE, reason="Ray is not installed for testing")
@pytest.mark.parametrize("key", ["test_1", "test_2"])
def test_grid_strategy(key):
    hyperopt_test_params = HYPEROPT_PARAMS[key]
    expected_search_space = EXPECTED_SEARCH_SPACE[key]

    goal = hyperopt_test_params["goal"]
    ck = th.load(checkpoint.best_model_path)
    model.load_state_dict(ck["state_dict"])
    trainer.test(model)


config = {
    "attn_dropout": tune.quniform(0, 1, 0.1),
    "attn_dropout_a": tune.quniform(0, 1, 0.1),
    "attn_dropout_v": tune.quniform(0, 1, 0.1),
    "embed_dropout": tune.quniform(0, 1, 0.1),
    "out_dropout": tune.quniform(0, 1, 0.1),
    "relu_dropout": tune.quniform(0, 1, 0.1),
    "res_dropout": tune.quniform(0, 1, 0.1),
    # "project_dim": tune.choice([40, 50, 60, 70]),
    "lr": tune.loguniform(1e-6, 1e-3),
    "weight_decay": tune.loguniform(1e-10, 1e-2),
}

previous_best = {
    "attn_dropout": 0.3,
    "attn_dropout_a": 0.5,
    "attn_dropout_v": 0.0,
    "embed_dropout": 0.0,
    "out_dropout": 0.2,
    "relu_dropout": 0.5,
    "res_dropout": 0.1,
    "layers": 5,
    "num_heads": 6,
    "head_dim": 14,
    "lr_log": -4,
https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-scheduler-pbt
"""

"""config - returns a dict of hyperparameters

Hyperparameters selected for tuning:
    l1         : Number of units in the first fully connected layer
    l2         : Number of units in the second fully connected layer
    lr         : Learning rate
    decay      : Decay rate for regularization
    batch_size : Batch size of the test and train data
"""
config = {
    "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),       # e.g. 4, 8, 16, ..., 512
    "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),       # e.g. 4, 8, 16, ..., 512
    "lr": tune.loguniform(1e-4, 1e-1),                                     # sampled from a log-uniform distribution
    "decay": tune.sample_from(lambda _: 10 ** np.random.randint(-7, -3)), # e.g. 1e-7, 1e-6, ..., 1e-4
    "batch_size": tune.choice([32, 64, 128, 256])
}

# calling trainer
trainer = Trainer(device=device)

"""ASHA (Asynchronous Successive Halving Algorithm) scheduler

    max_t            : Maximum number of units per trial (can be time or epochs)
    grace_period     : Stop trials after a specific number of units if the model is not performing well (can be time or epochs)
    reduction_factor : Set the halving rate
"""
scheduler = ASHAScheduler(
    max_t=max_num_epochs,
    grace_period=4,
if __name__ == '__main__':
    # ===============================================================================
    # Start Process
    # ===============================================================================
    train_config = {
        'data_dir': '/home/congvm/Workspace/evoke/thirdparty/tune/data',
        'num_epochs': 40,
        'num_gpus': 1
    }

    tuning_config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
        "opt": tune.choice(['adam', 'sgd'])
    }

    log_dir = 'experiments'
    experiment_name = 'tune_mnist_asha_' + generate_datetime()
    metric_columns = ["val_loss", "val_accuracy", "training_iteration"]

    start_tuning(tuning_config=tuning_config,
                 train_config=train_config,
                 training_func=train_mnist_tune,
                 report_metric_columns=metric_columns,
                 monitor_metric='val_loss',
                 monitor_mode='min',
"petal_length": [ 1.4, 4.7, 6, 1.4, 4.7, 6, 1.4, 4.7, 6, 1.4, 4.7, 6], "petal_width": [ 0.2, 1.4, 2.5, 0.2, 1.4, 2.5, 0.2, 1.4, 2.5, 0.2, 1.4, 2.5], "variety": tf.keras.utils.to_categorical([ 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]) } # test_hyperparameter_space = { # "lr": tune.sample_from([0.05, 0.01]), # "dense_1": tune.sample_from([1, 2]), # "dense_2": tune.sample_from([1, 2]), # "epochs": tune.sample_from([2, 3]), # "batch_size": tune.sample_from([5, 6]) # } test_hyperparameter_space = { "lr": tune.loguniform(0.001, 0.1), "dense_1": tune.uniform(2, 128), "dense_2": tune.uniform(2, 128), "epochs": tune.uniform(1, 5), "batch_size": tune.sample_from([16, 32]) } test_simple_train_hyperparameters = { "lr": 0.05, "dense_1": 1, "dense_2": 1, "batch_size": 10, "epochs": 1 }