    # Stepsize of SGD.
    "lr": 0.0005,
    # PPO clip parameter.
    "clip_param": 0.03,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.001,
}

tune.run(
    run_or_experiment=trainer_class,
    name=experiment_name,
    metric="br_reward_mean",
    config=hyperparams,
    num_samples=2,
    search_alg=None,
    mode="max",
    local_dir=data_dir(),
    stop={"timesteps_total": int(3e6)},
    loggers=[
        get_trainer_logger_creator(
            base_dir=data_dir(),
            scenario_name=experiment_name,
            should_log_result_fn=lambda result: result["training_iteration"] % 20 == 0)
    ],
)
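For context, here is a minimal, self-contained sketch of the same fixed-configuration pattern, assuming a Ray 1.x-era Tune API to match the tune.run(...) call above. It swaps in a hypothetical toy_trainable in place of the repository's trainer_class, data_dir(), and get_trainer_logger_creator() helpers, which are defined elsewhere and not reproduced here. With search_alg=None, num_samples=2 just repeats the one hand-picked config twice, and each trial ends when its reported metrics satisfy the stop condition.

from ray import tune


def toy_trainable(config):
    # Stand-in for an RLlib trainer: report the metric that tune.run() tracks.
    for step in range(10):
        tune.report(br_reward_mean=step * config["lr"], timesteps_total=step * 1000)


analysis = tune.run(
    run_or_experiment=toy_trainable,
    metric="br_reward_mean",
    mode="max",
    config={"lr": 0.0005},           # one fixed config, no search space
    num_samples=2,                   # repeat the same config twice
    search_alg=None,
    stop={"timesteps_total": 9000},  # same stop-condition mechanism as above
)
print(analysis.best_config)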
    # If async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": choice([4096, 2048, 1024]),
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span.
    "min_iter_time_s": 0,
    # Minimum env steps to optimize for per train call. This value does
    # not affect learning (JB: this is a lie!), only the length of train iterations.
    "timesteps_per_iteration": 0,
}

search = HyperOptSearch(metric="br_reward_mean", mode="max", n_initial_points=20)

tune.run(
    run_or_experiment=trainer_class,
    name=experiment_name,
    metric="br_reward_mean",
    config=hyperparams,
    num_samples=200000000,
    search_alg=search,
    mode="max",
    local_dir=data_dir(),
    stop={"timesteps_total": int(3e6)},
    loggers=[
        get_trainer_logger_creator(
            base_dir=data_dir(),
            scenario_name=experiment_name,
            should_log_result_fn=lambda result: result["training_iteration"] % 20 == 0)
    ],
)
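A similarly hedged sketch of the sweep variant follows. It assumes `choice` in the excerpt is Ray Tune's tune.choice, so HyperOptSearch can sample from the Tune-defined search space, and it reads the enormous num_samples as "keep proposing trials until the stop condition or a manual interrupt ends the experiment". In Ray 1.x HyperOptSearch lives under ray.tune.suggest.hyperopt (later releases moved it to ray.tune.search.hyperopt); the hypothetical toy_trainable again stands in for the real trainer_class.

from ray import tune
from ray.tune.suggest.hyperopt import HyperOptSearch  # Ray 1.x path; needs `pip install hyperopt`


def toy_trainable(config):
    # Stand-in objective: peaks at lr ~= 1e-3, so HyperOpt has something to find.
    for step in range(10):
        tune.report(br_reward_mean=-abs(config["lr"] - 1e-3),
                    timesteps_total=step * 1000)


search = HyperOptSearch(metric="br_reward_mean", mode="max", n_initial_points=20)

tune.run(
    run_or_experiment=toy_trainable,
    metric="br_reward_mean",
    mode="max",
    config={
        # Tune search-space primitives; HyperOptSearch converts them internally.
        "lr": tune.loguniform(1e-5, 1e-2),
        "train_batch_size": tune.choice([4096, 2048, 1024]),
    },
    search_alg=search,
    num_samples=50,                  # the excerpt uses 200000000, i.e. "run until stopped"
    stop={"timesteps_total": 9000},
)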