def _validate_base(
        extra_config,
        test_mode,
        env_name,
        trainer,
        stop=50000,
        name="DELETEME_TEST",
        num_gpus=0
):
    initialize_ray(test_mode=test_mode, local_mode=True, num_gpus=num_gpus)
    num_agents = 3
    # policy_names = ["agent{}".format(i) for i in range(num_agents)]
    env_config = {"env_name": env_name, "num_agents": num_agents}
    # env = MultiAgentEnvWrapper(env_config)
    config = {
        "seed": 0,
        "env": MultiAgentEnvWrapper,
        "env_config": env_config,
        # "multiagent": {
        #     "policies": {
        #         i: (None, env.observation_space, env.action_space, {})
        #         for i in policy_names
        #     },
        #     "policy_mapping_fn": lambda x: x,
        # },
    }
    if extra_config:
        config.update(extra_config)
    return tune.run(
        trainer,
        name=name,
        stop={"info/num_steps_sampled": stop},
        config=config
    )
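# Example invocation of _validate_base (a minimal sketch, not part of the
# original file). It assumes RLlib's stock PPOTrainer is an acceptable
# `trainer` argument and that MultiAgentEnvWrapper supports the chosen
# env_name; the stop value and extra_config are illustrative only.
if __name__ == "__main__":
    from ray.rllib.agents.ppo import PPOTrainer

    _validate_base(
        extra_config={"num_workers": 1},
        test_mode=True,
        env_name="BipedalWalker-v2",
        trainer=PPOTrainer,
        stop=1000
    )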
def regression_test(local_mode=False):
    num_agents = 3
    local_dir = tempfile.mkdtemp()
    initialize_ray(test_mode=True, local_mode=local_mode)
    train(
        DiCESACTrainer,
        {
            "gamma": 0.95,
            "target_network_update_freq": 32,
            "tau": 1.0,
            "train_batch_size": 200,
            "rollout_fragment_length": 50,
            "optimization": {
                "actor_learning_rate": 0.005,
                "critic_learning_rate": 0.005,
                "entropy_learning_rate": 0.0001
            },
            **get_marl_env_config(
                "CartPole-v0", num_agents, normalize_actions=False
            )
        },
        {"episode_reward_mean": 150 * num_agents},
        exp_name="DELETEME",
        local_dir=local_dir,
        test_mode=True
    )
    shutil.rmtree(local_dir, ignore_errors=True)
def profile():
    """This function is used to profile the efficiency of agent restoring."""
    initialize_ray(num_gpus=4, test_mode=True, local_mode=True)
    ckpt = {
        'path': "~/ray_results/0810-20seeds/"
                "PPO_BipedalWalker-v2_0_seed=20_2019"
                "-08-10_16-54-37xaa2muqm/checkpoint_469/checkpoint-469",
        'run_name': "PPO",
        'env_name': "BipedalWalker-v2"
    }
    num_agents = 20
    master_agents = OrderedDict()
    for i in range(num_agents):
        ckpt.update(name=i)
        agent = MaskSymbolicAgent(ckpt)
        master_agents[i] = copy.deepcopy(agent)
    for i, (name, agent) in enumerate(master_agents.items()):
        print("[{}/{}] RESTORE AGENTS: NAME {}".format(i, num_agents, name))
        a = agent.get()
        print(a)
def test_reference_consistency():
    initialize_ray(test_mode=True, local_mode=False)
    algos = ["PPO", "ES", "A2C", "A3C", "IMPALA", "ARS"]
    rws = {}
    for i, algo in enumerate(algos):
        trainer = get_dynamic_trainer(algo, 10000, "BipedalWalker-v2")(
            config={
                "env": "BipedalWalker-v2",
                "seed": i * 1000 + 789
            }
        )
        rw = {
            k: v
            for k, v in
            trainer._reference_agent_weights['default_policy'].items()
            if "value" not in k
        }
        rws[algo] = rw
    ks = list(rws)
    first_weight_dict = next(iter(rws.values()))
    for weight_name in first_weight_dict.keys():
        print("Current weight name: ", weight_name)
        for weight_dict_name in ks[1:]:
            weight_dict = rws[weight_dict_name]
            assert_equal(
                first_weight_dict[weight_name], weight_dict[weight_name]
            )
def __init__(self, ckpt, existing_agent=None):
    if not ray.is_initialized():
        initialize_ray(num_gpus=0)
    with open(
            osp.join(osp.dirname(osp.dirname(ckpt)), "params.json"), "rb"
    ) as f:
        config = json.load(f)
    config["num_workers"] = 0
    config["num_cpus_per_worker"] = 1
    config["num_cpus_for_driver"] = 1
    self.config = config
    self.config_env = gym.make(self.config["env"])
    self.action_dim = self.config_env.action_space.shape[0]
    self.k = config["model"]["custom_options"]["num_components"]
    self.is_deterministic = (
        config["model"]["custom_action_dist"] == DeterministicMixture.name
    )
    self.expect_logit_length = (
        self.k * (1 + 2 * self.action_dim)
        if not self.is_deterministic else self.k * (1 + self.action_dim)
    )
    assert osp.exists(ckpt)
    if existing_agent is not None:
        assert isinstance(existing_agent, MOGESAgent)
        existing_agent = existing_agent.agent
    agent = restore_agent(
        GaussianESTrainer,
        ckpt,
        "BipedalWalker-v2",
        config,
        existing_agent=existing_agent
    )
    self.agent = agent
    self._log_std = None
def _test_dice(
        extra_config=None,
        local_mode=False,
        num_agents=3,
        env_name="BipedalWalker-v2",
        t=2000
):
    num_gpus = 0
    initialize_ray(test_mode=True, local_mode=local_mode, num_gpus=num_gpus)

    # default config
    env_config = {"env_name": env_name, "num_agents": num_agents}
    config = {
        "env": MultiAgentEnvWrapper,
        "env_config": env_config,
        "num_gpus": num_gpus,
        "log_level": "DEBUG",
        "rollout_fragment_length": 20,
        "train_batch_size": 100,
        "sgd_minibatch_size": 60,
        "num_sgd_iter": 3,
        "num_workers": 1
    }
    if extra_config:
        config.update(extra_config)
    stop = {"timesteps_total": t} if not isinstance(t, dict) else t
    dir_path = tempfile.mkdtemp()
    ret = tune.run(
        DiCETrainer,
        local_dir=dir_path,
        name="DELETEME_TEST_extra_loss_ppo_trainer",
        stop=stop,
        config=config,
        verbose=2,
        max_failures=0
    )
    shutil.rmtree(dir_path, ignore_errors=True)
    return ret
def test_restore():
    from toolbox.evaluate import restore_agent
    initialize_ray()
    ckpt = "~/ray_results/1114-tnb_4in1" \
           "/TNBPPO_MultiAgentEnvWrapper_2_novelty_mode=min," \
           "use_joint_dataset=False_2019-11-14_10-30-29l456mu0o" \
           "/checkpoint_60/checkpoint-60"
    marl_agent = restore_agent(TNBPPOTrainer, ckpt, MultiAgentEnvWrapper)
def ray_init():
    ray.shutdown()
    initialize_ray(
        test_mode=args.test_mode,
        local_mode=False,
        num_gpus=args.num_gpus if not args.address else None,
        redis_address=args.address if args.address else None
    )
def ep_trainer(request):
    initialize_ray(test_mode=True, local_mode=True)
    return EPTrainer(
        env="BipedalWalker-v2",
        config=dict(
            num_sgd_iter=2,
            train_batch_size=400,
            evolution=dict(
                episodes_per_batch=20,
                train_batch_size=400,
                noise_size=20000000
            ),
            fuse_mode=request.param
        )
    )
def dice_sac_trainer():
    initialize_ray(test_mode=True, local_mode=True)
    env_name = "BipedalWalker-v2"
    num_agents = 3
    env = gym.make(env_name)
    trainer = DiCESACTrainer(
        get_marl_env_config(env_name, num_agents, normalize_actions=False),
        env=MultiAgentEnvWrapper
    )
    return env, trainer
def test_train_ipd(local_mode=False):
    initialize_ray(test_mode=True, local_mode=local_mode)
    env_name = "CartPole-v0"
    # env_name = "BipedalWalker-v2"
    config = {"num_sgd_iter": 2, "env": env_name, "novelty_threshold": 0.5}
    tune.run(
        TNBTrainer,
        name="DELETEME_TEST",
        verbose=2,
        checkpoint_freq=10,
        checkpoint_at_end=True,
        stop={"timesteps_total": 50000},
        config=config
    )
def test_multiple_num_agents(local_mode=False):
    num_gpus = 0
    initialize_ray(test_mode=True, local_mode=local_mode, num_gpus=num_gpus)
    config = _get_default_test_config(
        tune.grid_search([2, 3, 4]), "BipedalWalker-v2", num_gpus
    )
    return tune.run(
        CEPPOTrainer,
        local_dir=get_local_dir(),
        name="DELETEME_TEST_extra_loss_ppo_trainer",
        stop={"timesteps_total": 5000},
        config=config
    )
def test_maddpg_custom_metrics():
    extra_config = {
        "env": IPDEnv,
        "env_config": {
            "env_name": "BipedalWalker-v2",
            "novelty_threshold": 0.0,
            "yaml_path": os.path.abspath("../data/yaml/test-2-agents.yaml")
        },
        "callbacks": {
            "on_episode_end": on_episode_end
        },
    }
    initialize_ray(test_mode=True, local_mode=False)
    tune.run("PPO", stop={"training_iteration": 10}, config=extra_config)
def _test_blackbox(algo):
    initialize_ray(test_mode=True)
    config = {"env": "BipedalWalker-v2"}
    if algo == "ES":
        config['num_workers'] = 2
    dir_path = tempfile.mkdtemp()
    trainer = get_dynamic_trainer(algo, 10000, "BipedalWalker-v2")
    ret = tune.run(
        trainer,
        local_dir=dir_path,
        stop={"training_iteration": 10},
        config=config,
        verbose=2,
        max_failures=0
    )
    shutil.rmtree(dir_path, ignore_errors=True)
    return ret
def gaussian_mixture_trainer(request):
    initialize_ray(test_mode=True, local_mode=False)
    trainer = PGTrainer(
        env="BipedalWalker-v2",
        config={
            "model": {
                "custom_action_dist": GaussianMixture.name,
                "custom_options": {
                    "num_components": request.param
                }
            },
            "num_workers": 0,
            "train_batch_size": 300,
            "sample_batch_size": 100
        }
    )
    return trainer
def deterministic_mixture_trainer(request):
    initialize_ray(test_mode=True, local_mode=True)
    trainer = GaussianESTrainer(
        env="BipedalWalker-v2",
        config={
            "model": {
                "custom_action_dist": DeterministicMixture.name,
                "custom_options": {
                    "num_components": request.param
                }
            },
            "num_workers": 1,
            "episodes_per_batch": 1,
            "train_batch_size": 300,
            "sample_batch_size": 100
        }
    )
    return trainer
def test_cetd3(local_mode=False):
    num_gpus = 0
    initialize_ray(test_mode=True, local_mode=local_mode, num_gpus=num_gpus)
    config = _get_default_test_config(
        num_agents=3, env_name="BipedalWalker-v2", num_gpus=num_gpus
    )
    if "num_sgd_iter" in config:
        config.pop("num_sgd_iter")
        config.pop("sgd_minibatch_size")
    config['timesteps_per_iteration'] = 80
    config['pure_exploration_steps'] = 80
    config['learning_starts'] = 180
    tune.run(
        CETD3Trainer,
        local_dir=get_local_dir(),
        name="DELETEME_TEST_extra_loss_ppo_trainer",
        stop={"timesteps_total": 2000},
        config=config
    )
def regression_test2(local_mode=False):
    from ray import tune
    num_agents = 3
    local_dir = tempfile.mkdtemp()
    initialize_ray(test_mode=True, local_mode=local_mode)
    train(
        DiCESACTrainer,
        {
            "soft_horizon": True,
            "clip_actions": False,
            "normalize_actions": False,  # <<== Handled in the MARL env
            "metrics_smoothing_episodes": 5,
            "no_done_at_end": True,
            "train_batch_size": 1000,
            "rollout_fragment_length": 50,
            constants.DELAY_UPDATE: tune.grid_search([True, False]),
            # constants.NOR: tune.grid_search([True, False]),
            # "optimization": {
            #     "actor_learning_rate": 0.005,
            #     "critic_learning_rate": 0.005,
            #     "entropy_learning_rate": 0.0001
            # },
            **get_marl_env_config(
                "Pendulum-v0", num_agents, normalize_actions=True
            )
        },
        {
            "episode_reward_mean": -300 * num_agents,
            "timesteps_total": 13000 * num_agents
        },
        exp_name="DELETEME",
        local_dir=local_dir,
        test_mode=True
    )
    shutil.rmtree(local_dir, ignore_errors=True)
def train(algo,
          init_seed,
          extra_config,
          env_name,
          stop,
          exp_name,
          num_seeds,
          num_gpus,
          test_mode=False,
          **kwargs):
    initialize_ray(test_mode=test_mode, local_mode=False, num_gpus=num_gpus)
    config = {
        "seed": tune.grid_search([i * 100 for i in range(num_seeds)]),
        "env": env_name,
        "log_level": "DEBUG" if test_mode else "INFO"
    }
    if extra_config:
        config.update(extra_config)
    trainer = get_dynamic_trainer(algo, init_seed, env_name)
    analysis = tune.run(
        trainer,
        name=exp_name,
        checkpoint_freq=10 if not test_mode else None,
        keep_checkpoints_num=5 if not test_mode else None,
        checkpoint_score_attr="episode_reward_mean"
        if not test_mode else None,
        checkpoint_at_end=True if not test_mode else None,
        stop={"timesteps_total": stop} if isinstance(stop, int) else stop,
        config=config,
        max_failures=5,
        **kwargs
    )
    path = "{}-{}-{}ts-{}.pkl".format(exp_name, env_name, stop, algo)
    with open(path, "wb") as f:
        data = analysis.fetch_trial_dataframes()
        pickle.dump(data, f)
    print("Result is saved at: <{}>".format(path))
    return analysis
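# A hypothetical invocation of the train() helper above (a minimal sketch,
# not taken from the original experiments). The algorithm name, init_seed,
# stop criterion, seed count, and experiment name are illustrative
# assumptions only; get_dynamic_trainer is the same factory used elsewhere
# in this file.
if __name__ == "__main__":
    train(
        algo="PPO",
        init_seed=10000,
        extra_config={"num_workers": 1},
        env_name="BipedalWalker-v2",
        stop=50000,
        exp_name="DELETEME_EXAMPLE",
        num_seeds=3,
        num_gpus=0,
        test_mode=True
    )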
def test_agent_with_mask():
    initialize_ray(test_mode=True, local_mode=False)
    ckpt = "~/ray_results/0810-20seeds/PPO_BipedalWalker-v2_0_seed=20_2019" \
           "-08-10_16-54-37xaa2muqm/checkpoint_469/checkpoint-469"
    # ckpt = None
    ret_list = []
    agent = restore_agent_with_mask("PPO", ckpt, "BipedalWalker-v2")
    # agent.compute_action(np.ones(24))
    for i in range(10):
        test_reward = agent.train()
        print(pretty_print(test_reward))
        ret_list.append(test_reward)
    print("Test end")
    agent.get_policy().set_default({
        'fc_1_mask': np.ones([256, ]),
        'fc_2_mask': np.ones([256, ])
    })
    for i in range(10):
        test_reward2 = agent.train()
        print(pretty_print(test_reward2))
        ret_list.append(test_reward2)
    print("Test2 end")
    return test_reward, test_reward2, ret_list
def _base(
        trainer,
        local_mode=False,
        extra_config=None,
        t=500,
        env_name="BipedalWalker-v2",
        num_agents=3
):
    # num_agents = 3
    num_gpus = 0
    initialize_ray(test_mode=True, local_mode=local_mode, num_gpus=num_gpus)
    config = _get_default_test_config(num_agents, env_name, num_gpus)
    if extra_config:
        config.update(extra_config)
    stop = {"timesteps_total": t} if not isinstance(t, dict) else t
    return tune.run(
        trainer,
        local_dir=get_local_dir(),
        name="DELETEME_TEST_extra_loss_ppo_trainer",
        stop=stop,
        config=config
    )
def _test_basic(algo):
    initialize_ray(test_mode=True)
    trainer = get_dynamic_trainer(algo, 10000, "BipedalWalker-v2")(config={
        "env": "BipedalWalker-v2"
    })
    if algo in ["ES", "ARS"]:
        tw = {
            k: v
            for k, v in trainer.policy.variables.get_weights().items()
            if "value" not in k
        }
    elif algo in ["PPO", "A2C", "A3C", "IMPALA"]:
        tw = {
            k: v
            for k, v in trainer.get_weights()['default_policy'].items()
            if "value" not in k
        }
    rw = {
        k: v
        for k, v in
        trainer._reference_agent_weights['default_policy'].items()
        if "value" not in k
    }
    assert len(tw) == len(rw)
    twk = list(tw.keys())
    rwk = list(rw.keys())
    for i in range(len(tw)):
        arr1 = tw[twk[i]]
        arr2 = rw[rwk[i]]
        assert_equal(arr1, arr2)
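# A minimal sketch of invoking _test_basic directly (not part of the
# original test file). "PPO" is just one of the algorithms exercised by
# test_reference_consistency above; a pytest parametrization over the full
# list would be the more likely real usage.
if __name__ == "__main__":
    _test_basic("PPO")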
def init_ray():
    initialize_ray(
        num_gpus=4,
        test_mode=THIS_SCRIPT_IS_IN_TEST_MODE,
        object_store_memory=40 * int(1e9)
    )
def ray_init():
    ray.shutdown()
    initialize_ray(test_mode=args.test, num_gpus=num_gpus, local_mode=False)
"kl_coeff": 1.0, "num_sgd_iter": 20, "lr": 0.0002, 'sample_batch_size': 200, 'sgd_minibatch_size': 1000 if not is_humanoid else 4000, 'train_batch_size': 10000 if not is_humanoid else 60000, "num_gpus": 1, "num_cpus_per_worker": 1, "num_cpus_for_driver": 1, "num_envs_per_worker": 8, 'num_workers': 8 if not is_humanoid else 24, } initialize_ray( test_mode=False, local_mode=False, num_gpus=num_gpus if not args.address else None, address=args.address if args.address else None ) if "Bullet" in env_name: from ray.tune.registry import register_env def make_pybullet(_=None): import pybullet_envs import gym print("Successfully import pybullet and found: ", pybullet_envs.getList()) return gym.make(env_name)
def wrap_stats_fn(policy, train_batch):
    ret = kl_and_loss_stats_modified(policy, train_batch)
    ret.update(
        novelty_loss_param=policy.novelty_loss_param,
        novelty_target=policy.novelty_target_tensor
    )
    return ret


AdaptiveExtraLossPPOTFPolicy = ExtraLossPPOTFPolicy.with_updates(
    name="AdaptiveExtraLossPPOTFPolicy",
    get_default_config=lambda: adaptive_extra_loss_ppo_default_config,
    before_loss_init=setup_mixins_modified,
    stats_fn=wrap_stats_fn,
    mixins=mixin_list + [AddLossMixin, NoveltyParamMixin]
)

AdaptiveExtraLossPPOTrainer = ExtraLossPPOTrainer.with_updates(
    name="AdaptiveExtraLossPPO",
    after_optimizer_step=wrap_after_train_result,
    validate_config=validate_config_basic,
    default_config=adaptive_extra_loss_ppo_default_config,
    default_policy=AdaptiveExtraLossPPOTFPolicy,
)

if __name__ == '__main__':
    from toolbox import initialize_ray

    print("Prepare to create AELPPO")
    initialize_ray(test_mode=True, num_gpus=0)
    AdaptiveExtraLossPPOTrainer(env="BipedalWalker-v2", config=None)
def register_mixture_action_distribution():
    ModelCatalog.register_custom_action_dist(
        GaussianMixture.name, GaussianMixture
    )
    ModelCatalog.register_custom_action_dist(
        DeterministicMixture.name, DeterministicMixture
    )
    print("Successfully registered the GaussianMixture and "
          "DeterministicMixture action distributions.")


register_mixture_action_distribution()

if __name__ == '__main__':
    from ray import tune
    from toolbox import initialize_ray

    initialize_ray(test_mode=True, local_mode=True)
    tune.run(
        "TD3",  # PPOTrainerWithoutKL,
        local_dir="/tmp/ray",
        name="DELETE_ME_TEST",
        config={
            "env": "BipedalWalker-v2",
            "log_level": "DEBUG",
            "model": {
                "custom_action_dist": GaussianMixture.name,
                "custom_options": {
                    "num_components": 7
                }
            }
parser.add_argument("--soft", action="store_true") # default hard parser.add_argument("--ppo", action="store_true") parser.add_argument("--es", action="store_true") parser.add_argument("--es-optimizer", type=str, default="adam") parser.add_argument("--local-mode", "-lm", action="store_true") args = parser.parse_args() print(args) local_mode = args.local_mode env_name = "CartPole-v0" dir_path = tempfile.mkdtemp() now = time.time() num_gpus = 0 initialize_ray(test_mode=True, local_mode=local_mode, num_gpus=1) config = { "env": env_name, "num_sgd_iter": 10, "num_gpus": num_gpus, "train_batch_size": 4000, "sample_batch_size": 200, "fuse_mode": SOFT_FUSE if args.soft else HARD_FUSE, "lr": 0.005, "evolution": { "train_batch_size": 4000, # The same as PPO "num_workers": 10, # default is 10, "optimizer_type": args.es_optimizer } }
def train(
        extra_config,
        trainer,
        env_name,
        stop,
        exp_name,
        num_agents,
        num_seeds,
        num_gpus,
        num_cpus=None,
        test_mode=False,
        address=None,
        redis_password=None,
        clip_memory=False,
        init_memory=None,
        init_object_store_memory=None,
        init_redis_max_memory=None,
        **kwargs
):
    # assert isinstance(stop, int)
    if address is not None:
        num_gpus = None
    if clip_memory:
        init_memory = int(300 * GB)
        init_object_store_memory = int(100 * GB)
        init_redis_max_memory = int(50 * GB)
    initialize_ray(
        test_mode=test_mode,
        local_mode=False,
        num_gpus=num_gpus,
        address=address,
        redis_password=redis_password,
        memory=init_memory,
        object_store_memory=init_object_store_memory,
        redis_max_memory=init_redis_max_memory,
        num_cpus=num_cpus
    )
    env_config = {"env_name": env_name, "num_agents": num_agents}
    config = {
        "seed": tune.grid_search([i * 100 for i in range(num_seeds)]),
        "env": MultiAgentEnvWrapper,
        "env_config": env_config,
        "log_level": "DEBUG" if test_mode else "INFO"
    }
    if extra_config:
        config.update(extra_config)
    analysis = tune.run(
        trainer,
        local_dir=get_local_dir(),
        name=exp_name,
        checkpoint_freq=10,
        keep_checkpoints_num=10,
        checkpoint_score_attr="episode_reward_mean",
        checkpoint_at_end=True,
        stop={"info/num_steps_sampled": stop}
        if isinstance(stop, int) else stop,
        config=config,
        max_failures=20,
        reuse_actors=False,
        **kwargs
    )
    path = "{}-{}-{}ts-{}agents.pkl".format(
        exp_name, env_name, stop, num_agents
    )
    with open(path, "wb") as f:
        data = analysis.fetch_trial_dataframes()
        pickle.dump(data, f)
    print("Result is saved at: <{}>".format(path))
    return analysis
"seed": tune.grid_search([i * 100 for i in range(3)]), "env": "DeceptiveMaze-v0", "num_sgd_iter": 10, "lr": 0.001, 'sample_batch_size': 16, 'sgd_minibatch_size': 32, 'train_batch_size': 512, "num_gpus": 0.2, "num_envs_per_worker": 4, 'num_workers': 1, "model": {"fcnet_hiddens": [64, 64]} } initialize_ray( test_mode=False, local_mode=False, num_gpus=4) num_agents = 4 dece_config = copy.deepcopy(ppo_default_config) dece_config.update( { "env": MultiAgentEnvWrapper, "constrain_novelty": tune.grid_search(['hard', 'soft', None]), "env_config": { "env_name": ppo_default_config['env'], "num_agents": num_agents }, "alpha_coefficient": tune.grid_search([0.05, 0.1, 0.01]), "tau": tune.grid_search([0.05, 0.1, 0.01]),