def testTrainExternalMultiCartpoleManyPolicies(self):
    n = 20
    single_env = gym.make("CartPole-v0")
    act_space = single_env.action_space
    obs_space = single_env.observation_space
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                       {})
    policy_ids = list(policies.keys())
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        batch_steps=100)
    optimizer = SyncSamplesOptimizer(ev, [], {})
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(i,
                                            result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def test_train_external_multi_cartpole_many_policies(self):
    n = 20
    single_env = gym.make("CartPole-v0")
    act_space = single_env.action_space
    obs_space = single_env.observation_space
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    ev = RolloutWorker(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        rollout_fragment_length=100)
    optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(i,
                                            result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def make_cartpole(_):
    # Simple environment with 4 independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(4))
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    return obs_space, act_space, "multi_cartpole"
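# A minimal usage sketch (an illustration, not part of the original code):
# the spaces and env name returned by make_cartpole can be used to assemble
# the per-policy spec dict that RLlib's multi-agent config expects.
# PGTFPolicy is assumed to be importable as in the surrounding snippets.
def example_policies():
    obs_space, act_space, env_name = make_cartpole(None)
    policies = {
        "pg_{}".format(i): (PGTFPolicy, obs_space, act_space, {})
        for i in range(4)
    }
    return policies, env_name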
def check_support_multiagent(alg, config):
    register_env("multi_mountaincar", lambda _: MultiMountainCar(2))
    register_env("multi_cartpole", lambda _: MultiCartpole(2))
    if "DDPG" in alg:
        a = get_agent_class(alg)(config=config, env="multi_mountaincar")
    else:
        a = get_agent_class(alg)(config=config, env="multi_cartpole")
    try:
        a.train()
    finally:
        a.stop()
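# Hypothetical examples of how the helper above might be driven from
# individual test cases; the algorithm names and config values here are
# illustrative assumptions, not taken from the original test file.
def test_pg_multiagent_support_example():
    check_support_multiagent("PG", {"num_workers": 0})


def test_ddpg_multiagent_support_example():
    check_support_multiagent("DDPG", {"num_workers": 0})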
def testMultiAgent(self):
    register_env("multi_cartpole", lambda _: MultiCartpole(10))
    single_env = gym.make("CartPole-v0")

    def gen_policy():
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        return (PGTFPolicy, obs_space, act_space, {})

    pg = PGTrainer(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "output": self.test_dir,
            "multiagent": {
                "policies": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(
                        ["policy_1", "policy_2"])),
            },
        })
    pg.train()
    self.assertEqual(len(os.listdir(self.test_dir)), 1)
    pg.stop()

    pg = PGTrainer(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "input": self.test_dir,
            "input_evaluation": ["simulation"],
            "train_batch_size": 2000,
            "multiagent": {
                "policies": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(
                        ["policy_1", "policy_2"])),
            },
        })
    for _ in range(50):
        result = pg.train()
        if not np.isnan(result["episode_reward_mean"]):
            return  # simulation ok
        time.sleep(0.1)
    assert False, "did not see any simulation results"
import argparse

import gym

import ray
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=20)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with 4 independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(4))
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # You can also have multiple policies per trainer, but here we just
    # show one each for PPO and DQN.
    policies = {
        "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}),
        "dqn_policy": (DQNTFPolicy, obs_space, act_space, {}),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"
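    # A minimal sketch of how this example typically continues (the trainer
    # configs below are assumptions for illustration, not taken verbatim from
    # the original script): build one trainer per algorithm, restrict each to
    # its own policy, and alternate training between them.
    from ray.rllib.agents.dqn.dqn import DQNTrainer  # assumed import

    ppo_trainer = PPOTrainer(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
        })
    dqn_trainer = DQNTrainer(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
        })

    for i in range(args.num_iters):
        print("== Iteration", i, "==")
        print("-- DQN --")
        print(pretty_print(dqn_trainer.train()))
        print("-- PPO --")
        print(pretty_print(ppo_trainer.train()))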
        last_layer = tf.layers.dense(
            last_layer, 64, activation=tf.nn.relu, name="fc2")
        output = tf.layers.dense(
            last_layer, num_outputs, activation=None, name="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (None, obs_space, act_space, config)
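    # A minimal sketch of how gen_policy is typically used to finish this
    # example (the trainer choice, policy count, and stopping criterion are
    # assumptions for illustration; `tune` is an assumed import).
    from ray import tune  # assumed import

    policies = {
        "policy_{}".format(i): gen_policy(i)
        for i in range(2)
    }
    policy_ids = list(policies.keys())

    tune.run(
        "PG",
        stop={"training_iteration": 20},
        config={
            "env": "multi_cartpole",
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(policy_ids)),
            },
        })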