def testTrainExternalMultiCartpoleManyPolicies(self):
    n = 20
    single_env = gym.make("CartPole-v0")
    act_space = single_env.action_space
    obs_space = single_env.observation_space
    # Create 20 separate PG policies that all share the same spaces.
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (
            PGPolicyGraph, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        # Map each agent to a randomly chosen policy.
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        batch_steps=100)
    optimizer = SyncSamplesOptimizer(ev, [], {})
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(
            i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
Example #2
def test_train_external_multi_cartpole_many_policies(self):
    n = 20
    single_env = gym.make("CartPole-v0")
    act_space = single_env.action_space
    obs_space = single_env.observation_space
    # Create 20 separate PG policies that all share the same spaces.
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (
            PGTFPolicy, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    ev = RolloutWorker(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        # Map each agent to a randomly chosen policy.
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        rollout_fragment_length=100)
    optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(
            i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
Example #3
def make_cartpole(_):
    # Simple environment with 4 independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(4))
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    return obs_space, act_space, "multi_cartpole"
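
The MultiCartpole helper used throughout these snippets is imported from ray.rllib.tests.test_multi_agent_env elsewhere on this page. For reference, here is a minimal sketch of such an environment, assuming RLlib's MultiAgentEnv base class and the old gym step/reset API; the class name IndependentCartpoles is illustrative and this is not RLlib's actual implementation.

import gym
from ray.rllib.env.multi_agent_env import MultiAgentEnv


class IndependentCartpoles(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [gym.make("CartPole-v0") for _ in range(num)]
        self.dones = set()

    def reset(self):
        self.dones = set()
        # Observations are returned as a dict keyed by agent id.
        return {i: env.reset() for i, env in enumerate(self.agents)}

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for i, action in action_dict.items():
            obs[i], rew[i], done[i], info[i] = self.agents[i].step(action)
            if done[i]:
                self.dones.add(i)
        # "__all__" signals that the whole multi-agent episode is over.
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info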
Example #4
def check_support_multiagent(alg, config):
    register_env("multi_mountaincar", lambda _: MultiMountainCar(2))
    register_env("multi_cartpole", lambda _: MultiCartpole(2))
    # DDPG needs a continuous action space, so it is checked on the
    # mountaincar env; all other algorithms use the discrete cartpole env.
    if "DDPG" in alg:
        a = get_agent_class(alg)(config=config, env="multi_mountaincar")
    else:
        a = get_agent_class(alg)(config=config, env="multi_cartpole")
    try:
        a.train()
    finally:
        a.stop()
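
Hypothetical usage of the helper above; the algorithm names follow RLlib's trainer registry, but the small config overrides shown here are only illustrative, not the values from the original test suite.

ray.init(num_cpus=4)
# Discrete-action algorithm, exercised on the multi-agent cartpole env.
check_support_multiagent("PG", {"num_workers": 1})
# Continuous-action algorithm, exercised on the multi-agent mountaincar env.
check_support_multiagent("DDPG", {"timesteps_per_iteration": 1})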
Example #5
    def testMultiAgent(self):
        register_env("multi_cartpole", lambda _: MultiCartpole(10))
        single_env = gym.make("CartPole-v0")

        def gen_policy():
            obs_space = single_env.observation_space
            act_space = single_env.action_space
            return (PGTFPolicy, obs_space, act_space, {})

        # First run: log experiences to self.test_dir via the "output" option.
        pg = PGTrainer(
            env="multi_cartpole",
            config={
                "num_workers": 0,
                "output": self.test_dir,
                "multiagent": {
                    "policies": {
                        "policy_1": gen_policy(),
                        "policy_2": gen_policy(),
                    },
                    "policy_mapping_fn": (
                        lambda agent_id: random.choice(
                            ["policy_1", "policy_2"])),
                },
            })
        pg.train()
        self.assertEqual(len(os.listdir(self.test_dir)), 1)

        pg.stop()
        # Second run: read the logged experiences back ("input") and estimate
        # policy performance by re-simulating episodes ("input_evaluation").
        pg = PGTrainer(
            env="multi_cartpole",
            config={
                "num_workers": 0,
                "input": self.test_dir,
                "input_evaluation": ["simulation"],
                "train_batch_size": 2000,
                "multiagent": {
                    "policies": {
                        "policy_1": gen_policy(),
                        "policy_2": gen_policy(),
                    },
                    "policy_mapping_fn": (
                        lambda agent_id: random.choice(
                            ["policy_1", "policy_2"])),
                },
            })
        for _ in range(50):
            result = pg.train()
            if not np.isnan(result["episode_reward_mean"]):
                return  # simulation ok
            time.sleep(0.1)
        assert False, "did not see any simulation results"
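
As a side note, here is a minimal sketch of how the JSON batches written by the first trainer could be inspected outside of a trainer, assuming RLlib's ray.rllib.offline.JsonReader API; the output path is hypothetical (in the test above it would be self.test_dir).

from ray.rllib.offline import JsonReader

# Hypothetical output directory written via the "output" config option.
reader = JsonReader("/tmp/multi_cartpole_out")
batch = reader.next()
# For a multi-agent run this is typically a MultiAgentBatch whose
# policy_batches dict is keyed by policy id ("policy_1", "policy_2").
print(type(batch).__name__, getattr(batch, "policy_batches", {}).keys())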
Example #6
import argparse

import gym

import ray
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=20)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with 4 independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(4))
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # You can also have multiple policies per trainer, but here we just
    # show one each for PPO and DQN.
    policies = {
        "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}),
        "dqn_policy": (DQNTFPolicy, obs_space, act_space, {}),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"
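
The remainder of this script is cut off above. As a rough sketch of how the pieces defined so far typically fit together (the "multiagent" config structure mirrors the testMultiAgent example earlier on this page; restricting "policies_to_train" to the PPO policy is only an illustrative choice):

    ppo_trainer = PPOTrainer(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                # Only update the PPO policy from this trainer.
                "policies_to_train": ["ppo_policy"],
            },
        })

    for i in range(args.num_iters):
        print(pretty_print(ppo_trainer.train()))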
Example #7
        last_layer = tf.layers.dense(last_layer,
                                     64,
                                     activation=tf.nn.relu,
                                     name="fc2")
        output = tf.layers.dense(last_layer,
                                 num_outputs,
                                 activation=None,
                                 name="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (None, obs_space, act_space, config)
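
The script is truncated here as well. Below is a sketch of how gen_policy is typically consumed; the dict comprehension, the use of two policies, and the trainer class are illustrative (PPOTrainer stands in for whichever algorithm the original script ran). Passing None as the policy class in the tuple tells RLlib to use the trainer's default policy for that entry.

    policies = {"policy_{}".format(i): gen_policy(i) for i in range(2)}
    policy_ids = list(policies.keys())

    trainer = PPOTrainer(  # stand-in; any multi-agent-capable trainer works
        env="multi_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                # Assign each agent to a randomly chosen policy.
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(policy_ids)),
            },
        })
    trainer.train()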