Example #1
    def testMinibatchSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": False,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        self.assertEqual(batch0["sequences"].tolist(), [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        self.assertEqual(batch1["sequences"].tolist(), [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        self.assertEqual(batch2["sequences"].tolist(), [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        self.assertEqual(batch3["sequences"].tolist(), [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
Example #3
    def testSimpleOptimizerSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": True,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(batch1["sequences"].tolist(), [
            [[10], [11], [12], [13]],
            [[14], [0], [0], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
Example #5
def my_train_fn(config, reporter):
    # Train for 10 iterations with high LR
    import gym
    import gym_xplane
    env_creator_name = "gymXplane-v2"
    #env = gym.make("gymXplane-v2")
    register_env(env_creator_name, env_creator)
    agent1 = PPOAgent(env=env_creator_name, config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()
Example #6
    def testLocal(self):
        ray.init(local_mode=True)
        cf = DEFAULT_CONFIG.copy()
        agent = PPOAgent(cf, "CartPole-v0")
        print(agent.train())
Example #7
    def testPPOSampleWaste(self):
        ray.init(num_cpus=4)

        # Check we at least collect the initial wave of samples
        ppo = PPOAgent(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 128,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOAgent(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOAgent(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "num_envs_per_worker": 2,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()

        # Check legacy mode
        ppo = PPOAgent(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 128,
                "num_workers": 3,
                "straggler_mitigation": True,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
        ppo.stop()
Example #8
def my_train_fn(config, reporter):
    # Train for 10 iterations with high LR
    agent1 = PPOAgent(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for 10 iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOAgent(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
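A custom train function like my_train_fn is normally launched through Ray Tune rather than called directly. Below is a minimal sketch of one way to do that; the experiment name "two_phase_ppo" and the config values are illustrative assumptions, not part of the original example.

import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    # Tune invokes my_train_fn(config, reporter) for each trial; the experiment
    # name and config values below are hypothetical placeholders.
    tune.run_experiments({
        "two_phase_ppo": {
            "run": my_train_fn,
            "config": {"lr": 0.01, "num_workers": 0},
        },
    })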
Example #9
def testGymHighway(args):
    checkpoint_dir = "/home/s6pereir/ray_results/PPO_Highway-v0_2018-12-04_16-58-49ejlbc74y/"
    checkpoint = "/home/s6pereir/ray_results/PPO_Highway-v0_2018-12-04_16-58-49ejlbc74y/checkpoint-1290"

    # Load configuration from file
    config_dir = os.path.dirname(checkpoint_dir)
    config_path = os.path.join(config_dir, "params.json")
    with open(config_path) as f:
        # args.config = json.load(f)
        config = json.load(f)

    env_creator_name = "Highway-v0"
    # register_env(env_creator_name, lambda _: HighwayEnv(False, True, False, True))
    register_env(env_creator_name,
                 lambda _: HighwayEnv(False, True, False, False))

    # register custom model
    register_custom_model()

    env = gym.make('Highway-v0')

    ray.init(num_gpus=1)

    num_steps = 10
    agent = PPOAgent(env=env_creator_name,
                     config={
                         "num_workers": Config.num_workers,
                         "num_envs_per_worker": Config.num_envs_per_worker,
                         "sample_batch_size": Config.sample_batch_size,
                         "train_batch_size": Config.train_batch_size,
                         "num_gpus": Config.num_gpus,
                         "entropy_coeff": Config.entropy_coeff,
                         "model": {
                             "custom_model": "custom_fc_model",
                             "custom_options": {},
                             "use_lstm": Config.use_lstm,
                         },
                     })

    # TEST: using LSTM
    # ====================================================
    # agent = PPOAgent(env=env_creator_name,
    #     config={
    #         "num_workers": 1,
    #         "num_envs_per_worker": 1,
    #         "sample_batch_size":64,
    #         "train_batch_size":1280,
    #         "num_gpus":1,
    #         # "entropy_coeff":0.3,
    #         "model": {
    #             "custom_model": "custom_fc_model",
    #             "custom_options": {},
    #             "use_lstm": True,
    #         },
    #     })

    agent.restore(checkpoint)

    steps = 0
    while steps < num_steps:
        state = env.reset()
        # lstm_state = agent._policy_graph.get_initial_state(agent)
        lstm_state = [np.zeros(256), np.zeros(256)]
        reward_total = 0.0
        while True:
            # computed action, rnn state, logits dictionary
            # action, lstm_state, _ = agent.compute_action(state, lstm_state)
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            state = next_state
            if done:
                print("Done")
                break
        steps += 1
        print("Episode reward", reward_total)

    # ====================================================

    # agent.restore(checkpoint)

    # steps = 0
    # while steps < num_steps:
    #     state = env.reset()
    #     reward_total = 0.0
    #     while True:
    #         action = agent.compute_action(state)
    #         next_state, reward, done, _ = env.step(action)
    #         reward_total += reward
    #         state = next_state
    #         if done:
    #             print("Done")
    #             break
    #     steps += 1
    #     print("Episode reward", reward_total)
    env.close()
Example #10
import ray
from ray.tune.registry import register_env
from ray.rllib.agents.ppo import PPOAgent
import gym_xplane

def env_creator(env_config):
    import gym
    return gym.make("gymXplane-v2")  # or return your own custom env

env_creator_name = "gymXplane-v2"
register_env(env_creator_name, env_creator)

ray.init()
agent = PPOAgent(env=env_creator_name, config={
    "env_config": {},  # config to pass to env creator
})
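The snippet above only constructs the agent. A typical next step (not part of the original snippet) is a short training loop such as the sketch below; the iteration count is arbitrary.

# Hypothetical continuation: train for a few iterations and report progress.
for i in range(3):
    result = agent.train()
    print("iteration", i, "episode_reward_mean:", result["episode_reward_mean"])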

Example #12
        "num_workers": 4,
        "model": {
            "custom_model": "mask_model",
        },
        "env_config": {
            "pymarl_path": path_to_pymarl
        }
    }
    if args.run.lower() == "qmix":

        def grouped_sc2(cfg):
            env = SC2MultiAgentEnv(cfg)
            agent_list = list(range(env._starcraft_env.n_agents))
            grouping = {
                "group_1": agent_list,
            }
            obs_space = Tuple([env.observation_space for i in agent_list])
            act_space = Tuple([env.action_space for i in agent_list])
            return env.with_agent_groups(
                grouping, obs_space=obs_space, act_space=act_space)

        register_env("grouped_starcraft", grouped_sc2)
        agent = QMixAgent(env="grouped_starcraft", config=agent_cfg)
    elif args.run.lower() == "pg":
        agent = PGAgent(env="starcraft", config=agent_cfg)
    elif args.run.lower() == "ppo":
        agent_cfg.update({"vf_share_layers": True})
        agent = PPOAgent(env="starcraft", config=agent_cfg)
    for i in range(args.num_iters):
        print(pretty_print(agent.train()))
Example #13

if __name__ == "__main__":
    ray.init()
    register_env("srv", lambda _: TradingServing())

    # We use PPO since it supports off-policy actions, but you can choose and
    # configure any agent.
    ppo = PPOAgent(
        env="srv",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # "num_gpus": 1
            # We can set GPUs later ... dog ... keep it real.
            # Configure the agent to run short iterations for debugging
            # "exploration_fraction": 0.01,
            # "learning_starts": 100,
            "model":{
                "use_lstm": True
            },
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        ppo.restore(checkpoint_path)

    # Serving and training loop
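The example is truncated at the comment above. A minimal sketch of what such a serving-and-training loop often looks like, assuming the goal is to train indefinitely and persist the latest checkpoint path to CHECKPOINT_FILE, follows (not part of the original code):

    while True:
        # Train one iteration, save a checkpoint, and record its path so the
        # script can resume from it on the next start.
        print(ppo.train())
        checkpoint_path = ppo.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)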
Example #14
def visualize_agent(agent, name):
    env = make_costly_env(config['env_config'])
    env = gym.wrappers.Monitor(env, name, force=True)
    state = env.reset()
    cumulative_reward = 0
    policy = agent.get_policy('default')
    rnn_state = policy.get_initial_state()  # initial RNN state for LSTM-based models
    done = False
    while not done:
        action = agent.compute_action(state, state=rnn_state)
        if rnn_state:
            action, rnn_state, logits = action
        action = (action[0][0], action[1][0])
        state, reward, done, _ = env.step(action)
        env.render()
        cumulative_reward += reward
    env.close()
    print("reward =", cumulative_reward)

checkpoint_path = None

for cost in range(4):
    print("training with cost =", cost)
    config['env_config']['observation_cost'] = cost
    agent = PPOAgent(config)
    if checkpoint_path:
        agent.restore(checkpoint_path)
    checkpoint_path = train_agent_for(agent, 10 + cost * 15)
    print("saving checkpoint to ", checkpoint_path)
    visualize_agent(agent, "results/recording_%d" % cost)
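train_agent_for is referenced but not defined in this snippet. A minimal sketch of what such a helper might do, assuming it trains for a given number of iterations and returns the final checkpoint path, is:

def train_agent_for(agent, num_iterations):
    # Hypothetical helper: run training for num_iterations and return the
    # path of the checkpoint saved at the end.
    for _ in range(num_iterations):
        agent.train()
    return agent.save()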