def testMinibatchSequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOAgent(
        env="counter",
        config={
            "num_workers": 0,
            "sample_batch_size": 20,
            "train_batch_size": 20,
            "sgd_minibatch_size": 10,
            "vf_share_layers": True,
            "simple_optimizer": False,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
            },
        })
    ppo.train()
    ppo.train()

    # first epoch: 20 observations get split into 2 minibatches of 8
    # four observations are discarded
    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
        batch0, batch1 = batch1, batch0  # sort minibatches
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
    self.assertEqual(batch0["sequences"].tolist(), [
        [[0], [1], [2], [3]],
        [[4], [5], [6], [7]],
    ])
    self.assertEqual(batch1["sequences"].tolist(), [
        [[8], [9], [10], [11]],
        [[12], [13], [14], [0]],
    ])

    # second epoch: 20 observations get split into 2 minibatches of 8
    # four observations are discarded
    batch2 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
    batch3 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
    if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
        batch2, batch3 = batch3, batch2
    self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
    self.assertEqual(batch2["sequences"].tolist(), [
        [[5], [6], [7], [8]],
        [[9], [10], [11], [12]],
    ])
    self.assertEqual(batch3["sequences"].tolist(), [
        [[13], [14], [0], [0]],
        [[0], [1], [2], [3]],
    ])
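# DebugCounterEnv is not shown in the snippets above; a minimal sketch of
# what such a counter env could look like, assuming 15-step episodes with
# incrementing integer observations (consistent with the [0..14] sequences
# asserted in the test; the reward value is an arbitrary assumption):
import gym
import numpy as np


class DebugCounterEnv(gym.Env):
    """Env whose observation is simply a step counter: 0, 1, 2, ..., 14."""

    def __init__(self):
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(
            0, 100, (1, ), dtype=np.float32)
        self.i = 0

    def reset(self):
        self.i = 0
        return [self.i]

    def step(self, action):
        self.i += 1
        # Done after 15 steps, so recorded observations run 0..14 before
        # the episode resets, matching the sequences asserted above.
        return [self.i], 1.0, self.i >= 15, {}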
def testSimpleOptimizerSequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOAgent(
        env="counter",
        config={
            "num_workers": 0,
            "sample_batch_size": 10,
            "train_batch_size": 10,
            "sgd_minibatch_size": 10,
            "vf_share_layers": True,
            "simple_optimizer": True,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
            },
        })
    ppo.train()
    ppo.train()

    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    self.assertEqual(
        batch0["sequences"].tolist(),
        [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
    self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
    self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
    self.assertTrue(
        np.allclose(batch0["state_in"][0].tolist()[1:],
                    batch0["state_out"][0].tolist()[:-1]))
    self.assertTrue(
        np.allclose(batch0["state_in"][1].tolist()[1:],
                    batch0["state_out"][1].tolist()[:-1]))

    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    self.assertEqual(batch1["sequences"].tolist(), [
        [[10], [11], [12], [13]],
        [[14], [0], [0], [0]],
        [[0], [1], [2], [3]],
        [[4], [0], [0], [0]],
    ])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
    self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
    self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
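# The tests above read batches back with _internal_kv_get("rnn_spy_in_<i>").
# A hypothetical sketch of the key-value-store side a spy model could use to
# record each padded input batch; record_rnn_batch and its arguments are
# illustrative, not the actual RNNSpyModel implementation:
import pickle

from ray.experimental.internal_kv import _internal_kv_put


def record_rnn_batch(index, sequences, seq_lens, state_in, state_out):
    # Store under the key the tests read back via _internal_kv_get().
    _internal_kv_put(
        "rnn_spy_in_{}".format(index),
        pickle.dumps({
            "sequences": sequences,
            "seq_lens": seq_lens,
            "state_in": state_in,
            "state_out": state_out,
        }),
        overwrite=True)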
def my_train_fn(config, reporter):
    # Phase 1: train for 10 iterations with a high LR
    import gym
    import gym_xplane

    env_creator_name = "gymXplane-v2"
    register_env(env_creator_name, env_creator)

    agent1 = PPOAgent(env=env_creator_name, config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()
def testLocal(self):
    ray.init(local_mode=True)
    cf = DEFAULT_CONFIG.copy()
    agent = PPOAgent(cf, "CartPole-v0")
    print(agent.train())
def testPPOSampleWaste(self):
    ray.init(num_cpus=4)

    # Check we at least collect the initial wave of samples: the first
    # batch from each of the 3 workers (3 x 200 = 600) is consumed even
    # though train_batch_size is only 128.
    ppo = PPOAgent(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples: five
    # batches of 200 are needed to reach train_batch_size=900, giving 1000.
    ppo = PPOAgent(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
    ppo.stop()

    # Check in vectorized mode: each worker batch holds
    # sample_batch_size * num_envs_per_worker = 400 steps, so three
    # batches (1200 steps) are needed to reach 900.
    ppo = PPOAgent(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
    ppo.stop()

    # Check legacy mode: with straggler mitigation, sampling stops as soon
    # as train_batch_size is met, here after a single 200-step batch.
    ppo = PPOAgent(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
            "straggler_mitigation": True,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
    ppo.stop()
def my_train_fn(config, reporter):
    # Train for 10 iterations with a high LR
    agent1 = PPOAgent(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for 10 more iterations with a low LR
    config["lr"] = 0.0001
    agent2 = PPOAgent(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
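# A function trainable like my_train_fn above is typically launched through
# Tune. A minimal sketch using the old run_experiments API; the experiment
# name "demo" and the config values are illustrative assumptions:
import ray
from ray import tune

ray.init()
tune.run_experiments({
    "demo": {
        "run": my_train_fn,  # pass the trainable function directly
        "config": {"lr": 0.01, "num_workers": 0},
    },
})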
def testGymHighway(args):
    checkpoint_dir = "/home/s6pereir/ray_results/PPO_Highway-v0_2018-12-04_16-58-49ejlbc74y/"
    checkpoint = "/home/s6pereir/ray_results/PPO_Highway-v0_2018-12-04_16-58-49ejlbc74y/checkpoint-1290"

    # Load configuration from file
    config_dir = os.path.dirname(checkpoint_dir)
    config_path = os.path.join(config_dir, "params.json")
    with open(config_path) as f:
        config = json.load(f)

    env_creator_name = "Highway-v0"
    # register_env(env_creator_name, lambda _: HighwayEnv(False, True, False, True))
    register_env(env_creator_name,
                 lambda _: HighwayEnv(False, True, False, False))

    # register custom model
    register_custom_model()

    env = gym.make("Highway-v0")
    ray.init(num_gpus=1)

    num_steps = 10
    agent = PPOAgent(
        env=env_creator_name,
        config={
            "num_workers": Config.num_workers,
            "num_envs_per_worker": Config.num_envs_per_worker,
            "sample_batch_size": Config.sample_batch_size,
            "train_batch_size": Config.train_batch_size,
            "num_gpus": Config.num_gpus,
            "entropy_coeff": Config.entropy_coeff,
            "model": {
                "custom_model": "custom_fc_model",
                "custom_options": {},
                "use_lstm": Config.use_lstm,
            },
        })
    # An LSTM variant of this agent (use_lstm=True, sample_batch_size=64,
    # train_batch_size=1280) was also tested; see the stateful rollout
    # sketch after this function.
    agent.restore(checkpoint)

    steps = 0
    while steps < num_steps:
        state = env.reset()
        # lstm_state = agent._policy_graph.get_initial_state(agent)
        lstm_state = [np.zeros(256), np.zeros(256)]
        reward_total = 0.0
        while True:
            # stateful variant returns (action, rnn state, logits dict):
            # action, lstm_state, _ = agent.compute_action(state, lstm_state)
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            state = next_state
            if done:
                print("Done")
                break
        steps += 1
        print("Episode reward", reward_total)

    env.close()
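# The commented lines in testGymHighway hint at the stateful rollout pattern
# for LSTM policies: compute_action then takes and returns the RNN state. A
# minimal sketch, assuming an already-constructed agent and env; the 256-unit
# cell size matches the zeros above and is otherwise an assumption:
import numpy as np


def rollout_with_lstm(agent, env, cell_size=256):
    lstm_state = [np.zeros(cell_size), np.zeros(cell_size)]
    state = env.reset()
    reward_total = 0.0
    done = False
    while not done:
        # With a state argument, compute_action returns a tuple of
        # (action, new rnn state, logits dict).
        action, lstm_state, _ = agent.compute_action(state, state=lstm_state)
        state, reward, done, _ = env.step(action)
        reward_total += reward
    return reward_total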
import ray
from ray.tune.registry import register_env
from ray.rllib.agents.ppo import PPOAgent
import gym_xplane


def env_creator(env_config):
    import gym
    return gym.make("gymXplane-v2")  # or return your own custom env


env_creator_name = "gymXplane-v2"
register_env(env_creator_name, env_creator)

ray.init()
agent = PPOAgent(
    env=env_creator_name,
    config={
        "env_config": {},  # config to pass to env creator
    })
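# Once the agent is constructed, training proceeds by calling train() in a
# loop. A minimal sketch continuing the snippet above; the iteration count
# and checkpointing cadence are illustrative assumptions:
for i in range(10):
    result = agent.train()
    print("iteration", i, "episode_reward_mean",
          result["episode_reward_mean"])
    if i % 5 == 0:
        checkpoint_path = agent.save()
        print("checkpoint saved at", checkpoint_path)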
"num_workers": 4, "model": { "custom_model": "mask_model", }, "env_config": { "pymarl_path": path_to_pymarl } } if args.run.lower() == "qmix": def grouped_sc2(cfg): env = SC2MultiAgentEnv(cfg) agent_list = list(range(env._starcraft_env.n_agents)) grouping = { "group_1": agent_list, } obs_space = Tuple([env.observation_space for i in agent_list]) act_space = Tuple([env.action_space for i in agent_list]) return env.with_agent_groups( grouping, obs_space=obs_space, act_space=act_space) register_env("grouped_starcraft", grouped_sc2) agent = QMixAgent(env="grouped_starcraft", config=agent_cfg) elif args.run.lower() == "pg": agent = PGAgent(env="starcraft", config=agent_cfg) elif args.run.lower() == "ppo": agent_cfg.update({"vf_share_layers": True}) agent = PPOAgent(env="starcraft", config=agent_cfg) for i in range(args.num_iters): print(pretty_print(agent.train()))
if __name__ == "__main__": ray.init() register_env("srv", lambda _: TradingServing()) # We use PPO since it supports off-policy actions, but you can choose and # configure any agent. ppo = PPOAgent( env="srv", config={ # Use a single process to avoid needing to set up a load balancer "num_workers": 0, # "num_gpus": 1 # We can set GPUs later ... dog ... keep it real. # Configure the agent to run short iterations for debugging # "exploration_fraction": 0.01, # "learning_starts": 100, "model":{ "use_lstm": True }, "timesteps_per_iteration": 200, }) # Attempt to restore from checkpoint if possible. if os.path.exists(CHECKPOINT_FILE): checkpoint_path = open(CHECKPOINT_FILE).read() print("Restoring from checkpoint path", checkpoint_path) ppo.restore(checkpoint_path) # Serving and training loop
def visualize_agent(agent, name):
    env = make_costly_env(config["env_config"])
    env = gym.wrappers.Monitor(env, name, force=True)
    state = env.reset()
    cumulative_reward = 0
    policy = agent.get_policy("default")
    rnn_state = policy.get_initial_state()  # for lstm-based models
    done = False
    while not done:
        action = agent.compute_action(state, state=rnn_state)
        if rnn_state:
            action, rnn_state, logits = action
            action = (action[0][0], action[1][0])
        state, reward, done, _ = env.step(action)
        env.render()
        cumulative_reward += reward
    env.close()
    print("reward =", cumulative_reward)


checkpoint_path = None
for cost in range(4):
    print("training with cost =", cost)
    config["env_config"]["observation_cost"] = cost
    agent = PPOAgent(config)
    if checkpoint_path:
        agent.restore(checkpoint_path)
    checkpoint_path = train_agent_for(agent, 10 + cost * 15)
    print("saving checkpoint to", checkpoint_path)
    visualize_agent(agent, "results/recording_%d" % cost)
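# train_agent_for is referenced above but not defined in the snippet. A
# plausible sketch, assuming it trains for the given number of iterations
# and returns the path of a final checkpoint (matching how its return value
# is passed to agent.restore above):
def train_agent_for(agent, num_iters):
    for _ in range(num_iters):
        agent.train()
    return agent.save()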