def my_train_fn(config, reporter):
    iterations = config.pop("train-iterations", 10)

    # Train for n iterations with high LR.
    agent1 = PPO(env="CartPole-v0", config=config)
    for _ in range(iterations):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR.
    config["lr"] = 0.0001
    agent2 = PPO(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(iterations):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
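# The function above is a Tune-style trainable (it takes a config and a
# reporter), so it is launched through Ray Tune rather than called directly.
# The sketch below only illustrates that launch pattern; the starting
# hyperparameter values are made-up assumptions, not part of the example.
import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    tune.run(
        my_train_fn,
        config={
            # Hypothetical phase-1 (high LR) starting hyperparameters.
            "lr": 0.01,
            "num_workers": 0,
            "train-iterations": 10,
        },
    )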
        # Config dict to be passed to our custom env's constructor.
        "env_config": {
            # Use corridor with 20 fields (including S and G).
            "corridor_length": 20
        },
        # Parallelize environment rollouts.
        "num_workers": 3,
    }
)

# Train for n iterations and report results (mean episode rewards).
# Since we have to move at least 19 times in the env to reach the goal and
# each move gives us -0.1 reward (except the last move at the end: +1.0),
# we can expect to reach an optimal episode reward of -0.1*18 + 1.0 = -0.8.
for i in range(5):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
# Note that we are using a slightly different env here (len 10 instead of 20),
# however, this should still work as the agent has (hopefully) learned
# to "just always walk right!"
env = SimpleCorridor({"corridor_length": 10})
# Get the initial observation (should be: [0.0] for the starting position).
obs = env.reset()
done = False
total_reward = 0.0
# Play one episode.
while not done:
    # Compute a single action, given the current observation
    # from the environment.
    # (Completion of the truncated loop; assumes the standard
    # Algorithm.compute_single_action() inference call.)
    action = trainer.compute_single_action(obs)
    # Apply the computed action in the environment.
    obs, reward, done, info = env.step(action)
    # Sum up rewards for reporting purposes.
    total_reward += reward

# Report results.
print(f"Played 1 episode; total-reward={total_reward}")
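# The excerpt above assumes a SimpleCorridor gym.Env defined earlier in the
# doc. A minimal sketch of what such an env could look like; the exact goal
# index and reward shaping are assumptions inferred from the comments above,
# not the original class definition.
import gym
import numpy as np


class SimpleCorridorSketch(gym.Env):
    """Corridor in which the agent must walk right to reach the goal."""

    def __init__(self, config):
        # Position of the goal (last field of the corridor).
        self.end_pos = config["corridor_length"] - 1
        self.cur_pos = 0
        self.action_space = gym.spaces.Discrete(2)  # 0=left, 1=right
        self.observation_space = gym.spaces.Box(
            0.0, float(self.end_pos), shape=(1,), dtype=np.float32)

    def reset(self):
        self.cur_pos = 0
        return [float(self.cur_pos)]

    def step(self, action):
        # Walk left (if not already at the start) or right.
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
        # -0.1 per step, +1.0 once the goal is reached (matches the reward
        # math in the training comment above).
        reward = 1.0 if done else -0.1
        return [float(self.cur_pos)], reward, done, {}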
    def test_minibatch_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPO(
            env="counter",
            config={
                "shuffle_sequences": False,  # for deterministic testing
                "num_workers": 0,
                "rollout_fragment_length": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            },
        )
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [2, 3, 4, 1])
        check(
            batch0["sequences"],
            [
                [[0], [1], [2], [3]],
                [[4], [5], [6], [7]],
                [[8], [9], [0], [0]],
            ],
        )
        check(
            batch1["sequences"],
            [
                [[10], [11], [0], [0]],
                [[12], [13], [14], [0]],
                [[0], [1], [2], [3]],
                [[4], [0], [0], [0]],
            ],
        )

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2  # sort minibatches
        self.assertEqual(batch2[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        self.assertEqual(batch3[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        check(
            batch2["sequences"],
            [
                [[0], [1], [2], [3]],
                [[4], [5], [6], [7]],
                [[8], [9], [0], [0]],
            ],
        )
        check(
            batch3["sequences"],
            [
                [[5], [6], [7], [8]],
                [[9], [10], [11], [12]],
                [[13], [14], [0], [0]],
            ],
        )
    def test_simple_optimizer_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPO(
            env="counter",
            config={
                "num_workers": 0,
                "rollout_fragment_length": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "simple_optimizer": True,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            },
        )
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]],
        )
        self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(
            batch1["sequences"].tolist(),
            [
                [[10], [11], [12], [13]],
                [[14], [0], [0], [0]],
                [[0], [1], [2], [3]],
                [[4], [0], [0], [0]],
            ],
        )
        self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
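# Both tests above run PPO against a "counter" environment whose observation
# is simply the current step index, which is why the asserted sequences count
# 0, 1, 2, ... and wrap back to 0 after an episode reset. A minimal sketch of
# such an env; the 15-step episode length and the reward are assumptions
# inferred from the asserted values, not the actual DebugCounterEnv source.
import gym
import numpy as np


class CounterEnvSketch(gym.Env):
    def __init__(self, config=None):
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(
            0.0, 100.0, (1,), dtype=np.float32)
        self.i = 0

    def reset(self):
        self.i = 0
        return [float(self.i)]

    def step(self, action):
        self.i += 1
        # Assumed: episodes end after 15 steps, so the observations fed to
        # the policy within one episode are 0 .. 14.
        return [float(self.i)], 0.1, self.i >= 15, {}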
    # Env class to use (here: our gym.Env sub-class from above).
    "env": ParrotEnv,
    # Config dict to be passed to our custom env's constructor.
    "env_config": {
        "parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, ))
    },
    # Parallelize environment rollouts.
    "num_workers": 3,
})

# Train for n iterations and report results (mean episode rewards).
# Since we have to guess 10 times and the optimal reward is 0.0
# (exact match between observation and action value),
# we can expect to reach an optimal episode reward of 0.0.
for i in range(5):
    results = algo.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
# Note that we are using a slightly simpler env here (-3.0 to 3.0, instead
# of -5.0 to 5.0!), however, this should still work as the agent has
# (hopefully) learned to "just always repeat the observation!"
env = ParrotEnv({"parrot_shriek_range": gym.spaces.Box(-3.0, 3.0, (1, ))})
# Get the initial observation (some value between -10.0 and 10.0).
obs = env.reset()
done = False
total_reward = 0.0
# Play one episode.
while not done:
    # Compute a single action, given the current observation
    # from the environment.
    # (Completion of the truncated loop; assumes the standard
    # Algorithm.compute_single_action() inference call.)
    action = algo.compute_single_action(obs)
    # Apply the computed action in the environment.
    obs, reward, done, info = env.step(action)
    # Sum up rewards for reporting purposes.
    total_reward += reward

# Report results.
print(f"Played 1 episode; total-reward={total_reward}")
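# The excerpt above assumes a ParrotEnv gym.Env defined earlier in the doc.
# A minimal sketch of what such an env could look like; the episode length,
# observation range, and reward shaping are assumptions inferred from the
# comments above, not the original class definition.
import gym
import numpy as np


class ParrotEnvSketch(gym.Env):
    """Env in which the agent must repeat the observed value as its action."""

    def __init__(self, config=None):
        config = config or {}
        # The action space doubles as the range of values the env "shrieks".
        self.action_space = config.get(
            "parrot_shriek_range", gym.spaces.Box(-1.0, 1.0, (1, )))
        self.observation_space = self.action_space
        self.cur_obs = None
        self.num_steps = 0

    def reset(self):
        self.num_steps = 0
        self.cur_obs = self.observation_space.sample()
        return self.cur_obs

    def step(self, action):
        self.num_steps += 1
        # Reward is the negative distance between action and observation:
        # 0.0 is optimal (exact repetition of the observed value).
        reward = -float(np.abs(action - self.cur_obs)[0])
        done = self.num_steps >= 10  # assumed: 10 guesses per episode
        self.cur_obs = self.observation_space.sample()
        return self.cur_obs, reward, done, {}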
# You should see both the printed X and Y approach 200 as this trains:
# info:
#   policy_reward_mean:
#     dqn_policy: X
#     ppo_policy: Y
for i in range(args.stop_iters):
    print("== Iteration", i, "==")

    # improve the DQN policy
    print("-- DQN --")
    result_dqn = dqn_trainer.train()
    print(pretty_print(result_dqn))

    # improve the PPO policy
    print("-- PPO --")
    result_ppo = ppo_trainer.train()
    print(pretty_print(result_ppo))

    # Test passed gracefully.
    if (args.as_test
            and result_dqn["episode_reward_mean"] > args.stop_reward
            and result_ppo["episode_reward_mean"] > args.stop_reward):
        print("test passed (both agents above requested reward)")
        quit(0)

    # swap weights to synchronize
    dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
    ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))

# Desired reward not reached.
if args.as_test:
    # (Completion of the truncated snippet; the exact error message is an
    # assumption.)
    raise ValueError(f"Desired reward ({args.stop_reward}) not reached!")
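# The loop above assumes `ppo_trainer` and `dqn_trainer` were created earlier
# against the same multi-agent env, each trainer updating only its own policy
# while carrying a (periodically synced) copy of the other's. A rough sketch
# of that setup; the import paths, the "multi_agent_cartpole" env name
# (assumed to be registered via register_env elsewhere), the spaces, and the
# agent-id mapping are all illustrative assumptions.
import gym
from ray.rllib.algorithms.dqn import DQN
from ray.rllib.algorithms.ppo import PPO

obs_space = gym.spaces.Box(-10.0, 10.0, (4, ))  # placeholder spaces
act_space = gym.spaces.Discrete(2)

policies = {
    # A policy class of None lets RLlib pick the algorithm's default.
    "ppo_policy": (None, obs_space, act_space, {}),
    "dqn_policy": (None, obs_space, act_space, {}),
}


def policy_mapping_fn(agent_id, *args, **kwargs):
    # Assumed: even agent ids act through PPO, odd ones through DQN.
    return "ppo_policy" if agent_id % 2 == 0 else "dqn_policy"


ppo_trainer = PPO(
    env="multi_agent_cartpole",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            # Only update the PPO policy from this trainer's samples.
            "policies_to_train": ["ppo_policy"],
        },
    },
)

dqn_trainer = DQN(
    env="multi_agent_cartpole",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            # Only update the DQN policy from this trainer's samples.
            "policies_to_train": ["dqn_policy"],
        },
    },
)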
"framework": "tf", # Tweak the default model provided automatically by RLlib, # given the environment's observation- and action spaces. "model": { "fcnet_hiddens": [64, 64], "fcnet_activation": "relu", }, # Set up a separate evaluation worker set for the # `algo.evaluate()` call after training (see below). "evaluation_num_workers": 1, # Only for evaluation runs, render the env. "evaluation_config": { "render_env": True, }, } # Create our RLlib Trainer. algo = PPO(config=config) # Run it for n training iterations. A training iteration includes # parallel sample collection by the environment workers as well as # loss calculation on the collected batch and a model update. for _ in range(3): print(algo.train()) # Evaluate the trained Trainer (and render each timestep to the shell's # output). algo.evaluate() # __rllib-in-60s-end__
"framework": "tf", # Tweak the default model provided automatically by RLlib, # given the environment's observation- and action spaces. "model": { "fcnet_hiddens": [64, 64], "fcnet_activation": "relu", }, # Set up a separate evaluation worker set for the # `trainer.evaluate()` call after training (see below). "evaluation_num_workers": 1, # Only for evaluation runs, render the env. "evaluation_config": { "render_env": True, }, } # Create our RLlib Trainer. trainer = PPO(config=config) # Run it for n training iterations. A training iteration includes # parallel sample collection by the environment workers as well as # loss calculation on the collected batch and a model update. for _ in range(3): print(trainer.train()) # Evaluate the trained Trainer (and render each timestep to the shell's # output). trainer.evaluate() # __rllib-in-60s-end__
# You should see both the printed X and Y approach 200 as this trains:
# info:
#   policy_reward_mean:
#     dqn_policy: X
#     ppo_policy: Y
for i in range(args.stop_iters):
    print("== Iteration", i, "==")

    # improve the DQN policy
    print("-- DQN --")
    result_dqn = dqn.train()
    print(pretty_print(result_dqn))

    # improve the PPO policy
    print("-- PPO --")
    result_ppo = ppo.train()
    print(pretty_print(result_ppo))

    # Test passed gracefully.
    if (args.as_test
            and result_dqn["episode_reward_mean"] > args.stop_reward
            and result_ppo["episode_reward_mean"] > args.stop_reward):
        print("test passed (both agents above requested reward)")
        quit(0)

    # swap weights to synchronize
    dqn.set_weights(ppo.get_weights(["ppo_policy"]))
    ppo.set_weights(dqn.get_weights(["dqn_policy"]))

# Desired reward not reached.
if args.as_test:
    # (Completion of the truncated snippet; the exact error message is an
    # assumption.)
    raise ValueError(f"Desired reward ({args.stop_reward}) not reached!")