results = tune.run(
    args.run, config=config, stop=stop, verbose=2, checkpoint_at_end=True
)

if args.as_test:
    check_learning_achieved(results, args.stop_reward)

checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean",
)

checkpoint_path = checkpoints[0][0]
trainer = PPO(config)
trainer.restore(checkpoint_path)

# Inference loop.
env = StatelessCartPole()

# Run manual inference loop for n episodes.
for _ in range(10):
    episode_reward = 0.0
    reward = 0.0
    action = 0
    done = False
    obs = env.reset()
    while not done:
        # Create a dummy action using the same observation n times,
        # as well as dummy prev-n-actions and prev-n-rewards.
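        # Hedged sketch of the rest of the loop (not part of the original
        # snippet): build a dummy input-dict containing the current obs
        # repeated `num_frames` times plus zero prev-actions/-rewards and
        # query the Trainer for an action. `num_frames` is an assumed local
        # constant (it must match the model's frame-stacking/memory size),
        # and `import numpy as np` is assumed at the top of the script.
        num_frames = 16  # assumption
        input_dict = {
            "obs": obs,
            "prev_n_obs": np.stack([obs for _ in range(num_frames)]),
            "prev_n_actions": np.stack([0 for _ in range(num_frames)]),
            "prev_n_rewards": np.stack([1.0 for _ in range(num_frames)]),
        }
        action, state, logits = trainer.compute_single_action(
            input_dict=input_dict, full_fetch=True)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    print(f"Episode reward: {episode_reward}")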
        self.cur_pos += 1
        # Set `done` flag when end of corridor (goal) reached.
        done = self.cur_pos >= self.end_pos
        # +1 when goal reached, otherwise -0.1.
        reward = 1.0 if done else -0.1
        return [self.cur_pos], reward, done, {}


# Create an RLlib Trainer instance.
trainer = PPO(
    config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": SimpleCorridor,
        # Config dict to be passed to our custom env's constructor.
        "env_config": {
            # Use corridor with 20 fields (including S and G).
            "corridor_length": 20
        },
        # Parallelize environment rollouts.
        "num_workers": 3,
    }
)

# Train for n iterations and report results (mean episode rewards).
# Since we have to move at least 19 times in the env to reach the goal and
# each move gives us -0.1 reward (except the last move at the end: +1.0),
# we can expect to reach an optimal episode reward of -0.1*18 + 1.0 = -0.8.
for i in range(5):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")
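# Hedged sketch (not part of the original snippet above): run a single
# episode with the trained Trainer to see the learned behavior. The shorter
# corridor length used here is an assumption for illustration.
env = SimpleCorridor({"corridor_length": 10})
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    # Compute a single action for the current observation.
    action = trainer.compute_single_action(obs)
    obs, reward, done, info = env.step(action)
    total_reward += reward
print(f"Played 1 episode; total-reward={total_reward}")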
    def test_simple_optimizer_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPO(
            env="counter",
            config={
                "num_workers": 0,
                "rollout_fragment_length": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "simple_optimizer": True,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            },
        )
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]],
        )
        self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(
            batch1["sequences"].tolist(),
            [
                [[10], [11], [12], [13]],
                [[14], [0], [0], [0]],
                [[0], [1], [2], [3]],
                [[4], [0], [0], [0]],
            ],
        )
        self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
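    # Hedged illustration (not part of the original test): with max_seq_len=4,
    # a rollout of 10 consecutive timesteps is chopped into sub-sequences of
    # at most 4 steps and zero-padded, which is what the `batch0` assertions
    # above expect. Plain Python only; no RLlib APIs are used.
    @staticmethod
    def _chop_and_pad(timesteps, max_seq_len=4):
        seqs, seq_lens = [], []
        for start in range(0, len(timesteps), max_seq_len):
            chunk = timesteps[start:start + max_seq_len]
            seq_lens.append(len(chunk))
            padded = chunk + [0] * (max_seq_len - len(chunk))
            seqs.append([[t] for t in padded])
        return seqs, seq_lens

    # _chop_and_pad(list(range(10))) ->
    # ([[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]],
    #  [4, 4, 2])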
    def test_minibatch_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPO(
            env="counter",
            config={
                "shuffle_sequences": False,  # for deterministic testing
                "num_workers": 0,
                "rollout_fragment_length": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            },
        )
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        self.assertEqual(batch1[SampleBatch.SEQ_LENS].tolist(), [2, 3, 4, 1])
        check(
            batch0["sequences"],
            [
                [[0], [1], [2], [3]],
                [[4], [5], [6], [7]],
                [[8], [9], [0], [0]],
            ],
        )
        check(
            batch1["sequences"],
            [
                [[10], [11], [0], [0]],
                [[12], [13], [14], [0]],
                [[0], [1], [2], [3]],
                [[4], [0], [0], [0]],
            ],
        )

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        self.assertEqual(batch3[SampleBatch.SEQ_LENS].tolist(), [4, 4, 2])
        check(
            batch2["sequences"],
            [
                [[0], [1], [2], [3]],
                [[4], [5], [6], [7]],
                [[8], [9], [0], [0]],
            ],
        )
        check(
            batch3["sequences"],
            [
                [[5], [6], [7], [8]],
                [[9], [10], [11], [12]],
                [[13], [14], [0], [0]],
            ],
        )
results = tune.run(
    "PPO",
    config=config,
    stop={"training_iteration": args.pre_training_iters},
    verbose=1,
    checkpoint_freq=1,
    checkpoint_at_end=True,
)
print("Pre-training done.")

best_checkpoint = results.get_best_checkpoint(results.trials[0], mode="max")
print(f".. best checkpoint was: {best_checkpoint}")

# Create a new dummy Trainer to "fix" our checkpoint.
new_trainer = PPO(config=config)
# Get untrained weights for all policies.
untrained_weights = new_trainer.get_weights()
# Restore all policies from checkpoint.
new_trainer.restore(best_checkpoint)
# Set back all weights (except for 1st agent) to original
# untrained weights.
new_trainer.set_weights(
    {pid: w for pid, w in untrained_weights.items() if pid != "policy_0"})
# Create the checkpoint from which tune can pick up the
# experiment.
new_checkpoint = new_trainer.save()
new_trainer.stop()
print(
    ".. checkpoint to restore from (all policies reset, "
    f"except policy_0): {new_checkpoint}"
)
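# Hedged sketch (not part of the original snippet): resume training from the
# partially reset checkpoint by pointing `tune.run` at it via its `restore`
# argument. The stop criterion used here is an assumption for illustration.
results = tune.run(
    "PPO",
    config=config,
    restore=new_checkpoint,
    stop={"training_iteration": 100},  # assumed stop criterion
    verbose=1,
)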
prep = get_preprocessor(env.observation_space)(env.observation_space)
# <ray.rllib.models.preprocessors.GenericPixelPreprocessor object at 0x7fc4d049de80>

# Observations should be preprocessed prior to feeding into a model
env.reset().shape
# (210, 160, 3)
prep.transform(env.reset()).shape
# (84, 84, 3)
# __preprocessing_observations_end__

# __query_action_dist_start__
# Get a reference to the policy
import numpy as np
from ray.rllib.algorithms.ppo import PPO

algo = PPO(env="CartPole-v0", config={"framework": "tf2", "num_workers": 0})
policy = algo.get_policy()
# <ray.rllib.policy.eager_tf_policy.PPOTFPolicy_eager object at 0x7fd020165470>

# Run a forward pass to get model output logits. Note that complex observations
# must be preprocessed as in the above code block.
logits, _ = policy.model({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})
# (<tf.Tensor: id=1274, shape=(1, 2), dtype=float32, numpy=...>, [])

# Compute action distribution given logits
policy.dist_class
# <class_object 'ray.rllib.models.tf.tf_action_dist.Categorical'>
dist = policy.dist_class(logits, policy.model)
# <ray.rllib.models.tf.tf_action_dist.Categorical object at 0x7fd02301d710>

# Query the distribution for samples, sample logps
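# Hedged continuation (not part of the original snippet): sample from the
# distribution and compute log-probabilities via the standard
# ActionDistribution API, then query the model's value estimate for the most
# recent forward pass. The printed tensor reprs are illustrative only.
dist.sample()
# <tf.Tensor: shape=(1,), dtype=int64, numpy=...>
dist.logp([1])
# <tf.Tensor: shape=(1,), dtype=float32, numpy=...>

# Get the estimated value for the most recent forward pass.
policy.model.value_function()
# <tf.Tensor: shape=(1,), dtype=float32, numpy=...>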
        done = self.episode_len >= 10
        # r = -abs(obs - action)
        reward = -sum(abs(self.cur_obs - action))
        # Set a new observation (random sample).
        self.cur_obs = self.observation_space.sample()
        return self.cur_obs, reward, done, {}


# Create an RLlib Algorithm instance to learn how to act in the above
# environment.
algo = PPO(
    config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": ParrotEnv,
        # Config dict to be passed to our custom env's constructor.
        "env_config": {
            "parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, ))
        },
        # Parallelize environment rollouts.
        "num_workers": 3,
    })

# Train for n iterations and report results (mean episode rewards).
# Since we have to guess 10 times and the optimal reward is 0.0
# (exact match between observation and action value),
# we can expect to reach an optimal episode reward of 0.0.
for i in range(5):
    results = algo.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
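# Hedged sketch of the inference step announced above (not part of the
# original snippet): compute greedy actions (explore=False) for a few
# observations from a freshly created env. The env-config passed here is an
# assumption and mirrors the training config.
env = ParrotEnv({"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, ))})
obs = env.reset()
for _ in range(3):
    action = algo.compute_single_action(obs, explore=False)
    obs, reward, done, info = env.step(action)
    print(f"obs={obs}, action={action}, reward={reward}")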
results = None
if not args.from_checkpoint:
    results = tune.run(
        "PPO",
        config=config,
        stop=stop,
        checkpoint_at_end=True,
        checkpoint_freq=10,
        verbose=3,
    )

# Restore trained trainer (set to non-explore behavior) and play against
# human on command line.
if args.num_episodes_human_play > 0:
    num_episodes = 0
    trainer = PPO(config=dict(config, **{"explore": False}))
    if args.from_checkpoint:
        trainer.restore(args.from_checkpoint)
    else:
        checkpoint = results.get_last_checkpoint()
        if not checkpoint:
            raise ValueError("No last checkpoint found in results!")
        trainer.restore(checkpoint)

    # Play from the command line against the trained agent
    # in an actual (non-RLlib-wrapped) open-spiel env.
    human_player = 1
    env = Environment(args.env)

    while num_episodes < args.num_episodes_human_play:
        print("You play as {}".format("o" if human_player else "x"))
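        # Hedged sketch of the rest of the play loop (not part of the
        # original snippet). Assumptions: open_spiel's `rl_environment`
        # TimeStep API (`reset()`, `step([action])`, `last()`, and an
        # `observations` dict with "current_player", "legal_actions" and
        # "info_state"), `import numpy as np` at the top of the script, and
        # a trained policy id of "main".
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == human_player:
                legal = time_step.observations["legal_actions"][player_id]
                action = int(input(f"Choose an action from {legal}: "))
            else:
                obs = np.array(
                    time_step.observations["info_state"][player_id])
                action = trainer.compute_single_action(
                    obs, policy_id="main")
            time_step = env.step([action])
        num_episodes += 1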
    if agent_id % 2 == 0:
        return "ppo_policy"
    else:
        return "dqn_policy"


ppo_trainer = PPO(
    env="multi_agent_cartpole",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["ppo_policy"],
        },
        "model": {
            "vf_share_layers": True,
        },
        "num_sgd_iter": 6,
        "vf_loss_coeff": 0.01,
        # disable filters, otherwise we would need to synchronize those
        # as well to the DQN agent
        "observation_filter": "MeanStdFilter",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    },
)

dqn_trainer = DQN(
    env="multi_agent_cartpole",
    config={
        "multiagent": {
"framework": "tf", # Tweak the default model provided automatically by RLlib, # given the environment's observation- and action spaces. "model": { "fcnet_hiddens": [64, 64], "fcnet_activation": "relu", }, # Set up a separate evaluation worker set for the # `algo.evaluate()` call after training (see below). "evaluation_num_workers": 1, # Only for evaluation runs, render the env. "evaluation_config": { "render_env": True, }, } # Create our RLlib Trainer. algo = PPO(config=config) # Run it for n training iterations. A training iteration includes # parallel sample collection by the environment workers as well as # loss calculation on the collected batch and a model update. for _ in range(3): print(algo.train()) # Evaluate the trained Trainer (and render each timestep to the shell's # output). algo.evaluate() # __rllib-in-60s-end__
"framework": "tf", # Tweak the default model provided automatically by RLlib, # given the environment's observation- and action spaces. "model": { "fcnet_hiddens": [64, 64], "fcnet_activation": "relu", }, # Set up a separate evaluation worker set for the # `trainer.evaluate()` call after training (see below). "evaluation_num_workers": 1, # Only for evaluation runs, render the env. "evaluation_config": { "render_env": True, }, } # Create our RLlib Trainer. trainer = PPO(config=config) # Run it for n training iterations. A training iteration includes # parallel sample collection by the environment workers as well as # loss calculation on the collected batch and a model update. for _ in range(3): print(trainer.train()) # Evaluate the trained Trainer (and render each timestep to the shell's # output). trainer.evaluate() # __rllib-in-60s-end__
def my_train_fn(config, reporter):
    iterations = config.pop("train-iterations", 10)

    # Train for n iterations with high LR.
    agent1 = PPO(env="CartPole-v0", config=config)
    for _ in range(iterations):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR.
    config["lr"] = 0.0001
    agent2 = PPO(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(iterations):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    config = {
        # Special flag signalling `my_train_fn` how many iters to do.
        "train-iterations": 2,
        "lr": 0.01,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "framework": args.framework,
    }
    resources = PPO.default_resource_request(config)
    tune.run(my_train_fn, resources_per_trial=resources, config=config)