def testBatchesLargerWhenVectorized(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=4,
        num_envs=4)
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 4)
def test_batches_larger_when_vectorized(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy=MockPolicy,
        batch_mode="truncate_episodes",
        batch_steps=4,
        num_envs=4)
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 4)
def testBatchesLargerWhenVectorized(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=4,
        num_envs=4)
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 4)
def testTrainMultiCartpoleManyPolicies(self):
    n = 20
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                       {})
    policy_ids = list(policies.keys())
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        batch_steps=100)
    optimizer = SyncSamplesOptimizer(ev, [], {})
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def testVectorEnvSupport(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=10)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
def testVectorEnvSupport(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=10)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 8)
def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGTFPolicy, obs_space, act_space, {}),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    worker = RolloutWorker(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_workers = [
            RolloutWorker.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_workers = []
    workers = WorkerSet._from_existing(worker, remote_workers)
    optimizer = optimizer_cls(workers)
    for i in range(200):
        worker.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNTFPolicy) else None)
        optimizer.step()
        result = collect_metrics(worker, remote_workers)
        if i % 20 == 0:

            def do_update(p):
                if isinstance(p, DQNTFPolicy):
                    p.update_target()

            worker.foreach_policy(lambda p, _: do_update(p))
        print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")
def test_train_multi_cartpole_many_policies(self):
    n = 20
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    worker = RolloutWorker(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        batch_steps=100)
    workers = WorkerSet._from_existing(worker, [])
    optimizer = SyncSamplesOptimizer(workers)
    for i in range(100):
        optimizer.step()
        result = collect_metrics(worker)
        print("Iteration {}, rew {}".format(i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def __call__(self, samples):
    # Dreamer training loop.
    for n in range(self.dreamer_train_iters):
        print(f"sub-iteration={n}/{self.dreamer_train_iters}")
        batch = self.episode_buffer.sample(self.batch_size)
        # if n == self.dreamer_train_iters - 1:
        #     batch["log_gif"] = True
        fetches = self.worker.learn_on_batch(batch)

    # Custom Logging
    policy_fetches = fetches[DEFAULT_POLICY_ID]["learner_stats"]
    if "log_gif" in policy_fetches:
        gif = policy_fetches["log_gif"]
        policy_fetches["log_gif"] = self.postprocess_gif(gif)

    # Metrics Calculation
    metrics = _get_shared_metrics()
    metrics.info[LEARNER_INFO] = fetches
    metrics.counters[STEPS_SAMPLED_COUNTER] = self.episode_buffer.timesteps
    metrics.counters[STEPS_SAMPLED_COUNTER] *= self.repeat

    res = collect_metrics(local_worker=self.worker)
    res["info"] = metrics.info
    res["info"].update(metrics.counters)
    res["timesteps_total"] = metrics.counters[STEPS_SAMPLED_COUNTER]

    self.episode_buffer.add(samples)
    return res
def _train(self):
    self.optimizer.step()
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    result = collect_metrics(self.local_evaluator, self.remote_evaluators)
    result = result._replace(info=self.optimizer.stats())
    return result
def eval_func(trainer, workers):
    logger.info("Evaluating current policy for {} episodes.".format(
        trainer.config["evaluation_num_episodes"]))
    if trainer.config["evaluation_num_workers"] == 0:
        for _ in range(trainer.config["evaluation_num_episodes"]):
            trainer.evaluation_workers.local_worker().sample()
    else:
        num_rounds = int(
            math.ceil(trainer.config["evaluation_num_episodes"] /
                      trainer.config["evaluation_num_workers"]))
        num_workers = len(trainer.evaluation_workers.remote_workers())
        num_episodes = num_rounds * num_workers
        for i in range(num_rounds):
            logger.info("Running round {} of parallel evaluation "
                        "({}/{} episodes)".format(i, (i + 1) * num_workers,
                                                  num_episodes))
            ray.get([
                w.sample.remote()
                for w in trainer.evaluation_workers.remote_workers()
            ])
    metrics = collect_metrics(
        trainer.evaluation_workers.local_worker(),
        trainer.evaluation_workers.remote_workers(),
    )
    return metrics
def test_train_external_multi_agent_cartpole_many_policies(self):
    n = 20
    single_env = gym.make("CartPole-v0")
    act_space = single_env.action_space
    obs_space = single_env.observation_space
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    ev = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
        policy=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        rollout_fragment_length=100)
    optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def training_workflow(config, reporter):
    # Setup policy and policy evaluation actors
    env = gym.make("CartPole-v0")
    policy = CustomPolicy(env.observation_space, env.action_space, {})
    workers = [
        PolicyEvaluator.as_remote().remote(lambda c: gym.make("CartPole-v0"),
                                           CustomPolicy)
        for _ in range(config["num_workers"])
    ]

    for _ in range(config["num_iters"]):
        # Broadcast weights to the policy evaluation workers
        weights = ray.put({"default_policy": policy.get_weights()})
        for w in workers:
            w.set_weights.remote(weights)

        # Gather a batch of samples
        T1 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))
        print("DEBUG* BATCH ************************")
        print(T1)
        print("DEBUG*************************")

        # Improve the policy using the T1 batch
        policy.learn_on_batch(T1)

        reporter(**collect_metrics(remote_evaluators=workers))
def testAutoVectorization(self):
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=20),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=16,
        num_envs=8)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 8)
def _train(self):
    def postprocess_samples(batch):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal
        value = batch["advantages"]
        standardized = (value - value.mean()) / max(1e-4, value.std())
        batch.data["advantages"] = standardized
        batch.shuffle()
        dummy = np.zeros_like(batch["advantages"])
        if not self.config["use_gae"]:
            batch.data["value_targets"] = dummy
            batch.data["vf_preds"] = dummy

    extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples)
    kl = np.array(extra_fetches["kl"]).mean(axis=1)[-1]
    total_loss = np.array(extra_fetches["total_loss"]).mean(axis=1)[-1]
    policy_loss = np.array(extra_fetches["policy_loss"]).mean(axis=1)[-1]
    vf_loss = np.array(extra_fetches["vf_loss"]).mean(axis=1)[-1]
    entropy = np.array(extra_fetches["entropy"]).mean(axis=1)[-1]
    newkl = self.local_evaluator.for_policy(lambda pi: pi.update_kl(kl))
    info = {
        "kl_divergence": kl,
        "kl_coefficient": newkl,
        "total_loss": total_loss,
        "policy_loss": policy_loss,
        "vf_loss": vf_loss,
        "entropy": entropy,
    }
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    res = collect_metrics(self.local_evaluator, self.remote_evaluators)
    res = res._replace(info=info)
    return res
def test_rllib_batch_policy_eval(init_done=False):
    if not init_done:
        init()
    evaluator = PolicyEvaluator(
        env_creator=lambda _: MultiCarlaEnv(configs),
        # TODO: Remove the hardcoded spaces
        policy_graph={
            "def_policy": (PGPolicyGraph,
                           Box(0.0, 255.0, shape=(84, 84, 3)),
                           Box(-1.0, 1.0, shape=(2, )),
                           {"gamma": 0.99})
        },
        policy_mapping_fn=lambda agent_id: "def_policy",
        batch_mode=BATCH_MODE,
        batch_steps=BATCH_STEPS,
        num_envs=NUM_ENVS)
    for _ in range(NUM_ENVS):
        samples, count = evaluator.sample_with_count()
        # print("sample:", samples.policy_batches["def_policy"]["actions"])
        # count >= BATCH_STEPS for complete_episodes
        # == for truncate_episodes
        if BATCH_MODE == "complete_episodes":
            assert count >= BATCH_STEPS, "Expected count:{}. actual:{}".format(
                BATCH_STEPS, count)
        elif BATCH_MODE == "truncate_episodes":
            assert count == BATCH_STEPS, "Expected count:{}. actual:{}".format(
                BATCH_STEPS, count)
        print("Successfully sampled {} items".format(count))
    results = collect_metrics(evaluator, [])
    print("results: \n", results)
    if BATCH_MODE == "complete_episodes":
        assert (results["episodes"] >= NUM_ENVS), \
            "Expected num episodes:{}, actual:{}".format(
                NUM_ENVS, results["episodes"])
def testVectorEnvSupport(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
        policy=MockPolicy,
        batch_mode="truncate_episodes",
        batch_steps=10)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
def test_reward_clipping(self):
    # Clipping: True (clip between -1.0 and 1.0).
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        clip_rewards=True,
        batch_mode="complete_episodes",
    )
    self.assertEqual(max(ev.sample()["rewards"]), 1)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episode_reward_mean"], 1000)
    ev.stop()

    from ray.rllib.examples.env.random_env import RandomEnv

    # Clipping in certain range (-2.0, 2.0).
    ev2 = RolloutWorker(
        env_creator=lambda _: RandomEnv(
            dict(
                reward_space=gym.spaces.Box(low=-10, high=10, shape=()),
                p_done=0.0,
                max_episode_len=10,
            )),
        policy_spec=MockPolicy,
        clip_rewards=2.0,
        batch_mode="complete_episodes",
    )
    sample = ev2.sample()
    self.assertEqual(max(sample["rewards"]), 2.0)
    self.assertEqual(min(sample["rewards"]), -2.0)
    self.assertLess(np.mean(sample["rewards"]), 0.5)
    self.assertGreater(np.mean(sample["rewards"]), -0.5)
    ev2.stop()

    # Clipping: Off.
    ev2 = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        clip_rewards=False,
        batch_mode="complete_episodes",
    )
    self.assertEqual(max(ev2.sample()["rewards"]), 100)
    result2 = collect_metrics(ev2, [])
    self.assertEqual(result2["episode_reward_mean"], 1000)
    ev2.stop()
def testRewardClipping(self):
    # clipping on
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy=MockPolicy,
        clip_rewards=True,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev.sample()["rewards"]), 1)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episode_reward_mean"], 1000)

    # clipping off
    ev2 = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy=MockPolicy,
        clip_rewards=False,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev2.sample()["rewards"]), 100)
    result2 = collect_metrics(ev2, [])
    self.assertEqual(result2["episode_reward_mean"], 1000)
def _evaluate(self):
    steps = 0
    self.evaluation_ev.restore(self.local_evaluator.save())
    self.evaluation_ev.foreach_policy(lambda p, _: p.set_epsilon(0))
    while steps < self.evaluation_steps:
        batch = self.evaluation_ev.sample()
        steps += batch.count
    metrics = collect_metrics(self.evaluation_ev)
    return {"evaluation": metrics}
def _evaluate(self):
    logger.info("Evaluating current policy for {} episodes".format(
        self.config["evaluation_num_episodes"]))
    self.evaluation_ev.restore(self.local_evaluator.save())
    self.evaluation_ev.foreach_policy(lambda p, _: p.set_epsilon(0))
    for _ in range(self.config["evaluation_num_episodes"]):
        self.evaluation_ev.sample()
    metrics = collect_metrics(self.evaluation_ev)
    return {"evaluation": metrics}
def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGPolicyGraph, obs_space, act_space, {}),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_evs = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy_graph=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_evs = []
    optimizer = optimizer_cls(ev, remote_evs, {})
    for i in range(200):
        ev.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNPolicyGraph) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:
            ev.foreach_policy(
                lambda p, _: p.update_target()
                if isinstance(p, DQNPolicyGraph) else None)
        print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")
def collect_metrics(self):
    """Returns evaluator and optimizer stats.

    Returns:
        res (TrainingResult): TrainingResult from evaluator metrics with
            `info` replaced with stats from self.
    """
    res = collect_metrics(self.local_evaluator, self.remote_evaluators)
    res = res._replace(info=self.stats())
    return res
def _evaluate(self):
    """Evaluates current policy under `evaluation_config` settings.

    Note that this default implementation does not do anything beyond
    merging evaluation_config with the normal trainer config.
    """
    if not self.config["evaluation_config"]:
        raise ValueError(
            "No evaluation_config specified. It doesn't make sense "
            "to enable evaluation without specifying any config "
            "overrides, since the results will be the "
            "same as reported during normal policy evaluation.")

    self._before_evaluate()

    # Broadcast the new policy weights to all evaluation workers.
    logger.info("Synchronizing weights to evaluation workers.")
    weights = ray.put(self.workers.local_worker().save())
    self.evaluation_workers.foreach_worker(
        lambda w: w.restore(ray.get(weights)))
    self._sync_filters_if_needed(self.evaluation_workers)

    if self.config["custom_eval_function"]:
        logger.info("Running custom eval function {}".format(
            self.config["custom_eval_function"]))
        metrics = self.config["custom_eval_function"](
            self, self.evaluation_workers)
        if not metrics or not isinstance(metrics, dict):
            raise ValueError("Custom eval function must return "
                             "dict of metrics, got {}.".format(metrics))
    else:
        logger.info("Evaluating current policy for {} episodes.".format(
            self.config["evaluation_num_episodes"]))
        if self.config["evaluation_num_workers"] == 0:
            for _ in range(self.config["evaluation_num_episodes"]):
                self.evaluation_workers.local_worker().sample()
        else:
            num_rounds = int(
                math.ceil(self.config["evaluation_num_episodes"] /
                          self.config["evaluation_num_workers"]))
            num_workers = len(self.evaluation_workers.remote_workers())
            num_episodes = num_rounds * num_workers
            for i in range(num_rounds):
                logger.info("Running round {} of parallel evaluation "
                            "({}/{} episodes)".format(
                                i, (i + 1) * num_workers, num_episodes))
                ray.get([
                    w.sample.remote()
                    for w in self.evaluation_workers.remote_workers()
                ])
        metrics = collect_metrics(self.evaluation_workers.local_worker(),
                                  self.evaluation_workers.remote_workers())
    return {"evaluation": metrics}
def _train(self):
    start_timestep = self.global_timestep

    # Update worker explorations
    exp_vals = [self.exploration0.value(self.global_timestep)]
    self.local_evaluator.foreach_trainable_policy(
        lambda p, _: p.set_epsilon(exp_vals[0]))
    for i, e in enumerate(self.remote_evaluators):
        exp_val = self.explorations[i].value(self.global_timestep)
        e.foreach_trainable_policy.remote(
            lambda p, _: p.set_epsilon(exp_val))
        exp_vals.append(exp_val)

    # Do optimization steps
    start = time.time()
    while (self.global_timestep - start_timestep <
           self.config["timesteps_per_iteration"]
           ) or time.time() - start < self.config["min_iter_time_s"]:
        self.optimizer.step()
        self.update_target_if_needed()

    if self.config["per_worker_exploration"]:
        # Only collect metrics from the third of workers with lowest eps
        result = collect_metrics(
            self.local_evaluator,
            self.remote_evaluators[-len(self.remote_evaluators) // 3:],
            timeout_seconds=self.config["collect_metrics_timeout"])
    else:
        result = collect_metrics(
            self.local_evaluator,
            self.remote_evaluators,
            timeout_seconds=self.config["collect_metrics_timeout"])

    result.update(
        timesteps_this_iter=self.global_timestep - start_timestep,
        info=dict({
            "min_exploration": min(exp_vals),
            "max_exploration": max(exp_vals),
            "num_target_updates": self.num_target_updates,
        }, **self.optimizer.stats()))
    return result
def testRewardClipping(self):
    # clipping on
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_graph=MockPolicyGraph,
        clip_rewards=True,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev.sample()["rewards"]), 1)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episode_reward_mean"], 1000)

    # clipping off
    ev2 = PolicyEvaluator(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_graph=MockPolicyGraph,
        clip_rewards=False,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev2.sample()["rewards"]), 100)
    result2 = collect_metrics(ev2, [])
    self.assertEqual(result2["episode_reward_mean"], 1000)
def test_vector_env_support(self):
    # Test a vector env that contains 8 actual envs
    # (MockEnv instances).
    ev = RolloutWorker(
        env_creator=(
            lambda _: VectorizedMockEnv(episode_length=20, num_envs=8)),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=10,
    )
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
    ev.stop()

    # Test a vector env that pretends(!) to contain 4 envs, but actually
    # only has 1 (CartPole).
    ev = RolloutWorker(
        env_creator=(lambda _: MockVectorEnv(20, mocked_num_envs=4)),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=10,
    )
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertGreater(result["episodes_this_iter"], 3)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertGreater(result["episodes_this_iter"], 6)
    ev.stop()
def post_process_metrics(adapt_iter, workers, metrics):
    # Obtain Current Dataset Metrics and filter out
    name = "_adapt_" + str(adapt_iter) if adapt_iter > 0 else ""

    # Only workers are collecting data
    res = collect_metrics(remote_workers=workers.remote_workers())

    metrics["episode_reward_max" + str(name)] = res["episode_reward_max"]
    metrics["episode_reward_mean" + str(name)] = res["episode_reward_mean"]
    metrics["episode_reward_min" + str(name)] = res["episode_reward_min"]

    return metrics
def testMetrics(self):
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    remote_ev = CommonPolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result.episodes_total, 20)
    self.assertEqual(result.episode_reward_mean, 10)
def post_process_metrics(prefix, workers, metrics):
    """Update current dataset metrics and filter out specific keys.

    Args:
        prefix: Prefix string to be appended
        workers: Set of workers
        metrics: Current metrics dictionary
    """
    res = collect_metrics(remote_workers=workers.remote_workers())
    for key in METRICS_KEYS:
        metrics[prefix + "_" + key] = res[key]
    return metrics
def testAutoVectorization(self):
    ev = PolicyEvaluator(
        env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=2,
        num_envs=8)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
    indices = []
    for env in ev.async_env.vector_env.envs:
        self.assertEqual(env.unwrapped.config.worker_index, 0)
        indices.append(env.unwrapped.config.vector_index)
    self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
def testAutoVectorization(self):
    ev = RolloutWorker(
        env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
        policy=MockPolicy,
        batch_mode="truncate_episodes",
        batch_steps=2,
        num_envs=8)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
    indices = []
    for env in ev.async_env.vector_env.envs:
        self.assertEqual(env.unwrapped.config.worker_index, 0)
        indices.append(env.unwrapped.config.vector_index)
    self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
def testMetrics(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy=MockPolicy,
        batch_mode="complete_episodes")
    remote_ev = RolloutWorker.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy=MockPolicy,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result["episodes_this_iter"], 20)
    self.assertEqual(result["episode_reward_mean"], 10)
def testMetrics(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    remote_ev = PolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result["episodes_this_iter"], 20)
    self.assertEqual(result["episode_reward_mean"], 10)
def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGPolicyGraph, obs_space, act_space, {}),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:
        remote_evs = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy_graph=policies,
                policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
                batch_steps=50)
        ]
    else:
        remote_evs = []
    optimizer = optimizer_cls(ev, remote_evs, {})
    for i in range(200):
        ev.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNPolicyGraph) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:
            ev.foreach_policy(
                lambda p, _: p.update_target()
                if isinstance(p, DQNPolicyGraph) else None)
        print("Iter {}, rew {}".format(i, result.policy_reward_mean))
        print("Total reward", result.episode_reward_mean)
        if result.episode_reward_mean >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")
def test_metrics(self):
    # Allow for Unittest run.
    ray.init(num_cpus=5, ignore_reinit_error=True)
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy=MockPolicy,
        batch_mode="complete_episodes")
    remote_ev = RolloutWorker.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy=MockPolicy,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result["episodes_this_iter"], 20)
    self.assertEqual(result["episode_reward_mean"], 10)
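# A minimal sketch of the kind of fixed-length, constant-reward environment
# that the MockEnv / MockEnv2 fixtures referenced in the snippets above are
# assumed to be. SketchMockEnv is a hypothetical stand-in, not the actual
# RLlib test fixture. Assuming reward 1.0 per step and episode_length=10, an
# episode totals 10 (matching the episode_reward_mean == 10 assertions);
# assuming reward 100.0 per step, it totals 1000 (matching the reward-clipping
# tests, whose episode_reward_mean assertion is on the unclipped return).
import gym


class SketchMockEnv(gym.Env):
    """Runs for `episode_length` steps, emitting a constant reward per step."""

    def __init__(self, episode_length, reward_per_step=1.0):
        self.episode_length = episode_length
        self.reward_per_step = reward_per_step
        self.i = 0
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        self.i = 0
        return 0

    def step(self, action):
        self.i += 1
        done = self.i >= self.episode_length
        return 0, self.reward_per_step, done, {}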