def do_test_nested_dict(self, make_env, test_lstm=False):
    ModelCatalog.register_custom_model("composite", DictSpyModel)
    register_env("nested", make_env)
    pg = PGTrainer(
        env="nested",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite",
                "use_lstm": test_lstm,
            },
            "framework": "tf",
            "disable_env_checking": True,
        },
    )
    # Skip first passes as they came from the TorchPolicy loss
    # initialization.
    DictSpyModel.capture_index = 0
    pg.train()

    # Check that the model sees the correct reconstructed observations.
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("d_spy_in_{}".format(i))
        )
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = DICT_SAMPLES[i]["inner_state"]["job_status"]["task"]
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)
def do_test_nested_tuple(self, make_env):
    ModelCatalog.register_custom_model("composite2", TupleSpyModel)
    register_env("nested2", make_env)
    pg = PGTrainer(
        env="nested2",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite2",
            },
            "framework": "tf",
            "disable_env_checking": True,
        },
    )
    # Skip first passes as they came from the TorchPolicy loss
    # initialization.
    TupleSpyModel.capture_index = 0
    pg.train()

    # Check that the model sees the correct reconstructed observations.
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("t_spy_in_{}".format(i))
        )
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = TUPLE_SAMPLES[i][2]
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)
def test_custom_input_procedure(self):
    class CustomJsonReader(JsonReader):
        def __init__(self, ioctx: IOContext):
            super().__init__(ioctx.input_config["input_files"], ioctx)

    def input_creator(ioctx: IOContext) -> InputReader:
        return ShuffledInput(CustomJsonReader(ioctx))

    register_input("custom_input", input_creator)
    test_input_procedure = [
        "custom_input",
        input_creator,
        "ray.rllib.examples.custom_input_api.CustomJsonReader",
    ]
    for input_procedure in test_input_procedure:
        for fw in framework_iterator(frameworks=("torch", "tf")):
            self.write_outputs(self.test_dir, fw)
            agent = PGTrainer(
                env="CartPole-v0",
                config={
                    "input": input_procedure,
                    "input_config": {
                        "input_files": self.test_dir + fw,
                    },
                    "input_evaluation": [],
                    "framework": fw,
                },
            )
            result = agent.train()
            self.assertEqual(result["timesteps_total"], 250)
            self.assertTrue(np.isnan(result["episode_reward_mean"]))
def test_local(self):
    cf = DEFAULT_CONFIG.copy()
    cf["model"]["fcnet_hiddens"] = [10]
    cf["num_workers"] = 2

    for _ in framework_iterator(cf):
        agent = PGTrainer(cf, "CartPole-v0")
        print(agent.train())
        agent.stop()
def write_outputs(self, output, fw, output_config=None):
    agent = PGTrainer(
        env="CartPole-v0",
        config={
            "output": output + (fw if output != "logdir" else ""),
            "rollout_fragment_length": 250,
            "framework": fw,
            "output_config": output_config or {},
        },
    )
    agent.train()
    return agent
def test_multi_agent_dict_invalid_sub_values(self):
    config = {"multiagent": {"count_steps_by": "invalid_value"}}
    self.assertRaisesRegex(
        ValueError,
        "config.multiagent.count_steps_by must be",
        lambda: PGTrainer(config, env="CartPole-v0"),
    )

    config = {"multiagent": {"replay_mode": "invalid_value"}}
    self.assertRaisesRegex(
        ValueError,
        "config.multiagent.replay_mode must be",
        lambda: PGTrainer(config, env="CartPole-v0"),
    )
def test_multi_agent_with_flex_agents(self):
    register_env(
        "flex_agents_multi_agent_cartpole", lambda _: FlexAgentsMultiAgent()
    )
    pg = PGTrainer(
        env="flex_agents_multi_agent_cartpole",
        config={
            "num_workers": 0,
            "framework": "tf",
        },
    )
    for i in range(10):
        result = pg.train()
        print(
            "Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]
            )
        )
def test_agent_input_dir(self):
    for fw in framework_iterator(frameworks=("torch", "tf")):
        self.write_outputs(self.test_dir, fw)
        print("WROTE TO: ", self.test_dir)
        agent = PGTrainer(
            env="CartPole-v0",
            config={
                "input": self.test_dir + fw,
                "input_evaluation": [],
                "framework": fw,
            },
        )
        result = agent.train()
        self.assertEqual(result["timesteps_total"], 250)  # read from input
        self.assertTrue(np.isnan(result["episode_reward_mean"]))
def test_agent_input_list(self):
    for fw in framework_iterator(frameworks=("torch", "tf")):
        self.write_outputs(self.test_dir, fw)
        agent = PGTrainer(
            env="CartPole-v0",
            config={
                "input": glob.glob(self.test_dir + fw + "/*.json"),
                "input_evaluation": [],
                "rollout_fragment_length": 99,
                "framework": fw,
            },
        )
        result = agent.train()
        self.assertEqual(result["timesteps_total"], 250)  # read from input
        self.assertTrue(np.isnan(result["episode_reward_mean"]))
def test_train_cartpole(self):
    register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
    config = {"num_workers": 0}
    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        pg = PGTrainer(env="test", config=config)
        reached = False
        for i in range(80):
            result = pg.train()
            print(
                "Iteration {}, reward {}, timesteps {}".format(
                    i, result["episode_reward_mean"], result["timesteps_total"]
                )
            )
            if result["episode_reward_mean"] >= 80:
                reached = True
                break
        if not reached:
            raise Exception("failed to improve reward")
def test_agent_input_dict(self):
    for fw in framework_iterator():
        self.write_outputs(self.test_dir, fw)
        agent = PGTrainer(
            env="CartPole-v0",
            config={
                "input": {
                    self.test_dir + fw: 0.1,
                    "sampler": 0.9,
                },
                "train_batch_size": 2000,
                "input_evaluation": [],
                "framework": fw,
            },
        )
        result = agent.train()
        self.assertTrue(not np.isnan(result["episode_reward_mean"]))
def test_agent_input_eval_sim(self):
    for fw in framework_iterator():
        self.write_outputs(self.test_dir, fw)
        agent = PGTrainer(
            env="CartPole-v0",
            config={
                "input": self.test_dir + fw,
                "input_evaluation": ["simulation"],
                "framework": fw,
            },
        )
        for _ in range(50):
            result = agent.train()
            if not np.isnan(result["episode_reward_mean"]):
                return  # simulation ok
            time.sleep(0.1)
        assert False, "did not see any simulation results"
def test_train_multi_agent_cartpole_single_policy(self):
    n = 10
    register_env(
        "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": n})
    )
    pg = PGTrainer(
        env="multi_agent_cartpole",
        config={
            "num_workers": 0,
            "framework": "tf",
        },
    )
    for i in range(50):
        result = pg.train()
        print(
            "Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]
            )
        )
        if result["episode_reward_mean"] >= 50 * n:
            return
    raise Exception("failed to improve reward")
def test_nested_action_spaces(self):
    config = DEFAULT_CONFIG.copy()
    config["env"] = RandomEnv
    # Write output to check whether actions are written correctly.
    tmp_dir = os.popen("mktemp -d").read()[:-1]
    if not os.path.exists(tmp_dir):
        # Last resort: Resolve via the underlying tempdir (and cut the
        # leading "/tmp" from the mktemp result).
        tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:]
        assert os.path.exists(tmp_dir), f"'{tmp_dir}' not found!"
    config["output"] = tmp_dir
    # Switch off OPE as we don't write action-probs.
    # TODO: We should probably always write those if `output` is given.
    config["input_evaluation"] = []
    # Pretend actions in offline files are already normalized.
    config["actions_in_input_normalized"] = True

    for _ in framework_iterator(config):
        for name, action_space in SPACES.items():
            config["env_config"] = {
                "action_space": action_space,
            }
            for flatten in [False, True]:
                print(f"A={action_space} flatten={flatten}")
                shutil.rmtree(config["output"])
                config["_disable_action_flattening"] = not flatten
                trainer = PGTrainer(config)
                trainer.train()
                trainer.stop()

                # Check actions in the output file (whether properly
                # flattened or not).
                reader = JsonReader(
                    inputs=config["output"],
                    ioctx=trainer.workers.local_worker().io_context,
                )
                sample_batch = reader.next()
                if flatten:
                    assert isinstance(sample_batch["actions"], np.ndarray)
                    assert len(sample_batch["actions"].shape) == 2
                    assert sample_batch["actions"].shape[0] == len(sample_batch)
                else:
                    tree.assert_same_structure(
                        trainer.get_policy().action_space_struct,
                        sample_batch["actions"],
                    )

                # Test whether the offline data can be properly read by a
                # BCTrainer, configured accordingly.
                config["input"] = config["output"]
                del config["output"]
                bc_trainer = BCTrainer(config=config)
                bc_trainer.train()
                bc_trainer.stop()
                config["output"] = tmp_dir
                config["input"] = "sampler"
def test_multi_agent_dict_bad_policy_ids(self):
    config = {
        "multiagent": {
            "policies": {1, "good_id"},
            "policy_mapping_fn": lambda aid, **kw: "good_id",
        }
    }
    self.assertRaisesRegex(
        KeyError,
        "Policy IDs must always be of type",
        lambda: PGTrainer(config, env="CartPole-v0"),
    )
def test_multi_agent(self):
    register_env(
        "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 10})
    )

    for fw in framework_iterator():
        pg = PGTrainer(
            env="multi_agent_cartpole",
            config={
                "num_workers": 0,
                "output": self.test_dir,
                "multiagent": {
                    "policies": {"policy_1", "policy_2"},
                    "policy_mapping_fn": (
                        lambda aid, **kwargs: random.choice(
                            ["policy_1", "policy_2"]
                        )
                    ),
                },
                "framework": fw,
            },
        )
        pg.train()
        self.assertEqual(len(os.listdir(self.test_dir)), 1)
        pg.stop()

        pg = PGTrainer(
            env="multi_agent_cartpole",
            config={
                "num_workers": 0,
                "input": self.test_dir,
                "input_evaluation": ["simulation"],
                "train_batch_size": 2000,
                "multiagent": {
                    "policies": {"policy_1", "policy_2"},
                    "policy_mapping_fn": (
                        lambda aid, **kwargs: random.choice(
                            ["policy_1", "policy_2"]
                        )
                    ),
                },
                "framework": fw,
            },
        )
        for _ in range(50):
            result = pg.train()
            if not np.isnan(result["episode_reward_mean"]):
                return  # simulation ok
            time.sleep(0.1)
        assert False, "did not see any simulation results"
def test_callbacks(self):
    for fw in framework_iterator(frameworks=("torch", "tf")):
        counts = Counter()
        pg = PGTrainer(
            env="CartPole-v0",
            config={
                "num_workers": 0,
                "rollout_fragment_length": 50,
                "train_batch_size": 50,
                "callbacks": {
                    "on_episode_start": lambda x: counts.update({"start": 1}),
                    "on_episode_step": lambda x: counts.update({"step": 1}),
                    "on_episode_end": lambda x: counts.update({"end": 1}),
                    "on_sample_end": lambda x: counts.update({"sample": 1}),
                },
                "framework": fw,
            },
        )
        pg.train()
        pg.train()
        self.assertGreater(counts["sample"], 0)
        self.assertGreater(counts["start"], 0)
        self.assertGreater(counts["end"], 0)
        self.assertGreater(counts["step"], 0)
        pg.stop()
def test_multi_agent_dict_invalid_subkeys(self):
    config = {
        "multiagent": {
            "wrong_key": 1,
            "policies": {"p0"},
            "policies_to_train": ["p0"],
        }
    }
    self.assertRaisesRegex(
        KeyError,
        "You have invalid keys in your",
        lambda: PGTrainer(config, env="CartPole-v0"),
    )
def test_agent_input_postprocessing_enabled(self):
    for fw in framework_iterator(frameworks=("tf", "torch")):
        self.write_outputs(self.test_dir, fw)

        # Rewrite the files to drop advantages and value_targets for
        # testing.
        for path in glob.glob(self.test_dir + fw + "/*.json"):
            out = []
            with open(path) as f:
                for line in f.readlines():
                    data = json.loads(line)
                    # Data won't contain rewards as these are not included
                    # in the write_outputs run (not needed in the
                    # SampleBatch). Flip out "rewards" for "advantages"
                    # just for testing.
                    data["rewards"] = data["advantages"]
                    del data["advantages"]
                    if "value_targets" in data:
                        del data["value_targets"]
                    out.append(data)
            with open(path, "w") as f:
                for data in out:
                    f.write(json.dumps(data))

        agent = PGTrainer(
            env="CartPole-v0",
            config={
                "input": self.test_dir + fw,
                "input_evaluation": [],
                "postprocess_inputs": True,  # adds back 'advantages'
                "framework": fw,
            },
        )
        result = agent.train()
        self.assertEqual(result["timesteps_total"], 250)  # read from input
        self.assertTrue(np.isnan(result["episode_reward_mean"]))
def test_invalid_model2(self):
    ModelCatalog.register_custom_model("invalid2", InvalidModel2)
    self.assertRaisesRegex(
        ValueError,
        "State output is not a list",
        lambda: PGTrainer(
            env="CartPole-v0",
            config={
                "model": {
                    "custom_model": "invalid2",
                },
                "framework": "tf",
            },
        ),
    )
def test_query_evaluators(self):
    register_env("test", lambda _: gym.make("CartPole-v0"))
    for fw in framework_iterator(frameworks=("torch", "tf")):
        pg = PGTrainer(
            env="test",
            config={
                "num_workers": 2,
                "rollout_fragment_length": 5,
                "num_envs_per_worker": 2,
                "framework": fw,
                "create_env_on_driver": True,
            },
        )
        results = pg.workers.foreach_worker(
            lambda ev: ev.rollout_fragment_length
        )
        results2 = pg.workers.foreach_worker_with_index(
            lambda ev, i: (i, ev.rollout_fragment_length)
        )
        results3 = pg.workers.foreach_worker(
            lambda ev: ev.foreach_env(lambda env: 1)
        )
        # Each worker's effective fragment length is
        # rollout_fragment_length * num_envs_per_worker (5 * 2 = 10);
        # foreach_worker() covers the local worker plus the 2 remote ones.
        self.assertEqual(results, [10, 10, 10])
        self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)])
        self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]])
        pg.stop()
def test_invalid_model(self):
    ModelCatalog.register_custom_model("invalid", InvalidModel)
    self.assertRaisesRegex(
        ValueError,
        "Subclasses of TorchModelV2 must also inherit from nn.Module",
        lambda: PGTrainer(
            env="CartPole-v0",
            config={
                "model": {
                    "custom_model": "invalid",
                },
                "framework": "torch",
            },
        ),
    )
def test_no_step_on_init(self):
    register_env("fail", lambda _: FailOnStepEnv())
    for fw in framework_iterator():
        # We expect this to fail already on Trainer init due
        # to the env sanity check right after env creation (inside
        # RolloutWorker).
        self.assertRaises(
            Exception,
            lambda: PGTrainer(
                env="fail",
                config={
                    "num_workers": 2,
                    "framework": fw,
                },
            ),
        )
def test_train_multi_agent_cartpole_multi_policy(self):
    n = 10
    register_env(
        "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": n})
    )

    def gen_policy():
        config = {
            "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
            "n_step": random.choice([1, 2, 3, 4, 5]),
        }
        return PolicySpec(config=config)

    pg = PGTrainer(
        env="multi_agent_cartpole",
        config={
            "num_workers": 0,
            "multiagent": {
                "policies": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": lambda aid, **kwargs: "policy_1",
            },
            "framework": "tf",
        },
    )

    # Just check that it runs without crashing.
    for i in range(10):
        result = pg.train()
        print(
            "Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]
            )
        )
    self.assertTrue(
        pg.compute_single_action([0, 0, 0, 0], policy_id="policy_1") in [0, 1]
    )
    self.assertTrue(
        pg.compute_single_action([0, 0, 0, 0], policy_id="policy_2") in [0, 1]
    )
    self.assertRaisesRegex(
        KeyError,
        "not found in PolicyMap",
        lambda: pg.compute_single_action([0, 0, 0, 0], policy_id="policy_3"),
    )
def test_multi_agent_complex_spaces(self):
    ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
    ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
    register_env("nested_ma", lambda _: NestedMultiAgentEnv())
    act_space = spaces.Discrete(2)
    pg = PGTrainer(
        env="nested_ma",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "multiagent": {
                "policies": {
                    "tuple_policy": (
                        None,
                        TUPLE_SPACE,
                        act_space,
                        {"model": {"custom_model": "tuple_spy"}},
                    ),
                    "dict_policy": (
                        None,
                        DICT_SPACE,
                        act_space,
                        {"model": {"custom_model": "dict_spy"}},
                    ),
                },
                "policy_mapping_fn": lambda aid, **kwargs: {
                    "tuple_agent": "tuple_policy",
                    "dict_agent": "dict_policy",
                }[aid],
            },
            "framework": "tf",
            "disable_env_checking": True,
        },
    )
    # Skip first passes as they came from the TorchPolicy loss
    # initialization.
    TupleSpyModel.capture_index = DictSpyModel.capture_index = 0
    pg.train()

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("d_spy_in_{}".format(i))
        )
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = DICT_SAMPLES[i]["inner_state"]["job_status"]["task"]
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("t_spy_in_{}".format(i))
        )
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = TUPLE_SAMPLES[i][2]
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)
def test_rollout_dict_space(self):
    register_env("nested", lambda _: NestedDictEnv())
    agent = PGTrainer(env="nested", config={"framework": "tf"})
    agent.train()
    path = agent.save()
    agent.stop()

    # Test that training works after restoring from a checkpoint.
    agent2 = PGTrainer(env="nested", config={"framework": "tf"})
    agent2.restore(path)
    agent2.train()

    # Test that rollout works on the restored agent.
    rollout(agent2, "nested", 100)