def custom_loss(self, policy_loss, loss_inputs):
    # create a new input reader per worker
    reader = JsonReader(self.options["custom_options"]["input_files"])
    input_ops = reader.tf_input_ops()

    # define a secondary loss by building a graph copy with weight sharing
    with tf.variable_scope(
            self.scope, reuse=tf.AUTO_REUSE, auxiliary_name_scope=False):
        logits, _ = self._build_layers_v2({
            "obs": restore_original_dimensions(input_ops["obs"],
                                               self.obs_space)
        }, self.num_outputs, self.options)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # compute the IL loss
    action_dist = Categorical(logits)
    self.policy_loss = policy_loss
    self.imitation_loss = tf.reduce_mean(
        -action_dist.logp(input_ops["actions"]))
    return policy_loss + 10 * self.imitation_loss
def custom_loss(self, policy_loss, loss_inputs):
    # create a new input reader per worker
    reader = JsonReader(
        self.model_config["custom_model_config"]["input_files"])
    input_ops = reader.tf_input_ops(
        self.model_config["custom_model_config"].get("expert_size", 1))

    # define a secondary loss by building a graph copy with weight sharing
    obs = restore_original_dimensions(
        tf.cast(input_ops["obs"], tf.float32), self.obs_space)
    logits, _ = self.forward({"obs": obs}, [], None)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    # print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # compute the IL loss
    self.policy_loss = policy_loss

    (action_scores, model_logits, dist) = self.get_q_value_distributions(
        logits)
    model_logits = tf.squeeze(model_logits)
    action_dist = Categorical(model_logits, self.model_config)

    expert_logits = tf.cast(input_ops["actions"], tf.int32)
    expert_action = tf.math.argmax(expert_logits)
    expert_action_one_hot = tf.one_hot(expert_action, self.num_outputs)
    model_action = action_dist.deterministic_sample()
    model_action_one_hot = tf.one_hot(model_action, self.num_outputs)
    model_expert = model_action_one_hot * expert_action_one_hot

    imitation_loss = 0
    loss_type = self.model_config["custom_model_config"].get("loss", "ce")
    if loss_type == "ce":
        imitation_loss = tf.reduce_mean(-action_dist.logp(expert_logits))
    elif loss_type == "kl":
        expert_dist = Categorical(
            tf.one_hot(expert_logits, self.num_outputs), self.model_config)
        imitation_loss = tf.reduce_mean(-action_dist.kl(expert_dist))
    elif loss_type == "dqfd":
        max_value = float("-inf")
        Q_select = model_logits
        # TODO: difference in action_scores, dist and logits
        for a in range(self.num_outputs):
            max_value = tf.maximum(
                Q_select[a] + 0.8 * tf.cast(model_expert[a], tf.float32),
                max_value)
        imitation_loss = tf.reduce_mean(
            1 * (max_value - Q_select[tf.cast(expert_action, tf.int32)]))

    self.imitation_loss = imitation_loss
    total_loss = (
        self.model_config["custom_model_config"]["lambda1"] * policy_loss
        + self.model_config["custom_model_config"]["lambda2"]
        * self.imitation_loss)
    return total_loss
def test_agent_output_infos(self):
    """Verify that the infos dictionary is written to the output files.

    Note, with torch this is always the case.
    """
    output_config = {"store_infos": True}
    for fw in framework_iterator(frameworks=("torch", "tf")):
        self.write_outputs(self.test_dir, fw, output_config=output_config)
        self.assertEqual(len(os.listdir(self.test_dir + fw)), 1)
        reader = JsonReader(self.test_dir + fw + "/*.json")
        data = reader.next()
        assert "infos" in data
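# A hedged sketch (not part of the test above) of how the `store_infos` flag
# is assumed to reach the JSON writer through an RLlib config: the offline
# output settings live under the top-level `output` / `output_config` keys.
# Exact key names can differ between RLlib versions.
example_output_config = {
    "output": "/tmp/cartpole-out",           # directory for JSON output files
    "output_config": {"store_infos": True},  # also serialize the infos column
}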
def __init__(
    self, obs_space, action_space, num_outputs, model_config, name, input_files
):
    super().__init__(obs_space, action_space, num_outputs, model_config, name)
    nn.Module.__init__(self)

    self.input_files = input_files
    # Create a new input reader per worker.
    self.reader = JsonReader(self.input_files)
    self.fcnet = TorchFC(
        self.obs_space, self.action_space, num_outputs, model_config, name="fcnet"
    )
def __init__(self,
             inputs,
             num_samples_per_task=0,
             policy_id="human_0",
             transform=None,
             target_transform=None,
             dataset_transform=None):
    super(Behaviour, self).__init__(
        meta_split='train',
        target_transform=target_transform,
        dataset_transform=dataset_transform)
    self.transform = transform

    self._datasets = []
    for input_path in inputs:
        # Cache all samples in rllib, to ensure stochastic loading
        data_paths = [
            osp.join(input_path, f) for f in os.listdir(input_path)
        ]
        print(f"Loading from {input_path}: {len(data_paths)} files")
        # dataset = ShuffledInput(
        #     JsonReader(data_paths), n=num_samples_per_task)
        dataset = ShuffledInput(JsonReader(data_paths))
        self._datasets.append(dataset)

    self.num_tasks = len(inputs)
    self.num_samples_per_task = num_samples_per_task
    self.policy_id = policy_id
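# Hedged sketch (not from the original class): both JsonReader and its
# ShuffledInput wrapper expose the InputReader API, so one task's samples can
# be drawn from the `_datasets` list built above. `_sample_task` and its
# `task_index` argument are hypothetical and shown only for illustration.
def _sample_task(self, task_index):
    # Returns a SampleBatch (or MultiAgentBatch) read from the JSON files.
    return self._datasets[task_index].next()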
def test_multiple_output_workers(self):
    ray.shutdown()
    ray.init(num_cpus=4, ignore_reinit_error=True)
    for fw in framework_iterator(frameworks=["tf", "torch"]):
        agent = PG(
            env="CartPole-v0",
            config={
                "num_workers": 2,
                "output": self.test_dir + fw,
                "rollout_fragment_length": 250,
                "framework": fw,
            },
        )
        agent.train()
        self.assertEqual(len(os.listdir(self.test_dir + fw)), 2)
        reader = JsonReader(self.test_dir + fw + "/*.json")
        reader.next()
def testReadWrite(self):
    ioctx = IOContext(self.test_dir, {}, 0, None)
    writer = JsonWriter(
        self.test_dir, ioctx, max_file_size=5000, compress_columns=["obs"])
    for i in range(100):
        writer.write(make_sample_batch(i))
    reader = JsonReader(self.test_dir + "/*.json")
    seen_a = set()
    seen_o = set()
    for i in range(1000):
        batch = reader.next()
        seen_a.add(batch["actions"][0])
        seen_o.add(batch["obs"][0])
    self.assertGreater(len(seen_a), 90)
    self.assertLess(len(seen_a), 101)
    self.assertGreater(len(seen_o), 90)
    self.assertLess(len(seen_o), 101)
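# The io tests above and below rely on two helpers defined elsewhere in the
# test module. A minimal sketch of what they are assumed to look like (the
# real `make_sample_batch` may fill in more columns); `_to_json` is assumed
# to be the serializer from ray.rllib.offline.json_writer.
import numpy as np

from ray.rllib.offline.json_writer import _to_json  # noqa: F401
from ray.rllib.policy.sample_batch import SampleBatch


def make_sample_batch(i):
    # A one-row batch whose obs/action values identify the write index `i`.
    return SampleBatch({"obs": np.array([i]), "actions": np.array([i])})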
def testAbortOnAllEmptyInputs(self):
    open(self.test_dir + "/empty", "w").close()
    reader = JsonReader([
        self.test_dir + "/empty",
    ])
    self.assertRaises(ValueError, lambda: reader.next())

    with open(self.test_dir + "/empty1", "w") as f:
        for _ in range(100):
            f.write("\n")
    with open(self.test_dir + "/empty2", "w") as f:
        for _ in range(100):
            f.write("\n")
    reader = JsonReader([
        self.test_dir + "/empty1",
        self.test_dir + "/empty2",
    ])
    self.assertRaises(ValueError, lambda: reader.next())
def test_skips_over_empty_lines_and_files(self):
    open(self.test_dir + "/empty", "w").close()
    with open(self.test_dir + "/f1", "w") as f:
        f.write("\n")
        f.write("\n")
        f.write(_to_json(make_sample_batch(0), []))
    with open(self.test_dir + "/f2", "w") as f:
        f.write(_to_json(make_sample_batch(1), []))
        f.write("\n")
    reader = JsonReader([
        self.test_dir + "/empty",
        self.test_dir + "/f1",
        "file://" + self.test_dir + "/f2",
    ])
    seen_a = set()
    for i in range(100):
        batch = reader.next()
        seen_a.add(batch["actions"][0])
    self.assertEqual(len(seen_a), 2)
def test_skips_over_corrupted_lines(self):
    with open(self.test_dir + "/f1", "w") as f:
        f.write(_to_json(make_sample_batch(0), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(1), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(2), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(3), []))
        f.write("\n")
        f.write("{..corrupted_json_record")
    reader = JsonReader([
        self.test_dir + "/f1",
    ])
    seen_a = set()
    for i in range(10):
        batch = reader.next()
        seen_a.add(batch["actions"][0])
    self.assertEqual(len(seen_a), 4)
def testSkipsOverCorruptedLines(self):
    with open(self.test_dir + "/f1", "w") as f:
        f.write(_to_json(make_sample_batch(0), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(1), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(2), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(3), []))
        f.write("\n")
        f.write("{..corrupted_json_record")
    reader = JsonReader([
        self.test_dir + "/f1",
    ])
    seen_a = set()
    for i in range(10):
        batch = reader.next()
        seen_a.add(batch["actions"][0])
    self.assertEqual(len(seen_a), 4)
def testSkipsOverEmptyLinesAndFiles(self):
    open(self.test_dir + "/empty", "w").close()
    with open(self.test_dir + "/f1", "w") as f:
        f.write("\n")
        f.write("\n")
        f.write(_to_json(make_sample_batch(0), []))
    with open(self.test_dir + "/f2", "w") as f:
        f.write(_to_json(make_sample_batch(1), []))
        f.write("\n")
    reader = JsonReader([
        self.test_dir + "/empty",
        self.test_dir + "/f1",
        "file:" + self.test_dir + "/f2",
    ])
    seen_a = set()
    for i in range(100):
        batch = reader.next()
        seen_a.add(batch["actions"][0])
    self.assertEqual(len(seen_a), 2)
def testSkipsOverEmptyLinesAndFiles(self):
    ioctx = IOContext(self.test_dir, {}, 0, None)
    open(self.test_dir + "/empty", "w").close()
    with open(self.test_dir + "/f1", "w") as f:
        f.write("\n")
        f.write("\n")
        f.write(_to_json(make_sample_batch(0), []))
    with open(self.test_dir + "/f2", "w") as f:
        f.write(_to_json(make_sample_batch(1), []))
        f.write("\n")
    reader = JsonReader(ioctx, [
        self.test_dir + "/empty",
        self.test_dir + "/f1",
        "file:" + self.test_dir + "/f2",
    ])
    seen_a = set()
    for i in range(100):
        batch = reader.next()
        seen_a.add(batch["actions"][0])
    self.assertEqual(len(seen_a), 2)
def testSkipsOverCorruptedLines(self):
    ioctx = IOContext(self.test_dir, {}, 0, None)
    with open(self.test_dir + "/f1", "w") as f:
        f.write(_to_json(make_sample_batch(0), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(1), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(2), []))
        f.write("\n")
        f.write(_to_json(make_sample_batch(3), []))
        f.write("\n")
        f.write("{..corrupted_json_record")
    reader = JsonReader(ioctx, [
        self.test_dir + "/f1",
    ])
    seen_a = set()
    for i in range(10):
        batch = reader.next()
        seen_a.add(batch["actions"][0])
    self.assertEqual(len(seen_a), 4)
def custom_loss(self, policy_loss, loss_inputs):
    # Create a new input reader per worker.
    reader = JsonReader(self.model_config["custom_model_config"]["input_files"])
    input_ops = reader.tf_input_ops()

    # Define a secondary loss by building a graph copy with weight sharing.
    obs = restore_original_dimensions(
        tf.cast(input_ops["obs"], tf.float32), self.obs_space
    )
    logits, _ = self.forward({"obs": obs}, [], None)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # Compute the IL loss.
    action_dist = Categorical(logits, self.model_config)
    self.policy_loss = policy_loss
    self.imitation_loss = tf.reduce_mean(-action_dist.logp(input_ops["actions"]))

    return policy_loss + 10 * self.imitation_loss
def custom_loss(self, policy_loss, loss_inputs):
    # create a new input reader per worker
    reader = JsonReader(self.options["custom_options"]["input_files"])
    input_ops = reader.tf_input_ops()

    # define a secondary loss by building a graph copy with weight sharing
    logits, _ = self._build_layers_v2({
        "obs": restore_original_dimensions(input_ops["obs"], self.obs_space)
    }, self.num_outputs, self.options)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # compute the IL loss
    action_dist = Categorical(logits)
    self.policy_loss = policy_loss
    self.imitation_loss = tf.reduce_mean(
        -action_dist.logp(input_ops["actions"]))
    return policy_loss + 10 * self.imitation_loss
def test_abort_on_all_empty_inputs(self):
    open(self.test_dir + "/empty", "w").close()
    reader = JsonReader([
        self.test_dir + "/empty",
    ])
    self.assertRaises(ValueError, lambda: reader.next())

    with open(self.test_dir + "/empty1", "w") as f:
        for _ in range(100):
            f.write("\n")
    with open(self.test_dir + "/empty2", "w") as f:
        for _ in range(100):
            f.write("\n")
    reader = JsonReader([
        self.test_dir + "/empty1",
        self.test_dir + "/empty2",
    ])
    self.assertRaises(ValueError, lambda: reader.next())
def testAbortOnAllEmptyInputs(self):
    ioctx = IOContext(self.test_dir, {}, 0, None)
    open(self.test_dir + "/empty", "w").close()
    reader = JsonReader(ioctx, [
        self.test_dir + "/empty",
    ])
    self.assertRaises(ValueError, lambda: reader.next())

    with open(self.test_dir + "/empty1", "w") as f:
        for _ in range(100):
            f.write("\n")
    with open(self.test_dir + "/empty2", "w") as f:
        for _ in range(100):
            f.write("\n")
    reader = JsonReader(ioctx, [
        self.test_dir + "/empty1",
        self.test_dir + "/empty2",
    ])
    self.assertRaises(ValueError, lambda: reader.next())
def _make_evaluator(self, cls, env_creator, policy_graph, worker_index,
                    config):
    def session_creator():
        logger.debug("Creating TF session {}".format(
            config["tf_session_args"]))
        return tf.Session(config=tf.ConfigProto(
            **config["tf_session_args"]))

    if isinstance(config["input"], FunctionType):
        input_creator = config["input"]
    elif config["input"] == "sampler":
        input_creator = (lambda ioctx: ioctx.default_sampler_input())
    elif isinstance(config["input"], dict):
        input_creator = (
            lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))
    else:
        input_creator = (
            lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))

    if isinstance(config["output"], FunctionType):
        output_creator = config["output"]
    elif config["output"] is None:
        output_creator = (lambda ioctx: NoopOutput())
    elif config["output"] == "logdir":
        output_creator = (lambda ioctx: JsonWriter(
            ioctx.log_dir,
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))
    else:
        output_creator = (lambda ioctx: JsonWriter(
            config["output"],
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))

    if config["input"] == "sampler":
        input_evaluation = []
    else:
        input_evaluation = config["input_evaluation"]

    return cls(
        env_creator,
        self.config["multiagent"]["policy_graphs"] or policy_graph,
        policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
        policies_to_train=self.config["multiagent"]["policies_to_train"],
        tf_session_creator=(session_creator
                            if config["tf_session_args"] else None),
        batch_steps=config["sample_batch_size"],
        batch_mode=config["batch_mode"],
        episode_horizon=config["horizon"],
        preprocessor_pref=config["preprocessor_pref"],
        sample_async=config["sample_async"],
        compress_observations=config["compress_observations"],
        num_envs=config["num_envs_per_worker"],
        observation_filter=config["observation_filter"],
        clip_rewards=config["clip_rewards"],
        clip_actions=config["clip_actions"],
        env_config=config["env_config"],
        model_config=config["model"],
        policy_config=config,
        worker_index=worker_index,
        monitor_path=self.logdir if config["monitor"] else None,
        log_dir=self.logdir,
        log_level=config["log_level"],
        callbacks=config["callbacks"],
        input_creator=input_creator,
        input_evaluation=input_evaluation,
        output_creator=output_creator,
        remote_worker_envs=config["remote_worker_envs"],
        async_remote_worker_envs=config["async_remote_worker_envs"])
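# For reference, a hedged sketch of the offline-data config values that the
# input/output branches above dispatch on (paths and ratios are illustrative
# only, not taken from the source):
example_offline_config = {
    # A plain path or glob takes the ShuffledInput(JsonReader(...)) branch:
    # "input": "/tmp/cartpole-out",
    # A dict of source -> sampling ratio takes the MixedInput branch:
    "input": {"sampler": 0.4, "/tmp/cartpole-out": 0.6},
    "shuffle_buffer_size": 1000,
    # "logdir" writes JSON experience files next to the trainer's log dir.
    "output": "logdir",
}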
def testAgentOutputOk(self):
    self.writeOutputs(self.test_dir)
    self.assertEqual(len(os.listdir(self.test_dir)), 1)
    reader = JsonReader(self.test_dir + "/*.json")
    reader.next()
def _make_worker(
        self,
        *,
        cls: Callable,
        env_creator: Callable[[EnvContext], EnvType],
        validate_env: Optional[Callable[[EnvType], None]],
        policy_cls: Type[Policy],
        worker_index: int,
        num_workers: int,
        config: TrainerConfigDict,
        spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                                              gym.spaces.Space]]] = None,
) -> Union[RolloutWorker, "ActorHandle"]:
    def session_creator():
        logger.debug("Creating TF session {}".format(
            config["tf_session_args"]))
        return tf1.Session(config=tf1.ConfigProto(
            **config["tf_session_args"]))

    if isinstance(config["input"], FunctionType):
        input_creator = config["input"]
    elif config["input"] == "sampler":
        input_creator = (lambda ioctx: ioctx.default_sampler_input())
    elif isinstance(config["input"], dict):
        input_creator = (
            lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))
    else:
        input_creator = (
            lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))

    if isinstance(config["output"], FunctionType):
        output_creator = config["output"]
    elif config["output"] is None:
        output_creator = (lambda ioctx: NoopOutput())
    elif config["output"] == "logdir":
        output_creator = (lambda ioctx: JsonWriter(
            ioctx.log_dir,
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))
    else:
        output_creator = (lambda ioctx: JsonWriter(
            config["output"],
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))

    if config["input"] == "sampler":
        input_evaluation = []
    else:
        input_evaluation = config["input_evaluation"]

    # Fill in the default policy_cls if 'None' is specified in multiagent.
    if config["multiagent"]["policies"]:
        tmp = config["multiagent"]["policies"]
        _validate_multiagent_config(tmp, allow_none_graph=True)
        # TODO: (sven) Allow for setting observation and action spaces to
        #  None as well, in which case, spaces are taken from env.
        #  It's tedious to have to provide these in a multi-agent config.
        for k, v in tmp.items():
            if v[0] is None:
                tmp[k] = (policy_cls, v[1], v[2], v[3])
        policy_spec = tmp
    # Otherwise, policy spec is simply the policy class itself.
    else:
        policy_spec = policy_cls

    if worker_index == 0:
        extra_python_environs = config.get(
            "extra_python_environs_for_driver", None)
    else:
        extra_python_environs = config.get(
            "extra_python_environs_for_worker", None)

    worker = cls(
        env_creator=env_creator,
        validate_env=validate_env,
        policy_spec=policy_spec,
        policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
        policies_to_train=config["multiagent"]["policies_to_train"],
        tf_session_creator=(session_creator
                            if config["tf_session_args"] else None),
        rollout_fragment_length=config["rollout_fragment_length"],
        batch_mode=config["batch_mode"],
        episode_horizon=config["horizon"],
        preprocessor_pref=config["preprocessor_pref"],
        sample_async=config["sample_async"],
        compress_observations=config["compress_observations"],
        num_envs=config["num_envs_per_worker"],
        observation_fn=config["multiagent"]["observation_fn"],
        observation_filter=config["observation_filter"],
        clip_rewards=config["clip_rewards"],
        clip_actions=config["clip_actions"],
        env_config=config["env_config"],
        model_config=config["model"],
        policy_config=config,
        worker_index=worker_index,
        num_workers=num_workers,
        monitor_path=self._logdir if config["monitor"] else None,
        log_dir=self._logdir,
        log_level=config["log_level"],
        callbacks=config["callbacks"],
        input_creator=input_creator,
        input_evaluation=input_evaluation,
        output_creator=output_creator,
        remote_worker_envs=config["remote_worker_envs"],
        remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
        soft_horizon=config["soft_horizon"],
        no_done_at_end=config["no_done_at_end"],
        seed=(config["seed"] + worker_index)
        if config["seed"] is not None else None,
        fake_sampler=config["fake_sampler"],
        extra_python_environs=extra_python_environs,
        spaces=spaces,
    )
    return worker
def test_agent_output_ok(self):
    for fw in framework_iterator(frameworks=("torch", "tf")):
        self.write_outputs(self.test_dir, fw)
        self.assertEqual(len(os.listdir(self.test_dir + fw)), 1)
        reader = JsonReader(self.test_dir + fw + "/*.json")
        reader.next()
def _make_worker(
        self,
        *,
        cls: Callable,
        env_creator: Callable[[EnvContext], EnvType],
        validate_env: Optional[Callable[[EnvType], None]],
        policy_cls: Type[Policy],
        worker_index: int,
        num_workers: int,
        config: TrainerConfigDict,
        spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                                              gym.spaces.Space]]] = None,
) -> Union[RolloutWorker, ActorHandle]:
    def session_creator():
        logger.debug("Creating TF session {}".format(
            config["tf_session_args"]))
        return tf1.Session(
            config=tf1.ConfigProto(**config["tf_session_args"]))

    def valid_module(class_path):
        if isinstance(class_path, str) and "." in class_path:
            module_path, class_name = class_path.rsplit(".", 1)
            try:
                spec = importlib.util.find_spec(module_path)
                if spec is not None:
                    return True
            except (ModuleNotFoundError, ValueError):
                print(
                    f"module {module_path} not found while trying to get "
                    f"input {class_path}")
        return False

    if isinstance(config["input"], FunctionType):
        input_creator = config["input"]
    elif config["input"] == "sampler":
        input_creator = (lambda ioctx: ioctx.default_sampler_input())
    elif isinstance(config["input"], dict):
        input_creator = (
            lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))
    elif isinstance(config["input"], str) and \
            registry_contains_input(config["input"]):
        input_creator = registry_get_input(config["input"])
    elif "d4rl" in config["input"]:
        env_name = config["input"].split(".")[-1]
        input_creator = (lambda ioctx: D4RLReader(env_name, ioctx))
    elif valid_module(config["input"]):
        input_creator = (lambda ioctx: ShuffledInput(from_config(
            config["input"], ioctx=ioctx)))
    else:
        input_creator = (
            lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))

    if isinstance(config["output"], FunctionType):
        output_creator = config["output"]
    elif config["output"] is None:
        output_creator = (lambda ioctx: NoopOutput())
    elif config["output"] == "logdir":
        output_creator = (lambda ioctx: JsonWriter(
            ioctx.log_dir,
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))
    else:
        output_creator = (lambda ioctx: JsonWriter(
            config["output"],
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))

    if config["input"] == "sampler":
        input_evaluation = []
    else:
        input_evaluation = config["input_evaluation"]

    # Assert everything is correct in "multiagent" config dict (if given).
    ma_policies = config["multiagent"]["policies"]
    if ma_policies:
        for pid, policy_spec in ma_policies.copy().items():
            assert isinstance(policy_spec, (PolicySpec, list, tuple))
            # Class is None -> Use `policy_cls`.
            if policy_spec.policy_class is None:
                ma_policies[pid] = ma_policies[pid]._replace(
                    policy_class=policy_cls)
        policies = ma_policies
    # Create a policy_spec (MultiAgentPolicyConfigDict),
    # even if no "multiagent" setup given by user.
    else:
        policies = policy_cls

    if worker_index == 0:
        extra_python_environs = config.get(
            "extra_python_environs_for_driver", None)
    else:
        extra_python_environs = config.get(
            "extra_python_environs_for_worker", None)

    worker = cls(
        env_creator=env_creator,
        validate_env=validate_env,
        policy_spec=policies,
        policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
        policies_to_train=config["multiagent"]["policies_to_train"],
        tf_session_creator=(session_creator
                            if config["tf_session_args"] else None),
        rollout_fragment_length=config["rollout_fragment_length"],
        count_steps_by=config["multiagent"]["count_steps_by"],
        batch_mode=config["batch_mode"],
        episode_horizon=config["horizon"],
        preprocessor_pref=config["preprocessor_pref"],
        sample_async=config["sample_async"],
        compress_observations=config["compress_observations"],
        num_envs=config["num_envs_per_worker"],
        observation_fn=config["multiagent"]["observation_fn"],
        observation_filter=config["observation_filter"],
        clip_rewards=config["clip_rewards"],
        normalize_actions=config["normalize_actions"],
        clip_actions=config["clip_actions"],
        env_config=config["env_config"],
        policy_config=config,
        worker_index=worker_index,
        num_workers=num_workers,
        record_env=config["record_env"],
        log_dir=self._logdir,
        log_level=config["log_level"],
        callbacks=config["callbacks"],
        input_creator=input_creator,
        input_evaluation=input_evaluation,
        output_creator=output_creator,
        remote_worker_envs=config["remote_worker_envs"],
        remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
        soft_horizon=config["soft_horizon"],
        no_done_at_end=config["no_done_at_end"],
        seed=(config["seed"] + worker_index)
        if config["seed"] is not None else None,
        fake_sampler=config["fake_sampler"],
        extra_python_environs=extra_python_environs,
        spaces=spaces,
    )
    return worker
def test_marwil_loss_function(self):
    """
    To generate the historic data used in this test case, first run:
    $ ./train.py --run=PPO --env=CartPole-v0 \
      --stop='{"timesteps_total": 50000}' \
      --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
    """
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/cartpole/small.json")
    print("data_file={} exists={}".format(data_file,
                                          os.path.isfile(data_file)))

    config = (
        marwil.MARWILConfig().rollouts(num_rollout_workers=0).offline_data(
            input_=[data_file]))

    # Learn from offline data.
    for fw, sess in framework_iterator(config, session=True):
        reader = JsonReader(inputs=[data_file])
        batch = reader.next()

        trainer = config.build(env="CartPole-v0")
        policy = trainer.get_policy()
        model = policy.model

        # Calculate our own expected values (to then compare against the
        # agent's loss output).
        cummulative_rewards = compute_advantages(
            batch, 0.0, config.gamma, 1.0, False, False)["advantages"]
        if fw == "torch":
            cummulative_rewards = torch.tensor(cummulative_rewards)
        if fw != "tf":
            batch = policy._lazy_tensor_dict(batch)
        model_out, _ = model(batch)
        vf_estimates = model.value_function()
        if fw == "tf":
            model_out, vf_estimates = policy.get_session().run(
                [model_out, vf_estimates])
        adv = cummulative_rewards - vf_estimates
        if fw == "torch":
            adv = adv.detach().cpu().numpy()
        adv_squared = np.mean(np.square(adv))
        c_2 = 100.0 + 1e-8 * (adv_squared - 100.0)
        c = np.sqrt(c_2)
        exp_advs = np.exp(config.beta * (adv / c))
        dist = policy.dist_class(model_out, model)
        logp = dist.logp(batch["actions"])
        if fw == "torch":
            logp = logp.detach().cpu().numpy()
        elif fw == "tf":
            logp = sess.run(logp)
        # Calculate all expected loss components.
        expected_vf_loss = 0.5 * adv_squared
        expected_pol_loss = -1.0 * np.mean(exp_advs * logp)
        expected_loss = expected_pol_loss + config.vf_coeff * expected_vf_loss

        # Calculate the algorithm's loss (to check against our own
        # calculation above).
        batch.set_get_interceptor(None)
        postprocessed_batch = policy.postprocess_trajectory(batch)
        loss_func = (MARWILTF2Policy.loss
                     if fw != "torch" else MARWILTorchPolicy.loss)
        if fw != "tf":
            policy._lazy_tensor_dict(postprocessed_batch)
            loss_out = loss_func(policy, model, policy.dist_class,
                                 postprocessed_batch)
        else:
            loss_out, v_loss, p_loss = policy.get_session().run(
                # policy._loss is created by TFPolicy, and is basically the
                # loss tensor of the static graph.
                [
                    policy._loss,
                    policy._marwil_loss.v_loss,
                    policy._marwil_loss.p_loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    postprocessed_batch, shuffle=False),
            )

        # Check all components.
        if fw == "torch":
            check(policy.v_loss, expected_vf_loss, decimals=4)
            check(policy.p_loss, expected_pol_loss, decimals=4)
        elif fw == "tf":
            check(v_loss, expected_vf_loss, decimals=4)
            check(p_loss, expected_pol_loss, decimals=4)
        else:
            check(policy._marwil_loss.v_loss, expected_vf_loss, decimals=4)
            check(policy._marwil_loss.p_loss, expected_pol_loss, decimals=4)
        check(loss_out, expected_loss, decimals=3)
def _make_worker(self, cls, env_creator, policy, worker_index, config):
    def session_creator():
        logger.debug("Creating TF session {}".format(
            config["tf_session_args"]))
        return tf.Session(config=tf.ConfigProto(
            **config["tf_session_args"]))

    if isinstance(config["input"], FunctionType):
        input_creator = config["input"]
    elif config["input"] == "sampler":
        input_creator = (lambda ioctx: ioctx.default_sampler_input())
    elif isinstance(config["input"], dict):
        input_creator = (
            lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))
    else:
        input_creator = (
            lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))

    if isinstance(config["output"], FunctionType):
        output_creator = config["output"]
    elif config["output"] is None:
        output_creator = (lambda ioctx: NoopOutput())
    elif config["output"] == "logdir":
        output_creator = (lambda ioctx: JsonWriter(
            ioctx.log_dir,
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))
    else:
        output_creator = (lambda ioctx: JsonWriter(
            config["output"],
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))

    if config["input"] == "sampler":
        input_evaluation = []
    else:
        input_evaluation = config["input_evaluation"]

    # Fill in the default policy if 'None' is specified in multiagent
    if config["multiagent"]["policies"]:
        tmp = config["multiagent"]["policies"]
        _validate_multiagent_config(tmp, allow_none_graph=True)
        for k, v in tmp.items():
            if v[0] is None:
                tmp[k] = (policy, v[1], v[2], v[3])
        policy = tmp

    return cls(
        env_creator,
        policy,
        policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
        policies_to_train=config["multiagent"]["policies_to_train"],
        tf_session_creator=(session_creator
                            if config["tf_session_args"] else None),
        batch_steps=config["sample_batch_size"],
        batch_mode=config["batch_mode"],
        episode_horizon=config["horizon"],
        preprocessor_pref=config["preprocessor_pref"],
        sample_async=config["sample_async"],
        compress_observations=config["compress_observations"],
        num_envs=config["num_envs_per_worker"],
        observation_filter=config["observation_filter"],
        clip_rewards=config["clip_rewards"],
        clip_actions=config["clip_actions"],
        env_config=config["env_config"],
        model_config=config["model"],
        policy_config=config,
        worker_index=worker_index,
        monitor_path=self._logdir if config["monitor"] else None,
        log_dir=self._logdir,
        log_level=config["log_level"],
        callbacks=config["callbacks"],
        input_creator=input_creator,
        input_evaluation=input_evaluation,
        output_creator=output_creator,
        remote_worker_envs=config["remote_worker_envs"],
        remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
        soft_horizon=config["soft_horizon"],
        no_done_at_end=config["no_done_at_end"],
        seed=(config["seed"] + worker_index)
        if config["seed"] is not None else None,
        _fake_sampler=config.get("_fake_sampler", False))
def _make_worker(
        self,
        *,
        cls: Callable,
        env_creator: EnvCreator,
        validate_env: Optional[Callable[[EnvType], None]],
        policy_cls: Type[Policy],
        worker_index: int,
        num_workers: int,
        recreated_worker: bool = False,
        config: AlgorithmConfigDict,
        spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                                              gym.spaces.Space]]] = None,
) -> Union[RolloutWorker, ActorHandle]:
    def session_creator():
        logger.debug("Creating TF session {}".format(
            config["tf_session_args"]))
        return tf1.Session(config=tf1.ConfigProto(
            **config["tf_session_args"]))

    def valid_module(class_path):
        if (isinstance(class_path, str) and not os.path.isfile(class_path)
                and "." in class_path):
            module_path, class_name = class_path.rsplit(".", 1)
            try:
                spec = importlib.util.find_spec(module_path)
                if spec is not None:
                    return True
            except (ModuleNotFoundError, ValueError):
                print(
                    f"module {module_path} not found while trying to get "
                    f"input {class_path}")
        return False

    # A callable returning an InputReader object to use.
    if isinstance(config["input"], FunctionType):
        input_creator = config["input"]
    # Use RLlib's Sampler classes (SyncSampler or AsynchSampler, depending
    # on `config.sample_async` setting).
    elif config["input"] == "sampler":
        input_creator = lambda ioctx: ioctx.default_sampler_input()
    # Ray Dataset input -> Use `config.input_config` to construct DatasetReader.
    elif config["input"] == "dataset":
        # Input dataset shards should have already been prepared.
        # We just need to take the proper shard here.
        input_creator = lambda ioctx: DatasetReader(
            ioctx, self._ds_shards[worker_index])
    # Dict: Mix of different input methods with different ratios.
    elif isinstance(config["input"], dict):
        input_creator = lambda ioctx: ShuffledInput(
            MixedInput(config["input"], ioctx),
            config["shuffle_buffer_size"])
    # A pre-registered input descriptor (str).
    elif isinstance(config["input"], str) and registry_contains_input(
            config["input"]):
        input_creator = registry_get_input(config["input"])
    # D4RL input.
    elif "d4rl" in config["input"]:
        env_name = config["input"].split(".")[-1]
        input_creator = lambda ioctx: D4RLReader(env_name, ioctx)
    # Valid python module (class path) -> Create using `from_config`.
    elif valid_module(config["input"]):
        input_creator = lambda ioctx: ShuffledInput(
            from_config(config["input"], ioctx=ioctx))
    # JSON file or list of JSON files -> Use JsonReader (shuffled).
    else:
        input_creator = lambda ioctx: ShuffledInput(
            JsonReader(config["input"], ioctx),
            config["shuffle_buffer_size"])

    if isinstance(config["output"], FunctionType):
        output_creator = config["output"]
    elif config["output"] is None:
        output_creator = lambda ioctx: NoopOutput()
    elif config["output"] == "dataset":
        output_creator = lambda ioctx: DatasetWriter(
            ioctx, compress_columns=config["output_compress_columns"])
    elif config["output"] == "logdir":
        output_creator = lambda ioctx: JsonWriter(
            ioctx.log_dir,
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"],
        )
    else:
        output_creator = lambda ioctx: JsonWriter(
            config["output"],
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"],
        )

    # Assert everything is correct in "multiagent" config dict (if given).
    ma_policies = config["multiagent"]["policies"]
    if ma_policies:
        for pid, policy_spec in ma_policies.copy().items():
            assert isinstance(policy_spec, PolicySpec)
            # Class is None -> Use `policy_cls`.
            if policy_spec.policy_class is None:
                ma_policies[pid].policy_class = policy_cls
        policies = ma_policies
    # Create a policy_spec (MultiAgentPolicyConfigDict),
    # even if no "multiagent" setup given by user.
    else:
        policies = policy_cls

    if worker_index == 0:
        extra_python_environs = config.get(
            "extra_python_environs_for_driver", None)
    else:
        extra_python_environs = config.get(
            "extra_python_environs_for_worker", None)

    worker = cls(
        env_creator=env_creator,
        validate_env=validate_env,
        policy_spec=policies,
        policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
        policies_to_train=config["multiagent"]["policies_to_train"],
        tf_session_creator=(session_creator
                            if config["tf_session_args"] else None),
        rollout_fragment_length=config["rollout_fragment_length"],
        count_steps_by=config["multiagent"]["count_steps_by"],
        batch_mode=config["batch_mode"],
        episode_horizon=config["horizon"],
        preprocessor_pref=config["preprocessor_pref"],
        sample_async=config["sample_async"],
        compress_observations=config["compress_observations"],
        num_envs=config["num_envs_per_worker"],
        observation_fn=config["multiagent"]["observation_fn"],
        observation_filter=config["observation_filter"],
        clip_rewards=config["clip_rewards"],
        normalize_actions=config["normalize_actions"],
        clip_actions=config["clip_actions"],
        env_config=config["env_config"],
        policy_config=config,
        worker_index=worker_index,
        num_workers=num_workers,
        recreated_worker=recreated_worker,
        log_dir=self._logdir,
        log_level=config["log_level"],
        callbacks=config["callbacks"],
        input_creator=input_creator,
        output_creator=output_creator,
        remote_worker_envs=config["remote_worker_envs"],
        remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
        soft_horizon=config["soft_horizon"],
        no_done_at_end=config["no_done_at_end"],
        seed=(config["seed"] + worker_index)
        if config["seed"] is not None else None,
        fake_sampler=config["fake_sampler"],
        extra_python_environs=extra_python_environs,
        spaces=spaces,
        disable_env_checking=config["disable_env_checking"],
    )
    return worker
def test_marwil_loss_function(self):
    """
    To generate the historic data used in this test case, first run:
    $ ./train.py --run=PPO --env=CartPole-v0 \
      --stop='{"timesteps_total": 50000}' \
      --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
    """
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/cartpole/small.json")
    print("data_file={} exists={}".format(data_file,
                                          os.path.isfile(data_file)))

    config = marwil.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    # Learn from offline data.
    config["input"] = [data_file]

    for fw in framework_iterator(config, frameworks=["torch", "tf2"]):
        reader = JsonReader(inputs=[data_file])
        batch = reader.next()

        trainer = marwil.MARWILTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        model = policy.model

        # Calculate our own expected values (to then compare against the
        # agent's loss output).
        cummulative_rewards = compute_advantages(
            batch, 0.0, config["gamma"], 1.0, False, False)["advantages"]
        if fw == "torch":
            cummulative_rewards = torch.tensor(cummulative_rewards)
        batch = policy._lazy_tensor_dict(batch)
        model_out, _ = model.from_batch(batch)
        vf_estimates = model.value_function()
        adv = cummulative_rewards - vf_estimates
        if fw == "torch":
            adv = adv.detach().cpu().numpy()
        adv_squared = np.mean(np.square(adv))
        c_2 = 100.0 + 1e-8 * (adv_squared - 100.0)
        c = np.sqrt(c_2)
        exp_advs = np.exp(config["beta"] * (adv / c))
        logp = policy.dist_class(model_out, model).logp(batch["actions"])
        if fw == "torch":
            logp = logp.detach().cpu().numpy()
        # Calculate all expected loss components.
        expected_vf_loss = 0.5 * adv_squared
        expected_pol_loss = -1.0 * np.mean(exp_advs * logp)
        expected_loss = \
            expected_pol_loss + config["vf_coeff"] * expected_vf_loss

        # Calculate the algorithm's loss (to check against our own
        # calculation above).
        batch.set_get_interceptor(None)
        postprocessed_batch = policy.postprocess_trajectory(batch)
        loss_func = marwil.marwil_tf_policy.marwil_loss if fw != "torch" \
            else marwil.marwil_torch_policy.marwil_loss
        loss_out = loss_func(policy, model, policy.dist_class,
                             policy._lazy_tensor_dict(postprocessed_batch))

        # Check all components.
        if fw == "torch":
            check(policy.v_loss, expected_vf_loss, decimals=4)
            check(policy.p_loss, expected_pol_loss, decimals=4)
        else:
            check(policy.loss.v_loss, expected_vf_loss, decimals=4)
            check(policy.loss.p_loss, expected_pol_loss, decimals=4)
        check(loss_out, expected_loss, decimals=3)
def _make_worker(self, cls, env_creator, policy, worker_index, config):
    def session_creator():
        logger.debug("Creating TF session {}".format(
            config["tf_session_args"]))
        return tf.Session(config=tf.ConfigProto(
            **config["tf_session_args"]))

    if isinstance(config["input"], FunctionType):
        input_creator = config["input"]
    elif config["input"] == "sampler":
        input_creator = (lambda ioctx: ioctx.default_sampler_input())
    elif isinstance(config["input"], dict):
        input_creator = (
            lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))
    else:
        input_creator = (
            lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                        config["shuffle_buffer_size"]))

    if isinstance(config["output"], FunctionType):
        output_creator = config["output"]
    elif config["output"] is None:
        output_creator = (lambda ioctx: NoopOutput())
    elif config["output"] == "logdir":
        output_creator = (lambda ioctx: JsonWriter(
            ioctx.log_dir,
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))
    else:
        output_creator = (lambda ioctx: JsonWriter(
            config["output"],
            ioctx,
            max_file_size=config["output_max_file_size"],
            compress_columns=config["output_compress_columns"]))

    if config["input"] == "sampler":
        input_evaluation = []
    else:
        input_evaluation = config["input_evaluation"]

    # Fill in the default policy if 'None' is specified in multiagent
    if config["multiagent"]["policies"]:
        tmp = config["multiagent"]["policies"]
        _validate_multiagent_config(tmp, allow_none_graph=True)
        for k, v in tmp.items():
            if v[0] is None:
                tmp[k] = (policy, v[1], v[2], v[3])
        policy = tmp

    worker = cls(
        env_creator,
        policy,
        policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
        policies_to_train=config["multiagent"]["policies_to_train"],
        tf_session_creator=(session_creator
                            if config["tf_session_args"] else None),
        rollout_fragment_length=config["rollout_fragment_length"],
        batch_mode=config["batch_mode"],
        episode_horizon=config["horizon"],
        preprocessor_pref=config["preprocessor_pref"],
        sample_async=config["sample_async"],
        compress_observations=config["compress_observations"],
        num_envs=config["num_envs_per_worker"],
        observation_filter=config["observation_filter"],
        clip_rewards=config["clip_rewards"],
        clip_actions=config["clip_actions"],
        env_config=config["env_config"],
        model_config=config["model"],
        policy_config=config,
        worker_index=worker_index,
        num_workers=self._num_workers,
        monitor_path=self._logdir if config["monitor"] else None,
        log_dir=self._logdir,
        log_level=config["log_level"],
        callbacks=config["callbacks"],
        input_creator=input_creator,
        input_evaluation=input_evaluation,
        output_creator=output_creator,
        remote_worker_envs=config["remote_worker_envs"],
        remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
        soft_horizon=config["soft_horizon"],
        no_done_at_end=config["no_done_at_end"],
        seed=(config["seed"] + worker_index)
        if config["seed"] is not None else None,
        _fake_sampler=config.get("_fake_sampler", False))

    # Check for correct policy class (only locally, remote Workers should
    # create the exact same Policy types).
    if type(worker) is RolloutWorker:
        actual_class = type(worker.get_policy())
        # Pytorch case: Policy must be a TorchPolicy.
        if config["use_pytorch"]:
            assert issubclass(actual_class, TorchPolicy), \
                "Worker policy must be subclass of `TorchPolicy`, " \
                "but is {}!".format(actual_class.__name__)
        # non-Pytorch case:
        # Policy may be None AND must not be a TorchPolicy.
        else:
            assert issubclass(actual_class, type(None)) or \
                (issubclass(actual_class, Policy) and
                 not issubclass(actual_class, TorchPolicy)), "Worker " \
                "policy must be subclass of `Policy`, but NOT " \
                "`TorchPolicy` (your class={})! If you have a torch " \
                "Trainer, make sure to set `use_pytorch=True` in " \
                "your Trainer's config)!".format(actual_class.__name__)

    return worker
def testAgentOutputOk(self):
    self.writeOutputs(self.test_dir)
    self.assertEqual(len(os.listdir(self.test_dir)), 1)
    ioctx = IOContext(self.test_dir, {}, 0, None)
    reader = JsonReader(ioctx, self.test_dir + "/*.json")
    reader.next()
class TorchCustomLossModel(TorchModelV2, nn.Module):
    """PyTorch version of the CustomLossModel above."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name, input_files):
        super().__init__(obs_space, action_space, num_outputs, model_config,
                         name)
        nn.Module.__init__(self)

        self.input_files = input_files
        # Create a new input reader per worker.
        self.reader = JsonReader(self.input_files)
        self.fcnet = TorchFC(
            self.obs_space,
            self.action_space,
            num_outputs,
            model_config,
            name="fcnet")

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Delegate to our FCNet.
        return self.fcnet(input_dict, state, seq_lens)

    @override(ModelV2)
    def custom_loss(self, policy_loss, loss_inputs):
        """Calculates a custom loss on top of the given policy_loss(es).

        Args:
            policy_loss (List[TensorType]): The list of already calculated
                policy losses (as many as there are optimizers).
            loss_inputs (TensorStruct): Struct of np.ndarrays holding the
                entire train batch.

        Returns:
            List[TensorType]: The altered list of policy losses. In case the
                custom loss should have its own optimizer, make sure the
                returned list is one larger than the incoming policy_loss
                list. In case you simply want to mix in the custom loss into
                the already calculated policy losses, return a list of
                altered policy losses (as done in this example below).
        """
        # Get the next batch from our input files.
        batch = self.reader.next()

        # Define a secondary loss by building a graph copy with weight
        # sharing.
        obs = restore_original_dimensions(
            torch.from_numpy(batch["obs"]).float(),
            self.obs_space,
            tensorlib="torch")
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing
        # tensors created during _build_layers_v2(). For example, an
        # autoencoder-style loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = TorchCategorical(logits, self.model_config)
        imitation_loss = torch.mean(
            -action_dist.logp(torch.from_numpy(batch["actions"])))
        self.imitation_loss_metric = imitation_loss.item()
        self.policy_loss_metric = np.mean([l.item() for l in policy_loss])

        # Add the imitation loss to each already calculated policy loss term.
        # Alternatively (if custom loss has its own optimizer):
        # return policy_loss + [10 * self.imitation_loss]
        return [loss_ + 10 * imitation_loss for loss_ in policy_loss]

    def metrics(self):
        return {
            "policy_loss": self.policy_loss_metric,
            "imitation_loss": self.imitation_loss_metric,
        }
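# Hedged usage sketch (not part of the class above): how such a custom-loss
# model is typically wired into RLlib. The name "custom_loss_model" and the
# JSON path are illustrative; `custom_model_config` entries are forwarded as
# keyword arguments to the model's constructor (here: `input_files`).
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("custom_loss_model", TorchCustomLossModel)

example_trainer_config = {
    "framework": "torch",
    "model": {
        "custom_model": "custom_loss_model",
        "custom_model_config": {"input_files": "/tmp/cartpole-out"},
    },
}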