def test_traj_view_next_action(self):
    action_space = Discrete(2)
    rollout_worker_w_api = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_config=ppo.DEFAULT_CONFIG,
        rollout_fragment_length=200,
        policy_spec=ppo.PPOTorchPolicy,
        policy_mapping_fn=None,
        num_envs=1,
    )
    # Add the next action to the view reqs of the policy.
    # This should then be visible in postprocessing and train batches.
    rollout_worker_w_api.policy_map["default_policy"].view_requirements[
        "next_actions"] = ViewRequirement(
            SampleBatch.ACTIONS, shift=1, space=action_space)
    # Make sure we have DONEs as well.
    rollout_worker_w_api.policy_map["default_policy"].view_requirements[
        "dones"] = ViewRequirement()
    batch = rollout_worker_w_api.sample()
    self.assertTrue("next_actions" in batch.data)
    expected_a_ = None  # expected next action
    for i in range(len(batch["actions"])):
        a, d, a_ = batch["actions"][i], batch["dones"][i], \
            batch["next_actions"][i]
        if not d and expected_a_ is not None:
            check(a, expected_a_)
        elif d:
            # At episode end, the (non-existent) next action is zero.
            check(a_, 0)
            expected_a_ = None
            continue
        expected_a_ = a_
def _update_model_view_requirements_from_init_state(self):
    """Uses Model's (or this Policy's) init state to add needed ViewReqs.

    Can be called from within a Policy to make sure RNNs automatically
    update their internal state-related view requirements.
    Changes the `self.view_requirements` dict.
    """
    self._model_init_state_automatically_added = True
    model = getattr(self, "model", None)

    obj = model or self
    if model and not hasattr(model, "view_requirements"):
        model.view_requirements = {
            SampleBatch.OBS: ViewRequirement(space=self.observation_space)
        }
    view_reqs = obj.view_requirements
    # Add state-ins to this model's view.
    init_state = []
    if hasattr(obj, "get_initial_state") and callable(
            obj.get_initial_state):
        init_state = obj.get_initial_state()
    else:
        # Add this functionality automatically for new native model API.
        if tf and isinstance(model, tf.keras.Model) and \
                "state_in_0" not in view_reqs:
            obj.get_initial_state = lambda: [
                np.zeros_like(view_req.space.sample())
                for k, view_req in model.view_requirements.items()
                if k.startswith("state_in_")
            ]
        else:
            obj.get_initial_state = lambda: []
    if "state_in_0" in view_reqs:
        self.is_recurrent = lambda: True

    # Make sure auto-generated init-state view requirements get added
    # to both Policy and Model, no matter what.
    view_reqs = [view_reqs] + ([self.view_requirements] if hasattr(
        self, "view_requirements") else [])

    for i, state in enumerate(init_state):
        # Allow `state` to be either a Space (use zeros as initial values)
        # or any value (e.g. a dict or a non-zero tensor).
        fw = np if isinstance(state, np.ndarray) else torch if \
            torch and torch.is_tensor(state) else None
        if fw:
            space = Box(-1.0, 1.0, shape=state.shape) if \
                fw.all(state == 0.0) else state
        else:
            space = state
        for vr in view_reqs:
            vr["state_in_{}".format(i)] = ViewRequirement(
                "state_out_{}".format(i),
                shift=-1,
                used_for_compute_actions=True,
                batch_repeat_value=self.config.get("model", {}).get(
                    "max_seq_len", 1),
                space=space)
            vr["state_out_{}".format(i)] = ViewRequirement(
                space=space, used_for_training=True)
def __init__(self, obs_space: gym.spaces.Space,
             action_space: gym.spaces.Space, num_outputs: int,
             model_config: ModelConfigDict, name: str):
    super(LSTMWrapper, self).__init__(obs_space, action_space, None,
                                      model_config, name)

    self.cell_size = model_config["lstm_cell_size"]
    self.use_prev_action_reward = model_config[
        "lstm_use_prev_action_reward"]
    if action_space.shape is not None:
        self.action_dim = int(np.product(action_space.shape))
    else:
        self.action_dim = int(len(action_space))
    # Add prev-action/reward nodes to input to LSTM.
    if self.use_prev_action_reward:
        self.num_outputs += 1 + self.action_dim

    # Define input layers.
    input_layer = tf.keras.layers.Input(
        shape=(None, self.num_outputs), name="inputs")
    self.num_outputs = num_outputs
    state_in_h = tf.keras.layers.Input(shape=(self.cell_size, ), name="h")
    state_in_c = tf.keras.layers.Input(shape=(self.cell_size, ), name="c")
    seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)

    # Send the (already preprocessed) inputs through the LSTM cell.
    lstm_out, state_h, state_c = tf.keras.layers.LSTM(
        self.cell_size,
        return_sequences=True,
        return_state=True,
        name="lstm")(
            inputs=input_layer,
            mask=tf.sequence_mask(seq_in),
            initial_state=[state_in_h, state_in_c])

    # Postprocess LSTM output with another hidden layer and compute
    # values.
    logits = tf.keras.layers.Dense(
        self.num_outputs,
        activation=tf.keras.activations.linear,
        name="logits")(lstm_out)
    values = tf.keras.layers.Dense(
        1, activation=None, name="values")(lstm_out)

    # Create the RNN model.
    self._rnn_model = tf.keras.Model(
        inputs=[input_layer, seq_in, state_in_h, state_in_c],
        outputs=[logits, values, state_h, state_c])
    self.register_variables(self._rnn_model.variables)
    self._rnn_model.summary()

    # Add prev-a/r to this model's view, if required.
    if model_config["lstm_use_prev_action_reward"]:
        self.inference_view_requirements[SampleBatch.PREV_REWARDS] = \
            ViewRequirement(SampleBatch.REWARDS, shift=-1)
        self.inference_view_requirements[SampleBatch.PREV_ACTIONS] = \
            ViewRequirement(SampleBatch.ACTIONS, space=self.action_space,
                            shift=-1)
def __init__(self, observation_space, action_space, model_config, *args,
             **kwargs):
    super().__init__(observation_space, action_space, model_config, *args,
                     **kwargs)
    self.observation_space = observation_space
    self.action_space = action_space
    self.state_size = 10
    self.model_config = model_config or {}
    space = Box(
        low=-np.inf,
        high=np.inf,
        shape=(self.state_size, ),
        dtype=np.float64)
    # Set view requirements such that the policy state is held in
    # memory for 2 environment steps.
    self.view_requirements["state_in_0"] = ViewRequirement(
        "state_out_0",
        shift="-2:-1",
        used_for_training=False,
        used_for_compute_actions=True,
        batch_repeat_value=1,
    )
    self.view_requirements["state_out_0"] = ViewRequirement(
        space=space,
        used_for_training=False,
        used_for_compute_actions=True,
        batch_repeat_value=1,
    )
def test_view_requirement_connector(self):
    view_requirements = {
        "obs": ViewRequirement(
            used_for_training=True, used_for_compute_actions=True),
        "prev_actions": ViewRequirement(
            data_col="actions",
            shift=-1,
            used_for_training=True,
            used_for_compute_actions=True,
        ),
    }
    ctx = ConnectorContext(view_requirements=view_requirements)

    c = ViewRequirementAgentConnector(ctx)
    f = FlattenDataAgentConnector(ctx)

    d = AgentConnectorDataType(
        0,
        1,
        {
            SampleBatch.NEXT_OBS: {
                "sensor1": [[1, 1], [2, 2]],
                "sensor2": 8.8,
            },
            SampleBatch.ACTIONS: np.array(0),
        },
    )
    # ViewRequirementAgentConnector, then FlattenDataAgentConnector.
    processed = f(c([d]))

    self.assertTrue("obs" in processed[0].data.for_action)
    self.assertTrue("prev_actions" in processed[0].data.for_action)
def __init__(self, obs_space, action_space, num_outputs, model_config,
             name, num_frames=3):
    super(FrameStackingCartPoleModel, self).__init__(
        obs_space, action_space, None, model_config, name)

    self.num_frames = num_frames
    self.num_outputs = num_outputs

    # Construct actual (very simple) FC model.
    assert len(obs_space.shape) == 1
    input_ = tf.keras.layers.Input(
        shape=(self.num_frames, obs_space.shape[0]))
    reshaped = tf.keras.layers.Reshape(
        [obs_space.shape[0] * self.num_frames])(input_)
    layer1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)(reshaped)
    out = tf.keras.layers.Dense(self.num_outputs)(layer1)
    values = tf.keras.layers.Dense(1)(layer1)
    self.base_model = tf.keras.models.Model([input_], [out, values])
    self.register_variables(self.base_model.variables)

    self._last_value = None

    self.view_requirements["prev_n_obs"] = ViewRequirement(
        data_col="obs",
        shift="-{}:0".format(num_frames - 1),
        space=obs_space)
    self.view_requirements["prev_rewards"] = ViewRequirement(
        data_col="rewards", shift=-1)
def _update_model_view_requirements_from_init_state(self):
    """Uses Model's (or this Policy's) init state to add needed ViewReqs.

    Can be called from within a Policy to make sure RNNs automatically
    update their internal state-related view requirements.
    Changes the `self.view_requirements` dict.
    """
    self._model_init_state_automatically_added = True
    model = getattr(self, "model", None)
    obj = model or self
    if model and not hasattr(model, "view_requirements"):
        model.view_requirements = {
            SampleBatch.OBS: ViewRequirement(space=self.observation_space)
        }
    view_reqs = obj.view_requirements
    # Add state-ins to this model's view.
    init_state = []
    if hasattr(obj, "get_initial_state") and callable(
            obj.get_initial_state):
        init_state = obj.get_initial_state()
    else:
        obj.get_initial_state = lambda: []
    for i, state in enumerate(init_state):
        space = Box(-1.0, 1.0, shape=state.shape) if \
            hasattr(state, "shape") else state
        view_reqs["state_in_{}".format(i)] = ViewRequirement(
            "state_out_{}".format(i),
            shift=-1,
            used_for_compute_actions=True,
            batch_repeat_value=self.config.get("model", {}).get(
                "max_seq_len", 1),
            space=space)
        view_reqs["state_out_{}".format(i)] = ViewRequirement(
            space=space, used_for_training=True)
def __init__(self,
             obs_space: gym.spaces.Space,
             action_space: gym.spaces.Space,
             num_outputs: Optional[int],
             model_config: ModelConfigDict,
             name: str,
             policy_model_config: ModelConfigDict = None,
             q_model_config: ModelConfigDict = None,
             twin_q: bool = False,
             initial_alpha: float = 1.0,
             target_entropy: Optional[float] = None):
    super().__init__(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=model_config,
        name=name,
        policy_model_config=policy_model_config,
        q_model_config=q_model_config,
        twin_q=twin_q,
        initial_alpha=initial_alpha,
        target_entropy=target_entropy)

    self.use_prev_action = (model_config["lstm_use_prev_action"]
                            or policy_model_config["lstm_use_prev_action"]
                            or q_model_config["lstm_use_prev_action"])
    self.use_prev_reward = (model_config["lstm_use_prev_reward"]
                            or policy_model_config["lstm_use_prev_reward"]
                            or q_model_config["lstm_use_prev_reward"])
    if self.use_prev_action:
        self.view_requirements[SampleBatch.PREV_ACTIONS] = \
            ViewRequirement(
                SampleBatch.ACTIONS, space=self.action_space, shift=-1)
    if self.use_prev_reward:
        self.view_requirements[SampleBatch.PREV_REWARDS] = \
            ViewRequirement(SampleBatch.REWARDS, shift=-1)
def __init__(self, obs_space, action_space, num_outputs, model_config,
             name, num_frames=3):
    nn.Module.__init__(self)
    super(TorchFrameStackingCartPoleModel, self).__init__(
        obs_space, action_space, None, model_config, name)

    self.num_frames = num_frames
    self.num_outputs = num_outputs

    # Construct actual (very simple) FC model.
    assert len(obs_space.shape) == 1
    self.layer1 = SlimFC(
        in_size=obs_space.shape[0] * self.num_frames,
        out_size=64,
        activation_fn="relu")
    self.out = SlimFC(
        in_size=64, out_size=self.num_outputs, activation_fn="linear")
    self.values = SlimFC(in_size=64, out_size=1, activation_fn="linear")

    self._last_value = None

    self.view_requirements["prev_n_obs"] = ViewRequirement(
        data_col="obs",
        shift="-{}:0".format(num_frames - 1),
        space=obs_space)
    self.view_requirements["prev_rewards"] = ViewRequirement(
        data_col="rewards", shift=-1)
def _get_default_view_requirements(self):
    """Returns a default ViewRequirements dict.

    Note: This is the base/maximum requirement dict, from which some
    requirements are later subtracted again automatically to streamline
    data collection, batch creation, and data transfer.

    Returns:
        ViewReqDict: The default view requirements dict.
    """
    # Default view requirements (equal to those that we would use before
    # the trajectory view API was introduced).
    return {
        SampleBatch.OBS: ViewRequirement(space=self.observation_space),
        SampleBatch.NEXT_OBS: ViewRequirement(
            data_col=SampleBatch.OBS,
            shift=1,
            space=self.observation_space),
        SampleBatch.ACTIONS: ViewRequirement(space=self.action_space),
        SampleBatch.REWARDS: ViewRequirement(),
        SampleBatch.DONES: ViewRequirement(),
        SampleBatch.INFOS: ViewRequirement(),
        SampleBatch.EPS_ID: ViewRequirement(),
        SampleBatch.AGENT_INDEX: ViewRequirement(),
        SampleBatch.UNROLL_ID: ViewRequirement(),
        "t": ViewRequirement(),
    }
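# --- Hedged usage sketch (not from the original sources) ---
# A minimal, self-contained illustration, assuming RLlib's `ViewRequirement`
# API as used throughout these snippets, of layering a custom trajectory
# view on top of the defaults above, e.g. to feed a model the last four
# observations (as the frame-stacking models in this collection do).
# The names `obs_space` and `extra_view_reqs` are hypothetical.
import numpy as np
from gym.spaces import Box

from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.view_requirement import ViewRequirement

obs_space = Box(-np.inf, np.inf, shape=(4, ))  # e.g. CartPole-like obs
extra_view_reqs = {
    "prev_n_obs": ViewRequirement(
        data_col=SampleBatch.OBS,
        # "-3:0" selects timesteps t-3 .. t, i.e. 4 stacked frames
        # including the current one.
        shift="-3:0",
        space=obs_space),
}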
def test_traj_view_next_action(self):
    action_space = Discrete(2)
    rollout_worker_w_api = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_config=ppo.DEFAULT_CONFIG,
        rollout_fragment_length=200,
        policy_spec=ppo.PPOTorchPolicy,
        policy_mapping_fn=None,
        num_envs=1,
    )
    # Add the next action (a') and 2nd next action (a'') to the view
    # requirements of the policy.
    # These should then be visible in postprocessing and train batches.
    # Switch them off for action computations (they can't be used there,
    # since the next actions are not known yet at action-computation
    # time).
    rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
        "next_actions"] = ViewRequirement(
            SampleBatch.ACTIONS,
            shift=1,
            space=action_space,
            used_for_compute_actions=False,
        )
    rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
        "2nd_next_actions"] = ViewRequirement(
            SampleBatch.ACTIONS,
            shift=2,
            space=action_space,
            used_for_compute_actions=False,
        )
    # Make sure we have DONEs as well.
    rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
        "dones"] = ViewRequirement()
    batch = rollout_worker_w_api.sample()
    self.assertTrue("next_actions" in batch)
    self.assertTrue("2nd_next_actions" in batch)

    expected_a_ = None  # expected next action
    expected_a__ = None  # expected 2nd next action
    for i in range(len(batch["actions"])):
        a, d, a_, a__ = (
            batch["actions"][i],
            batch["dones"][i],
            batch["next_actions"][i],
            batch["2nd_next_actions"][i],
        )
        # Episode done: next action and 2nd next action should be 0.
        if d:
            check(a_, 0)
            check(a__, 0)
            expected_a_ = None
            expected_a__ = None
            continue
        # Episode is not done and we have an expected next-a.
        if expected_a_ is not None:
            check(a, expected_a_)
        if expected_a__ is not None:
            check(a_, expected_a__)
        expected_a__ = a__
        expected_a_ = a_
def training_view_requirements_fn(policy):
    return {
        # Next obs are needed for PPO postprocessing.
        SampleBatch.NEXT_OBS: ViewRequirement(SampleBatch.OBS, shift=1),
        # VF preds are needed for the loss.
        SampleBatch.VF_PREDS: ViewRequirement(shift=0),
    }
def training_view_requirements(self):
    if hasattr(self, "view_requirements"):
        return self.view_requirements
    return {
        SampleBatch.ACTIONS: ViewRequirement(
            space=self.action_space, shift=0),
        SampleBatch.REWARDS: ViewRequirement(shift=0),
        SampleBatch.DONES: ViewRequirement(shift=0),
    }
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.episode_id = None
    self.env_id = None

    class _fake_model:
        pass

    self.model = _fake_model()
    self.model.time_major = True
    self.model.inference_view_requirements = {
        SampleBatch.EPS_ID: ViewRequirement(),
        "env_id": ViewRequirement(),
        SampleBatch.OBS: ViewRequirement(),
        SampleBatch.PREV_ACTIONS: ViewRequirement(
            SampleBatch.ACTIONS, space=self.action_space, shift=-1),
        SampleBatch.PREV_REWARDS: ViewRequirement(
            SampleBatch.REWARDS, shift=-1),
    }
    self.training_view_requirements = dict(
        **{
            SampleBatch.NEXT_OBS: ViewRequirement(
                SampleBatch.OBS, shift=1),
            SampleBatch.ACTIONS: ViewRequirement(space=self.action_space),
            SampleBatch.REWARDS: ViewRequirement(),
            SampleBatch.DONES: ViewRequirement(),
        },
        **self.model.inference_view_requirements)
def inference_view_requirements(self) -> Dict[str, ViewRequirement]:
    req = super().inference_view_requirements()
    # Optional: prev-actions/rewards for forward pass.
    if self.model_config["lstm_use_prev_action_reward"]:
        req.update({
            SampleBatch.PREV_REWARDS: ViewRequirement(
                SampleBatch.REWARDS, shift=-1),
            SampleBatch.PREV_ACTIONS: ViewRequirement(
                SampleBatch.ACTIONS, space=self.action_space, shift=-1),
        })
    return req
def __init__(self, obs_space: gym.spaces.Space,
             action_space: gym.spaces.Space, num_outputs: int,
             model_config: ModelConfigDict, name: str):
    nn.Module.__init__(self)
    super().__init__(obs_space, action_space, None, model_config, name)

    self.cell_size = model_config["lstm_cell_size"]
    self.time_major = model_config.get("_time_major", False)
    self.use_prev_action = model_config["lstm_use_prev_action"]
    self.use_prev_reward = model_config["lstm_use_prev_reward"]
    if isinstance(action_space, Discrete):
        self.action_dim = action_space.n
    elif isinstance(action_space, MultiDiscrete):
        self.action_dim = np.product(action_space.nvec)
    elif action_space.shape is not None:
        self.action_dim = int(np.product(action_space.shape))
    else:
        self.action_dim = int(len(action_space))
    # Add prev-action/reward nodes to input to LSTM.
    if self.use_prev_action:
        self.num_outputs += self.action_dim
    if self.use_prev_reward:
        self.num_outputs += 1
    self.lstm = nn.LSTM(
        self.num_outputs, self.cell_size, batch_first=not self.time_major)

    self.num_outputs = num_outputs

    # Postprocess LSTM output with another hidden layer and compute
    # values.
    self._logits_branch = SlimFC(
        in_size=self.cell_size,
        out_size=self.num_outputs,
        activation_fn=None,
        initializer=torch.nn.init.xavier_uniform_)
    self._value_branch = SlimFC(
        in_size=self.cell_size,
        out_size=1,
        activation_fn=None,
        initializer=torch.nn.init.xavier_uniform_)

    # Add prev-a/r to this model's view, if required.
    if model_config["lstm_use_prev_action"]:
        self.inference_view_requirements[SampleBatch.PREV_ACTIONS] = \
            ViewRequirement(SampleBatch.ACTIONS, space=self.action_space,
                            data_rel_pos=-1)
    if model_config["lstm_use_prev_reward"]:
        self.inference_view_requirements[SampleBatch.PREV_REWARDS] = \
            ViewRequirement(SampleBatch.REWARDS, data_rel_pos=-1)
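# --- Hedged note (inferred only from the snippets in this collection) ---
# Some snippets pass the relative timestep as `shift=...` while this one
# and a few others use `data_rel_pos=...`; they appear to express the same
# concept under two API revisions. Under that assumption, these two calls
# would be equivalent:
#
#   ViewRequirement(SampleBatch.REWARDS, shift=-1)         # one spelling
#   ViewRequirement(SampleBatch.REWARDS, data_rel_pos=-1)  # other spelling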
def __init__(self, obs_space, action_space, num_outputs, model_config,
             name):
    nn.Module.__init__(self)
    super().__init__(obs_space, action_space, None, model_config, name)

    self.cell_size = model_config["lstm_cell_size"]
    self.time_major = model_config.get("_time_major", False)
    self.use_prev_action_reward = model_config[
        "lstm_use_prev_action_reward"]
    self.action_dim = int(np.product(action_space.shape))
    # Add prev-action/reward nodes to input to LSTM.
    if self.use_prev_action_reward:
        self.num_outputs += 1 + self.action_dim
    self.lstm = nn.LSTM(
        self.num_outputs, self.cell_size, batch_first=not self.time_major)

    self.num_outputs = num_outputs

    # Postprocess LSTM output with another hidden layer and compute
    # values.
    self._logits_branch = SlimFC(
        in_size=self.cell_size,
        out_size=self.num_outputs,
        activation_fn=None,
        initializer=torch.nn.init.xavier_uniform_)
    self._value_branch = SlimFC(
        in_size=self.cell_size,
        out_size=1,
        activation_fn=None,
        initializer=torch.nn.init.xavier_uniform_)

    self.inference_view_requirements.update(
        dict(
            **{
                SampleBatch.OBS: ViewRequirement(shift=0),
                SampleBatch.PREV_REWARDS: ViewRequirement(
                    SampleBatch.REWARDS, shift=-1),
                SampleBatch.PREV_ACTIONS: ViewRequirement(
                    SampleBatch.ACTIONS,
                    space=self.action_space,
                    shift=-1),
            }))
    # Add the two LSTM state in/out slots (h and c).
    for i in range(2):
        self.inference_view_requirements["state_in_{}".format(i)] = \
            ViewRequirement(
                "state_out_{}".format(i),
                shift=-1,
                space=Box(-1.0, 1.0, shape=(self.cell_size, )))
        self.inference_view_requirements["state_out_{}".format(i)] = \
            ViewRequirement(
                space=Box(-1.0, 1.0, shape=(self.cell_size, )))
def ppo_init(policy: Policy, obs_space: gym.spaces.Space,
             action_space: gym.spaces.Space,
             config: TrainerConfigDict) -> None:
    """TODO: Write documentation."""
    # Call base implementation.
    setup_mixins(policy, obs_space, action_space, config)

    # Add the previous observation to the view requirements, for CAPS
    # loss computation.
    # TODO: Remove update of `policy.model.view_requirements` after ray
    # fix.
    caps_view_requirements = {
        "_prev_obs": ViewRequirement(
            data_col="obs",
            space=obs_space,
            shift=-1,
            used_for_compute_actions=False)
    }
    policy.model.view_requirements.update(caps_view_requirements)
    policy.view_requirements.update(caps_view_requirements)

    # Initialize extra losses.
    policy._mean_symmetric_policy_loss = 0.0
    policy._mean_temporal_caps_loss = 0.0
    policy._mean_spatial_caps_loss = 0.0
    policy._mean_global_caps_loss = 0.0
def test_pad_batch_fixed_max(self):
    """Test pad_batch_to_sequences_of_same_size when dynamic_max=False."""
    view_requirements = {
        "state_in_0": ViewRequirement(
            "state_out_0",
            shift="-3:-1",
            used_for_training=False,
            used_for_compute_actions=True,
            batch_repeat_value=1,
        )
    }
    max_seq_len = 20
    num_seqs = np.random.randint(1, 20)
    seq_lens = np.random.randint(1, max_seq_len, size=(num_seqs, ))
    sum_seq_lens = np.sum(seq_lens)
    s1 = SampleBatch(
        {
            "a": np.arange(sum_seq_lens),
            "b": np.arange(sum_seq_lens),
            "seq_lens": seq_lens,
            "state_in_0": [[0]] * num_seqs,
        },
        _max_seq_len=max_seq_len,
    )
    pad_batch_to_sequences_of_same_size(
        s1,
        max_seq_len=max_seq_len,
        feature_keys=["a", "b"],
        view_requirements=view_requirements,
    )
    check(s1.max_seq_len, max_seq_len)
    check(s1["a"].shape[0], max_seq_len * num_seqs)
    check(s1["b"].shape[0], max_seq_len * num_seqs)
def __init__(self, obs_space: gym.spaces.Space,
             action_space: gym.spaces.Space, num_outputs: int,
             model_config: ModelConfigDict, name: str, framework: str):
    """Initializes a ModelV2 object.

    This method should create any variables used by the model.

    Args:
        obs_space (gym.spaces.Space): Observation space of the target gym
            env. This may have an `original_space` attribute that
            specifies how to unflatten the tensor into a ragged tensor.
        action_space (gym.spaces.Space): Action space of the target gym
            env.
        num_outputs (int): Number of output units of the model.
        model_config (ModelConfigDict): Config for the model, documented
            in ModelCatalog.
        name (str): Name (scope) for the model.
        framework (str): Either "tf" or "torch".
    """
    self.obs_space: gym.spaces.Space = obs_space
    self.action_space: gym.spaces.Space = action_space
    self.num_outputs: int = num_outputs
    self.model_config: ModelConfigDict = model_config
    self.name: str = name or "default_model"
    self.framework: str = framework
    self._last_output = None
    self.time_major = self.model_config.get("_time_major")
    # Basic view requirement for all models: Use the observation as input.
    self.inference_view_requirements = {
        SampleBatch.OBS: ViewRequirement(
            data_rel_pos=0, space=self.obs_space),
    }
def __init__(self, obs_space, action_space, num_outputs, model_config,
             name, fc_size=64, lstm_state_size=256):
    nn.Module.__init__(self)
    super().__init__(obs_space, action_space, num_outputs, model_config,
                     name)

    self.obs_size = get_preprocessor(obs_space)(obs_space).size
    self.fc_size = fc_size
    self.lstm_state_size = lstm_state_size

    # Build the Module from fc + LSTM + 2xfc (action + value outs).
    self.fc1 = nn.Linear(self.obs_size, self.fc_size)
    self.lstm = nn.LSTM(
        self.fc_size, self.lstm_state_size, batch_first=True)
    self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
    self.value_branch = nn.Linear(self.lstm_state_size, 1)
    # Holds the current "base" output (before logits layer).
    self._features = None

    # Add state-ins to this model's view.
    for i in range(2):
        self.inference_view_requirements["state_in_{}".format(i)] = \
            ViewRequirement(
                "state_out_{}".format(i),
                shift=-1,
                space=Box(-1.0, 1.0, shape=(self.lstm_state_size, )))
def _update_model_inference_view_requirements_from_init_state(self):
    """Uses this Model's initial state to auto-add necessary ViewReqs.

    Can be called from within a Policy to make sure RNNs automatically
    update their internal state-related view requirements.
    Changes the `self.inference_view_requirements` dict.
    """
    model = self.model
    # Add state-ins to this model's view.
    for i, state in enumerate(model.get_initial_state()):
        model.inference_view_requirements["state_in_{}".format(i)] = \
            ViewRequirement(
                "state_out_{}".format(i),
                shift=-1,
                space=Box(-1.0, 1.0, shape=state.shape))
        model.inference_view_requirements["state_out_{}".format(i)] = \
            ViewRequirement(space=Box(-1.0, 1.0, shape=state.shape))
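# --- Hedged illustration (not from the original sources) ---
# For a model whose `get_initial_state()` returns a single zero tensor of
# shape `(cell_size,)`, the loop above would effectively add the
# equivalent of (`cell_size` hypothetical):
#
#   model.inference_view_requirements["state_in_0"] = ViewRequirement(
#       "state_out_0", shift=-1,
#       space=Box(-1.0, 1.0, shape=(cell_size, )))
#   model.inference_view_requirements["state_out_0"] = ViewRequirement(
#       space=Box(-1.0, 1.0, shape=(cell_size, )))
#
# i.e. the state-in slot at timestep t is filled with the corresponding
# state-out from timestep t-1.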
def __init__(self, obs_space, action_space, num_outputs, model_config,
             name, num_frames=3):
    super(FrameStackingCartPoleModel, self).__init__(
        obs_space, action_space, None, model_config, name)

    self.num_frames = num_frames
    self.num_outputs = num_outputs

    # Construct actual (very simple) FC model.
    assert len(obs_space.shape) == 1
    obs = tf.keras.layers.Input(
        shape=(self.num_frames, obs_space.shape[0]))
    obs_reshaped = tf.keras.layers.Reshape(
        [obs_space.shape[0] * self.num_frames])(obs)
    rewards = tf.keras.layers.Input(shape=(self.num_frames, ))
    rewards_reshaped = tf.keras.layers.Reshape([self.num_frames])(rewards)
    actions = tf.keras.layers.Input(
        shape=(self.num_frames, self.action_space.n))
    actions_reshaped = tf.keras.layers.Reshape(
        [action_space.n * self.num_frames])(actions)
    input_ = tf.keras.layers.Concatenate(axis=-1)(
        [obs_reshaped, actions_reshaped, rewards_reshaped])
    layer1 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(input_)
    layer2 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(layer1)
    out = tf.keras.layers.Dense(self.num_outputs)(layer2)
    values = tf.keras.layers.Dense(1)(layer1)
    self.base_model = tf.keras.models.Model([obs, actions, rewards],
                                            [out, values])

    self._last_value = None

    self.view_requirements["prev_n_obs"] = ViewRequirement(
        data_col="obs",
        shift="-{}:0".format(num_frames - 1),
        space=obs_space)
    self.view_requirements["prev_n_rewards"] = ViewRequirement(
        data_col="rewards", shift="-{}:-1".format(self.num_frames))
    self.view_requirements["prev_n_actions"] = ViewRequirement(
        data_col="actions",
        shift="-{}:-1".format(self.num_frames),
        space=self.action_space,
    )
def _update_model_view_requirements_from_init_state(self):
    """Uses Model's (or this Policy's) init state to add needed ViewReqs.

    Can be called from within a Policy to make sure RNNs automatically
    update their internal state-related view requirements.
    Changes the `self.view_requirements` dict.
    """
    self._model_init_state_automatically_added = True
    model = getattr(self, "model", None)
    obj = model or self
    if model and not hasattr(model, "view_requirements"):
        model.view_requirements = {
            SampleBatch.OBS: ViewRequirement(space=self.observation_space)
        }
    view_reqs = obj.view_requirements
    # Add state-ins to this model's view.
    init_state = []
    if hasattr(obj, "get_initial_state") and callable(
            obj.get_initial_state):
        init_state = obj.get_initial_state()
    else:
        # Add this functionality automatically for new native model API.
        if tf and isinstance(model, tf.keras.Model) and \
                "state_in_0" not in view_reqs:
            obj.get_initial_state = lambda: [
                np.zeros_like(view_req.space.sample())
                for k, view_req in model.view_requirements.items()
                if k.startswith("state_in_")
            ]
        else:
            obj.get_initial_state = lambda: []
    if "state_in_0" in view_reqs:
        self.is_recurrent = lambda: True
    for i, state in enumerate(init_state):
        space = Box(-1.0, 1.0, shape=state.shape) if \
            hasattr(state, "shape") else state
        view_reqs["state_in_{}".format(i)] = ViewRequirement(
            "state_out_{}".format(i),
            shift=-1,
            used_for_compute_actions=True,
            batch_repeat_value=self.config.get("model", {}).get(
                "max_seq_len", 1),
            space=space)
        view_reqs["state_out_{}".format(i)] = ViewRequirement(
            space=space, used_for_training=True)
def view_requirements_fn(policy: Policy) -> Dict[str, ViewRequirement]:
    """Function defining the view requirements for training/postprocessing.

    These go on top of the Policy's Model's own view requirements, which
    are used for the action-computing forward passes.

    Args:
        policy (Policy): The Policy that requires the returned
            ViewRequirements.

    Returns:
        Dict[str, ViewRequirement]: The Policy's view requirements.
    """
    ret = {
        # Next obs are needed for PPO postprocessing, but not in the loss.
        SampleBatch.NEXT_OBS: ViewRequirement(
            SampleBatch.OBS, shift=1, used_for_training=False),
        # Created during postprocessing.
        Postprocessing.ADVANTAGES: ViewRequirement(shift=0),
        Postprocessing.VALUE_TARGETS: ViewRequirement(shift=0),
        # Needed for PPO's loss function.
        SampleBatch.ACTION_DIST_INPUTS: ViewRequirement(shift=0),
        SampleBatch.ACTION_LOGP: ViewRequirement(shift=0),
        SampleBatch.VF_PREDS: ViewRequirement(shift=0),
    }
    # If the policy is recurrent, we also have to add the state-outs for
    # PPO postprocessing (calculating GAE from next-obs and last
    # state-out).
    if policy.is_recurrent():
        init_state = policy.get_initial_state()
        for i, s in enumerate(init_state):
            ret["state_out_{}".format(i)] = ViewRequirement(
                space=gym.spaces.Box(-1.0, 1.0, shape=(s.shape[0], )),
                used_for_training=False)
    return ret
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.exploration = self._create_exploration()
    self.view_requirements.update({
        "state_in_0": ViewRequirement(
            "state_out_0",
            shift=-1,
            space=gym.spaces.Box(0, 100, shape=(), dtype=np.int32))
    })
def _update_model_inference_view_requirements_from_init_state(self):
    """Uses Model's (or this Policy's) init state to add needed ViewReqs.

    Can be called from within a Policy to make sure RNNs automatically
    update their internal state-related view requirements.
    Changes the `self.inference_view_requirements` dict.
    """
    self._model_init_state_automatically_added = True
    model = getattr(self, "model", None)
    obj = model or self

    # Add state-ins to this model's view.
    for i, state in enumerate(obj.get_initial_state()):
        space = Box(-1.0, 1.0, shape=state.shape) if \
            hasattr(state, "shape") else state
        view_reqs = model.inference_view_requirements if model else \
            self.view_requirements
        view_reqs["state_in_{}".format(i)] = ViewRequirement(
            "state_out_{}".format(i), shift=-1, space=space)
        view_reqs["state_out_{}".format(i)] = ViewRequirement(space=space)
def training_view_requirements_fn(
        policy: Policy) -> Dict[str, ViewRequirement]:
    """Function defining the view requirements for training the policy.

    These go on top of the Policy's Model's own view requirements, which
    are used for the action-computing forward passes.

    Args:
        policy (Policy): The Policy that requires the returned
            ViewRequirements.

    Returns:
        Dict[str, ViewRequirement]: The Policy's view requirements.
    """
    return {
        # Next obs are needed for PPO postprocessing.
        SampleBatch.NEXT_OBS: ViewRequirement(SampleBatch.OBS, shift=1),
        # VF preds are needed for the loss.
        SampleBatch.VF_PREDS: ViewRequirement(shift=0),
        # Needed for postprocessing.
        SampleBatch.ACTION_DIST_INPUTS: ViewRequirement(shift=0),
        SampleBatch.ACTION_LOGP: ViewRequirement(shift=0),
        # Created during postprocessing.
        Postprocessing.ADVANTAGES: ViewRequirement(shift=0),
        Postprocessing.VALUE_TARGETS: ViewRequirement(shift=0),
    }
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.state_space = Box(-1.0, 1.0, (1, ))
    self.config["model"] = {"max_seq_len": 50}

    class _fake_model:
        pass

    self.model = _fake_model()
    self.model.inference_view_requirements = {
        SampleBatch.AGENT_INDEX: ViewRequirement(),
        SampleBatch.EPS_ID: ViewRequirement(),
        "env_id": ViewRequirement(),
        "t": ViewRequirement(),
        SampleBatch.OBS: ViewRequirement(),
        "state_in_0": ViewRequirement(
            "state_out_0",
            # Provide state outs -50 to -1 as "state-in".
            data_rel_pos="-50:-1",
            # Repeat the incoming state every n time steps (usually the
            # max seq len).
            batch_repeat_value=self.config["model"]["max_seq_len"],
            space=self.state_space)
    }
    self.view_requirements = dict(
        super()._get_default_view_requirements(),
        **self.model.inference_view_requirements)
def view_requirements_fn(policy: Policy) -> Dict[str, ViewRequirement]:
    """Function defining the view requirements for training/postprocessing.

    These go on top of the Policy's Model's own view requirements, which
    are used for the action-computing forward passes.

    Args:
        policy (Policy): The Policy that requires the returned
            ViewRequirements.

    Returns:
        Dict[str, ViewRequirement]: The Policy's view requirements.
    """
    ret = {
        # Next obs are needed for PPO postprocessing, but not in the loss.
        SampleBatch.NEXT_OBS: ViewRequirement(
            SampleBatch.OBS, shift=1, used_for_training=False),
        # Created during postprocessing.
        Postprocessing.ADVANTAGES: ViewRequirement(shift=0),
        Postprocessing.VALUE_TARGETS: ViewRequirement(shift=0),
    }
    return ret
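# --- Hedged summary of `shift` semantics (inferred from the snippets
# above, not from separate documentation) ---
# `shift` selects which timestep(s) of `data_col` are viewed relative to
# the current step t:
#   shift=0        -> value at t (the default)
#   shift=-1       -> value at t-1 (e.g. PREV_ACTIONS viewing ACTIONS)
#   shift=1        -> value at t+1 (e.g. NEXT_OBS viewing OBS)
#   shift="-3:0"   -> inclusive range t-3 .. t (e.g. frame stacking)
#   shift="-50:-1" -> inclusive range t-50 .. t-1 (e.g. state-in views)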