Example #1
 def test_traj_view_next_action(self):
     action_space = Discrete(2)
     rollout_worker_w_api = RolloutWorker(
         env_creator=lambda _: gym.make("CartPole-v0"),
         policy_config=ppo.DEFAULT_CONFIG,
         rollout_fragment_length=200,
         policy_spec=ppo.PPOTorchPolicy,
         policy_mapping_fn=None,
         num_envs=1,
     )
     # Add the next action to the view reqs of the policy.
     # This should then be visible in postprocessing and train batches.
     rollout_worker_w_api.policy_map["default_policy"].view_requirements[
         "next_actions"] = ViewRequirement(SampleBatch.ACTIONS,
                                           shift=1,
                                           space=action_space)
     # Make sure we have DONEs as well.
     rollout_worker_w_api.policy_map["default_policy"].view_requirements[
         "dones"] = ViewRequirement()
     batch = rollout_worker_w_api.sample()
     self.assertTrue("next_actions" in batch.data)
     expected_a_ = None  # expected next action
     for i in range(len(batch["actions"])):
         a, d, a_ = batch["actions"][i], batch["dones"][i], \
                    batch["next_actions"][i]
         if not d and expected_a_ is not None:
             check(a, expected_a_)
         elif d:
             check(a_, 0)
             expected_a_ = None
             continue
         expected_a_ = a_
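
For intuition, the "next_actions" column requested above is just the "actions" column read one step ahead, zero-padded where an episode ends (there is no next action at a done step). Below is a minimal NumPy sketch of the relationship the loop above verifies, using made-up data rather than a sampled batch:

import numpy as np

actions = np.array([0, 1, 0, 1, 1])
dones = np.array([False, False, True, False, False])

# Shift the actions column one step to the left ...
next_actions = np.append(actions[1:], 0)
# ... and zero out positions where the episode ended (no next action exists).
next_actions[dones] = 0
print(next_actions)  # [1 0 0 1 0]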
Example #2
    def _update_model_view_requirements_from_init_state(self):
        """Uses Model's (or this Policy's) init state to add needed ViewReqs.

        Can be called from within a Policy to make sure RNNs automatically
        update their internal state-related view requirements.
        Changes the `self.view_requirements` dict.
        """
        self._model_init_state_automatically_added = True
        model = getattr(self, "model", None)

        obj = model or self
        if model and not hasattr(model, "view_requirements"):
            model.view_requirements = {
                SampleBatch.OBS: ViewRequirement(space=self.observation_space)
            }
        view_reqs = obj.view_requirements
        # Add state-ins to this model's view.
        init_state = []
        if hasattr(obj, "get_initial_state") and callable(
                obj.get_initial_state):
            init_state = obj.get_initial_state()
        else:
            # Add this functionality automatically for new native model API.
            if tf and isinstance(model, tf.keras.Model) and \
                    "state_in_0" not in view_reqs:
                obj.get_initial_state = lambda: [
                    np.zeros_like(view_req.space.sample())
                    for k, view_req in model.view_requirements.items()
                    if k.startswith("state_in_")
                ]
            else:
                obj.get_initial_state = lambda: []
                if "state_in_0" in view_reqs:
                    self.is_recurrent = lambda: True

        # Make sure auto-generated init-state view requirements get added
        # to both Policy and Model, no matter what.
        view_reqs = [view_reqs] + ([self.view_requirements] if hasattr(
            self, "view_requirements") else [])

        for i, state in enumerate(init_state):
            # Allow `state` to be either a Space (use zeros as initial values)
            # or any value (e.g. a dict or a non-zero tensor).
            fw = np if isinstance(state, np.ndarray) else torch if \
                torch and torch.is_tensor(state) else None
            if fw:
                space = Box(-1.0, 1.0, shape=state.shape) if \
                    fw.all(state == 0.0) else state
            else:
                space = state
            for vr in view_reqs:
                vr["state_in_{}".format(i)] = ViewRequirement(
                    "state_out_{}".format(i),
                    shift=-1,
                    used_for_compute_actions=True,
                    batch_repeat_value=self.config.get("model", {}).get(
                        "max_seq_len", 1),
                    space=space)
                vr["state_out_{}".format(i)] = ViewRequirement(
                    space=space, used_for_training=True)
Example #3
    def __init__(self, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space, num_outputs: int,
                 model_config: ModelConfigDict, name: str):

        super(LSTMWrapper, self).__init__(obs_space, action_space, None,
                                          model_config, name)

        self.cell_size = model_config["lstm_cell_size"]
        self.use_prev_action_reward = model_config[
            "lstm_use_prev_action_reward"]
        if action_space.shape is not None:
            self.action_dim = int(np.product(action_space.shape))
        else:
            self.action_dim = int(len(action_space))
        # Add prev-action/reward nodes to input to LSTM.
        if self.use_prev_action_reward:
            self.num_outputs += 1 + self.action_dim

        # Define input layers.
        input_layer = tf.keras.layers.Input(
            shape=(None, self.num_outputs), name="inputs")

        self.num_outputs = num_outputs

        state_in_h = tf.keras.layers.Input(shape=(self.cell_size, ), name="h")
        state_in_c = tf.keras.layers.Input(shape=(self.cell_size, ), name="c")
        seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)

        # Preprocess observation with a hidden layer and send to LSTM cell
        lstm_out, state_h, state_c = tf.keras.layers.LSTM(
            self.cell_size,
            return_sequences=True,
            return_state=True,
            name="lstm")(
                inputs=input_layer,
                mask=tf.sequence_mask(seq_in),
                initial_state=[state_in_h, state_in_c])

        # Postprocess LSTM output with another hidden layer and compute values
        logits = tf.keras.layers.Dense(
            self.num_outputs,
            activation=tf.keras.activations.linear,
            name="logits")(lstm_out)
        values = tf.keras.layers.Dense(
            1, activation=None, name="values")(lstm_out)

        # Create the RNN model
        self._rnn_model = tf.keras.Model(
            inputs=[input_layer, seq_in, state_in_h, state_in_c],
            outputs=[logits, values, state_h, state_c])
        self.register_variables(self._rnn_model.variables)
        self._rnn_model.summary()

        # Add prev-a/r to this model's view, if required.
        if model_config["lstm_use_prev_action_reward"]:
            self.inference_view_requirements[SampleBatch.PREV_REWARDS] = \
                ViewRequirement(SampleBatch.REWARDS, shift=-1)
            self.inference_view_requirements[SampleBatch.PREV_ACTIONS] = \
                ViewRequirement(SampleBatch.ACTIONS, space=self.action_space,
                                shift=-1)
Example #4
 def __init__(self, observation_space, action_space, model_config, *args,
              **kwargs):
     super().__init__(observation_space, action_space, model_config, *args,
                      **kwargs)
     self.observation_space = observation_space
     self.action_space = action_space
     self.state_size = 10
     self.model_config = model_config or {}
     space = Box(low=-np.inf,
                 high=np.inf,
                 shape=(self.state_size, ),
                 dtype=np.float64)
     # Set view requirements such that the policy state is held in
     # memory for 2 environment steps.
     self.view_requirements["state_in_0"] = ViewRequirement(
         "state_out_0",
         shift="-2:-1",
         used_for_training=False,
         used_for_compute_actions=True,
         batch_repeat_value=1,
     )
     self.view_requirements["state_out_0"] = ViewRequirement(
         space=space,
         used_for_training=False,
         used_for_compute_actions=True,
         batch_repeat_value=1,
     )
Example #5
    def test_view_requirement_connector(self):
        view_requirements = {
            "obs":
            ViewRequirement(used_for_training=True,
                            used_for_compute_actions=True),
            "prev_actions":
            ViewRequirement(
                data_col="actions",
                shift=-1,
                used_for_training=True,
                used_for_compute_actions=True,
            ),
        }
        ctx = ConnectorContext(view_requirements=view_requirements)

        c = ViewRequirementAgentConnector(ctx)
        f = FlattenDataAgentConnector(ctx)

        d = AgentConnectorDataType(
            0,
            1,
            {
                SampleBatch.NEXT_OBS: {
                    "sensor1": [[1, 1], [2, 2]],
                    "sensor2": 8.8,
                },
                SampleBatch.ACTIONS: np.array(0),
            },
        )
        # ViewRequirementAgentConnector then FlattenAgentConnector.
        processed = f(c([d]))

        self.assertTrue("obs" in processed[0].data.for_action)
        self.assertTrue("prev_actions" in processed[0].data.for_action)
Example #6
    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 num_frames=3):
        super(FrameStackingCartPoleModel,
              self).__init__(obs_space, action_space, None, model_config, name)

        self.num_frames = num_frames
        self.num_outputs = num_outputs

        # Construct actual (very simple) FC model.
        assert len(obs_space.shape) == 1
        input_ = tf.keras.layers.Input(shape=(self.num_frames,
                                              obs_space.shape[0]))
        reshaped = tf.keras.layers.Reshape(
            [obs_space.shape[0] * self.num_frames])(input_)
        layer1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)(reshaped)
        out = tf.keras.layers.Dense(self.num_outputs)(layer1)
        values = tf.keras.layers.Dense(1)(layer1)
        self.base_model = tf.keras.models.Model([input_], [out, values])
        self.register_variables(self.base_model.variables)

        self._last_value = None

        self.view_requirements["prev_n_obs"] = ViewRequirement(
            data_col="obs",
            shift="-{}:0".format(num_frames - 1),
            space=obs_space)
        self.view_requirements["prev_rewards"] = ViewRequirement(
            data_col="rewards", shift=-1)
Example #7
    def _update_model_view_requirements_from_init_state(self):
        """Uses Model's (or this Policy's) init state to add needed ViewReqs.

        Can be called from within a Policy to make sure RNNs automatically
        update their internal state-related view requirements.
        Changes the `self.view_requirements` dict.
        """
        self._model_init_state_automatically_added = True
        model = getattr(self, "model", None)
        obj = model or self
        if model and not hasattr(model, "view_requirements"):
            model.view_requirements = {
                SampleBatch.OBS: ViewRequirement(space=self.observation_space)
            }
        view_reqs = obj.view_requirements
        # Add state-ins to this model's view.
        init_state = []
        if hasattr(obj, "get_initial_state") and callable(
                obj.get_initial_state):
            init_state = obj.get_initial_state()
        else:
            obj.get_initial_state = lambda: []
        for i, state in enumerate(init_state):
            space = Box(-1.0, 1.0, shape=state.shape) if \
                hasattr(state, "shape") else state
            view_reqs["state_in_{}".format(i)] = ViewRequirement(
                "state_out_{}".format(i),
                shift=-1,
                used_for_compute_actions=True,
                batch_repeat_value=self.config.get("model",
                                                   {}).get("max_seq_len", 1),
                space=space)
            view_reqs["state_out_{}".format(i)] = ViewRequirement(
                space=space, used_for_training=True)
Example #8
    def __init__(self,
                 obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 num_outputs: Optional[int],
                 model_config: ModelConfigDict,
                 name: str,
                 policy_model_config: ModelConfigDict = None,
                 q_model_config: ModelConfigDict = None,
                 twin_q: bool = False,
                 initial_alpha: float = 1.0,
                 target_entropy: Optional[float] = None):
        super().__init__(obs_space=obs_space,
                         action_space=action_space,
                         num_outputs=num_outputs,
                         model_config=model_config,
                         name=name,
                         policy_model_config=policy_model_config,
                         q_model_config=q_model_config,
                         twin_q=twin_q,
                         initial_alpha=initial_alpha,
                         target_entropy=target_entropy)
        self.use_prev_action = (model_config["lstm_use_prev_action"]
                                or policy_model_config["lstm_use_prev_action"]
                                or q_model_config["lstm_use_prev_action"])

        self.use_prev_reward = (model_config["lstm_use_prev_reward"]
                                or policy_model_config["lstm_use_prev_reward"]
                                or q_model_config["lstm_use_prev_reward"])
        if self.use_prev_action:
            self.view_requirements[SampleBatch.PREV_ACTIONS] = \
                ViewRequirement(SampleBatch.ACTIONS, space=self.action_space,
                                shift=-1)
        if self.use_prev_reward:
            self.view_requirements[SampleBatch.PREV_REWARDS] = \
                ViewRequirement(SampleBatch.REWARDS, shift=-1)
Example #9
    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 num_frames=3):
        nn.Module.__init__(self)
        super(TorchFrameStackingCartPoleModel,
              self).__init__(obs_space, action_space, None, model_config, name)

        self.num_frames = num_frames
        self.num_outputs = num_outputs

        # Construct actual (very simple) FC model.
        assert len(obs_space.shape) == 1
        self.layer1 = SlimFC(in_size=obs_space.shape[0] * self.num_frames,
                             out_size=64,
                             activation_fn="relu")
        self.out = SlimFC(in_size=64,
                          out_size=self.num_outputs,
                          activation_fn="linear")
        self.values = SlimFC(in_size=64, out_size=1, activation_fn="linear")

        self._last_value = None

        self.view_requirements["prev_n_obs"] = ViewRequirement(
            data_col="obs",
            shift="-{}:0".format(num_frames - 1),
            space=obs_space)
        self.view_requirements["prev_rewards"] = ViewRequirement(
            data_col="rewards", shift=-1)
Example #10
    def _get_default_view_requirements(self):
        """Returns a default ViewRequirements dict.

        Note: This is the base/maximum requirement dict, from which later
        some requirements will be subtracted again automatically to streamline
        data collection, batch creation, and data transfer.

        Returns:
            ViewReqDict: The default view requirements dict.
        """

        # Default view requirements (equal to those that we would use before
        # the trajectory view API was introduced).
        return {
            SampleBatch.OBS: ViewRequirement(space=self.observation_space),
            SampleBatch.NEXT_OBS: ViewRequirement(
                data_col=SampleBatch.OBS,
                shift=1,
                space=self.observation_space),
            SampleBatch.ACTIONS: ViewRequirement(space=self.action_space),
            SampleBatch.REWARDS: ViewRequirement(),
            SampleBatch.DONES: ViewRequirement(),
            SampleBatch.INFOS: ViewRequirement(),
            SampleBatch.EPS_ID: ViewRequirement(),
            SampleBatch.AGENT_INDEX: ViewRequirement(),
            SampleBatch.UNROLL_ID: ViewRequirement(),
            "t": ViewRequirement(),
        }
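
A policy typically merges these defaults with its model's own view requirements, letting the model-specific entries win on key collisions (Example #29 further below uses exactly this pattern). A minimal standalone sketch of that merge with literal dicts:

from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.view_requirement import ViewRequirement

defaults = {
    SampleBatch.OBS: ViewRequirement(),
    SampleBatch.REWARDS: ViewRequirement(),
}
model_extras = {
    "state_in_0": ViewRequirement("state_out_0", shift=-1),
}
# Model-specific entries override the defaults where keys collide.
view_requirements = dict(defaults, **model_extras)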
Example #11
    def test_traj_view_next_action(self):
        action_space = Discrete(2)
        rollout_worker_w_api = RolloutWorker(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_config=ppo.DEFAULT_CONFIG,
            rollout_fragment_length=200,
            policy_spec=ppo.PPOTorchPolicy,
            policy_mapping_fn=None,
            num_envs=1,
        )
        # Add the next action (a') and 2nd next action (a'') to the view
        # requirements of the policy.
        # This should then be visible in postprocessing and train batches.
        # Switch these off for action computations (they can't be used there,
        # since the next actions aren't known yet at action-computation time).
        rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
            "next_actions"] = ViewRequirement(
                SampleBatch.ACTIONS,
                shift=1,
                space=action_space,
                used_for_compute_actions=False,
            )
        rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
            "2nd_next_actions"] = ViewRequirement(
                SampleBatch.ACTIONS,
                shift=2,
                space=action_space,
                used_for_compute_actions=False,
            )

        # Make sure we have DONEs as well.
        rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
            "dones"] = ViewRequirement()
        batch = rollout_worker_w_api.sample()
        self.assertTrue("next_actions" in batch)
        self.assertTrue("2nd_next_actions" in batch)
        expected_a_ = None  # expected next action
        expected_a__ = None  # expected 2nd next action
        for i in range(len(batch["actions"])):
            a, d, a_, a__ = (
                batch["actions"][i],
                batch["dones"][i],
                batch["next_actions"][i],
                batch["2nd_next_actions"][i],
            )
            # Episode done: next action and 2nd next action should be 0.
            if d:
                check(a_, 0)
                check(a__, 0)
                expected_a_ = None
                expected_a__ = None
                continue
            # Episode is not done and we have an expected next-a.
            if expected_a_ is not None:
                check(a, expected_a_)
            if expected_a__ is not None:
                check(a_, expected_a__)
            expected_a__ = a__
            expected_a_ = a_
Example #12
def training_view_requirements_fn(policy):
    return {
        # Next obs are needed for PPO postprocessing.
        SampleBatch.NEXT_OBS:
        ViewRequirement(SampleBatch.OBS, shift=1),
        # VF preds are needed for the loss.
        SampleBatch.VF_PREDS:
        ViewRequirement(shift=0),
    }
Example #13
 def training_view_requirements(self):
     if hasattr(self, "view_requirements"):
         return self.view_requirements
     return {
         SampleBatch.ACTIONS: ViewRequirement(space=self.action_space,
                                              shift=0),
         SampleBatch.REWARDS: ViewRequirement(shift=0),
         SampleBatch.DONES: ViewRequirement(shift=0),
     }
Example #14
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.episode_id = None
        self.env_id = None

        class _fake_model:
            pass

        self.model = _fake_model()
        self.model.time_major = True
        self.model.inference_view_requirements = {
            SampleBatch.EPS_ID:
            ViewRequirement(),
            "env_id":
            ViewRequirement(),
            SampleBatch.OBS:
            ViewRequirement(),
            SampleBatch.PREV_ACTIONS:
            ViewRequirement(SampleBatch.ACTIONS,
                            space=self.action_space,
                            shift=-1),
            SampleBatch.PREV_REWARDS:
            ViewRequirement(SampleBatch.REWARDS, shift=-1),
        }
        self.training_view_requirements = dict(
            **{
                SampleBatch.NEXT_OBS: ViewRequirement(SampleBatch.OBS,
                                                      shift=1),
                SampleBatch.ACTIONS: ViewRequirement(space=self.action_space),
                SampleBatch.REWARDS: ViewRequirement(),
                SampleBatch.DONES: ViewRequirement(),
            }, **self.model.inference_view_requirements)
Example #15
 def inference_view_requirements(self) -> Dict[str, ViewRequirement]:
     req = super().inference_view_requirements()
     # Optional: prev-actions/rewards for forward pass.
     if self.model_config["lstm_use_prev_action_reward"]:
         req.update({
             SampleBatch.PREV_REWARDS: ViewRequirement(
                 SampleBatch.REWARDS, shift=-1),
             SampleBatch.PREV_ACTIONS: ViewRequirement(
                 SampleBatch.ACTIONS, space=self.action_space, shift=-1),
         })
     return req
Example #16
    def __init__(self, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space, num_outputs: int,
                 model_config: ModelConfigDict, name: str):

        nn.Module.__init__(self)
        super().__init__(obs_space, action_space, None, model_config, name)

        self.cell_size = model_config["lstm_cell_size"]
        self.time_major = model_config.get("_time_major", False)
        self.use_prev_action = model_config["lstm_use_prev_action"]
        self.use_prev_reward = model_config["lstm_use_prev_reward"]

        if isinstance(action_space, Discrete):
            self.action_dim = action_space.n
        elif isinstance(action_space, MultiDiscrete):
            self.action_dim = np.product(action_space.nvec)
        elif action_space.shape is not None:
            self.action_dim = int(np.product(action_space.shape))
        else:
            self.action_dim = int(len(action_space))

        # Add prev-action/reward nodes to input to LSTM.
        if self.use_prev_action:
            self.num_outputs += self.action_dim
        if self.use_prev_reward:
            self.num_outputs += 1

        self.lstm = nn.LSTM(self.num_outputs,
                            self.cell_size,
                            batch_first=not self.time_major)

        self.num_outputs = num_outputs

        # Postprocess LSTM output with another hidden layer and compute values.
        self._logits_branch = SlimFC(in_size=self.cell_size,
                                     out_size=self.num_outputs,
                                     activation_fn=None,
                                     initializer=torch.nn.init.xavier_uniform_)
        self._value_branch = SlimFC(in_size=self.cell_size,
                                    out_size=1,
                                    activation_fn=None,
                                    initializer=torch.nn.init.xavier_uniform_)

        # Add prev-a/r to this model's view, if required.
        if model_config["lstm_use_prev_action"]:
            self.inference_view_requirements[SampleBatch.PREV_ACTIONS] = \
                ViewRequirement(SampleBatch.ACTIONS, space=self.action_space,
                                data_rel_pos=-1)
        if model_config["lstm_use_prev_reward"]:
            self.inference_view_requirements[SampleBatch.PREV_REWARDS] = \
                ViewRequirement(SampleBatch.REWARDS, data_rel_pos=-1)
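        # NOTE: "data_rel_pos" here plays the role of the "shift" argument used
        # in the other examples on this page (same semantics: a relative time
        # offset, or range of offsets, into the source data column).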
Example #17
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):

        nn.Module.__init__(self)
        super().__init__(obs_space, action_space, None, model_config, name)

        self.cell_size = model_config["lstm_cell_size"]
        self.time_major = model_config.get("_time_major", False)
        self.use_prev_action_reward = model_config[
            "lstm_use_prev_action_reward"]
        self.action_dim = int(np.product(action_space.shape))
        # Add prev-action/reward nodes to input to LSTM.
        if self.use_prev_action_reward:
            self.num_outputs += 1 + self.action_dim
        self.lstm = nn.LSTM(self.num_outputs,
                            self.cell_size,
                            batch_first=not self.time_major)

        self.num_outputs = num_outputs

        # Postprocess LSTM output with another hidden layer and compute values.
        self._logits_branch = SlimFC(in_size=self.cell_size,
                                     out_size=self.num_outputs,
                                     activation_fn=None,
                                     initializer=torch.nn.init.xavier_uniform_)
        self._value_branch = SlimFC(in_size=self.cell_size,
                                    out_size=1,
                                    activation_fn=None,
                                    initializer=torch.nn.init.xavier_uniform_)

        self.inference_view_requirements.update(
            dict(
                **{
                    SampleBatch.OBS:
                    ViewRequirement(shift=0),
                    SampleBatch.PREV_REWARDS:
                    ViewRequirement(SampleBatch.REWARDS, shift=-1),
                    SampleBatch.PREV_ACTIONS:
                    ViewRequirement(SampleBatch.ACTIONS,
                                    space=self.action_space,
                                    shift=-1),
                }))
        for i in range(2):
            self.inference_view_requirements["state_in_{}".format(i)] = \
                ViewRequirement(
                    "state_out_{}".format(i),
                    shift=-1,
                    space=Box(-1.0, 1.0, shape=(self.cell_size,)))
            self.inference_view_requirements["state_out_{}".format(i)] = \
                ViewRequirement(
                    space=Box(-1.0, 1.0, shape=(self.cell_size,)))
Example #18
def ppo_init(policy: Policy, obs_space: gym.spaces.Space,
             action_space: gym.spaces.Space,
             config: TrainerConfigDict) -> None:
    """ TODO: Write documentation.
    """
    # Call base implementation
    setup_mixins(policy, obs_space, action_space, config)

    # Add previous observation to the view requirements for CAPS loss computation
    # TODO: Remove update of `policy.model.view_requirements` after ray fix
    caps_view_requirements = {
        "_prev_obs":
        ViewRequirement(data_col="obs",
                        space=obs_space,
                        shift=-1,
                        used_for_compute_actions=False)
    }
    policy.model.view_requirements.update(caps_view_requirements)
    policy.view_requirements.update(caps_view_requirements)

    # Initialize extra loss
    policy._mean_symmetric_policy_loss = 0.0
    policy._mean_temporal_caps_loss = 0.0
    policy._mean_spatial_caps_loss = 0.0
    policy._mean_global_caps_loss = 0.0
Example #19
    def test_pad_batch_fixed_max(self):
        """Test pad_batch_to_sequences_of_same_size when dynamic_max = False"""
        view_requirements = {
            "state_in_0":
            ViewRequirement(
                "state_out_0",
                shift="-3:-1",
                used_for_training=False,
                used_for_compute_actions=True,
                batch_repeat_value=1,
            )
        }
        max_seq_len = 20
        num_seqs = np.random.randint(1, 20)
        seq_lens = np.random.randint(1, max_seq_len, size=(num_seqs))
        sum_seq_lens = np.sum(seq_lens)
        s1 = SampleBatch(
            {
                "a": np.arange(sum_seq_lens),
                "b": np.arange(sum_seq_lens),
                "seq_lens": seq_lens,
                "state_in_0": [[0]] * num_seqs,
            },
            _max_seq_len=max_seq_len,
        )

        pad_batch_to_sequences_of_same_size(
            s1,
            max_seq_len=max_seq_len,
            feature_keys=["a", "b"],
            view_requirements=view_requirements,
        )
        check(s1.max_seq_len, max_seq_len)
        check(s1["a"].shape[0], max_seq_len * num_seqs)
        check(s1["b"].shape[0], max_seq_len * num_seqs)
Example #20
    def __init__(self, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space, num_outputs: int,
                 model_config: ModelConfigDict, name: str, framework: str):
        """Initializes a ModelV2 object.

        This method should create any variables used by the model.

        Args:
            obs_space (gym.spaces.Space): Observation space of the target gym
                env. This may have an `original_space` attribute that
                specifies how to unflatten the tensor into a ragged tensor.
            action_space (gym.spaces.Space): Action space of the target gym
                env.
            num_outputs (int): Number of output units of the model.
            model_config (ModelConfigDict): Config for the model, documented
                in ModelCatalog.
            name (str): Name (scope) for the model.
            framework (str): Either "tf" or "torch".
        """

        self.obs_space: gym.spaces.Space = obs_space
        self.action_space: gym.spaces.Space = action_space
        self.num_outputs: int = num_outputs
        self.model_config: ModelConfigDict = model_config
        self.name: str = name or "default_model"
        self.framework: str = framework
        self._last_output = None
        self.time_major = self.model_config.get("_time_major")
        # Basic view requirement for all models: Use the observation as input.
        self.inference_view_requirements = {
            SampleBatch.OBS: ViewRequirement(data_rel_pos=0,
                                             space=self.obs_space),
        }
Example #21
    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 fc_size=64,
                 lstm_state_size=256):
        nn.Module.__init__(self)
        super().__init__(obs_space, action_space, num_outputs, model_config,
                         name)

        self.obs_size = get_preprocessor(obs_space)(obs_space).size
        self.fc_size = fc_size
        self.lstm_state_size = lstm_state_size

        # Build the Module from fc + LSTM + 2xfc (action + value outs).
        self.fc1 = nn.Linear(self.obs_size, self.fc_size)
        self.lstm = nn.LSTM(
            self.fc_size, self.lstm_state_size, batch_first=True)
        self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
        self.value_branch = nn.Linear(self.lstm_state_size, 1)
        # Holds the current "base" output (before logits layer).
        self._features = None

        # Add state-ins to this model's view.
        for i in range(2):
            self.inference_view_requirements["state_in_{}".format(i)] = \
                ViewRequirement(
                    "state_out_{}".format(i),
                    shift=-1,
                    space=Box(-1.0, 1.0, shape=(self.lstm_state_size,)))
Example #22
    def _update_model_inference_view_requirements_from_init_state(self):
        """Uses this Model's initial state to auto-add necessary ViewReqs.

        Can be called from within a Policy to make sure RNNs automatically
        update their internal state-related view requirements.
        Changes the `self.inference_view_requirements` dict.
        """
        model = self.model
        # Add state-ins to this model's view.
        for i, state in enumerate(model.get_initial_state()):
            model.inference_view_requirements["state_in_{}".format(i)] = \
                ViewRequirement(
                    "state_out_{}".format(i),
                    shift=-1,
                    space=Box(-1.0, 1.0, shape=state.shape))
            model.inference_view_requirements["state_out_{}".format(i)] = \
                ViewRequirement(space=Box(-1.0, 1.0, shape=state.shape))
Example #23
    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 num_frames=3):
        super(FrameStackingCartPoleModel,
              self).__init__(obs_space, action_space, None, model_config, name)

        self.num_frames = num_frames
        self.num_outputs = num_outputs

        # Construct actual (very simple) FC model.
        assert len(obs_space.shape) == 1
        obs = tf.keras.layers.Input(shape=(self.num_frames,
                                           obs_space.shape[0]))
        obs_reshaped = tf.keras.layers.Reshape(
            [obs_space.shape[0] * self.num_frames])(obs)
        rewards = tf.keras.layers.Input(shape=(self.num_frames, ))
        rewards_reshaped = tf.keras.layers.Reshape([self.num_frames])(rewards)
        actions = tf.keras.layers.Input(shape=(self.num_frames,
                                               self.action_space.n))
        actions_reshaped = tf.keras.layers.Reshape(
            [action_space.n * self.num_frames])(actions)
        input_ = tf.keras.layers.Concatenate(axis=-1)(
            [obs_reshaped, actions_reshaped, rewards_reshaped])
        layer1 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(input_)
        layer2 = tf.keras.layers.Dense(256, activation=tf.nn.relu)(layer1)
        out = tf.keras.layers.Dense(self.num_outputs)(layer2)
        values = tf.keras.layers.Dense(1)(layer1)
        self.base_model = tf.keras.models.Model([obs, actions, rewards],
                                                [out, values])
        self._last_value = None

        self.view_requirements["prev_n_obs"] = ViewRequirement(
            data_col="obs",
            shift="-{}:0".format(num_frames - 1),
            space=obs_space)
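        # Note on range shifts: "-(n-1):0" (as above) includes the current time
        # step, while "-n:-1" (as below) covers only the n previous steps.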
        self.view_requirements["prev_n_rewards"] = ViewRequirement(
            data_col="rewards", shift="-{}:-1".format(self.num_frames))
        self.view_requirements["prev_n_actions"] = ViewRequirement(
            data_col="actions",
            shift="-{}:-1".format(self.num_frames),
            space=self.action_space,
        )
Example #24
    def _update_model_view_requirements_from_init_state(self):
        """Uses Model's (or this Policy's) init state to add needed ViewReqs.

        Can be called from within a Policy to make sure RNNs automatically
        update their internal state-related view requirements.
        Changes the `self.view_requirements` dict.
        """
        self._model_init_state_automatically_added = True
        model = getattr(self, "model", None)

        obj = model or self
        if model and not hasattr(model, "view_requirements"):
            model.view_requirements = {
                SampleBatch.OBS: ViewRequirement(space=self.observation_space)
            }
        view_reqs = obj.view_requirements
        # Add state-ins to this model's view.
        init_state = []
        if hasattr(obj, "get_initial_state") and callable(
                obj.get_initial_state):
            init_state = obj.get_initial_state()
        else:
            # Add this functionality automatically for new native model API.
            if tf and isinstance(model, tf.keras.Model) and \
                    "state_in_0" not in view_reqs:
                obj.get_initial_state = lambda: [
                    np.zeros_like(view_req.space.sample())
                    for k, view_req in model.view_requirements.items()
                    if k.startswith("state_in_")
                ]
            else:
                obj.get_initial_state = lambda: []
                if "state_in_0" in view_reqs:
                    self.is_recurrent = lambda: True
        for i, state in enumerate(init_state):
            space = Box(-1.0, 1.0, shape=state.shape) if \
                hasattr(state, "shape") else state
            view_reqs["state_in_{}".format(i)] = ViewRequirement(
                "state_out_{}".format(i),
                shift=-1,
                used_for_compute_actions=True,
                batch_repeat_value=self.config.get("model",
                                                   {}).get("max_seq_len", 1),
                space=space)
            view_reqs["state_out_{}".format(i)] = ViewRequirement(
                space=space, used_for_training=True)
Example #25
def view_requirements_fn(policy: Policy) -> Dict[str, ViewRequirement]:
    """Function defining the view requirements for training/postprocessing.

    These go on top of the Policy's Model's own view requirements used for
    the action computing forward passes.

    Args:
        policy (Policy): The Policy that requires the returned
            ViewRequirements.

    Returns:
        Dict[str, ViewRequirement]: The Policy's view requirements.
    """
    ret = {
        # Next obs are needed for PPO postprocessing, but not in loss.
        SampleBatch.NEXT_OBS: ViewRequirement(
            SampleBatch.OBS, shift=1, used_for_training=False),
        # Created during postprocessing.
        Postprocessing.ADVANTAGES: ViewRequirement(shift=0),
        Postprocessing.VALUE_TARGETS: ViewRequirement(shift=0),
        # Needed for PPO's loss function.
        SampleBatch.ACTION_DIST_INPUTS: ViewRequirement(shift=0),
        SampleBatch.ACTION_LOGP: ViewRequirement(shift=0),
        SampleBatch.VF_PREDS: ViewRequirement(shift=0),
    }
    # If policy is recurrent, have to add state_out for PPO postprocessing
    # (calculating GAE from next-obs and last state-out).
    if policy.is_recurrent():
        init_state = policy.get_initial_state()
        for i, s in enumerate(init_state):
            ret["state_out_{}".format(i)] = ViewRequirement(
                space=gym.spaces.Box(-1.0, 1.0, shape=(s.shape[0], )),
                used_for_training=False)
    return ret
Example #26
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.exploration = self._create_exploration()
     self.view_requirements.update({
         "state_in_0": ViewRequirement(
             "state_out_0",
             shift=-1,
             space=gym.spaces.Box(0, 100, shape=(), dtype=np.int32))
     })
Example #27
    def _update_model_inference_view_requirements_from_init_state(self):
        """Uses Model's (or this Policy's) init state to add needed ViewReqs.

        Can be called from within a Policy to make sure RNNs automatically
        update their internal state-related view requirements.
        Changes the `self.inference_view_requirements` dict.
        """
        self._model_init_state_automatically_added = True
        model = getattr(self, "model", None)
        obj = model or self
        # Add state-ins to this model's view.
        for i, state in enumerate(obj.get_initial_state()):
            space = Box(-1.0, 1.0, shape=state.shape) if \
                hasattr(state, "shape") else state
            view_reqs = model.inference_view_requirements if model else \
                self.view_requirements
            view_reqs["state_in_{}".format(i)] = ViewRequirement(
                "state_out_{}".format(i), shift=-1, space=space)
            view_reqs["state_out_{}".format(i)] = ViewRequirement(space=space)
Example #28
def training_view_requirements_fn(
        policy: Policy) -> Dict[str, ViewRequirement]:
    """Function defining the view requirements for training the policy.

    These go on top of the Policy's Model's own view requirements used for
    action computing forward passes.

    Args:
        policy (Policy): The Policy that requires the returned
            ViewRequirements.

    Returns:
        Dict[str, ViewRequirement]: The Policy's view requirements.
    """
    return {
        # Next obs are needed for PPO postprocessing.
        SampleBatch.NEXT_OBS: ViewRequirement(SampleBatch.OBS, shift=1),
        # VF preds are needed for the loss.
        SampleBatch.VF_PREDS: ViewRequirement(shift=0),
        # Needed for postprocessing.
        SampleBatch.ACTION_DIST_INPUTS: ViewRequirement(shift=0),
        SampleBatch.ACTION_LOGP: ViewRequirement(shift=0),
        # Created during postprocessing.
        Postprocessing.ADVANTAGES: ViewRequirement(shift=0),
        Postprocessing.VALUE_TARGETS: ViewRequirement(shift=0),
    }
Example #29
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state_space = Box(-1.0, 1.0, (1, ))
        self.config["model"] = {"max_seq_len": 50}

        class _fake_model:
            pass

        self.model = _fake_model()
        self.model.inference_view_requirements = {
            SampleBatch.AGENT_INDEX:
            ViewRequirement(),
            SampleBatch.EPS_ID:
            ViewRequirement(),
            "env_id":
            ViewRequirement(),
            "t":
            ViewRequirement(),
            SampleBatch.OBS:
            ViewRequirement(),
            "state_in_0":
            ViewRequirement(
                "state_out_0",
                # Provide state outs -50 to -1 as "state-in".
                data_rel_pos="-50:-1",
                # Repeat the incoming state every n time steps (usually max seq
                # len).
                batch_repeat_value=self.config["model"]["max_seq_len"],
                space=self.state_space)
        }

        self.view_requirements = dict(super()._get_default_view_requirements(),
                                      **self.model.inference_view_requirements)
Example #30
def view_requirements_fn(policy: Policy) -> Dict[str, ViewRequirement]:
    """Function defining the view requirements for training/postprocessing.

    These go on top of the Policy's Model's own view requirements used for
    the action computing forward passes.

    Args:
        policy (Policy): The Policy that requires the returned
            ViewRequirements.

    Returns:
        Dict[str, ViewRequirement]: The Policy's view requirements.
    """
    ret = {
        SampleBatch.NEXT_OBS:
        ViewRequirement(SampleBatch.OBS, shift=1, used_for_training=False),
        Postprocessing.ADVANTAGES:
        ViewRequirement(shift=0),
        Postprocessing.VALUE_TARGETS:
        ViewRequirement(shift=0),
    }
    return ret