Example #1
 def _build_learn(self, env):
     seqlen = self._sample_size + self._burn_in_size
     # Explicitly instantiate tf.function to initialize variables
     TensorSpecs = dict(obs=((seqlen + 1, *env.obs_shape), env.obs_dtype,
                             'obs'),
                        action=((seqlen + 1, env.action_dim), tf.float32,
                                'action'),
                        reward=((seqlen, ), tf.float32, 'reward'),
                        mu=((seqlen + 1, ), tf.float32, 'mu'),
                        discount=((seqlen, ), tf.float32, 'discount'),
                        mask=((seqlen + 1, ), tf.float32, 'mask'))
     if self._is_per and getattr(self, '_use_is_ratio', self._is_per):
         TensorSpecs['IS_ratio'] = ((), tf.float32, 'IS_ratio')
     if self._store_state:
         state_type = type(self.model.state_size)
         TensorSpecs['state'] = state_type(
             *[((sz, ), self._dtype, name)
               for name, sz in self.model.state_size._asdict().items()])
     if self._additional_rnn_inputs:
         if 'prev_action' in self._additional_rnn_inputs:
             TensorSpecs['prev_action'] = ((seqlen, *env.action_shape),
                                           env.action_dtype, 'prev_action')
         if 'prev_reward' in self._additional_rnn_inputs:
             TensorSpecs['prev_reward'] = (
                 (seqlen, ), self._dtype, 'prev_reward'
             )  # this reward should be unnormalized
     self.learn = build(self._learn,
                        TensorSpecs,
                        batch_size=self._batch_size)
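The build helper used in all of these examples is defined elsewhere in the repository and is not shown on this page. The sketch below is only a rough assumption of what it plausibly does: each flat (shape, dtype, name) entry becomes a tf.TensorSpec with a leading batch dimension, and the learn function is traced once against that fixed signature. The nested namedtuple spec for state is ignored here for brevity.

    import tensorflow as tf

    def build(learn_fn, tensor_specs, batch_size=None):
        # Hypothetical sketch, not the repository's implementation: convert
        # each flat (shape, dtype, name) tuple into a tf.TensorSpec with a
        # leading batch dimension and trace learn_fn once against it.
        def to_spec(shape, dtype, name):
            if shape is None:  # scalar flags such as log_images in Example #8
                return tf.TensorSpec(shape=None, dtype=dtype, name=name)
            return tf.TensorSpec((batch_size, *shape), dtype=dtype, name=name)
        signature = {k: to_spec(*v) for k, v in tensor_specs.items()}
        # Tracing against fixed TensorSpecs pins the input signature, so
        # later calls with matching shapes and dtypes reuse a single graph.
        return tf.function(learn_fn).get_concrete_function(**signature)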
Example #2
 def _build_learn(self, env):
     # Explicitly instantiate tf.function to avoid unintended retracing
     TensorSpecs = dict(
         obs=((self._sample_size, *env.obs_shape), env.obs_dtype, 'obs'),
         action=((self._sample_size, *env.action_shape), env.action_dtype,
                 'action'),
         value=((self._sample_size, ), tf.float32, 'value'),
         traj_ret=((self._sample_size, ), tf.float32, 'traj_ret'),
         advantage=((self._sample_size, ), tf.float32, 'advantage'),
         logpi=((self._sample_size, ), tf.float32, 'logpi'),
         mask=((self._sample_size, ), tf.float32, 'mask'),
     )
     if self._store_state:
         state_type = type(self.model.state_size)
         TensorSpecs['state'] = state_type(
             *[((sz, ), self._dtype, name)
               for name, sz in self.model.state_size._asdict().items()])
     if self._additional_rnn_inputs:
         if 'prev_action' in self._additional_rnn_inputs:
             TensorSpecs['prev_action'] = ((self._sample_size,
                                            *env.action_shape),
                                           env.action_dtype, 'prev_action')
         if 'prev_reward' in self._additional_rnn_inputs:
             TensorSpecs['prev_reward'] = (
                 (self._sample_size, ), self._dtype, 'prev_reward'
             )  # this reward should be unnormalized
     self.learn = build(self._learn, TensorSpecs)
Example #3
 def _build_learn(self, env):
     # Explicitly instantiate tf.function to avoid unintended retracing
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         action=(env.action_shape, env.action_dtype, 'action'),
         value=((), tf.float32, 'value'),
         traj_ret=((), tf.float32, 'traj_ret'),
         advantage=((), tf.float32, 'advantage'),
         logpi=((), tf.float32, 'logpi'),
     )
     self.learn = build(self._learn, TensorSpecs)
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         action=(env.action_shape, env.action_dtype, 'action'),
         obs_exp=(env.obs_shape, env.obs_dtype, 'obs_exp'),
         action_exp=(env.action_shape, env.action_dtype, 'action_exp'),
     )
     self.learn_discriminator = build(self._learn_discriminator, TensorSpecs)
Example #4
 def _build_learn(self, env):
     # Explicitly instantiate tf.function to avoid unintended retracing
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         action=(env.action_shape, env.action_dtype, 'action'),
         advantage=((), tf.float32, 'advantage'),
         logpi=((), tf.float32, 'logpi'),
     )
     self._policy_data = ['obs', 'action', 'advantage', 'logpi']
     self.learn_policy = build(self._learn_policy,
                               TensorSpecs,
                               batch_size=self._batch_size)
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         value=((), tf.float32, 'value'),
         traj_ret=((), tf.float32, 'traj_ret'),
     )
     self._value_data = ['obs', 'value', 'traj_ret']
     self.learn_value = build(self._learn_value,
                              TensorSpecs,
                              batch_size=self._batch_size)
Example #5
 def _build_learn(self, env):
     # Explicitly instantiate tf.function to avoid unintended retracing
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         action=(env.action_shape, env.action_dtype, 'action'),
         value=((), tf.float32, 'value'),
         traj_ret=((), tf.float32, 'traj_ret'),
         advantage=((), tf.float32, 'advantage'),
         logpi=((), tf.float32, 'logpi'),
     )
     self.learn = build(self._learn,
                        TensorSpecs,
                        batch_size=self._batch_size)
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         logits=((env.action_dim, ), tf.float32, 'logits'),
         value=((), tf.float32, 'value'),
         traj_ret=((), tf.float32, 'traj_ret'),
     )
     self.aux_learn = build(self._aux_learn,
                            TensorSpecs,
                            batch_size=self._aux_batch_size)
Example #6
 def _build_learn(self, env):
     # Explicitly instantiate tf.function to avoid unintended retracing
     norm_obs_shape = env.obs_shape[:-1] + (1, )
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         obs_norm=(norm_obs_shape, tf.float32, 'obs_norm'),
         action=(env.action_shape, env.action_dtype, 'action'),
         traj_ret_int=((), tf.float32, 'traj_ret_int'),
         traj_ret_ext=((), tf.float32, 'traj_ret_ext'),
         value_int=((), tf.float32, 'value_int'),
         value_ext=((), tf.float32, 'value_ext'),
         advantage=((), tf.float32, 'advantage'),
         logpi=((), tf.float32, 'logpi'),
     )
     self.learn = build(self._learn, TensorSpecs)
Example #7
 def _build_learn(self, env):
     # Explicitly instantiate tf.function to initialize variables
     TensorSpecs = dict(
         obs=(env.obs_shape, env.obs_dtype, 'obs'),
         action=((env.action_dim, ), tf.float32, 'action'),
         reward=((), tf.float32, 'reward'),
         next_obs=(env.obs_shape, env.obs_dtype, 'next_obs'),
         discount=((), tf.float32, 'discount'),
     )
     if self._is_per and getattr(self, '_use_is_ratio', self._is_per):
         TensorSpecs['IS_ratio'] = ((), tf.float32, 'IS_ratio')
     if self._n_steps > 1:
         TensorSpecs['steps'] = ((), tf.float32, 'steps')
     self.learn = build(self._learn,
                        TensorSpecs,
                        batch_size=self._batch_size)
Example #8
    def _build_learn(self, env):
        # time dimension must be explicitly specified here
        # otherwise, InaccessibleTensorError arises when expanding rssm
        TensorSpecs = dict(
            obs=((self._sample_size, *self._obs_shape), self._dtype, 'obs'),
            prev_action=((self._sample_size, self._action_dim), self._dtype,
                         'prev_action'),
            reward=((self._sample_size, ), self._dtype, 'reward'),
            discount=((self._sample_size, ), self._dtype, 'discount'),
            log_images=(None, tf.bool, 'log_images'))
        if self._store_state:
            state_size = self.rssm.state_size
            TensorSpecs['state'] = (RSSMState(
                *[((sz, ), self._dtype, name)
                  for name, sz in zip(RSSMState._fields, state_size)]))

        self.learn = build(self._learn,
                           TensorSpecs,
                           batch_size=self._batch_size)
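Examples #1, #2, and #8 also pass a namedtuple of spec tuples for the recurrent state. How the real build handles that nesting is not shown on this page; the snippet below is only an assumed illustration of the pattern in Example #8, using a hypothetical RSSMState definition, to show that the namedtuple type can be preserved while each field size is mapped to a tf.TensorSpec.

    import collections
    import tensorflow as tf

    # Hypothetical stand-in for the repository's RSSMState namedtuple.
    RSSMState = collections.namedtuple('RSSMState', ('mean', 'std', 'stoch', 'deter'))

    def state_specs(state_size, dtype=tf.float32, batch_size=None):
        # Mirror the comprehension in Example #8: one TensorSpec per field,
        # keeping the RSSMState structure so the traced function sees a
        # nested spec rather than a flat one.
        return RSSMState(*[
            tf.TensorSpec((batch_size, sz), dtype=dtype, name=name)
            for name, sz in zip(RSSMState._fields, state_size)])

    print(state_specs(RSSMState(mean=30, std=30, stoch=30, deter=200)))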
Example #9
    def _build_learn(self, env):
        # Explicitly instantiate tf.function to avoid unintended retracing
        TensorSpecs = dict(
            obs=((self._sample_size + 1, self._n_agents, *env.obs_shape),
                 env.obs_dtype, 'obs'),
            global_state=((self._sample_size + 1, *env.shared_state_shape),
                          env.shared_state_dtype, 'global_state'),
            action_mask=((self._sample_size + 1, self._n_agents,
                          env.action_dim), tf.bool, 'action_mask'),
            episodic_mask=((self._sample_size, ), tf.float32, 'episodic_mask'),
            action=((self._sample_size, self._n_agents, env.action_dim),
                    tf.float32, 'action'),
            reward=((self._sample_size, ), tf.float32, 'reward'),
            discount=((self._sample_size, ), tf.float32, 'discount'),
        )

        self.learn = build(self._learn,
                           TensorSpecs,
                           batch_size=self._batch_size)
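Several comments above say the tf.function is instantiated explicitly to avoid unintended retracing. The self-contained example below (independent of the repository) illustrates that point: with a fixed input signature, calls with different batch sizes reuse one traced graph instead of triggering a new trace for every new shape.

    import tensorflow as tf

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 4), dtype=tf.float32)])
    def squared_sum(x):
        # The leading dimension is left as None, so any batch size matches
        # the single traced signature.
        return tf.reduce_sum(x * x)

    print(squared_sum(tf.zeros((8, 4))))   # traced here
    print(squared_sum(tf.ones((16, 4))))   # reuses the same concrete function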