Example #1
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            if len(self.layers) > 0:
                pi_h = mlp(pi_h,
                           self.layers,
                           self.activ_fn,
                           layer_norm=self.layer_norm)

            master_W, master_b = get_aggregation_var(pi_h,
                                                     name_scope='master',
                                                     n_sources=self.n_sources,
                                                     SDW=self.SDW,
                                                     n_actions=self.n_actions,
                                                     no_bias=self.no_bias)

            self.act_mu = mu_ = affine_transformation(self.sources_actions,
                                                      master_W, master_b)

            # Important difference with SAC and other algorithms such as PPO:
            # the std depends on the state, so we cannot use stable_baselines.common.distribution
            log_std = tf.layers.dense(pi_h,
                                      self.ac_space.shape[0],
                                      activation=None,
                                      name='log_std')

        # OpenAI Variation to cap the standard deviation
        # activation = tf.tanh # for log_std
        # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
        # Original Implementation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        self.std = std = tf.exp(log_std)
        # Reparameterization trick
        pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
        logp_pi = gaussian_likelihood(pi_, mu_, log_std)
        self.entropy = gaussian_entropy(log_std)
        # MISSING: reg params for log and mu
        # Apply squashing and account for it in the probability
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            mu_, pi_, logp_pi)

        if isinstance(self.ac_space, gym.spaces.Box):
            policy = tf.clip_by_value(policy, self.ac_space.low + EPS,
                                      self.ac_space.high - EPS)
            deterministic_policy = tf.clip_by_value(deterministic_policy,
                                                    self.ac_space.low + EPS,
                                                    self.ac_space.high - EPS)

        self.policy = policy
        self.deterministic_policy = deterministic_policy

        return deterministic_policy, policy, logp_pi
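
Both actor variants rely on gaussian_likelihood and gaussian_entropy from stable_baselines.sac.policies. A minimal sketch of what these helpers compute, assuming the stable-baselines implementation and a small constant EPS (e.g. 1e-6):

import numpy as np
import tensorflow as tf

EPS = 1e-6  # assumed small constant to avoid division by zero

def gaussian_likelihood(input_, mu_, log_std):
    # Log-density of a diagonal Gaussian, summed over the action dimensions
    pre_sum = -0.5 * (((input_ - mu_) / (tf.exp(log_std) + EPS)) ** 2
                      + 2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)

def gaussian_entropy(log_std):
    # Entropy of a diagonal Gaussian, summed over the action dimensions
    return tf.reduce_sum(log_std + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1)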
    def logpac(self, action):
        from stable_baselines.sac.policies import gaussian_likelihood, EPS
        act_mu = self.policy_tf.act_mu
        log_std = tf.log(self.policy_tf.std)
        # Potentially we need to clip atanh and pass gradient
        log_u = gaussian_likelihood(
            tf.atanh(tf.clip_by_value(action, -0.99, 0.99)), act_mu, log_std)
        # Correct for the tanh squashing (change of variables)
        log_ac = log_u - tf.reduce_sum(tf.log(1 - action**2 + EPS), axis=1)
        return log_ac
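
logpac inverts the tanh squashing applied in make_actor: it maps the action back with atanh, evaluates the Gaussian log-density of the pre-squash value, and subtracts the log-determinant of the tanh Jacobian. A small NumPy sanity check of that change-of-variables identity (purely illustrative, all names here are hypothetical):

import numpy as np

EPS = 1e-6
mu, log_std = 0.3, -0.5
u = np.random.randn() * np.exp(log_std) + mu          # pre-squash Gaussian sample
a = np.tanh(u)                                        # squashed action in (-1, 1)
log_u = -0.5 * (((u - mu) / np.exp(log_std)) ** 2 + 2 * log_std + np.log(2 * np.pi))
log_a = log_u - np.log(1 - a ** 2 + EPS)              # log-density of the squashed action
# log_a is what logpac reconstructs from a via atanh(clip(a)).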
Example #3
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            # if self.feature_extraction == "cnn":
            #     pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            # else:
            #     pi_h = tf.layers.flatten(obs)

            pi_h = CnnMlpFeatureExtractor(obs)

            pi_h = mlp(pi_h,
                       self.layers,
                       self.activ_fn,
                       layer_norm=self.layer_norm)

            self.act_mu = mu_ = tf.layers.dense(pi_h,
                                                self.ac_space.shape[0],
                                                activation=None)
            # Important difference with SAC and other algorithms such as PPO:
            # the std depends on the state, so we cannot use stable_baselines.common.distribution
            log_std = tf.layers.dense(pi_h,
                                      self.ac_space.shape[0],
                                      activation=None)

        # Regularize policy output (not used for now)
        # reg_loss = self.reg_weight * 0.5 * tf.reduce_mean(log_std ** 2)
        # reg_loss += self.reg_weight * 0.5 * tf.reduce_mean(mu ** 2)
        # self.reg_loss = reg_loss

        # OpenAI Variation to cap the standard deviation
        # activation = tf.tanh # for log_std
        # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
        # Original Implementation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        self.std = std = tf.exp(log_std)
        # Reparameterization trick
        pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
        logp_pi = gaussian_likelihood(pi_, mu_, log_std)
        self.entropy = gaussian_entropy(log_std)
        # MISSING: reg params for log and mu
        # Apply squashing and account for it in the probability
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            mu_, pi_, logp_pi)
        self.policy = policy
        self.deterministic_policy = deterministic_policy

        return deterministic_policy, policy, logp_pi
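
apply_squashing_func, imported in both examples, performs the tanh squashing that the clipping and the logpac correction above account for. A minimal sketch, assuming the stable-baselines SAC implementation:

import tensorflow as tf

EPS = 1e-6  # assumed small constant

def apply_squashing_func(mu_, pi_, logp_pi):
    # Squash the Gaussian mean and the reparameterized sample into (-1, 1)
    deterministic_policy = tf.tanh(mu_)
    policy = tf.tanh(pi_)
    # Correct the log-probability for the tanh change of variables
    logp_pi -= tf.reduce_sum(tf.log(1 - policy ** 2 + EPS), axis=1)
    return deterministic_policy, policy, logp_pi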