Code Example #1
File: box.py  Project: Mee321/HAPG_exp
    def new_tensor_variable(self, name, extra_dims):
        """
        Create a tensor variable in Theano.

        :param name: name of the variable
        :param extra_dims: extra dimensions to be prepended
        :return: the created tensor variable
        """
        return tensor_utils.new_tensor(name=name,
                                       ndim=extra_dims + 1,
                                       dtype=theano.config.floatX)
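For reference, a minimal stand-alone sketch of what a call like the one above yields, assuming tensor_utils.new_tensor simply builds a plain Theano variable of the requested ndim and dtype (the helper defined here is illustrative, not the garage API):

import theano
import theano.tensor as TT

# Illustrative stand-in for tensor_utils.new_tensor (assumption: it creates a
# Theano variable with the given name, number of dimensions, and dtype).
def new_tensor(name, ndim, dtype):
    return TT.TensorType(dtype=dtype, broadcastable=(False,) * ndim)(name)

# A Box space with extra_dims=1 therefore yields a 2-D float matrix:
obs_var = new_tensor('observations', ndim=2, dtype=theano.config.floatX)
print(obs_var.ndim, obs_var.dtype)  # 2, float32/float64 depending on floatX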
Code Example #2
File: discrete.py  Project: Mee321/HAPG_exp
    def new_tensor_variable(self, name, extra_dims):
        """
        Create a tensor variable in Theano.

        :param name: name of the variable
        :param extra_dims: extra dimensions to be prepended
        :return: the created tensor variable
        """
        if self.n <= 2**8:
            return tensor_utils.new_tensor(name=name,
                                           ndim=extra_dims + 1,
                                           dtype='uint8')
        elif self.n <= 2**16:
            return tensor_utils.new_tensor(name=name,
                                           ndim=extra_dims + 1,
                                           dtype='uint16')
        else:
            return tensor_utils.new_tensor(name=name,
                                           ndim=extra_dims + 1,
                                           dtype='uint32')
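The branching above just picks the narrowest unsigned integer dtype able to hold action indices 0..n-1; a stand-alone sketch of that selection rule (the helper name is illustrative):

def index_dtype(n):
    """Pick the smallest unsigned int dtype able to represent 0..n-1."""
    if n <= 2**8:
        return 'uint8'
    elif n <= 2**16:
        return 'uint16'
    return 'uint32'

print(index_dtype(6))       # 'uint8'  -> e.g. a 6-action discrete space
print(index_dtype(70000))   # 'uint32'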
Code Example #3
File: capg_corrected.py  Project: Mee321/HAPG_exp
    def init_opt(self):
        observations_var = self.env.observation_space.new_tensor_variable(
            'observations', extra_dims=1)
        actions_var = self.env.action_space.new_tensor_variable('actions',
                                                                extra_dims=1)
        advantages_var = tensor_utils.new_tensor('advantage',
                                                 ndim=1,
                                                 dtype=theano.config.floatX)
        dist = self.policy.distribution
        dist_info_vars = self.policy.dist_info_sym(observations_var)
        old_dist_info_vars = self.backup_policy.dist_info_sym(observations_var)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

        pos_eps_dist_info_vars = self.pos_eps_policy.dist_info_sym(
            observations_var)
        neg_eps_dist_info_vars = self.neg_eps_policy.dist_info_sym(
            observations_var)
        mix_dist_info_vars = self.mix_policy.dist_info_sym(observations_var)

        surr = TT.sum(
            dist.log_likelihood_sym(actions_var, dist_info_vars) *
            advantages_var)
        surr_pos_eps = TT.sum(
            dist.log_likelihood_sym(actions_var, pos_eps_dist_info_vars) *
            advantages_var)
        surr_neg_eps = TT.sum(
            dist.log_likelihood_sym(actions_var, neg_eps_dist_info_vars) *
            advantages_var)
        surr_mix = TT.sum(
            dist.log_likelihood_sym(actions_var, mix_dist_info_vars) *
            advantages_var)
        surr_loglikelihood = TT.sum(
            dist.log_likelihood_sym(actions_var, mix_dist_info_vars))

        params = self.policy.get_params(trainable=True)
        mix_params = self.mix_policy.get_params(trainable=True)
        pos_eps_params = self.pos_eps_policy.get_params(trainable=True)
        neg_eps_params = self.neg_eps_policy.get_params(trainable=True)
        backup_params = self.backup_policy.get_params(trainable=True)

        grads = theano.grad(surr, params)
        grad_pos_eps = theano.grad(surr_pos_eps, pos_eps_params)
        grad_neg_eps = theano.grad(surr_neg_eps, neg_eps_params)
        grad_mix = theano.grad(surr_mix, mix_params)
        grad_mix_lh = theano.grad(surr_loglikelihood, mix_params)

        self.f_surr = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=surr)
        self.f_train = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grads)
        self.f_pos_grad = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grad_pos_eps)
        self.f_neg_grad = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grad_neg_eps)
        self.f_mix_grad = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grad_mix)
        self.f_mix_lh = theano.function(inputs=[observations_var, actions_var],
                                        outputs=grad_mix_lh)
        # self.f_update = theano.function(
        #     inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4,
        #             eval_grad5, eval_grad6, eval_grad7],
        #     outputs=None,
        #     updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4,
        #                  eval_grad5, eval_grad6, eval_grad7], params,
        #                 learning_rate=self.learning_rate))
        self.f_kl = tensor_utils.compile_function(
            inputs=[observations_var],
            outputs=[mean_kl, max_kl],
        )
        return dict()
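The compiled functions above all share the same core: a score-function surrogate, the sum of log-likelihoods weighted by advantages, differentiated with theano.grad. A self-contained toy sketch with a single-parameter, unit-variance Gaussian policy (all names here are illustrative):

import theano
import theano.tensor as TT

actions = TT.vector('actions')
advantages = TT.vector('advantages')
mean = theano.shared(0.0, name='mean')  # the policy's only parameter

logli = -0.5 * TT.square(actions - mean)   # log N(a | mean, 1) up to a constant
surr = TT.sum(logli * advantages)          # score-function surrogate
grads = theano.grad(surr, [mean])          # policy gradient w.r.t. parameters

f_train = theano.function([actions, advantages], grads)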
Code Example #4
File: instrumented_npo.py  Project: Mee321/HAPG_exp
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor('advantage',
                                                ndim=1 + is_recurrent,
                                                dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: tensor_utils.new_tensor('old_%s' % k,
                                       ndim=2 + is_recurrent,
                                       dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tensor_utils.new_tensor(k,
                                       ndim=2 + is_recurrent,
                                       dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(
                lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = -TT.mean(lr * advantage_var)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
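A compact, self-contained sketch of the non-recurrent branch above (likelihood ratio times advantage, minimized subject to a mean-KL constraint), assuming a diagonal Gaussian policy with unit standard deviation so that the constant terms of the log-likelihood cancel in the ratio:

import theano
import theano.tensor as TT

actions = TT.matrix('action')
advantage = TT.vector('advantage')
mean, old_mean = TT.matrix('mean'), TT.matrix('old_mean')

# Log-likelihoods under unit-std Gaussians (constants cancel in the ratio).
logli = -0.5 * TT.sum(TT.square(actions - mean), axis=-1)
old_logli = -0.5 * TT.sum(TT.square(actions - old_mean), axis=-1)

lr = TT.exp(logli - old_logli)           # importance ratio pi_new / pi_old
surr_loss = -TT.mean(lr * advantage)     # surrogate loss to minimize
mean_kl = TT.mean(0.5 * TT.sum(TT.square(mean - old_mean), axis=-1))  # KL for unit-std Gaussians

f_loss_kl = theano.function([actions, mean, old_mean, advantage],
                            [surr_loss, mean_kl])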
Code Example #5
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        # Init dual param values
        self.param_eta = 15.
        # Adjust for linear feature vector.
        self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 +
                                      4)

        # Theano vars
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        rewards = theano_tensor_utils.new_tensor(
            'rewards',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX,
        )
        # Feature difference variable representing the difference in feature
        # value of the next observation and the current observation \phi(s') -
        # \phi(s).
        feat_diff = theano_tensor_utils.new_tensor(
            'feat_diff', ndim=2 + is_recurrent, dtype=theano.config.floatX)
        param_v = TT.vector('param_v')
        param_eta = TT.scalar('eta')

        valid_var = TT.matrix('valid')

        state_info_vars = {
            k: theano_tensor_utils.new_tensor(
                k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        # Policy-related symbolics
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        dist = self.policy.distribution
        # log of the policy dist
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)

        # Symbolic sample Bellman error
        delta_v = rewards + TT.dot(feat_diff, param_v)

        # Policy loss (negative because we minimize)
        if is_recurrent:
            loss = -TT.sum(logli * TT.exp(delta_v / param_eta - TT.max(
                delta_v / param_eta)) * valid_var) / TT.sum(valid_var)
        else:
            loss = -TT.mean(logli * TT.exp(delta_v / param_eta -
                                           TT.max(delta_v / param_eta)))

        # Add regularization to loss.
        reg_params = self.policy.get_params(regularizable=True)
        loss += self.L2_reg_loss * TT.sum(
            [TT.mean(TT.square(param))
             for param in reg_params]) / len(reg_params)

        # Policy loss gradient.
        loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

        if is_recurrent:
            recurrent_vars = [valid_var]
        else:
            recurrent_vars = []

        input = [
            rewards, obs_var, feat_diff, action_var
        ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
        # if is_recurrent:
        #     input +=
        f_loss = theano_tensor_utils.compile_function(
            inputs=input,
            outputs=loss,
        )
        f_loss_grad = theano_tensor_utils.compile_function(
            inputs=input,
            outputs=loss_grad,
        )

        # Debug prints
        old_dist_info_vars = {
            k: theano_tensor_utils.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            mean_kl = TT.sum(
                dist.kl_sym(old_dist_info_vars, dist_info_vars) *
                valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

        f_kl = theano_tensor_utils.compile_function(
            inputs=[obs_var, action_var] + state_info_vars_list +
            old_dist_info_vars_list + recurrent_vars,
            outputs=mean_kl,
        )

        # Dual-related symbolics
        # Symbolic dual
        if is_recurrent:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.sum(
                           TT.exp(
                            delta_v / param_eta - TT.max(delta_v / param_eta)
                           ) * valid_var
                       ) / TT.sum(valid_var)
                   ) + param_eta * TT.max(delta_v / param_eta)
        else:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.mean(
                           TT.exp(
                            delta_v / param_eta - TT.max(delta_v / param_eta)
                           )
                       )
                   ) + param_eta * TT.max(delta_v / param_eta)
        # Add L2 regularization.
        dual += self.L2_reg_dual * \
                (TT.square(param_eta) + TT.square(1 / param_eta))

        # Symbolic dual gradient
        dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

        # Eval functions.
        f_dual = theano_tensor_utils.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars
            + [param_eta, param_v],
            outputs=dual)
        f_dual_grad = theano_tensor_utils.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars
            + [param_eta, param_v],
            outputs=dual_grad)

        self.opt_info = dict(
            f_loss_grad=f_loss_grad,
            f_loss=f_loss,
            f_dual=f_dual,
            f_dual_grad=f_dual_grad,
            f_kl=f_kl)
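The non-recurrent dual above can also be written out numerically. A small numpy sketch under the same epsilon / L2_reg_dual names, where delta_v is the sample Bellman error (illustrative, not the compiled Theano graph):

import numpy as np

def reps_dual(eta, v, rewards, feat_diff, epsilon, l2_reg_dual):
    """Numpy version of the non-recurrent dual above (illustrative)."""
    delta_v = rewards + feat_diff.dot(v)      # sample Bellman error
    shift = np.max(delta_v / eta)             # log-sum-exp shift for stability
    dual = eta * epsilon + eta * (np.log(np.mean(np.exp(delta_v / eta - shift))) + shift)
    return dual + l2_reg_dual * (eta ** 2 + (1.0 / eta) ** 2)

# e.g. with random data (shapes: v (d,), rewards (N,), feat_diff (N, d)):
rng = np.random.RandomState(0)
print(reps_dual(15.0, rng.rand(4), rng.rand(10), rng.rand(10, 4),
                epsilon=0.1, l2_reg_dual=0.0))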
Code Example #6
from garage.misc.instrument import run_experiment

env_name = "Swimmer"
hidden_sizes = (32, 32)
env = TheanoEnv(normalize(SwimmerEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)

observations_var = env.observation_space.new_tensor_variable('observations',
                                                             extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
rewards_var = tensor_utils.new_tensor('rewards',
                                      ndim=1,
                                      dtype=theano.config.floatX)

dist = policy.distribution
dist_info_vars = policy.dist_info_sym(observations_var)
old_dist_info_vars = backup_policy.dist_info_sym(observations_var)
kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
mean_kl = TT.mean(kl)
max_kl = TT.max(kl)

#for test
surr_ll = dist.log_likelihood_sym(actions_var, dist_info_vars)
surr_ll_cumsum = dist.log_likelihood_sym_cumsum(actions_var, dist_info_vars)
surr = TT.sum(surr_ll_cumsum * rewards_var)

f_surr_ll = theano.function(inputs=[observations_var, actions_var],
                            outputs=surr_ll)
Code Example #7
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: tensor_utils.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        state_info_vars = {
            k: tensor_utils.new_tensor(
                k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -TT.sum(
                logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = -TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var
                      ] + state_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            surr_obj, target=self.policy, inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )
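For a diagonal-Gaussian policy, the mean/max KL diagnostics compiled above have a closed form. A self-contained Theano sketch of that computation (illustrative, not the garage dist.kl_sym implementation):

import theano
import theano.tensor as TT

old_mean, old_log_std = TT.matrix('old_mean'), TT.matrix('old_log_std')
new_mean, new_log_std = TT.matrix('new_mean'), TT.matrix('new_log_std')

# Closed-form KL(old || new) for diagonal Gaussians, summed over action dims.
kl = TT.sum(
    new_log_std - old_log_std +
    (TT.square(TT.exp(old_log_std)) + TT.square(old_mean - new_mean)) /
    (2.0 * TT.square(TT.exp(new_log_std))) - 0.5,
    axis=-1)

f_kl = theano.function([old_mean, old_log_std, new_mean, new_log_std],
                       [TT.mean(kl), TT.max(kl)])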