Code Example #1
File: util.py  Project: gntoni/garage
def new_tensor_variable(space, name, extra_dims):
    """
    Create a Theano tensor variable matching the given gym space.

    :param space: the gym.spaces space the variable corresponds to
    :param name: name of the variable
    :param extra_dims: extra dimensions to be prepended
    :return: the created tensor variable
    """
    if isinstance(space, gym.spaces.Box):
        return ext.new_tensor(name=name,
                              ndim=extra_dims + 1,
                              dtype=theano.config.floatX)
    elif isinstance(space, gym.spaces.Discrete):
        if space.n <= 2**8:
            return ext.new_tensor(name=name,
                                  ndim=extra_dims + 1,
                                  dtype='uint8')
        elif space.n <= 2**16:
            return ext.new_tensor(name=name,
                                  ndim=extra_dims + 1,
                                  dtype='uint16')
        else:
            return ext.new_tensor(name=name,
                                  ndim=extra_dims + 1,
                                  dtype='uint32')
    elif isinstance(space, gym.spaces.Tuple):
        dtypes = [
            new_tensor_variable(c, "tmp", extra_dims=0).dtype
            for c in space.spaces
        ]
        if dtypes and hasattr(dtypes[0], "as_numpy_dtype"):
            dtypes = [d.as_numpy_dtype for d in dtypes]
        common_dtype = np.core.numerictypes.find_common_type([], dtypes)
        return ext.new_tensor(
            name=name,
            ndim=extra_dims + 1,
            dtype=common_dtype,
        )
    else:
        raise NotImplementedError
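A usage sketch (not part of the project) may help show what the dispatch above produces. It assumes the module context of the excerpt, i.e. that gym, numpy, theano, and the project's `ext` helper are already importable; the space objects and variable names are illustrative.

import gym

box_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,))
disc_space = gym.spaces.Discrete(6)

# Box spaces yield a float tensor of dtype theano.config.floatX with
# ndim = extra_dims + 1 (one axis prepended for the batch dimension).
obs_var = new_tensor_variable(box_space, name='obs', extra_dims=1)

# Discrete spaces yield an unsigned-integer tensor whose width depends on
# the number of categories; here n = 6 <= 2**8, so the dtype is 'uint8'.
act_var = new_tensor_variable(disc_space, name='action', extra_dims=1)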
Code Example #2
    def new_tensor_variable(self, name, extra_dims):
        """
        Create a tensor variable in Theano.

        :param name: name of the variable
        :param extra_dims: extra dimensions to be prepended
        :return: the created tensor variable
        """
        if self.n <= 2**8:
            return ext.new_tensor(
                name=name, ndim=extra_dims + 1, dtype='uint8')
        elif self.n <= 2**16:
            return ext.new_tensor(
                name=name, ndim=extra_dims + 1, dtype='uint16')
        else:
            return ext.new_tensor(
                name=name, ndim=extra_dims + 1, dtype='uint32')
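The dtype branching above simply picks the smallest unsigned integer type that can hold n distinct categories. A self-contained sketch of the same rule (the helper name _discrete_dtype is illustrative, not from the project):

import numpy as np

def _discrete_dtype(n):
    # Smallest unsigned integer dtype that can represent indices 0..n-1.
    if n <= 2**8:
        return np.uint8
    elif n <= 2**16:
        return np.uint16
    return np.uint32

assert _discrete_dtype(6) == np.uint8
assert _discrete_dtype(70000) == np.uint32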
Code Example #3
File: box.py  Project: gntoni/garage
    def new_tensor_variable(self, name, extra_dims):
        """
        Create a tensor variable in Theano.

        :param name: name of the variable
        :param extra_dims: extra dimensions to be prepended
        :return: the created tensor variable
        """
        return ext.new_tensor(name=name,
                              ndim=extra_dims + 1,
                              dtype=theano.config.floatX)
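For readers who have not seen the `ext.new_tensor` helper, the call above can be approximated with plain Theano as follows; this is a hedged sketch of an equivalent construction, not the helper's actual implementation.

import theano
import theano.tensor as TT

# An (extra_dims + 1)-dimensional symbolic tensor of dtype floatX, built
# directly from a TensorType with no broadcastable axes.
ndim = 1 + 1  # extra_dims=1 plus the space's own flattened data axis
obs_var = TT.TensorType(theano.config.floatX, (False,) * ndim)('obs')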
Code Example #4
File: reps.py  Project: gntoni/garage
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        # Init dual param values
        self.param_eta = 15.
        # Adjust for linear feature vector.
        self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 +
                                      4)

        # Theano vars
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        rewards = ext.new_tensor(
            'rewards',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX,
        )
        # Feature difference variable representing the difference in feature
        # value of the next observation and the current observation \phi(s') -
        # \phi(s).
        feat_diff = ext.new_tensor('feat_diff',
                                   ndim=2 + is_recurrent,
                                   dtype=theano.config.floatX)
        param_v = TT.vector('param_v')
        param_eta = TT.scalar('eta')

        valid_var = TT.matrix('valid')

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        # Policy-related symbolics
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        dist = self.policy.distribution
        # log of the policy dist
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)

        # Symbolic sample Bellman error
        delta_v = rewards + TT.dot(feat_diff, param_v)

        # Policy loss (negative because we minimize)
        if is_recurrent:
            loss = -TT.sum(logli * TT.exp(delta_v / param_eta -
                                          TT.max(delta_v / param_eta)) *
                           valid_var) / TT.sum(valid_var)
        else:
            loss = -TT.mean(logli * TT.exp(delta_v / param_eta -
                                           TT.max(delta_v / param_eta)))

        # Add regularization to loss.
        reg_params = self.policy.get_params(regularizable=True)
        loss += self.L2_reg_loss * TT.sum(
            [TT.mean(TT.square(param))
             for param in reg_params]) / len(reg_params)

        # Policy loss gradient.
        loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

        if is_recurrent:
            recurrent_vars = [valid_var]
        else:
            recurrent_vars = []

        input = [
            rewards, obs_var, feat_diff, action_var
        ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
        # if is_recurrent:
        #     input +=
        f_loss = ext.compile_function(
            inputs=input,
            outputs=loss,
        )
        f_loss_grad = ext.compile_function(
            inputs=input,
            outputs=loss_grad,
        )

        # Debug prints
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            mean_kl = TT.sum(
                dist.kl_sym(old_dist_info_vars, dist_info_vars) *
                valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

        f_kl = ext.compile_function(
            inputs=[obs_var, action_var] + state_info_vars_list +
            old_dist_info_vars_list + recurrent_vars,
            outputs=mean_kl,
        )

        # Dual-related symbolics
        # Symbolic dual
        if is_recurrent:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.sum(
                           TT.exp(
                            delta_v / param_eta - TT.max(delta_v / param_eta)
                           ) * valid_var
                       ) / TT.sum(valid_var)
                   ) + param_eta * TT.max(delta_v / param_eta)
        else:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.mean(
                           TT.exp(
                            delta_v / param_eta - TT.max(delta_v / param_eta)
                           )
                       )
                   ) + param_eta * TT.max(delta_v / param_eta)
        # Add L2 regularization.
        dual += self.L2_reg_dual * \
                (TT.square(param_eta) + TT.square(1 / param_eta))

        # Symbolic dual gradient
        dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

        # Eval functions.
        f_dual = ext.compile_function(inputs=[rewards, feat_diff] +
                                      state_info_vars_list + recurrent_vars +
                                      [param_eta, param_v],
                                      outputs=dual)
        f_dual_grad = ext.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list +
            recurrent_vars + [param_eta, param_v],
            outputs=dual_grad)

        self.opt_info = dict(f_loss_grad=f_loss_grad,
                             f_loss=f_loss,
                             f_dual=f_dual,
                             f_dual_grad=f_dual_grad,
                             f_kl=f_kl)
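The symbolic `dual` built above is the REPS dual with a log-sum-exp stabilisation (the `TT.max(delta_v / param_eta)` terms). A hedged numpy sketch of the same non-recurrent expression, with illustrative function and argument names:

import numpy as np

def reps_dual(eta, v, rewards, feat_diff, epsilon, l2_reg_dual):
    # Sample Bellman error: delta_v = r + (phi(s') - phi(s)) . v
    delta_v = rewards + feat_diff.dot(v)
    z = delta_v / eta
    z_max = z.max()
    # eta * epsilon + eta * log E[exp(delta_v / eta)], written in the
    # numerically stable form used by the Theano graph above.
    dual = eta * epsilon + eta * np.log(np.mean(np.exp(z - z_max))) + eta * z_max
    # Same L2 regularisation on eta as in the symbolic version.
    return dual + l2_reg_dual * (eta ** 2 + (1.0 / eta) ** 2)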
Code Example #5
File: vpg.py  Project: gntoni/garage
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -TT.sum(
                logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = -TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var
                      ] + state_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(surr_obj,
                                  target=self.policy,
                                  inputs=input_list)

        f_kl = ext.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )
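In the non-recurrent branch the surrogate objective reduces to the negative advantage-weighted log-likelihood, whose gradient is the vanilla policy gradient. A minimal numpy sketch of that scalar (function and array names are illustrative, not from the project):

import numpy as np

def vpg_surrogate_loss(log_likelihoods, advantages):
    # Negative of E[log pi(a|s) * A(s, a)]; minimising it ascends the
    # policy-gradient objective, matching `surr_obj` above.
    return -np.mean(log_likelihoods * advantages)

loss = vpg_surrogate_loss(np.array([-1.2, -0.4, -2.0]),
                          np.array([0.5, -0.1, 1.3]))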