Example #1
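All snippets on this page are method excerpts from rllab-style Theano code. A minimal sketch of the imports they assume follows; the module paths for ext, TT, lasagne and the optimizers are assumptions based on the usual rllab layout, not taken from the original files.

import os
import time
import numpy as np
import theano
import theano.tensor as TT
import lasagne.layers as L                       # used by the symmetry-loss example further below
from rllab.misc import ext                       # assumed home of ext.new_tensor / ext.compile_function
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
from rllab.optimizers.first_order_optimizer import FirstOrderOptimizer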
    def init_opt(self):
        self.start_time = time.time()
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [old_dist_info_vars[k]
                                   for k in dist.dist_info_keys]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(
            action_var, old_dist_info_vars, dist_info_vars)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = - \
                TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = - TT.mean(lr * advantage_var)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )
        return dict()
Example #2
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = - TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = - TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list)

        f_kl = ext.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(
            f_kl=f_kl,
        )
Example #3
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -TT.sum(
                logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = -TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(surr_obj,
                                  target=self.policy,
                                  inputs=input_list)

        f_kl = ext.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )
Example #4
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [old_dist_info_vars[k]
                                   for k in dist.dist_info_keys]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(
            action_var, old_dist_info_vars, dist_info_vars)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = - \
                TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = - TT.mean(lr * advantage_var)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )
        return dict()
Example #5
    def __init__(self, env, policy, baseline, max_kl):
        """ env     = only structural info of env is used here; 
                      you need to pass the 'mode' to functions of this class
            max_kl  = constraint for determining step-size (suggested: 1e-2 or 5e-3)
        """

        self.policy = policy
        self.env = env
        self.baseline = baseline

        self.optimizer = ConjugateGradientOptimizer(**dict())

        # Define symbolic variables
        self.observations_var = self.env.observation_space.new_tensor_variable(
            'observations', extra_dims=1)
        self.actions_var = self.env.action_space.new_tensor_variable(
            'actions', extra_dims=1)
        self.advantages_var = TT.vector('advantages')

        self.dist = self.policy.distribution

        self.old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k, ndim=2, dtype=theano.config.floatX)
            for k in self.dist.dist_info_keys
        }
        self.old_dist_info_vars_list = [
            self.old_dist_info_vars[k] for k in self.dist.dist_info_keys
        ]

        self.state_info_vars = {
            k: ext.new_tensor(k, ndim=2, dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        self.state_info_vars_list = [
            self.state_info_vars[k] for k in self.policy.state_info_keys
        ]

        self.dist_info_vars = self.policy.dist_info_sym(
            self.observations_var, self.state_info_vars)
        # distribution info variable (symbolic) -- interpret as pi
        self.KL = self.dist.kl_sym(self.old_dist_info_vars,
                                   self.dist_info_vars)
        self.LR = self.dist.likelihood_ratio_sym(self.actions_var,
                                                 self.old_dist_info_vars,
                                                 self.dist_info_vars)
        self.mean_KL = TT.mean(self.KL)

        self.surr = -TT.mean(self.LR * self.advantages_var)

        self.input_list = [self.observations_var, self.actions_var, self.advantages_var] + \
                          self.state_info_vars_list + self.old_dist_info_vars_list
        self.optimizer.update_opt(loss=self.surr, target=self.policy, \
                                  leq_constraint=(self.mean_KL, max_kl), \
                                  inputs=self.input_list, constraint_name="mean_kl")
Example #6
 def new_tensor_variable(self, name, extra_dims):
     if self.n <= 2**8:
         return ext.new_tensor(name=name,
                               ndim=extra_dims + 1,
                               dtype='uint8')
     elif self.n <= 2**16:
         return ext.new_tensor(name=name,
                               ndim=extra_dims + 1,
                               dtype='uint16')
     else:
         return ext.new_tensor(name=name,
                               ndim=extra_dims + 1,
                               dtype='uint32')
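For context, a hypothetical usage sketch of the dtype selection above; it assumes rllab's Discrete space (which exposes n and flattens actions to one-hot vectors, hence the extra dimension):

 # Hypothetical usage sketch (assumes rllab's Discrete space).
 from rllab.spaces import Discrete

 space = Discrete(4)                                        # n = 4 <= 2**8
 action_var = space.new_tensor_variable('action', extra_dims=1)
 print(action_var.dtype, action_var.ndim)                   # -> uint8 2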
Example #7
    def __init__(self, env, policy, baseline, max_kl):
        """ env     = only structural info of env is used here; 
                      you need to pass the 'mode' to functions of this class
            max_kl  = constraint for determining step-size (suggested: 1e-2 or 5e-3)
        """

        self.policy = policy
        self.env = env
        self.baseline = baseline

        self.optimizer = FirstOrderOptimizer(**dict())

        # Define symbolic variables
        self.observations_var = self.env.observation_space.new_tensor_variable(
            'observations', extra_dims=1)
        self.actions_var = self.env.action_space.new_tensor_variable(
            'actions', extra_dims=1)
        self.advantages_var = TT.vector('advantages')

        self.dist = self.policy.distribution

        self.old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k, ndim=2, dtype=theano.config.floatX)
            for k in self.dist.dist_info_keys
        }
        self.old_dist_info_vars_list = [
            self.old_dist_info_vars[k] for k in self.dist.dist_info_keys
        ]

        self.state_info_vars = {
            k: ext.new_tensor(k, ndim=2, dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        self.state_info_vars_list = [
            self.state_info_vars[k] for k in self.policy.state_info_keys
        ]

        self.dist_info_vars = self.policy.dist_info_sym(
            self.observations_var, self.state_info_vars)
        self.logli = self.dist.log_likelihood_sym(self.actions_var,
                                                  self.dist_info_vars)

        self.surr = -TT.mean(self.logli * self.advantages_var)

        self.input_list = [
            self.observations_var, self.actions_var, self.advantages_var
        ] + self.state_info_vars_list
        self.optimizer.update_opt(self.surr,
                                  target=self.policy,
                                  inputs=self.input_list)
Example #8
    def __init__(self, env, policy, baseline, max_kl):
        """ env     = only structural info of env is used here; 
                      you need to pass the 'mode' to functions of this class
            max_kl  = constraint for determining step-size (suggested: 1e-2 or 5e-3)
        """
        
        self.policy     = policy
        self.env        = env
        self.baseline   = baseline

        self.optimizer  = ConjugateGradientOptimizer(**dict())

        # Define symbolic variables
        self.observations_var = self.env.observation_space.new_tensor_variable('observations', extra_dims=1)
        self.actions_var      = self.env.action_space.new_tensor_variable('actions', extra_dims=1)
        self.advantages_var   = TT.vector('advantages')

        self.dist = self.policy.distribution  

        self.old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.dist.dist_info_keys
            }
        self.old_dist_info_vars_list = [self.old_dist_info_vars[k] for k in self.dist.dist_info_keys]

        self.state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        self.state_info_vars_list = [self.state_info_vars[k] for k in self.policy.state_info_keys]

        self.dist_info_vars = self.policy.dist_info_sym(self.observations_var, self.state_info_vars)   
        # distribution info variable (symbolic) -- interpret as pi
        self.KL = self.dist.kl_sym(self.old_dist_info_vars, self.dist_info_vars)
        self.LR = self.dist.likelihood_ratio_sym(self.actions_var, self.old_dist_info_vars, self.dist_info_vars)
        self.mean_KL = TT.mean(self.KL)
        
        self.surr = - TT.mean(self.LR * self.advantages_var)

        self.input_list = [self.observations_var, self.actions_var, self.advantages_var] + \
                          self.state_info_vars_list + self.old_dist_info_vars_list
        self.optimizer.update_opt(loss=self.surr, target=self.policy, \
                                  leq_constraint=(self.mean_KL, max_kl), \
                                  inputs=self.input_list, constraint_name="mean_kl")
Example #9
    def __init__(self, env, policy, baseline, max_kl):
        """ env     = only structural info of env is used here; 
                      you need to pass the 'mode' to functions of this class
            max_kl  = constraint for determining step-size (suggested: 1e-2 or 5e-3)
        """
        
        self.policy     = policy
        self.env        = env
        self.baseline   = baseline

        self.optimizer  = FirstOrderOptimizer(**dict())

        # Define symbolic variables
        self.observations_var = self.env.observation_space.new_tensor_variable('observations', extra_dims=1)
        self.actions_var      = self.env.action_space.new_tensor_variable('actions', extra_dims=1)
        self.advantages_var   = TT.vector('advantages')

        self.dist = self.policy.distribution  

        self.old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.dist.dist_info_keys
            }
        self.old_dist_info_vars_list = [self.old_dist_info_vars[k] for k in self.dist.dist_info_keys]

        self.state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        self.state_info_vars_list = [self.state_info_vars[k] for k in self.policy.state_info_keys]

        self.dist_info_vars = self.policy.dist_info_sym(self.observations_var, self.state_info_vars)
        self.logli = self.dist.log_likelihood_sym(self.actions_var, self.dist_info_vars)

        self.surr = - TT.mean(self.logli * self.advantages_var)

        self.input_list = [self.observations_var, self.actions_var, self.advantages_var] + self.state_info_vars_list
        self.optimizer.update_opt(self.surr, target=self.policy, inputs=self.input_list)
Example #10
    def init_opt(self):
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1,
            dtype=theano.config.floatX
        )

        mean_var = ext.new_tensor(
            'mean',
            ndim=2,
            dtype=theano.config.floatX
        )

        log_std_var = ext.new_tensor(
            'log_std',
            ndim=2,
            dtype=theano.config.floatX
        )

        old_dist_info_vars = dict(mean=mean_var, log_std=log_std_var)
        dist_info_vars = self.policy.dist_info_sym(obs_var)
        lr = self.policy.distribution.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)

        surr_loss_vector = TT.minimum(lr * advantage_var,
                                      TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
        surr_loss = -TT.mean(surr_loss_vector)

        input_list = [obs_var, action_var, advantage_var, mean_var, log_std_var]

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            inputs=input_list
        )
        return dict()
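For intuition, a small self-contained NumPy check of the clipped surrogate computed above (epsilon is assumed to be 0.2 here):

import numpy as np

# Mirrors TT.minimum(lr * adv, TT.clip(lr, 1 - eps, 1 + eps) * adv) on toy numbers.
lr = np.array([0.5, 1.0, 1.5])     # likelihood ratios new/old
adv = np.array([1.0, 1.0, 1.0])    # advantages
eps = 0.2                          # assumed clipping range
clipped = np.minimum(lr * adv, np.clip(lr, 1 - eps, 1 + eps) * adv)
print(-clipped.mean())             # surrogate loss, approx. -0.9: the ratio 1.5 is capped at 1.2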
Example #11
 def new_tensor_variable(self, name, extra_dims):
     if self.n <= 2 ** 8:
         return ext.new_tensor(
             name=name,
             ndim=extra_dims+1,
             dtype='uint8'
         )
     elif self.n <= 2 ** 16:
         return ext.new_tensor(
             name=name,
             ndim=extra_dims+1,
             dtype='uint16'
         )
     else:
         return ext.new_tensor(
             name=name,
             ndim=extra_dims+1,
             dtype='uint32'
         )
Example #12
 def new_tensor_variable(self, name, extra_dims):
     return ext.new_tensor(
         name=name,
         ndim=extra_dims+1,
         dtype=self._common_dtype,
     )
Example #13
    def init_opt(self):
        obs_var = ext.new_tensor(
            'obs', ndim=2, dtype=theano.config.floatX)  # todo: check the dtype

        manager_obs_var = ext.new_tensor('manager_obs',
                                         ndim=2,
                                         dtype=theano.config.floatX)

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every time the manager makes a decision
        manager_advantage_var = ext.new_tensor('manager_advantage',
                                               ndim=1,
                                               dtype=theano.config.floatX)

        skill_advantage_var = ext.new_tensor('skill_advantage',
                                             ndim=1,
                                             dtype=theano.config.floatX)

        latent_var_sparse = ext.new_tensor('sparse_latent',
                                           ndim=2,
                                           dtype=theano.config.floatX)

        latent_var = ext.new_tensor('latents',
                                    ndim=2,
                                    dtype=theano.config.floatX)

        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)

        log_std_var = ext.new_tensor('log_std',
                                     ndim=2,
                                     dtype=theano.config.floatX)

        manager_prob_var = ext.new_tensor('manager_prob',
                                          ndim=2,
                                          dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            manager_obs_var)['prob']
        # old_latent_probs = self.old_policy.manager.dist_info_sym(manager_obs_var)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(manager_prob_var * latent_var_sparse,
                                         axis=1)
        lr = TT.exp(
            TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
        manager_surr_loss_vector = TT.minimum(
            lr * manager_advantage_var,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) *
            manager_advantage_var)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        dist_info_var = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var,
                                                      old_dist_info_var,
                                                      dist_info_var)

        skill_surr_loss_vector = TT.minimum(
            skill_lr * skill_advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            skill_advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = manager_surr_loss / self.average_period + skill_surr_loss

        input_list = [
            obs_var, manager_obs_var, action_var, manager_advantage_var,
            skill_advantage_var, latent_var, latent_var_sparse, mean_var,
            log_std_var, manager_prob_var
        ]

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()
Example #14
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        # Init dual param values
        self.param_eta = 15.
        # Adjust for linear feature vector.
        self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 + 4)

        # Theano vars
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        rewards = ext.new_tensor(
            'rewards',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX,
        )
        # Feature difference variable representing the difference in feature
        # value of the next observation and the current observation \phi(s') -
        # \phi(s).
        feat_diff = ext.new_tensor(
            'feat_diff',
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        )
        param_v = TT.vector('param_v')
        param_eta = TT.scalar('eta')

        valid_var = TT.matrix('valid')

        state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        # Policy-related symbolics
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        dist = self.policy.distribution
        # log of the policy dist
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)

        # Symbolic sample Bellman error
        delta_v = rewards + TT.dot(feat_diff, param_v)

        # Policy loss (negative because we minimize)
        if is_recurrent:
            loss = - TT.sum(logli * TT.exp(
                delta_v / param_eta - TT.max(delta_v / param_eta)
            ) * valid_var) / TT.sum(valid_var)
        else:
            loss = - TT.mean(logli * TT.exp(
                delta_v / param_eta - TT.max(delta_v / param_eta)
            ))

        # Add regularization to loss.
        reg_params = self.policy.get_params(regularizable=True)
        loss += self.L2_reg_loss * TT.sum(
            [TT.mean(TT.square(param)) for param in reg_params]
        ) / len(reg_params)

        # Policy loss gradient.
        loss_grad = TT.grad(
            loss, self.policy.get_params(trainable=True))

        if is_recurrent:
            recurrent_vars = [valid_var]
        else:
            recurrent_vars = []

        input = [rewards, obs_var, feat_diff,
                 action_var] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
        # if is_recurrent:
        #     input +=
        f_loss = ext.compile_function(
            inputs=input,
            outputs=loss,
        )
        f_loss_grad = ext.compile_function(
            inputs=input,
            outputs=loss_grad,
        )

        # Debug prints
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        if is_recurrent:
            mean_kl = TT.sum(dist.kl_sym(old_dist_info_vars, dist_info_vars) * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

        f_kl = ext.compile_function(
            inputs=[obs_var, action_var] + state_info_vars_list + old_dist_info_vars_list + recurrent_vars,
            outputs=mean_kl,
        )

        # Dual-related symbolics
        # Symbolic dual
        if is_recurrent:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.sum(
                           TT.exp(
                               delta_v / param_eta - TT.max(delta_v / param_eta)
                           ) * valid_var
                       ) / TT.sum(valid_var)
                   ) + param_eta * TT.max(delta_v / param_eta)
        else:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.mean(
                           TT.exp(
                               delta_v / param_eta - TT.max(delta_v / param_eta)
                           )
                       )
                   ) + param_eta * TT.max(delta_v / param_eta)
        # Add L2 regularization.
        dual += self.L2_reg_dual * \
                (TT.square(param_eta) + TT.square(1 / param_eta))

        # Symbolic dual gradient
        dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

        # Eval functions.
        f_dual = ext.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v],
            outputs=dual
        )
        f_dual_grad = ext.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v],
            outputs=dual_grad
        )

        self.opt_info = dict(
            f_loss_grad=f_loss_grad,
            f_loss=f_loss,
            f_dual=f_dual,
            f_dual_grad=f_dual_grad,
            f_kl=f_kl
        )
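A side note on the exp(delta_v / param_eta - TT.max(delta_v / param_eta)) pattern used in both the loss and the dual above: subtracting the max before exponentiating is the usual log-sum-exp stabilization and only rescales the weights. A tiny NumPy check:

import numpy as np

delta_v = np.array([800.0, 801.0, 802.0])
eta = 1.0
# np.exp(delta_v / eta) alone would overflow to inf here; shifting by the max
# keeps everything finite and leaves the ratios between weights unchanged.
w = np.exp(delta_v / eta - np.max(delta_v / eta))   # -> [e**-2, e**-1, 1.0]
print(w / w.sum())                                  # normalized weights, same as without the shift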
Example #15
    def init_opt(self):
        self.start_time = time.time()
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        if self.safety_constraint:
            safety_var = ext.new_tensor('safety_vals',
                                        ndim=1 + is_recurrent,
                                        dtype=theano.config.floatX)

        weights_var = ext.new_tensor('weights',
                                     ndim=1 + is_recurrent,
                                     dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)

        self.dist_info_vars_func = ext.compile_function(
            inputs=[obs_var] + state_info_vars_list,
            outputs=dist_info_vars,
            log_name="dist_info_vars")

        ent = dist.entropy_sym(dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if is_recurrent:
            mean_ent = TT.sum(
                weights_var * ent * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
            mean_kl = TT.sum(weights_var * kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(lr * weights_var * advantage_var *
                                valid_var) / TT.sum(valid_var)
            if self.safety_constraint:
                f_safety = TT.sum(lr * weights_var * safety_var *
                                  valid_var) / TT.sum(valid_var)
        else:
            mean_ent = TT.mean(weights_var * ent)
            max_kl = TT.max(kl)
            mean_kl = TT.mean(weights_var * kl)
            surr_loss = -TT.mean(lr * weights_var * advantage_var)
            if self.safety_constraint:
                f_safety = TT.mean(lr * weights_var * safety_var)

        if self.entropy_regularize:
            self.entropy_beta = theano.shared(self.entropy_coeff)
            surr_loss -= self.entropy_beta * mean_ent

        if self.safety_constraint:
            self.safety_gradient_rescale = theano.shared(1.)
            f_safety = self.safety_gradient_rescale * f_safety

        input_list = [
            obs_var,
            action_var,
            advantage_var,
            weights_var,
        ]

        if self.safety_constraint:
            input_list.append(safety_var)

        input_list = input_list + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        if not (self.safety_constrained_optimizer):
            self.optimizer.update_opt(loss=surr_loss,
                                      target=self.policy,
                                      leq_constraint=(mean_kl, self.step_size),
                                      inputs=input_list,
                                      constraint_name="mean_kl")
        else:
            self.optimizer.update_opt(
                loss=surr_loss,
                target=self.policy,
                quad_leq_constraint=(mean_kl, self.step_size),
                lin_leq_constraint=(f_safety, self.safety_step_size),
                inputs=input_list,
                constraint_name_1="mean_kl",
                constraint_name_2="safety",
                using_surrogate=False,
                precompute=True,
                attempt_feasible_recovery=self.attempt_feasible_recovery,
                attempt_infeasible_recovery=self.attempt_infeasible_recovery,
                revert_to_last_safe_point=self.revert_to_last_safe_point)

        f_kl = ext.compile_function(
            inputs=input_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )
Example #16
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        task_obs_var = []
        task_old_dist_info_vars_list = []
        task_kls = []
        for i in range(self.task_num):
            task_obs_var.append(
                self.env.observation_space.new_tensor_variable(
                    'obs_task%d' % (i),
                    extra_dims=1 + is_recurrent,
                ))
            temp_dist_info_var = self.policy.dist_info_sym(
                task_obs_var[-1], state_info_vars)
            temp_old_dist_info_vars = {
                k: ext.new_tensor('task%d_old_%s' % (i, k),
                                  ndim=2 + is_recurrent,
                                  dtype=theano.config.floatX)
                for k in dist.dist_info_keys
            }
            task_old_dist_info_vars_list += [
                temp_old_dist_info_vars[k] for k in dist.dist_info_keys
            ]
            task_kls.append(
                dist.kl_sym(temp_old_dist_info_vars, temp_dist_info_var))

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)

        kl_weight_var = ext.new_tensor('kl_weight',
                                       ndim=1,
                                       dtype=theano.config.floatX)

        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(
                lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            weighted_kls = []
            # for i, one_task_kl in enumerate(task_kls):
            #     weighted_kls.append(TT.mean(one_task_kl * kl_weight_var[i]))
            # mean_kl = TT.mean(weighted_kls)
            for i, one_task_kl in enumerate(task_kls):
                weighted_kls.append(one_task_kl * kl_weight_var[i])
            mean_kl = TT.mean(TT.concatenate(weighted_kls))
            surr_loss = -TT.mean(lr * advantage_var)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list + task_obs_var + task_old_dist_info_vars_list + [
            kl_weight_var
        ]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")

        self.f_constraints = []
        self.f_constraints.append(
            ext.compile_function(
                inputs=input_list,
                outputs=TT.mean(kl),
                log_name="kl_div_task",
            ))
        for i in range(self.task_num):
            self.f_constraints.append(
                ext.compile_function(
                    inputs=input_list,
                    outputs=TT.mean(task_kls[i]),
                    log_name="kl_div_task%d" % i,
                ))

        return dict()
Example #17
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        task_obs_var = []
        task_action_var = []
        task_advantage_var = []
        task_old_dist_info_vars_list = []
        task_old_dist_info_vars_list_per_task = []
        lrs = []
        for i in range(self.task_num):
            task_obs_var.append(self.env.observation_space.new_tensor_variable(
                'obs_task%d'%(i),
                extra_dims=1 + is_recurrent,
            ))
            task_action_var.append(self.env.action_space.new_tensor_variable(
                'action_task%d'%(i),
                extra_dims=1 + is_recurrent,
            ))
            task_advantage_var.append(ext.new_tensor(
                'advantage_task%d'%(i),
                ndim=1 + is_recurrent,
                dtype=theano.config.floatX
            ))
            temp_dist_info_var = self.policy.dist_info_sym(task_obs_var[-1], state_info_vars)
            temp_old_dist_info_vars = {
                k: ext.new_tensor(
                    'task%d_old_%s' % (i,k),
                    ndim=2 + is_recurrent,
                    dtype=theano.config.floatX
                ) for k in dist.dist_info_keys
                }
            task_old_dist_info_vars_list += [temp_old_dist_info_vars[k] for k in dist.dist_info_keys]
            task_old_dist_info_vars_list_per_task.append([temp_old_dist_info_vars[k] for k in dist.dist_info_keys])

            lrs.append(dist.likelihood_ratio_sym(task_action_var[i], temp_old_dist_info_vars, temp_dist_info_var))

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)


        surr_loss = 0
        task_sur_losses = []
        for i, one_lr in enumerate(lrs):
            task_sur_losses.append(-TT.mean(one_lr * task_advantage_var[i]))
            surr_loss += task_sur_losses[-1]

        input_list = [
                         obs_var,
                         action_var,
                         advantage_var,
                     ] + state_info_vars_list + old_dist_info_vars_list + task_obs_var + task_action_var + task_advantage_var + task_old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        mean_kl = TT.mean(kl)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )

        self.f_constraints=[]
        self.f_constraints.append(ext.compile_function(
                    inputs=input_list,
                    outputs=TT.mean(kl),
                    log_name="kl_div_task",
                ))

        self.f_task_grads = []
        for i in range(self.task_num):
            task_grads = theano.grad(task_sur_losses[i], wrt=self.policy.get_params(trainable=True), disconnected_inputs='warn')
            self.f_task_grads.append(ext.compile_function(
                    inputs=[
                         task_obs_var[i],
                         task_action_var[i],
                         task_advantage_var[i],
                     ] + task_old_dist_info_vars_list_per_task[i] + state_info_vars_list,
                    outputs=task_grads,
                    log_name="f_task_grads",
                ))

        return dict()
Example #18
    def init_opt(self):
        """
        Same as normal NPO, except for setting MKL_NUM_THREADS.
        """
        # Set BEFORE Theano compiling; make equal to number of cores per worker.
        os.environ['MKL_NUM_THREADS'] = str(self.mkl_num_threads)

        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(
                lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = -TT.mean(lr * advantage_var)
            if self.entropy_bonus > 0:
                surr_loss -= self.entropy_bonus * TT.mean(
                    self.policy.distribution.entropy_sym(dist_info_vars))

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
Example #19
    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )

        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var_sparse = ext.new_tensor('sparse_advantage',
                                              ndim=1,
                                              dtype=theano.config.floatX)

        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)

        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.floatX  # todo: check this with carlos, refer to discrete.py in rllab.spaces
        )

        latent_var_sparse = ext.new_tensor('sparse_latent',
                                           ndim=2,
                                           dtype=theano.config.floatX)

        latent_var = ext.new_tensor('latents',
                                    ndim=2,
                                    dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])
        # obs_var = obs_var_raw

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']
        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        if self.trainable_manager:
            manager_surr_loss = -TT.mean(
                TT.log(actual_latent_probs) * advantage_var_sparse)
        else:
            manager_surr_loss = 0

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = TT.stack([
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in dist_info_vars
        ],
                         axis=1)
        # todo: verify that dist_info_vars is in order

        actual_action_log_probs = TT.sum(probs * latent_var, axis=1)
        skill_surr_loss = -TT.mean(actual_action_log_probs * advantage_var)

        surr_loss = manager_surr_loss / self.period + skill_surr_loss  # so that the relative magnitudes are correct

        input_list = [
            obs_var_raw, obs_var_sparse, action_var, advantage_var,
            advantage_var_sparse, latent_var, latent_var_sparse
        ]
        # input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()
Example #20
 def new_tensor_variable(self, name, extra_dims):
     return ext.new_tensor(
         name=name,
         ndim=extra_dims,
         dtype=self.dtype,
     )
Example #21
    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )

        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)

        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.floatX  # todo: check this with carlos, refer to discrete.py in rllab.spaces
        )

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])
        # obs_var = obs_var_raw

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = [
            TT.exp(self.diagonal.log_likelihood_sym(action_var, dist_info))
            for dist_info in dist_info_vars
        ]

        # need to reshape at the end
        reshaped_probs = [
            TT.reshape(prob, [obs_var.shape[0] // self.period, self.period])
            for prob in probs
        ]

        # now, multiply out each row and concatenate
        subtrajectory_probs = TT.stack([
            TT.prod(reshaped_prob, axis=1) for reshaped_prob in reshaped_probs
        ],
                                       axis=1)
        # shape error might come out of here

        # elementwise multiplication, then sum up each individual row and take log
        likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1))

        surr_loss = -TT.mean(likelihood * advantage_var)

        input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()
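For intuition, a toy NumPy version of the marginalization above, for a single period of length 3 and two latents (the numbers are made up):

import numpy as np

# Per-step action probabilities under each latent (rows: timesteps, cols: latents).
step_probs = np.array([[0.9, 0.2],
                       [0.8, 0.3],
                       [0.7, 0.4]])
subtraj_probs = step_probs.prod(axis=0)        # P(actions in period | latent), per latent
latent_probs = np.array([0.6, 0.4])            # manager's probabilities for each latent
log_likelihood = np.log((subtraj_probs * latent_probs).sum())
print(log_likelihood)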
Example #22
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = - TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = - TT.mean(lr * advantage_var)

        normal_loss = surr_loss

        # symmetry loss
        mirrored_obs_var = self.env.observation_space.new_tensor_variable(
            'mirrored_obs',
            extra_dims=1 + is_recurrent,
        )
        mean_act_collected = L.get_output(self.policy._l_mean, obs_var)
        mean_act_mirrored = L.get_output(self.policy._l_mean, mirrored_obs_var)
        sym_loss = self.sym_loss_weight * TT.mean(TT.square(TT.dot(mean_act_collected, self.act_per_mat.T)-mean_act_mirrored))
        surr_loss += sym_loss

        action_loss = self.action_reg_weight * (TT.mean(TT.abs_(mean_act_collected)) + 5.0*TT.mean(TT.clip(TT.abs_(mean_act_collected)-1.0, 0.0, 100.0)))

        #surr_loss += action_loss

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list + [mirrored_obs_var]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )

        self._f_sym_loss = ext.compile_function(
                inputs=[obs_var, mirrored_obs_var],
                outputs=[sym_loss]
            )

        self._f_act_loss = ext.compile_function(
            inputs=[obs_var],
            outputs=[action_loss]
        )

        return dict()
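
Beispiel #22 augments the TRPO surrogate with a symmetry term that ties the policy's mean action for an observation to the permuted mean action for the mirrored observation. A small NumPy sketch of that term, assuming a fixed action-permutation matrix (called act_per_mat in the source; everything else here is illustrative):

import numpy as np

def symmetry_loss(mean_act, mean_act_mirrored, act_perm_mat, weight=1.0):
    """Mean squared error between the permuted mean action for the original
    observations and the mean action for the mirrored observations."""
    diff = mean_act @ act_perm_mat.T - mean_act_mirrored
    return weight * np.mean(np.square(diff))

# toy usage: a 2-D action space whose components swap under mirroring
perm = np.array([[0.0, 1.0], [1.0, 0.0]])
a = np.array([[0.3, -0.1], [0.5, 0.2]])
a_mirror = a[:, ::-1]                      # a perfectly symmetric policy output
print(symmetry_loss(a, a_mirror, perm))    # -> 0.0
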
Beispiel #23
0
    def init_opt(self):
        assert not self.policy.recurrent
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        #print("env.observation_space", self.env.observation_space)
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )

        latent_var = self.policy.latent_space.new_tensor_variable(
            'latents',
            extra_dims=1 + is_recurrent,
        )

        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution  # this can still be the distribution P(a|s,h)
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,  # define tensors old_mean and old_log_std
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]  # put the two tensors above in a list

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, latent_var)

        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = - TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = - TT.mean(lr * advantage_var)

        loss = surr_loss

        input_list = [  # these are symbolic variables; the inputs passed in optimize_policy must follow the same order!
                         obs_var,
                         action_var,
                         advantage_var,
                         latent_var,
                     ] + old_dist_info_vars_list  # the old mean and log_std that the sampled actions were drawn under
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=loss,
            target=self.policy,
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl"
        )
        return dict()
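
The latent-conditioned variant above still feeds the standard importance-sampled surrogate and a mean-KL constraint to the optimizer. For reference, a NumPy sketch of those two scalars for a diagonal Gaussian policy, using the standard closed-form KL (all names are illustrative):

import numpy as np

def gaussian_logpdf(a, mean, log_std):
    var = np.exp(2 * log_std)
    return -0.5 * np.sum((a - mean) ** 2 / var + 2 * log_std + np.log(2 * np.pi), axis=-1)

def surrogate_and_kl(actions, adv, old_mean, old_log_std, new_mean, new_log_std):
    # likelihood ratio pi_new(a|s) / pi_old(a|s)
    lr = np.exp(gaussian_logpdf(actions, new_mean, new_log_std)
                - gaussian_logpdf(actions, old_mean, old_log_std))
    surr_loss = -np.mean(lr * adv)
    # closed-form KL(old || new) for diagonal Gaussians, averaged over the batch
    kl = np.sum(
        new_log_std - old_log_std
        + (np.exp(2 * old_log_std) + (old_mean - new_mean) ** 2) / (2 * np.exp(2 * new_log_std))
        - 0.5,
        axis=-1)
    return surr_loss, np.mean(kl)

# toy usage: identical old/new parameters give ratio 1 and zero KL
a = np.zeros((3, 2)); adv = np.array([1.0, -1.0, 0.5])
m = np.zeros((3, 2)); ls = np.zeros((3, 2))
print(surrogate_and_kl(a, adv, m, ls, m, ls))
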
Beispiel #24
0
    def init_grad_approx_infos(self):
        # variables
        obs_var_raw = ext.new_tensor('obs', ndim=3, dtype=theano.config.floatX)
        obs_var_sparse = ext.new_tensor('sparse_obs',
                                        ndim=2,
                                        dtype=theano.config.floatX)
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )  # this is 5k?
        # this will have to be the advantage every self.period timesteps
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)
        advantage_var_sparse = ext.new_tensor(
            'sparse_advantage', ndim=1,
            dtype=theano.config.floatX)  # this is 5000
        latent_var_sparse = ext.new_tensor('sparse_latent',
                                           ndim=2,
                                           dtype=theano.config.floatX)
        latent_var = ext.new_tensor('latents',
                                    ndim=2,
                                    dtype=theano.config.floatX)  # this is 5000
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])
        matrix = TT.eye(self.num_latents)
        latent_vectors = [matrix[i:i + 1, :] for i in range(self.num_latents)]

        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']
        dist_info_vars = [
            self.policy.low_policy.dist_info_sym(obs_var,
                                                 state_info_var=latent.repeat(
                                                     obs_var.shape[0], axis=0))
            for latent in latent_vectors
        ]
        logprobs = [
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in dist_info_vars
        ]

        # need to reshape at the end
        reshaped_logprobs = [
            TT.reshape(prob, [obs_var.shape[0] // self.period, self.period])
            for prob in logprobs
        ]
        # sum the per-step log-probs over each period (i.e. multiply the probabilities) and stack
        subtrajectory_logprobs = TT.stack([
            TT.sum(reshaped_prob, axis=1)
            for reshaped_prob in reshaped_logprobs
        ],
                                          axis=1)

        # exact loss
        subtrajectory_probs = TT.exp(subtrajectory_logprobs)
        likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1))
        surr_loss_exact = -TT.mean(likelihood * advantage_var_sparse)

        # approximate
        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        manager_surr_loss = -TT.mean(
            TT.log(actual_latent_probs) * advantage_var_sparse)
        dist_info_approx = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        actual_action_log_probs = self.diagonal.log_likelihood_sym(
            action_var, dist_info_approx)
        skill_surr_loss = -TT.mean(actual_action_log_probs * advantage_var)
        surr_loss_approx = manager_surr_loss / self.period + skill_surr_loss

        input_list = [
            obs_var_raw, obs_var_sparse, action_var, advantage_var,
            advantage_var_sparse, latent_var, latent_var_sparse
        ]
        grad_exact = theano.grad(surr_loss_exact,
                                 self.policy.get_params(trainable=True),
                                 disconnected_inputs='ignore')
        grad_approx = theano.grad(surr_loss_approx,
                                  self.policy.get_params(trainable=True),
                                  disconnected_inputs='ignore')
        grad_exact = [grad.flatten() for grad in grad_exact]
        grad_approx = [grad.flatten() for grad in grad_approx]
        v1 = TT.concatenate(grad_exact, axis=0) + 1e-8
        v2 = TT.concatenate(grad_approx, axis=0) + 1e-8
        v1 = v1 / TT.sqrt(TT.sum(TT.sqr(v1)))
        v2 = v2 / TT.sqrt(TT.sum(TT.sqr(v2)))

        cosine_distance = TT.sum(v1 * v2)
        actual_subtrajectory_prob = TT.sum(subtrajectory_probs *
                                           latent_var_sparse,
                                           axis=1)
        proportion = TT.mean(actual_subtrajectory_prob /
                             TT.sum(subtrajectory_probs, axis=1))

        self.get_dist_infos = ext.compile_function(
            inputs=input_list, outputs=dist_info_vars[0]['mean'])
        self.get_logprobs = ext.compile_function(inputs=input_list,
                                                 outputs=logprobs[0])
        self.get_subprobs = ext.compile_function(
            inputs=input_list,
            outputs=[subtrajectory_probs, actual_subtrajectory_prob])
        self.get_likelihood = ext.compile_function(inputs=input_list,
                                                   outputs=[likelihood])
        self.get_surr_loss_exact = ext.compile_function(
            inputs=input_list, outputs=[surr_loss_exact])
        self.get_surr_loss_approx = ext.compile_function(
            inputs=input_list, outputs=[surr_loss_approx])
        self.get_vs = ext.compile_function(inputs=input_list, outputs=[v1, v2])
        self.get_gradient_infos = ext.compile_function(
            inputs=input_list, outputs=[cosine_distance, proportion])
        return dict()
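
Beispiel #24 compares the exact and approximate policy gradients by flattening and concatenating the per-parameter gradients, normalizing them, and taking the dot product. A NumPy sketch of that diagnostic; the 1e-8 shift mirrors the source, while the toy gradients are made up:

import numpy as np

def gradient_cosine(grads_exact, grads_approx, eps=1e-8):
    """grads_*: lists of per-parameter gradient arrays."""
    v1 = np.concatenate([g.ravel() for g in grads_exact]) + eps
    v2 = np.concatenate([g.ravel() for g in grads_approx]) + eps
    v1 = v1 / np.sqrt(np.sum(v1 ** 2))
    v2 = v2 / np.sqrt(np.sum(v2 ** 2))
    return np.sum(v1 * v2)              # cosine of the angle between the two directions

# toy usage
g1 = [np.array([[1.0, 2.0], [3.0, 4.0]]), np.array([0.5, -0.5])]
g2 = [2.0 * g for g in g1]              # same direction, different scale
print(gradient_cosine(g1, g2))          # ~1.0
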
Beispiel #25
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            surr_loss = -TT.sum(
                lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            std_advar = (advantage_var -
                         TT.mean(advantage_var)) / TT.std(advantage_var)
            # elementwise minimum of the unclipped and clipped terms (PPO clipped objective);
            # TT.min over a stacked pair would collapse everything to a single scalar
            surr_loss = -TT.mean(
                TT.minimum(
                    lr * std_advar,
                    TT.clip(lr, 1 - self.clip_param, 1 + self.clip_param) *
                    std_advar))

        # symmetry loss
        mirrored_obs_var = self.env.observation_space.new_tensor_variable(
            'mirrored_obs',
            extra_dims=1 + is_recurrent,
        )

        mean_act_collected = L.get_output(self.policy._l_mean, obs_var)
        mean_act_mirrored = L.get_output(self.policy._l_mean, mirrored_obs_var)
        sym_loss = self.sym_loss_weight * TT.mean(
            TT.square(
                TT.dot(mean_act_collected, self.act_per_mat.T) -
                mean_act_mirrored))
        surr_loss += sym_loss

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list + [
            mirrored_obs_var
        ]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            inputs=input_list,
        )

        self._f_sym_loss = ext.compile_function(
            inputs=[obs_var, mirrored_obs_var], outputs=[sym_loss])

        grad = theano.grad(surr_loss,
                           wrt=self.policy.get_params(trainable=True),
                           disconnected_inputs='warn')

        self._f_grad = ext.compile_function(
            inputs=input_list,
            outputs=grad,
        )

        self._f_loss = ext.compile_function(inputs=list(input_list), outputs=surr_loss)

        params = self.policy.get_params(trainable=True)
        self.m_prev = [np.zeros_like(p.get_value()) for p in params]
        self.v_prev = [np.zeros_like(p.get_value()) for p in params]
        self.t_prev = 0

        self.optimizer.update_opt(surr_loss, self.policy, input_list)

        return dict()
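
Beispiel #25 allocates Adam moment buffers (m_prev, v_prev, t_prev) next to the compiled gradient function, presumably so the policy can be updated manually outside the optimizer. A minimal sketch of such a hand-rolled Adam step over a list of parameter arrays; the hyperparameter defaults below are the usual ones, not values taken from the source:

import numpy as np

def adam_step(params, grads, m_prev, v_prev, t_prev,
              lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """One in-place Adam update over parallel lists of NumPy arrays."""
    t = t_prev + 1
    for p, g, m, v in zip(params, grads, m_prev, v_prev):
        np.copyto(m, beta1 * m + (1 - beta1) * g)          # first-moment estimate
        np.copyto(v, beta2 * v + (1 - beta2) * g ** 2)     # second-moment estimate
        m_hat = m / (1 - beta1 ** t)                       # bias correction
        v_hat = v / (1 - beta2 ** t)
        p -= lr * m_hat / (np.sqrt(v_hat) + eps)
    return t

# toy usage
params = [np.ones(3)]
m = [np.zeros(3)]; v = [np.zeros(3)]
t = adam_step(params, [np.full(3, 0.1)], m, v, t_prev=0)
print(params[0])   # slightly decreased
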
Beispiel #26
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(
                lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = -TT.mean(lr * advantage_var)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list

        # guiding net
        if len(self.guiding_policies) != 0:
            guiding_obs_var = self.policy._aux_pred_network.input_layer.input_var
            guiding_action_var = self.env.action_space.new_tensor_variable(
                'guiding_action',
                extra_dims=1 + is_recurrent,
            )
            prediction = self.policy._aux_pred_network._output
            surr_loss += self.guiding_policy_weight * TT.mean(
                TT.square(guiding_action_var - prediction))
            input_list += [guiding_obs_var, guiding_action_var]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
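
The guiding-net branch in Beispiel #26 adds a weighted mean-squared error between the auxiliary prediction network's output and the actions proposed by the guiding policies. A NumPy sketch of that composite loss (guiding_policy_weight mirrors the source attribute name; everything else is illustrative):

import numpy as np

def guided_surrogate(lr, adv, predicted_actions, guiding_actions, guiding_policy_weight):
    surr_loss = -np.mean(lr * adv)                                      # usual surrogate
    imitation = np.mean(np.square(guiding_actions - predicted_actions))  # match the guide
    return surr_loss + guiding_policy_weight * imitation

# toy usage
lr = np.ones(4); adv = np.array([1.0, -0.5, 0.2, 0.0])
pred = np.zeros((4, 2)); guide = np.full((4, 2), 0.1)
print(guided_surrogate(lr, adv, pred, guide, guiding_policy_weight=0.5))
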
Beispiel #27
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        ## different policies should have different losses
        logli_list = []
        dist_info_vars_list = []
        kl_list = []
        for id in range(self.num_of_agents):
            dist_info_vars = self.policy_list[id].dist_info_sym(
                obs_var, state_info_vars)
            logli = dist.log_likelihood_sym(action_var, dist_info_vars)
            kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
            logli_list.append(logli)
            dist_info_vars_list.append(dist_info_vars)
            kl_list.append(kl)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        mean_kl_list = []
        max_kl_list = []
        surr_obj_list = []

        if is_recurrent:
            for id in range(self.num_of_agents):
                surr_obj_raw = -TT.mean(logli_list[id] * advantage_var)
                policy_weight_decay_term = 0.5 * self.policy_weight_decay * sum(
                    [
                        TT.sum(TT.square(param))
                        for param in self.policy_list[id].get_params(
                            regularizable=True)
                    ])
                surr_obj = surr_obj_raw + policy_weight_decay_term
                mean_kl = TT.sum(kl_list[id] * valid_var) / TT.sum(valid_var)
                max_kl = TT.max(kl_list[id] * valid_var)
                mean_kl_list.append(mean_kl)
                max_kl_list.append(max_kl)
                surr_obj_list.append(surr_obj)
        else:
            for id in range(self.num_of_agents):
                surr_obj_raw = -TT.mean(logli_list[id] * advantage_var)
                policy_weight_decay_term = 0.5 * self.policy_weight_decay * sum(
                    [
                        TT.sum(TT.square(param))
                        for param in self.policy_list[id].get_params(
                            regularizable=True)
                    ])
                surr_obj = surr_obj_raw + policy_weight_decay_term
                mean_kl = TT.mean(kl_list[id])
                max_kl = TT.max(kl_list[id])
                mean_kl_list.append(mean_kl)
                max_kl_list.append(max_kl)
                surr_obj_list.append(surr_obj)

        input_list = [obs_var, action_var, advantage_var
                      ] + state_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        for id in range(self.num_of_agents):
            self.optimizer_list[id].update_opt(surr_obj_list[id],
                                               target=self.policy_list[id],
                                               inputs=input_list)

        f_kl_list = []
        for id in range(self.num_of_agents):
            f_kl = ext.compile_function(
                inputs=input_list + old_dist_info_vars_list,
                outputs=[mean_kl_list[id], max_kl_list[id]],
            )
            f_kl_list.append(f_kl)

        self.opt_info = dict(f_kl_list=f_kl_list, )

        self.stein_m = None
        self.stein_v = None
        self.stein_epsilon = 1e-8
        self.stein_beta1 = 0.9
        self.stein_beta2 = 0.999
        self.stein_t = 0
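
Each per-agent objective in Beispiel #27 is the plain score-function surrogate plus an L2 weight-decay term over the regularizable parameters. A NumPy sketch of that composition; the parameter list and decay coefficient are illustrative:

import numpy as np

def vpg_objective_with_decay(logli, adv, params, weight_decay):
    surr = -np.mean(logli * adv)                                     # REINFORCE surrogate
    decay = 0.5 * weight_decay * sum(np.sum(np.square(p)) for p in params)
    return surr + decay

# toy usage
print(vpg_objective_with_decay(
    logli=np.array([-1.2, -0.8]), adv=np.array([0.5, -0.5]),
    params=[np.ones((2, 2))], weight_decay=1e-3))
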
Beispiel #28
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        entropy_input_var = TT.matrix('entropy_inputs')

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(
                lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = -TT.mean(lr * advantage_var)

        # entropy of blended weights
        surr_loss += 1.0 * self.policy.bw_entropy(
            entropy_input_var) - 1.0 * self.policy.bw_choice_entropy(
                entropy_input_var)
        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list + [
            entropy_input_var
        ]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
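
Beispiel #28 adds and subtracts entropy terms computed over the policy's blend weights. The generic building block is the entropy of a categorical distribution; a minimal NumPy sketch (the bw_entropy and bw_choice_entropy methods themselves are policy-specific and not reproduced here):

import numpy as np

def categorical_entropy(probs, eps=1e-8):
    """Row-wise entropy -sum_i p_i log p_i for a batch of categorical distributions."""
    probs = np.clip(probs, eps, 1.0)
    return -np.sum(probs * np.log(probs), axis=-1)

print(categorical_entropy(np.array([[0.25, 0.25, 0.25, 0.25],
                                    [1.0, 0.0, 0.0, 0.0]])))
# -> [~1.386, ~0.0]  (uniform is maximum entropy, deterministic is zero)
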
Beispiel #29
0
    def init_opt(self):
        assert isinstance(self.policy, HierarchicalPolicy)
        assert not self.freeze_manager and not self.freeze_skills
        manager_surr_loss = 0
        # skill_surr_loss = 0

        obs_var_sparse = ext.new_tensor('sparse_obs',
                                        ndim=2,
                                        dtype=theano.config.floatX)
        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)
        # latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX)
        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
        log_std_var = ext.new_tensor('log_std',
                                     ndim=2,
                                     dtype=theano.config.floatX)

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################
        latent_var_sparse = self.policy.manager.dist_info_sym(
            obs_var_sparse)['mean']
        latent_var = TT.extra_ops.repeat(latent_var_sparse,
                                         self.period,
                                         axis=0)  #.dimshuffle(0, 'x')
        dist_info_var = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var,
                                                      old_dist_info_var,
                                                      dist_info_var)
        skill_surr_loss_vector = TT.minimum(
            skill_lr * advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = skill_surr_loss  # so that the relative magnitudes are correct

        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            input_list = [
                obs_var_raw, obs_var_sparse, action_var, advantage_var,
                mean_var, log_std_var
            ]

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()
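
The skill loss in Beispiel #29 is the PPO-style clipped surrogate evaluated with the low-level policy's likelihood ratio. A NumPy sketch of that clipped objective; the epsilon value below is illustrative:

import numpy as np

def clipped_surrogate(ratio, adv, epsilon=0.2):
    """PPO clipped objective, returned as a loss (negated for minimization)."""
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -np.mean(np.minimum(unclipped, clipped))

# toy usage: a large ratio with positive advantage gets clipped
print(clipped_surrogate(np.array([0.5, 1.0, 3.0]), np.array([1.0, 1.0, 1.0])))
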
Beispiel #30
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        # Init dual param values
        self.param_eta = 15.
        # Adjust for linear feature vector.
        self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 +
                                      4)

        # Theano vars
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        rewards = ext.new_tensor(
            'rewards',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX,
        )
        # Feature difference variable representing the difference in feature
        # value of the next observation and the current observation \phi(s') -
        # \phi(s).
        feat_diff = ext.new_tensor('feat_diff',
                                   ndim=2 + is_recurrent,
                                   dtype=theano.config.floatX)
        param_v = TT.vector('param_v')
        param_eta = TT.scalar('eta')

        valid_var = TT.matrix('valid')

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        # Policy-related symbolics
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        dist = self.policy.distribution
        # log of the policy dist
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)

        # Symbolic sample Bellman error
        delta_v = rewards + TT.dot(feat_diff, param_v)

        # Policy loss (negative because we minimize)
        if is_recurrent:
            loss = -TT.sum(logli * TT.exp(delta_v / param_eta -
                                          TT.max(delta_v / param_eta)) *
                           valid_var) / TT.sum(valid_var)
        else:
            loss = -TT.mean(logli * TT.exp(delta_v / param_eta -
                                           TT.max(delta_v / param_eta)))

        # Add regularization to loss.
        reg_params = self.policy.get_params(regularizable=True)
        loss += self.L2_reg_loss * TT.sum(
            [TT.mean(TT.square(param))
             for param in reg_params]) / len(reg_params)

        # Policy loss gradient.
        loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

        if is_recurrent:
            recurrent_vars = [valid_var]
        else:
            recurrent_vars = []

        input = [
            rewards, obs_var, feat_diff, action_var
        ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
        # if is_recurrent:
        #     input +=
        f_loss = ext.compile_function(
            inputs=input,
            outputs=loss,
        )
        f_loss_grad = ext.compile_function(
            inputs=input,
            outputs=loss_grad,
        )

        # Diagnostics: compile the KL between the old and new policy distributions
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            mean_kl = TT.sum(
                dist.kl_sym(old_dist_info_vars, dist_info_vars) *
                valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

        f_kl = ext.compile_function(
            inputs=[obs_var, action_var] + state_info_vars_list +
            old_dist_info_vars_list + recurrent_vars,
            outputs=mean_kl,
        )

        # Dual-related symbolics
        # Symbolic dual
        if is_recurrent:
            dual = param_eta * self.epsilon + \
                param_eta * TT.log(
                    TT.sum(
                        TT.exp(
                            delta_v / param_eta -
                            TT.max(delta_v / param_eta)
                        ) * valid_var
                    ) / TT.sum(valid_var)
                ) + param_eta * TT.max(delta_v / param_eta)
        else:
            dual = param_eta * self.epsilon + \
                param_eta * TT.log(
                    TT.mean(
                        TT.exp(
                            delta_v / param_eta -
                            TT.max(delta_v / param_eta)
                        )
                    )
                ) + param_eta * TT.max(delta_v / param_eta)
        # Add L2 regularization.
        dual += self.L2_reg_dual * \
            (TT.square(param_eta) + TT.square(1 / param_eta))

        # Symbolic dual gradient
        dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

        # Eval functions.
        f_dual = ext.compile_function(inputs=[rewards, feat_diff] +
                                      state_info_vars_list + recurrent_vars +
                                      [param_eta, param_v],
                                      outputs=dual)
        f_dual_grad = ext.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list +
            recurrent_vars + [param_eta, param_v],
            outputs=dual_grad)

        self.opt_info = dict(f_loss_grad=f_loss_grad,
                             f_loss=f_loss,
                             f_dual=f_dual,
                             f_dual_grad=f_dual_grad,
                             f_kl=f_kl)
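
Beispiel #30 is a REPS-style method: besides the policy loss it builds a dual function in eta and v, with a max shift inside the log for numerical stability. A NumPy sketch of the non-recurrent dual and its sample Bellman error, assuming illustrative hyperparameter values:

import numpy as np

def reps_dual(eta, v, rewards, feat_diff, epsilon, l2_reg_dual):
    """delta_v = r + (phi(s') - phi(s)) . v  is the sample Bellman error."""
    delta_v = rewards + feat_diff @ v
    z = delta_v / eta
    dual = (eta * epsilon
            + eta * np.log(np.mean(np.exp(z - z.max())))
            + eta * z.max())
    # L2 regularization on eta, as in the source
    dual += l2_reg_dual * (eta ** 2 + (1.0 / eta) ** 2)
    return dual

# toy usage
rng = np.random.default_rng(1)
r = rng.normal(size=16); fd = rng.normal(size=(16, 4)); v = np.zeros(4)
print(reps_dual(eta=15.0, v=v, rewards=r, feat_diff=fd, epsilon=0.1, l2_reg_dual=0.0))
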
Beispiel #31
0
 def new_tensor_variable(self, name, extra_dims):
     return ext.new_tensor(
         name=name,
         ndim=extra_dims+1,
         dtype=theano.config.floatX
     )
Beispiel #32
0
    def init_opt(self, policy_name):
        is_recurrent = int(self.policies[policy_name].recurrent)

        ## extra_dims is the number of extra leading dimensions of the tensor
        # Thus, recurrent policies need an extra dimension in the tensor to store sequences
        # We have 2 options:
        # - either re-use the observation vars from the policy
        # - create observation vars again (this currently gives an error: it probably requires duplicating variables)
        reuse_obs_vars = True
        if reuse_obs_vars:
            obs_vars = self.policies[policy_name].input_vars
        else:
            obs_vars = []
            for idx, obs_shape in enumerate(
                    self.policies[policy_name].obs_shapes):
                # name = 'obs_%d' % (idx)
                name = 'obs'
                obs_var_cur = self.env.observation_space.new_tensor_variable(
                    name,
                    extra_dims=1 + is_recurrent,
                )
                obs_vars.append(obs_var_cur)
            print(
                'NPO: Observation vars are created for policy %s' %
                policy_name, obs_vars)

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policies[policy_name].distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        # Here we need to get output variables based on input variables
        # dist_info_sym takes input features and spits out outputs of the policy graph
        # typically input variables are observations (sometimes actions as well)
        dist_info_vars = self.policies[policy_name].dist_info_sym(
            obs_vars, action_var)

        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = - \
                TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = -TT.mean(lr * advantage_var)

        # Forming input list for the policy
        input_list = obs_vars + [action_var, advantage_var
                                 ] + old_dist_info_vars_list

        if is_recurrent:
            input_list.append(valid_var)

        # print('NPO: Policy Input list: ', [var for var in input_list])
        # theano.printing.pydotprint(surr_loss, outfile="loss.png",
        #                            var_with_name_simple=True)
        self.optimizers[policy_name].update_opt(
            loss=surr_loss,
            target=self.policies[policy_name],
            leq_constraint=(mean_kl, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl")
        return dict()
Beispiel #33
0
 def new_tensor_variable(self, name, extra_dims):
     return ext.new_tensor(name=name,
                           ndim=extra_dims + 1,
                           dtype=theano.config.floatX)
Beispiel #34
0
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if self.truncate_local_is_ratio is not None:
            lr = TT.minimum(self.truncate_local_is_ratio, lr)
        if is_recurrent:
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(
                lr * advantage_var * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(kl)
            surr_loss = -TT.mean(lr * advantage_var)

        # aux net
        aux_input_var = self.policy._aux_pred_network.input_layer.input_var
        aux_target_var = TT.matrix('aux_targets')
        prediction = self.policy._aux_pred_network._output
        surr_loss += 0.01 * TT.mean(TT.square(aux_target_var - prediction))
        '''loss = lasagne.objectives.squared_error(prediction, aux_target_var)
        loss = loss.mean()
        grads = theano.grad(surr_loss, wrt=self.policy.get_params(trainable=True), disconnected_inputs='warn')
        abcd'''
        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list + [
            aux_input_var, aux_target_var
        ]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")
        return dict()
Beispiel #35
0
    def init_opt(self):
        self.start_time = time.time()
        is_recurrent = int(self.policy.recurrent)
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        weights_var = ext.new_tensor('weights',
                                     ndim=1 + is_recurrent,
                                     dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: ext.new_tensor(k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)

        self.dist_info_vars_func = ext.compile_function(
            inputs=[obs_var] + state_info_vars_list,
            outputs=dist_info_vars,
            log_name="dist_info_vars")

        # When we want D_KL(pi' || pi) for data that was sampled from some
        # behavior policy pi_b, where pi' is the optimization variable and
        # pi is the policy of the previous iteration, the dist_info stored in
        # memory corresponds to pi_b and not pi, so we have to compute the
        # dist_info for that data under pi on the fly.

        ent = dist.entropy_sym(dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if is_recurrent:
            mean_ent = TT.sum(
                weights_var * ent * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
            mean_kl = TT.sum(weights_var * kl * valid_var) / TT.sum(valid_var)
            surr_loss = -TT.sum(lr * weights_var * advantage_var *
                                valid_var) / TT.sum(valid_var)
        else:
            mean_ent = TT.mean(weights_var * ent)
            max_kl = TT.max(kl)
            mean_kl = TT.mean(weights_var * kl)
            surr_loss = -TT.mean(lr * weights_var * advantage_var)

        if self.entropy_regularize:
            self.entropy_beta = theano.shared(self.entropy_coeff)
            surr_loss -= self.entropy_beta * mean_ent

        input_list = [
            obs_var,
            action_var,
            advantage_var,
            weights_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")

        f_kl = ext.compile_function(
            inputs=input_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )
    def init_opt(self):
        assert isinstance(self.policy, HierarchicalPolicy)
        manager_surr_loss = 0
        skill_surr_loss = 0

        if not self.freeze_manager:
            obs_var_sparse = ext.new_tensor('sparse_obs', ndim=2, dtype=theano.config.floatX)
            latent_var_sparse = ext.new_tensor('sparse_latent', ndim=2, dtype=theano.config.floatX)
            advantage_var_sparse = ext.new_tensor('sparse_advantage', ndim=1,
                                                  dtype=theano.config.floatX)  # advantage every self.period timesteps
            manager_prob_var = ext.new_tensor('manager_prob_var', ndim=2, dtype=theano.config.floatX)
            #############################################################
            ### calculating the manager portion of the surrogate loss ###
            #############################################################

            latent_probs = self.policy.manager.dist_info_sym(obs_var_sparse)['prob']
            actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
            old_actual_latent_probs = TT.sum(manager_prob_var * latent_var_sparse, axis=1)
            lr = TT.exp(TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
            manager_surr_loss_vector = TT.minimum(lr * advantage_var_sparse,
                                                  TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var_sparse)
            manager_surr_loss = -TT.mean(manager_surr_loss_vector)

        if not self.freeze_skills:
            obs_var_raw = ext.new_tensor('obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
            action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1, )
            advantage_var = ext.new_tensor('advantage', ndim=1, dtype=theano.config.floatX)
            latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX)
            mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
            log_std_var = ext.new_tensor('log_std', ndim=2, dtype=theano.config.floatX)


            # undoing the reshape, so that batch sampling is ok
            obs_var = TT.reshape(obs_var_raw, [obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]])

            ############################################################
            ### calculating the skills portion of the surrogate loss ###
            ############################################################
            dist_info_var = self.policy.low_policy.dist_info_sym(obs_var, state_info_var=latent_var)
            old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
            skill_lr = self.diagonal.likelihood_ratio_sym(action_var, old_dist_info_var, dist_info_var)
            skill_surr_loss_vector = TT.minimum(skill_lr * advantage_var,
                                                TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
            skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = manager_surr_loss / self.period + skill_surr_loss  # so that the relative magnitudes are correct

        if self.freeze_skills and not self.freeze_manager:
            input_list = [obs_var_sparse, advantage_var_sparse, latent_var_sparse, manager_prob_var]
        elif self.freeze_manager and not self.freeze_skills:
            input_list = [obs_var_raw, action_var, advantage_var, latent_var, mean_var, log_std_var]
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var, advantage_var_sparse, latent_var,
                          latent_var_sparse, mean_var, log_std_var, manager_prob_var]

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            inputs=input_list
        )
        return dict()
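
The final example computes the manager's likelihood ratio in log space as exp(log p_new - log p_old) over the chosen latents, clips both the manager and the skill surrogates PPO-style, and divides the manager term by the period so the two losses have comparable magnitude. A compact NumPy sketch of that combination; all names and shapes are illustrative:

import numpy as np

def hierarchical_ppo_loss(new_latent_probs, old_latent_probs, chosen_latents,
                          adv_sparse, skill_ratio, adv, period, epsilon=0.2):
    # manager ratio on the latent actually chosen in each period (one-hot chosen_latents)
    p_new = np.sum(new_latent_probs * chosen_latents, axis=1)
    p_old = np.sum(old_latent_probs * chosen_latents, axis=1)
    lr = np.exp(np.log(p_new) - np.log(p_old))
    manager = -np.mean(np.minimum(lr * adv_sparse,
                                  np.clip(lr, 1 - epsilon, 1 + epsilon) * adv_sparse))
    # skill ratio is per time step
    skill = -np.mean(np.minimum(skill_ratio * adv,
                                np.clip(skill_ratio, 1 - epsilon, 1 + epsilon) * adv))
    # scale the manager term so its magnitude is comparable to the skill term
    return manager / period + skill
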