Example #1
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(MBPG, self).__init__(model, n_in, n_out, state_bounds,
                                   action_bounds, reward_bound, settings_)
        # scale = (bounds[1][i]-bounds[0][i])/2.0
        # create a small convolutional neural network

        # self._action_std_scaling = (self._action_bounds[1] - self._action_bounds[0]) / 2.0

        self._NotFallen = T.bcol("Not_Fallen")
        ## float32 * int32 would up-cast to float64, so use int16 or int8 here
        self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                                  dtype=np.dtype('int8'))

        self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                        dtype='int8'),
                                               broadcastable=(False, True))

        self._advantage = T.col("Advantage")
        self._advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))

        self._dyna_target = T.col("DYNA_Target")
        self._dyna_target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._dyna_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                 broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(self.getSettings()['float_type']))[0]

        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(
            self.getSettings()['previous_value_regularization_weight'])
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings()[
            "critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        self._modelTarget = copy.deepcopy(model)

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, :self._action_length]
        # self._q_valsActA = scale_action(self._q_valsActA, self._action_bounds)
        self._q_valsActASTD = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, self._action_length:]

        ## prevent the std from collapsing to 0
        """
        if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])): 
            self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * self.getSettings()['exploration_rate']
            # self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * self.getSettings()['exploration_rate']
        else:
        """
        self._q_valsActASTD = ((self._q_valsActASTD) *
                               self.getSettings()['exploration_rate']) + 2e-2

        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, :self._action_length]
        # self._q_valsActTarget = scale_action(self._q_valsActTarget, self._action_bounds)
        self._q_valsActTargetSTD = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, self._action_length:]
        """
        if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])): 
            self._q_valsActTargetSTD = (T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate']
            # self._q_valsActTargetSTD = (self._action_std_scaling * T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate']
        else:
        """
        self._q_valsActTargetSTD = (
            (self._q_valsActTargetSTD) *
            self.getSettings()['exploration_rate']) + 2e-2
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
        # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._NotFallen) + (self._NotFallen - 1)
        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = 0.5 * T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            self._advantage:
            self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        self._allGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            self._advantage:
            self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))
        self._kl_firstfixed = T.mean(
            kl(self._q_valsActTarget, self._q_valsActTargetSTD,
               self._q_valsActA, self._q_valsActASTD, self._action_length))
        # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
        # self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])*
        #                                                                              T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold']))
        self._actor_entropy = 0.5 * T.mean((2 * np.pi * self._q_valsActASTD))
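        ## Note: the exact differential entropy of a Gaussian is
        ## 0.5 * log(2 * pi * e * sigma^2); the expression above uses a simpler
        ## quantity that likewise grows with the std.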
        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params(
        # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        ## Clipping the max gradient
        """
        for x in range(len(self._value_grad)): 
            self._value_grad[x] = T.clip(self._value_grad[x] ,  -0.1, 0.1)
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     self._rho,
                                                     self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.momentum(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adam(self._value_grad,
                                                  self._params,
                                                  self._critic_learning_rate,
                                                  beta1=0.9,
                                                  beta2=0.9,
                                                  epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adagrad(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA),
        #                                                                    theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?

        ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
        # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._NotFallen)) - self._q_func
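        ## e.g. with r = 1.0, gamma = 0.99, V(s') = 10.0 and V(s) = 10.5:
        ##     advantage = 1.0 + 0.99 * 10.0 - 10.5 = 0.4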

        self._Advantage = self._advantage  #  * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        # self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
        # self._log_prob_target = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
        ### Only change the std
        self._prob = likelihood(self._model.getActionSymbolicVariable(),
                                self._q_valsActTarget, self._q_valsActASTD,
                                self._action_length)
        self._prob_target = likelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActTarget,
                                       self._q_valsActTargetSTD,
                                       self._action_length)
        ## This does the sum already
        self._r = (self._prob / self._prob_target)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (self._r), self._Advantage)
        ppo_epsilon = self.getSettings()['kl_divergence_threshold']
        self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon),
            self._Advantage)
        self._actLoss_ = theano.tensor.minimum((self._actLoss_),
                                               (self._actLoss_2))

        self._actLoss = (-1.0 *
                         (T.mean(self._actLoss_) +
                          (self.getSettings()['std_entropy_weight'] *
                           self._actor_entropy))) + self._actor_regularization

        self._policy_grad = T.grad(self._actLoss, self._actionParams)
        self._policy_grad = lasagne.updates.total_norm_constraint(
            self._policy_grad, 5)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        if (('train_state_encoding' in self.getSettings())
                and (self.getSettings()['train_state_encoding'])):
            self._encoded_state = lasagne.layers.get_output(
                self._model.getEncodeNet(),
                self._model.getStateSymbolicVariable(),
                deterministic=True)
            self._encoding_loss = T.mean(
                T.pow(self._encoded_state - self._model.getStates(), 2))
            self._full_loss = (
                self._loss + self._critic_regularization +
                (-1.0 * self.getSettings()['policy_loss_weight'] *
                 (T.mean(self._actLoss_) +
                  (self.getSettings()['std_entropy_weight'] *
                   self._actor_entropy))) +
                (self._actor_regularization + self._encoding_loss))
        else:
            self._full_loss = (
                self._loss + self._critic_regularization +
                (-1.0 * self.getSettings()['policy_loss_weight'] *
                 (T.mean(self._actLoss_) +
                  (self.getSettings()['std_entropy_weight'] *
                   self._actor_entropy))) + self._actor_regularization)

        if (('train_state_encoding' in self.getSettings())
                and (self.getSettings()['train_state_encoding'])):
            self._encodeParams = lasagne.layers.helper.get_all_params(
                self._model.getEncodeNet())
            self._all_Params = self._params + self._actionParams + self._encodeParams
        else:
            # self._all_Params = self._params + self._actionParams[-3:]
            self._all_Params = self._params + self._actionParams
        print("Num params: ", len(self._all_Params), " params: ",
              len(self._params), " act params: ", len(self._actionParams))
        self._both_grad = T.grad(self._full_loss, self._all_Params)
        self._both_grad = lasagne.updates.total_norm_constraint(
            self._both_grad, 5)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._collectiveUpdates = lasagne.updates.rmsprop(
                self._both_grad, self._all_Params, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._collectiveUpdates = lasagne.updates.momentum(
                self._both_grad,
                self._all_Params,
                self._learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._collectiveUpdates = lasagne.updates.adam(self._both_grad,
                                                           self._all_Params,
                                                           self._learning_rate,
                                                           beta1=0.9,
                                                           beta2=0.999,
                                                           epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }

        ### Use _q_valsNextState here because the predicted state is fed in through self._model.getResultStateSymbolicVariable()
        self._diff_dyna = self._dyna_target - self._q_valsNextState
        # loss = 0.5 * self._diff ** 2
        loss = 0.5 * T.pow(self._diff_dyna, 2)
        self._loss_dyna = T.mean(loss)

        self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization,
                                 self._params)

        self._givens_dyna = {
            # self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
            self._dyna_target:
            self._dyna_target_shared
        }
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._DYNAUpdates = lasagne.updates.rmsprop(
                self._dyna_grad, self._params, self._learning_rate, self._rho,
                self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad,
                                                         self._params,
                                                         self._learning_rate,
                                                         momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._DYNAUpdates = lasagne.updates.adagrad(
                self._dyna_grad,
                self._params,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        ## Some cool stuff to backprop action gradients
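        ## The matrix self._action_grad_shared holds dL/da supplied from outside
        ## (e.g. from a critic); T.grad with known_grads pushes it back through
        ## the actor parameters without needing an explicit scalar cost.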

        self._action_grad = T.matrix("Action_Grad")
        self._action_grad.tag.test_value = np.zeros(
            (self._batch_size, self._action_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._action_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=self.getSettings()['float_type']))

        ## When passing gradients to the update rule they must be a proper list of
        ## gradient expressions; T.grad already returns one when `wrt` is a list.
        self._action_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._q_valsActA: self._action_grad_shared})
        # print ("Action grads: ", self._action_mean_grads)
        # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list))
        # print ("Action grads: ", self._action_mean_grads)
        self._actionGRADUpdates = lasagne.updates.adagrad(
            self._action_mean_grads,
            self._actionParams,
            self._learning_rate,
            epsilon=self._rms_epsilon)

        self._actGradGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            # self._advantage: self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }
        """
        self._get_grad = theano.function([], outputs=T.grad(cost=None, wrt=[self._model._actionInputVar] + self._params,
                                                            known_grads={self._forward: self._fd_grad_target_shared}), 
                                         allow_input_downcast=True, 
                                         givens= {
            self._model.getStateSymbolicVariable() : self._model.getStates(),
            # self._model.getResultStateSymbolicVariable() : self._model.getResultStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._fd_grad_target : self._fd_grad_target_shared
        })
        """
        MBPG.compile(self)
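
The actor objective assembled above is a PPO-style clipped surrogate: the likelihood ratio self._r between the current and target Gaussian policies is multiplied by the advantage, the ratio is also clipped to [1 - epsilon, 1 + epsilon], and the element-wise minimum of the two terms is averaged and negated for gradient descent. Below is a minimal NumPy sketch of that objective; the function and variable names are illustrative only, and it uses plain diagonal-Gaussian densities rather than the class's likelihood() helper (which, per the "Only change the std" comment, varies only the std in the numerator).

import numpy as np

def gaussian_likelihood(actions, mean, std):
    # Per-sample density of a diagonal Gaussian, multiplied over action dimensions.
    per_dim = np.exp(-0.5 * ((actions - mean) / std) ** 2) / (std * np.sqrt(2.0 * np.pi))
    return np.prod(per_dim, axis=1)

def ppo_clipped_surrogate(actions, mean_new, std_new, mean_old, std_old,
                          advantage, epsilon=0.2):
    # Likelihood ratio between the updated policy and the policy that collected the data.
    ratio = (gaussian_likelihood(actions, mean_new, std_new) /
             gaussian_likelihood(actions, mean_old, std_old))
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    # Negated because the optimizer performs gradient descent on this loss.
    return -np.mean(np.minimum(unclipped, clipped))
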
Example #2
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(A_CACLA, self).__init__(model, n_in, n_out, state_bounds,
                                      action_bounds, reward_bound, settings_)

        # create a small convolutional neural network

        self._actor_buffer_states = []
        self._actor_buffer_result_states = []
        self._actor_buffer_actions = []
        self._actor_buffer_rewards = []
        self._actor_buffer_falls = []
        self._actor_buffer_diff = []

        self._NotFallen = T.bcol("Not_Fallen")
        ## float32 * int32 would up-cast to float64, so use int16 or int8 here
        self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                                  dtype=np.dtype('int8'))

        self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                        dtype='int8'),
                                               broadcastable=(False, True))

        self._tmp_diff = T.col("Tmp_Diff")
        self._tmp_diff.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_diff_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                              broadcastable=(False, True))

        self._dyna_target = T.col("DYNA_Target")
        self._dyna_target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._dyna_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                 broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(self.getSettings()['float_type']))[0]

        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(1.0)
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings()[
            "critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        ## Target network
        self._modelTarget = copy.deepcopy(model)

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
        # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1)
        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared
            self._tmp_diff:
            self._tmp_diff_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getActorNetwork(), lasagne.regularization.l2)))
        if (self.getSettings()['use_previous_value_regularization']):
            self._actor_regularization = self._actor_regularization + (
                (self.getSettings()['previous_value_regularization_weight']) *
                change_penalty(self._model.getActorNetwork(),
                               self._modelTarget.getActorNetwork()))
        elif ('regularization_type' in self.getSettings() and
              (self.getSettings()['regularization_type'] == 'KL_Divergence')):
            self._kl_firstfixed = T.mean(
                kl(
                    self._q_valsActTarget,
                    T.ones_like(self._q_valsActTarget) *
                    self.getSettings()['exploration_rate'], self._q_valsActA,
                    T.ones_like(self._q_valsActA) *
                    self.getSettings()['exploration_rate'],
                    self._action_length))
            #self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])*
            #                                                                         T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold']))
            self._actor_regularization = (self._kl_firstfixed) * (
                self.getSettings()['kl_divergence_threshold'])

            print("Using regularization type : ",
                  self.getSettings()['regularization_type'])
        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss, self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        ## Clipping the max gradient
        """
        for x in range(len(self._value_grad)): 
            self._value_grad[x] = T.clip(self._value_grad[x] ,  -0.1, 0.1)
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     self._rho,
                                                     self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.momentum(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adam(self._value_grad,
                                                  self._params,
                                                  self._critic_learning_rate,
                                                  beta1=0.9,
                                                  beta2=0.9,
                                                  epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adagrad(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## TD update
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(T.mean(self._q_func), self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
            sys.exit(-1)
        """
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), theano.tensor.tile((self._diff * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?
        self._actDiff = (self._model.getActionSymbolicVariable() -
                         self._q_valsActA_drop)
        # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here?
        # self._actDiff_drop = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here?
        ## This should be a single column vector
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.transpose(T.sum(T.pow(self._actDiff, 2),axis=1) )), (self._diff * (1.0/(1.0-self._discount_factor))))
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.reshape(T.sum(T.pow(self._actDiff, 2),axis=1), (self._batch_size, 1) )),
        #                                                                        (self._tmp_diff * (1.0/(1.0-self._discount_factor)))
        # self._actLoss_ = (T.mean(T.pow(self._actDiff, 2),axis=1))

        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (T.mean(T.pow(self._actDiff, 2), axis=1)),
            (self._tmp_diff * (1.0 / (1.0 - self._discount_factor))))
        # self._actLoss = T.sum(self._actLoss)/float(self._batch_size)
        self._actLoss = T.mean(self._actLoss_)
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                                   self._actionParams)
        ## Clipping the max gradient
        """
        for x in range(len(self._policy_grad)): 
            self._policy_grad[x] = T.clip(self._policy_grad[x] ,  -0.5, 0.5)
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._actionUpdates = lasagne.updates.adagrad(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
        }

        ### Noisey state updates
        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
        # self._target_dyna = theano.gradient.disconnected_grad(self._q_func)

        ### Use _q_valsNextState here because the predicted state is fed in through self._model.getResultStateSymbolicVariable()
        self._diff_dyna = self._dyna_target - self._q_valsNextState
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff_dyna, 2)
        self._loss_dyna = T.mean(loss)

        self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization,
                                 self._params)

        self._givens_dyna = {
            # self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
            self._dyna_target:
            self._dyna_target_shared
        }
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._DYNAUpdates = lasagne.updates.rmsprop(
                self._dyna_grad, self._params, self._learning_rate, self._rho,
                self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad,
                                                         self._params,
                                                         self._learning_rate,
                                                         momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._DYNAUpdates = lasagne.updates.adagrad(
                self._dyna_grad,
                self._params,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        # self._target = self._model.getRewardSymbolicVariable() +  (self._discount_factor * self._q_valsTargetNextState )
        ### Give v(s') the next state and v(s) (target) the current state
        self._diff_adv = (self._discount_factor *
                          self._q_func) - (self._q_valsTargetNextState)
        self._diff_adv_givens = {
            self._model.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getStates(),
        }

        A_CACLA.compile(self)
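
Example #2 above builds a CACLA-style actor loss: the mean squared difference between the action that was taken and the actor's output, weighted per sample by the temporal-difference term supplied through self._tmp_diff and scaled by 1/(1 - gamma). Below is a minimal NumPy sketch of that loss, assuming the actions arrive as arrays with one row per sample and the TD errors as a 1-D array of per-sample weights; the names are illustrative, not taken from the class.

import numpy as np

def cacla_actor_loss(actions_taken, actions_predicted, td_error, discount_factor=0.99):
    # Mean squared action difference, one scalar per batch row.
    per_sample_sq_diff = np.mean((actions_taken - actions_predicted) ** 2, axis=1)
    # Weight each row by its TD error, scaled back to the magnitude of undiscounted returns.
    weights = td_error * (1.0 / (1.0 - discount_factor))
    return np.mean(per_sample_sq_diff * weights)

In CACLA proper, only samples with a positive TD error contribute to the actor update; in this class the weights come in through self._tmp_diff_shared, so any such filtering would have to happen before they are set.
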
Example #3
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(PPOCritic2,
              self).__init__(model, n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)

        # create a small convolutional neural network

        self._Fallen = T.bcol("Fallen")
        ## float32 * int32 would up-cast to float64, so use int16 or int8 here
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int8'))

        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int8'),
                                            broadcastable=(False, True))

        self._advantage = T.col("Advantage")
        self._advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(self.getSettings()['float_type']))[0]

        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(
            self.getSettings()['previous_value_regularization_weight'])
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings()[
            "critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        self._modelTarget = copy.deepcopy(model)

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, :self._action_length]
        self._q_valsActASTD = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, self._action_length:]

        ## prevent the std from collapsing to 0
        self._q_valsActASTD = (self._q_valsActASTD *
                               self.getSettings()['exploration_rate']) + 5e-2
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, :self._action_length]
        self._q_valsActTargetSTD = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, self._action_length:]
        self._q_valsActTargetSTD = (
            self._q_valsActTargetSTD *
            self.getSettings()['exploration_rate']) + 5e-2
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen
        self._target = T.mul(
            T.add(self._model.getRewardSymbolicVariable(),
                  T.mul(self._discount_factor, self._q_valsTargetNextState)),
            self._Fallen)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            self._Fallen:
            self._fallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._Fallen:
            self._fallen_shared,
            # self._advantage: self._advantage_shared,
            self._KL_Weight:
            self._kl_weight_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        # self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #         self._model.getActorNetwork(), lasagne.regularization.l2)) )
        self._kl_firstfixed = T.mean(
            kl(self._q_valsActTarget, self._q_valsActTargetSTD,
               self._q_valsActA, self._q_valsActASTD, self._action_length))
        # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
        self._actor_regularization = (
            (self._KL_Weight) * self._kl_firstfixed) + (
                10 * (self._kl_firstfixed >
                      self.getSettings()['kl_divergence_threshold']) *
                T.square(self._kl_firstfixed -
                         self.getSettings()['kl_divergence_threshold']))

        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss, self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(
                self._loss,  # + self._critic_regularization
                self._params,
                self._learning_rate,
                self._rho,
                self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(
                self._loss,  # + self._critic_regularization
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(
                self._loss,  # + self._critic_regularization
                self._params,
                self._critic_learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## TD update
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(T.mean(self._q_func), self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
            sys.exit(-1)
        """
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA),
        #                                                                    theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?

        ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
        # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func

        ## scale back to the same magnitude as the rewards
        self._Advantage = self._diff * (1.0 / (1.0 - self._discount_factor))
        self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActA, self._q_valsActASTD,
                                       self._action_length)
        self._log_prob_target = loglikelihood(
            self._model.getActionSymbolicVariable(), self._q_valsActTarget,
            self._q_valsActTargetSTD, self._action_length)
        # self._prob = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
        # self._prob_target = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) )
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) )
        # self._actLoss_ = ( ((self._log_prob) * self._Advantage) )
        # self._actLoss_ = ( ((self._log_prob)) )
        ## This does the sum already
        # self._actLoss_ =  ( (self._log_prob).dot( self._Advantage) )
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._prob / self._prob_target), self._Advantage)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            T.exp(self._log_prob - self._log_prob_target), self._Advantage)

        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage)
        # self._actLoss_ = T.mean(self._log_prob)
        # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
        ## - because update computes gradient DESCENT updates
        # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization ))
        # self._entropy = -1. * T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True)
        ## - because update computes gradient DESCENT updates
        self._actLoss = ((-1.0 * T.mean(self._actLoss_)) +
                         (1.0 * self._actor_regularization) +
                         (-1e-3 * entropy(self._q_valsActASTD)))
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        self._policy_grad = T.grad(self._actLoss, self._actionParams)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        PPOCritic2.compile(self)
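
The actor update above weights the importance ratio exp(log pi(a|s) - log pi_old(a|s)) by the advantage and subtracts a small entropy bonus. Below is a minimal NumPy sketch of that surrogate objective (the KL regularization term is omitted); the helper names gaussian_loglik and surrogate_loss are illustrative and not part of the listing.

import numpy as np

def gaussian_loglik(actions, mean, std):
    """Log-likelihood of actions under a diagonal Gaussian, summed over action dims."""
    var = std ** 2
    return np.sum(-0.5 * np.log(2.0 * np.pi * var)
                  - 0.5 * (actions - mean) ** 2 / var, axis=-1)

def surrogate_loss(actions, mean, std, mean_old, std_old, advantage, ent_coeff=1e-3):
    """Negative importance-weighted advantage minus a small entropy bonus (to be minimized)."""
    ratio = np.exp(gaussian_loglik(actions, mean, std)
                   - gaussian_loglik(actions, mean_old, std_old))
    entropy = np.mean(np.sum(0.5 * np.log(2.0 * np.pi * np.e * std ** 2), axis=-1))
    return -np.mean(ratio * advantage) - ent_coeff * entropy

# Example with random data: with identical old/new parameters the ratio is 1.
rng = np.random.default_rng(0)
acts = rng.normal(size=(32, 4))
mu, sigma = rng.normal(size=(32, 4)), np.full((32, 4), 0.2)
print(surrogate_loss(acts, mu, sigma, mu, sigma, rng.normal(size=32)))
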
Example #4
0
File: TRPO.py Project: skylbc/SMBAE
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(TRPO, self).__init__(model, n_in, n_out, state_bounds,
                                   action_bounds, reward_bound, settings_)

        # create a small convolutional neural network

        # self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        # self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8'))

        # self._fallen_shared = theano.shared(
        #     np.zeros((self._batch_size, 1), dtype='int8'),
        #     broadcastable=(False, True))

        self._advantage = T.col("Advantage")
        self._advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(self.getSettings()['float_type']))[0]

        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(
            self.getSettings()['previous_value_regularization_weight'])
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        self._modelTarget = copy.deepcopy(model)

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, :self._action_length]
        # self._q_valsActA = scale_action(self._q_valsActA, self._action_bounds)
        self._q_valsActASTD = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, self._action_length:]

        ## prevent the std from collapsing to 0
        if ('use_fixed_std' in self.getSettings()
                and (self.getSettings()['use_fixed_std'])):
            self._q_valsActASTD = (T.ones_like(
                self._q_valsActA)) * self.getSettings()['exploration_rate']
            # self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * self.getSettings()['exploration_rate']
        else:
            self._q_valsActASTD = (
                (self._q_valsActASTD) *
                self.getSettings()['exploration_rate']) + 2e-2
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, :self._action_length]
        # self._q_valsActTarget = scale_action(self._q_valsActTarget, self._action_bounds)
        self._q_valsActTargetSTD = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, self._action_length:]
        if ('use_fixed_std' in self.getSettings()
                and (self.getSettings()['use_fixed_std'])):
            self._q_valsActTargetSTD = (T.ones_like(
                self._q_valsActTarget)) * self.getSettings(
                )['exploration_rate']
            # self._q_valsActTargetSTD = (self._action_std_scaling * T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate']
        else:
            self._q_valsActTargetSTD = (
                (self._q_valsActTargetSTD) *
                self.getSettings()['exploration_rate']) + 2e-2
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
        # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._NotFallen) + (self._NotFallen - 1)
        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        #if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])):
        #    self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork())
        #else:
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._Fallen: self._fallen_shared,
            self._advantage:
            self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        # self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #         self._model.getActorNetwork(), lasagne.regularization.l2)) )
        self._kl_firstfixed = kl(self._q_valsActTarget,
                                 self._q_valsActTargetSTD, self._q_valsActA,
                                 self._q_valsActASTD,
                                 self._action_length).mean()
        # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
        self._actor_regularization = (
            (self._KL_Weight) * self._kl_firstfixed) + (
                (self._kl_firstfixed >
                 self.getSettings()['kl_divergence_threshold']) *
                T.square(self._kl_firstfixed -
                         self.getSettings()['kl_divergence_threshold']))
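        ## Adaptive KL penalty: the mean KL between the target (previous) policy and the
        ## current policy is weighted by _KL_Weight, plus a quadratic penalty that only
        ## activates once the KL exceeds kl_divergence_threshold.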

        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params(
        # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        # TD update
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(
                T.mean(self._q_func) + self._critic_regularization,
                self._params, self._critic_learning_rate * -T.mean(self._diff),
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(
                T.mean(self._q_func) + self._critic_regularization,
                self._params,
                self._critic_learning_rate * -T.mean(self._diff),
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(
                T.mean(self._q_func) + self._critic_regularization,
                self._params,
                self._critic_learning_rate * -T.mean(self._diff),
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA),
        #                                                                    theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?

        ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
        # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func
        self._Advantage = self._advantage  # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        # self._Advantage = self._diff # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActA, self._q_valsActASTD,
                                       self._action_length)
        self._log_prob_target = loglikelihood(
            self._model.getActionSymbolicVariable(), self._q_valsActTarget,
            self._q_valsActTargetSTD, self._action_length)
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) )
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) )
        # self._actLoss_ = ( ((self._log_prob) * self._Advantage) )
        # self._actLoss_ = ( ((self._log_prob)) )
        ## This does the sum already
        # self._actLoss_ =  ( (self._log_prob).dot( self._Advantage) )
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._Advantage)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            T.exp(self._log_prob - self._log_prob_target), self._Advantage)

        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage)
        # self._actLoss_ = T.mean(self._log_prob)
        # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
        ## - because update computes gradient DESCENT updates
        # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization ))
        # self._entropy = -1. * T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True)
        ## - because update computes gradient DESCENT updates
        self._actLoss = (-1.0 * T.mean(self._actLoss_))
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        self._policy_grad = T.grad(self._actLoss, self._actionParams)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        N = self._model.getStateSymbolicVariable().shape[0]
        params = self._actionParams
        surr = self._actLoss * (1.0 / N)
        self.pg = flatgrad(surr, params)

        prob_mean_fixed = theano.gradient.disconnected_grad(self._q_valsActA)
        prob_std_fixed = theano.gradient.disconnected_grad(self._q_valsActASTD)
        kl_firstfixed = kl(prob_mean_fixed, prob_std_fixed, self._q_valsActA,
                           self._q_valsActASTD, self._action_length).sum() / N
        grads = T.grad(kl_firstfixed, params)
        self.flat_tangent = T.vector(name="flat_tan")
        shapes = [var.get_value(borrow=True).shape for var in params]
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            tangents.append(
                T.reshape(self.flat_tangent[start:start + size], shape))
            start += size
        self.gvp = T.add(
            *[T.sum(g * tangent) for (g, tangent) in zipsame(grads, tangents)])  #pylint: disable=E1111
        # Fisher-vector product
        self.fvp = flatgrad(self.gvp, params)
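        ## Differentiating gvp (the inner product of grad(KL) with an arbitrary flat
        ## tangent vector) a second time yields the Fisher-vector product F*v without
        ## ever materializing the Fisher matrix F.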

        self.ent = entropy(self._q_valsActASTD).mean()
        self.kl = kl(self._q_valsActTarget, self._q_valsActTargetSTD,
                     self._q_valsActA, self._q_valsActASTD,
                     self._action_length).mean()

        self.losses = [surr, self.kl, self.ent]
        self.loss_names = ["surr", "kl", "ent"]

        self.args = [
            self._model.getStateSymbolicVariable(),
            self._model.getActionSymbolicVariable(), self._advantage
            # self._q_valsActTarget_
        ]

        self.args_fvp = [
            self._model.getStateSymbolicVariable(),
            # self._model.getActionSymbolicVariable()
            # self._advantage,
            # self._q_valsActTarget_
        ]

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        TRPO.compile(self)
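
The flat_tangent / gvp / fvp machinery above exists so that a conjugate-gradient solver can approximate the natural-gradient step F^-1 g using only Fisher-vector products, never the Fisher matrix itself. Below is a minimal NumPy sketch of that consumer; conjugate_gradient is an illustrative stand-in, and the 2x2 matrix only plays the role of F for the toy check.

import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    """Approximately solve F x = g using only a callable that returns F @ v."""
    x = np.zeros_like(g)
    r = g.copy()   # residual g - F x (x starts at 0)
    p = g.copy()
    rdotr = r.dot(r)
    for _ in range(iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        if new_rdotr < tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# Toy check with an explicit symmetric positive-definite stand-in for F:
A = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, 2.0])
step = conjugate_gradient(lambda v: A.dot(v), g)
print(np.allclose(A.dot(step), g, atol=1e-6))  # True
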
Example #5
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(Distillation,
              self).__init__(model, n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)

        # create a small convolutional neural network

        ### Load expert policy files
        self._expert_policies = []
        file_name_ = ""
        for i in range(len(self.getSettings()['expert_policy_files'])):
            file_name = self.getSettings(
            )['expert_policy_files'][i] + '/' + self.getSettings(
            )['model_type'] + '/' + getAgentName() + '.pkl'
            if (file_name_ == file_name):
                ## Reuse the previously loaded model to save memory when consecutive experts are identical
                self._expert_policies.append(model_)
            else:
                print("Loading pre-compiled network: ", file_name)
                f = open(file_name, 'rb')
                model_ = dill.load(f)
                f.close()
                self._expert_policies.append(
                    model_)  # expert model (one entry per expert policy file)
            file_name_ = file_name

        self._actor_buffer_states = []
        self._actor_buffer_result_states = []
        self._actor_buffer_actions = []
        self._actor_buffer_rewards = []
        self._actor_buffer_falls = []
        self._actor_buffer_diff = []

        self._NotFallen = T.bcol("Not_Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                                  dtype=np.dtype('int8'))

        self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                        dtype='int8'),
                                               broadcastable=(False, True))

        self._tmp_diff = T.col("Tmp_Diff")
        self._tmp_diff.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_diff_shared = theano.shared(
            np.zeros((self._batch_size, 1),
                     dtype=self.getSettings()['float_type']),
            broadcastable=(False, True))  # shared variable initialized to zeros

        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        ## Target network
        self._modelTarget = copy.deepcopy(model)  # target network: a deep copy of the primary model

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)  # deterministic critic output for the current states (primary network)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)  # stochastic (dropout) critic output for the current states
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)  # critic output for the next states
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)  # target network's critic output for the next states
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)  # target network's critic output for the current states
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)  # target network's stochastic (dropout) critic output for the current states

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)  # deterministic: no exploration noise
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)  # stochastic (dropout) actor output

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        # self._model.getRewardSymbolicVariable() reads the rewards via getRewards() (= self._rewards_shared), which is updated continuously
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop  # TD target minus the stochastic (dropout) critic estimate
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)  # mean squared TD error
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards()
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._tmp_diff: self._tmp_diff_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getActorNetwork(), lasagne.regularization.l2)))
        if (self.getSettings()['use_previous_value_regularization']):
            self._actor_regularization = self._actor_regularization + (
                (self.getSettings()['previous_value_regularization_weight']) *
                change_penalty(self._model.getActorNetwork(),
                               self._modelTarget.getActorNetwork()))
        elif ('regularization_type' in self.getSettings() and
              (self.getSettings()['regularization_type'] == 'KL_Divergence')):
            self._kl_firstfixed = T.mean(
                kl(
                    self._q_valsActTarget,
                    T.ones_like(self._q_valsActTarget) *
                    self.getSettings()['exploration_rate'], self._q_valsActA,
                    T.ones_like(self._q_valsActA) *
                    self.getSettings()['exploration_rate'],
                    self._action_length))
            self._actor_regularization = (self._kl_firstfixed) * (
                self.getSettings()['kl_divergence_threshold'])

            print("Using regularization type : ",
                  self.getSettings()['regularization_type'])
        # SGD update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     self._rho,
                                                     self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.momentum(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adam(self._value_grad,
                                                  self._params,
                                                  self._critic_learning_rate,
                                                  beta1=0.9,
                                                  beta2=0.9,
                                                  epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adagrad(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## TD update

        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        self._actDiff = (self._model.getActionSymbolicVariable() -
                         self._q_valsActA_drop)  # supervised (expert) action minus the actor's (dropout) output

        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff))
        self._actLoss = T.mean(self._actLoss_)
        self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                                   self._actionParams)
        ## Clipping the max gradient
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._actionUpdates = lasagne.updates.adagrad(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)

        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates()
        }

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        ### Give v(s') the next state and v(s) (target) the current state
        self._diff_adv = (self._discount_factor * self._q_func) - (
            self._q_valsTargetNextState
        )  # gamma * critic output minus the target critic's output at the next state
        self._diff_adv_givens = {
            self._model.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getStates(),
        }

        Distillation.compile(self)
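
The distillation actor loss above is a per-sample squared error between the supervised (expert) action and the student actor's output, weighted by the per-sample value passed in through _tmp_diff. Below is a minimal NumPy sketch under that reading; weighted_cloning_loss is an illustrative name, not part of the listing.

import numpy as np

def weighted_cloning_loss(expert_actions, student_actions, weights):
    """Mean over the batch of weight * per-sample mean-squared action error."""
    per_sample = np.mean((expert_actions - student_actions) ** 2, axis=1)
    return np.mean(per_sample * weights.squeeze(-1))

# Example with random data: a student close to the expert gets a small loss.
rng = np.random.default_rng(1)
expert = rng.normal(size=(32, 6))
student = expert + 0.1 * rng.normal(size=(32, 6))
w = rng.uniform(size=(32, 1))
print(weighted_cloning_loss(expert, student, w))
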