Exemple #1
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(A_CACLA, self).__init__(model, n_in, n_out, state_bounds,
                                      action_bounds, reward_bound, settings_)

        # create a small convolutional neural network

        self._actor_buffer_states = []
        self._actor_buffer_result_states = []
        self._actor_buffer_actions = []
        self._actor_buffer_rewards = []
        self._actor_buffer_falls = []
        self._actor_buffer_diff = []

        self._NotFallen = T.bcol("Not_Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                                  dtype=np.dtype('int8'))

        self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                        dtype='int8'),
                                               broadcastable=(False, True))

        self._tmp_diff = T.col("Tmp_Diff")
        self._tmp_diff.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_diff_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                              broadcastable=(False, True))

        self._dyna_target = T.col("DYNA_Target")
        self._dyna_target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._dyna_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                 broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(self.getSettings()['float_type']))[0]

        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(1.0)
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        ## Target network
        self._modelTarget = copy.deepcopy(model)

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
        # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1)
        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared
            self._tmp_diff:
            self._tmp_diff_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getActorNetwork(), lasagne.regularization.l2)))
        if (self.getSettings()['use_previous_value_regularization']):
            self._actor_regularization = self._actor_regularization + (
                (self.getSettings()['previous_value_regularization_weight']) *
                change_penalty(self._model.getActorNetwork(),
                               self._modelTarget.getActorNetwork()))
        elif ('regularization_type' in self.getSettings() and
              (self.getSettings()['regularization_type'] == 'KL_Divergence')):
            self._kl_firstfixed = T.mean(
                kl(
                    self._q_valsActTarget,
                    T.ones_like(self._q_valsActTarget) *
                    self.getSettings()['exploration_rate'], self._q_valsActA,
                    T.ones_like(self._q_valsActA) *
                    self.getSettings()['exploration_rate'],
                    self._action_length))
            #self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])*
            #                                                                         T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold']))
            self._actor_regularization = (self._kl_firstfixed) * (
                self.getSettings()['kl_divergence_threshold'])

            print("Using regularization type : ",
                  self.getSettings()['regularization_type'])
        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss, self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        ## Clipping the max gradient
        """
        for x in range(len(self._value_grad)): 
            self._value_grad[x] = T.clip(self._value_grad[x] ,  -0.1, 0.1)
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     self._rho,
                                                     self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.momentum(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adam(self._value_grad,
                                                  self._params,
                                                  self._critic_learning_rate,
                                                  beta1=0.9,
                                                  beta2=0.9,
                                                  epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adagrad(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## TD update
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(T.mean(self._q_func), self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
            sys.exit(-1)
        """
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), theano.tensor.tile((self._diff * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?
        self._actDiff = (self._model.getActionSymbolicVariable() -
                         self._q_valsActA_drop)
        # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here?
        # self._actDiff_drop = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here?
        ## This should be a single column vector
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.transpose(T.sum(T.pow(self._actDiff, 2),axis=1) )), (self._diff * (1.0/(1.0-self._discount_factor))))
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.reshape(T.sum(T.pow(self._actDiff, 2),axis=1), (self._batch_size, 1) )),
        #                                                                        (self._tmp_diff * (1.0/(1.0-self._discount_factor)))
        # self._actLoss_ = (T.mean(T.pow(self._actDiff, 2),axis=1))

        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (T.mean(T.pow(self._actDiff, 2), axis=1)),
            (self._tmp_diff * (1.0 / (1.0 - self._discount_factor))))
        # self._actLoss = T.sum(self._actLoss)/float(self._batch_size)
        self._actLoss = T.mean(self._actLoss_)
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                                   self._actionParams)
        ## Clipping the max gradient
        """
        for x in range(len(self._policy_grad)): 
            self._policy_grad[x] = T.clip(self._policy_grad[x] ,  -0.5, 0.5)
        """
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._actionUpdates = lasagne.updates.adagrad(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
        }

        ### Noisey state updates
        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
        # self._target_dyna = theano.gradient.disconnected_grad(self._q_func)

        ### _q_valsA because the predicted state is stored in self._model.getStateSymbolicVariable()
        self._diff_dyna = self._dyna_target - self._q_valsNextState
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff_dyna, 2)
        self._loss_dyna = T.mean(loss)

        self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization,
                                 self._params)

        self._givens_dyna = {
            # self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
            self._dyna_target:
            self._dyna_target_shared
        }
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._DYNAUpdates = lasagne.updates.rmsprop(
                self._dyna_grad, self._params, self._learning_rate, self._rho,
                self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad,
                                                         self._params,
                                                         self._learning_rate,
                                                         momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._DYNAUpdates = lasagne.updates.adagrad(
                self._dyna_grad,
                self._params,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        # self._target = self._model.getRewardSymbolicVariable() +  (self._discount_factor * self._q_valsTargetNextState )
        ### Give v(s') the next state and v(s) (target) the current state
        self._diff_adv = (self._discount_factor *
                          self._q_func) - (self._q_valsTargetNextState)
        self._diff_adv_givens = {
            self._model.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getStates(),
        }

        A_CACLA.compile(self)
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(Distillation,
              self).__init__(model, n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)

        # create a small convolutional neural network

        ### Load expert policy files
        self._expert_policies = []
        file_name_ = ""
        for i in range(len(self.getSettings()['expert_policy_files'])):
            file_name = self.getSettings(
            )['expert_policy_files'][i] + '/' + self.getSettings(
            )['model_type'] + '/' + getAgentName() + '.pkl'
            if (file_name_ == file_name):
                ## To help save memory when experts are the same
                self._expert_policies.append(model_)
            else:
                print("Loading pre compiled network: ", file_name)
                f = open(file_name, 'rb')
                model_ = dill.load(f)
                f.close()
                self._expert_policies.append(
                    model_)  # expert model, load the 2 expert models
            file_name_ = file_name

        self._actor_buffer_states = []
        self._actor_buffer_result_states = []
        self._actor_buffer_actions = []
        self._actor_buffer_rewards = []
        self._actor_buffer_falls = []
        self._actor_buffer_diff = []

        self._NotFallen = T.bcol("Not_Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                                  dtype=np.dtype('int8'))

        self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                        dtype='int8'),
                                               broadcastable=(False, True))

        self._tmp_diff = T.col("Tmp_Diff")
        self._tmp_diff.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_diff_shared = theano.shared(
            np.zeros((self._batch_size, 1),
                     dtype=self.getSettings()['float_type']),
            broadcastable=(False, True))  #定义一个共享变量,初始值为为0

        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        ## Target network
        self._modelTarget = copy.deepcopy(model)  # target model 是要更新的模型

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)  #确定性原始模型的state值输出
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)  #非确定的state值输出
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)  #下一步的state值
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)  #目标模型的下一步的state值
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)  #目标模型的state值
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)  #目标模型的state

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)  #remove the random
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)  #actor 值

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        # self._model.getRewardSymbolicVariable() 获取rewards的值getRewards() =self._rewards_shared 从0开始一直更新
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop  #更新的模型的reward减去原始模型的critic的输出值
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)  # 两个模型的reward的差值
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards()
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._tmp_diff: self._tmp_diff_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getActorNetwork(), lasagne.regularization.l2)))
        if (self.getSettings()['use_previous_value_regularization']):
            self._actor_regularization = self._actor_regularization + (
                (self.getSettings()['previous_value_regularization_weight']) *
                change_penalty(self._model.getActorNetwork(),
                               self._modelTarget.getActorNetwork()))
        elif ('regularization_type' in self.getSettings() and
              (self.getSettings()['regularization_type'] == 'KL_Divergence')):
            self._kl_firstfixed = T.mean(
                kl(
                    self._q_valsActTarget,
                    T.ones_like(self._q_valsActTarget) *
                    self.getSettings()['exploration_rate'], self._q_valsActA,
                    T.ones_like(self._q_valsActA) *
                    self.getSettings()['exploration_rate'],
                    self._action_length))
            self._actor_regularization = (self._kl_firstfixed) * (
                self.getSettings()['kl_divergence_threshold'])

            print("Using regularization type : ",
                  self.getSettings()['regularization_type'])
        # SGD update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     self._rho,
                                                     self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.momentum(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adam(self._value_grad,
                                                  self._params,
                                                  self._critic_learning_rate,
                                                  beta1=0.9,
                                                  beta2=0.9,
                                                  epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adagrad(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## TD update

        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        self._actDiff = (self._model.getActionSymbolicVariable() -
                         self._q_valsActA_drop)  # 更新模型的actor的输出减去原始模型的actor值

        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff))
        self._actLoss = T.mean(self._actLoss_)
        self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                                   self._actionParams)
        ## Clipping the max gradient
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._actionUpdates = lasagne.updates.adagrad(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates()
        }

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        ### Give v(s') the next state and v(s) (target) the current state
        self._diff_adv = (self._discount_factor * self._q_func) - (
            self._q_valsTargetNextState
        )  #\gamma*critic模型的输出-critic模型在下一个状态的输出值
        self._diff_adv_givens = {
            self._model.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getStates(),
        }

        Distillation.compile(self)