def update(self, ob_no, next_ob_no, act_t_ph, re_n, terminal_n, lr):
        ob_no, next_ob_no, act_t_ph, re_n, terminal_n = \
            [x.to(self.device) for x in [ob_no, next_ob_no, act_t_ph, re_n, terminal_n]]

        # setting the learning-rate value
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

        target_q_t = self.__calc_target_vals(next_ob_no, re_n, terminal_n)

        q_t = torch.sum(self.q_t_values(ob_no) *
                        torch_one_hot(act_t_ph, self.ac_dim),
                        dim=1)
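        # The one-hot multiply + sum above picks out Q(s, a) for the action taken.
        # An equivalent gather-based form (a sketch, assuming act_t_ph holds integer
        # action indices of shape (batch,)):
        #   q_t = self.q_t_values(ob_no).gather(1, act_t_ph.long().unsqueeze(1)).squeeze(1)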

        #####################

        # TODO compute the Bellman error (i.e. TD error between q_t and target_q_t)
        # Note that this scalar-valued tensor later gets passed into the optimizer, to be minimized
        # HINT: use reduce mean of huber_loss (from infrastructure/dqn_utils.py) instead of squared error
        total_error = torch.mean(huber_loss(q_t, target_q_t))
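        # For reference, the Huber loss on the TD error x, assuming the usual
        # delta = 1 in the dqn_utils implementation, is
        #   L(x) = 0.5 * x**2     if |x| <= 1
        #        = |x| - 0.5      otherwise
        # which penalizes large TD errors less harshly than the squared error.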

        #####################

        # train_fn will be called in order to train the critic (by minimizing the TD error)
        self.optimizer.zero_grad()
        total_error.backward()
        nn.utils.clip_grad_norm_(self.q_t_values.parameters(),
                                 self.grad_norm_clipping)

        self.optimizer.step()
        return total_error
    def _build(self, q_func):

        #####################

        # q values, created with the placeholder that holds CURRENT obs (i.e., t)
        self.q_t_values = q_func(self.obs_t_ph,
                                 self.ac_dim,
                                 scope='q_func',
                                 reuse=False)
        self.q_t = tf.reduce_sum(self.q_t_values *
                                 tf.one_hot(self.act_t_ph, self.ac_dim),
                                 axis=1)

        #####################

        # target q values, created with the placeholder that holds NEXT obs (i.e., t+1)
        q_tp1_values = q_func(self.obs_tp1_ph,
                              self.ac_dim,
                              scope='target_q_func',
                              reuse=False)

        if self.double_q:
            # You must fill this part for Q2 of the Q-learning portion of the homework.
            # In double Q-learning, the best action is selected using the Q-network that
            # is being updated, but the Q-value for this action is obtained from the
            # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details.
            # NOTE: the greedy action must come from the ONLINE network evaluated on the
            # NEXT observation, so the 'q_func' scope is reused on obs_tp1_ph here.
            q_tp1_values_online = q_func(self.obs_tp1_ph,
                                         self.ac_dim,
                                         scope='q_func',
                                         reuse=True)
            q_tp1 = tf.reduce_sum(
                q_tp1_values *
                tf.one_hot(tf.argmax(q_tp1_values_online, axis=1), self.ac_dim),
                axis=1)
        else:
            # q values of the next timestep
            q_tp1 = tf.reduce_max(q_tp1_values, axis=1)
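
        # In symbols (matching the two branches above):
        #   vanilla DQN: q_tp1 = max_a' Q_target(s', a')
        #   double DQN:  q_tp1 = Q_target(s', argmax_a' Q_online(s', a'))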

        #####################

        # TODO calculate the targets for the Bellman error
        # HINT1: as you saw in lecture, this would be:
        #currentReward + self.gamma * qValuesOfNextTimestep * (1 - self.done_mask_ph)
        # HINT2: see above, where q_tp1 is defined as the q values of the next timestep
        # HINT3: see the defined placeholders and look for the one that holds current rewards
        target_q_t = self.rew_t_ph + self.gamma * q_tp1 * (1 -
                                                           self.done_mask_ph)
        target_q_t = tf.stop_gradient(target_q_t)
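        # stop_gradient freezes the target, so minimizing the TD error below only
        # updates the online Q-network; no gradients flow into the target values.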

        #####################

        # TODO compute the Bellman error (i.e. TD error between q_t and target_q_t)
        # Note that this scalar-valued tensor later gets passed into the optimizer, to be minimized
        # HINT: use reduce mean of huber_loss (from infrastructure/dqn_utils.py) instead of squared error
        self.total_error = tf.reduce_mean(huber_loss(self.q_t - target_q_t))

        #####################

        # TODO these variables should hold all of the
        # variables of the Q-function network and target network, respectively
        # HINT1: see the "scope" under which the variables were constructed in the lines at the top of this function
        # HINT2: use tf.get_collection to look for all variables under a certain scope
        q_func_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope='q_func')
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func')

        #####################

        # train_fn will be called in order to train the critic (by minimizing the TD error)
        self.learning_rate = tf.placeholder(tf.float32, (),
                                            name="learning_rate")
        optimizer = self.optimizer_spec.constructor(
            learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
        self.train_fn = minimize_and_clip(optimizer,
                                          self.total_error,
                                          var_list=q_func_vars,
                                          clip_val=self.grad_norm_clipping)
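        # minimize_and_clip (from dqn_utils) is assumed to clip the gradients to
        # clip_val before applying them (the graph-mode counterpart of the
        # clip_grad_norm_ call in the torch-based update() above).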

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_fn = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_fn.append(var_target.assign(var))
        self.update_target_fn = tf.group(*update_target_fn)
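
        # Typical driver usage (a sketch; sess is the tf.Session owned by the
        # training loop and the placeholder attributes are the ones used above):
        #   _, err = sess.run([self.train_fn, self.total_error],
        #                     feed_dict={self.obs_t_ph: obs, self.act_t_ph: acts,
        #                                self.rew_t_ph: rews, self.obs_tp1_ph: next_obs,
        #                                self.done_mask_ph: dones, self.learning_rate: lr})
        #   sess.run(self.update_target_fn)  # run periodically to sync the target network
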
    def _build(self, q_func):

        #####################

        # q values, created with the placeholder that holds CURRENT obs (i.e., t)
        # online network: Q_phi(s, a)
        self.q_t_values = q_func(
            self.obs_t_ph, self.ac_dim, scope='q_func',
            reuse=False)  # reuse=False builds an independent set of variables
        self.q_t = tf.reduce_sum(self.q_t_values *
                                 tf.one_hot(self.act_t_ph, self.ac_dim),
                                 axis=1)  # selects Q(s, a) for the action actually taken

        #####################

        # target q values, created with the placeholder that holds NEXT obs (i.e., t+1)
        # vector for a': Q_phi'(s', a')
        q_tp1_values = q_func(self.obs_tp1_ph,
                              self.ac_dim,
                              scope='target_q_func',
                              reuse=False)

        if self.double_q:
            # You must fill this part for Q2 of the Q-learning portion of the homework.
            # In double Q-learning, the best action is selected using the Q-network that
            # is being updated, but the Q-value for this action is obtained from the
            # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details.

            # Q_phi'(s', argmax_a'(Q_phi(s', a')))
            q_t_values_for_tp1 = q_func(self.obs_tp1_ph,
                                        self.ac_dim,
                                        scope='q_func',
                                        reuse=True)  # reuse the training model
            num_sample = tf.shape(self.obs_tp1_ph)[0]
            index = tf.stack([
                tf.range(num_sample),
                tf.cast(tf.argmax(q_t_values_for_tp1, axis=1), tf.int32)
            ],
                             axis=1)  # build index
            q_tp1 = tf.gather_nd(q_tp1_values, index)
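            # index[i] = [i, argmax_a Q_phi(s'_i, a)], so gather_nd picks one entry
            # per row: q_tp1[i] = Q_phi'(s'_i, argmax_a Q_phi(s'_i, a)).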

        else:
            # q values of the next timestep
            # Q_phi'(s', argmax_a'(Q_phi'(s', a')))
            q_tp1 = tf.reduce_max(q_tp1_values, axis=1)

        #####################

        # TODO calculate the targets for the Bellman error
        # HINT1: as you saw in lecture, this would be:
        #currentReward + self.gamma * qValuesOfNextTimestep * (1 - self.done_mask_ph)
        # HINT2: see above, where q_tp1 is defined as the q values of the next timestep
        # HINT3: see the defined placeholders and look for the one that holds current rewards
        # Here the target is defined as a node in the computation graph;
        # in the actor-critic code, by contrast, the target is computed
        # directly as a numpy array.
        target_q_t = self.rew_t_ph + (1 -
                                      self.done_mask_ph) * self.gamma * q_tp1
        target_q_t = tf.stop_gradient(
            target_q_t
        )  # when computing (prediction - target), don't let gradients flow into the target

        #####################

        # TODO compute the Bellman error (i.e. TD error between q_t and target_q_t)
        # Note that this scalar-valued tensor later gets passed into the optimizer, to be minimized
        # HINT: use reduce mean of huber_loss (from infrastructure/dqn_utils.py) instead of squared error
        # (i.e. the Huber loss rather than mean squared error)
        self.total_error = tf.reduce_mean(huber_loss(self.q_t - target_q_t))

        #####################

        # TODO these variables should hold all of the
        # variables of the Q-function network and target network, respectively
        # HINT1: see the "scope" under which the variables were constructed in the lines at the top of this function
        # HINT2: use tf.get_collection to look for all variables under a certain scope
        q_func_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope='q_func')
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func')

        #####################

        # train_fn will be called in order to train the critic (by minimizing the TD error)
        self.learning_rate = tf.placeholder(tf.float32, (),
                                            name="learning_rate")
        optimizer = self.optimizer_spec.constructor(
            learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
        self.train_fn = minimize_and_clip(optimizer,
                                          self.total_error,
                                          var_list=q_func_vars,
                                          clip_val=self.grad_norm_clipping)
        #####################

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_fn = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            if not self.hparams['use_polyak']:
                update_target_fn.append(var_target.assign(var))
            else:
                update_target_fn.append(
                    var_target.assign(0.0001 * var + 0.9999 * var_target))
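                # soft (Polyak) update: theta_target <- tau * theta + (1 - tau) * theta_target,
                # with a hard-coded tau = 1e-4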

        self.update_target_fn = tf.group(
            *update_target_fn)  # combined assign op; tf.group bundles the individual ops into one