Example #1
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        """ Update self.model with one DQN step, weighting each sample's
        loss by sample_weight (e.g. importance-sampling weights from a
        prioritized replay buffer) and returning the per-sample TD error.
        """
        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        # Q(s, a) of the actions actually taken
        pred_action_value = layers.reduce_sum(action_onehot * pred_value,
                                              dim=1)

        # Double DQN target: pick the greedy action with the online model,
        # evaluate it with the target model
        next_action_value = self.model.value(next_obs)
        greedy_action = layers.argmax(next_action_value, axis=-1)
        greedy_action = layers.unsqueeze(greedy_action, axes=[1])
        greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
        next_pred_value = self.target_model.value(next_obs)
        max_v = layers.reduce_sum(greedy_action_onehot * next_pred_value,
                                  dim=1)
        max_v.stop_gradient = True

        # Bootstrap only for non-terminal transitions
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
        # Per-sample TD error, e.g. for updating replay priorities
        delta = layers.abs(target - pred_action_value)
        # Importance-weighted squared TD loss
        cost = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, delta
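
For reference, the Double DQN target computed above is
target = r + (1 - done) * gamma * Q_target(s', argmax_a Q_online(s', a)).
A minimal NumPy sketch of that computation (the function name and the
batched arrays are illustrative, not part of the original snippet):

import numpy as np

def double_dqn_target(reward, terminal, next_q_online, next_q_target, gamma):
    # Select the greedy action with the online network ...
    greedy = np.argmax(next_q_online, axis=1)
    # ... but evaluate it with the target network.
    max_v = next_q_target[np.arange(len(greedy)), greedy]
    return reward + (1.0 - terminal.astype(np.float32)) * gamma * max_v
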
Example #2
    def predict(self, obs):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space),
                 e.g. [B, C, H, W] for Atari.
        """
        logits = self.model.policy(obs)
        probs = layers.softmax(logits)

        # Greedy action; since softmax is monotonic, the argmax over
        # probs equals the argmax over logits
        predict_actions = layers.argmax(probs, 1)

        return predict_actions
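
This is the greedy, evaluation-time path. PARL's policy-based agents
typically pair it with a stochastic sample method for exploration; a
sketch, assuming fluid's layers.sampling_id is available:

    def sample(self, obs):
        """ Sample an action from the softmax policy for exploration. """
        logits = self.model.policy(obs)
        probs = layers.softmax(logits)
        # Draw one action index per row according to probs
        sample_actions = layers.sampling_id(probs)
        return sample_actions
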
Example #3
    def learn(self,
              obs,
              action,
              reward,
              next_obs,
              terminal,
              learning_rate=None):
        """ update value model self.model with DQN algorithm
        """
        # Support the modification of learning_rate
        if learning_rate is None:
            assert isinstance(
                self.lr,
                float), "Please set the learning rate of DQN in initializaion."
            learning_rate = self.lr

        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        # Q(s, a) of the actions actually taken
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)

        # Double DQN target: pick the greedy action with the online model,
        # evaluate it with the target model
        next_action_value = self.model.value(next_obs)
        greedy_action = layers.argmax(next_action_value, axis=-1)
        greedy_action = layers.unsqueeze(greedy_action, axes=[1])
        greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
        next_pred_value = self.target_model.value(next_obs)
        max_v = layers.reduce_sum(greedy_action_onehot * next_pred_value,
                                  dim=1)
        max_v.stop_gradient = True

        # Bootstrap only for non-terminal transitions
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                         epsilon=1e-3)
        optimizer.minimize(cost)
        return cost
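
Neither learn method refreshes the target network itself; in PARL this is
usually done periodically from the agent, e.g. every fixed number of
training steps. A minimal sketch, assuming PARL's Model.sync_weights_to is
available (the method name sync_target follows PARL's convention):

    def sync_target(self):
        """ Copy the online network's weights into the target network. """
        self.model.sync_weights_to(self.target_model)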