def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
    """ Update self.model with a per-sample-weighted DQN loss.

    Besides the mean cost, returns delta = |target - Q(obs, action)|, the
    per-sample TD error used to refresh priorities in a prioritized
    replay buffer.
    """
    pred_value = self.model.value(obs)
    action_onehot = layers.one_hot(action, self.act_dim)
    pred_action_value = layers.reduce_sum(action_onehot * pred_value, dim=1)

    # calculate the target q value (Double-DQN style: pick the greedy
    # action with the online model, evaluate it with the target model)
    next_action_value = self.model.value(next_obs)
    greedy_action = layers.argmax(next_action_value, axis=-1)
    greedy_action = layers.unsqueeze(greedy_action, axes=[1])
    greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
    next_pred_value = self.target_model.value(next_obs)
    max_v = layers.reduce_sum(
        greedy_action_onehot * next_pred_value, dim=1)
    max_v.stop_gradient = True

    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
    delta = layers.abs(target - pred_action_value)
    cost = sample_weight * layers.square_error_cost(
        pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, delta
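# Illustrative only: a minimal, framework-agnostic sketch of how the `delta`
# returned above is typically mapped to proportional priorities for a
# prioritized replay buffer. The class name and the `alpha`/`eps` values are
# assumptions for illustration, not part of the original code.
import numpy as np


class ProportionalPriority(object):
    """Map per-sample TD errors to prioritized-replay priorities."""

    def __init__(self, alpha=0.6, eps=1e-6):
        self.alpha = alpha  # how strongly sampling follows the TD error
        self.eps = eps      # keeps zero-error transitions sampleable

    def from_delta(self, delta):
        # `delta` is the |target - Q(obs, action)| array fetched after a
        # learn step; the result is written back into the replay buffer.
        return (np.abs(delta) + self.eps) ** self.alpha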
def predict(self, obs):
    """ Predict the most probable action under the current policy.

    Args:
        obs: A float32 tensor of shape ([B] + observation_space),
            e.g. [B, C, H, W] in Atari.
    """
    logits = self.model.policy(obs)
    probs = layers.softmax(logits)
    predict_actions = layers.argmax(probs, 1)
    return predict_actions
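# Illustrative only: predict() above is purely greedy. During sampling, a
# DQN-style agent usually layers exploration on top of the greedy action
# fetched from the compiled predict program, e.g. with an epsilon-greedy rule
# like the sketch below (the function name and epsilon value are assumptions,
# not part of the original code).
import numpy as np


def epsilon_greedy(greedy_action, act_dim, epsilon=0.1, rng=np.random):
    """With probability epsilon pick a uniformly random action,
    otherwise keep the greedy action returned by predict()."""
    if rng.rand() < epsilon:
        return rng.randint(act_dim)
    return int(greedy_action)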
def learn(self, obs, action, reward, next_obs, terminal, learning_rate=None):
    """ Update value model self.model with the DQN algorithm.
    """
    # Support overriding the learning rate when the program is built
    if learning_rate is None:
        assert isinstance(
            self.lr,
            float), "Please set the learning rate of DQN in initialization."
        learning_rate = self.lr

    pred_value = self.model.value(obs)
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)

    # calculate the target q value (Double-DQN style, as above)
    next_action_value = self.model.value(next_obs)
    greedy_action = layers.argmax(next_action_value, axis=-1)
    greedy_action = layers.unsqueeze(greedy_action, axes=[1])
    greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
    next_pred_value = self.target_model.value(next_obs)
    max_v = layers.reduce_sum(
        greedy_action_onehot * next_pred_value, dim=1)
    max_v.stop_gradient = True

    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(
        learning_rate=learning_rate, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost
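# Illustrative only: a sketch of how learn() is typically wired into a fluid
# program (the PARL 1.x agent pattern: build the program, run the startup
# program once, then execute). `alg` and OBS_DIM are placeholder assumptions;
# `alg` stands for an instance of the class defining learn() above, and
# predict() would be compiled into its own program in the same way.
import numpy as np
import paddle.fluid as fluid

OBS_DIM = 4  # e.g. a CartPole-sized observation

learn_program = fluid.Program()
with fluid.program_guard(learn_program):
    obs = fluid.layers.data(name='obs', shape=[OBS_DIM], dtype='float32')
    act = fluid.layers.data(name='act', shape=[1], dtype='int32')
    reward = fluid.layers.data(name='reward', shape=[], dtype='float32')
    next_obs = fluid.layers.data(
        name='next_obs', shape=[OBS_DIM], dtype='float32')
    terminal = fluid.layers.data(name='terminal', shape=[], dtype='bool')
    # learning_rate overrides self.lr; it is fixed for this graph because
    # the Adam optimizer is constructed inside learn().
    cost = alg.learn(obs, act, reward, next_obs, terminal, learning_rate=1e-4)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# One gradient step on a single transition (normally a sampled batch).
batch_obs = np.random.rand(1, OBS_DIM).astype('float32')
feed = {
    'obs': batch_obs,
    'act': np.array([[0]], dtype='int32'),
    'reward': np.array([1.0], dtype='float32'),
    'next_obs': batch_obs,
    'terminal': np.array([False], dtype='bool'),
}
loss = exe.run(learn_program, feed=feed, fetch_list=[cost])[0]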