def build_q_models(policy: Policy, obs_space: gym.Space,
                   action_space: gym.Space,
                   config: TrainerConfigDict) -> ModelV2:
    """Builds q_model and target_q_model for Simple Q learning (TF version).

    The target q model is not returned, only assigned to
    `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
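# The two variable lists collected above exist so that the target network can
# periodically be synced from the online Q-network. The helper below is a
# hypothetical sketch of such a hard sync; the function name and placement are
# illustrative only, since RLlib performs this update elsewhere (e.g. via a
# target-network mixin).
def _sketch_update_target(policy: Policy) -> None:
    for var, target_var in zip(policy.q_func_vars,
                               policy.target_q_func_vars):
        # Copy each online-network weight into the corresponding target weight.
        target_var.assign(var)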
def build_q_losses(policy: Policy, model: ModelV2,
                   dist_class: Type[TFActionDistribution],
                   train_batch: SampleBatch) -> TensorType:
    """Constructs the loss for SimpleQTFPolicy.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    # Q-network evaluation.
    q_t = compute_q_values(
        policy,
        policy.model,
        train_batch[SampleBatch.CUR_OBS],
        explore=False)

    # Target Q-network evaluation.
    q_tp1 = compute_q_values(
        policy,
        policy.target_model,
        train_batch[SampleBatch.NEXT_OBS],
        explore=False)

    if not hasattr(policy, "q_func_vars"):
        policy.q_func_vars = model.variables()
        policy.target_q_func_vars = policy.target_model.variables()

    # Q scores for the actions which we know were selected in the given state.
    one_hot_selection = tf.one_hot(
        tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32),
        policy.action_space.n)
    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)

    # Compute an estimate of the best possible value starting from state t + 1.
    dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32)
    q_tp1_best_one_hot_selection = tf.one_hot(
        tf.argmax(q_tp1, 1), policy.action_space.n)
    q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = (train_batch[SampleBatch.REWARDS] +
                           policy.config["gamma"] * q_tp1_best_masked)

    # Compute the TD error (potentially clipped via the Huber loss).
    td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    loss = tf.reduce_mean(huber_loss(td_error))

    # Save the TD error as an attribute for outside access.
    policy.td_error = td_error

    return loss
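# The loss above relies on a compute_q_values() helper that is not shown in
# this section. A minimal sketch of it follows, assuming the standard ModelV2
# call convention (input dict, state list, seq_lens); the real helper in
# RLlib's simple_q policy module may do more (e.g. handle exploration or the
# is_training flag).
def compute_q_values(policy: Policy, model: ModelV2, obs: TensorType,
                     explore: bool) -> TensorType:
    # A single forward pass yields one Q-value output per discrete action.
    model_out, _ = model({SampleBatch.CUR_OBS: obs}, [], None)
    return model_out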
def _build_q_models(policy: Policy, obs_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> ModelV2:
    """Builds q_model and target_q_model for Simple Q learning.

    Note that this function works for both TensorFlow and PyTorch.

    Args:
        policy (Policy): The Policy, which will use the model for optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict): The Trainer's config dict.

    Returns:
        ModelV2: The Model for the Policy to use. The target q model will not
            be returned, just assigned to `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)
    if torch.cuda.is_available():
        policy.q_model = policy.q_model.to("cuda")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)
    if torch.cuda.is_available():
        policy.target_q_model = policy.target_q_model.to("cuda")

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
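# How these builders might be wired into a trainable policy class. This is a
# hypothetical usage sketch: only make_model and loss_fn are shown, and the
# remaining keyword arguments RLlib's policy template expects (default config,
# action distribution fn, exploration settings, etc.) are omitted. A PyTorch
# analog would go through build_torch_policy with _build_q_models instead.
from ray.rllib.policy.tf_policy_template import build_tf_policy

SimpleQTFPolicy = build_tf_policy(
    name="SimpleQTFPolicy",
    make_model=build_q_models,  # builds policy.q_model / policy.target_q_model
    loss_fn=build_q_losses)     # the TD-error / Huber loss defined above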