def _setup_policy(self, time_step_spec, action_spec,
                  boltzmann_temperature, emit_log_probability):

  policy = categorical_q_policy.CategoricalQPolicy(
      time_step_spec,
      action_spec,
      self._q_network,
      self._min_q_value,
      self._max_q_value,
      observation_and_action_constraint_splitter=(
          self._observation_and_action_constraint_splitter))

  if boltzmann_temperature is not None:
    collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=boltzmann_temperature)
  else:
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  policy = greedy_policy.GreedyPolicy(policy)

  # Create self._target_greedy_policy so target Q-values can be computed
  # from the target network.
  target_policy = categorical_q_policy.CategoricalQPolicy(
      time_step_spec,
      action_spec,
      self._target_q_network,
      self._min_q_value,
      self._max_q_value,
      observation_and_action_constraint_splitter=(
          self._observation_and_action_constraint_splitter))
  self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)

  return policy, collect_policy
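# Sketch of how the two returned policies are typically consumed downstream
# (the `agent` and `time_step` names are assumptions for illustration, not
# part of this source): the greedy-wrapped policy drives evaluation, while
# the epsilon-greedy or Boltzmann wrapper drives data collection.
eval_action = agent.policy.action(time_step).action             # greedy
collect_action = agent.collect_policy.action(time_step).action  # exploratory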
def testMasking(self):
  batch_size = 1000
  num_state_dims = 5
  num_actions = 8
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_step = ts.restart(observations, batch_size=batch_size)
  input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  action_spec = tensor_spec.BoundedTensorSpec(
      [1], tf.int32, 0, num_actions - 1)

  # We create a fixed mask here for testing purposes. Normally the mask would
  # be part of the observation.
  mask = [0, 1, 0, 1, 0, 0, 1, 0]
  np_mask = np.array(mask)
  tf_mask = tf.constant([mask for _ in range(batch_size)])

  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=input_tensor_spec,
      action_spec=action_spec,
      num_atoms=3,
      fc_layer_params=[4])
  policy = categorical_q_policy.CategoricalQPolicy(
      self._time_step_spec,
      action_spec,
      q_network,
      self._min_q_value,
      self._max_q_value,
      observation_and_action_constraint_splitter=(
          lambda observation: (observation, tf_mask)))

  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Sample from the policy 1000 times, and ensure that actions considered
  # invalid according to the mask are never chosen.
  action_step = policy.action(time_step)
  action = self.evaluate(action_step.action)
  self.assertEqual(action.shape, (batch_size,))
  self.assertAllEqual(np_mask[action], np.ones([batch_size]))
def testMasking(self):
  batch_size = 1000
  num_state_dims = 5
  num_actions = 8
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_step = ts.restart(observations, batch_size=batch_size)
  input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  action_spec = tensor_spec.BoundedTensorSpec(
      [1], tf.int32, 0, num_actions - 1)

  mask = [0, 1, 0, 1, 0, 0, 1, 0]
  np_mask = np.array(mask)
  tf_mask = tf.constant([mask for _ in range(batch_size)])

  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=input_tensor_spec,
      action_spec=action_spec,
      num_atoms=3,
      mask_split_fn=lambda observation: (observation, tf_mask),
      fc_layer_params=[4])
  policy = categorical_q_policy.CategoricalQPolicy(
      self._min_q_value, self._max_q_value, q_network, action_spec)

  # Force creation of variables before global_variables_initializer.
  policy.variables()
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Sample from the policy 1000 times and ensure that invalid actions are
  # never chosen.
  action_step = policy.action(time_step)
  action = self.evaluate(action_step.action)
  self.assertEqual(action.shape, (batch_size,))
  self.assertAllEqual(np_mask[action], np.ones([batch_size]))
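# Both testMasking variants above (new- and old-style APIs) inject a fixed
# mask through a splitter for determinism. As the first test's comment notes,
# in practice the mask usually travels inside the observation itself. A
# minimal sketch of such a splitter, assuming a dict observation with
# hypothetical keys 'state' and 'mask':
def observation_and_action_constraint_splitter(observation):
  # Returns (network_input, mask): the network sees only the state, while
  # the policy uses the mask to rule out invalid actions when sampling.
  return observation['state'], observation['mask']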
def testUpdate(self):
  policy = categorical_q_policy.CategoricalQPolicy(
      self._min_q_value, self._max_q_value, self._q_network,
      self._action_spec)
  new_policy = categorical_q_policy.CategoricalQPolicy(
      self._min_q_value, self._max_q_value, self._q_network,
      self._action_spec)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations)

  # There should be two variables in our networks for the fc_layer we
  # specified (one kernel and one bias).
  self.assertLen(policy.variables(), 2)
  self.assertLen(new_policy.variables(), 2)

  actions, _, _ = policy.action(time_step)
  new_actions, _, _ = new_policy.action(time_step)

  self.assertEqual(actions.shape, new_actions.shape)
  self.assertEqual(actions.dtype, new_actions.dtype)
  self.evaluate(tf.compat.v1.global_variables_initializer())

  actions = self.evaluate(actions)

  # actions should be a list of two elements; e.g., [0, 1]
  self.assertLen(actions, 2)
  for action in actions:
    self.assertGreaterEqual(action, self._action_spec.minimum)
    self.assertLessEqual(action, self._action_spec.maximum)

  self.assertEqual(self.evaluate(new_policy.update(policy)), None)

  new_actions = self.evaluate(new_actions)

  # new_actions should also be a list of two elements; e.g., [0, 1]
  self.assertLen(new_actions, 2)
  for action in new_actions:
    self.assertGreaterEqual(action, self._action_spec.minimum)
    self.assertLessEqual(action, self._action_spec.maximum)
def testBuild(self):
  policy = categorical_q_policy.CategoricalQPolicy(
      self._time_step_spec, self._action_spec, self._q_network,
      self._min_q_value, self._max_q_value)

  self.assertEqual(policy.time_step_spec, self._time_step_spec)
  self.assertEqual(policy.action_spec, self._action_spec)

  # There should be two variables in our network for the fc_layer we
  # specified (one kernel and one bias).
  self.assertLen(policy.variables(), 2)
def testSaver(self):
  policy = categorical_q_policy.CategoricalQPolicy(
      self._time_step_spec, self._action_spec, self._q_network,
      self._min_q_value, self._max_q_value)
  saver = policy_saver.PolicySaver(policy)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(tf.compat.v1.local_variables_initializer())

  save_path = os.path.join(flags.FLAGS.test_tmpdir,
                           'saved_categorical_q_policy')
  saver.save(save_path)
def testMultipleActionsRaiseError(self):
  with self.assertRaisesRegexp(
      TypeError, '.*action_spec must be a BoundedTensorSpec.*'):
    # Replace the action_spec for this test.
    action_spec = [tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)] * 2
    q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=self._obs_spec,
        action_spec=action_spec,
        num_atoms=3,
        fc_layer_params=[4])
    categorical_q_policy.CategoricalQPolicy(
        self._time_step_spec, action_spec, q_network,
        self._min_q_value, self._max_q_value)
def testSaver(self):
  policy = categorical_q_policy.CategoricalQPolicy(
      self._time_step_spec, self._action_spec, self._q_network,
      self._min_q_value, self._max_q_value)
  train_step = tf.compat.v1.train.get_or_create_global_step()
  saver = policy_saver.PolicySaver(policy, train_step=train_step)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(tf.compat.v1.local_variables_initializer())

  save_path = os.path.join(flags.FLAGS.test_tmpdir,
                           'saved_categorical_q_policy')

  # For TF1 compatibility we set the cached session as default. This is a
  # no-op in TF2.
  with self.cached_session():
    saver.save(save_path)
def testSample(self):
  policy = categorical_q_policy.CategoricalQPolicy(
      self._time_step_spec, self._action_spec, self._q_network,
      self._min_q_value, self._max_q_value)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations)
  actions = policy.action(time_step).action
  self.assertEqual(actions.shape.as_list(), [2])
  self.assertEqual(actions.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  actions = self.evaluate(actions)

  # actions should be a list of two elements; e.g., [0, 1]
  self.assertLen(actions, 2)
  for action in actions:
    self.assertGreaterEqual(action, self._action_spec.minimum)
    self.assertLessEqual(action, self._action_spec.maximum)
def testMultiSample(self):
  policy = categorical_q_policy.CategoricalQPolicy(
      self._min_q_value, self._max_q_value, self._q_network,
      self._action_spec)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations)
  actions, _ = policy.step(time_step, num_samples=2)
  self.assertEqual(actions.shape.as_list(), [2, 2])
  self.assertEqual(actions.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  actions = self.evaluate(actions)

  # actions should be a nested list of the form [[0, 1], [1, 0]]
  self.assertLen(actions, 2)
  for inner_list in actions:
    self.assertLen(inner_list, 2)
    for action in inner_list:
      self.assertGreaterEqual(action, self._action_spec.minimum)
      self.assertLessEqual(action, self._action_spec.maximum)
def __init__(
    self,
    time_step_spec,
    action_spec,
    categorical_q_network,
    optimizer,
    min_q_value=-10.0,
    max_q_value=10.0,
    epsilon_greedy=0.1,
    n_step_update=1,
    boltzmann_temperature=None,
    # Params for target network updates
    target_update_tau=1.0,
    target_update_period=1,
    # Params for training.
    td_errors_loss_fn=None,
    gamma=1.0,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for debugging
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    name=None):
  """Creates a Categorical DQN Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A `BoundedTensorSpec` representing the actions.
    categorical_q_network: A categorical_q_network.CategoricalQNetwork that
      returns the q_distribution for each action.
    optimizer: The optimizer to use for training.
    min_q_value: A float specifying the minimum Q-value, used for setting up
      the support.
    max_q_value: A float specifying the maximum Q-value, used for setting up
      the support.
    epsilon_greedy: probability of choosing a random action in the default
      epsilon-greedy collect policy (used only if a wrapper is not provided
      to the collect_policy method).
    n_step_update: The number of steps to consider when computing TD error
      and TD loss. Defaults to single-step updates. Note that this requires
      the user to call train on Trajectory objects with a time dimension of
      `n_step_update + 1`. However, note that we do not yet support
      `n_step_update > 1` in the case of RNNs (i.e., non-empty
      `q_network.state_spec`).
    boltzmann_temperature: Temperature value to use for Boltzmann sampling
      of the actions during data collection. The closer to 0.0, the higher
      the probability of choosing the best action.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of element_wise_huber_loss is used. This function
      takes as input the target and the estimated Q values and returns the
      loss for each element of the batch.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the
      train op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    TypeError: If the action spec contains more than one action.
  """
  num_atoms = getattr(categorical_q_network, 'num_atoms', None)
  if num_atoms is None:
    raise TypeError('Expected categorical_q_network to have property '
                    '`num_atoms`, but it doesn\'t (note: you likely want to '
                    'use a CategoricalQNetwork). Network is: %s' %
                    (categorical_q_network,))

  self._num_atoms = num_atoms
  self._min_q_value = min_q_value
  self._max_q_value = max_q_value
  self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

  super(CategoricalDqnAgent, self).__init__(
      time_step_spec,
      action_spec,
      categorical_q_network,
      optimizer,
      epsilon_greedy=epsilon_greedy,
      n_step_update=n_step_update,
      boltzmann_temperature=boltzmann_temperature,
      target_update_tau=target_update_tau,
      target_update_period=target_update_period,
      td_errors_loss_fn=td_errors_loss_fn,
      gamma=gamma,
      reward_scale_factor=reward_scale_factor,
      gradient_clipping=gradient_clipping,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter,
      name=name)

  policy = categorical_q_policy.CategoricalQPolicy(
      min_q_value, max_q_value, self._q_network, self._action_spec)

  if boltzmann_temperature is not None:
    self._collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  self._policy = greedy_policy.GreedyPolicy(policy)
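# A minimal construction sketch for the agent defined above (not from the
# source; the specs and hyperparameters below are assumptions chosen for
# illustration, and the import paths assume the standard tf_agents layout).
from tf_agents.agents.categorical_dqn import categorical_dqn_agent
from tf_agents.networks import categorical_q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
import tensorflow as tf

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
time_step_spec = ts.time_step_spec(observation_spec)

categorical_q_net = categorical_q_network.CategoricalQNetwork(
    observation_spec,
    action_spec,
    num_atoms=51,  # C51's canonical atom count; any positive int works.
    fc_layer_params=(100,))

agent = categorical_dqn_agent.CategoricalDqnAgent(
    time_step_spec,
    action_spec,
    categorical_q_network=categorical_q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    min_q_value=-10.0,
    max_q_value=10.0,
    n_step_update=2)  # train() then expects trajectories with time dim 3.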
def __init__(
    self,
    time_step_spec,
    action_spec,
    categorical_q_network,
    optimizer,
    min_q_value=-10.0,
    max_q_value=10.0,
    epsilon_greedy=0.1,
    n_step_update=1,
    boltzmann_temperature=None,
    # Params for target network updates
    target_categorical_q_network=None,
    target_update_tau=1.0,
    target_update_period=1,
    # Params for training.
    td_errors_loss_fn=None,
    gamma=1.0,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for debugging
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    name=None):
  """Creates a Categorical DQN Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A `BoundedTensorSpec` representing the actions.
    categorical_q_network: A categorical_q_network.CategoricalQNetwork that
      returns the q_distribution for each action.
    optimizer: The optimizer to use for training.
    min_q_value: A float specifying the minimum Q-value, used for setting up
      the support.
    max_q_value: A float specifying the maximum Q-value, used for setting up
      the support.
    epsilon_greedy: probability of choosing a random action in the default
      epsilon-greedy collect policy (used only if a wrapper is not provided
      to the collect_policy method).
    n_step_update: The number of steps to consider when computing TD error
      and TD loss. Defaults to single-step updates. Note that this requires
      the user to call train on Trajectory objects with a time dimension of
      `n_step_update + 1`. However, note that we do not yet support
      `n_step_update > 1` in the case of RNNs (i.e., non-empty
      `q_network.state_spec`).
    boltzmann_temperature: Temperature value to use for Boltzmann sampling
      of the actions during data collection. The closer to 0.0, the higher
      the probability of choosing the best action.
    target_categorical_q_network: (Optional.) A `tf_agents.network.Network`
      to be used as the target network during Q learning. Every
      `target_update_period` train steps, the weights from
      `categorical_q_network` are copied (possibly with smoothing via
      `target_update_tau`) to `target_categorical_q_network`.

      If `target_categorical_q_network` is not provided, it is created by
      making a copy of `categorical_q_network`, which initializes a new
      network with the same structure and its own layers and weights.

      Network copying is performed via the `Network.copy` superclass method,
      and may inadvertently lead to the resulting network sharing weights
      with the original. This can happen if, for example, the original
      network accepted a pre-built Keras layer in its `__init__`, or
      accepted a Keras layer that wasn't built, but the copy neglected to
      create a new one.

      In these cases, it is up to you to provide a target Network having
      weights that are not shared with the original
      `categorical_q_network`. If you provide a
      `target_categorical_q_network` that shares any weights with
      `categorical_q_network`, a warning will be logged but no exception is
      thrown.

      Note: shallow copies of Keras layers may be built via the code:

      ```python
      new_layer = type(layer).from_config(layer.get_config())
      ```
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of huber_loss is used. This function takes as input
      the target and the estimated Q values and returns the loss for each
      element of the batch.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the
      train op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    TypeError: If the action spec contains more than one action.
  """
  super(CategoricalDqnAgent, self).__init__(
      time_step_spec,
      action_spec,
      categorical_q_network,
      optimizer,
      epsilon_greedy=epsilon_greedy,
      n_step_update=n_step_update,
      boltzmann_temperature=boltzmann_temperature,
      target_q_network=target_categorical_q_network,
      target_update_tau=target_update_tau,
      target_update_period=target_update_period,
      td_errors_loss_fn=td_errors_loss_fn,
      gamma=gamma,
      reward_scale_factor=reward_scale_factor,
      gradient_clipping=gradient_clipping,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter,
      name=name)

  def check_atoms(net, label):
    num_atoms = getattr(net, 'num_atoms', None)
    if num_atoms is None:
      raise TypeError('Expected {} to have property `num_atoms`, but it '
                      'doesn\'t (note: you likely want to use a '
                      'CategoricalQNetwork). Network is: {}'.format(
                          label, net))
    return num_atoms

  num_atoms = check_atoms(self._q_network, 'categorical_q_network')
  target_num_atoms = check_atoms(self._target_q_network,
                                 'target_categorical_q_network')
  if num_atoms != target_num_atoms:
    raise ValueError(
        'categorical_q_network and target_categorical_q_network have '
        'different numbers of atoms: {} vs. {}'.format(
            num_atoms, target_num_atoms))
  self._num_atoms = num_atoms

  self._min_q_value = min_q_value
  self._max_q_value = max_q_value
  self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

  policy = categorical_q_policy.CategoricalQPolicy(
      min_q_value, max_q_value, self._q_network, self._action_spec)

  if boltzmann_temperature is not None:
    self._collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  self._policy = greedy_policy.GreedyPolicy(policy)

  target_policy = categorical_q_policy.CategoricalQPolicy(
      min_q_value, max_q_value, self._target_q_network, self._action_spec)
  self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)
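# Standalone sketch of the support grid computed in __init__ above: a fixed
# set of num_atoms return values between min_q_value and max_q_value over
# which the network predicts a probability distribution. With the default
# bounds (-10, 10) and 51 atoms (the atom count is an assumption here; in
# the agent it comes from the network), the atoms sit
# (10 - (-10)) / (51 - 1) = 0.4 apart.
import tensorflow as tf

support = tf.linspace(-10.0, 10.0, 51)
print(support[:3].numpy())  # approx. [-10.  -9.6  -9.2]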