def setUp(self):
  super(PengsQLambdaTest, self).setUp()
  # Tensor dimensions below: TxBxA (time, batch id, action).
  self.q_tm1 = tf.constant(
      [[[1.1, 2.1], [2.1, 3.1]],
       [[-1.1, 1.1], [-1.1, 0.1]],
       [[3.1, -3.1], [-2.1, -1.1]]],
      dtype=tf.float32)
  self.q_t = tf.constant(
      [[[1.2, 2.2], [4.2, 2.2]],
       [[-1.2, 0.2], [1.2, 1.2]],
       [[2.2, -1.2], [-1.2, -2.2]]],
      dtype=tf.float32)
  # Tensor dimensions below: TxB (time, batch id).
  self.a_tm1 = tf.constant([[0, 1], [1, 0], [0, 0]], dtype=tf.int32)
  self.pcont_t = tf.constant([[0.00, 0.88], [0.89, 1.00], [0.85, 0.83]],
                             dtype=tf.float32)
  self.r_t = tf.constant([[-1.3, 1.3], [-1.3, 5.3], [2.3, -3.3]],
                         dtype=tf.float32)
  self.lambda_scalar = 0.5
  self.lambda_ = tf.constant([[self.lambda_scalar, self.lambda_scalar],
                              [self.lambda_scalar, self.lambda_scalar],
                              [self.lambda_scalar, self.lambda_scalar]],
                             dtype=tf.float32)
  # Evaluate trusted values by defining lambda_ as a tensor.
  self.qlearning_reference = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                                        self.pcont_t, self.q_t, self.lambda_)
  # Evaluate values by defining lambda_ as a python number.
  self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                              self.pcont_t, self.q_t, self.lambda_scalar)
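# A minimal sketch (an assumption, not part of the original test file) of how
# the scalar-lambda and tensor-lambda ops built in setUp above might be
# compared. It assumes rl.qlambda returns the LossOutput namedtuple used
# elsewhere in this codebase (so `.loss` is available) and uses the
# tf.test.TestCase helpers `self.evaluate` and `assertAllClose`.
def testScalarLambdaMatchesTensorLambda(self):
  reference_loss = self.evaluate(self.qlearning_reference.loss)
  scalar_loss = self.evaluate(self.qlearning.loss)
  self.assertAllClose(reference_loss, scalar_loss)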
def testCompatibilityCheck(self):
  r_t = tf.placeholder(tf.float32, [4, 2])
  with self.assertRaisesRegexp(
      ValueError, "QLambda: Error in rank and/or compatibility check"):
    self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, r_t,
                                self.pcont_t, self.q_t, self.lambda_)
def setUp(self):
  super(QLambdaTest, self).setUp()
  # Tensor dimensions below: TxBxA (time, batch id, action).
  self.q_tm1 = tf.constant(
      [[[1.1, 2.1], [2.1, 3.1]],
       [[-1.1, 1.1], [-1.1, 0.1]],
       [[3.1, -3.1], [-2.1, -1.1]]],
      dtype=tf.float32)
  self.q_t = tf.constant(
      [[[1.2, 2.2], [4.2, 2.2]],
       [[-1.2, 0.2], [1.2, 1.2]],
       [[2.2, -1.2], [-1.2, -2.2]]],
      dtype=tf.float32)
  # Tensor dimensions below: TxB (time, batch id).
  self.a_tm1 = tf.constant([[0, 1], [1, 0], [0, 0]], dtype=tf.int32)
  self.pcont_t = tf.constant([[0.00, 0.88], [0.89, 1.00], [0.85, 0.83]],
                             dtype=tf.float32)
  self.r_t = tf.constant([[-1.3, 1.3], [-1.3, 5.3], [2.3, -3.3]],
                         dtype=tf.float32)
  self.lambda_ = tf.constant([[0.67, 0.68], [0.65, 0.69], [0.66, 0.64]],
                             dtype=tf.float32)
  self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                              self.pcont_t, self.q_t, self.lambda_)
  # Evaluate target Q-values used for testing.
  # t20 is the target for timestep 2, batch 0.
  self.t20 = 2.2 * 0.85 + 2.3
  self.t10 = (self.t20 * 0.65 + 0.2 * (1 - 0.65)) * 0.89 - 1.3
  self.t00 = (self.t10 * 0.67 + 2.2 * (1 - 0.67)) * 0.00 - 1.3
  self.t21 = -1.2 * 0.83 - 3.3
  self.t11 = (self.t21 * 0.69 + 1.2 * (1 - 0.69)) * 1.00 + 5.3
  self.t01 = (self.t11 * 0.68 + 4.2 * (1 - 0.68)) * 0.88 + 1.3
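# A sketch (an assumption, not part of the original test file) of the backward
# recursion that the hand-computed targets in setUp above follow, for a single
# batch element. Here max_q_t[t] is max_a q_t[t, b, a], the greedy bootstrap
# value at the next state:
#   G_T = r_T + pcont_T * max_q_T
#   G_t = r_t + pcont_t * (lambda_t * G_{t+1} + (1 - lambda_t) * max_q_t)
# Feeding batch 0 of the data above,
#   qlambda_targets([-1.3, -1.3, 2.3], [0.00, 0.89, 0.85],
#                   [2.2, 0.2, 2.2], [0.67, 0.65, 0.66])
# reproduces [self.t00, self.t10, self.t20].
def qlambda_targets(r_t, pcont_t, max_q_t, lambda_t):
  targets = [None] * len(r_t)
  returns = r_t[-1] + pcont_t[-1] * max_q_t[-1]
  targets[-1] = returns
  for t in reversed(range(len(r_t) - 1)):
    returns = r_t[t] + pcont_t[t] * (
        lambda_t[t] * returns + (1 - lambda_t[t]) * max_q_t[t])
    targets[t] = returns
  return targets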
def testRankCheck(self):
  q_tm1 = tf.placeholder(tf.float32, [None, 3])
  with self.assertRaisesRegexp(
      ValueError, "QLambda: Error in rank and/or compatibility check"):
    self.qlearning = rl.qlambda(q_tm1, self.a_tm1, self.r_t,
                                self.pcont_t, self.q_t, self.lambda_scalar)
def pixel_control_loss(
    observations, actions, action_values, cell_size, discount_factor,
    scale, crop_height_dim=(None, None), crop_width_dim=(None, None)):
  """Calculate n-step Q-learning loss for pixel control auxiliary task.

  For each pixel-based pseudo reward signal, the corresponding action-value
  function is trained off-policy, using Q(lambda). A discount of 0.9 is
  commonly used for learning the value functions.

  Note that, since pseudo rewards have a spatial structure, with neighbouring
  cells exhibiting strong correlations, it is convenient to predict the action
  values for all the cells through a deconvolutional head.

  See "Reinforcement Learning with Unsupervised Auxiliary Tasks" by Jaderberg,
  Mnih, Czarnecki et al. (https://arxiv.org/abs/1611.05397).

  Args:
    observations: A tensor of shape `[T+1,B, ...]`; `...` is the observation
      shape, `T` the sequence length, and `B` the batch size. `T` and `B` can
      be statically unknown for `observations`, `actions` and `action_values`.
    actions: A tensor, shape `[T,B]`, of the actions across each sequence.
    action_values: A tensor, shape `[T+1,B,H,W,N]` of pixel control action
      values, where `H`, `W` are the number of pixel control cells/tasks, and
      `N` is the number of actions.
    cell_size: size of the cells used to derive the pixel based pseudo-rewards.
    discount_factor: discount used for learning the value function associated
      to the pseudo rewards; must be a scalar or a Tensor of shape [T,B].
    scale: scale factor for pixels in `observations`.
    crop_height_dim: tuple (min_height, max_height) specifying how to crop the
      input observations before computing the pseudo-rewards.
    crop_width_dim: tuple (min_width, max_width) specifying how to crop the
      input observations before computing the pseudo-rewards.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `spatial_loss`: shape `[T,B,H,W]` tensor of per-cell, per-step losses.
        * `pseudo_rewards`: shape `[T,B,H,W]` tensor of pixel-based
          pseudo-rewards derived from the observations.

  Raises:
    ValueError: if the shape of `action_values` is not compatible with that of
      the pseudo-rewards derived from the observations.
  """
  # Useful shapes.
  sequence_length, batch_size = base_ops.best_effort_shape(actions)
  num_actions = action_values.get_shape().as_list()[-1]
  height_width_q = action_values.get_shape().as_list()[2:-1]
  # Calculate rewards using the observations. Crop observations if appropriate.
  if crop_height_dim[0] is not None:
    h_low, h_high = crop_height_dim
    observations = observations[:, :, h_low:h_high, :]
  if crop_width_dim[0] is not None:
    w_low, w_high = crop_width_dim
    observations = observations[:, :, :, w_low:w_high]
  # Rescale observations by a constant factor.
  observations *= tf.constant(scale)
  # Compute pseudo-rewards and get their shape.
  pseudo_rewards = pixel_control_rewards(observations, cell_size)
  height_width = pseudo_rewards.get_shape().as_list()[2:]
  # Check that pseudo-rewards and Q-values are compatible in shape.
  if height_width != height_width_q:
    raise ValueError(
        "Pixel Control values are not compatible with the shape of the "
        "pseudo-rewards derived from the observation. Pseudo-rewards have "
        "shape {}, while Pixel Control values have shape {}".format(
            height_width, height_width_q))
  # We now have Q(s,a) and rewards, so can calculate the n-step loss. The
  # QLambda loss op expects inputs of shape [T,B,N] and [T,B], but our tensors
  # are in a variety of incompatible shapes. The state-action values have
  # shape [T,B,H,W,N] and rewards have shape [T,B,H,W]. We can think of the
  # [H,W] dimensions as extra batch dimensions for the purposes of the loss
  # calculation, so we first collapse [B,H,W] into a single dimension.
  q_tm1 = tf.reshape(
      action_values[:-1],  # [T,B,H,W,N].
      [sequence_length, -1, num_actions],
      name="q_tm1")  # [T,BHW,N].
  r_t = tf.reshape(
      pseudo_rewards,  # [T,B,H,W].
      [sequence_length, -1],
      name="r_t")  # [T,BHW].
  q_t = tf.reshape(
      action_values[1:],  # [T,B,H,W,N].
      [sequence_length, -1, num_actions],
      name="q_t")  # [T,BHW,N].
  # The actions tensor is of shape [T,B], and is the same for each H and W.
  # We thus expand it to be same shape as the reward tensor, [T,BHW].
  expanded_actions = tf.expand_dims(tf.expand_dims(actions, -1), -1)
  a_tm1 = tf.tile(
      expanded_actions, multiples=[1, 1] + height_width)  # [T,B,H,W].
  a_tm1 = tf.reshape(a_tm1, [sequence_length, -1])  # [T,BHW].
  # We similarly expand-and-tile the discount to [T,BHW].
  discount_factor = tf.convert_to_tensor(discount_factor)
  if discount_factor.shape.ndims == 0:
    pcont_t = tf.reshape(discount_factor, [1, 1])  # [1,1].
    pcont_t = tf.tile(pcont_t, tf.shape(a_tm1))  # [T,BHW].
  elif discount_factor.shape.ndims == 2:
    tiled_pcont = tf.tile(
        tf.expand_dims(tf.expand_dims(discount_factor, -1), -1),
        [1, 1] + height_width)
    pcont_t = tf.reshape(tiled_pcont, [sequence_length, -1])
  else:
    raise ValueError(
        "The discount_factor must be a scalar or a tensor of rank 2, "
        "instead it is a tensor of shape {}".format(
            discount_factor.shape.as_list()))
  # Compute a QLambda loss of shape [T,BHW].
  loss, _ = action_value_ops.qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=1)
  # Take sum over sequence, sum over cells.
  expanded_shape = [sequence_length, batch_size] + height_width
  spatial_loss = tf.reshape(loss, expanded_shape)  # [T,B,H,W].
  # Return.
  extra = PixelControlExtra(
      spatial_loss=spatial_loss, pseudo_rewards=pseudo_rewards)
  return base_ops.LossOutput(
      tf.reduce_sum(spatial_loss, axis=[0, 2, 3]), extra)  # [B]
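# A minimal usage sketch (an assumption, not part of the original module)
# showing the shapes pixel_control_loss expects: with 16x16x3 observations and
# cell_size=4 the pseudo-reward grid is 4x4, so action_values must have
# spatial shape 4x4. Here T=3, B=2 and the action count N=5 are arbitrary.
def _pixel_control_loss_usage_sketch():
  observations = tf.random_uniform([4, 2, 16, 16, 3])  # [T+1,B,H,W,C].
  actions = tf.random_uniform([3, 2], maxval=5, dtype=tf.int32)  # [T,B].
  action_values = tf.random_uniform([4, 2, 4, 4, 5])  # [T+1,B,H',W',N].
  loss, extra = pixel_control_loss(
      observations, actions, action_values,
      cell_size=4, discount_factor=0.9, scale=1.0 / 255.0)
  # `loss` has shape [B]; `extra.spatial_loss` has shape [T,B,H',W'].
  return loss, extra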
def testCompatibilityCheck(self):
  a_tm1 = tf.placeholder(tf.int32, [5, 2])
  with self.assertRaisesRegexp(
      ValueError, "QLambda: Error in rank and/or compatibility check"):
    self.qlearning = rl.qlambda(self.q_tm1, a_tm1, self.r_t,
                                self.pcont_t, self.q_t, self.lambda_scalar)