Example #1
  def testStackPad(self):
    # 1D.
    tensors = [[1, 2, 3], [4, 5, 6, 7, 8], [9]]
    result = utils.stack_pad(tensors, pad_axes=0, pad_to_lengths=6)
    self.assertTrue(np.array_equal(
        result,
        np.asarray([[1, 2, 3, 0, 0, 0],
                    [4, 5, 6, 7, 8, 0],
                    [9, 0, 0, 0, 0, 0]], dtype=np.float32)))

    # 3D.
    tensors = [[[[1, 2, 3], [4, 5, 6]]],
               [[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
               [[[0, 1, 2]], [[3, 4, 5]]]]
    result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
    self.assertTrue(np.array_equal(
        result,
        np.asarray([[[[1, 2, 3], [4, 5, 6]],
                     [[0, 0, 0], [0, 0, 0]]],
                    [[[7, 8, 9], [0, 1, 2]],
                     [[3, 4, 5], [6, 7, 8]]],
                    [[[0, 1, 2], [0, 0, 0]],
                     [[3, 4, 5], [0, 0, 0]]]], dtype=np.float32)))
Example #2
  def testStackPadValueError(self):
    # 3D.
    tensors = [[[[1, 2, 3], [4, 5, 6]]],
               [[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
               [[[0, 1, 2]], [[3, 4, 5]]],
               [[[1, 2, 3, 4]]]]

    # Not all tensors have the same shape along axis 2.
    with self.assertRaises(ValueError):
      utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
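Taken together, these two tests pin down the contract of utils.stack_pad: the inputs are stacked along a new leading batch axis, each axis named in pad_axes is zero-padded up to the matching entry of pad_to_lengths (or up to the batch maximum when no length is supplied, as in later examples), and every axis that is not being padded must already agree across the batch, otherwise a ValueError is raised. The NumPy sketch below is a hypothetical re-implementation of that behavior, written only to make the contract concrete; the project's own utils.stack_pad may differ in details such as dtype handling.

import numpy as np


def stack_pad_sketch(tensors, pad_axes=None, pad_to_lengths=None):
  """Hypothetical re-implementation of the behavior the tests above rely on.

  Stacks `tensors` along a new axis 0. Each axis listed in pad_axes is
  zero-padded up to the corresponding entry of pad_to_lengths (or to the
  batch maximum when no length is given); all other axes must already agree.
  """
  arrays = [np.asarray(t, dtype=np.float32) for t in tensors]
  if pad_axes is None:
    pad_axes = []
  if isinstance(pad_axes, int):
    pad_axes, pad_to_lengths = [pad_axes], [pad_to_lengths]
  if pad_to_lengths is None:
    pad_to_lengths = [None] * len(pad_axes)
  ndim = arrays[0].ndim
  # Axes that are not being padded must agree across the whole batch.
  for axis in range(ndim):
    if axis not in pad_axes and len({a.shape[axis] for a in arrays}) > 1:
      raise ValueError('Tensors disagree along non-padded axis %d.' % axis)
  # Default target length for a padded axis is the batch maximum.
  targets = dict(zip(pad_axes, pad_to_lengths))
  for axis in pad_axes:
    if targets[axis] is None:
      targets[axis] = max(a.shape[axis] for a in arrays)
  padded = [np.pad(a, [(0, targets.get(axis, a.shape[axis]) - a.shape[axis])
                       for axis in range(ndim)], mode='constant')
            for a in arrays]
  return np.stack(padded)

Under this reading, the 1D case above produces a (3, 6) float32 array, the mismatched axis 2 in the second test is rejected before any padding happens, and calling the function with no pad axes (as in the testStackPadNoAxes example further down) reduces to a plain np.stack.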
Example #3
def process_rollouts(rollouts, gamma, lambda_=1.0):
  """Convert a batch of rollouts into tensors ready to be fed into a model.

  Lists from each episode are stacked into 2D tensors and padded with 0s up to
  the maximum timestep in the batch.

  Args:
    rollouts: A list of Rollout instances.
    gamma: The discount factor. A number between 0 and 1 (inclusive). See gamma
        argument in discounted_advantage_and_rewards.
    lambda_: See lambda_ argument in discounted_advantage_and_rewards.

  Returns:
    Batch instance. states, actions, discounted_adv, and discounted_r are
    numpy arrays with shape (batch_size, max_episode_length). episode_lengths
    is a list of ints. total_rewards is a list of floats (total reward in each
    episode). batch_size and max_time are ints.

  Raises:
    ValueError: If any of the rollouts are not terminal.
  """
  for ro in rollouts:
    if not ro.terminated:
      raise ValueError('Can only process terminal rollouts.')

  episode_lengths = [len(ro.states) for ro in rollouts]
  batch_size = len(rollouts)
  max_time = max(episode_lengths)

  states = utils.stack_pad([ro.states for ro in rollouts], 0, max_time)
  actions = utils.stack_pad([ro.actions for ro in rollouts], 0, max_time)

  discounted_rewards = [None] * batch_size
  discounted_adv = [None] * batch_size
  for i, ro in enumerate(rollouts):
    disc_r, disc_adv = discounted_advantage_and_rewards(
        ro.rewards, ro.values, gamma, lambda_)
    discounted_rewards[i] = disc_r
    discounted_adv[i] = disc_adv
  discounted_rewards = utils.stack_pad(discounted_rewards, 0, max_time)
  discounted_adv = utils.stack_pad(discounted_adv, 0, max_time)

  total_rewards = [sum(ro.rewards) for ro in rollouts]

  return Batch(states=states,
               actions=actions,
               discounted_adv=discounted_adv,
               discounted_r=discounted_rewards,
               total_rewards=total_rewards,
               episode_lengths=episode_lengths,
               batch_size=batch_size,
               max_time=max_time)
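process_rollouts delegates the per-episode math to discounted_advantage_and_rewards, which is not shown in these examples. Judging only from the call site, which passes per-episode rewards and value estimates along with gamma and lambda_ and unpacks discounted rewards followed by advantages, a standard reading is discounted returns paired with generalized advantage estimation. The sketch below is that assumed computation, not a copy of the project's function.

import numpy as np


def discounted_advantage_and_rewards_sketch(rewards, values, gamma, lambda_=1.0):
  """Assumed stand-in: discounted returns and GAE advantages for one episode.

  Args:
    rewards: List of per-timestep rewards for a single terminated episode.
    values: List of value-function estimates, same length as rewards.
    gamma: Discount factor in [0, 1].
    lambda_: GAE mixing parameter in [0, 1].

  Returns:
    (discounted_rewards, discounted_advantages) as 1D float32 arrays, in the
    order the call site above unpacks them.
  """
  rewards = np.asarray(rewards, dtype=np.float32)
  values = np.asarray(values, dtype=np.float32)
  n = len(rewards)
  returns = np.zeros(n, dtype=np.float32)
  advantages = np.zeros(n, dtype=np.float32)
  next_return = 0.0  # Terminal episode, so there is no bootstrap value.
  next_value = 0.0
  next_adv = 0.0
  for t in reversed(range(n)):
    returns[t] = rewards[t] + gamma * next_return
    # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t).
    delta = rewards[t] + gamma * next_value - values[t]
    advantages[t] = delta + gamma * lambda_ * next_adv
    next_return = returns[t]
    next_value = values[t]
    next_adv = advantages[t]
  return returns, advantages

With the default lambda_=1.0, the advantage reduces to the discounted return minus the value baseline, which is consistent with how process_rollouts forwards its defaults.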
Example #4
  def testStackPadNoAxes(self):
    # 2D.
    tensors = [[[1, 2, 3], [4, 5, 6]],
               [[7, 8, 9], [1, 2, 3]],
               [[4, 5, 6], [7, 8, 9]]]
    result = utils.stack_pad(tensors)
    self.assertTrue(np.array_equal(
        result,
        np.asarray(tensors)))
Example #5
  def testNumericalGradChecking(self):
    # Similar to
    # http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization.
    epsilon = 1e-4
    eos = misc.BF_EOS_INT
    self.assertEqual(0, eos)
    config = defaults.default_config_with_updates(
        'env=c(task="print"),'
        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
        'entropy_beta=0.0,topk_loss_hparam=0.0,policy_lstm_sizes=[10],'
        'eos_token=True),'
        'batch_size=64')
    dtype = tf.float64
    tf.reset_default_graph()
    tf.set_random_seed(12345678987654321)
    np.random.seed(1294024302)
    trainer = pg_train.AsyncTrainer(
        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
    model = trainer.model
    actions_ph = model.actions
    lengths_ph = model.adjusted_lengths
    multipliers_ph = model.policy_multipliers
    loss = model.pi_loss
    global_init_op = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))

    assign_add_placeholders = [None] * len(model.trainable_variables)
    assign_add_ops = [None] * len(model.trainable_variables)
    param_shapes = [None] * len(model.trainable_variables)
    for i, param in enumerate(model.trainable_variables):
      param_shapes[i] = param.get_shape().as_list()
      assign_add_placeholders[i] = tf.placeholder(dtype,
                                                  np.prod(param_shapes[i]))
      assign_add_ops[i] = param.assign_add(
          tf.reshape(assign_add_placeholders[i], param_shapes[i]))

    with tf.Session() as sess:
      sess.run(global_init_op)  # Initialize global copy.
      trainer.initialize(sess)

      actions_raw = [random_sequence(10, 9) for _ in xrange(16)]
      actions_batch = utils.stack_pad(actions_raw, 0)
      lengths_batch = [len(l) for l in actions_raw]
      feed = {actions_ph: actions_batch,
              multipliers_ph: np.ones_like(actions_batch),
              lengths_ph: lengths_batch}

      estimated_grads = [None] * len(model.trainable_variables)
      for i, param in enumerate(model.trainable_variables):
        param_size = np.prod(param_shapes[i])
        estimated_grads[i] = np.zeros(param_size, dtype=np.float64)
        for index in xrange(param_size):
          e = onehot(index, param_size) * epsilon
          sess.run(assign_add_ops[i],
                   {assign_add_placeholders[i]: e})
          j_plus = sess.run(loss, feed)
          sess.run(assign_add_ops[i],
                   {assign_add_placeholders[i]: -2 * e})
          j_minus = sess.run(loss, feed)
          sess.run(assign_add_ops[i],
                   {assign_add_placeholders[i]: e})
          estimated_grads[i][index] = (j_plus - j_minus) / (2 * epsilon)
        estimated_grads[i] = estimated_grads[i].reshape(param_shapes[i])

      analytic_grads = sess.run(model.dense_unclipped_grads, feed)

      for g1, g2 in zip(estimated_grads[1:], analytic_grads[1:]):
        logging.info('norm (g1-g2): %s', np.abs(g1 - g2).mean())
        self.assertTrue(np.allclose(g1, g2))
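This test leans on helpers such as onehot and random_sequence that are defined elsewhere in the test module, and the technique it applies is the central-difference check from the UFLDL page cited in the comment: nudge one parameter by plus and minus epsilon and compare (J(theta + e) - J(theta - e)) / (2 * epsilon) against the analytic gradient. Below is a self-contained sketch of the same check on a toy quadratic loss, with a hypothetical onehot written to match how the test uses it.

import numpy as np


def onehot(index, size):
  # Hypothetical helper matching its use in the test: a one-hot float vector.
  vec = np.zeros(size, dtype=np.float64)
  vec[index] = 1.0
  return vec


def numerical_grad(loss_fn, theta, epsilon=1e-4):
  """Central-difference estimate of d loss_fn / d theta."""
  grad = np.zeros_like(theta)
  for i in range(theta.size):
    e = onehot(i, theta.size) * epsilon
    grad[i] = (loss_fn(theta + e) - loss_fn(theta - e)) / (2 * epsilon)
  return grad


# Toy check: loss(theta) = 0.5 * ||A.theta - b||^2 has gradient A^T (A.theta - b).
rng = np.random.RandomState(0)
A = rng.randn(5, 3)
b = rng.randn(5)
theta = rng.randn(3)
loss_fn = lambda t: 0.5 * np.sum((A.dot(t) - b) ** 2)
analytic_grad = A.T.dot(A.dot(theta) - b)
assert np.allclose(analytic_grad, numerical_grad(loss_fn, theta))

Because the toy loss is quadratic, the central-difference estimate has no truncation error, so np.allclose passes at its default tolerances.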
Example #6
  def testMonteCarloGradients(self):
    """Test Monte Carlo estimate of REINFORCE gradient.

    Test that the Monte Carlo estimate of the REINFORCE gradient is
    approximately equal to the true gradient. We compute the true gradient for a
    toy environment with a very small action space.

    Similar to section 5 of https://arxiv.org/pdf/1505.00521.pdf.
    """
    # Test may have different outcome on different machines due to different
    # rounding behavior of float arithmetic.
    tf.reset_default_graph()
    tf.set_random_seed(12345678987654321)
    np.random.seed(1294024302)
    max_length = 2
    num_tokens = misc.bf_num_tokens()
    eos = misc.BF_EOS_INT
    assert eos == 0
    def sequence_iterator(max_length):
      """Iterates through all sequences up to the given length."""
      yield [eos]
      for a in xrange(1, num_tokens):
        if max_length > 1:
          for sub_seq in sequence_iterator(max_length - 1):
            yield [a] + sub_seq
        else:
          yield [a]
    actions = list(sequence_iterator(max_length))

    # This batch contains all possible episodes up to max_length.
    actions_batch = utils.stack_pad(actions, 0)
    lengths_batch = [len(s) for s in actions]

    reward_map = {tuple(a): np.random.randint(-1, 7) for a in actions_batch}
    # reward_map = {tuple(a): np.random.normal(3, 1)
    #               for a in actions_batch}  # normal distribution
    # reward_map = {tuple(a): 1.0
    #               for a in actions_batch}  # expected reward is 1

    n = 100000  # MC sample size.
    config = defaults.default_config_with_updates(
        'env=c(task="print"),'
        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
        'entropy_beta=0.0,topk_loss_hparam=0.0,regularizer=0.0,'
        'policy_lstm_sizes=[10],eos_token=True),'
        'batch_size='+str(n)+',timestep_limit='+str(max_length))

    dtype = tf.float64
    trainer = pg_train.AsyncTrainer(
        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
    model = trainer.model
    actions_ph = model.actions
    lengths_ph = model.adjusted_lengths
    multipliers_ph = model.policy_multipliers

    global_init_op = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
    with tf.Session() as sess, sess.graph.as_default():
      sess.run(global_init_op)  # Initialize global copy.
      trainer.initialize(sess)

      # Compute exact gradients.
      # exact_grads = sum(P(a) * grad(log P(a)) * R(a) for a in actions_batch)
      true_loss_unnormalized = 0.0
      exact_grads = [np.zeros(v.shape) for v in model.trainable_variables]
      episode_probs_map = {}
      grads_map = {}
      for a_idx in xrange(len(actions_batch)):
        a = actions_batch[a_idx]
        grads_result, probs_result, loss = sess.run(
            [model.dense_unclipped_grads, model.chosen_probs, model.loss],
            {actions_ph: [a],
             lengths_ph: [lengths_batch[a_idx]],
             multipliers_ph: [
                 repeat_and_pad(reward_map[tuple(a)],
                                lengths_batch[a_idx],
                                max_length)]})
        # Take product over time axis.
        episode_probs_result = np.prod(probs_result[0, :lengths_batch[a_idx]])
        for i in range(0, len(exact_grads)):
          exact_grads[i] += grads_result[i] * episode_probs_result
        episode_probs_map[tuple(a)] = episode_probs_result
        reward_map[tuple(a)] = reward_map[tuple(a)]
        grads_map[tuple(a)] = grads_result
        true_loss_unnormalized += loss
      # Normalize loss. Since each episode is fed into the model one at a time,
      # normalization needs to be done manually.
      true_loss = true_loss_unnormalized / float(len(actions_batch))

      # Compute Monte Carlo gradients.
      # E_a~P[grad(log P(a)) R(a)] is approximately equal to
      # sum(grad(log P(a)) R(a) for a in actions_sampled_from_P) / n
      # where len(actions_sampled_from_P) == n.
      #
      # In other words, sample from the policy and compute the gradients of the
      # log probs weighted by the returns. This will exercise the code in
      # agent.py.
      sampled_actions, sampled_lengths = sess.run(
          [model.sampled_tokens, model.episode_lengths])
      pi_multipliers = [
          repeat_and_pad(reward_map[tuple(a)], l, max_length)
          for a, l in zip(sampled_actions, sampled_lengths)]
      mc_grads_unnormalized, sampled_probs, mc_loss_unnormalized = sess.run(
          [model.dense_unclipped_grads, model.chosen_probs, model.loss],
          {actions_ph: sampled_actions,
           multipliers_ph: pi_multipliers,
           lengths_ph: sampled_lengths})
      # Loss is already normalized across the minibatch, so no normalization
      # is needed.
      mc_grads = mc_grads_unnormalized
      mc_loss = mc_loss_unnormalized

    # Make sure true loss and MC loss are similar.
    loss_error = smape(true_loss, mc_loss)
    self.assertTrue(loss_error < 0.15, msg='actual: %s' % loss_error)

    # Check that probs computed for episodes sampled from the model are the same
    # as the recorded true probs.
    for i in range(100):
      acs = tuple(sampled_actions[i].tolist())
      sampled_prob = np.prod(sampled_probs[i, :sampled_lengths[i]])
      self.assertTrue(np.isclose(episode_probs_map[acs], sampled_prob))

    # Make sure MC estimates of true probs are close.
    counter = Counter(tuple(e) for e in sampled_actions)
    for acs, count in counter.iteritems():
      mc_prob = count / float(len(sampled_actions))
      true_prob = episode_probs_map[acs]
      error = smape(mc_prob, true_prob)
      self.assertTrue(
          error < 0.15,
          msg='actual: %s; count: %s; mc_prob: %s; true_prob: %s'
          % (error, count, mc_prob, true_prob))

    # Manually recompute MC gradients and make sure they match MC gradients
    # computed in TF.
    mc_grads_recompute = [np.zeros(v.shape) for v in model.trainable_variables]
    for i in range(n):
      acs = tuple(sampled_actions[i].tolist())
      for i in range(0, len(mc_grads_recompute)):
        mc_grads_recompute[i] += grads_map[acs][i]
    for i in range(0, len(mc_grads_recompute)):
      self.assertTrue(np.allclose(mc_grads[i], mc_grads_recompute[i] / n))

    # Check angle between gradients as fraction of pi.
    for index in range(len(mc_grads)):
      v1 = mc_grads[index].reshape(-1)
      v2 = exact_grads[index].reshape(-1)
      # angle = arccos(v1 . v2 / (|v1|*|v2|))
      angle_rad = np.arccos(
          np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
      logging.info('angle / pi: %s', angle_rad / np.pi)
      angle_frac = angle_rad / np.pi
      self.assertTrue(angle_frac < 0.02, msg='actual: %s' % angle_frac)
    # Check norms.
    for index in range(len(mc_grads)):
      v1_norm = np.linalg.norm(mc_grads[index].reshape(-1))
      v2_norm = np.linalg.norm(exact_grads[index].reshape(-1))
      error = smape(v1_norm, v2_norm)
      self.assertTrue(error < 0.02, msg='actual: %s' % error)

    # Check expected rewards.
    # E_a~P[R(a)] approx eq sum(P(a) * R(a) for a in actions)
    mc_expected_reward = np.mean(
        [reward_map[tuple(a)] for a in sampled_actions])
    exact_expected_reward = np.sum(
        [episode_probs_map[k] * reward_map[k] for k in reward_map])
    error = smape(mc_expected_reward, exact_expected_reward)
    self.assertTrue(error < 0.005, msg='actual: %s' % error)
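Stripped of the TensorFlow plumbing, the property this test checks is that the Monte Carlo REINFORCE estimator, the sample average of grad(log P(a)) * R(a) over actions drawn from the policy, converges to the exact gradient sum over all actions of P(a) * grad(log P(a)) * R(a). The sketch below replays that comparison on a one-step softmax policy; the smape helper is written to match how the test appears to use it, and the whole toy setup is an assumption rather than part of the project.

import numpy as np


def smape(a, b):
  # Symmetric mean absolute percentage error; assumed form of the test helper.
  return np.abs(a - b) / ((np.abs(a) + np.abs(b)) / 2.0)


rng = np.random.RandomState(1294024302)
num_actions = 4
logits = rng.randn(num_actions)  # Parameters of a one-step softmax policy.
rewards = rng.randint(-1, 7, num_actions).astype(np.float64)
probs = np.exp(logits) / np.sum(np.exp(logits))


def grad_log_prob(a):
  # For a softmax policy, grad(log P(a)) w.r.t. the logits is onehot(a) - probs.
  g = -probs.copy()
  g[a] += 1.0
  return g


# Exact gradient: sum_a P(a) * grad(log P(a)) * R(a).
exact_grad = sum(probs[a] * grad_log_prob(a) * rewards[a]
                 for a in range(num_actions))

# Monte Carlo estimate: average grad(log P(a)) * R(a) over actions sampled from P.
n = 100000
samples = rng.choice(num_actions, size=n, p=probs)
onehots = np.eye(num_actions)[samples]                       # (n, num_actions)
mc_grad = ((onehots - probs) * rewards[samples][:, None]).mean(axis=0)

assert np.allclose(mc_grad, exact_grad, atol=0.1)
angle_rad = np.arccos(np.dot(mc_grad, exact_grad) /
                      (np.linalg.norm(mc_grad) * np.linalg.norm(exact_grad)))
print('angle / pi:', angle_rad / np.pi)
print('norm smape:', smape(np.linalg.norm(mc_grad), np.linalg.norm(exact_grad)))

With n = 100000 samples the Monte Carlo estimate should land comfortably inside the tolerance in the assert, mirroring the angle and norm checks the TF test runs against the full LSTM policy.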