Example #1
def get_namespace(config_string):
  """Get namespace for the selected algorithm.

  Users who want to add additional algorithm types should modify this function.
  The algorithm's namespace should contain the following functions:
    run_training: Run the main training loop.
    define_tuner_hparam_space: Return the hparam tuning space for the algo.
    write_hparams_to_config: Helper for tuning. Write hparams chosen for tuning
        to the Config object.
  Look at pg_train.py and ga_train.py for function signatures and
  implementations.

  Args:
    config_string: String representation of a Config object. This will get
        parsed into a Config in order to determine what algorithm to use.

  Returns:
    algorithm_namespace: The module corresponding to the algorithm given in the
        config.
    config: The Config object resulting from parsing `config_string`.

  Raises:
    ValueError: If config.agent.algorithm is not one of the registered
        algorithms.
  """
  config = defaults.default_config_with_updates(config_string)
  if config.agent.algorithm not in ALGORITHM_REGISTRATION:
    raise ValueError('Unknown algorithm type "%s"' % (config.agent.algorithm,))
  else:
    return ALGORITHM_REGISTRATION[config.agent.algorithm], config
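The function above assumes a module-level ALGORITHM_REGISTRATION mapping from algorithm names to modules that expose run_training, define_tuner_hparam_space, and write_hparams_to_config. Below is a minimal, self-contained sketch of that registry pattern; the stand-in namespaces and the lookup_namespace helper are illustrative assumptions, not the project's actual wiring.

# Minimal sketch of the registry pattern get_namespace relies on. In the real
# project the dict values would be the pg_train and ga_train modules; stand-in
# namespaces are used here so the sketch runs on its own.
from types import SimpleNamespace

ALGORITHM_REGISTRATION = {
    'pg': SimpleNamespace(run_training=lambda **kw: print('pg training', kw)),
    'ga': SimpleNamespace(run_training=lambda **kw: print('ga training', kw)),
}

def lookup_namespace(algorithm_name):
  """Dispatch on the algorithm name, mirroring the check in get_namespace."""
  if algorithm_name not in ALGORITHM_REGISTRATION:
    raise ValueError('Unknown algorithm type "%s"' % (algorithm_name,))
  return ALGORITHM_REGISTRATION[algorithm_name]

lookup_namespace('pg').run_training(logdir='/tmp/logdir', is_chief=True)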
Example #2
def run_tuner_loop(ns):
    """Run tuning loop for this worker."""
    is_chief = FLAGS.task_id == 0
    tuning_space = ns.define_tuner_hparam_space(
        hparam_space_type=FLAGS.hparam_space)
    fixed_hparams = parse_hparams_string(FLAGS.fixed_hparams)
    for name, value in fixed_hparams.iteritems():
        tuning_space[name] = [value]
    tuning_space_size = np.prod(
        [len(values) for values in tuning_space.values()])

    num_local_trials, remainder = divmod(tuning_space_size, FLAGS.num_tuners)
    # Compute this tuner's starting trial from the base quotient before adding
    # its extra trial, so that the shards tile the whole tuning space.
    starting_trial_id = (num_local_trials * FLAGS.tuner_id +
                         min(remainder, FLAGS.tuner_id))
    if FLAGS.tuner_id < remainder:
        num_local_trials += 1

    tf.logging.info('tuning_space_size: %d', tuning_space_size)
    tf.logging.info('num_local_trials: %d', num_local_trials)
    tf.logging.info('starting_trial_id: %d', starting_trial_id)

    for local_trial_index in xrange(num_local_trials):
        trial_config = defaults.default_config_with_updates(FLAGS.config)
        global_trial_index = local_trial_index + starting_trial_id
        trial_name = 'trial_' + str(global_trial_index)
        trial_dir = os.path.join(FLAGS.logdir, trial_name)
        hparams = hparams_for_index(global_trial_index, tuning_space)
        ns.write_hparams_to_config(trial_config,
                                   hparams,
                                   hparam_space_type=FLAGS.hparam_space)

        results_list = ns.run_training(config=trial_config,
                                       tuner=None,
                                       logdir=trial_dir,
                                       is_chief=is_chief,
                                       trial_name=trial_name)

        if not is_chief:
            # Only chief worker needs to write tuning results to disk.
            continue

        objective, metrics = compute_tuning_objective(
            results_list, hparams, trial_name, num_trials=tuning_space_size)
        tf.logging.info('metrics:\n%s', metrics)
        tf.logging.info('objective: %s', objective)
        tf.logging.info('programs_seen_fraction: %s',
                        metrics['programs_seen_fraction'])
        tf.logging.info('success_rate: %s', metrics['success_rate'])
        tf.logging.info('success_rate_objective_weight: %s',
                        FLAGS.success_rate_objective_weight)

        tuning_results_file = os.path.join(trial_dir, 'tuning_results.txt')
        with tf.gfile.FastGFile(tuning_results_file, 'a') as writer:
            writer.write(str(metrics) + '\n')

        tf.logging.info('Trial %s complete.', trial_name)
Example #3
def run_tuner_loop(ns):
  """Run tuning loop for this worker."""
  is_chief = FLAGS.task_id == 0
  tuning_space = ns.define_tuner_hparam_space(
      hparam_space_type=FLAGS.hparam_space)
  fixed_hparams = parse_hparams_string(FLAGS.fixed_hparams)
  for name, value in fixed_hparams.iteritems():
    tuning_space[name] = [value]
  tuning_space_size = np.prod([len(values) for values in tuning_space.values()])

  num_local_trials, remainder = divmod(tuning_space_size, FLAGS.num_tuners)
  # Compute this tuner's starting trial from the base quotient before adding
  # its extra trial, so that the shards tile the whole tuning space.
  starting_trial_id = (
      num_local_trials * FLAGS.tuner_id + min(remainder, FLAGS.tuner_id))
  if FLAGS.tuner_id < remainder:
    num_local_trials += 1

  logging.info('tuning_space_size: %d', tuning_space_size)
  logging.info('num_local_trials: %d', num_local_trials)
  logging.info('starting_trial_id: %d', starting_trial_id)

  for local_trial_index in xrange(num_local_trials):
    trial_config = defaults.default_config_with_updates(FLAGS.config)
    global_trial_index = local_trial_index + starting_trial_id
    trial_name = 'trial_' + str(global_trial_index)
    trial_dir = os.path.join(FLAGS.logdir, trial_name)
    hparams = hparams_for_index(global_trial_index, tuning_space)
    ns.write_hparams_to_config(
        trial_config, hparams, hparam_space_type=FLAGS.hparam_space)

    results_list = ns.run_training(
        config=trial_config, tuner=None, logdir=trial_dir, is_chief=is_chief,
        trial_name=trial_name)

    if not is_chief:
      # Only chief worker needs to write tuning results to disk.
      continue

    objective, metrics = compute_tuning_objective(
        results_list, hparams, trial_name, num_trials=tuning_space_size)
    logging.info('metrics:\n%s', metrics)
    logging.info('objective: %s', objective)
    logging.info('programs_seen_fraction: %s',
                 metrics['programs_seen_fraction'])
    logging.info('success_rate: %s', metrics['success_rate'])
    logging.info('success_rate_objective_weight: %s',
                 FLAGS.success_rate_objective_weight)

    tuning_results_file = os.path.join(trial_dir, 'tuning_results.txt')
    with tf.gfile.FastGFile(tuning_results_file, 'a') as writer:
      writer.write(str(metrics) + '\n')

    logging.info('Trial %s complete.', trial_name)
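The divmod arithmetic above splits the grid of trials into contiguous shards, one per tuner, and hparams_for_index (not shown in the listing) maps a flat trial index to one point in the hyperparameter grid. The sketch below reproduces the sharding and uses an assumed mixed-radix implementation of hparams_for_index for illustration; it is not the project's implementation.

# Self-contained sketch of the trial sharding in run_tuner_loop plus an assumed
# index -> hparams mapping (enumerate the Cartesian product of the tuning space
# in a fixed key order).
import numpy as np

def split_trials(tuning_space_size, num_tuners, tuner_id):
  """Contiguous shard of trial ids handled by one tuner."""
  num_local_trials, remainder = divmod(tuning_space_size, num_tuners)
  starting_trial_id = num_local_trials * tuner_id + min(remainder, tuner_id)
  if tuner_id < remainder:
    num_local_trials += 1
  return starting_trial_id, num_local_trials

def hparams_for_index(index, tuning_space):
  """Map a flat trial index to one hyperparameter combination."""
  hparams = {}
  for name in sorted(tuning_space):
    values = tuning_space[name]
    hparams[name] = values[index % len(values)]
    index //= len(values)
  return hparams

tuning_space = {'lr': [0.1, 0.01, 0.001], 'entropy_beta': [0.0, 0.05]}
size = int(np.prod([len(v) for v in tuning_space.values()]))  # 6 trials
for tuner_id in range(2):
  start, count = split_trials(size, num_tuners=2, tuner_id=tuner_id)
  print(tuner_id, [hparams_for_index(i, tuning_space)
                   for i in range(start, start + count)])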
Example #4
 def testMakeTask(self):
     maxlen = 100
     padchr = '['
     config = defaults.default_config_with_updates(
         'env=c(config_for_iclr=False,fixed_string=[8,5,12,12,15])')
     task = code_tasks.make_task(config.env, 'print', timestep_limit=maxlen)
     reward_fns = task.rl_batch(1)
     r = reward_fns[0]
     self.assertClose(
         r('++++++++.---.+++++++...').episode_rewards[-1], 0.2444)
     self.assertClose(
         r('++++++++.---.+++++++..+++.').episode_rewards[-1], 0.935)
     self.assertClose(
         r(pad('++++++++.---.+++++++..+++.', maxlen,
               padchr)).episode_rewards[-1], 0.75)
Example #5
  def RunTrainingSteps(self, config_string, num_steps=10):
    """Run a few training steps with the given config.

    Just check that nothing crashes.

    Args:
      config_string: Config encoded in a string. See
          $REPO_PATH/common/config_lib.py
      num_steps: Number of training steps to run. Defaults to 10.
    """
    config = defaults.default_config_with_updates(config_string)
    FLAGS.max_npe = num_steps * config.batch_size
    FLAGS.logdir = tf.test.get_temp_dir()
    FLAGS.config = config_string
    run.main(None)
Example #6
    def RunTrainingSteps(self, config_string, num_steps=10):
        """Run a few training steps with the given config.

    Just check that nothing crashes.

    Args:
      config_string: Config encoded in a string. See
          $REPO_PATH/common/config_lib.py
      num_steps: Number of training steps to run. Defaults to 10.
    """
        config = defaults.default_config_with_updates(config_string)
        FLAGS.max_npe = num_steps * config.batch_size
        FLAGS.logdir = tf.test.get_temp_dir()
        FLAGS.config = config_string
        run.main(None)
Example #7
 def testMakeTask(self):
   maxlen = 100
   padchr = '['
   config = defaults.default_config_with_updates(
       'env=c(config_for_iclr=False,fixed_string=[8,5,12,12,15])')
   task = code_tasks.make_task(config.env, 'print', timestep_limit=maxlen)
   reward_fns = task.rl_batch(1)
   r = reward_fns[0]
   self.assertClose(
       r('++++++++.---.+++++++...').episode_rewards[-1],
       0.2444)
   self.assertClose(
       r('++++++++.---.+++++++..+++.').episode_rewards[-1],
       0.935)
   self.assertClose(
       r(pad('++++++++.---.+++++++..+++.',
             maxlen, padchr)).episode_rewards[-1],
       0.75)
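The pad helper used in the last assertion is not shown in the listing. A plausible stand-in, purely an assumption that matches the call signature pad(code, maxlen, padchr), right-pads the program with the pad character:

# Hypothetical stand-in for the pad() helper used above (an assumption, not
# necessarily the project's implementation).
def pad(code, maxlen, padchr):
  return code + padchr * max(0, maxlen - len(code))

padded = pad('++++++++.---.+++++++..+++.', 100, '[')
print(len(padded))   # 100
print(padded[:30])   # the original program followed by '[' padding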
Example #8
  def testMonteCarloGradients(self):
    """Test Monte Carlo estimate of REINFORCE gradient.

    Test that the Monte Carlo estimate of the REINFORCE gradient is
    approximately equal to the true gradient. We compute the true gradient for a
    toy environment with a very small action space.

    Similar to section 5 of https://arxiv.org/pdf/1505.00521.pdf.
    """
    # The test may have different outcomes on different machines due to
    # differences in floating-point rounding behavior.
    tf.reset_default_graph()
    tf.set_random_seed(12345678987654321)
    np.random.seed(1294024302)
    max_length = 2
    num_tokens = misc.bf_num_tokens()
    eos = misc.BF_EOS_INT
    assert eos == 0
    def sequence_iterator(max_length):
      """Iterates through all sequences up to the given length."""
      yield [eos]
      for a in xrange(1, num_tokens):
        if max_length > 1:
          for sub_seq in sequence_iterator(max_length - 1):
            yield [a] + sub_seq
        else:
          yield [a]
    actions = list(sequence_iterator(max_length))

    # This batch contains all possible episodes up to max_length.
    actions_batch = utils.stack_pad(actions, 0)
    lengths_batch = [len(s) for s in actions]

    reward_map = {tuple(a): np.random.randint(-1, 7) for a in actions_batch}
    # reward_map = {tuple(a): np.random.normal(3, 1)
    #               for a in actions_batch}  # normal distribution
    # reward_map = {tuple(a): 1.0
    #               for a in actions_batch}  # expected reward is 1

    n = 100000  # MC sample size.
    config = defaults.default_config_with_updates(
        'env=c(task="print"),'
        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
        'entropy_beta=0.0,topk_loss_hparam=0.0,regularizer=0.0,'
        'policy_lstm_sizes=[10],eos_token=True),'
        'batch_size='+str(n)+',timestep_limit='+str(max_length))

    dtype = tf.float64
    trainer = pg_train.AsyncTrainer(
        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
    model = trainer.model
    actions_ph = model.actions
    lengths_ph = model.adjusted_lengths
    multipliers_ph = model.policy_multipliers

    global_init_op = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
    with tf.Session() as sess, sess.graph.as_default():
      sess.run(global_init_op)  # Initialize global copy.
      trainer.initialize(sess)

      # Compute exact gradients.
      # exact_grads = sum(P(a) * grad(log P(a)) * R(a) for a in actions_batch)
      true_loss_unnormalized = 0.0
      exact_grads = [np.zeros(v.shape) for v in model.trainable_variables]
      episode_probs_map = {}
      grads_map = {}
      for a_idx in xrange(len(actions_batch)):
        a = actions_batch[a_idx]
        grads_result, probs_result, loss = sess.run(
            [model.dense_unclipped_grads, model.chosen_probs, model.loss],
            {actions_ph: [a],
             lengths_ph: [lengths_batch[a_idx]],
             multipliers_ph: [
                 repeat_and_pad(reward_map[tuple(a)],
                                lengths_batch[a_idx],
                                max_length)]})
        # Take product over time axis.
        episode_probs_result = np.prod(probs_result[0, :lengths_batch[a_idx]])
        for i in range(0, len(exact_grads)):
          exact_grads[i] += grads_result[i] * episode_probs_result
        episode_probs_map[tuple(a)] = episode_probs_result
        reward_map[tuple(a)] = reward_map[tuple(a)]
        grads_map[tuple(a)] = grads_result
        true_loss_unnormalized += loss
      # Normalize loss. Since each episode is fed into the model one at a time,
      # normalization needs to be done manually.
      true_loss = true_loss_unnormalized / float(len(actions_batch))

      # Compute Monte Carlo gradients.
      # E_a~P[grad(log P(a)) R(a)] is approximately equal to
      # sum(grad(log P(a)) R(a) for a in actions_sampled_from_P) / n
      # where len(actions_sampled_from_P) == n.
      #
      # In other words, sample from the policy and compute the gradients of the
      # log probs weighted by the returns. This exercises the code in
      # agent.py.
      sampled_actions, sampled_lengths = sess.run(
          [model.sampled_tokens, model.episode_lengths])
      pi_multipliers = [
          repeat_and_pad(reward_map[tuple(a)], l, max_length)
          for a, l in zip(sampled_actions, sampled_lengths)]
      mc_grads_unnormalized, sampled_probs, mc_loss_unnormalized = sess.run(
          [model.dense_unclipped_grads, model.chosen_probs, model.loss],
          {actions_ph: sampled_actions,
           multipliers_ph: pi_multipliers,
           lengths_ph: sampled_lengths})
      # Loss is already normalized across the minibatch, so no normalization
      # is needed.
      mc_grads = mc_grads_unnormalized
      mc_loss = mc_loss_unnormalized

    # Make sure true loss and MC loss are similar.
    loss_error = smape(true_loss, mc_loss)
    self.assertTrue(loss_error < 0.15, msg='actual: %s' % loss_error)

    # Check that probs computed for episodes sampled from the model are the same
    # as the recorded true probs.
    for i in range(100):
      acs = tuple(sampled_actions[i].tolist())
      sampled_prob = np.prod(sampled_probs[i, :sampled_lengths[i]])
      self.assertTrue(np.isclose(episode_probs_map[acs], sampled_prob))

    # Make sure MC estimates of true probs are close.
    counter = Counter(tuple(e) for e in sampled_actions)
    for acs, count in counter.iteritems():
      mc_prob = count / float(len(sampled_actions))
      true_prob = episode_probs_map[acs]
      error = smape(mc_prob, true_prob)
      self.assertTrue(
          error < 0.15,
          msg='actual: %s; count: %s; mc_prob: %s; true_prob: %s'
          % (error, count, mc_prob, true_prob))

    # Manually recompute MC gradients and make sure they match MC gradients
    # computed in TF.
    mc_grads_recompute = [np.zeros(v.shape) for v in model.trainable_variables]
    for i in range(n):
      acs = tuple(sampled_actions[i].tolist())
      for i in range(0, len(mc_grads_recompute)):
        mc_grads_recompute[i] += grads_map[acs][i]
    for i in range(0, len(mc_grads_recompute)):
      self.assertTrue(np.allclose(mc_grads[i], mc_grads_recompute[i] / n))

    # Check angle between gradients as fraction of pi.
    for index in range(len(mc_grads)):
      v1 = mc_grads[index].reshape(-1)
      v2 = exact_grads[index].reshape(-1)
      # angle = arccos(v1 . v2 / (|v1|*|v2|))
      angle_rad = np.arccos(
          np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
      logging.info('angle / pi: %s', angle_rad / np.pi)
      angle_frac = angle_rad / np.pi
      self.assertTrue(angle_frac < 0.02, msg='actual: %s' % angle_frac)
    # Check norms.
    for index in range(len(mc_grads)):
      v1_norm = np.linalg.norm(mc_grads[index].reshape(-1))
      v2_norm = np.linalg.norm(exact_grads[index].reshape(-1))
      error = smape(v1_norm, v2_norm)
      self.assertTrue(error < 0.02, msg='actual: %s' % error)

    # Check expected rewards.
    # E_a~P[R(a)] approx eq sum(P(a) * R(a) for a in actions)
    mc_expected_reward = np.mean(
        [reward_map[tuple(a)] for a in sampled_actions])
    exact_expected_reward = np.sum(
        [episode_probs_map[k] * reward_map[k] for k in reward_map])
    error = smape(mc_expected_reward, exact_expected_reward)
    self.assertTrue(error < 0.005, msg='actual: %s' % error)
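The quantity under test is the REINFORCE identity, grad_theta E[R(a)] = E[R(a) * grad_theta log pi(a)]. The same exact-versus-Monte-Carlo comparison can be reproduced on a much smaller toy policy; the snippet below uses a single softmax over three actions as an illustrative stand-in for the LSTM policy.

# Toy numerical check of the REINFORCE identity:
#   grad_theta E_a~pi[R(a)] == E_a~pi[ R(a) * grad_theta log pi(a) ]
import numpy as np

np.random.seed(0)
theta = np.random.randn(3)                  # logits of the toy policy
rewards = np.array([1.0, -1.0, 2.0])        # fixed reward per action
probs = np.exp(theta) / np.exp(theta).sum()

# Exact gradient: sum_a P(a) * R(a) * grad log P(a), where for a softmax
# grad_theta log P(a) = onehot(a) - probs.
exact = sum(probs[a] * rewards[a] * (np.eye(3)[a] - probs) for a in range(3))

# Monte Carlo estimate from n samples of the policy.
n = 200000
samples = np.random.choice(3, size=n, p=probs)
score = np.eye(3)[samples] - probs                   # grad log pi per sample
mc = (rewards[samples][:, None] * score).mean(axis=0)

print(exact)
print(mc)  # close to `exact` for large n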
Example #9
  def testVarUpdates(self):
    """Tests that variables get updated as expected.

    For the RL update, check that gradients are non-zero and that the global
    model gets updated.
    """
    config = defaults.default_config_with_updates(
        'env=c(task="reverse"),'
        'agent=c(algorithm="pg",eos_token=True,optimizer="sgd",lr=1.0)')
    lr = config.agent.lr

    tf.reset_default_graph()
    trainer = pg_train.AsyncTrainer(
        config, task_id=0, ps_tasks=0, num_workers=1)
    global_init_op = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
    with tf.Session() as sess:
      sess.run(global_init_op)  # Initialize global copy.
      trainer.initialize(sess)
      model = trainer.model
      global_vars = sess.run(trainer.global_model.trainable_variables)
      local_vars = sess.run(model.trainable_variables)

      # Make sure names match.
      g_prefix = 'global/'
      l_prefix = 'local/'
      for g, l in zip(trainer.global_model.trainable_variables,
                      model.trainable_variables):
        self.assertEqual(g.name[len(g_prefix):], l.name[len(l_prefix):])

      # Assert that shapes and values are the same between global and local
      # models.
      for g, l in zip(global_vars, local_vars):
        self.assertEqual(g.shape, l.shape)
        self.assertTrue(np.array_equal(g, l))

      # Make all gradients dense tensors.
      for param, grad in model.gradients_dict.items():
        if isinstance(grad, tf.IndexedSlices):
          # Converts to dense tensor.
          model.gradients_dict[param] = tf.multiply(grad, 1.0)

      # Perform update.
      results = model.update_step(
          sess, trainer.data_manager.sample_rl_batch(), trainer.train_op,
          trainer.global_step, return_gradients=True)
      grads_dict = results.gradients_dict
      for grad in grads_dict.values():
        self.assertIsNotNone(grad)
        self.assertTrue(np.count_nonzero(grad) > 0)
      global_update = sess.run(trainer.global_model.trainable_variables)
      for tf_var, var_before, var_after in zip(
          model.trainable_variables, local_vars, global_update):
        # Check that the params were updated.
        self.assertTrue(np.allclose(
            var_after,
            var_before - grads_dict[tf_var] * lr))

      # Test that global to local sync works.
      sess.run(trainer.sync_op)
      global_vars = sess.run(trainer.global_model.trainable_variables)
      local_vars = sess.run(model.trainable_variables)
      for l, g in zip(local_vars, global_vars):
        self.assertTrue(np.allclose(l, g))
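The core assertion above is that, with the plain sgd optimizer, every global variable moves by exactly -lr * grad after one update and that the gradients are non-zero. A tiny numpy restatement of that invariant, with a toy quadratic loss standing in for the policy loss:

# Small illustration of the update invariant checked by testVarUpdates:
# params_after == params_before - lr * grad for plain SGD.
import numpy as np

lr = 1.0
w_before = np.array([0.5, -0.25, 1.0])   # stands in for one trainable variable
target = np.array([1.0, 1.0, 1.0])

grad = 2.0 * (w_before - target)         # gradient of ||w - target||^2
w_after = w_before - lr * grad           # one SGD step

print(np.count_nonzero(grad) > 0)                   # gradients are non-zero
print(np.allclose(w_after, w_before - grad * lr))   # update matches -lr * grad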
Example #10
  def testNumericalGradChecking(self):
    # Similar to
    # http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization.
    epsilon = 1e-4
    eos = misc.BF_EOS_INT
    self.assertEqual(0, eos)
    config = defaults.default_config_with_updates(
        'env=c(task="print"),'
        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
        'entropy_beta=0.0,topk_loss_hparam=0.0,policy_lstm_sizes=[10],'
        'eos_token=True),'
        'batch_size=64')
    dtype = tf.float64
    tf.reset_default_graph()
    tf.set_random_seed(12345678987654321)
    np.random.seed(1294024302)
    trainer = pg_train.AsyncTrainer(
        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
    model = trainer.model
    actions_ph = model.actions
    lengths_ph = model.adjusted_lengths
    multipliers_ph = model.policy_multipliers
    loss = model.pi_loss
    global_init_op = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))

    assign_add_placeholders = [None] * len(model.trainable_variables)
    assign_add_ops = [None] * len(model.trainable_variables)
    param_shapes = [None] * len(model.trainable_variables)
    for i, param in enumerate(model.trainable_variables):
      param_shapes[i] = param.get_shape().as_list()
      assign_add_placeholders[i] = tf.placeholder(dtype,
                                                  np.prod(param_shapes[i]))
      assign_add_ops[i] = param.assign_add(
          tf.reshape(assign_add_placeholders[i], param_shapes[i]))

    with tf.Session() as sess:
      sess.run(global_init_op)  # Initialize global copy.
      trainer.initialize(sess)

      actions_raw = [random_sequence(10, 9) for _ in xrange(16)]
      actions_batch = utils.stack_pad(actions_raw, 0)
      lengths_batch = [len(l) for l in actions_raw]
      feed = {actions_ph: actions_batch,
              multipliers_ph: np.ones_like(actions_batch),
              lengths_ph: lengths_batch}

      estimated_grads = [None] * len(model.trainable_variables)
      for i, param in enumerate(model.trainable_variables):
        param_size = np.prod(param_shapes[i])
        estimated_grads[i] = np.zeros(param_size, dtype=np.float64)
        for index in xrange(param_size):
          e = onehot(index, param_size) * epsilon
          sess.run(assign_add_ops[i],
                   {assign_add_placeholders[i]: e})
          j_plus = sess.run(loss, feed)
          sess.run(assign_add_ops[i],
                   {assign_add_placeholders[i]: -2 * e})
          j_minus = sess.run(loss, feed)
          sess.run(assign_add_ops[i],
                   {assign_add_placeholders[i]: e})
          estimated_grads[i][index] = (j_plus - j_minus) / (2 * epsilon)
        estimated_grads[i] = estimated_grads[i].reshape(param_shapes[i])

      analytic_grads = sess.run(model.dense_unclipped_grads, feed)

      for g1, g2 in zip(estimated_grads[1:], analytic_grads[1:]):
        logging.info('norm (g1-g2): %s', np.abs(g1 - g2).mean())
        self.assertTrue(np.allclose(g1, g2))
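The estimate above is the standard central difference (J(theta + eps * e_i) - J(theta - eps * e_i)) / (2 * eps), compared element by element against the analytic gradient. A self-contained version of the same procedure on a small softmax cross-entropy loss, used here as an illustrative stand-in for model.pi_loss:

# Central-difference gradient check on a toy loss, mirroring the structure of
# testNumericalGradChecking.
import numpy as np

def loss_fn(theta, label=2):
  probs = np.exp(theta) / np.exp(theta).sum()
  return -np.log(probs[label])

def analytic_grad(theta, label=2):
  probs = np.exp(theta) / np.exp(theta).sum()
  probs[label] -= 1.0   # softmax cross-entropy gradient: probs - onehot(label)
  return probs

epsilon = 1e-4
theta = np.random.RandomState(0).randn(5)
estimated = np.zeros_like(theta)
for i in range(len(theta)):
  e = np.zeros_like(theta)
  e[i] = epsilon
  estimated[i] = (loss_fn(theta + e) - loss_fn(theta - e)) / (2 * epsilon)

print(np.abs(estimated - analytic_grad(theta)).max())  # roughly 1e-9
assert np.allclose(estimated, analytic_grad(theta), atol=1e-6)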
Example #11
def run_training(
        config=None,
        tuner=None,
        logdir=None,
        trial_name=None,  # pylint: disable=unused-argument
        is_chief=True):
    """Do all training runs.

  This is the top level training function for the genetic algorithm and random
  search agents.
  Run this from the main function.

  Args:
    config: config_lib.Config instance containing global config (agent and
        environment hparams). If None, config will be parsed from FLAGS.config.
    tuner: (unused) A tuner instance. Leave as None if not tuning.
    logdir: Parent directory where all data from all runs will be written. If
        None, FLAGS.logdir will be used.
    trial_name: (unused) If tuning, set this to a unique string that identifies
        this trial. If `tuner` is not None, this also must be set.
    is_chief: True if this worker is the chief.

  Returns:
    List of results dicts which were written to disk. Each training run gets a
    results dict. Results dict contains metrics, i.e. (name, value) pairs which
    give information about the training run.

  Raises:
    ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
    ValueError: If results dicts read from disk contain invalid data.
  """
    if not config:
        # If custom config is not given, get it from flags.
        config = defaults.default_config_with_updates(FLAGS.config)
    if not logdir:
        logdir = FLAGS.logdir

    if FLAGS.num_repetitions % FLAGS.num_workers != 0:
        raise ValueError('Number of workers must divide number of repetitions')
    num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
    logging.info('Running %d reps globally.', FLAGS.num_repetitions)
    logging.info('This worker will run %d local reps.', num_local_reps)
    if FLAGS.max_npe:
        max_generations = FLAGS.max_npe // config.batch_size
        logging.info('Max samples per rep: %d', FLAGS.max_npe)
        logging.info('Max generations per rep: %d', max_generations)
    else:
        max_generations = sys.maxint
        logging.info('Running unlimited generations.')

    assert FLAGS.num_workers > 0
    logging.info('Starting experiment. Directory: "%s"', logdir)
    results = results_lib.Results(logdir, FLAGS.task_id)
    local_results_list = results.read_this_shard()
    if local_results_list:
        if local_results_list[0]['max_npe'] != FLAGS.max_npe:
            raise ValueError(
                'Cannot resume training. Max-NPE changed. Was %s, now %s',
                local_results_list[0]['max_npe'], FLAGS.max_npe)
        if local_results_list[0][
                'max_global_repetitions'] != FLAGS.num_repetitions:
            raise ValueError(
                'Cannot resume training. Number of repetitions changed. Was %s, '
                'now %s', local_results_list[0]['max_global_repetitions'],
                FLAGS.num_repetitions)
    start_rep = len(local_results_list)

    for rep in xrange(start_rep, num_local_reps):
        global_rep = num_local_reps * FLAGS.task_id + rep
        logging.info('Starting repetition: Rep = %d. (global rep = %d)', rep,
                     global_rep)

        # Saved data for each rep, like checkpoints, goes into its own folder.
        run_dir = os.path.join(logdir, 'run_%d' % global_rep)

        if not tf.gfile.IsDirectory(run_dir):
            tf.gfile.MakeDirs(run_dir)
        checkpoint_writer = CheckpointWriter(run_dir,
                                             population_size=config.batch_size)

        data_manager = data.DataManager(config, run_number=global_rep)
        task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)

        if config.agent.algorithm == 'rand':
            logging.info('Running random search.')
            assert FLAGS.max_npe
            result = run_random_search(FLAGS.max_npe, run_dir, task_eval_fn,
                                       config.timestep_limit)
        else:
            assert config.agent.algorithm == 'ga'
            logging.info('Running genetic algorithm.')
            pop = ga_lib.make_population(ga_lib.random_individual(
                config.timestep_limit),
                                         n=config.batch_size)
            hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame.
            result = ga_lib.ga_loop(pop,
                                    cxpb=config.agent.crossover_rate,
                                    mutpb=config.agent.mutation_rate,
                                    task_eval_fn=task_eval_fn,
                                    ngen=max_generations,
                                    halloffame=hof,
                                    checkpoint_writer=checkpoint_writer)

        logging.info('Finished rep. Num gens: %d', result.generations)

        results_dict = {
            'max_npe': FLAGS.max_npe,
            'batch_size': config.batch_size,
            'max_batches': FLAGS.max_npe // config.batch_size,
            'npe': result.num_programs,
            'max_global_repetitions': FLAGS.num_repetitions,
            'max_local_repetitions': num_local_reps,
            'code_solution': result.best_code if result.solution_found else '',
            'best_reward': result.reward,
            'num_batches': result.generations,
            'found_solution': result.solution_found,
            'task': data_manager.task_name,
            'global_rep': global_rep
        }
        logging.info('results_dict: %s', results_dict)
        results.append(results_dict)

    if is_chief:
        logging.info(
            'Worker is chief. Waiting for all workers to finish so that results '
            'can be reported to the tuner.')

        global_results_list, shard_stats = results.read_all(
            num_shards=FLAGS.num_workers)
        while not all(s.finished for s in shard_stats):
            logging.info(
                'Still waiting on these workers: %s', ', '.join([
                    '%d (%d reps left)' %
                    (i, s.max_local_reps - s.num_local_reps_completed)
                    for i, s in enumerate(shard_stats) if not s.finished
                ]))
            sleep(60)
            global_results_list, shard_stats = results.read_all(
                num_shards=FLAGS.num_workers)

        logging.info(
            '%d results obtained. Chief worker is exiting the experiment.',
            len(global_results_list))

        return global_results_list
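The repetition bookkeeping above assigns each worker a disjoint block of global rep ids, which in turn name the run_%d directories. A minimal sketch of that mapping:

# Sketch of how run_training shards repetitions across workers: each worker
# owns num_repetitions // num_workers reps, and the global rep id is fixed by
# (task_id, local rep index).
num_repetitions = 8
num_workers = 4
assert num_repetitions % num_workers == 0
num_local_reps = num_repetitions // num_workers

for task_id in range(num_workers):
  run_dirs = ['run_%d' % (num_local_reps * task_id + rep)
              for rep in range(num_local_reps)]
  print('worker %d -> %s' % (task_id, run_dirs))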
Example #12
def run_training(config=None,
                 tuner=None,
                 logdir=None,
                 trial_name=None,
                 is_chief=True):
    """Do all training runs.

  This is the top level training function for policy gradient based models.
  Run this from the main function.

  Args:
    config: config_lib.Config instance containing global config (agent and
        environment hparams). If None, config will be parsed from FLAGS.config.
    tuner: A tuner instance. Leave as None if not tuning.
    logdir: Parent directory where all data from all runs will be written. If
        None, FLAGS.logdir will be used.
    trial_name: If tuning, set this to a unique string that identifies this
        trial. If `tuner` is not None, this also must be set.
    is_chief: True if this worker is the chief.

  Returns:
    List of results dicts which were written to disk. Each training run gets a
    results dict. Results dict contains metrics, i.e. (name, value) pairs which
    give information about the training run.

  Raises:
    ValueError: If results dicts read from disk contain invalid data.
  """
    if not config:
        # If custom config is not given, get it from flags.
        config = defaults.default_config_with_updates(FLAGS.config)
    if not logdir:
        logdir = FLAGS.logdir
    if not tf.gfile.Exists(logdir):
        tf.gfile.MakeDirs(logdir)
    assert FLAGS.num_repetitions > 0
    results = results_lib.Results(logdir)
    results_list, _ = results.read_all()

    tf.logging.info('Starting experiment. Directory: "%s"', logdir)

    if results_list:
        if results_list[0]['max_npe'] != FLAGS.max_npe:
            raise ValueError(
                'Cannot resume training. Max-NPE changed. Was %s, now %s',
                results_list[0]['max_npe'], FLAGS.max_npe)
        if results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
            raise ValueError(
                'Cannot resume training. Number of repetitions changed. Was %s, '
                'now %s', results_list[0]['max_global_repetitions'],
                FLAGS.num_repetitions)

    while len(results_list) < FLAGS.num_repetitions:
        run_number = len(results_list)
        rep_container_name = trial_name if trial_name else 'container'
        if FLAGS.num_repetitions > 1:
            rep_dir = os.path.join(logdir, 'run_%d' % run_number)
            rep_container_name = rep_container_name + '_run_' + str(run_number)
        else:
            rep_dir = logdir

        tf.logging.info('Starting repetition %d (%d out of %d)', run_number,
                        run_number + 1, FLAGS.num_repetitions)

        # Train will write result to disk.
        with tf.container(rep_container_name):
            trainer = train(config, is_chief, tuner, rep_dir, run_number,
                            results)
        tf.logging.info('Done training.')

        if is_chief:
            # Destroy current container immediately (clears current graph).
            tf.logging.info('Clearing shared variables.')
            tf.Session.reset(FLAGS.master, containers=[rep_container_name])
            tf.logging.info('Shared variables cleared.')

            # Delete replay buffer on disk.
            assert trainer
            trainer.delete_replay_buffer()
        else:
            # Give chief worker time to clean up.
            sleep_sec = 30.0
            tf.logging.info('Sleeping for %s sec.', sleep_sec)
            time.sleep(sleep_sec)
        tf.reset_default_graph()
        tf.logging.info('Default graph reset.')

        # Expecting that train wrote new result to disk before returning.
        results_list, _ = results.read_all()
    return results_list
Example #13
def run_training(config=None, tuner=None, logdir=None, trial_name=None,
                 is_chief=True):
  """Do all training runs.

  This is the top level training function for policy gradient based models.
  Run this from the main function.

  Args:
    config: config_lib.Config instance containing global config (agent and
        environment hparams). If None, config will be parsed from FLAGS.config.
    tuner: A tuner instance. Leave as None if not tuning.
    logdir: Parent directory where all data from all runs will be written. If
        None, FLAGS.logdir will be used.
    trial_name: If tuning, set this to a unique string that identifies this
        trial. If `tuner` is not None, this also must be set.
    is_chief: True if this worker is the chief.

  Returns:
    List of results dicts which were written to disk. Each training run gets a
    results dict. Results dict contains metrics, i.e. (name, value) pairs which
    give information about the training run.

  Raises:
    ValueError: If results dicts read from disk contain invalid data.
  """
  if not config:
    # If custom config is not given, get it from flags.
    config = defaults.default_config_with_updates(FLAGS.config)
  if not logdir:
    logdir = FLAGS.logdir
  if not tf.gfile.Exists(logdir):
    tf.gfile.MakeDirs(logdir)
  assert FLAGS.num_repetitions > 0
  results = results_lib.Results(logdir)
  results_list, _ = results.read_all()

  logging.info('Starting experiment. Directory: "%s"', logdir)

  if results_list:
    if results_list[0]['max_npe'] != FLAGS.max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s',
          results_list[0]['max_npe'], FLAGS.max_npe)
    if results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. Was %s, '
          'now %s',
          results_list[0]['max_global_repetitions'],
          FLAGS.num_repetitions)

  while len(results_list) < FLAGS.num_repetitions:
    run_number = len(results_list)
    rep_container_name = trial_name if trial_name else 'container'
    if FLAGS.num_repetitions > 1:
      rep_dir = os.path.join(logdir, 'run_%d' % run_number)
      rep_container_name = rep_container_name + '_run_' + str(run_number)
    else:
      rep_dir = logdir

    logging.info(
        'Starting repetition %d (%d out of %d)', run_number, run_number + 1,
        FLAGS.num_repetitions)

    # Train will write result to disk.
    with tf.container(rep_container_name):
      trainer = train(config, is_chief, tuner, rep_dir, run_number, results)
    logging.info('Done training.')

    if is_chief:
      # Destroy current container immediately (clears current graph).
      logging.info('Clearing shared variables.')
      tf.Session.reset(FLAGS.master, containers=[rep_container_name])
      logging.info('Shared variables cleared.')

      # Delete replay buffer on disk.
      assert trainer
      trainer.delete_replay_buffer()
    else:
      # Give chief worker time to clean up.
      sleep_sec = 30.0
      logging.info('Sleeping for %s sec.', sleep_sec)
      time.sleep(sleep_sec)
    tf.reset_default_graph()
    logging.info('Default graph reset.')

    # Expecting that train wrote new result to disk before returning.
    results_list, _ = results.read_all()
  return results_list
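Resumption above works by counting the results already written to disk: the length of results_list is the next run_number, provided the recorded experiment settings still match the current flags. A stand-alone illustration of that guard; the shape of the results dicts is an assumption based on the results_dict written by the GA variant earlier in this listing.

# Stand-alone illustration of the resume guard in run_training.
def check_resume(results_list, max_npe, num_repetitions):
  if results_list:
    if results_list[0]['max_npe'] != max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s'
          % (results_list[0]['max_npe'], max_npe))
    if results_list[0]['max_global_repetitions'] != num_repetitions:
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. Was %s, '
          'now %s' % (results_list[0]['max_global_repetitions'],
                      num_repetitions))
  return len(results_list)  # the next run_number to execute

on_disk = [{'max_npe': 100000, 'max_global_repetitions': 4}] * 2
print(check_resume(on_disk, max_npe=100000, num_repetitions=4))  # resumes at 2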
Example #14
def run_training(config=None, tuner=None, logdir=None, trial_name=None,  # pylint: disable=unused-argument
                 is_chief=True):
  """Do all training runs.

  This is the top level training function for the genetic algorithm and random
  search agents.
  Run this from the main function.

  Args:
    config: config_lib.Config instance containing global config (agent and
        environment hparams). If None, config will be parsed from FLAGS.config.
    tuner: (unused) A tuner instance. Leave as None if not tuning.
    logdir: Parent directory where all data from all runs will be written. If
        None, FLAGS.logdir will be used.
    trial_name: (unused) If tuning, set this to a unique string that identifies
        this trial. If `tuner` is not None, this also must be set.
    is_chief: True if this worker is the chief.

  Returns:
    List of results dicts which were written to disk. Each training run gets a
    results dict. Results dict contains metrics, i.e. (name, value) pairs which
    give information about the training run.

  Raises:
    ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
    ValueError: If results dicts read from disk contain invalid data.
  """
  if not config:
    # If custom config is not given, get it from flags.
    config = defaults.default_config_with_updates(FLAGS.config)
  if not logdir:
    logdir = FLAGS.logdir

  if FLAGS.num_repetitions % FLAGS.num_workers != 0:
    raise ValueError('Number of workers must divide number of repetitions')
  num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
  logging.info('Running %d reps globally.', FLAGS.num_repetitions)
  logging.info('This worker will run %d local reps.', num_local_reps)
  if FLAGS.max_npe:
    max_generations = FLAGS.max_npe // config.batch_size
    logging.info('Max samples per rep: %d', FLAGS.max_npe)
    logging.info('Max generations per rep: %d', max_generations)
  else:
    max_generations = sys.maxint
    logging.info('Running unlimited generations.')

  assert FLAGS.num_workers > 0
  logging.info('Starting experiment. Directory: "%s"', logdir)
  results = results_lib.Results(logdir, FLAGS.task_id)
  local_results_list = results.read_this_shard()
  if local_results_list:
    if local_results_list[0]['max_npe'] != FLAGS.max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s',
          local_results_list[0]['max_npe'], FLAGS.max_npe)
    if local_results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. Was %s, '
          'now %s',
          local_results_list[0]['max_global_repetitions'],
          FLAGS.num_repetitions)
  start_rep = len(local_results_list)

  for rep in xrange(start_rep, num_local_reps):
    global_rep = num_local_reps * FLAGS.task_id + rep
    logging.info(
        'Starting repetition: Rep = %d. (global rep = %d)',
        rep, global_rep)

    # Saved data for each rep, like checkpoints, goes into its own folder.
    run_dir = os.path.join(logdir, 'run_%d' % global_rep)

    if not tf.gfile.IsDirectory(run_dir):
      tf.gfile.MakeDirs(run_dir)
    checkpoint_writer = CheckpointWriter(run_dir,
                                         population_size=config.batch_size)

    data_manager = data.DataManager(config, run_number=global_rep)
    task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)

    if config.agent.algorithm == 'rand':
      logging.info('Running random search.')
      assert FLAGS.max_npe
      result = run_random_search(
          FLAGS.max_npe, run_dir, task_eval_fn, config.timestep_limit)
    else:
      assert config.agent.algorithm == 'ga'
      logging.info('Running genetic algorithm.')
      pop = ga_lib.make_population(
          ga_lib.random_individual(config.timestep_limit),
          n=config.batch_size)
      hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame.
      result = ga_lib.ga_loop(
          pop,
          cxpb=config.agent.crossover_rate, mutpb=config.agent.mutation_rate,
          task_eval_fn=task_eval_fn,
          ngen=max_generations, halloffame=hof,
          checkpoint_writer=checkpoint_writer)

    logging.info('Finished rep. Num gens: %d', result.generations)

    results_dict = {
        'max_npe': FLAGS.max_npe,
        'batch_size': config.batch_size,
        'max_batches': FLAGS.max_npe // config.batch_size,
        'npe': result.num_programs,
        'max_global_repetitions': FLAGS.num_repetitions,
        'max_local_repetitions': num_local_reps,
        'code_solution': result.best_code if result.solution_found else '',
        'best_reward': result.reward,
        'num_batches': result.generations,
        'found_solution': result.solution_found,
        'task': data_manager.task_name,
        'global_rep': global_rep}
    logging.info('results_dict: %s', results_dict)
    results.append(results_dict)

  if is_chief:
    logging.info(
        'Worker is chief. Waiting for all workers to finish so that results '
        'can be reported to the tuner.')

    global_results_list, shard_stats = results.read_all(
        num_shards=FLAGS.num_workers)
    while not all(s.finished for s in shard_stats):
      logging.info(
          'Still waiting on these workers: %s',
          ', '.join(
              ['%d (%d reps left)'
               % (i, s.max_local_reps - s.num_local_reps_completed)
               for i, s in enumerate(shard_stats)
               if not s.finished]))
      sleep(60)
      global_results_list, shard_stats = results.read_all(
          num_shards=FLAGS.num_workers)

    logging.info(
        '%d results obtained. Chief worker is exiting the experiment.',
        len(global_results_list))

    return global_results_list