def test_experiences_to_batches(target_computer_class_mock):
  # The patched target computer returns fixed targets, so the test can
  # assert that experiences_to_batches passes them through unchanged.
  compute = target_computer_class_mock.return_value.compute
  compute.return_value = np.array([42, 43])

  state1 = np.arange(16).reshape((4, 4)) + 1
  state2 = np.arange(16).reshape((4, 4)) + 2
  state3 = np.arange(16).reshape((4, 4)) + 3
  experiences = [Experience(state1, 1, 2, state2, False, False, [3]),
                 Experience(state2, 3, 4, state3, True, False, [])]

  run_inference = Mock(side_effect=[np.array([[0, 0, 0, -0.5],
                                              [0, 0, 0, 0]])])

  batcher = ExperienceBatcher(None, run_inference, None, 1.0 / 15.0)

  state_batch, targets, actions = batcher.experiences_to_batches(experiences)

  reward_batch = np.array([2, 4])
  bad_action_batch = np.array([False, True])
  next_state_batch = np.array([state2.flatten(), state3.flatten()]) / 15.0
  available_actions_batch = np.array([[False, False, False, True],
                                      [False, False, False, False]])

  assert (compute.call_args_list[0][0][0] == reward_batch).all()
  assert (compute.call_args_list[0][0][1] == bad_action_batch).all()
  assert (compute.call_args_list[0][0][2] == next_state_batch).all()
  assert (compute.call_args_list[0][0][3] == available_actions_batch).all()

  expected_state_batch = np.array([state1.flatten(), state2.flatten()]) / 15.0

  assert (state_batch == expected_state_batch).all()
  assert (targets == np.array([42, 43])).all()
  assert (actions == np.array([1, 3])).all()
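
This test relies on a target_computer_class_mock pytest fixture that is not shown above, plus numpy, Mock from unittest.mock, and Experience / ExperienceBatcher from the project under test. A minimal sketch of how such a fixture could be defined, assuming the batcher module instantiates a TargetComputer class (the module path "experience_batcher" and the class name are assumptions, not taken from the example):

import pytest
from unittest import mock

@pytest.fixture
def target_computer_class_mock():
  # Patch the class where ExperienceBatcher looks it up, so that
  # return_value.compute can be configured by each test.
  # NOTE: "experience_batcher.TargetComputer" is a placeholder path.
  with mock.patch("experience_batcher.TargetComputer") as mock_class:
    yield mock_class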
Example #2
def run_training(train_dir):
    """Run training"""

    resume = os.path.exists(train_dir)

    with tf.Graph().as_default():
        model = FeedModel()
        saver = tf.train.Saver()
        session = tf.Session()
        summary_writer = tf.summary.FileWriter(train_dir,
                                               graph=session.graph,
                                               flush_secs=10)

        if resume:
            print("Resuming: ", train_dir)
            saver.restore(session, tf.train.latest_checkpoint(train_dir))
        else:
            print("Starting new training: ", train_dir)
            session.run(model.init)

        run_inference = make_run_inference(session, model)
        get_q_values = make_get_q_values(session, model)

        experience_collector = ExperienceCollector()
        batcher = ExperienceBatcher(experience_collector, run_inference,
                                    get_q_values, STATE_NORMALIZE_FACTOR)

        test_experiences = experience_collector.collect(
            play.random_strategy, 100)

        for state_batch, targets, actions in batcher.get_batches_stepwise():

            global_step, _ = session.run(
                [model.global_step, model.train_op],
                feed_dict={
                    model.state_batch_placeholder: state_batch,
                    model.targets_placeholder: targets,
                    model.actions_placeholder: actions,
                })

            # Checkpoint and log every 1000 steps.
            if global_step % 1000 == 0 and global_step != 0:
                saver.save(session,
                           train_dir + "/checkpoint",
                           global_step=global_step)
                loss = write_summaries(session, batcher, model,
                                       test_experiences, summary_writer)
                print("Step:", global_step, "Loss:", loss)