Example #1
  def __init__(self, policy, batch_size=None, seed=None):
    """Initializes a new `PyTFPolicy`.

    Args:
      policy: A TF Policy implementing `tf_policy.Base`.
      batch_size: (deprecated)
      seed: Seed to use if policy performs random actions (optional).
    """
    if not isinstance(policy, tf_policy.Base):
      logging.warning('Policy should implement tf_policy.Base')

    if batch_size is not None:
      logging.warning(
          'In PyTFPolicy constructor, `batch_size` is deprecated, '
          'this parameter has no effect. This argument will be '
          'removed on 2019-05-01')

    time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec)
    action_spec = tensor_spec.to_nest_array_spec(policy.action_spec)
    super(PyTFPolicy, self).__init__(
        time_step_spec, action_spec, policy_state_spec=(), info_spec=())

    self._tf_policy = policy
    self.session = None

    self._policy_state_spec = tensor_spec.to_nest_array_spec(
        self._tf_policy.policy_state_spec)

    self._batch_size = None
    self._batched = None
    self._seed = seed
    self._built = False
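Every example on this page hinges on `tensor_spec.to_nest_array_spec`, which maps an arbitrarily nested structure of `tf.TensorSpec`/`BoundedTensorSpec` objects to the matching structure of `ArraySpec`/`BoundedArraySpec` objects used on the NumPy side. A minimal sketch of the conversion, assuming TF-Agents is installed (the spec names below are placeholders, not taken from any example):

import tensorflow as tf
from tf_agents.specs import tensor_spec

# Placeholder nest of tensor specs; any nesting of lists/tuples/dicts works.
nested_tensor_spec = {
    'observation': tf.TensorSpec(shape=(4,), dtype=tf.float32),
    'action': tensor_spec.BoundedTensorSpec(
        shape=(), dtype=tf.int32, minimum=0, maximum=1),
}

# Same structure, shapes and dtypes, but expressed as (Bounded)ArraySpecs.
nested_array_spec = tensor_spec.to_nest_array_spec(nested_tensor_spec)
print(nested_array_spec)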
Example #2
  def testBuild(self):
    policy = py_tf_policy.PyTFPolicy(self._tf_policy)
    expected_time_step_spec = ts.time_step_spec(
        tensor_spec.to_nest_array_spec(self._obs_spec))
    expected_action_spec = tensor_spec.to_nest_array_spec(self._action_spec)
    self.assertEqual(expected_time_step_spec, policy.time_step_spec())
    self.assertEqual(expected_action_spec, policy.action_spec())
Example #3
  def __init__(self, policy, batch_size=None, seed=None):
    """Initializes a new `PyTFPolicy`.

    Args:
      policy: A TF Policy implementing `tf_policy.Base`.
      batch_size: (deprecated)
      seed: Seed to use if policy performs random actions (optional).
    """
    if not isinstance(policy, tf_policy.Base):
      logging.warning('Policy should implement tf_policy.Base')

    time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec())
    action_spec = tensor_spec.to_nest_array_spec(policy.action_spec())
    super(PyTFPolicy, self).__init__(
        time_step_spec, action_spec, policy_state_spec=(), info_spec=())

    self._tf_policy = policy
    self.session = None

    self._policy_state_spec = tensor_spec.to_nest_array_spec(
        self._tf_policy.policy_state_spec())

    self._batch_size = None
    self._batched = None
    self._seed = seed
    self._built = False
Example #4
  def __init__(self, policy, batch_size=None, seed=None):
    """Initializes a new `PyTFPolicy`.

    Args:
      policy: A TF Policy implementing `tf_policy.Base`.
      batch_size: The batch size of time_steps and actions.
      seed: Seed to use if policy performs random actions (optional).
    """
    if not isinstance(policy, tf_policy.Base):
      tf.logging.warning('Policy should implement tf_policy.Base')

    self._tf_policy = policy
    self.session = None

    self._time_step_spec = tensor_spec.to_nest_array_spec(
        self._tf_policy.time_step_spec())
    self._action_spec = tensor_spec.to_nest_array_spec(
        self._tf_policy.action_spec())
    self._policy_state_spec = tensor_spec.to_nest_array_spec(
        self._tf_policy.policy_state_spec())

    self._batch_size = batch_size
    self._seed = seed
    self._batched = batch_size is not None
    self._set_up_feeds_and_fetches()
Example #5
  def __init__(self, policy, use_tf_function=False):
    time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec)
    action_spec = tensor_spec.to_nest_array_spec(policy.action_spec)
    policy_state_spec = tensor_spec.to_nest_array_spec(policy.policy_state_spec)
    info_spec = tensor_spec.to_nest_array_spec(policy.info_spec)
    super(PyTFEagerPolicy,
          self).__init__(policy, time_step_spec, action_spec, policy_state_spec,
                         info_spec, use_tf_function)
Example #6
  def __init__(self,
               policy: tf_policy.TFPolicy,
               use_tf_function: bool = False,
               batch_time_steps=True):
    time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec)
    action_spec = tensor_spec.to_nest_array_spec(policy.action_spec)
    policy_state_spec = tensor_spec.to_nest_array_spec(policy.policy_state_spec)
    info_spec = tensor_spec.to_nest_array_spec(policy.info_spec)
    super(PyTFEagerPolicy,
          self).__init__(policy, time_step_spec, action_spec, policy_state_spec,
                         info_spec, use_tf_function, batch_time_steps)
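Examples #5 and #6 show the constructor of `PyTFEagerPolicy`, which performs the same spec conversion so that an eager TF policy can be driven directly from Python environments. A hedged usage sketch; `tf_agent` and `py_env` are placeholders assumed to exist already:

# Sketch only: `tf_agent` and `py_env` are not defined in these examples.
from tf_agents.policies import py_tf_eager_policy

eager_py_policy = py_tf_eager_policy.PyTFEagerPolicy(
    tf_agent.collect_policy, use_tf_function=True)
time_step = py_env.reset()
policy_step = eager_py_policy.action(time_step)
time_step = py_env.step(policy_step.action)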
Example #7
  def __init__(self, output_path, tensor_data_spec, py_mode=False):
    """Creates observer object.

    Args:
      output_path: The path to the TFRecords file.
      tensor_data_spec: Nested list/tuple or dict of TensorSpecs, describing the
        shape of the non-batched Tensors.
      py_mode: Whether the observer is being used in a py_driver.

    Raises:
      ValueError: if the tensors and specs have incompatible dimensions or
        shapes.
    """
    self._py_mode = py_mode
    self._array_data_spec = tensor_spec.to_nest_array_spec(
        tensor_data_spec)
    self._encoder = example_encoding.get_example_serializer(
        self._array_data_spec)
    # Two output files: a tfrecord file and a file with the serialized spec
    self.output_path = output_path
    tf.io.gfile.makedirs(os.path.dirname(self.output_path))
    self._writer = tf.io.TFRecordWriter(self.output_path)
    logging.info('Writing dataset to TFRecord at %s', self.output_path)
    # Save the tensor spec used to write the dataset to file
    spec_output_path = self.output_path + _SPEC_FILE_EXTENSION
    encode_spec_to_file(spec_output_path, tensor_data_spec)
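An observer like the one in Example #7 is typically handed to a driver so that every collected trajectory is serialized to the TFRecord file. A minimal wiring sketch, assuming the class above is named `TFRecordObserver` and that `py_env`, `py_policy`, and `collect_data_spec` exist (all of these names are hypothetical):

# Hypothetical wiring; class, env, policy, spec and path names are placeholders.
from tf_agents.drivers import py_driver

observer = TFRecordObserver(
    '/tmp/collect/data.tfrecord', collect_data_spec, py_mode=True)
driver = py_driver.PyDriver(
    py_env, py_policy, observers=[observer], max_steps=100)
driver.run(py_env.reset())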
Example #8
  def setUp(self):
    super(ReverbReplayBufferTest, self).setUp()

    # Prepare the environment (and the corresponding specs).
    self._env = test_envs.EpisodeCountingEnv(steps_per_episode=3)
    tensor_time_step_spec = tf.nest.map_structure(tensor_spec.from_spec,
                                                  self._env.time_step_spec())
    tensor_action_spec = tensor_spec.from_spec(self._env.action_spec())
    self._data_spec = trajectory.Trajectory(
        step_type=tensor_time_step_spec.step_type,
        observation=tensor_time_step_spec.observation,
        action=tensor_action_spec,
        policy_info=(),
        next_step_type=tensor_time_step_spec.step_type,
        reward=tensor_time_step_spec.reward,
        discount=tensor_time_step_spec.discount,
    )
    table_spec = tf.nest.map_structure(
        lambda s: tf.TensorSpec(dtype=s.dtype, shape=(None,) + s.shape),
        self._data_spec)
    self._array_data_spec = tensor_spec.to_nest_array_spec(self._data_spec)

    # Initialize and start a Reverb server (and set up a client to it).
    self._table_name = 'test_table'
    uniform_table = reverb.Table(
        self._table_name,
        max_size=100,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1),
        signature=table_spec,
    )
    self._server = reverb.Server([uniform_table])
    self._py_client = reverb.Client('localhost:{}'.format(self._server.port))
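The setUp above only builds the specs and starts the Reverb server; in the tests the tensor spec and the py client would then be used to build the replay buffer and an observer that writes trajectories into the table. A rough continuation sketch under those assumptions (the sequence length of 2 is illustrative, and the exact constructor arguments should be checked against the installed TF-Agents version):

# Illustrative continuation of the setUp above; not part of the original test.
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    self._data_spec,
    table_name=self._table_name,
    sequence_length=2,
    local_server=self._server)
observer = reverb_utils.ReverbAddTrajectoryObserver(
    self._py_client, self._table_name, sequence_length=2)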
Example #9
    def __init__(self, playerIndex, debug=False, create_model=True):
        """ Initialize an agent. """

        super().__init__(playerIndex, debug=debug)
        self.trainable = True

        # Whether to use small numbers for debugging reasons
        self.use_small_numbers = use_small_nums

        # Hyperparameters
        self.alpha = 0.01  # learning rate
        self.gamma = 0.95  # favour future rewards
        self.exploration_decay_rate = 1 / 2000
        self.reward_win_round = 0.005
        self.reward_per_card_played = 0.001
        self.rewards = {
            0: 1.0,  # No other agent finished before
            1: 0.05,  # One other agent finished before
            2: 0.04,  # Two other agents finished before
            3: -1.0,  # Three other agents finished before
        }

        # Training/Batch parameters
        self.sample_batch = 64 if self.use_small_numbers else 512
        self.replay_capacity = 128 if self.use_small_numbers else 1024
        self.train_each_n_steps = 5 if self.use_small_numbers else 50
        self.step_iteration = 0
        self.model_data_spec = (  # TODO adjust to new model
            tf.TensorSpec([4 * 13], tf.int8, "board_state"),
            tf.TensorSpec([1], tf.float32, "q_value"),
        )
        self.replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
            capacity=self.replay_capacity,
            data_spec=tensor_spec.to_nest_array_spec(self.model_data_spec)
        )

        # Validation parameters
        self.val_replay_capacity = 20 if self.use_small_numbers else 200
        self.validation_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
            capacity=self.val_replay_capacity,
            data_spec=tensor_spec.to_nest_array_spec(self.model_data_spec)
        )

        # Initialize model
        if create_model:
            self._create_model()
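Because `PyUniformReplayBuffer` stores NumPy arrays, it is constructed with the array version of the spec, which is why Example #9 converts `model_data_spec` with `to_nest_array_spec`. A hedged sketch of how such a buffer is typically filled and read back; the array shapes are placeholders matching the spec above, and the exact add/sample semantics should be verified against the installed TF-Agents version:

import numpy as np

# Items carry a leading batch dimension when added with add_batch.
board_state = np.zeros((1, 4 * 13), dtype=np.int8)
q_value = np.zeros((1, 1), dtype=np.float32)
self.replay_buffer.add_batch((board_state, q_value))

# Sample training batches back out as a tf.data.Dataset.
dataset = self.replay_buffer.as_dataset(sample_batch_size=self.sample_batch)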
Example #10
def get_episode_spec(traj_spec, max_episode_len):
    traj_arr_spec = tensor_spec.to_nest_array_spec(traj_spec)
    traj_batch_arr_spec = get_batched_spec(traj_arr_spec, max_episode_len)
    observation_spec = traj_batch_arr_spec.observation['pixels']
    metric_spec = array_spec.BoundedArraySpec(shape=(max_episode_len,
                                                     max_episode_len),
                                              dtype=np.float32,
                                              minimum=0.)
    return tensor_spec.from_spec(
        (observation_spec, observation_spec, metric_spec))
Example #11
  def _specs_from_collect_data_spec(self, policy_specs):
    policy_specs = tensor_spec.to_nest_array_spec(policy_specs)
    collect_data_spec = policy_specs['collect_data_spec']
    policy_state_spec = policy_specs['policy_state_spec']
    time_step_spec = ts.TimeStep(
        step_type=collect_data_spec.step_type,
        reward=collect_data_spec.reward,
        discount=collect_data_spec.discount,
        observation=collect_data_spec.observation)
    action_spec = collect_data_spec.action
    info_spec = collect_data_spec.policy_info
    return time_step_spec, action_spec, policy_state_spec, info_spec
Example #12
def specs_from_collect_data_spec(
    loaded_policy_specs: types.NestedTensorSpec
) -> Dict[types.NestedSpec, types.NestedSpec]:
  """Creates policy specs from specs loaded from disk.

  The PolicySaver saves policy specs next to the saved model as
  a `struct.StructuredValue` proto. This recreates the
  original specs from the proto.

  Pass the proto loaded from the file with `tensor_spec.from_pbtxt_file()`
  to this function.

  Args:
     loaded_policy_specs: `struct.StructuredValue` proto that had been
       previously created by PolicySaver as a pbtxt.

  Returns:
    A dict with specs extracted from the proto. The dict contains the following
    keys and values. Except `time_step_spec` all the specs are nests of
    `ArraySpecs`.
       * `collect_data_spec`: Collect data spec for the policy.
       * `time_step_spec`: `TimeStepSpec` for the policy.
       * `action_spec`:  Action spec for the policy
       * `policy_state_spec`: State spec for the policy.
       * `info_spec`: Info spec for the policy.
  """
  policy_specs = tensor_spec.to_nest_array_spec(loaded_policy_specs)
  collect_data_spec = policy_specs['collect_data_spec']
  policy_state_spec = policy_specs['policy_state_spec']
  time_step_spec = ts.TimeStep(
      step_type=collect_data_spec.step_type,
      reward=collect_data_spec.reward,
      discount=collect_data_spec.discount,
      observation=collect_data_spec.observation)
  action_spec = collect_data_spec.action
  info_spec = collect_data_spec.policy_info
  return dict(
      collect_data_spec=collect_data_spec,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      policy_state_spec=policy_state_spec,
      info_spec=info_spec)
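The docstring of Example #12 already names the loading helper; a short usage sketch, with a hypothetical path to the pbtxt file that `PolicySaver` is assumed to have written next to the saved model:

# Hypothetical path; the pbtxt is assumed to have been written by PolicySaver.
loaded_specs = tensor_spec.from_pbtxt_file(
    '/tmp/saved_policy/policy_specs.pbtxt')
specs = specs_from_collect_data_spec(loaded_specs)
time_step_spec = specs['time_step_spec']
action_spec = specs['action_spec']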
Example #13
  def __init__(self,
               output_path,
               tensor_data_spec,
               py_mode=False,
               compress_image=False,
               image_quality=95):
    """Creates observer object.

    Args:
      output_path: The path to the TFRecords file.
      tensor_data_spec: Nested list/tuple or dict of TensorSpecs, describing the
        shape of the non-batched Tensors.
      py_mode: Whether the observer is being used in a py_driver.
      compress_image: Whether to compress image. It is assumed that any uint8
        tensor of rank 3 with shape (w,h,c) is an image.
      image_quality: An optional int. Defaults to 95. Quality of the compression
        from 0 to 100 (higher is better and slower).

    Raises:
      ValueError: if the tensors and specs have incompatible dimensions or
        shapes.
    """
    self._py_mode = py_mode
    self._array_data_spec = tensor_spec.to_nest_array_spec(
        tensor_data_spec)
    self._encoder = example_encoding.get_example_serializer(
        self._array_data_spec,
        compress_image=compress_image,
        image_quality=image_quality)
    # Two output files: a tfrecord file and a file with the serialized spec
    self.output_path = output_path
    tf.io.gfile.makedirs(os.path.dirname(self.output_path))
    self._writer = tf.io.TFRecordWriter(self.output_path)
    logging.info('Writing dataset to TFRecord at %s', self.output_path)
    # Save the tensor spec used to write the dataset to file
    spec_output_path = self.output_path + _SPEC_FILE_EXTENSION
    encode_spec_to_file(spec_output_path, tensor_data_spec)
Example #14
def train_eval(
        root_dir,
        env_name='CartPole-v0',
        num_iterations=100000,
        fc_layer_params=(100, ),
        # Params for collect
        initial_collect_steps=1000,
        collect_steps_per_iteration=1,
        epsilon_greedy=0.1,
        replay_buffer_capacity=100000,
        # Params for target update
        target_update_tau=0.05,
        target_update_period=5,
        # Params for train
        train_steps_per_iteration=1,
        batch_size=64,
        learning_rate=1e-3,
        n_step_update=1,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        # Params for eval
        num_eval_episodes=10,
        eval_interval=1000,
        # Params for checkpoints, summaries and logging
        train_checkpoint_interval=10000,
        policy_checkpoint_interval=5000,
        log_interval=1000,
        summaries_flush_secs=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None):
    """A simple train and eval for DQN."""
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        py_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        py_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
    ]

    # Note this is a python environment.
    env = batched_py_environment.BatchedPyEnvironment(
        [suite_gym.load(env_name)])
    eval_py_env = suite_gym.load(env_name)

    # Convert specs to BoundedTensorSpec.
    action_spec = tensor_spec.from_spec(env.action_spec())
    observation_spec = tensor_spec.from_spec(env.observation_spec())
    time_step_spec = ts.time_step_spec(observation_spec)

    q_net = q_network.QNetwork(tensor_spec.from_spec(env.observation_spec()),
                               tensor_spec.from_spec(env.action_spec()),
                               fc_layer_params=fc_layer_params)

    # The agent must be in graph.
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = dqn_agent.DqnAgent(
        time_step_spec,
        action_spec,
        q_network=q_net,
        epsilon_greedy=epsilon_greedy,
        n_step_update=n_step_update,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)

    tf_collect_policy = agent.collect_policy
    collect_policy = py_tf_policy.PyTFPolicy(tf_collect_policy)
    greedy_policy = py_tf_policy.PyTFPolicy(agent.policy)
    random_policy = random_py_policy.RandomPyPolicy(env.time_step_spec(),
                                                    env.action_spec())

    # Python replay buffer.
    replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
        capacity=replay_buffer_capacity,
        data_spec=tensor_spec.to_nest_array_spec(agent.collect_data_spec))

    time_step = env.reset()

    # Initialize the replay buffer with some transitions. We use the random
    # policy to initialize the replay buffer to make sure we get a good
    # distribution of actions.
    for _ in range(initial_collect_steps):
        time_step = collect_step(env, time_step, random_policy, replay_buffer)

    # TODO(b/112041045) Use global_step as counter.
    train_checkpointer = common.Checkpointer(ckpt_dir=train_dir,
                                             agent=agent,
                                             global_step=global_step)

    policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        train_dir, 'policy'),
                                              policy=agent.policy,
                                              global_step=global_step)

    ds = replay_buffer.as_dataset(sample_batch_size=batch_size,
                                  num_steps=n_step_update + 1)
    ds = ds.prefetch(4)
    itr = tf.compat.v1.data.make_initializable_iterator(ds)

    experience = itr.get_next()

    train_op = common.function(agent.train)(experience)

    with eval_summary_writer.as_default(), \
         tf.compat.v2.summary.record_if(True):
        for eval_metric in eval_metrics:
            eval_metric.tf_summaries(train_step=global_step)

    with tf.compat.v1.Session() as session:
        train_checkpointer.initialize_or_restore(session)
        common.initialize_uninitialized_variables(session)
        session.run(itr.initializer)
        # Copy critic network values to the target critic network.
        session.run(agent.initialize())
        train = session.make_callable(train_op)
        global_step_call = session.make_callable(global_step)
        session.run(train_summary_writer.init())
        session.run(eval_summary_writer.init())

        # Compute initial evaluation metrics.
        global_step_val = global_step_call()
        metric_utils.compute_summaries(
            eval_metrics,
            eval_py_env,
            greedy_policy,
            num_episodes=num_eval_episodes,
            global_step=global_step_val,
            log=True,
            callback=eval_metrics_callback,
        )

        timed_at_step = global_step_val
        collect_time = 0
        train_time = 0
        steps_per_second_ph = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(),
                                                       name='steps_per_sec_ph')
        steps_per_second_summary = tf.compat.v2.summary.scalar(
            name='global_steps_per_sec',
            data=steps_per_second_ph,
            step=global_step)

        for _ in range(num_iterations):
            start_time = time.time()
            for _ in range(collect_steps_per_iteration):
                time_step = collect_step(env, time_step, collect_policy,
                                         replay_buffer)
            collect_time += time.time() - start_time
            start_time = time.time()
            for _ in range(train_steps_per_iteration):
                loss = train()
            train_time += time.time() - start_time
            global_step_val = global_step_call()
            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             loss.loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                session.run(steps_per_second_summary,
                            feed_dict={steps_per_second_ph: steps_per_sec})
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info(
                    '%s', 'collect_time = {}, train_time = {}'.format(
                        collect_time, train_time))
                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

            if global_step_val % train_checkpoint_interval == 0:
                train_checkpointer.save(global_step=global_step_val)

            if global_step_val % policy_checkpoint_interval == 0:
                policy_checkpointer.save(global_step=global_step_val)

            if global_step_val % eval_interval == 0:
                metric_utils.compute_summaries(
                    eval_metrics,
                    eval_py_env,
                    greedy_policy,
                    num_episodes=num_eval_episodes,
                    global_step=global_step_val,
                    log=True,
                    callback=eval_metrics_callback,
                )
                # Reset timing to avoid counting eval time.
                timed_at_step = global_step_val
                start_time = time.time()
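Example #14 is a complete graph-mode DQN training loop; only `root_dir` is required, and every other argument keeps the default shown in the signature. A minimal invocation sketch (the output directory is a placeholder):

# Placeholder output directory; checkpoints and summaries are written under it.
train_eval('/tmp/dqn_cartpole', num_iterations=10000, eval_interval=500)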