def __init__(self, policy, batch_size=None, seed=None):
  """Initializes a new `PyTFPolicy`.

  Args:
    policy: A TF Policy implementing `tf_policy.Base`.
    batch_size: (deprecated)
    seed: Seed to use if policy performs random actions (optional).
  """
  if not isinstance(policy, tf_policy.Base):
    logging.warning('Policy should implement tf_policy.Base')

  if batch_size is not None:
    logging.warning('In PyTFPolicy constructor, `batch_size` is deprecated, '
                    'this parameter has no effect. This argument will be '
                    'removed on 2019-05-01')

  time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec)
  action_spec = tensor_spec.to_nest_array_spec(policy.action_spec)
  super(PyTFPolicy, self).__init__(
      time_step_spec, action_spec, policy_state_spec=(), info_spec=())

  self._tf_policy = policy
  self.session = None

  self._policy_state_spec = tensor_spec.to_nest_array_spec(
      self._tf_policy.policy_state_spec)
  self._batch_size = None
  self._batched = None
  self._seed = seed
  self._built = False
def testBuild(self):
  policy = py_tf_policy.PyTFPolicy(self._tf_policy)
  expected_time_step_spec = ts.time_step_spec(
      tensor_spec.to_nest_array_spec(self._obs_spec))
  expected_action_spec = tensor_spec.to_nest_array_spec(self._action_spec)
  self.assertEqual(expected_time_step_spec, policy.time_step_spec())
  self.assertEqual(expected_action_spec, policy.action_spec())
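All of these wrappers lean on the same conversion: `tensor_spec.to_nest_array_spec` maps a nest of `TensorSpec`/`BoundedTensorSpec` into the matching nest of `ArraySpec`/`BoundedArraySpec`, so numpy-based code can be validated against the same structure the TF policy uses. A small, self-contained illustration (the spec values are made up for the example):

import tensorflow as tf
from tf_agents.specs import tensor_spec

# A nest of TF specs, e.g. as exposed by a TF policy or environment.
nest_of_tensor_specs = {
    'observation': tf.TensorSpec([4], tf.float32),
    'action': tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1),
}

# Convert the whole nest to numpy-friendly array specs.
nest_of_array_specs = tensor_spec.to_nest_array_spec(nest_of_tensor_specs)

# Each leaf is now an ArraySpec / BoundedArraySpec with numpy dtypes,
# suitable for python policies, drivers and replay buffers.
print(nest_of_array_specs['observation'])
print(nest_of_array_specs['action'])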
def __init__(self, policy, batch_size=None, seed=None):
  """Initializes a new `PyTFPolicy`.

  Args:
    policy: A TF Policy implementing `tf_policy.Base`.
    batch_size: (deprecated)
    seed: Seed to use if policy performs random actions (optional).
  """
  if not isinstance(policy, tf_policy.Base):
    logging.warning('Policy should implement tf_policy.Base')

  time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec())
  action_spec = tensor_spec.to_nest_array_spec(policy.action_spec())
  super(PyTFPolicy, self).__init__(
      time_step_spec, action_spec, policy_state_spec=(), info_spec=())

  self._tf_policy = policy
  self.session = None

  self._policy_state_spec = tensor_spec.to_nest_array_spec(
      self._tf_policy.policy_state_spec())
  self._batch_size = None
  self._batched = None
  self._seed = seed
  self._built = False
def __init__(self, policy, batch_size=None, seed=None):
  """Initializes a new `PyTFPolicy`.

  Args:
    policy: A TF Policy implementing `tf_policy.Base`.
    batch_size: The batch size of time_steps and actions.
    seed: Seed to use if policy performs random actions (optional).
  """
  if not isinstance(policy, tf_policy.Base):
    tf.logging.warning('Policy should implement tf_policy.Base')

  self._tf_policy = policy
  self.session = None

  self._time_step_spec = tensor_spec.to_nest_array_spec(
      self._tf_policy.time_step_spec())
  self._action_spec = tensor_spec.to_nest_array_spec(
      self._tf_policy.action_spec())
  self._policy_state_spec = tensor_spec.to_nest_array_spec(
      self._tf_policy.policy_state_spec())

  self._batch_size = batch_size
  self._seed = seed
  self._batched = batch_size is not None
  self._set_up_feeds_and_fetches()
def __init__(self, policy, use_tf_function=False):
  time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec)
  action_spec = tensor_spec.to_nest_array_spec(policy.action_spec)
  policy_state_spec = tensor_spec.to_nest_array_spec(policy.policy_state_spec)
  info_spec = tensor_spec.to_nest_array_spec(policy.info_spec)
  super(PyTFEagerPolicy, self).__init__(policy, time_step_spec, action_spec,
                                        policy_state_spec, info_spec,
                                        use_tf_function)
def __init__(self,
             policy: tf_policy.TFPolicy,
             use_tf_function: bool = False,
             batch_time_steps=True):
  time_step_spec = tensor_spec.to_nest_array_spec(policy.time_step_spec)
  action_spec = tensor_spec.to_nest_array_spec(policy.action_spec)
  policy_state_spec = tensor_spec.to_nest_array_spec(policy.policy_state_spec)
  info_spec = tensor_spec.to_nest_array_spec(policy.info_spec)
  super(PyTFEagerPolicy, self).__init__(policy, time_step_spec, action_spec,
                                        policy_state_spec, info_spec,
                                        use_tf_function, batch_time_steps)
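In eager/TF2 code the conversion works without a session: `PyTFEagerPolicy` exposes a TF policy through the array-spec interface so it can drive a pure-Python driver. A minimal sketch of the usual wiring, assuming the standard `py_driver.PyDriver` API; the `agent` object is assumed to have been built elsewhere:

from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.policies import py_tf_eager_policy

env = suite_gym.load('CartPole-v0')
# Wrap the TF collect policy so it accepts/returns numpy time steps.
collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
    agent.collect_policy, use_tf_function=True)

collected = []  # any callable taking a Trajectory works as an observer
driver = py_driver.PyDriver(
    env, collect_policy, observers=[collected.append], max_steps=100)
driver.run(env.reset())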
def __init__(self, output_path, tensor_data_spec, py_mode=False):
  """Creates observer object.

  Args:
    output_path: The path to the TFRecords file.
    tensor_data_spec: Nested list/tuple or dict of TensorSpecs, describing the
      shape of the non-batched Tensors.
    py_mode: Whether the observer is being used in a py_driver.

  Raises:
    ValueError: if the tensors and specs have incompatible dimensions or
      shapes.
  """
  self._py_mode = py_mode
  self._array_data_spec = tensor_spec.to_nest_array_spec(tensor_data_spec)
  self._encoder = example_encoding.get_example_serializer(
      self._array_data_spec)
  # Two output files: a tfrecord file and a file with the serialized spec.
  self.output_path = output_path
  tf.io.gfile.makedirs(os.path.dirname(self.output_path))
  self._writer = tf.io.TFRecordWriter(self.output_path)
  logging.info('Writing dataset to TFRecord at %s', self.output_path)
  # Save the tensor spec used to write the dataset to file.
  spec_output_path = self.output_path + _SPEC_FILE_EXTENSION
  encode_spec_to_file(spec_output_path, tensor_data_spec)
def setUp(self):
  super(ReverbReplayBufferTest, self).setUp()

  # Prepare the environment (and the corresponding specs).
  self._env = test_envs.EpisodeCountingEnv(steps_per_episode=3)
  tensor_time_step_spec = tf.nest.map_structure(tensor_spec.from_spec,
                                                self._env.time_step_spec())
  tensor_action_spec = tensor_spec.from_spec(self._env.action_spec())
  self._data_spec = trajectory.Trajectory(
      step_type=tensor_time_step_spec.step_type,
      observation=tensor_time_step_spec.observation,
      action=tensor_action_spec,
      policy_info=(),
      next_step_type=tensor_time_step_spec.step_type,
      reward=tensor_time_step_spec.reward,
      discount=tensor_time_step_spec.discount,
  )
  table_spec = tf.nest.map_structure(
      lambda s: tf.TensorSpec(dtype=s.dtype, shape=(None,) + s.shape),
      self._data_spec)
  self._array_data_spec = tensor_spec.to_nest_array_spec(self._data_spec)

  # Initialize and start a Reverb server (and set up a client to it).
  self._table_name = 'test_table'
  uniform_table = reverb.Table(
      self._table_name,
      max_size=100,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=table_spec,
  )
  self._server = reverb.Server([uniform_table])
  self._py_client = reverb.Client('localhost:{}'.format(self._server.port))
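With the server, table and specs from this setUp in place, the table is typically exposed to TF-Agents through a ReverbReplayBuffer backed by the local server. A rough sketch under that assumption (constructor argument names can differ between TF-Agents versions, so treat this as illustrative rather than exact):

from tf_agents.replay_buffers import reverb_replay_buffer

# The table signature above uses a leading None dimension, i.e. variable-length
# sequences, so no fixed sequence_length is imposed here.
replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    self._data_spec,
    table_name=self._table_name,
    sequence_length=None,
    local_server=self._server)

# Items written via self._py_client can then be sampled back as a tf.data.Dataset.
dataset = replay_buffer.as_dataset(sample_batch_size=1)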
def __init__(self, playerIndex, debug=False, create_model=True):
  """Initialize an agent."""
  super().__init__(playerIndex, debug=debug)
  self.trainable = True

  # Whether to use small numbers for debugging reasons
  self.use_small_numbers = use_small_nums

  # Hyperparameters
  self.alpha = 0.01  # learning rate
  self.gamma = 0.95  # favour future rewards
  self.exploration_decay_rate = 1 / 2000
  self.reward_win_round = 0.005
  self.reward_per_card_played = 0.001
  self.rewards = {
      0: 1.0,    # No other agent finished before
      1: 0.05,   # One other agent finished before
      2: 0.04,   # Two other agents finished before
      3: -1.0,   # Three other agents finished before
  }

  # Training/Batch parameters
  self.sample_batch = 64 if self.use_small_numbers else 512
  self.replay_capacity = 128 if self.use_small_numbers else 1024
  self.train_each_n_steps = 5 if self.use_small_numbers else 50
  self.step_iteration = 0
  self.model_data_spec = (  # TODO adjust to new model
      tf.TensorSpec([4 * 13], tf.int8, "board_state"),
      tf.TensorSpec([1], tf.float32, "q_value"),
  )
  self.replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
      capacity=self.replay_capacity,
      data_spec=tensor_spec.to_nest_array_spec(self.model_data_spec))

  # Validation parameters
  self.val_replay_capacity = 20 if self.use_small_numbers else 200
  self.validation_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
      capacity=self.val_replay_capacity,
      data_spec=tensor_spec.to_nest_array_spec(self.model_data_spec))

  # Initialize model
  if create_model:
    self._create_model()
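The two buffers above store (board_state, q_value) pairs as plain numpy arrays. A sketch of pushing one sample and later drawing a training batch, assuming the standard add_batch/get_next interface of PyUniformReplayBuffer; the use of nest_utils.batch_nested_array to add the outer batch dimension is an illustrative choice, not taken from the original code:

import numpy as np
from tf_agents.utils import nest_utils

# One (board_state, q_value) sample matching model_data_spec.
sample = (np.zeros(4 * 13, dtype=np.int8),
          np.array([0.0], dtype=np.float32))
# add_batch expects an outer batch dimension, so wrap the single sample.
self.replay_buffer.add_batch(nest_utils.batch_nested_array(sample))

# Later, draw a batch for training (assumption: get_next accepts
# sample_batch_size on the python buffer).
batch = self.replay_buffer.get_next(sample_batch_size=self.sample_batch)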
def get_episode_spec(traj_spec, max_episode_len):
  traj_arr_spec = tensor_spec.to_nest_array_spec(traj_spec)
  traj_batch_arr_spec = get_batched_spec(traj_arr_spec, max_episode_len)
  observation_spec = traj_batch_arr_spec.observation['pixels']
  metric_spec = array_spec.BoundedArraySpec(
      shape=(max_episode_len, max_episode_len),
      dtype=np.float32,
      minimum=0.)
  return tensor_spec.from_spec(
      (observation_spec, observation_spec, metric_spec))
def _specs_from_collect_data_spec(self, policy_specs):
  policy_specs = tensor_spec.to_nest_array_spec(policy_specs)
  collect_data_spec = policy_specs['collect_data_spec']
  policy_state_spec = policy_specs['policy_state_spec']
  time_step_spec = ts.TimeStep(
      step_type=collect_data_spec.step_type,
      reward=collect_data_spec.reward,
      discount=collect_data_spec.discount,
      observation=collect_data_spec.observation)
  action_spec = collect_data_spec.action
  info_spec = collect_data_spec.policy_info
  return time_step_spec, action_spec, policy_state_spec, info_spec
def specs_from_collect_data_spec(
    loaded_policy_specs: types.NestedTensorSpec
) -> Dict[types.NestedSpec, types.NestedSpec]:
  """Creates policy specs from specs loaded from disk.

  The PolicySaver saves policy specs next to the saved model as a
  `struct.StructuredValue` proto. This recreates the original specs from the
  proto. Pass the proto loaded from the file with
  `tensor_spec.from_pbtxt_file()` to this function.

  Args:
    loaded_policy_specs: `struct.StructuredValue` proto that had been
      previously created by PolicySaver as a pbtxt.

  Returns:
    A dict with specs extracted from the proto. The dict contains the following
    keys and values. Except `time_step_spec` all the specs are nests of
    `ArraySpecs`.
      * `collect_data_spec`: Collect data spec for the policy.
      * `time_step_spec`: `TimeStepSpec` for the policy.
      * `action_spec`: Action spec for the policy.
      * `policy_state_spec`: State spec for the policy.
      * `info_spec`: Info spec for the policy.
  """
  policy_specs = tensor_spec.to_nest_array_spec(loaded_policy_specs)
  collect_data_spec = policy_specs['collect_data_spec']
  policy_state_spec = policy_specs['policy_state_spec']
  time_step_spec = ts.TimeStep(
      step_type=collect_data_spec.step_type,
      reward=collect_data_spec.reward,
      discount=collect_data_spec.discount,
      observation=collect_data_spec.observation)
  action_spec = collect_data_spec.action
  info_spec = collect_data_spec.policy_info
  return dict(
      collect_data_spec=collect_data_spec,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      policy_state_spec=policy_state_spec,
      info_spec=info_spec)
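As the docstring notes, the input is the spec pbtxt that PolicySaver writes next to the SavedModel, loaded with tensor_spec.from_pbtxt_file(). A minimal sketch of the round trip; the directory and file name here are assumptions, check what your PolicySaver version actually writes:

import os
from tf_agents.specs import tensor_spec

# Hypothetical location: a policy_specs.pbtxt saved alongside the SavedModel.
spec_path = os.path.join(saved_policy_dir, 'policy_specs.pbtxt')
loaded_policy_specs = tensor_spec.from_pbtxt_file(spec_path)

specs = specs_from_collect_data_spec(loaded_policy_specs)
time_step_spec = specs['time_step_spec']
action_spec = specs['action_spec']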
def __init__(self,
             output_path,
             tensor_data_spec,
             py_mode=False,
             compress_image=False,
             image_quality=95):
  """Creates observer object.

  Args:
    output_path: The path to the TFRecords file.
    tensor_data_spec: Nested list/tuple or dict of TensorSpecs, describing the
      shape of the non-batched Tensors.
    py_mode: Whether the observer is being used in a py_driver.
    compress_image: Whether to compress image. It is assumed that any uint8
      tensor of rank 3 with shape (w, h, c) is an image.
    image_quality: An optional int. Defaults to 95. Quality of the compression
      from 0 to 100 (higher is better and slower).

  Raises:
    ValueError: if the tensors and specs have incompatible dimensions or
      shapes.
  """
  self._py_mode = py_mode
  self._array_data_spec = tensor_spec.to_nest_array_spec(tensor_data_spec)
  self._encoder = example_encoding.get_example_serializer(
      self._array_data_spec,
      compress_image=compress_image,
      image_quality=image_quality)
  # Two output files: a tfrecord file and a file with the serialized spec.
  self.output_path = output_path
  tf.io.gfile.makedirs(os.path.dirname(self.output_path))
  self._writer = tf.io.TFRecordWriter(self.output_path)
  logging.info('Writing dataset to TFRecord at %s', self.output_path)
  # Save the tensor spec used to write the dataset to file.
  spec_output_path = self.output_path + _SPEC_FILE_EXTENSION
  encode_spec_to_file(spec_output_path, tensor_data_spec)
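In py_mode this observer is handed to a Python driver like any other observer; every trajectory it receives is serialized straight into the TFRecord file, with the spec written next to it. A sketch, assuming this is the TFRecord observer from tf_agents.utils.example_encoding_dataset and that it exposes a close() method; env, py_policy and agent are assumed to exist:

from tf_agents.drivers import py_driver
from tf_agents.utils import example_encoding_dataset

observer = example_encoding_dataset.TFRecordObserver(
    '/tmp/collect/data.tfrecord',   # output_path; the spec file lands beside it
    agent.collect_data_spec,
    py_mode=True,
    compress_image=True)

driver = py_driver.PyDriver(
    env, py_policy, observers=[observer], max_steps=1000)
driver.run(env.reset())
observer.close()  # assumption: finalizes the underlying TFRecordWriter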
def train_eval(
    root_dir,
    env_name='CartPole-v0',
    num_iterations=100000,
    fc_layer_params=(100,),
    # Params for collect
    initial_collect_steps=1000,
    collect_steps_per_iteration=1,
    epsilon_greedy=0.1,
    replay_buffer_capacity=100000,
    # Params for target update
    target_update_tau=0.05,
    target_update_period=5,
    # Params for train
    train_steps_per_iteration=1,
    batch_size=64,
    learning_rate=1e-3,
    n_step_update=1,
    gamma=0.99,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for eval
    num_eval_episodes=10,
    eval_interval=1000,
    # Params for checkpoints, summaries and logging
    train_checkpoint_interval=10000,
    policy_checkpoint_interval=5000,
    log_interval=1000,
    summaries_flush_secs=10,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None):
  """A simple train and eval for DQN."""
  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      py_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      py_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
  ]

  # Note this is a python environment.
  env = batched_py_environment.BatchedPyEnvironment(
      [suite_gym.load(env_name)])
  eval_py_env = suite_gym.load(env_name)

  # Convert specs to BoundedTensorSpec.
  action_spec = tensor_spec.from_spec(env.action_spec())
  observation_spec = tensor_spec.from_spec(env.observation_spec())
  time_step_spec = ts.time_step_spec(observation_spec)

  q_net = q_network.QNetwork(
      tensor_spec.from_spec(env.observation_spec()),
      tensor_spec.from_spec(env.action_spec()),
      fc_layer_params=fc_layer_params)

  # The agent must be in graph.
  global_step = tf.compat.v1.train.get_or_create_global_step()
  agent = dqn_agent.DqnAgent(
      time_step_spec,
      action_spec,
      q_network=q_net,
      epsilon_greedy=epsilon_greedy,
      n_step_update=n_step_update,
      target_update_tau=target_update_tau,
      target_update_period=target_update_period,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate),
      td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
      gamma=gamma,
      reward_scale_factor=reward_scale_factor,
      gradient_clipping=gradient_clipping,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=global_step)

  tf_collect_policy = agent.collect_policy
  collect_policy = py_tf_policy.PyTFPolicy(tf_collect_policy)
  greedy_policy = py_tf_policy.PyTFPolicy(agent.policy)
  random_policy = random_py_policy.RandomPyPolicy(env.time_step_spec(),
                                                  env.action_spec())

  # Python replay buffer.
  replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
      capacity=replay_buffer_capacity,
      data_spec=tensor_spec.to_nest_array_spec(agent.collect_data_spec))

  time_step = env.reset()

  # Initialize the replay buffer with some transitions. We use the random
  # policy to initialize the replay buffer to make sure we get a good
  # distribution of actions.
  for _ in range(initial_collect_steps):
    time_step = collect_step(env, time_step, random_policy, replay_buffer)

  # TODO(b/112041045) Use global_step as counter.
  train_checkpointer = common.Checkpointer(
      ckpt_dir=train_dir, agent=agent, global_step=global_step)
  policy_checkpointer = common.Checkpointer(
      ckpt_dir=os.path.join(train_dir, 'policy'),
      policy=agent.policy,
      global_step=global_step)

  ds = replay_buffer.as_dataset(
      sample_batch_size=batch_size, num_steps=n_step_update + 1)
  ds = ds.prefetch(4)
  itr = tf.compat.v1.data.make_initializable_iterator(ds)

  experience = itr.get_next()

  train_op = common.function(agent.train)(experience)

  with eval_summary_writer.as_default(), \
       tf.compat.v2.summary.record_if(True):
    for eval_metric in eval_metrics:
      eval_metric.tf_summaries(train_step=global_step)

  with tf.compat.v1.Session() as session:
    train_checkpointer.initialize_or_restore(session)
    common.initialize_uninitialized_variables(session)
    session.run(itr.initializer)
    # Copy critic network values to the target critic network.
    session.run(agent.initialize())
    train = session.make_callable(train_op)
    global_step_call = session.make_callable(global_step)
    session.run(train_summary_writer.init())
    session.run(eval_summary_writer.init())

    # Compute initial evaluation metrics.
    global_step_val = global_step_call()
    metric_utils.compute_summaries(
        eval_metrics,
        eval_py_env,
        greedy_policy,
        num_episodes=num_eval_episodes,
        global_step=global_step_val,
        log=True,
        callback=eval_metrics_callback,
    )

    timed_at_step = global_step_val
    collect_time = 0
    train_time = 0
    steps_per_second_ph = tf.compat.v1.placeholder(
        tf.float32, shape=(), name='steps_per_sec_ph')
    steps_per_second_summary = tf.compat.v2.summary.scalar(
        name='global_steps_per_sec',
        data=steps_per_second_ph,
        step=global_step)

    for _ in range(num_iterations):
      start_time = time.time()
      for _ in range(collect_steps_per_iteration):
        time_step = collect_step(env, time_step, collect_policy, replay_buffer)
      collect_time += time.time() - start_time

      start_time = time.time()
      for _ in range(train_steps_per_iteration):
        loss = train()
      train_time += time.time() - start_time

      global_step_val = global_step_call()

      if global_step_val % log_interval == 0:
        logging.info('step = %d, loss = %f', global_step_val, loss.loss)
        steps_per_sec = ((global_step_val - timed_at_step) /
                         (collect_time + train_time))
        session.run(
            steps_per_second_summary,
            feed_dict={steps_per_second_ph: steps_per_sec})
        logging.info('%.3f steps/sec', steps_per_sec)
        logging.info('%s', 'collect_time = {}, train_time = {}'.format(
            collect_time, train_time))
        timed_at_step = global_step_val
        collect_time = 0
        train_time = 0

      if global_step_val % train_checkpoint_interval == 0:
        train_checkpointer.save(global_step=global_step_val)

      if global_step_val % policy_checkpoint_interval == 0:
        policy_checkpointer.save(global_step=global_step_val)

      if global_step_val % eval_interval == 0:
        metric_utils.compute_summaries(
            eval_metrics,
            eval_py_env,
            greedy_policy,
            num_episodes=num_eval_episodes,
            global_step=global_step_val,
            log=True,
            callback=eval_metrics_callback,
        )
        # Reset timing to avoid counting eval time.
        timed_at_step = global_step_val
        start_time = time.time()
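A train_eval like this is normally wired to absl flags in a small main module. A sketch of that wrapper; the flag names and defaults here are illustrative, not taken from the original example:

from absl import app
from absl import flags
import tensorflow as tf

FLAGS = flags.FLAGS
flags.DEFINE_string('root_dir', '/tmp/dqn_cartpole',
                    'Root directory for summaries and checkpoints.')
flags.DEFINE_integer('num_iterations', 100000, 'Total training iterations.')


def main(_):
  tf.compat.v1.enable_resource_variables()
  train_eval(FLAGS.root_dir, num_iterations=FLAGS.num_iterations)


if __name__ == '__main__':
  app.run(main)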