def __init__(self, name, coord, env, tf_queues, unroll_length, id, actor_id):
    """
    Args:
        name (str): name of the thread
        coord (tf.train.Coordinator): coordinates among threads
        env (TFEnvironment): a TFEnvironment
        tf_queues (TFQueues): an object holding all the tf.FIFOQueues used for
            communicating between threads
        unroll_length (int): each env unrolls for this many steps before
            sending the steps to the learning queue. If the env is batched,
            the total number of steps is `unroll_length` * `batch_size`.
        id (int): an integer that identifies the env thread
        actor_id (int): indicates which actor thread the env thread should
            send time steps to.
    """
    super().__init__(
        name=name, target=self._run, args=(coord, unroll_length))
    self._env = env
    self._tfq = tf_queues
    self._id = id
    self._actor_q = self._tfq.actor_queues[actor_id]
    self._action_return_q = self._tfq.action_return_queues[id]
    self._unroll_queue = self._tfq.env_unroll_queues[id]
    # Strip the batch dimension from the policy step state spec before
    # building the initial policy state for this thread's batched env.
    self._initial_policy_state = common.get_initial_policy_state(
        self._env.batch_size,
        tf.nest.map_structure(
            lambda t: tf.TensorSpec(t.shape[1:], t.dtype),
            self._tfq._policy_step_spec.state))
def __init__(self,
             name,
             coord,
             env,
             tf_queues,
             unroll_length,
             id,
             actor_id,
             first_env_id=None):
    """
    Args:
        name (str): name of the thread
        coord (tf.train.Coordinator): coordinates among threads
        env (TFEnvironment): a TFEnvironment
        tf_queues (TFQueues): an object holding all the tf.FIFOQueues used for
            communicating between threads
        unroll_length (int): each env unrolls for this many steps before
            sending the steps to the learning queue. If the env is batched,
            the total number of steps is `unroll_length` * `batch_size`.
        id (int): an integer that identifies the env thread
        actor_id (int): indicates which actor thread the env thread should
            send time steps to.
        first_env_id (int): the id of the first environment of the batched
            environment `env`. If there are multiple `EnvThread`s,
            `first_env_id` should be set in such a way that the IDs of the
            individual environments are all different. If None, it is assumed
            that all the `env`s have the same batch_size, and `first_env_id`
            is set to `id * env.batch_size`.
    """
    super().__init__(
        name=name, target=self._run, args=(coord, unroll_length))
    self._env = env
    self._tfq = tf_queues
    self._id = id
    if first_env_id is None:
        first_env_id = self._id * env.batch_size
    self._first_env_id = first_env_id
    self._actor_q = self._tfq.actor_queues[actor_id]
    self._action_return_q = self._tfq.action_return_queues[id]
    self._unroll_queue = self._tfq.env_unroll_queues[id]
    self._initial_policy_state = common.get_initial_policy_state(
        self._env.batch_size,
        tf.nest.map_structure(
            lambda t: tf.TensorSpec(t.shape[1:], t.dtype),
            self._tfq._batch_state_spec))
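# Illustrative sketch (not part of the classes above; the helper name is
# hypothetical): how the `first_env_id` default partitions environment IDs
# when several EnvThreads wrap batched environments of the same batch size.
def _default_first_env_ids(num_env_threads, batch_size):
    # Thread i owns environment IDs [i * batch_size, (i + 1) * batch_size).
    return [i * batch_size for i in range(num_env_threads)]


# e.g. 3 env threads with batch_size=4 -> first_env_ids [0, 4, 8]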
def _update(self, experience, weight):
    batch_size = tf.shape(experience.step_type)[1]
    counter = tf.zeros((), tf.int32)
    initial_train_state = common.get_initial_policy_state(
        batch_size, self.train_state_spec)
    if self._use_rollout_state:
        first_train_state = tf.nest.map_structure(
            lambda state: state[0, ...], experience.state)
    else:
        first_train_state = initial_train_state
    num_steps = tf.shape(experience.step_type)[0]

    def create_ta(s):
        # TensorArray cannot use a Tensor (batch_size) as element_shape
        ta_batch_size = experience.step_type.shape[1]
        return tf.TensorArray(
            dtype=s.dtype,
            size=num_steps,
            element_shape=tf.TensorShape([ta_batch_size]).concatenate(s.shape))

    experience_ta = tf.nest.map_structure(
        create_ta,
        nest_utils.to_distribution_param_spec(self.processed_experience_spec))
    experience_ta = tf.nest.map_structure(
        lambda elem, ta: ta.unstack(elem), experience, experience_ta)
    info_ta = tf.nest.map_structure(
        create_ta,
        nest_utils.to_distribution_param_spec(self.train_step_info_spec))

    scope = get_current_scope()

    def _train_loop_body(counter, policy_state, info_ta):
        exp = tf.nest.map_structure(lambda ta: ta.read(counter), experience_ta)
        exp = nest_utils.params_to_distributions(
            exp, self.processed_experience_spec)
        policy_state = common.reset_state_if_necessary(
            policy_state, initial_train_state,
            tf.equal(exp.step_type, StepType.FIRST))

        with tf.name_scope(scope):
            policy_step = self.train_step(exp, policy_state)

        info_ta = tf.nest.map_structure(
            lambda ta, x: ta.write(counter, x), info_ta,
            nest_utils.distributions_to_params(policy_step.info))

        counter += 1

        return [counter, policy_step.state, info_ta]

    with tf.GradientTape(
            persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(self.trainable_variables)
        [_, _, info_ta] = tf.while_loop(
            cond=lambda counter, *_: tf.less(counter, num_steps),
            body=_train_loop_body,
            loop_vars=[counter, first_train_state, info_ta],
            back_prop=True,
            name="train_loop")

    info = tf.nest.map_structure(lambda ta: ta.stack(), info_ta)
    info = nest_utils.params_to_distributions(info, self.train_step_info_spec)
    experience = nest_utils.params_to_distributions(
        experience, self.processed_experience_spec)
    training_info = TrainingInfo(
        action=experience.action,
        reward=experience.reward,
        discount=experience.discount,
        step_type=experience.step_type,
        rollout_info=experience.rollout_info,
        info=info,
        env_id=experience.env_id)

    loss_info, grads_and_vars = self.train_complete(
        tape=tape, training_info=training_info, weight=weight)

    del tape

    return training_info, loss_info, grads_and_vars
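# A minimal, self-contained sketch of the unroll pattern `_update` uses above
# (an illustration with assumed names, not the framework's code): unstack a
# time-major [T, B, ...] tensor into a TensorArray, step through it with
# tf.while_loop while threading recurrent state, and stack the per-step
# outputs at the end. `cell` stands in for `train_step` and is an assumed
# callable returning (output, next_state).
import tensorflow as tf


def unroll_with_while_loop(inputs, cell, initial_state):
    """inputs: float32 [T, B, ...]; returns (stacked outputs, final state)."""
    num_steps = tf.shape(inputs)[0]
    input_ta = tf.TensorArray(tf.float32, size=num_steps).unstack(inputs)
    output_ta = tf.TensorArray(tf.float32, size=num_steps)

    def body(t, state, output_ta):
        output, state = cell(input_ta.read(t), state)
        return [t + 1, state, output_ta.write(t, output)]

    _, final_state, output_ta = tf.while_loop(
        cond=lambda t, *_: tf.less(t, num_steps),
        body=body,
        loop_vars=[tf.zeros((), tf.int32), initial_state, output_ta])
    return output_ta.stack(), final_state


# Example usage with a trivial stand-in cell that adds the carried state:
# outputs, state = unroll_with_while_loop(
#     tf.random.normal([5, 2, 3]),
#     cell=lambda x, s: (x + s, s),
#     initial_state=tf.zeros([2, 3]))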
def _update(self, experience, weight):
    batch_size = experience.step_type.shape[1]
    counter = tf.zeros((), tf.int32)
    initial_train_state = common.get_initial_policy_state(
        batch_size, self._algorithm.train_state_spec)
    num_steps = experience.step_type.shape[0]

    def create_ta(s):
        return tf.TensorArray(
            dtype=s.dtype,
            size=num_steps,
            element_shape=tf.TensorShape([batch_size]).concatenate(s.shape))

    experience_ta = tf.nest.map_structure(create_ta,
                                          self._processed_experience_spec)
    experience_ta = tf.nest.map_structure(
        lambda elem, ta: ta.unstack(elem), experience, experience_ta)
    training_info_ta = tf.nest.map_structure(create_ta,
                                             self._training_info_spec)

    def _train_loop_body(counter, policy_state, training_info_ta):
        exp = tf.nest.map_structure(lambda ta: ta.read(counter), experience_ta)
        collect_action_distribution_param = exp.action_distribution
        collect_action_distribution = nested_distributions_from_specs(
            self._action_distribution_spec, collect_action_distribution_param)
        exp = exp._replace(action_distribution=collect_action_distribution)

        policy_state = common.reset_state_if_necessary(
            policy_state, initial_train_state,
            tf.equal(exp.step_type, StepType.FIRST))

        policy_step = common.algorithm_step(self._algorithm,
                                            self._observation_transformer,
                                            exp,
                                            policy_state,
                                            training=True)
        action_dist_param = common.get_distribution_params(policy_step.action)

        training_info = make_training_info(
            action=exp.action,
            action_distribution=action_dist_param,
            reward=exp.reward,
            discount=exp.discount,
            step_type=exp.step_type,
            info=policy_step.info,
            collect_info=exp.info,
            collect_action_distribution=collect_action_distribution_param)

        training_info_ta = tf.nest.map_structure(
            lambda ta, x: ta.write(counter, x), training_info_ta,
            training_info)

        counter += 1

        return [counter, policy_step.state, training_info_ta]

    with tf.GradientTape(
            persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(self._trainable_variables)
        [_, _, training_info_ta] = tf.while_loop(
            cond=lambda counter, *_: tf.less(counter, num_steps),
            body=_train_loop_body,
            loop_vars=[counter, initial_train_state, training_info_ta],
            back_prop=True,
            name="train_loop")

    training_info = tf.nest.map_structure(lambda ta: ta.stack(),
                                          training_info_ta)

    action_distribution = nested_distributions_from_specs(
        self._action_distribution_spec, training_info.action_distribution)
    collect_action_distribution = nested_distributions_from_specs(
        self._action_distribution_spec,
        training_info.collect_action_distribution)
    training_info = training_info._replace(
        action_distribution=action_distribution,
        collect_action_distribution=collect_action_distribution)

    loss_info, grads_and_vars = self._algorithm.train_complete(
        tape=tape, training_info=training_info, weight=weight)

    del tape

    return training_info, loss_info, grads_and_vars
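# Hedged sketch of the state-reset idiom both `_update` variants rely on via
# `common.reset_state_if_necessary` (an assumed equivalent, not the actual
# implementation): wherever `step_type == FIRST`, replace the carried
# recurrent state with the initial state so one episode's state does not
# leak into the next.
import tensorflow as tf


def reset_state_where_first(state, initial_state, reset_mask):
    """state/initial_state: nests of [B, ...] tensors; reset_mask: bool [B]."""

    def _reset(s, init):
        # Broadcast the [B] mask against the trailing state dimensions.
        mask = tf.reshape(reset_mask, [-1] + [1] * (len(s.shape) - 1))
        return tf.where(mask, init, s)

    return tf.nest.map_structure(_reset, state, initial_state)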
def _prepare_specs(self, algorithm):
    """Prepare various tensor specs."""

    def extract_spec(nest):
        return tf.nest.map_structure(
            lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)

    time_step = self.get_initial_time_step()
    self._time_step_spec = extract_spec(time_step)
    self._action_spec = self._env.action_spec()

    policy_step = algorithm.predict(time_step, self._initial_state)
    info_spec = extract_spec(policy_step.info)
    self._pred_policy_step_spec = PolicyStep(
        action=self._action_spec,
        state=algorithm.predict_state_spec,
        info=info_spec)

    def _to_distribution_spec(spec):
        if isinstance(spec, tf.TensorSpec):
            return DistributionSpec(
                tfp.distributions.Deterministic,
                input_params_spec={"loc": spec},
                sample_spec=spec)
        return spec

    self._action_distribution_spec = tf.nest.map_structure(
        _to_distribution_spec, algorithm.action_distribution_spec)
    self._action_dist_param_spec = tf.nest.map_structure(
        lambda spec: spec.input_params_spec, self._action_distribution_spec)

    self._experience_spec = Experience(
        step_type=self._time_step_spec.step_type,
        reward=self._time_step_spec.reward,
        discount=self._time_step_spec.discount,
        observation=self._time_step_spec.observation,
        prev_action=self._action_spec,
        action=self._action_spec,
        info=info_spec,
        action_distribution=self._action_dist_param_spec)

    action_dist_params = common.zero_tensor_from_nested_spec(
        self._experience_spec.action_distribution, self._env.batch_size)
    action_dist = nested_distributions_from_specs(
        self._action_distribution_spec, action_dist_params)

    exp = Experience(
        step_type=time_step.step_type,
        reward=time_step.reward,
        discount=time_step.discount,
        observation=time_step.observation,
        prev_action=time_step.prev_action,
        action=time_step.prev_action,
        info=policy_step.info,
        action_distribution=action_dist)

    processed_exp = algorithm.preprocess_experience(exp)
    self._processed_experience_spec = self._experience_spec._replace(
        info=extract_spec(processed_exp.info))

    policy_step = common.algorithm_step(
        algorithm,
        ob_transformer=self._observation_transformer,
        time_step=exp,
        state=common.get_initial_policy_state(self._env.batch_size,
                                              algorithm.train_state_spec),
        training=True)
    info_spec = extract_spec(policy_step.info)

    self._training_info_spec = make_training_info(
        action=self._action_spec,
        action_distribution=self._action_dist_param_spec,
        step_type=self._time_step_spec.step_type,
        reward=self._time_step_spec.reward,
        discount=self._time_step_spec.discount,
        info=info_spec,
        collect_info=self._processed_experience_spec.info,
        collect_action_distribution=self._action_dist_param_spec)
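# Hedged illustration of the Deterministic wrapping in `_to_distribution_spec`
# above: a plain (non-stochastic) action can be handled uniformly with
# stochastic actions by viewing it as a tfp Deterministic distribution whose
# only parameter is `loc`. Plain TensorFlow Probability calls, for
# illustration only.
import tensorflow_probability as tfp

dist = tfp.distributions.Deterministic(loc=[0.5, -1.0])
sample = dist.sample()                 # == loc
log_prob = dist.log_prob([0.5, -1.0])  # 0.0 where equal to loc, -inf elsewhere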
def get_initial_policy_state(self):
    """Return the initial policy state.

    Depending on `self._training`, the returned state is built from either
    the train state spec or the prediction state spec.
    """
    return common.get_initial_policy_state(self._env.batch_size,
                                           self._policy_state_spec)
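# A sketch of what building the initial policy state plausibly involves (an
# assumption about `common.get_initial_policy_state`, not its actual
# implementation): zero tensors shaped by prepending the batch size to each
# spec in the nested state spec.
import tensorflow as tf


def zero_state_from_spec(batch_size, state_spec):
    return tf.nest.map_structure(
        lambda spec: tf.zeros([batch_size] + spec.shape.as_list(), spec.dtype),
        state_spec)


# e.g. a two-tensor RNN state for a batch of 4 environments:
# state = zero_state_from_spec(
#     4, (tf.TensorSpec([16], tf.float32), tf.TensorSpec([16], tf.float32)))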
def prepare_off_policy_specs(self, time_step: ActionTimeStep):
    """Prepare various tensor specs for off-policy training.

    prepare_off_policy_specs is called by OffPolicyDriver._prepare_spec().
    """
    self._env_batch_size = time_step.step_type.shape[0]
    self._time_step_spec = common.extract_spec(time_step)
    initial_state = common.get_initial_policy_state(self._env_batch_size,
                                                    self.train_state_spec)
    transformed_timestep = self.transform_timestep(time_step)
    policy_step = self.rollout(transformed_timestep, initial_state)
    info_spec = common.extract_spec(policy_step.info)

    self._action_distribution_spec = tf.nest.map_structure(
        common.to_distribution_spec, self.action_distribution_spec)
    self._action_dist_param_spec = tf.nest.map_structure(
        lambda spec: spec.input_params_spec, self._action_distribution_spec)

    self._experience_spec = Experience(
        step_type=self._time_step_spec.step_type,
        reward=self._time_step_spec.reward,
        discount=self._time_step_spec.discount,
        observation=self._time_step_spec.observation,
        prev_action=self._action_spec,
        action=self._action_spec,
        info=info_spec,
        action_distribution=self._action_dist_param_spec,
        state=self.train_state_spec if self._use_rollout_state else ())

    action_dist_params = common.zero_tensor_from_nested_spec(
        self._experience_spec.action_distribution, self._env_batch_size)
    action_dist = nested_distributions_from_specs(
        self._action_distribution_spec, action_dist_params)

    exp = Experience(
        step_type=time_step.step_type,
        reward=time_step.reward,
        discount=time_step.discount,
        observation=time_step.observation,
        prev_action=time_step.prev_action,
        action=time_step.prev_action,
        info=policy_step.info,
        action_distribution=action_dist,
        state=initial_state if self._use_rollout_state else ())

    transformed_exp = self.transform_timestep(exp)
    processed_exp = self.preprocess_experience(transformed_exp)
    self._processed_experience_spec = self._experience_spec._replace(
        observation=common.extract_spec(processed_exp.observation),
        info=common.extract_spec(processed_exp.info))

    policy_step = common.algorithm_step(
        algorithm_step_func=self.train_step,
        time_step=processed_exp,
        state=initial_state)
    info_spec = common.extract_spec(policy_step.info)

    self._training_info_spec = TrainingInfo(
        action_distribution=self._action_dist_param_spec, info=info_spec)
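# Hedged sketch of the spec-extraction idiom `prepare_off_policy_specs` leans
# on through `common.extract_spec` (presumably the same idiom as the local
# `extract_spec` defined in `_prepare_specs` above): map a nest of batched
# tensors to a nest of per-example TensorSpecs by dropping the batch
# dimension.
import tensorflow as tf


def extract_spec(nest):
    return tf.nest.map_structure(
        lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)


specs = extract_spec({"observation": tf.zeros([8, 84, 84, 3]),
                      "reward": tf.zeros([8])})
# -> {"observation": TensorSpec([84, 84, 3]), "reward": TensorSpec([])}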