def test_step(self):
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load('CartPole-v0'))
    indexed_tf_env = IndexedTFEnv(tf_env, 5)

    # take first action
    a1 = PolicyStep(action=tf.convert_to_tensor([1]), state=(), info=())
    time_step_0 = indexed_tf_env.step(a1)
    self.assertEqual(time_step_0["env_id"], 5)
    self.assertEqual(time_step_0["ts_id"], 0)
    self.assertEqual(time_step_0["reward"], 0)
    self.assertEqual(time_step_0["step_type"], 0)
    self.assertEqual(time_step_0["discount"], 1.0)
    self.assertTrue("ob_0" in time_step_0)
    self.assertTrue("ob_1" in time_step_0)
    self.assertTrue("ob_2" in time_step_0)
    self.assertTrue("ob_3" in time_step_0)

    # take second action
    a2 = PolicyStep(action=tf.convert_to_tensor([0]), state=(), info=())
    time_step_1 = indexed_tf_env.step(a2)
    self.assertEqual(time_step_1["env_id"], 5)
    self.assertEqual(time_step_1["ts_id"], 1)
    self.assertEqual(time_step_1["reward"], 1)
    self.assertEqual(time_step_1["step_type"], 1)
    self.assertEqual(time_step_1["discount"], 1.0)
    self.assertTrue("ob_0" in time_step_1)
    self.assertTrue("ob_1" in time_step_1)
    self.assertTrue("ob_2" in time_step_1)
    self.assertTrue("ob_3" in time_step_1)
def _actor_train_step(self, exp: Experience, state: DdpgActorState):
    action, actor_state = self._actor_network(
        exp.observation, exp.step_type, network_state=state.actor)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(action)
        q_value, critic_state = self._critic_network(
            (exp.observation, action), network_state=state.critic)
    dqda = tape.gradient(q_value, action)

    def actor_loss_fn(dqda, action):
        if self._dqda_clipping:
            dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                    self._dqda_clipping)
        loss = 0.5 * losses.element_wise_squared_loss(
            tf.stop_gradient(dqda + action), action)
        loss = tf.reduce_sum(loss, axis=list(range(1, len(loss.shape))))
        return loss

    actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
    state = DdpgActorState(actor=actor_state, critic=critic_state)
    info = LossInfo(loss=tf.add_n(tf.nest.flatten(actor_loss)),
                    extra=actor_loss)
    return PolicyStep(action=action, state=state, info=info)
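# The actor_loss_fn above uses the standard DPG surrogate: regressing `action`
# toward `stop_gradient(dqda + action)` yields a loss whose gradient w.r.t. the
# actor parameters is -dqda * d(action)/d(params), i.e. gradient ascent on the
# critic's Q value through the deterministic action. Below is a minimal,
# self-contained check of that identity in plain TensorFlow; the one-layer
# "actor" and the fixed quadratic "critic" are invented for illustration and
# are not the networks used above.
import tensorflow as tf

actor = tf.keras.layers.Dense(2, use_bias=False)  # toy deterministic actor
obs = tf.random.normal([4, 3])                    # batch of 4 observations

# Surrogate-loss gradient, as in the DPG actor step.
with tf.GradientTape() as outer_tape:
    action = actor(obs)
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(action)
        # Toy critic Q(a) = -sum((a - 1)^2), so dQ/da = -2 * (a - 1).
        q = -tf.reduce_sum(tf.square(action - 1.0), axis=-1)
    dqda = tape.gradient(q, action)
    loss = 0.5 * tf.reduce_sum(
        tf.square(tf.stop_gradient(dqda + action) - action))
surrogate_grad = outer_tape.gradient(loss, actor.trainable_variables)

# Direct chain-rule gradient of -Q for comparison.
with tf.GradientTape() as direct_tape:
    action = actor(obs)
    direct_loss = tf.reduce_sum(tf.square(action - 1.0))
direct_grad = direct_tape.gradient(direct_loss, actor.trainable_variables)

# The two gradients coincide, which is the point of the surrogate.
for g_s, g_d in zip(surrogate_grad, direct_grad):
    tf.debugging.assert_near(g_s, g_d)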
def action(self, time_step):
    observation = time_step.observation
    batch_size = observation.shape[0]
    action = tf.constant(self._action, dtype=tf.float32,
                         shape=[batch_size, 1])
    return PolicyStep(action=action)
def rollout(self, time_step: ActionTimeStep, state: AgentState):
    """Rollout for one step."""
    new_state = AgentState()
    info = AgentInfo()
    observation = self._encode(time_step)

    if self._icm is not None:
        icm_step = self._icm.train_step(
            (observation, time_step.prev_action), state=state.icm)
        info = info._replace(icm=icm_step.info)
        new_state = new_state._replace(icm=icm_step.state)

    rl_step = self._rl_algorithm.rollout(
        time_step._replace(observation=observation), state.rl)
    new_state = new_state._replace(rl=rl_step.state)
    info = info._replace(rl=rl_step.info)

    # TODO: avoid computing this when rollout (off policy train)
    if self._entropy_target_algorithm:
        et_step = self._entropy_target_algorithm.train_step(
            rl_step.action, step_type=time_step.step_type)
        info = info._replace(entropy_target=et_step.info)

    return PolicyStep(action=rl_step.action, state=new_state, info=info)
def predict(self, time_step: ActionTimeStep, state=None):
    observation = self._encode(time_step)
    action_distribution, actor_state = self._actor_network(
        observation, step_type=time_step.step_type, network_state=state)
    return PolicyStep(action=action_distribution, state=actor_state, info=())
def rollout(self, time_step: ActionTimeStep, state, mode, epsilon_greedy=1.0):
    """Rollout for one step.

    Args:
        time_step: time_step.observation should be the latent vector
        state: state of the model
    """
    latent_vector = time_step.observation
    rnn_output, rnn_state = self._rnn(latent_vector, state)
    mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
    policy_input = tf.concat(
        [tf.stop_gradient(latent_vector), rnn_output, mem_readout], axis=-1)
    action_distribution, _ = self._actor_net(
        policy_input, step_type=time_step.step_type, network_state=None)
    value, _ = self._value_net(
        latent_vector, step_type=time_step.step_type, network_state=None)
    info = ActorCriticInfo(action_distribution=action_distribution,
                           value=value)
    action = common.epsilon_greedy_sample(action_distribution,
                                          epsilon_greedy)
    return PolicyStep(action=action, state=rnn_state, info=info)
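# `common.epsilon_greedy_sample` above turns the action distribution into a
# concrete action: with epsilon_greedy=1.0 it samples every step, while smaller
# values favor the greedy (mode) action. The sketch below shows one way such a
# helper can behave; it is an illustrative stand-in written for this note, not
# the actual implementation in `common`.
import tensorflow as tf
import tensorflow_probability as tfp


def epsilon_greedy_sample(distribution, eps):
    """Sample from `distribution` with probability `eps`, otherwise take its mode."""
    return tf.cond(
        tf.less(tf.random.uniform((), 0.0, 1.0), eps),
        distribution.sample,
        distribution.mode)


dist = tfp.distributions.Categorical(logits=[[1.0, 2.0, 0.5]])
print(epsilon_greedy_sample(dist, eps=1.0))   # always a random sample
print(epsilon_greedy_sample(dist, eps=0.0))   # always the greedy action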
def _rollout_partial_state(self, time_step: ActionTimeStep, state=None):
    action, state = self._actor_network(time_step.observation,
                                        step_type=time_step.step_type,
                                        network_state=state.share.actor)
    empty_state = tf.nest.map_structure(lambda x: (), self.train_state_spec)
    state = empty_state._replace(share=SacShareState(actor=state))
    return PolicyStep(action=action, state=state, info=())
def predict(self, time_step: ActionTimeStep, state: ActorCriticState):
    """Predict for one step."""
    action_distribution, actor_state = self._actor_network(
        time_step.observation,
        step_type=time_step.step_type,
        network_state=state.actor)
    return PolicyStep(action=action_distribution,
                      state=ActorCriticState(actor=actor_state),
                      info=())
def greedy_predict(self, time_step: ActionTimeStep, state=None, eps=0.1):
    observation = self._encode(time_step)
    new_state = AgentState()
    rl_step = self._rl_algorithm.greedy_predict(
        time_step._replace(observation=observation), state.rl)
    new_state = new_state._replace(rl=rl_step.state)
    return PolicyStep(action=rl_step.action, state=new_state, info=())
def greedy_predict(self, time_step: ActionTimeStep, state=None):
    action, state = self._actor_network(
        time_step.observation,
        step_type=time_step.step_type,
        network_state=state.actor.actor)
    empty_state = tf.nest.map_structure(lambda x: (), self.train_state_spec)
    state = empty_state._replace(
        actor=DdpgActorState(actor=state, critic=()))
    return PolicyStep(action=action, state=state, info=())
def predict(self, time_step: ActionTimeStep, state: AgentState):
    """Predict for one step."""
    observation = self._encode(time_step)
    new_state = AgentState()
    rl_step = self._rl_algorithm.predict(
        time_step._replace(observation=observation), state.rl)
    new_state = new_state._replace(rl=rl_step.state)
    return PolicyStep(action=rl_step.action, state=new_state, info=())
def rollout(self, time_step: ActionTimeStep, state):
    """Rollout for one step."""
    mbp_step = self._mbp.train_step(
        inputs=(time_step.observation, time_step.prev_action),
        state=state.mbp_state)
    mba_step = self._mba.rollout(
        time_step=time_step._replace(observation=mbp_step.outputs),
        state=state.mba_state)
    return PolicyStep(
        action=mba_step.action,
        state=MerlinState(mbp_state=mbp_step.state,
                          mba_state=mba_step.state),
        info=MerlinInfo(mbp_info=mbp_step.info, mba_info=mba_step.info))
def rollout(self, time_step: ActionTimeStep, state: ActorCriticState):
    """Rollout for one step."""
    value, value_state = self._value_network(
        time_step.observation,
        step_type=time_step.step_type,
        network_state=state.value)
    action_distribution, actor_state = self._actor_network(
        time_step.observation,
        step_type=time_step.step_type,
        network_state=state.actor)
    return PolicyStep(
        action=action_distribution,
        state=ActorCriticState(actor=actor_state, value=value_state),
        info=ActorCriticInfo(value=value))
def test_eval_logger():
    """Tests the per-step logging mediated through a custom TensorFlow metric.

    Because TensorFlow places logging in a graph (built through C++) that is
    only triggered when tensors are evaluated, it is very difficult to capture
    the logging message, even when using mocked output streams. The test
    therefore checks the attributes that can be tested and prints the expected
    logging values for by-eye comparison. This is a fairly simple case since
    the logging code is simple, but the test is in this sense incomplete.
    """
    # Set up the logger using default parameters.
    logger = EvalPerStepLogger()

    # Test that the time step counter is initialised to zero.
    assert logger._t == 0

    # Build one time step's worth of data to be logged.
    observation = tf.convert_to_tensor(np.random.randint(10, size=(1, 1)),
                                       dtype=tf.float32)
    action = tf.convert_to_tensor(np.eye(2)[np.random.randint(2)])
    reward = -1 * observation
    discount = tf.convert_to_tensor(np.array([0.99]))

    # The logger takes in a tuple of (TimeStep, PolicyStep, TimeStep). The
    # second TimeStep represents the next period and is not used, so we simply
    # pass a copy of the original time step.
    time_step = ts.TimeStep(ts.StepType(1), reward, discount, observation)
    policy_step = PolicyStep(action, state=(), info=())
    next_time_step = copy.deepcopy(time_step)

    # Collect the data in a tuple as required by the logger.
    time_step_data = (time_step, policy_step, next_time_step)

    # Print the expected logging output for comparison by eye.
    tf.print("\nExpected Values\nStep: ", 0, "\t",
             "State: ", observation, "\t",
             "Action: ", action,
             end="\n", output_stream=sys.stdout)

    # Run the logging for a single time step.
    logger(time_step_data)

    # Check that the time step counter has incremented.
    assert logger._t == 1
def predict(self, time_step: ActionTimeStep, state: AgentState,
            epsilon_greedy):
    """Predict for one step."""
    observation = self._encode(time_step)
    new_state = AgentState()

    if self._goal_generator is not None:
        goal_step = self._goal_generator.predict(
            time_step._replace(observation=observation),
            state.goal_generator, epsilon_greedy)
        new_state = new_state._replace(goal_generator=goal_step.state)
        observation = [observation, goal_step.action]

    rl_step = self._rl_algorithm.predict(
        time_step._replace(observation=observation), state.rl,
        epsilon_greedy)
    new_state = new_state._replace(rl=rl_step.state)
    return PolicyStep(action=rl_step.action, state=new_state, info=())
def train_step(self, exp: Experience, state: SacState):
    action_distribution, share_actor_state = self._actor_network(
        exp.observation,
        step_type=exp.step_type,
        network_state=state.share.actor)
    action = tf.nest.map_structure(lambda d: d.sample(),
                                   action_distribution)
    log_pi = tfa_common.log_probability(action_distribution, action,
                                        self._action_spec)

    actor_state, actor_info = self._actor_train_step(
        exp, state.actor, action_distribution, action, log_pi)
    critic_state, critic_info = self._critic_train_step(
        exp, state.critic, action, log_pi)
    alpha_info = self._alpha_train_step(log_pi)

    state = SacState(share=SacShareState(actor=share_actor_state),
                     actor=actor_state,
                     critic=critic_state)
    info = SacInfo(actor=actor_info, critic=critic_info, alpha=alpha_info)
    return PolicyStep(action_distribution, state, info)
def predict(self, time_step: ActionTimeStep, state, epsilon_greedy):
    action, state = self._actor_network(time_step.observation,
                                        step_type=time_step.step_type,
                                        network_state=state.actor.actor)
    empty_state = tf.nest.map_structure(lambda x: (), self.train_state_spec)

    def _sample(a, ou):
        return tf.cond(
            tf.less(tf.random.uniform((), 0, 1), epsilon_greedy),
            lambda: a + ou(),
            lambda: a)

    noisy_action = tf.nest.map_structure(_sample, action, self._ou_process)
    noisy_action = tf.nest.map_structure(tfa_common.clip_to_spec,
                                         noisy_action, self._action_spec)
    state = empty_state._replace(
        actor=DdpgActorState(actor=state, critic=()))
    return PolicyStep(action=noisy_action, state=state,
                      info=DdpgInfo(action_distribution=action))
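# The `_sample` closure above implements epsilon-greedy exploration for a
# deterministic policy: with probability `epsilon_greedy` the action is
# perturbed with Ornstein-Uhlenbeck noise, otherwise it is used as-is, and the
# result is clipped back into the action bounds. Below is a minimal,
# self-contained sketch of the same pattern in plain TensorFlow; ToyOUProcess
# and the clipping bounds are hypothetical stand-ins, not the OU process or
# clip_to_spec used above.
import tensorflow as tf


class ToyOUProcess:
    """A tiny Ornstein-Uhlenbeck noise process, for illustration only."""

    def __init__(self, dim, theta=0.15, sigma=0.2):
        self._theta = theta
        self._sigma = sigma
        self._x = tf.Variable(tf.zeros([dim]))

    def __call__(self):
        # Mean-reverting random walk: dx = -theta * x + sigma * noise.
        dx = -self._theta * self._x + self._sigma * tf.random.normal(
            self._x.shape)
        self._x.assign_add(dx)
        return self._x


def noisy_action(action, ou, epsilon_greedy, low=-1.0, high=1.0):
    """With probability `epsilon_greedy`, add OU noise; otherwise act greedily."""
    explored = tf.cond(
        tf.less(tf.random.uniform((), 0, 1), epsilon_greedy),
        lambda: action + ou(),
        lambda: action)
    # Stand-in for clip_to_spec: keep the perturbed action inside the bounds.
    return tf.clip_by_value(explored, low, high)


ou = ToyOUProcess(dim=2)
a = tf.constant([0.3, -0.7])
print(noisy_action(a, ou, epsilon_greedy=0.9))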
def rollout(self, time_step: ActionTimeStep, state=None):
    observation = self._encode(time_step)

    value, value_state = self._value_network(
        observation,
        step_type=time_step.step_type,
        network_state=state.value_state)
    # ValueRnnNetwork will add a time dim to value.
    # See value_rnn_network.py L153.
    if isinstance(self._value_network, ValueRnnNetwork):
        value = tf.squeeze(value, axis=1)

    action_distribution, actor_state = self._actor_network(
        observation,
        step_type=time_step.step_type,
        network_state=state.actor_state)

    info = ActorCriticInfo(value=value,
                           icm_reward=(),
                           icm_info=(),
                           entropy_target_info=())

    if self._icm is not None:
        icm_step = self._icm.train_step(
            (observation, time_step.prev_action), state=state.icm_state)
        info = info._replace(icm_reward=icm_step.outputs,
                             icm_info=icm_step.info)
        icm_state = icm_step.state
    else:
        icm_state = ()

    if self._entropy_target_algorithm:
        et_step = self._entropy_target_algorithm.train_step(
            action_distribution)
        info = info._replace(entropy_target_info=et_step.info)

    state = ActorCriticState(actor_state=actor_state,
                             value_state=value_state,
                             icm_state=icm_state)
    return PolicyStep(action=action_distribution, state=state, info=info)
def rollout(self, time_step: ActionTimeStep, state: AgentState, mode):
    """Rollout for one step."""
    new_state = AgentState()
    info = AgentInfo()
    observation = self._encode(time_step)

    if self._goal_generator is not None:
        goal_step = self._goal_generator.rollout(
            time_step._replace(observation=time_step.observation),
            state.goal_generator, mode)
        new_state = new_state._replace(goal_generator=goal_step.state)
        info = info._replace(goal_generator=goal_step.info)
        observation = [observation, goal_step.action]

    if self._icm is not None:
        icm_step = self._icm.train_step(
            time_step._replace(observation=observation), state=state.icm)
        info = info._replace(icm=icm_step.info)
        new_state = new_state._replace(icm=icm_step.state)

    rl_step = self._rl_algorithm.rollout(
        time_step._replace(observation=observation), state.rl, mode)
    new_state = new_state._replace(rl=rl_step.state)
    info = info._replace(rl=rl_step.info)

    if self._entropy_target_algorithm:
        # TODO: For off-policy training, skip entropy_target_algorithm
        # during rollout()
        assert 'action_distribution' in rl_step.info._fields, (
            "PolicyStep from rl_algorithm.rollout() does not contain "
            "`action_distribution`, which is required by "
            "`enforce_entropy_target`")
        et_step = self._entropy_target_algorithm.train_step(
            rl_step.info.action_distribution, step_type=time_step.step_type)
        info = info._replace(entropy_target=et_step.info)

    return PolicyStep(action=rl_step.action, state=new_state, info=info)
def _prepare_specs(self, algorithm):
    """Prepare various tensor specs."""
    time_step = self.get_initial_time_step()
    self._time_step_spec = common.extract_spec(time_step)
    self._action_spec = self._env.action_spec()

    policy_step = algorithm.rollout(
        algorithm.transform_timestep(time_step), self._initial_state)
    info_spec = common.extract_spec(policy_step.info)
    self._policy_step_spec = PolicyStep(
        action=self._action_spec,
        state=algorithm.train_state_spec,
        info=info_spec)

    self._action_distribution_spec = tf.nest.map_structure(
        common.to_distribution_spec, algorithm.action_distribution_spec)
    self._action_dist_param_spec = tf.nest.map_structure(
        lambda spec: spec.input_params_spec,
        self._action_distribution_spec)

    algorithm.prepare_off_policy_specs(time_step)
def train_step(self, time_step: ActionTimeStep, state):
    """Train one step.

    Args:
        time_step: time_step.observation should be the latent vector
        state: state of the model
    """
    latent_vector = time_step.observation
    rnn_output, rnn_state = self._rnn(latent_vector, state)
    mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
    policy_input = tf.concat(
        [tf.stop_gradient(latent_vector), rnn_output, mem_readout], axis=-1)
    action_distribution, _ = self._actor_net(
        policy_input, step_type=time_step.step_type, network_state=None)
    value, _ = self._value_net(
        latent_vector, step_type=time_step.step_type, network_state=None)
    info = ActorCriticInfo(
        value=value, icm_reward=(), icm_info=(), entropy_target_info=())
    return PolicyStep(
        action=action_distribution, state=rnn_state, info=info)
def train_step(self, exp: Experience, state):
    new_state = AgentState()
    info = AgentInfo()
    observation = self._encode(exp)

    if self._goal_generator is not None:
        goal_step = self._goal_generator.train_step(
            exp._replace(observation=observation), state.goal_generator)
        info = info._replace(goal_generator=goal_step.info)
        new_state = new_state._replace(goal_generator=goal_step.state)
        observation = [observation, goal_step.action]

    if self._icm is not None:
        icm_step = self._icm.train_step(
            exp._replace(observation=observation),
            state=state.icm,
            calc_intrinsic_reward=False)
        info = info._replace(icm=icm_step.info)
        new_state = new_state._replace(icm=icm_step.state)

    rl_step = self._rl_algorithm.train_step(
        exp._replace(observation=observation,
                     rollout_info=exp.rollout_info.rl), state.rl)
    new_state = new_state._replace(rl=rl_step.state)
    info = info._replace(rl=rl_step.info)

    if self._entropy_target_algorithm:
        assert 'action_distribution' in rl_step.info._fields, (
            "PolicyStep from rl_algorithm.train_step() does not contain "
            "`action_distribution`, which is required by "
            "`enforce_entropy_target`")
        et_step = self._entropy_target_algorithm.train_step(
            rl_step.info.action_distribution, step_type=exp.step_type)
        info = info._replace(entropy_target=et_step.info)

    return PolicyStep(action=rl_step.action, state=new_state, info=info)
def train_step(self, exp: Experience, state):
    new_state = AgentState()
    info = AgentInfo()
    observation = self._encode(exp)

    if self._icm is not None:
        icm_step = self._icm.train_step(
            (observation, exp.prev_action),
            state=state.icm,
            calc_intrinsic_reward=False)
        info = info._replace(icm=icm_step.info)
        new_state = new_state._replace(icm=icm_step.state)

    rl_step = self._rl_algorithm.train_step(
        exp._replace(observation=observation), state.rl)
    new_state = new_state._replace(rl=rl_step.state)
    info = info._replace(rl=rl_step.info)

    if self._entropy_target_algorithm:
        et_step = self._entropy_target_algorithm.train_step(
            rl_step.action, step_type=exp.step_type)
        info = info._replace(entropy_target=et_step.info)

    return PolicyStep(action=rl_step.action, state=new_state, info=info)
def greedy_predict(self, time_step: ActionTimeStep, state=None):
    action, state = self._actor_network(time_step.observation,
                                        step_type=time_step.step_type,
                                        network_state=state)
    return PolicyStep(action=action, state=state, info=())
def distribution(self, time_step, policy_state=()):
    del policy_state
    action = self.action(time_step).action
    return PolicyStep(action=_MockDistribution(action))
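# The distribution() method above wraps the deterministic action from action()
# in _MockDistribution so that callers expecting a distribution-returning
# policy still work. The wrapper itself is not shown in this section; the
# sketch below is a hypothetical point-mass wrapper (names invented here) that
# only illustrates the pattern.
import tensorflow as tf


class _FixedActionDistribution:
    """Wraps a precomputed action so distribution-style callers still work."""

    def __init__(self, action):
        self._action = action

    def sample(self, sample_shape=(), seed=None):
        del sample_shape, seed  # the "distribution" is a point mass
        return self._action

    def mode(self):
        return self._action


dist = _FixedActionDistribution(tf.constant([[1.0, -0.5]]))
print(dist.sample())  # always returns the wrapped action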
def _prepare_specs(self, algorithm):
    """Prepare various tensor specs."""

    def extract_spec(nest):
        return tf.nest.map_structure(
            lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)

    time_step = self.get_initial_time_step()
    self._time_step_spec = extract_spec(time_step)
    self._action_spec = self._env.action_spec()

    policy_step = algorithm.predict(time_step, self._initial_state)
    info_spec = extract_spec(policy_step.info)
    self._pred_policy_step_spec = PolicyStep(
        action=self._action_spec,
        state=algorithm.predict_state_spec,
        info=info_spec)

    def _to_distribution_spec(spec):
        if isinstance(spec, tf.TensorSpec):
            return DistributionSpec(tfp.distributions.Deterministic,
                                    input_params_spec={"loc": spec},
                                    sample_spec=spec)
        return spec

    self._action_distribution_spec = tf.nest.map_structure(
        _to_distribution_spec, algorithm.action_distribution_spec)
    self._action_dist_param_spec = tf.nest.map_structure(
        lambda spec: spec.input_params_spec,
        self._action_distribution_spec)

    self._experience_spec = Experience(
        step_type=self._time_step_spec.step_type,
        reward=self._time_step_spec.reward,
        discount=self._time_step_spec.discount,
        observation=self._time_step_spec.observation,
        prev_action=self._action_spec,
        action=self._action_spec,
        info=info_spec,
        action_distribution=self._action_dist_param_spec)

    action_dist_params = common.zero_tensor_from_nested_spec(
        self._experience_spec.action_distribution, self._env.batch_size)
    action_dist = nested_distributions_from_specs(
        self._action_distribution_spec, action_dist_params)

    exp = Experience(
        step_type=time_step.step_type,
        reward=time_step.reward,
        discount=time_step.discount,
        observation=time_step.observation,
        prev_action=time_step.prev_action,
        action=time_step.prev_action,
        info=policy_step.info,
        action_distribution=action_dist)
    processed_exp = algorithm.preprocess_experience(exp)
    self._processed_experience_spec = self._experience_spec._replace(
        info=extract_spec(processed_exp.info))

    policy_step = common.algorithm_step(
        algorithm,
        ob_transformer=self._observation_transformer,
        time_step=exp,
        state=common.get_initial_policy_state(self._env.batch_size,
                                              algorithm.train_state_spec),
        training=True)
    info_spec = extract_spec(policy_step.info)
    self._training_info_spec = make_training_info(
        action=self._action_spec,
        action_distribution=self._action_dist_param_spec,
        step_type=self._time_step_spec.step_type,
        reward=self._time_step_spec.reward,
        discount=self._time_step_spec.discount,
        info=info_spec,
        collect_info=self._processed_experience_spec.info,
        collect_action_distribution=self._action_dist_param_spec)
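# The local extract_spec helper above derives per-element tf.TensorSpecs from a
# nest of batched tensors by dropping the leading batch dimension. A small,
# self-contained illustration of that mapping follows; the toy nest and its
# field names are invented for this example.
import tensorflow as tf


def extract_spec(nest):
    # Drop the leading (batch) dimension of every tensor in the nest.
    return tf.nest.map_structure(
        lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)


# A toy nest of batched tensors: batch size 4, observation of shape [3],
# scalar reward.
batched = {
    "observation": tf.zeros([4, 3], tf.float32),
    "reward": tf.zeros([4], tf.float32),
}
specs = extract_spec(batched)
print(specs["observation"])  # TensorSpec(shape=(3,), dtype=tf.float32)
print(specs["reward"])       # TensorSpec(shape=(), dtype=tf.float32)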
def action(self, time_step):
    del time_step
    action = tf.constant(self._action, dtype=tf.float32, shape=[1])
    return PolicyStep(action=action)