def _generate_new_skills(time_step, state):
    rl_prev_action = state.skill
    # avoid dividing by 0
    #steps = torch.max(
    #    state.steps.to(torch.float32), torch.as_tensor(1.0))
    rl_time_step = time_step._replace(
        reward=state.rl_reward,
        discount=state.rl_discount,
        prev_action=rl_prev_action)
    rl_step = self._rl.rollout_step(rl_time_step, state.rl)
    # store to replay buffer
    self._rl.observe_for_replay(
        make_experience(
            # ``rl_time_step.observation`` has been transformed!!!
            rl_time_step._replace(
                observation=rl_time_step.untransformed.observation),
            rl_step, state.rl))
    return SkillGeneratorState(
        skill=rl_step.output,
        steps=torch.zeros_like(state.steps),
        discriminator=state.discriminator,
        rl=rl_step.state,
        rl_reward=torch.zeros_like(state.rl_reward),
        rl_discount=torch.ones_like(state.rl_discount))

def _step(self, time_step, policy_state):
    policy_state = common.reset_state_if_necessary(
        policy_state, self._initial_state, time_step.is_first())
    if self._mode == self.PREDICT:
        step_func = functools.partial(
            self._algorithm.predict, epsilon_greedy=self._epsilon_greedy)
    elif self._mode == self.ON_POLICY_TRAINING:
        step_func = functools.partial(
            self._algorithm.rollout, mode=RLAlgorithm.ON_POLICY_TRAINING)
    elif self._mode == self.OFF_POLICY_TRAINING:
        step_func = functools.partial(
            self._algorithm.rollout, mode=RLAlgorithm.ROLLOUT)
    else:
        raise ValueError("Unsupported mode: %s" % self._mode)
    transformed_time_step = self._algorithm.transform_timestep(time_step)
    policy_step = step_func(transformed_time_step, policy_state)
    next_time_step = self._env_step(policy_step.action)
    if self._observers:
        traj = from_transition(
            time_step, policy_step._replace(info=()), next_time_step)
        for observer in self._observers:
            observer(traj)
    if self._algorithm.exp_observers and self._training:
        policy_step = nest_utils.distributions_to_params(policy_step)
        exp = make_experience(time_step, policy_step, policy_state)
        self._algorithm.observe(exp)
    return next_time_step, policy_step, transformed_time_step

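# The mode dispatch in ``_step`` binds mode-specific keyword arguments with
# ``functools.partial`` before calling the step function. Below is a minimal,
# self-contained sketch of the same pattern; ``predict``/``rollout`` here are
# stand-in functions, not the actual ALF API.
import functools


def predict(time_step, state, epsilon_greedy=0.0):
    return ("predict", epsilon_greedy)


def rollout(time_step, state, mode=None):
    return ("rollout", mode)


def make_step_func(mode):
    # Bind the mode-specific keyword argument once; callers then only need
    # to supply (time_step, state).
    if mode == "predict":
        return functools.partial(predict, epsilon_greedy=0.1)
    elif mode in ("on_policy", "off_policy"):
        return functools.partial(rollout, mode=mode)
    raise ValueError("Unsupported mode: %s" % mode)


step_func = make_step_func("predict")
assert step_func(None, None) == ("predict", 0.1)
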
def _generate_new_action(time_step, state):
    rl_time_step = time_step._replace(
        reward=state.rl_reward, discount=state.rl_discount)
    observation, repr_state = rl_time_step.observation, ()
    if self._repr_learner is not None:
        repr_step = self._repr_learner.rollout_step(time_step, state.repr)
        observation = repr_step.output
        repr_state = repr_step.state
    rl_step = self._rl.rollout_step(
        rl_time_step._replace(observation=observation), state.rl)
    # Store to replay buffer.
    super(DynamicActionRepeatAgent, self).observe_for_replay(
        make_experience(
            rl_time_step._replace(
                # Store the untransformed observation so that later it will
                # be transformed again during training
                observation=rl_time_step.untransformed.observation),
            rl_step, state))
    steps, action = rl_step.output
    return ActionRepeatState(
        action=action,
        steps=steps + 1,  # [0, K-1] -> [1, K]
        repr=repr_state,
        rl=rl_step.state,
        rl_reward=torch.zeros_like(state.rl_reward),
        rl_discount=torch.ones_like(state.rl_discount))

def experience_spec(self):
    """Spec for experience."""
    policy_step_spec = PolicyStep(
        action=self.action_spec,
        state=self.train_state_spec,
        info=self.rollout_info_spec)
    exp_spec = make_experience(self.time_step_spec, policy_step_spec,
                               policy_step_spec.state)
    if not self._use_rollout_state:
        exp_spec = exp_spec._replace(state=())
    return exp_spec

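# ``Experience`` is a namedtuple, so ``_replace(state=())`` above simply clears
# the ``state`` field of the spec when rollout state is not stored. A minimal
# sketch with a plain namedtuple and string placeholders instead of real
# TensorSpecs (the field values below are hypothetical, not ALF's spec types).
from collections import namedtuple

ExperienceSketch = namedtuple(
    "ExperienceSketch", ["step_type", "observation", "action", "state"])

exp_spec = ExperienceSketch(
    step_type="int32[]",
    observation="float32[10]",
    action="int64[]",
    state="float32[128]")

# Dropping the state from the spec mirrors ``exp_spec._replace(state=())``.
exp_spec_no_state = exp_spec._replace(state=())
assert exp_spec_no_state.state == ()
assert exp_spec.state == "float32[128]"  # the original spec is unchanged
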
def get_training_exps(self):
    """Get training experiences from the learning queue.

    Returns:
        exp (Experience): shapes are [Q, T, B, ...], where Q is
            ``learn_queue_cap``, T is the unroll length, and B is the
            env batch size.
        steps (int): how many environment steps this batch of exps contains.
    """
    batch = self._tfq.learn_queue.dequeue_all()
    # convert the batch to the experience format
    exp = make_experience(batch.time_step, batch.policy_step, batch.state)
    # count the environment steps contained in this batch
    num_envs, unroll_length, env_batch_size = \
        batch.time_step.reward.shape[:3]
    steps = num_envs * unroll_length * env_batch_size
    return exp, steps

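# The step count in ``get_training_exps`` is just the product of the first
# three dimensions of any per-step field such as ``reward``. A small sketch
# with made-up sizes (Q unrolls of length T from B environments):
import torch

learn_queue_cap, unroll_length, env_batch_size = 4, 8, 16
reward = torch.zeros(learn_queue_cap, unroll_length, env_batch_size)

num_envs, unroll_len, batch = reward.shape[:3]
steps = num_envs * unroll_len * batch
assert steps == 4 * 8 * 16
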
def test_agent_steps(self):
    batch_size = 1
    observation_spec = TensorSpec((10, ))
    action_spec = BoundedTensorSpec((), dtype='int64')
    time_step = TimeStep(
        observation=observation_spec.zeros(outer_dims=(batch_size, )),
        prev_action=action_spec.zeros(outer_dims=(batch_size, )))

    actor_net = functools.partial(
        ActorDistributionNetwork, fc_layer_params=(100, ))
    value_net = functools.partial(ValueNetwork, fc_layer_params=(100, ))

    # TODO: add a goal generator and an entropy target algorithm once they
    # are implemented.
    agent = Agent(
        observation_spec=observation_spec,
        action_spec=action_spec,
        rl_algorithm_cls=functools.partial(
            ActorCriticAlgorithm,
            actor_network_ctor=actor_net,
            value_network_ctor=value_net),
        intrinsic_reward_module=ICMAlgorithm(
            action_spec=action_spec, observation_spec=observation_spec))

    predict_state = agent.get_initial_predict_state(batch_size)
    rollout_state = agent.get_initial_rollout_state(batch_size)
    train_state = agent.get_initial_train_state(batch_size)

    pred_step = agent.predict_step(
        time_step, predict_state, epsilon_greedy=0.1)
    self.assertEqual(pred_step.state.irm, ())

    rollout_step = agent.rollout_step(time_step, rollout_state)
    self.assertNotEqual(rollout_step.state.irm, ())

    exp = make_experience(time_step, rollout_step, rollout_state)
    train_step = agent.train_step(exp, train_state)
    self.assertNotEqual(train_step.state.irm, ())
    self.assertTensorEqual(rollout_step.state.irm, train_step.state.irm)

def _generate_new_action(time_step, state):
    rl_time_step = time_step._replace(
        reward=state.rl_reward,
        # To keep consistent with other algorithms, we choose to multiply
        # discount with gamma once more in td_loss.py
        discount=state.rl_discount / self._gamma)
    observation, repr_state = rl_time_step.observation, ()
    if self._repr_learner is not None:
        repr_step = self._repr_learner.rollout_step(time_step, state.repr)
        observation = repr_step.output
        repr_state = repr_step.state
    rl_step = self._rl.rollout_step(
        rl_time_step._replace(observation=observation), state.rl)
    rl_step = rl_step._replace(
        info=(rl_step.info, state.k, state.sample_rewards))
    # Store to replay buffer.
    super(DynamicActionRepeatAgent, self).observe_for_replay(
        make_experience(
            rl_time_step._replace(
                # Store the untransformed observation so that later it will
                # be transformed again during training
                observation=rl_time_step.untransformed.observation),
            rl_step, state))
    steps, action = rl_step.output
    return ActionRepeatState(
        action=action,
        steps=steps + 1,  # [0, K-1] -> [1, K]
        k=torch.zeros_like(state.k),
        repr=repr_state,
        rl=rl_step.state,
        rl_reward=torch.zeros_like(state.rl_reward),
        sample_rewards=torch.zeros_like(state.sample_rewards),
        rl_discount=torch.ones_like(state.rl_discount))

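# Both ``_generate_new_action`` variants above reset the accumulated reward and
# discount (``torch.zeros_like`` / ``torch.ones_like``) whenever a new action
# is generated. The sketch below illustrates that accumulate-then-reset
# bookkeeping; the discounted accumulation shown is only an assumed scheme for
# illustration, since the agent's per-step update is not part of this snippet.
import torch

gamma = 0.99
rl_reward = torch.zeros(2)   # accumulated reward, one entry per env
rl_discount = torch.ones(2)  # running discount, one entry per env

# While the action is being repeated, rewards are accumulated with the
# running discount (assumed scheme).
for step_reward in [torch.tensor([1.0, 0.5]), torch.tensor([2.0, 0.0])]:
    rl_reward = rl_reward + rl_discount * step_reward
    rl_discount = rl_discount * gamma

# When a new action is generated, the accumulators are reset, mirroring the
# ``zeros_like`` / ``ones_like`` calls in the returned ActionRepeatState.
rl_reward = torch.zeros_like(rl_reward)
rl_discount = torch.ones_like(rl_discount)
assert torch.all(rl_reward == 0) and torch.all(rl_discount == 1)
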
def _test_preprocess_experience(self, train_reward_function, td_steps,
                                reanalyze_ratio, expected):
    """The following summarizes how the data is generated:

    .. code-block:: python

        # position:    01234567890123
        step_type0 =  'FMMMLFMMLFMMMM'
        step_type1 =  'FMMMMMLFMMMMLF'
        scale = 1. for current model, 2. for target model
        observation = [position] * 3
        reward = position if train_reward_function and td_steps != -1
                 else position * (step_type == LAST)
        value = 0.5 * position * scale
        action_probs = scale * [position, position + 1, position] for env 0
                       scale * [position + 1, position, position] for env 1
        action = 1 for env 0
                 0 for env 1
    """
    reanalyze_td_steps = 2
    num_unroll_steps = 4
    batch_size = 2
    obs_dim = 3
    observation_spec = alf.TensorSpec([obs_dim])
    action_spec = alf.BoundedTensorSpec((),
                                        minimum=0,
                                        maximum=1,
                                        dtype=torch.int32)
    reward_spec = alf.TensorSpec(())
    time_step_spec = ds.time_step_spec(observation_spec, action_spec,
                                       reward_spec)

    global _mcts_model_id
    _mcts_model_id = 0
    muzero = MuzeroAlgorithm(
        observation_spec,
        action_spec,
        model_ctor=_create_mcts_model,
        mcts_algorithm_ctor=MockMCTSAlgorithm,
        num_unroll_steps=num_unroll_steps,
        td_steps=td_steps,
        train_game_over_function=True,
        train_reward_function=train_reward_function,
        reanalyze_ratio=reanalyze_ratio,
        reanalyze_td_steps=reanalyze_td_steps,
        data_transformer_ctor=partial(FrameStacker, stack_size=2))
    data_transformer = FrameStacker(observation_spec, stack_size=2)
    time_step = common.zero_tensor_from_nested_spec(time_step_spec,
                                                    batch_size)
    dt_state = common.zero_tensor_from_nested_spec(
        data_transformer.state_spec, batch_size)
    state = muzero.get_initial_predict_state(batch_size)
    transformed_time_step, dt_state = data_transformer.transform_timestep(
        time_step, dt_state)
    alg_step = muzero.rollout_step(transformed_time_step, state)
    alg_step_spec = dist_utils.extract_spec(alg_step)

    experience = ds.make_experience(time_step, alg_step, state)
    experience_spec = ds.make_experience(time_step_spec, alg_step_spec,
                                         muzero.train_state_spec)
    replay_buffer = ReplayBuffer(
        data_spec=experience_spec,
        num_environments=batch_size,
        max_length=16,
        keep_episodic_info=True)

    #              01234567890123
    step_type0 = 'FMMMLFMMLFMMMM'
    step_type1 = 'FMMMMMLFMMMMLF'
    dt_state = common.zero_tensor_from_nested_spec(
        data_transformer.state_spec, batch_size)
    for i in range(len(step_type0)):
        step_type = [step_type0[i], step_type1[i]]
        step_type = [
            ds.StepType.MID if c == 'M' else
            (ds.StepType.FIRST if c == 'F' else ds.StepType.LAST)
            for c in step_type
        ]
        step_type = torch.tensor(step_type, dtype=torch.int32)
        reward = torch.full([batch_size], float(i))
        if not train_reward_function or td_steps == -1:
            reward = reward * (step_type == ds.StepType.LAST).to(
                torch.float32)
        time_step = time_step._replace(
            discount=(step_type != ds.StepType.LAST).to(torch.float32),
            step_type=step_type,
            observation=torch.tensor([[i, i + 1, i], [i + 1, i, i]],
                                     dtype=torch.float32),
            reward=reward,
            env_id=torch.arange(batch_size, dtype=torch.int32))
        transformed_time_step, dt_state = data_transformer.transform_timestep(
            time_step, dt_state)
        alg_step = muzero.rollout_step(transformed_time_step, state)
        experience = ds.make_experience(time_step, alg_step, state)
        replay_buffer.add_batch(experience)
        state = alg_step.state

    env_ids = torch.tensor([0] * 14 + [1] * 14, dtype=torch.int64)
    positions = torch.tensor(
        list(range(14)) + list(range(14)), dtype=torch.int64)
    experience = replay_buffer.get_field(None,
                                         env_ids.unsqueeze(-1).cpu(),
                                         positions.unsqueeze(-1).cpu())
    experience = experience._replace(
        replay_buffer=replay_buffer,
        batch_info=BatchInfo(env_ids=env_ids, positions=positions),
        rollout_info_field='rollout_info')
    processed_experience = muzero.preprocess_experience(experience)
    import pprint
    pprint.pprint(processed_experience.rollout_info)
    alf.nest.map_structure(lambda x, y: self.assertEqual(x, y),
                           processed_experience.rollout_info, expected)

def unroll(self, unroll_length):
    r"""Unroll ``unroll_length`` steps using the current policy.

    Because ``self._env`` is a batched environment, the total number of
    environment steps is ``self._env.batch_size * unroll_length``.

    Args:
        unroll_length (int): number of steps to unroll.

    Returns:
        Experience: The stacked experience with shape :math:`[T, B, \ldots]`
        for each of its members.
    """
    if self._current_time_step is None:
        self._current_time_step = common.get_initial_time_step(self._env)
    if self._current_policy_state is None:
        self._current_policy_state = self.get_initial_rollout_state(
            self._env.batch_size)
    if self._current_transform_state is None:
        self._current_transform_state = self.get_initial_transform_state(
            self._env.batch_size)

    time_step = self._current_time_step
    policy_state = self._current_policy_state
    trans_state = self._current_transform_state

    experience_list = []
    initial_state = self.get_initial_rollout_state(self._env.batch_size)

    env_step_time = 0.
    store_exp_time = 0.
    for _ in range(unroll_length):
        policy_state = common.reset_state_if_necessary(
            policy_state, initial_state, time_step.is_first())
        transformed_time_step, trans_state = self.transform_timestep(
            time_step, trans_state)
        # save the untransformed time step in case that sub-algorithms need
        # to store it in replay buffers
        transformed_time_step = transformed_time_step._replace(
            untransformed=time_step)
        policy_step = self.rollout_step(transformed_time_step, policy_state)
        # release the reference to ``time_step``
        transformed_time_step = transformed_time_step._replace(
            untransformed=())
        action = common.detach(policy_step.output)

        t0 = time.time()
        next_time_step = self._env.step(action)
        env_step_time += time.time() - t0

        self.observe_for_metrics(time_step.cpu())
        if self._exp_replayer_type == "one_time":
            exp = make_experience(transformed_time_step, policy_step,
                                  policy_state)
        else:
            exp = make_experience(time_step.cpu(), policy_step, policy_state)
        t0 = time.time()
        self.observe_for_replay(exp)
        store_exp_time += time.time() - t0

        exp_for_training = Experience(
            action=action,
            reward=transformed_time_step.reward,
            discount=transformed_time_step.discount,
            step_type=transformed_time_step.step_type,
            state=policy_state,
            prev_action=transformed_time_step.prev_action,
            observation=transformed_time_step.observation,
            rollout_info=dist_utils.distributions_to_params(
                policy_step.info),
            env_id=transformed_time_step.env_id)
        experience_list.append(exp_for_training)

        time_step = next_time_step
        policy_state = policy_step.state

    alf.summary.scalar("time/unroll_env_step", env_step_time)
    alf.summary.scalar("time/unroll_store_exp", store_exp_time)
    experience = alf.nest.utils.stack_nests(experience_list)
    experience = experience._replace(
        rollout_info=dist_utils.params_to_distributions(
            experience.rollout_info, self._rollout_info_spec))

    self._current_time_step = time_step
    # Need to detach so that the graph from this unroll is disconnected from
    # the next unroll. Otherwise backward() will report error for on-policy
    # training after the next unroll.
    self._current_policy_state = common.detach(policy_state)
    self._current_transform_state = common.detach(trans_state)
    return experience

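# ``alf.nest.utils.stack_nests`` stacks the per-step experiences collected in
# ``unroll`` along a new leading time axis, producing the [T, B, ...] layout
# described in the docstring. A minimal sketch of that stacking with a plain
# namedtuple and ``torch.stack`` (not ALF's actual implementation):
from collections import namedtuple

import torch

ExpSketch = namedtuple("ExpSketch", ["reward", "observation"])

batch_size, obs_dim, unroll_length = 3, 4, 5
# One entry per unrolled step; each field has a leading batch dimension B.
experience_list = [
    ExpSketch(
        reward=torch.zeros(batch_size),
        observation=torch.zeros(batch_size, obs_dim))
    for _ in range(unroll_length)
]

# Stacking field-wise along a new leading axis yields [T, B, ...] tensors.
stacked = ExpSketch(*(torch.stack(fields) for fields in zip(*experience_list)))
assert stacked.reward.shape == (unroll_length, batch_size)
assert stacked.observation.shape == (unroll_length, batch_size, obs_dim)
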
def test_preprocess_experience(self):
    """The following summarizes how the data is generated:

    .. code-block:: python

        # position:    01234567890123
        step_type0 =  'FMMMLFMMLFMMMM'
        step_type1 =  'FMMMMMLFMMMMLF'
        reward = position
        action = t + 1 for env 0
                 t for env 1
    """
    num_unroll_steps = 4
    batch_size = 2
    obs_dim = 3
    observation_spec = alf.TensorSpec([obs_dim])
    action_spec = alf.BoundedTensorSpec((1, ),
                                        minimum=0,
                                        maximum=1,
                                        dtype=torch.float32)
    reward_spec = alf.TensorSpec(())
    time_step_spec = ds.time_step_spec(observation_spec, action_spec,
                                       reward_spec)

    repr_learner = PredictiveRepresentationLearner(
        observation_spec,
        action_spec,
        num_unroll_steps=num_unroll_steps,
        decoder_ctor=partial(
            SimpleDecoder,
            target_field='reward',
            decoder_net_ctor=partial(
                EncodingNetwork, fc_layer_params=(4, ))),
        encoding_net_ctor=LSTMEncodingNetwork,
        dynamics_net_ctor=LSTMEncodingNetwork)

    time_step = common.zero_tensor_from_nested_spec(time_step_spec,
                                                    batch_size)
    state = repr_learner.get_initial_predict_state(batch_size)
    alg_step = repr_learner.rollout_step(time_step, state)
    alg_step = alg_step._replace(output=torch.tensor([[1.], [0.]]))
    alg_step_spec = dist_utils.extract_spec(alg_step)

    experience = ds.make_experience(time_step, alg_step, state)
    experience_spec = ds.make_experience(time_step_spec, alg_step_spec,
                                         repr_learner.train_state_spec)
    replay_buffer = ReplayBuffer(
        data_spec=experience_spec,
        num_environments=batch_size,
        max_length=16,
        keep_episodic_info=True)

    #              01234567890123
    step_type0 = 'FMMMLFMMLFMMMM'
    step_type1 = 'FMMMMMLFMMMMLF'
    for i in range(len(step_type0)):
        step_type = [step_type0[i], step_type1[i]]
        step_type = [
            ds.StepType.MID if c == 'M' else
            (ds.StepType.FIRST if c == 'F' else ds.StepType.LAST)
            for c in step_type
        ]
        step_type = torch.tensor(step_type, dtype=torch.int32)
        reward = torch.full([batch_size], float(i))
        time_step = time_step._replace(
            discount=(step_type != ds.StepType.LAST).to(torch.float32),
            step_type=step_type,
            observation=torch.tensor([[i, i + 1, i], [i + 1, i, i]],
                                     dtype=torch.float32),
            reward=reward,
            env_id=torch.arange(batch_size, dtype=torch.int32))
        alg_step = repr_learner.rollout_step(time_step, state)
        alg_step = alg_step._replace(output=i + torch.tensor([[1.], [0.]]))
        experience = ds.make_experience(time_step, alg_step, state)
        replay_buffer.add_batch(experience)
        state = alg_step.state

    env_ids = torch.tensor([0] * 14 + [1] * 14, dtype=torch.int64)
    positions = torch.tensor(
        list(range(14)) + list(range(14)), dtype=torch.int64)
    experience = replay_buffer.get_field(None,
                                         env_ids.unsqueeze(-1).cpu(),
                                         positions.unsqueeze(-1).cpu())
    experience = experience._replace(
        replay_buffer=replay_buffer,
        batch_info=BatchInfo(env_ids=env_ids, positions=positions),
        rollout_info_field='rollout_info')
    processed_experience = repr_learner.preprocess_experience(experience)
    pprint.pprint(processed_experience.rollout_info)

    # yapf: disable
    expected = PredictiveRepresentationLearnerInfo(
        action=torch.tensor(
            [[[ 1.,  2.,  3.,  4.,  5.]],
             [[ 2.,  3.,  4.,  5.,  5.]],
             [[ 3.,  4.,  5.,  5.,  5.]],
             [[ 4.,  5.,  5.,  5.,  5.]],
             [[ 5.,  5.,  5.,  5.,  5.]],
             [[ 6.,  7.,  8.,  9.,  9.]],
             [[ 7.,  8.,  9.,  9.,  9.]],
             [[ 8.,  9.,  9.,  9.,  9.]],
             [[ 9.,  9.,  9.,  9.,  9.]],
             [[10., 11., 12., 13., 14.]],
             [[11., 12., 13., 14., 14.]],
             [[12., 13., 14., 14., 14.]],
             [[13., 14., 14., 14., 14.]],
             [[14., 14., 14., 14., 14.]],
             [[ 0.,  1.,  2.,  3.,  4.]],
             [[ 1.,  2.,  3.,  4.,  5.]],
             [[ 2.,  3.,  4.,  5.,  6.]],
             [[ 3.,  4.,  5.,  6.,  6.]],
             [[ 4.,  5.,  6.,  6.,  6.]],
             [[ 5.,  6.,  6.,  6.,  6.]],
             [[ 6.,  6.,  6.,  6.,  6.]],
             [[ 7.,  8.,  9., 10., 11.]],
             [[ 8.,  9., 10., 11., 12.]],
             [[ 9., 10., 11., 12., 12.]],
             [[10., 11., 12., 12., 12.]],
             [[11., 12., 12., 12., 12.]],
             [[12., 12., 12., 12., 12.]],
             [[13., 13., 13., 13., 13.]]]).unsqueeze(-1),
        mask=torch.tensor(
            [[[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True, False, False, False, False]]]),
        target=torch.tensor(
            [[[ 0.,  1.,  2.,  3.,  4.]],
             [[ 1.,  2.,  3.,  4.,  4.]],
             [[ 2.,  3.,  4.,  4.,  4.]],
             [[ 3.,  4.,  4.,  4.,  4.]],
             [[ 4.,  4.,  4.,  4.,  4.]],
             [[ 5.,  6.,  7.,  8.,  8.]],
             [[ 6.,  7.,  8.,  8.,  8.]],
             [[ 7.,  8.,  8.,  8.,  8.]],
             [[ 8.,  8.,  8.,  8.,  8.]],
             [[ 9., 10., 11., 12., 13.]],
             [[10., 11., 12., 13., 13.]],
             [[11., 12., 13., 13., 13.]],
             [[12., 13., 13., 13., 13.]],
             [[13., 13., 13., 13., 13.]],
             [[ 0.,  1.,  2.,  3.,  4.]],
             [[ 1.,  2.,  3.,  4.,  5.]],
             [[ 2.,  3.,  4.,  5.,  6.]],
             [[ 3.,  4.,  5.,  6.,  6.]],
             [[ 4.,  5.,  6.,  6.,  6.]],
             [[ 5.,  6.,  6.,  6.,  6.]],
             [[ 6.,  6.,  6.,  6.,  6.]],
             [[ 7.,  8.,  9., 10., 11.]],
             [[ 8.,  9., 10., 11., 12.]],
             [[ 9., 10., 11., 12., 12.]],
             [[10., 11., 12., 12., 12.]],
             [[11., 12., 12., 12., 12.]],
             [[12., 12., 12., 12., 12.]],
             [[13., 13., 13., 13., 13.]]]))
    # yapf: enable
    alf.nest.map_structure(lambda x, y: self.assertEqual(x, y),
                           processed_experience.rollout_info, expected)