def predict_step(self, time_step: TimeStep, state: AgentState,
                 epsilon_greedy):
    """Predict for one step."""
    new_state = AgentState()
    observation = time_step.observation
    info = AgentInfo()

    if self._representation_learner is not None:
        repr_step = self._representation_learner.predict_step(
            time_step, state.repr)
        new_state = new_state._replace(repr=repr_step.state)
        info = info._replace(repr=repr_step.info)
        observation = repr_step.output

    if self._goal_generator is not None:
        goal_step = self._goal_generator.predict_step(
            time_step._replace(observation=observation),
            state.goal_generator, epsilon_greedy)
        new_state = new_state._replace(goal_generator=goal_step.state)
        info = info._replace(goal_generator=goal_step.info)
        observation = [observation, goal_step.output]

    rl_step = self._rl_algorithm.predict_step(
        time_step._replace(observation=observation), state.rl,
        epsilon_greedy)
    new_state = new_state._replace(rl=rl_step.state)
    info = info._replace(rl=rl_step.info)

    return AlgStep(output=rl_step.output, state=new_state, info=info)
class ICMAlgorithmTest(alf.test.TestCase):
    def setUp(self):
        self._input_tensor_spec = TensorSpec((10, ))
        self._time_step = TimeStep(
            step_type=StepType.MID,
            reward=0,
            discount=1,
            observation=self._input_tensor_spec.zeros(outer_dims=(1, )),
            prev_action=None,
            env_id=None)
        self._hidden_size = 100

    def test_discrete_action(self):
        action_spec = BoundedTensorSpec((),
                                        dtype=torch.int64,
                                        minimum=0,
                                        maximum=3)
        alg = ICMAlgorithm(
            action_spec=action_spec,
            observation_spec=self._input_tensor_spec,
            hidden_size=self._hidden_size)
        state = self._input_tensor_spec.zeros(outer_dims=(1, ))
        alg_step = alg.train_step(
            self._time_step._replace(
                prev_action=action_spec.zeros(outer_dims=(1, ))), state)
        # the inverse net should predict a uniform distribution
        self.assertTensorClose(
            torch.sum(alg_step.info.loss.extra['inverse_loss']),
            torch.as_tensor(
                math.log(action_spec.maximum - action_spec.minimum + 1)),
            epsilon=1e-4)

    def test_continuous_action(self):
        action_spec = TensorSpec((4, ))
        alg = ICMAlgorithm(
            action_spec=action_spec,
            observation_spec=self._input_tensor_spec,
            hidden_size=self._hidden_size)
        state = self._input_tensor_spec.zeros(outer_dims=(1, ))
        alg_step = alg.train_step(
            self._time_step._replace(
                prev_action=action_spec.zeros(outer_dims=(1, ))), state)
        # the inverse net should predict a zero action vector
        self.assertTensorClose(
            torch.sum(alg_step.info.loss.extra['inverse_loss']),
            torch.as_tensor(0))
class DIAYNAlgorithmTest(alf.test.TestCase):
    def setUp(self):
        input_tensor_spec = TensorSpec((10, ))
        self._time_step = TimeStep(
            step_type=torch.tensor(StepType.MID, dtype=torch.int32),
            reward=0,
            discount=1,
            observation=input_tensor_spec.zeros(outer_dims=(1, )),
            prev_action=None,
            env_id=None)
        self._encoding_net = EncodingNetwork(
            input_tensor_spec=input_tensor_spec)

    def test_discrete_skill_loss(self):
        skill_spec = BoundedTensorSpec((),
                                       dtype=torch.int64,
                                       minimum=0,
                                       maximum=3)
        alg = DIAYNAlgorithm(
            skill_spec=skill_spec, encoding_net=self._encoding_net)
        skill = state = torch.nn.functional.one_hot(
            skill_spec.zeros(outer_dims=(1, )),
            int(skill_spec.maximum - skill_spec.minimum + 1)).to(
                torch.float32)
        alg_step = alg.train_step(
            self._time_step._replace(
                observation=[self._time_step.observation, skill]), state)
        # the discriminator should predict a uniform distribution
        self.assertTensorClose(
            torch.sum(alg_step.info.loss),
            torch.as_tensor(
                math.log(skill_spec.maximum - skill_spec.minimum + 1)),
            epsilon=1e-4)

    def test_continuous_skill_loss(self):
        skill_spec = TensorSpec((4, ))
        alg = DIAYNAlgorithm(
            skill_spec=skill_spec, encoding_net=self._encoding_net)
        skill = state = skill_spec.zeros(outer_dims=(1, ))
        alg_step = alg.train_step(
            self._time_step._replace(
                observation=[self._time_step.observation, skill]), state)
        # the discriminator should predict a zero skill vector
        self.assertTensorClose(
            torch.sum(alg_step.info.loss), torch.as_tensor(0))
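# Sanity check (not part of ALF): the uniform-distribution assertions in the
# two test classes above rely on the fact that an untrained network with
# (near-)zero logits predicts a uniform distribution over K classes, so the
# cross-entropy loss equals log(K). A minimal sketch of that identity with
# plain PyTorch, assuming K = 4 skills/actions as in the tests:
import math

import torch
import torch.nn.functional as F

K = 4
logits = torch.zeros(1, K)  # hypothetical untrained discriminator output
target = torch.tensor([0])  # any target class gives the same loss
loss = F.cross_entropy(logits, target)
assert torch.isclose(loss, torch.tensor(math.log(K)))  # log(4) ~= 1.386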
def rollout_step(self, time_step: TimeStep, state: AgentState):
    """Rollout for one step."""
    new_state = AgentState()
    info = AgentInfo()

    time_step = transform_nest(time_step, "observation",
                               self._observation_transformer)

    subtrajectory = self._skill_generator.update_disc_subtrajectory(
        time_step, state.skill_generator)

    skill_step = self._skill_generator.rollout_step(
        time_step, state.skill_generator)
    new_state = new_state._replace(skill_generator=skill_step.state)
    info = info._replace(skill_generator=skill_step.info)

    observation = self._make_low_level_observation(
        subtrajectory, skill_step.output, skill_step.info.switch_skill,
        skill_step.state.steps,
        skill_step.state.discriminator.first_observation)

    rl_step = self._rl_algorithm.rollout_step(
        time_step._replace(observation=observation), state.rl)
    new_state = new_state._replace(rl=rl_step.state)
    info = info._replace(rl=rl_step.info)

    skill_discount = (
        ((skill_step.state.steps == 1)
         & (time_step.step_type != StepType.LAST)).to(torch.float32) *
        (1 - self._skill_boundary_discount))
    info = info._replace(skill_discount=1 - skill_discount)

    return AlgStep(output=rl_step.output, state=new_state, info=info)
def predict_step(self, time_step: TimeStep, state: AgentState,
                 epsilon_greedy):
    """Predict for one step."""
    new_state = AgentState()

    time_step = transform_nest(time_step, "observation",
                               self._observation_transformer)

    subtrajectory = self._skill_generator.update_disc_subtrajectory(
        time_step, state.skill_generator)

    skill_step = self._skill_generator.predict_step(
        time_step, state.skill_generator, epsilon_greedy)
    new_state = new_state._replace(skill_generator=skill_step.state)

    observation = self._make_low_level_observation(
        subtrajectory, skill_step.output, skill_step.info.switch_skill,
        skill_step.state.steps,
        skill_step.state.discriminator.first_observation)

    rl_step = self._rl_algorithm.predict_step(
        time_step._replace(observation=observation), state.rl,
        epsilon_greedy)
    new_state = new_state._replace(rl=rl_step.state)

    return AlgStep(output=rl_step.output, state=new_state)
def rollout_step(self, time_step: TimeStep, state):
    if self._reward_normalizer is not None:
        self._reward_normalizer.update(time_step.reward)
        time_step = time_step._replace(
            reward=self._reward_normalizer.normalize(
                time_step.reward, self._reward_clip_value))
    return self._mcts.predict_step(time_step, state)
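# The reward normalizer above is only assumed to expose update() and
# normalize(); ALF's actual normalizers are more involved. A minimal,
# hypothetical stand-in using exponential moving statistics, for
# illustration only:
import torch


class SimpleRewardNormalizer(object):
    """Hypothetical normalizer: update() tracks running mean/variance,
    normalize() standardizes the reward and clips it to [-clip, clip]."""

    def __init__(self, momentum=0.99, eps=1e-8):
        self._momentum = momentum
        self._eps = eps
        self._mean = torch.zeros(())
        self._var = torch.ones(())

    def update(self, reward):
        # exponential moving estimates of mean and variance
        m = self._momentum
        self._mean = m * self._mean + (1 - m) * reward.mean()
        self._var = m * self._var + (1 - m) * reward.var(unbiased=False)

    def normalize(self, reward, clip_value):
        normalized = (reward - self._mean) / torch.sqrt(self._var + self._eps)
        return normalized.clamp(-clip_value, clip_value)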
def rollout_step(self, time_step: TimeStep, state: AgentState):
    """Rollout for one step."""
    new_state = AgentState()
    info = AgentInfo()
    observation = time_step.observation

    if self._representation_learner is not None:
        repr_step = self._representation_learner.rollout_step(
            time_step, state.repr)
        new_state = new_state._replace(repr=repr_step.state)
        info = info._replace(repr=repr_step.info)
        observation = repr_step.output

    if self._goal_generator is not None:
        goal_step = self._goal_generator.rollout_step(
            time_step._replace(observation=observation),
            state.goal_generator)
        new_state = new_state._replace(goal_generator=goal_step.state)
        info = info._replace(goal_generator=goal_step.info)
        observation = [observation, goal_step.output]

    rl_step = self._rl_algorithm.rollout_step(
        time_step._replace(observation=observation), state.rl)
    new_state = new_state._replace(rl=rl_step.state)
    info = info._replace(rl=rl_step.info)

    if self._irm is not None:
        irm_step = self._irm.rollout_step(
            time_step._replace(observation=observation), state=state.irm)
        info = info._replace(irm=irm_step.info)
        new_state = new_state._replace(irm=irm_step.state)

    if self._entropy_target_algorithm:
        assert 'action_distribution' in rl_step.info._fields, (
            "AlgStep from rl_algorithm.rollout() does not contain "
            "`action_distribution`, which is required by "
            "`enforce_entropy_target`")
        et_step = self._entropy_target_algorithm.rollout_step(
            rl_step.info.action_distribution,
            step_type=time_step.step_type,
            on_policy_training=self.is_on_policy())
        info = info._replace(entropy_target=et_step.info)

    return AlgStep(output=rl_step.output, state=new_state, info=info)
def predict_step(self, time_step: TimeStep, state, epsilon_greedy):
    mbp_step = self._mbp.predict_step(
        inputs=(time_step.observation, time_step.prev_action),
        state=state.mbp_state)
    mba_step = self._mba.predict_step(
        time_step=time_step._replace(observation=mbp_step.output),
        state=state.mba_state,
        epsilon_greedy=epsilon_greedy)
    return AlgStep(
        output=mba_step.output,
        state=MerlinState(
            mbp_state=mbp_step.state, mba_state=mba_step.state),
        info=())
def rollout_step(self, time_step: TimeStep, state):
    """Rollout for one step."""
    mbp_step = self._mbp.train_step(
        inputs=(time_step.observation, time_step.prev_action),
        state=state.mbp_state)
    mba_step = self._mba.rollout_step(
        time_step=time_step._replace(observation=mbp_step.output),
        state=state.mba_state)
    return AlgStep(
        output=mba_step.output,
        state=MerlinState(
            mbp_state=mbp_step.state, mba_state=mba_step.state),
        info=MerlinInfo(mbp_info=mbp_step.info, mba_info=mba_step.info))
def _predict_multi_step_cost(self, observation, actions):
    """Compute the total cost by unrolling multiple steps according to
    the given initial observation and multi-step actions.

    Args:
        observation: the current observation for predicting quantities
            of future time steps
        actions (Tensor): a set of action sequences with shape
            [B, population, unroll_steps, action_dim]
    Returns:
        cost (Tensor): negation of the accumulated predicted reward,
            with shape [B, population]
    """
    batch_size, population_size, num_unroll_steps = actions.shape[0:3]

    state = self.get_initial_predict_state(batch_size)
    time_step = TimeStep()
    dyn_state = state.dynamics._replace(feature=observation)
    dyn_state = nest.map_structure(
        partial(
            self._expand_to_population,
            population_size=population_size), dyn_state)
    # expand to particles
    dyn_state = nest.map_structure(self._expand_to_particles, dyn_state)
    reward_state = state.reward
    reward = 0
    for i in range(num_unroll_steps):
        action = actions[:, :, i, ...].view(-1, actions.shape[3])
        action = self._expand_to_particles(action)
        time_step = time_step._replace(prev_action=action)
        time_step, dyn_state = self._predict_next_step(
            time_step, dyn_state)
        next_obs = time_step.observation
        # Note: currently using (next_obs, action), might need to
        # consider (obs, action) in order to be more compatible
        # with the conventional definition of the reward function
        reward_step, reward_state = self._calc_step_reward(
            next_obs, action, reward_state)
        reward = reward + reward_step
    cost = -reward
    # reshape cost: [B*par, n] -> [B, par*n]
    cost = cost.reshape(
        -1, self._particles_per_replica * self._num_dynamics_replicas)
    cost = cost.mean(-1)
    # reshape cost back to [batch_size, population_size]
    cost = torch.reshape(cost, [batch_size, -1])
    return cost
def _calc_cost_for_action_sequence(self, time_step: TimeStep, state,
                                   ac_seqs):
    """
    Args:
        time_step (TimeStep): input data for next step prediction
        state (MbrlState): input state for next step prediction
        ac_seqs (Tensor): action sequences of shape
            [batch_size, population_size, solution_dim], where
            solution_dim = planning_horizon * num_actions
    Returns:
        cost (Tensor): with shape [batch_size, population_size]
    """
    obs = time_step.observation
    batch_size = obs.shape[0]
    ac_seqs = torch.reshape(
        ac_seqs,
        [batch_size, self._population_size, self._planning_horizon, -1])

    ac_seqs = ac_seqs.permute(2, 0, 1, 3)
    ac_seqs = torch.reshape(
        ac_seqs, (self._planning_horizon, -1, self._num_actions))

    state = state._replace(dynamics=state.dynamics._replace(feature=obs))
    init_obs = self._expand_to_population(obs)
    state = nest.map_structure(self._expand_to_population, state)

    obs = init_obs
    cost = 0
    for i in range(ac_seqs.shape[0]):
        action = ac_seqs[i]
        time_step = time_step._replace(prev_action=action)
        time_step, state = self._dynamics_func(time_step, state)
        next_obs = time_step.observation
        # Note: currently using (next_obs, action), might need to
        # consider (obs, action) in order to be more compatible
        # with the conventional definition of the reward function
        reward_step, state = self._reward_func(next_obs, action, state)
        cost = cost - reward_step
        obs = next_obs

    # reshape cost back to [batch_size, population_size]
    cost = torch.reshape(cost, [batch_size, -1])
    return cost
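# For context (hypothetical, not ALF code): a cost function with the
# signature used above -- cost_fn(time_step, state, ac_seqs) returning
# [batch_size, population_size] -- is what a shooting-style planner
# consumes. A minimal random-shooting sketch, assuming scalar action
# bounds `low`/`high`:
import torch


def random_shooting_plan(cost_fn, time_step, state, low, high,
                         population_size, planning_horizon, num_actions):
    """Sample candidate action sequences, score them with cost_fn and
    return the first action of the lowest-cost sequence."""
    batch_size = time_step.observation.shape[0]
    solution_dim = planning_horizon * num_actions
    # sample candidate sequences uniformly within the action bounds
    ac_seqs = low + (high - low) * torch.rand(batch_size, population_size,
                                              solution_dim)
    cost = cost_fn(time_step, state, ac_seqs)  # [B, population]
    best = cost.argmin(dim=1)  # [B]
    best_seqs = ac_seqs[torch.arange(batch_size), best]  # [B, solution_dim]
    return best_seqs[:, :num_actions]  # the first planned action step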
def test_mcts_algorithm(self):
    observation_spec = alf.TensorSpec((3, 3))
    action_spec = alf.BoundedTensorSpec((),
                                        dtype=torch.int64,
                                        minimum=0,
                                        maximum=8)
    model = TicTacToeModel()
    time_step = TimeStep(step_type=torch.tensor([StepType.MID]))

    # board situations and expected actions
    # yapf: disable
    cases = [
        ([[ 1, -1,  1],
          [ 1, -1, -1],
          [ 0,  0,  1]], 6),
        ([[ 0,  0,  0],
          [ 0, -1, -1],
          [ 0,  1,  0]], 3),
        ([[ 1, -1, -1],
          [-1, -1,  0],
          [ 0,  1,  1]], 6),
        ([[-1,  0,  1],
          [ 0, -1, -1],
          [ 0,  0,  1]], 3),
        ([[ 0,  0,  0],
          [ 0,  0,  0],
          [ 0,  0, -1]], 4),
        ([[ 0,  0,  0],
          [ 0, -1,  0],
          [ 0,  0,  0]], (0, 2, 6, 8)),
        ([[ 0,  0,  0],
          [ 0,  1, -1],
          [ 1, -1, -1]], 2),
    ]
    # yapf: enable

    def _create_mcts(observation_spec, action_spec, num_simulations):
        return MCTSAlgorithm(
            observation_spec,
            action_spec,
            discount=1.0,
            root_dirichlet_alpha=100.,
            root_exploration_fraction=0.25,
            num_simulations=num_simulations,
            pb_c_init=1.25,
            pb_c_base=19652,
            visit_softmax_temperature_fn=VisitSoftmaxTemperatureByMoves(
                [(0, 1.0), (10, 0.0001)]),
            known_value_bounds=(-1, 1),
            is_two_player_game=True)

    # test the cases serially
    for observation, action in cases:
        observation = torch.tensor([observation], dtype=torch.float32)
        state = MCTSState(steps=(observation != 0).sum(dim=(1, 2)))
        # We use a varying num_simulations instead of a fixed large number
        # such as 2000 to make the test faster.
        num_simulations = int((observation == 0).sum().cpu()) * 200
        mcts = _create_mcts(
            observation_spec, action_spec, num_simulations=num_simulations)
        mcts.set_model(model)
        alg_step = mcts.predict_step(
            time_step._replace(observation=observation), state)
        print(observation, alg_step.output, alg_step.info)
        if isinstance(action, tuple):
            self.assertTrue(alg_step.output[0] in action)
        else:
            self.assertEqual(alg_step.output[0], action)

    # test batch predict
    observation = torch.tensor([case[0] for case in cases],
                               dtype=torch.float32)
    state = MCTSState(steps=(observation != 0).sum(dim=(1, 2)))
    mcts = _create_mcts(
        observation_spec, action_spec, num_simulations=2000)
    mcts.set_model(model)
    alg_step = mcts.predict_step(
        time_step._replace(
            step_type=torch.tensor([StepType.MID] * len(cases)),
            observation=observation), state)
    for i, (observation, action) in enumerate(cases):
        if isinstance(action, tuple):
            self.assertTrue(alg_step.output[i] in action)
        else:
            self.assertEqual(alg_step.output[i], action)
def predict_step(self, time_step: TimeStep, state, epsilon_greedy):
    if self._reward_normalizer is not None:
        time_step = time_step._replace(
            reward=self._reward_normalizer.normalize(
                time_step.reward, self._reward_clip_value))
    return self._mcts.predict_step(time_step, state)