Example #1
    def predict_step(self, time_step: TimeStep, state: AgentState,
                     epsilon_greedy):
        """Predict for one step."""
        new_state = AgentState()
        observation = time_step.observation
        info = AgentInfo()

        if self._representation_learner is not None:
            repr_step = self._representation_learner.predict_step(
                time_step, state.repr)
            new_state = new_state._replace(repr=repr_step.state)
            info = info._replace(repr=repr_step.info)
            observation = repr_step.output

        if self._goal_generator is not None:
            goal_step = self._goal_generator.predict_step(
                time_step._replace(observation=observation),
                state.goal_generator, epsilon_greedy)
            new_state = new_state._replace(goal_generator=goal_step.state)
            info = info._replace(goal_generator=goal_step.info)
            observation = [observation, goal_step.output]

        rl_step = self._rl_algorithm.predict_step(
            time_step._replace(observation=observation), state.rl,
            epsilon_greedy)
        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        return AlgStep(output=rl_step.output, state=new_state, info=info)
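This example (and several below) threads results through `AgentState`/`AgentInfo` with namedtuple `_replace`, which returns a new tuple instead of mutating in place. The following is a minimal, self-contained sketch of that pattern; `AgentState`, `AgentInfo`, `AlgStep`, and `child_predict` here are simplified stand-ins for illustration, not ALF's actual definitions.

from collections import namedtuple

# Hypothetical stand-ins for ALF's AgentState/AgentInfo/AlgStep namedtuples.
AgentState = namedtuple('AgentState', ['repr', 'goal_generator', 'rl'],
                        defaults=[(), (), ()])
AgentInfo = namedtuple('AgentInfo', ['repr', 'goal_generator', 'rl'],
                       defaults=[(), (), ()])
AlgStep = namedtuple('AlgStep', ['output', 'state', 'info'],
                     defaults=[(), (), ()])

def child_predict(observation, child_state):
    # A fake sub-algorithm step: echoes its input and bumps a counter state.
    return AlgStep(output=observation, state=child_state + 1, info='ok')

def predict_step(observation, state: AgentState):
    new_state = AgentState()
    info = AgentInfo()
    repr_step = child_predict(observation, state.repr)
    # _replace returns a *new* namedtuple; the originals stay untouched.
    new_state = new_state._replace(repr=repr_step.state)
    info = info._replace(repr=repr_step.info)
    return AlgStep(output=repr_step.output, state=new_state, info=info)

step = predict_step('obs', AgentState(repr=0))
assert step.state.repr == 1 and step.state.rl == ()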
Example #2
class ICMAlgorithmTest(alf.test.TestCase):
    def setUp(self):
        self._input_tensor_spec = TensorSpec((10, ))
        self._time_step = TimeStep(
            step_type=StepType.MID,
            reward=0,
            discount=1,
            observation=self._input_tensor_spec.zeros(outer_dims=(1, )),
            prev_action=None,
            env_id=None)
        self._hidden_size = 100

    def test_discrete_action(self):
        action_spec = BoundedTensorSpec((),
                                        dtype=torch.int64,
                                        minimum=0,
                                        maximum=3)
        alg = ICMAlgorithm(action_spec=action_spec,
                           observation_spec=self._input_tensor_spec,
                           hidden_size=self._hidden_size)
        state = self._input_tensor_spec.zeros(outer_dims=(1, ))

        alg_step = alg.train_step(
            self._time_step._replace(prev_action=action_spec.zeros(
                outer_dims=(1, ))), state)

        # the inverse net should predict a uniform distribution
        self.assertTensorClose(
            torch.sum(alg_step.info.loss.extra['inverse_loss']),
            torch.as_tensor(
                math.log(action_spec.maximum - action_spec.minimum + 1)),
            epsilon=1e-4)

    def test_continuous_action(self):
        action_spec = TensorSpec((4, ))
        alg = ICMAlgorithm(action_spec=action_spec,
                           observation_spec=self._input_tensor_spec,
                           hidden_size=self._hidden_size)
        state = self._input_tensor_spec.zeros(outer_dims=(1, ))

        alg_step = alg.train_step(
            self._time_step._replace(prev_action=action_spec.zeros(
                outer_dims=(1, ))), state)

        # the inverse net should predict a zero action vector
        self.assertTensorClose(
            torch.sum(alg_step.info.loss.extra['inverse_loss']),
            torch.as_tensor(0))
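The expected value in the discrete-action assertion above comes from the fact that the test feeds all-zero observations and actions, so an untrained inverse head (presumably with zero-initialized biases) outputs near-zero logits, i.e. a uniform distribution over the K = maximum - minimum + 1 actions, whose cross-entropy is log K. A quick stand-alone check in plain PyTorch, independent of ALF:

import math
import torch
import torch.nn.functional as F

K = 4                               # number of discrete actions (0..3)
logits = torch.zeros(1, K)          # zero logits -> uniform softmax
target = torch.tensor([0])          # any target class gives the same loss
loss = F.cross_entropy(logits, target)
assert torch.isclose(loss, torch.tensor(math.log(K)), atol=1e-6)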
Example #3
class DIAYNAlgorithmTest(alf.test.TestCase):
    def setUp(self):
        input_tensor_spec = TensorSpec((10, ))
        self._time_step = TimeStep(
            step_type=torch.tensor(StepType.MID, dtype=torch.int32),
            reward=0,
            discount=1,
            observation=input_tensor_spec.zeros(outer_dims=(1, )),
            prev_action=None,
            env_id=None)
        self._encoding_net = EncodingNetwork(
            input_tensor_spec=input_tensor_spec)

    def test_discrete_skill_loss(self):
        skill_spec = BoundedTensorSpec((),
                                       dtype=torch.int64,
                                       minimum=0,
                                       maximum=3)
        alg = DIAYNAlgorithm(skill_spec=skill_spec,
                             encoding_net=self._encoding_net)
        skill = state = torch.nn.functional.one_hot(
            skill_spec.zeros(outer_dims=(1, )),
            int(skill_spec.maximum - skill_spec.minimum + 1)).to(torch.float32)

        alg_step = alg.train_step(
            self._time_step._replace(
                observation=[self._time_step.observation, skill]), state)

        # the discriminator should predict a uniform distribution
        self.assertTensorClose(torch.sum(alg_step.info.loss),
                               torch.as_tensor(
                                   math.log(skill_spec.maximum -
                                            skill_spec.minimum + 1)),
                               epsilon=1e-4)

    def test_continuous_skill_loss(self):
        skill_spec = TensorSpec((4, ))
        alg = DIAYNAlgorithm(skill_spec=skill_spec,
                             encoding_net=self._encoding_net)
        skill = state = skill_spec.zeros(outer_dims=(1, ))

        alg_step = alg.train_step(
            self._time_step._replace(
                observation=[self._time_step.observation, skill]), state)

        # the discriminator should predict a zero skill vector
        self.assertTensorClose(torch.sum(alg_step.info.loss),
                               torch.as_tensor(0))
Example #4
    def rollout_step(self, time_step: TimeStep, state: AgentState):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()

        time_step = transform_nest(time_step, "observation",
                                   self._observation_transformer)

        subtrajectory = self._skill_generator.update_disc_subtrajectory(
            time_step, state.skill_generator)

        skill_step = self._skill_generator.rollout_step(
            time_step, state.skill_generator)
        new_state = new_state._replace(skill_generator=skill_step.state)
        info = info._replace(skill_generator=skill_step.info)

        observation = self._make_low_level_observation(
            subtrajectory, skill_step.output, skill_step.info.switch_skill,
            skill_step.state.steps,
            skill_step.state.discriminator.first_observation)

        rl_step = self._rl_algorithm.rollout_step(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        skill_discount = ((
            (skill_step.state.steps == 1)
            & (time_step.step_type != StepType.LAST)).to(torch.float32) *
                          (1 - self._skill_boundary_discount))
        info = info._replace(skill_discount=1 - skill_discount)

        return AlgStep(output=rl_step.output, state=new_state, info=info)
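The double negation at the end is easy to misread: assuming `steps` counts steps since the current skill began, the effective discount stored in the info is `_skill_boundary_discount` exactly when a new skill has just started (`steps == 1`) inside an episode, and 1.0 everywhere else, including at `StepType.LAST`. A small numeric check of that arithmetic, with arbitrarily chosen example values:

import torch

skill_boundary_discount = 0.9                  # example hyperparameter value
steps = torch.tensor([1, 1, 3])                # steps since the skill switched
is_last = torch.tensor([False, True, False])   # time_step.step_type == LAST

at_boundary = (steps == 1) & ~is_last
skill_discount = at_boundary.to(torch.float32) * (1 - skill_boundary_discount)
effective = 1 - skill_discount
# effective == tensor([0.9000, 1.0000, 1.0000])
assert torch.allclose(effective, torch.tensor([0.9, 1.0, 1.0]))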
Example #5
    def predict_step(self, time_step: TimeStep, state: AgentState,
                     epsilon_greedy):
        """Predict for one step."""
        new_state = AgentState()

        time_step = transform_nest(time_step, "observation",
                                   self._observation_transformer)

        subtrajectory = self._skill_generator.update_disc_subtrajectory(
            time_step, state.skill_generator)

        skill_step = self._skill_generator.predict_step(
            time_step, state.skill_generator, epsilon_greedy)
        new_state = new_state._replace(skill_generator=skill_step.state)

        observation = self._make_low_level_observation(
            subtrajectory, skill_step.output, skill_step.info.switch_skill,
            skill_step.state.steps,
            skill_step.state.discriminator.first_observation)

        rl_step = self._rl_algorithm.predict_step(
            time_step._replace(observation=observation), state.rl,
            epsilon_greedy)
        new_state = new_state._replace(rl=rl_step.state)

        return AlgStep(output=rl_step.output, state=new_state)
Example #6
    def rollout_step(self, time_step: TimeStep, state):
        if self._reward_normalizer is not None:
            self._reward_normalizer.update(time_step.reward)
            time_step = time_step._replace(
                reward=self._reward_normalizer.normalize(
                    time_step.reward, self._reward_clip_value))
        return self._mcts.predict_step(time_step, state)
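The `rollout_step` above assumes a reward normalizer object exposing `update()` and `normalize(value, clip_value)`. The sketch below is a simplified stand-in (a running mean/std normalizer with clipping), not ALF's actual normalizer implementation, shown only to make that interface concrete.

import torch

class RunningRewardNormalizer:
    """Simplified stand-in: normalize rewards by running mean/std, then clip."""

    def __init__(self, eps=1e-8):
        self._count = 0
        self._mean = 0.0
        self._m2 = 0.0          # sum of squared deviations (Welford's method)
        self._eps = eps

    def update(self, reward: torch.Tensor):
        for r in reward.reshape(-1).tolist():
            self._count += 1
            delta = r - self._mean
            self._mean += delta / self._count
            self._m2 += delta * (r - self._mean)

    def normalize(self, reward: torch.Tensor, clip_value: float):
        std = (self._m2 / max(self._count, 1)) ** 0.5
        normalized = (reward - self._mean) / (std + self._eps)
        return normalized.clamp(-clip_value, clip_value)

normalizer = RunningRewardNormalizer()
normalizer.update(torch.tensor([1.0, 2.0, 3.0, 4.0]))
print(normalizer.normalize(torch.tensor([2.0]), clip_value=5.0))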
Example #7
    def rollout_step(self, time_step: TimeStep, state: AgentState):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()
        observation = time_step.observation

        if self._representation_learner is not None:
            repr_step = self._representation_learner.rollout_step(
                time_step, state.repr)
            new_state = new_state._replace(repr=repr_step.state)
            info = info._replace(repr=repr_step.info)
            observation = repr_step.output

        if self._goal_generator is not None:
            goal_step = self._goal_generator.rollout_step(
                time_step._replace(observation=observation),
                state.goal_generator)
            new_state = new_state._replace(goal_generator=goal_step.state)
            info = info._replace(goal_generator=goal_step.info)
            observation = [observation, goal_step.output]

        rl_step = self._rl_algorithm.rollout_step(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._irm is not None:
            irm_step = self._irm.rollout_step(
                time_step._replace(observation=observation), state=state.irm)
            info = info._replace(irm=irm_step.info)
            new_state = new_state._replace(irm=irm_step.state)

        if self._entropy_target_algorithm:
            assert 'action_distribution' in rl_step.info._fields, (
                "AlgStep from rl_algorithm.rollout() does not contain "
                "`action_distribution`, which is required by "
                "`enforce_entropy_target`")
            et_step = self._entropy_target_algorithm.rollout_step(
                rl_step.info.action_distribution,
                step_type=time_step.step_type,
                on_policy_training=self.is_on_policy())
            info = info._replace(entropy_target=et_step.info)

        return AlgStep(output=rl_step.output, state=new_state, info=info)
Example #8
    def predict_step(self, time_step: TimeStep, state, epsilon_greedy):
        mbp_step = self._mbp.predict_step(
            inputs=(time_step.observation, time_step.prev_action),
            state=state.mbp_state)
        mba_step = self._mba.predict_step(
            time_step=time_step._replace(observation=mbp_step.output),
            state=state.mba_state,
            epsilon_greedy=epsilon_greedy)
        return AlgStep(
            output=mba_step.output,
            state=MerlinState(
                mbp_state=mbp_step.state, mba_state=mba_step.state),
            info=())
Example #9
    def rollout_step(self, time_step: TimeStep, state):
        """Train one step."""
        mbp_step = self._mbp.train_step(
            inputs=(time_step.observation, time_step.prev_action),
            state=state.mbp_state)
        mba_step = self._mba.rollout_step(
            time_step=time_step._replace(observation=mbp_step.output),
            state=state.mba_state)

        return AlgStep(
            output=mba_step.output,
            state=MerlinState(
                mbp_state=mbp_step.state, mba_state=mba_step.state),
            info=MerlinInfo(mbp_info=mbp_step.info, mba_info=mba_step.info))
Example #10
    def _predict_multi_step_cost(self, observation, actions):
        """Compute the total cost by unrolling multiple steps according to
            the given initial observation and multi-step actions.
        Args:
            observation: the current observation for predicting quantities of
                future time steps
            actions (Tensor): a set of action sequences to
                shape [B, population, unroll_steps, action_dim]
        Returns:
            cost (Tensor): negation of accumulated predicted reward, with
                the shape of [B, population]
        """
        batch_size, population_size, num_unroll_steps = actions.shape[0:3]

        state = self.get_initial_predict_state(batch_size)
        time_step = TimeStep()
        dyn_state = state.dynamics._replace(feature=observation)
        dyn_state = nest.map_structure(
            partial(self._expand_to_population,
                    population_size=population_size), dyn_state)

        # expand to particles
        dyn_state = nest.map_structure(self._expand_to_particles, dyn_state)
        reward_state = state.reward
        reward = 0
        for i in range(num_unroll_steps):
            action = actions[:, :, i, ...].view(-1, actions.shape[3])
            action = self._expand_to_particles(action)
            time_step = time_step._replace(prev_action=action)
            time_step, dyn_state = self._predict_next_step(
                time_step, dyn_state)
            next_obs = time_step.observation
            # Note: currently using (next_obs, action), might need to
            # consider (obs, action) in order to be more compatible
            # with the conventional definition of the reward function
            reward_step, reward_state = self._calc_step_reward(
                next_obs, action, reward_state)
            reward = reward + reward_step
        cost = -reward
        # reshape cost
        # [B*par, n] -> [B, par*n]
        cost = cost.reshape(
            -1, self._particles_per_replica * self._num_dynamics_replicas)
        cost = cost.mean(-1)

        # reshape cost back to [batch size, population_size]
        cost = torch.reshape(cost, [batch_size, -1])

        return cost
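The core of the method above is the unroll loop: step a learned dynamics model forward with each planned action, accumulate the predicted per-step reward, and return its negation as the cost the planner minimizes. Below is a stripped-down sketch of just that loop; the helper name `predict_multi_step_cost` and the toy `dynamics_fn`/`reward_fn` are made up for illustration and stand in for the learned models (no particle/ensemble handling).

import torch

def predict_multi_step_cost(observation, actions, dynamics_fn, reward_fn):
    """actions: [B, population, unroll_steps, action_dim] -> cost [B, population]."""
    batch_size, population, unroll_steps, action_dim = actions.shape
    # Flatten (batch, population) so the models see one big batch.
    obs = observation.unsqueeze(1).expand(-1, population, -1).reshape(
        batch_size * population, -1)
    reward = 0.
    for i in range(unroll_steps):
        action = actions[:, :, i, :].reshape(batch_size * population, action_dim)
        obs = dynamics_fn(obs, action)              # predicted next observation
        reward = reward + reward_fn(obs, action)    # accumulate predicted reward
    return (-reward).reshape(batch_size, population)

# Toy models: linear dynamics, reward = -squared distance to the origin.
dynamics_fn = lambda obs, act: obs + act
reward_fn = lambda obs, act: -(obs ** 2).sum(-1)

cost = predict_multi_step_cost(
    observation=torch.zeros(2, 3),            # [B=2, obs_dim=3]
    actions=torch.randn(2, 5, 4, 3),          # [B, population=5, steps=4, act_dim=3]
    dynamics_fn=dynamics_fn, reward_fn=reward_fn)
print(cost.shape)                             # torch.Size([2, 5])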
Example #11
    def _calc_cost_for_action_sequence(self, time_step: TimeStep, state,
                                       ac_seqs):
        """
        Args:
            time_step (TimeStep): input data for next step prediction
            state (MbrlState): input state for next step prediction
            ac_seqs (Tensor): action sequences of shape [batch_size,
                population_size, solution_dim], where
                solution_dim = planning_horizon * num_actions
        Returns:
            cost (Tensor) with shape [batch_size, population_size]
        """
        obs = time_step.observation
        batch_size = obs.shape[0]

        ac_seqs = torch.reshape(
            ac_seqs,
            [batch_size, self._population_size, self._planning_horizon, -1])

        ac_seqs = ac_seqs.permute(2, 0, 1, 3)
        ac_seqs = torch.reshape(
            ac_seqs, (self._planning_horizon, -1, self._num_actions))

        state = state._replace(dynamics=state.dynamics._replace(feature=obs))
        init_obs = self._expand_to_population(obs)
        state = nest.map_structure(self._expand_to_population, state)

        obs = init_obs
        cost = 0
        for i in range(ac_seqs.shape[0]):
            action = ac_seqs[i]
            time_step = time_step._replace(prev_action=action)
            time_step, state = self._dynamics_func(time_step, state)
            next_obs = time_step.observation
            # Note: currently using (next_obs, action), might need to
            # consider (obs, action) in order to be more compatible
            # with the conventional definition of the reward function
            reward_step, state = self._reward_func(next_obs, action, state)
            cost = cost - reward_step
            obs = next_obs

        # reshape cost back to [batch size, population_size]
        cost = torch.reshape(cost, [batch_size, -1])
        return cost
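The two reshapes plus the permute at the top of `_calc_cost_for_action_sequence` turn the planner's flat solutions into a per-step batch: [B, P, H*A] -> [B, P, H, A] -> [H, B, P, A] -> [H, B*P, A], so that indexing `ac_seqs[i]` yields the i-th action for every (batch, population) pair at once. A quick shape check with made-up sizes:

import torch

B, P, H, A = 2, 5, 3, 4          # batch, population, planning horizon, actions
ac_seqs = torch.randn(B, P, H * A)               # flat planner solutions

ac_seqs = ac_seqs.reshape(B, P, H, -1)           # [B, P, H, A]
ac_seqs = ac_seqs.permute(2, 0, 1, 3)            # [H, B, P, A]
ac_seqs = ac_seqs.reshape(H, -1, A)              # [H, B*P, A]

assert ac_seqs.shape == (H, B * P, A)
assert ac_seqs[0].shape == (B * P, A)            # one action per rollout copy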
Example #12
    def test_mcts_algorithm(self):
        observation_spec = alf.TensorSpec((3, 3))
        action_spec = alf.BoundedTensorSpec((),
                                            dtype=torch.int64,
                                            minimum=0,
                                            maximum=8)
        model = TicTacToeModel()
        time_step = TimeStep(step_type=torch.tensor([StepType.MID]))

        # board situations and expected actions
        # yapf: disable
        cases = [
            ([[1, -1,  1],
              [1, -1, -1],
              [0,  0,  1]], 6),
            ([[0,  0,  0],
              [0, -1, -1],
              [0,  1,  0]], 3),
            ([[ 1, -1, -1],
              [-1, -1,  0],
              [ 0,  1,  1]], 6),
            ([[-1,  0,  1],
              [ 0, -1, -1],
              [ 0,  0,  1]], 3),
            ([[0, 0,  0],
              [0, 0,  0],
              [0, 0, -1]], 4),
            ([[0,  0, 0],
              [0, -1, 0],
              [0,  0, 0]], (0, 2, 6, 8)),
            ([[0,  0,  0],
              [0,  1, -1],
              [1, -1, -1]], 2),
        ]
        # yapf: enable

        def _create_mcts(observation_spec, action_spec, num_simulations):
            return MCTSAlgorithm(
                observation_spec,
                action_spec,
                discount=1.0,
                root_dirichlet_alpha=100.,
                root_exploration_fraction=0.25,
                num_simulations=num_simulations,
                pb_c_init=1.25,
                pb_c_base=19652,
                visit_softmax_temperature_fn=VisitSoftmaxTemperatureByMoves(
                    [(0, 1.0), (10, 0.0001)]),
                known_value_bounds=(-1, 1),
                is_two_player_game=True)

        # test each case serially
        for observation, action in cases:
            observation = torch.tensor([observation], dtype=torch.float32)
            state = MCTSState(steps=(observation != 0).sum(dim=(1, 2)))
            # We use a varying num_simulations instead of a fixed large number
            # such as 2000 to make the test faster.
            num_simulations = int((observation == 0).sum().cpu()) * 200
            mcts = _create_mcts(
                observation_spec, action_spec, num_simulations=num_simulations)
            mcts.set_model(model)
            alg_step = mcts.predict_step(
                time_step._replace(observation=observation), state)
            print(observation, alg_step.output, alg_step.info)
            if type(action) == tuple:
                self.assertTrue(alg_step.output[0] in action)
            else:
                self.assertEqual(alg_step.output[0], action)

        # test batch predict
        observation = torch.tensor([case[0] for case in cases],
                                   dtype=torch.float32)
        state = MCTSState(steps=(observation != 0).sum(dim=(1, 2)))
        mcts = _create_mcts(
            observation_spec, action_spec, num_simulations=2000)
        mcts.set_model(model)
        alg_step = mcts.predict_step(
            time_step._replace(
                step_type=torch.tensor([StepType.MID] * len(cases)),
                observation=observation), state)
        for i, (observation, action) in enumerate(cases):
            if type(action) == tuple:
                self.assertTrue(alg_step.output[i] in action)
            else:
                self.assertEqual(alg_step.output[i], action)
Example #13
    def predict_step(self, time_step: TimeStep, state, epsilon_greedy):
        if self._reward_normalizer is not None:
            time_step = time_step._replace(
                reward=self._reward_normalizer.normalize(
                    time_step.reward, self._reward_clip_value))
        return self._mcts.predict_step(time_step, state)