Code Example #1
    def test_step(self):
        tf_env = tf_py_environment.TFPyEnvironment(
            suite_gym.load('CartPole-v0'))
        indexed_tf_env = IndexedTFEnv(tf_env, 5)
        # take first action
        a1 = PolicyStep(action=tf.convert_to_tensor([1]), state=(), info=())
        time_step_0 = indexed_tf_env.step(a1)
        self.assertEqual(time_step_0["env_id"], 5)
        self.assertEqual(time_step_0["ts_id"], 0)
        self.assertEqual(time_step_0["reward"], 0)
        self.assertEqual(time_step_0["step_type"], 0)
        self.assertEqual(time_step_0["discount"], 1.0)
        self.assertTrue("ob_0" in time_step_0)
        self.assertTrue("ob_1" in time_step_0)
        self.assertTrue("ob_2" in time_step_0)
        self.assertTrue("ob_3" in time_step_0)

        # take second action
        a2 = PolicyStep(action=tf.convert_to_tensor([0]), state=(), info=())
        time_step_1 = indexed_tf_env.step(a2)
        self.assertEqual(time_step_1["env_id"], 5)
        self.assertEqual(time_step_1["ts_id"], 1)
        self.assertEqual(time_step_1["reward"], 1)
        self.assertEqual(time_step_1["step_type"], 1)
        self.assertEqual(time_step_1["discount"], 1.0)
        self.assertTrue("ob_0" in time_step_1)
        self.assertTrue("ob_1" in time_step_1)
        self.assertTrue("ob_2" in time_step_1)
        self.assertTrue("ob_3" in time_step_1)
Code Example #2
    def _actor_train_step(self, exp: Experience, state: DdpgActorState):
        action, actor_state = self._actor_network(exp.observation,
                                                  exp.step_type,
                                                  network_state=state.actor)

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(action)
            q_value, critic_state = self._critic_network(
                (exp.observation, action), network_state=state.critic)

        dqda = tape.gradient(q_value, action)

        def actor_loss_fn(dqda, action):
            if self._dqda_clipping:
                dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                        self._dqda_clipping)
            loss = 0.5 * losses.element_wise_squared_loss(
                tf.stop_gradient(dqda + action), action)
            loss = tf.reduce_sum(loss, axis=list(range(1, len(loss.shape))))
            return loss

        actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
        state = DdpgActorState(actor=actor_state, critic=critic_state)
        info = LossInfo(loss=tf.add_n(tf.nest.flatten(actor_loss)),
                        extra=actor_loss)
        return PolicyStep(action=action, state=state, info=info)
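
The actor_loss_fn above is the standard DPG surrogate: since tf.stop_gradient(dqda + action) is treated as a constant, the gradient of 0.5 * (stop_gradient(dqda + action) - action)^2 with respect to action is exactly -dqda, so minimizing the loss moves the action along the critic's gradient. A standalone sketch checking that identity on a toy tensor (the variable names here are illustrative, not part of the example):

import tensorflow as tf

a = tf.Variable([0.3, -0.2])
dqda = tf.constant([1.0, -2.0])          # stand-in for dQ/da from the critic
with tf.GradientTape() as tape:
    loss = 0.5 * tf.reduce_sum(
        tf.square(tf.stop_gradient(dqda + a) - a))
print(tape.gradient(loss, a).numpy())    # [-1.  2.], i.e. -dqda
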
Code Example #3
File: sac_agent_test.py  Project: zircote/agents
 def action(self, time_step):
     observation = time_step.observation
     batch_size = observation.shape[0]
     action = tf.constant(self._action,
                          dtype=tf.float32,
                          shape=[batch_size, 1])
     return PolicyStep(action=action)
Code Example #4
    def rollout(self, time_step: ActionTimeStep, state: AgentState):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(time_step)
        if self._icm is not None:
            icm_step = self._icm.train_step(
                (observation, time_step.prev_action), state=state.icm)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.rollout(
            time_step._replace(observation=observation), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        # TODO: avoid computing this during rollout for off-policy training
        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.action, step_type=time_step.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
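
AgentState() and AgentInfo() are created empty above and then filled field by field with _replace as each sub-algorithm produces its piece. A stand-in sketch of that idiom with a hypothetical namedtuple (the field names rl and icm mirror the example, but the type itself is not ALF's):

import collections

AgentState = collections.namedtuple("AgentState", ["rl", "icm"],
                                    defaults=[(), ()])
state = AgentState()                     # AgentState(rl=(), icm=())
state = state._replace(rl="rl_state")    # fill in sub-states as they are computed
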
Code Example #5
 def predict(self, time_step: ActionTimeStep, state=None):
     observation = self._encode(time_step)
     action_distribution, actor_state = self._actor_network(
         observation, step_type=time_step.step_type, network_state=state)
     return PolicyStep(action=action_distribution,
                       state=actor_state,
                       info=())
Code Example #6
File: merlin_algorithm.py  Project: runjerry/alf
    def rollout(self,
                time_step: ActionTimeStep,
                state,
                mode,
                epsilon_greedy=1.0):
        """Train one step.

        Args:
            time_step: time_step.observation should be the latent vector
            state: state of the model
        """
        latent_vector = time_step.observation
        rnn_output, rnn_state = self._rnn(latent_vector, state)
        mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
        policy_input = tf.concat(
            [tf.stop_gradient(latent_vector), rnn_output, mem_readout],
            axis=-1)
        action_distribution, _ = self._actor_net(policy_input,
                                                 step_type=time_step.step_type,
                                                 network_state=None)

        value, _ = self._value_net(latent_vector,
                                   step_type=time_step.step_type,
                                   network_state=None)

        info = ActorCriticInfo(action_distribution=action_distribution,
                               value=value)
        action = common.epsilon_greedy_sample(action_distribution,
                                              epsilon_greedy)
        return PolicyStep(action=action, state=rnn_state, info=info)
Code Example #7
File: sac_algorithm.py  Project: zhaoyinfu123/alf
 def _rollout_partial_state(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(time_step.observation,
                                         step_type=time_step.step_type,
                                         network_state=state.share.actor)
     empty_state = tf.nest.map_structure(lambda x: (),
                                         self.train_state_spec)
     state = empty_state._replace(share=SacShareState(actor=state))
     return PolicyStep(action=action, state=state, info=())
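
Another recurring idiom: build an "empty" copy of train_state_spec with tf.nest.map_structure(lambda x: (), ...) and then _replace only the sub-state this method actually produces. A self-contained sketch with a hypothetical spec structure:

import collections
import tensorflow as tf

SacState = collections.namedtuple("SacState", ["share", "actor", "critic"])
train_state_spec = SacState(share="share_spec", actor="actor_spec",
                            critic="critic_spec")
empty_state = tf.nest.map_structure(lambda x: (), train_state_spec)
# SacState(share=(), actor=(), critic=()); the produced sub-state is then
# filled in via empty_state._replace(share=...).
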
Code Example #8
    def predict(self, time_step: ActionTimeStep, state: ActorCriticState):
        """Predict for one step."""
        action_distribution, actor_state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.actor)

        return PolicyStep(action=action_distribution,
                          state=ActorCriticState(actor=actor_state),
                          info=())
Code Example #9
    def greedy_predict(self, time_step: ActionTimeStep, state=None, eps=0.1):
        observation = self._encode(time_step)

        new_state = AgentState()

        rl_step = self._rl_algorithm.greedy_predict(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Code Example #10
File: ddpg_algorithm.py  Project: ruizhaogit/alf
 def greedy_predict(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(
         time_step.observation,
         step_type=time_step.step_type,
         network_state=state.actor.actor)
     empty_state = tf.nest.map_structure(lambda x: (),
                                         self.train_state_spec)
     state = empty_state._replace(
         actor=DdpgActorState(actor=state, critic=()))
     return PolicyStep(action=action, state=state, info=())
Code Example #11
    def predict(self, time_step: ActionTimeStep, state: AgentState):
        """Predict for one step."""
        observation = self._encode(time_step)

        new_state = AgentState()

        rl_step = self._rl_algorithm.predict(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Code Example #12
    def rollout(self, time_step: ActionTimeStep, state):
        """Train one step."""
        mbp_step = self._mbp.train_step(inputs=(time_step.observation,
                                                time_step.prev_action),
                                        state=state.mbp_state)
        mba_step = self._mba.rollout(
            time_step=time_step._replace(observation=mbp_step.outputs),
            state=state.mba_state)

        return PolicyStep(action=mba_step.action,
                          state=MerlinState(mbp_state=mbp_step.state,
                                            mba_state=mba_step.state),
                          info=MerlinInfo(mbp_info=mbp_step.info,
                                          mba_info=mba_step.info))
Code Example #13
    def rollout(self, time_step: ActionTimeStep, state: ActorCriticState):
        """Rollout for one step."""
        value, value_state = self._value_network(time_step.observation,
                                                 step_type=time_step.step_type,
                                                 network_state=state.value)

        action_distribution, actor_state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.actor)

        return PolicyStep(action=action_distribution,
                          state=ActorCriticState(actor=actor_state,
                                                 value=value_state),
                          info=ActorCriticInfo(value=value))
Code Example #14
def test_eval_logger():
    """
    Tests the per step logging mediated through a custom TensorFlow metric.

    Note that due to the fact that TensorFlow places logging in a graph built through C++ which is
    only triggered when tensors are evaluated it is very difficult to capture the logging message
    even through using mocked output streams. Therefore, the test checks the attributes that can be
    tested and logs expected logging values for by-eye comparison. This is a fairly simple case
    since the logging code is simple but the test is in this sense in complete.
    """
    # Set up the logger using default parameters.
    logger = EvalPerStepLogger()
    # Test that the time step counter is initialised to zero.
    assert logger._t == 0

    # Build one time step's worth of data to be logged.
    observation = tf.convert_to_tensor(np.random.randint(10, size=(1, 1)),
                                       dtype=tf.float32)
    action = tf.convert_to_tensor(np.eye(2)[np.random.randint(2)])
    reward = -1 * observation
    discount = tf.convert_to_tensor(np.array([0.99]))

    # The logger takes a tuple of (TimeStep, PolicyStep, TimeStep); the second TimeStep represents
    # the next period and is not used, so we simply pass a copy of the original time step.
    time_step = ts.TimeStep(ts.StepType(1), reward, discount, observation)
    policy_step = PolicyStep(action, state=(), info=())
    next_time_step = copy.deepcopy(time_step)
    # Collect the data in a tuple as required by the logger.
    time_step_data = (time_step, policy_step, next_time_step)

    # Print the expected logging term for comparison by eye.
    tf.print("\nExpected Values\nStep: ",
             0,
             "\t",
             "State: ",
             observation,
             "\t",
             "Action: ",
             action,
             end="\n",
             output_stream=sys.stdout)
    # Run the logging for a single time step.
    logger(time_step_data)
    # Check that the time step counter has incremented.
    assert logger._t == 1
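
The TimeStep above is built positionally; in tf_agents the field order is (step_type, reward, discount, observation). A keyword-argument version of the same construction, assuming ts is tf_agents.trajectories.time_step:

import tensorflow as tf
from tf_agents.trajectories import time_step as ts

observation = tf.zeros([1, 1], dtype=tf.float32)
time_step = ts.TimeStep(step_type=ts.StepType(1),      # mid-episode step
                        reward=-1.0 * observation,
                        discount=tf.constant([0.99]),
                        observation=observation)
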
Code Example #15
File: agent.py  Project: runjerry/alf
    def predict(self, time_step: ActionTimeStep, state: AgentState,
                epsilon_greedy):
        """Predict for one step."""
        observation = self._encode(time_step)

        new_state = AgentState()
        if self._goal_generator is not None:
            goal_step = self._goal_generator.predict(
                time_step._replace(observation=observation),
                state.goal_generator, epsilon_greedy)
            new_state = new_state._replace(goal_generator=goal_step.state)
            observation = [observation, goal_step.action]

        rl_step = self._rl_algorithm.predict(
            time_step._replace(observation=observation), state.rl,
            epsilon_greedy)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Code Example #16
File: sac_algorithm.py  Project: mathkobe/alf
    def train_step(self, exp: Experience, state: SacState):
        action_distribution, share_actor_state = self._actor_network(
            exp.observation,
            step_type=exp.step_type,
            network_state=state.share.actor)
        action = tf.nest.map_structure(lambda d: d.sample(),
                                       action_distribution)
        log_pi = tfa_common.log_probability(action_distribution, action,
                                            self._action_spec)

        actor_state, actor_info = self._actor_train_step(
            exp, state.actor, action_distribution, action, log_pi)
        critic_state, critic_info = self._critic_train_step(
            exp, state.critic, action, log_pi)
        alpha_info = self._alpha_train_step(log_pi)
        state = SacState(share=SacShareState(actor=share_actor_state),
                         actor=actor_state,
                         critic=critic_state)
        info = SacInfo(actor=actor_info, critic=critic_info, alpha=alpha_info)
        return PolicyStep(action_distribution, state, info)
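
train_step first samples an action from the actor's output distribution and evaluates its log-probability, which the actor, critic and alpha losses all consume. A stripped-down sketch of that sampling pattern with a plain tensorflow_probability distribution (illustrative, not the agent's actor network output):

import tensorflow_probability as tfp

dist = tfp.distributions.Normal(loc=[0.0], scale=[1.0])
action = dist.sample()             # draw a ~ pi(.|s)
log_pi = dist.log_prob(action)     # log pi(a|s) for the SAC losses
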
Code Example #17
    def predict(self, time_step: ActionTimeStep, state, epsilon_greedy):
        action, state = self._actor_network(time_step.observation,
                                            step_type=time_step.step_type,
                                            network_state=state.actor.actor)
        empty_state = tf.nest.map_structure(lambda x: (),
                                            self.train_state_spec)

        def _sample(a, ou):
            return tf.cond(
                tf.less(tf.random.uniform((), 0, 1), epsilon_greedy),
                lambda: a + ou(), lambda: a)

        noisy_action = tf.nest.map_structure(_sample, action, self._ou_process)
        noisy_action = tf.nest.map_structure(tfa_common.clip_to_spec,
                                             noisy_action, self._action_spec)
        state = empty_state._replace(
            actor=DdpgActorState(actor=state, critic=()))
        return PolicyStep(action=noisy_action,
                          state=state,
                          info=DdpgInfo(action_distribution=action))
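
The exploration step above adds OU noise with probability epsilon_greedy and then clips the noisy action back into the bounds of the action spec. A hand-rolled sketch of just the clipping, assuming a BoundedTensorSpec like the one clip_to_spec expects (values are illustrative):

import tensorflow as tf
from tf_agents.specs import tensor_spec

spec = tensor_spec.BoundedTensorSpec([1], tf.float32,
                                     minimum=-1.0, maximum=1.0)
noisy_action = tf.constant([[1.3]])
clipped = tf.clip_by_value(noisy_action, spec.minimum, spec.maximum)
# clipped == [[1.0]]; clip_to_spec applies the same bounds read from the spec.
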
Code Example #18
    def rollout(self, time_step: ActionTimeStep, state=None):
        observation = self._encode(time_step)

        value, value_state = self._value_network(
            observation,
            step_type=time_step.step_type,
            network_state=state.value_state)
        # ValueRnnNetwork will add a time dim to value
        # See value_rnn_network.py L153
        if isinstance(self._value_network, ValueRnnNetwork):
            value = tf.squeeze(value, axis=1)

        action_distribution, actor_state = self._actor_network(
            observation,
            step_type=time_step.step_type,
            network_state=state.actor_state)

        info = ActorCriticInfo(value=value,
                               icm_reward=(),
                               icm_info=(),
                               entropy_target_info=())
        if self._icm is not None:
            icm_step = self._icm.train_step(
                (observation, time_step.prev_action), state=state.icm_state)
            info = info._replace(icm_reward=icm_step.outputs,
                                 icm_info=icm_step.info)
            icm_state = icm_step.state
        else:
            icm_state = ()

        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                action_distribution)
            info = info._replace(entropy_target_info=et_step.info)

        state = ActorCriticState(actor_state=actor_state,
                                 value_state=value_state,
                                 icm_state=icm_state)

        return PolicyStep(action=action_distribution, state=state, info=info)
Code Example #19
File: agent.py  Project: runjerry/alf
    def rollout(self, time_step: ActionTimeStep, state: AgentState, mode):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(time_step)

        if self._goal_generator is not None:
            goal_step = self._goal_generator.rollout(
                time_step._replace(observation=time_step.observation),
                state.goal_generator, mode)
            new_state = new_state._replace(goal_generator=goal_step.state)
            info = info._replace(goal_generator=goal_step.info)
            observation = [observation, goal_step.action]

        if self._icm is not None:
            icm_step = self._icm.train_step(
                time_step._replace(observation=observation), state=state.icm)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.rollout(
            time_step._replace(observation=observation), state.rl, mode)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            # TODO: For off-policy training, skip entropy_target_algorithm
            # during rollout()
            assert 'action_distribution' in rl_step.info._fields, (
                "PolicyStep from rl_algorithm.rollout() does not contain "
                "`action_distribution`, which is required by "
                "`enforce_entropy_target`")
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.info.action_distribution,
                step_type=time_step.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
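
The assert works because rl_step.info is a namedtuple, so field presence can be checked against its _fields attribute. A tiny stand-alone illustration (the Info type here is hypothetical):

import collections

Info = collections.namedtuple("Info", ["action_distribution", "value"])
info = Info(action_distribution=None, value=None)
assert "action_distribution" in info._fields
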
Code Example #20
File: off_policy_driver.py  Project: ruizhaogit/alf
    def _prepare_specs(self, algorithm):
        """Prepare various tensor specs."""

        time_step = self.get_initial_time_step()
        self._time_step_spec = common.extract_spec(time_step)
        self._action_spec = self._env.action_spec()

        policy_step = algorithm.rollout(
            algorithm.transform_timestep(time_step), self._initial_state)
        info_spec = common.extract_spec(policy_step.info)
        self._policy_step_spec = PolicyStep(
            action=self._action_spec,
            state=algorithm.train_state_spec,
            info=info_spec)

        self._action_distribution_spec = tf.nest.map_structure(
            common.to_distribution_spec, algorithm.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        algorithm.prepare_off_policy_specs(time_step)
Code Example #21
    def train_step(self, time_step: ActionTimeStep, state):
        """Train one step.

        Args:
            time_step: time_step.observation should be the latent vector
            state: state of the model
        """
        latent_vector = time_step.observation
        rnn_output, rnn_state = self._rnn(latent_vector, state)
        mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
        policy_input = tf.concat(
            [tf.stop_gradient(latent_vector), rnn_output, mem_readout],
            axis=-1)
        action_distribution, _ = self._actor_net(
            policy_input, step_type=time_step.step_type, network_state=None)

        value, _ = self._value_net(
            latent_vector, step_type=time_step.step_type, network_state=None)

        info = ActorCriticInfo(
            value=value, icm_reward=(), icm_info=(), entropy_target_info=())
        return PolicyStep(
            action=action_distribution, state=rnn_state, info=info)
Code Example #22
File: agent.py  Project: runjerry/alf
    def train_step(self, exp: Experience, state):
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(exp)

        if self._goal_generator is not None:
            goal_step = self._goal_generator.train_step(
                exp._replace(observation=observation), state.goal_generator)
            info = info._replace(goal_generator=goal_step.info)
            new_state = new_state._replace(goal_generator=goal_step.state)
            observation = [observation, goal_step.action]

        if self._icm is not None:
            icm_step = self._icm.train_step(
                exp._replace(observation=observation),
                state=state.icm,
                calc_intrinsic_reward=False)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.train_step(
            exp._replace(observation=observation,
                         rollout_info=exp.rollout_info.rl), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            assert 'action_distribution' in rl_step.info._fields, (
                "PolicyStep from rl_algorithm.train_step() does not contain "
                "`action_distribution`, which is required by "
                "`enforce_entropy_target`")
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.info.action_distribution, step_type=exp.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Code Example #23
    def train_step(self, exp: Experience, state):
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(exp)

        if self._icm is not None:
            icm_step = self._icm.train_step((observation, exp.prev_action),
                                            state=state.icm,
                                            calc_intrinsic_reward=False)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.train_step(
            exp._replace(observation=observation), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.action, step_type=exp.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Code Example #24
 def greedy_predict(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(time_step.observation,
                                         step_type=time_step.step_type,
                                         network_state=state)
     return PolicyStep(action=action, state=state, info=())
Code Example #25
 def distribution(self, time_step, policy_state=()):
   del policy_state
   action = self.action(time_step).action
   return PolicyStep(action=_MockDistribution(action))
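
The mock wraps a fixed action so that distribution() can return a PolicyStep whose action field behaves like a distribution. A similar effect can be had with tfp's Deterministic distribution, which Code Example #26 below also uses for its specs (a sketch, not the test's _MockDistribution):

import tensorflow as tf
import tensorflow_probability as tfp

action = tf.constant([0.5])
dist = tfp.distributions.Deterministic(loc=action)
sample = dist.sample()    # always returns [0.5]
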
Code Example #26
    def _prepare_specs(self, algorithm):
        """Prepare various tensor specs."""
        def extract_spec(nest):
            return tf.nest.map_structure(
                lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)

        time_step = self.get_initial_time_step()
        self._time_step_spec = extract_spec(time_step)
        self._action_spec = self._env.action_spec()

        policy_step = algorithm.predict(time_step, self._initial_state)
        info_spec = extract_spec(policy_step.info)
        self._pred_policy_step_spec = PolicyStep(
            action=self._action_spec,
            state=algorithm.predict_state_spec,
            info=info_spec)

        def _to_distribution_spec(spec):
            if isinstance(spec, tf.TensorSpec):
                return DistributionSpec(tfp.distributions.Deterministic,
                                        input_params_spec={"loc": spec},
                                        sample_spec=spec)
            return spec

        self._action_distribution_spec = tf.nest.map_structure(
            _to_distribution_spec, algorithm.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        self._experience_spec = Experience(
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            observation=self._time_step_spec.observation,
            prev_action=self._action_spec,
            action=self._action_spec,
            info=info_spec,
            action_distribution=self._action_dist_param_spec)

        action_dist_params = common.zero_tensor_from_nested_spec(
            self._experience_spec.action_distribution, self._env.batch_size)
        action_dist = nested_distributions_from_specs(
            self._action_distribution_spec, action_dist_params)
        exp = Experience(step_type=time_step.step_type,
                         reward=time_step.reward,
                         discount=time_step.discount,
                         observation=time_step.observation,
                         prev_action=time_step.prev_action,
                         action=time_step.prev_action,
                         info=policy_step.info,
                         action_distribution=action_dist)

        processed_exp = algorithm.preprocess_experience(exp)
        self._processed_experience_spec = self._experience_spec._replace(
            info=extract_spec(processed_exp.info))

        policy_step = common.algorithm_step(
            algorithm,
            ob_transformer=self._observation_transformer,
            time_step=exp,
            state=common.get_initial_policy_state(self._env.batch_size,
                                                  algorithm.train_state_spec),
            training=True)
        info_spec = extract_spec(policy_step.info)
        self._training_info_spec = make_training_info(
            action=self._action_spec,
            action_distribution=self._action_dist_param_spec,
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            info=info_spec,
            collect_info=self._processed_experience_spec.info,
            collect_action_distribution=self._action_dist_param_spec)
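
The nested extract_spec helper above maps every batched tensor in a nest to a TensorSpec with the leading batch dimension dropped. A small illustration of the same mapping on a dummy nest (keys and shapes are hypothetical):

import tensorflow as tf

nest = {"observation": tf.zeros([4, 3]), "reward": tf.zeros([4])}
spec = tf.nest.map_structure(
    lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)
# {"observation": TensorSpec(shape=(3,), dtype=tf.float32),
#  "reward": TensorSpec(shape=(), dtype=tf.float32)}
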
Code Example #27
 def action(self, time_step):
     del time_step
     action = tf.constant(self._action, dtype=tf.float32, shape=[1])
     return PolicyStep(action=action)