Code example #1
File: learning.py  Project: zerocurve/acme
    def _step(self):
        # Update target network.
        online_variables = (
            *self._observation_network.variables,
            *self._critic_network.variables,
            *self._policy_network.variables,
        )
        target_variables = (
            *self._target_observation_network.variables,
            *self._target_critic_network.variables,
            *self._target_policy_network.variables,
        )

        # Make online -> target network update ops.
        if tf.math.mod(self._num_steps, self._target_update_period) == 0:
            for src, dest in zip(online_variables, target_variables):
                dest.assign(src)
        self._num_steps.assign_add(1)

        # Get data from replay (dropping extras if any). Note there is no
        # extra data here because we do not insert any into Reverb.
        inputs = next(self._iterator)
        transitions: types.Transition = inputs.data

        # Cast the additional discount to match the environment discount dtype.
        discount = tf.cast(self._discount, dtype=transitions.discount.dtype)

        with tf.GradientTape(persistent=True) as tape:
            # Maybe transform the observation before feeding into policy and critic.
            # Transforming the observations this way at the start of the learning
            # step effectively means that the policy and critic share observation
            # network weights.
            o_tm1 = self._observation_network(transitions.observation)
            o_t = self._target_observation_network(
                transitions.next_observation)
            # This stop_gradient prevents gradients from propagating into the
            # target observation network. In addition, since the online policy
            # network is evaluated at o_t, this also means the policy loss does
            # not influence the observation network training.
            o_t = tree.map_structure(tf.stop_gradient, o_t)

            # Critic learning.
            q_tm1 = self._critic_network(o_tm1, transitions.action)
            q_t = self._target_critic_network(o_t,
                                              self._target_policy_network(o_t))

            # Squeeze into the shape expected by the td_learning implementation.
            q_tm1 = tf.squeeze(q_tm1, axis=-1)  # [B]
            q_t = tf.squeeze(q_t, axis=-1)  # [B]

            # Critic loss.
            critic_loss = trfl.td_learning(q_tm1, transitions.reward,
                                           discount * transitions.discount,
                                           q_t).loss
            critic_loss = tf.reduce_mean(critic_loss, axis=0)

            # Actor learning.
            dpg_a_t = self._policy_network(o_t)
            dpg_q_t = self._critic_network(o_t, dpg_a_t)

            # Actor loss. If clipping is true use dqda clipping and clip the norm.
            dqda_clipping = 1.0 if self._clipping else None
            policy_loss = losses.dpg(dpg_q_t,
                                     dpg_a_t,
                                     tape=tape,
                                     dqda_clipping=dqda_clipping,
                                     clip_norm=self._clipping)
            policy_loss = tf.reduce_mean(policy_loss, axis=0)

        # Get trainable variables.
        policy_variables = self._policy_network.trainable_variables
        critic_variables = (
            # In this agent, the critic loss trains the observation network.
            self._observation_network.trainable_variables +
            self._critic_network.trainable_variables)

        # Compute gradients.
        policy_gradients = tape.gradient(policy_loss, policy_variables)
        critic_gradients = tape.gradient(critic_loss, critic_variables)

        # Delete the tape manually because of the persistent=True flag.
        del tape

        # Maybe clip gradients.
        if self._clipping:
            policy_gradients = tf.clip_by_global_norm(policy_gradients, 40.)[0]
            critic_gradients = tf.clip_by_global_norm(critic_gradients, 40.)[0]

        # Apply gradients.
        self._policy_optimizer.apply(policy_gradients, policy_variables)
        self._critic_optimizer.apply(critic_gradients, critic_variables)

        # Losses to track.
        return {
            'critic_loss': critic_loss,
            'policy_loss': policy_loss,
        }
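
The learner above does three things per step: a periodic hard copy of the online weights into the target networks, a TD(0) critic update via trfl.td_learning, and a deterministic policy gradient (DPG) actor update via acme's losses.dpg. The snippet below is a minimal, self-contained sketch of just the target-update pattern; the variable names (online_vars, target_vars, num_steps) are illustrative and not part of the original class.

import tensorflow as tf

online_vars = [tf.Variable([1.0, 2.0]), tf.Variable([3.0])]
target_vars = [tf.Variable([0.0, 0.0]), tf.Variable([0.0])]
num_steps = tf.Variable(0, dtype=tf.int32)
target_update_period = 100

def maybe_update_target():
    # Copy online weights into the target weights every
    # `target_update_period` learner steps; otherwise leave them frozen.
    if tf.math.mod(num_steps, target_update_period) == 0:
        for src, dest in zip(online_vars, target_vars):
            dest.assign(src)
    num_steps.assign_add(1)

maybe_update_target()  # step 0: copies, because 0 % period == 0

Keeping the target copy frozen between updates is what stabilises the bootstrapped TD target q_t computed by the target critic and target policy.
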
Code example #2
File: training.py  Project: NetColby/DNRL
    def _forward(self, inputs: Any) -> None:
        """Trainer forward pass

        Args:
            inputs (Any): input data from the data table (transitions)
        """

        # TODO: Update this forward function to work like MAD4PG
        data = inputs.data

        # Note (dries): The unused variable is start_of_episode.
        observations, actions, rewards, discounts, _, extras = (
            data.observations,
            data.actions,
            data.rewards,
            data.discounts,
            data.start_of_episode,
            data.extras,
        )

        # Get initial state for the LSTM from replay and
        # extract the first state in the sequence.
        core_state = tree.map_structure(lambda s: s[:, 0, :],
                                        extras["core_states"])
        target_core_state = tree.map_structure(tf.identity, core_state)

        # TODO (dries): Take out all the data_points that do not need
        #  to be processed here at the start, so that this does not have
        #  to be done later on and processing time is saved.

        self.policy_losses: Dict[str, tf.Tensor] = {}
        self.critic_losses: Dict[str, tf.Tensor] = {}

        # Do forward passes through the networks and calculate the losses
        with tf.GradientTape(persistent=True) as tape:
            # Note (dries): We are assuming that only the policy network
            # is recurrent and not the observation network.
            obs_trans, target_obs_trans = self._transform_observations(
                observations)

            target_actions = self._target_policy_actions(
                target_obs_trans, target_core_state)

            for agent in self._agents:
                agent_key = self.agent_net_keys[agent]

                # Get critic feed
                (
                    obs_trans_feed,
                    target_obs_trans_feed,
                    action_feed,
                    target_actions_feed,
                ) = self._get_critic_feed(
                    obs_trans=obs_trans,
                    target_obs_trans=target_obs_trans,
                    actions=actions,
                    target_actions=target_actions,
                    extras=extras,
                    agent=agent,
                )

                # Critic learning.
                # Remove the last sequence step for the normal network
                obs_comb, dims = train_utils.combine_dim(obs_trans_feed)
                act_comb, _ = train_utils.combine_dim(action_feed)
                q_values = self._critic_networks[agent_key](obs_comb, act_comb)
                q_values.set_dimensions(dims)

                # Remove first sequence step for the target
                obs_comb, _ = train_utils.combine_dim(target_obs_trans_feed)
                act_comb, _ = train_utils.combine_dim(target_actions_feed)
                target_q_values = self._target_critic_networks[agent_key](
                    obs_comb, act_comb)
                target_q_values.set_dimensions(dims)

                # Cast the additional discount to match
                # the environment discount dtype.
                agent_discount = discounts[agent]
                discount = tf.cast(self._discount, dtype=agent_discount.dtype)

                # Critic loss.
                critic_loss = recurrent_n_step_critic_loss(
                    q_values,
                    target_q_values,
                    rewards[agent],
                    discount * agent_discount,
                    bootstrap_n=self._bootstrap_n,
                    loss_fn=losses.categorical,
                )
                self.critic_losses[agent] = tf.reduce_mean(critic_loss, axis=0)

                # Actor learning.
                obs_agent_feed = target_obs_trans[agent]
                # TODO (dries): Why is there an extra tuple?
                agent_core_state = core_state[agent][0]
                transposed_obs = tf2_utils.batch_to_sequence(obs_agent_feed)
                outputs, updated_states = snt.static_unroll(
                    self._policy_networks[agent_key],
                    transposed_obs,
                    agent_core_state,
                )

                dpg_actions = tf2_utils.batch_to_sequence(outputs)

                # Note (dries): This is done so that losses.dpg can verify,
                # using the gradient tape, that there is a gradient
                # relationship between dpg_q_values and dpg_actions_comb.
                dpg_actions_comb, dim = train_utils.combine_dim(dpg_actions)

                # Note (dries): This seemingly useless line is important!
                # Don't remove it. See above note.
                dpg_actions = train_utils.extract_dim(dpg_actions_comb, dim)

                # Get dpg actions
                dpg_actions_feed = self._get_dpg_feed(target_actions,
                                                      dpg_actions, agent)

                # Get dpg Q values.
                obs_comb, _ = train_utils.combine_dim(target_obs_trans_feed)
                act_comb, _ = train_utils.combine_dim(dpg_actions_feed)
                dpg_z_values = self._critic_networks[agent_key](obs_comb,
                                                                act_comb)
                dpg_q_values = dpg_z_values.mean()

                # Actor loss. If clipping is true use dqda clipping and clip the norm.
                dqda_clipping = 1.0 if self._max_gradient_norm is not None else None
                clip_norm = True if self._max_gradient_norm is not None else False

                policy_loss = losses.dpg(
                    dpg_q_values,
                    dpg_actions_comb,
                    tape=tape,
                    dqda_clipping=dqda_clipping,
                    clip_norm=clip_norm,
                )
                self.policy_losses[agent] = tf.reduce_mean(policy_loss, axis=0)
        self.tape = tape
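
This recurrent trainer repeatedly flattens [T, B, ...] sequence tensors into a single [T * B, ...] batch before feeding the feedforward critic (train_utils.combine_dim) and later restores the leading dimensions. Below is a rough sketch of that reshape pattern, assuming the helper simply merges the first two axes; Mava's actual combine_dim/extract_dim utilities may differ in detail.

import tensorflow as tf

def combine_first_two_dims(x: tf.Tensor):
    # Merge the leading [T, B] axes into a single [T * B] axis.
    dims = tf.shape(x)[:2]
    flat = tf.reshape(x, tf.concat([[-1], tf.shape(x)[2:]], axis=0))
    return flat, dims

def split_first_dim(x: tf.Tensor, dims: tf.Tensor):
    # Inverse operation: restore the [T, B] leading axes.
    return tf.reshape(x, tf.concat([dims, tf.shape(x)[1:]], axis=0))

seq = tf.random.normal([5, 4, 3])         # [T=5, B=4, feature=3]
flat, dims = combine_first_two_dims(seq)  # [20, 3]
restored = split_first_dim(flat, dims)    # back to [5, 4, 3]

Flattening this way lets the same feedforward critic score every timestep of every sequence in one call, which is why q_values and target_q_values above are computed on combined tensors and then given their dimensions back with set_dimensions.
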
Code example #3
File: learning.py  Project: novatig/acme
    def _step(self):
        # Update target network.
        online_variables = (
            *self._observation_network.variables,
            *self._critic_network.variables,
            *self._policy_network.variables,
        )
        target_variables = (
            *self._target_observation_network.variables,
            *self._target_critic_network.variables,
            *self._target_policy_network.variables,
        )
        # Make online -> target network update ops.
        if self._target_update_period > 0 and \
           tf.math.mod(self._num_steps, self._target_update_period) == 0:
            for src, dest in zip(online_variables, target_variables):
                dest.assign(src)
        self._num_steps.assign_add(1)

        # Get data from replay (dropping extras if any). Note there is no
        # extra data here because we do not insert any into Reverb.
        inputs = next(self._iterator)
        o_tm1, a_tm1, r_t, d_t, o_t, extra = inputs.data
        behavior_logP_tm1 = extra['logP']
        behavior_tm1 = extra['policy']

        # Cast the additional discount to match the environment discount dtype.
        discount = tf.cast(self._discount, dtype=d_t.dtype)

        with tf.GradientTape(persistent=True) as tape:
            # Maybe transform the observation before feeding into policy and critic.
            # Transforming the observations this way at the start of the learning
            # step effectively means that the policy and critic share observation
            # network weights.
            o_tm1 = self._observation_network(o_tm1)
            o_t = self._target_observation_network(o_t)
            o_t = tree.map_structure(tf.stop_gradient, o_t)

            # Policy
            pol_tm1, v_tm1 = self._policy_network(o_tm1)
            pol_t, v_t = self._target_policy_network(o_t)
            pol_t = tree.map_structure(tf.stop_gradient, pol_t)
            v_t = tree.map_structure(tf.stop_gradient, v_t)

            # Actor (DPG) loss. If clipping is true, use dqda clipping and clip the norm.
            # TODO: two critic nets, e.g. q1_tm1 and q2_tm1, pick the min as target
            dqda_clipping = 1.0 if self._clipping else None
            onpol_a_tm1, onpol_logP_tm1 = self._sampling_head(pol_tm1)
            onpol_q_tm1 = self._critic_network(o_tm1, onpol_a_tm1)
            onpol_q_tm1 = tf.squeeze(onpol_q_tm1, axis=-1)  # [B]

            logP_tm1 = self._sampling_head.log_prob(a_tm1, pol_tm1)
            ReFER_params_loss = self._ReFER.loss(behavior_logP_tm1, logP_tm1)

            dpg_loss = losses.dpg(onpol_q_tm1,
                                  onpol_a_tm1,
                                  tape=tape,
                                  dqda_clipping=dqda_clipping,
                                  clip_norm=self._clipping)
            dpg_loss = tf.reduce_mean(dpg_loss, axis=0)
            entropy_loss = self._entropy_coeff * tf.reduce_mean(onpol_logP_tm1,
                                                                axis=0)

            KL_coef = self._ReFER.DKL_coef()
            #behavior_P_tm1 = tf.math.exp(behavior_logP_tm1)
            #KL_loss = KL_coef * behavior_P_tm1 * (behavior_logP_tm1 - logP_tm1)
            KL_loss = tf.reduce_sum((behavior_tm1 - pol_tm1)**2, axis=-1)
            KL_loss = KL_coef * tf.reduce_mean(KL_loss, axis=0)

            # V(s) loss
            value_target = tf.stop_gradient(onpol_q_tm1 - self._entropy_coeff *
                                            onpol_logP_tm1)

            value_loss = losses.huber(value_target - v_tm1, 1.0)
            #value_loss = 0.5 * (value_target - v_tm1) ** 2
            value_loss = tf.reduce_mean(value_loss, axis=0)

            # Critic learning with TD loss
            q_tm1 = self._critic_network(o_tm1, a_tm1)
            q_tm1 = tf.squeeze(q_tm1, axis=-1)  # [B]

            onpol_a_t, logP_t = self._sampling_head(pol_t)
            onpol_q_t = self._target_critic_network(o_t, onpol_a_t)
            onpol_q_t = tf.squeeze(onpol_q_t, axis=-1)  # [B]
            onpol_q_t = tree.map_structure(tf.stop_gradient, onpol_q_t)

            R_t = self._observation_network.scale_rewards(r_t)
            critic_target = tf.stop_gradient(R_t +
                                             d_t * tf.minimum(v_t, onpol_q_t))
            #critic_target = tf.stop_gradient(R_t + d_t * 0.5*(v_t + onpol_q_t))

            critic_loss = losses.huber(critic_target - q_tm1, 1.0)
            #critic_loss = 0.5 * (critic_target - q_tm1) ** 2
            critic_loss = tf.reduce_mean(critic_loss, axis=0)

            encoder_loss = self._observation_network.compute_loss(o_tm1, r_t)

            policy_loss = value_loss + entropy_loss + dpg_loss + encoder_loss + KL_loss

        # Compute gradients.
        policy_gradients = tape.gradient(policy_loss, self._policy_variables)
        critic_gradients = tape.gradient(critic_loss, self._critic_variables)
        ReFER_gradient = tape.gradient(ReFER_params_loss,
                                       self._ReFER.trainable_variables)

        # Delete the tape manually because of the persistent=True flag.
        del tape

        # Maybe clip gradients.
        if self._clipping:
            policy_gradients = tf.clip_by_global_norm(policy_gradients, 40.)[0]
            critic_gradients = tf.clip_by_global_norm(critic_gradients, 40.)[0]

        # Apply gradients.
        self._policy_optimizer.apply(policy_gradients, self._policy_variables)
        self._critic_optimizer.apply(critic_gradients, self._critic_variables)
        self._ReFER_optimizer.apply(ReFER_gradient,
                                    self._ReFER.trainable_variables)

        # Losses to track.
        return {
            'critic_loss': critic_loss,
            'svalue_loss': value_loss,
            'entropy_loss': entropy_loss,
            'dpg_loss': dpg_loss,
            'avg_q': tf.reduce_mean(onpol_q_t, axis=0),
            'KL_loss': KL_loss,
            #'frac_off_pol': self._ReFER._last_frac_off_pol,
            'beta': self._ReFER._beta,
            'r_mean': self._observation_network._ret_mean,
            'r_scale': self._observation_network._ret_scale,
        }
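
Unlike the plain DDPG learner in example #1, this variant bootstraps the critic with the minimum of the target state-value head and the on-policy target Q-value, and passes the TD error through a Huber loss. The sketch below reproduces just that target-and-loss computation on toy tensors; the huber helper shown here is the standard definition and is only assumed to match acme's losses.huber.

import tensorflow as tf

def huber(residual: tf.Tensor, delta: float = 1.0) -> tf.Tensor:
    # Quadratic for |residual| <= delta, linear beyond it.
    abs_r = tf.abs(residual)
    quadratic = tf.minimum(abs_r, delta)
    linear = abs_r - quadratic
    return 0.5 * tf.square(quadratic) + delta * linear

# Toy batch: r_t and d_t come from replay, v_t/q_t from the target networks.
r_t = tf.constant([1.0, 0.0])
d_t = tf.constant([0.99, 0.0])   # 0.0 marks a terminal transition
v_t = tf.constant([2.0, 1.5])
q_t = tf.constant([1.8, 2.0])
q_tm1 = tf.constant([2.5, 0.3])

critic_target = tf.stop_gradient(r_t + d_t * tf.minimum(v_t, q_t))
critic_loss = tf.reduce_mean(huber(critic_target - q_tm1), axis=0)

Taking the minimum of two value estimates and using a Huber rather than a squared error both reduce the impact of overestimated or outlier targets on the critic update.
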
Code example #4
File: training.py  Project: NetColby/DNRL
    def _forward(self, inputs: Any) -> None:
        """Trainer forward pass

        Args:
            inputs (Any): input data from the data table (transitions)
        """

        # Unpack input data as follows:
        # o_tm1 = dictionary of observations one for each agent
        # a_tm1 = dictionary of actions taken from obs in o_tm1
        # e_tm1 [Optional] = extra data for timestep t-1
        # that the agents persist in replay.
        # r_t = dictionary of rewards or rewards sequences
        #   (if using N step transitions) ensuing from actions a_tm1
        # d_t = environment discount ensuing from actions a_tm1.
        #   This discount is applied to future rewards after r_t.
        # o_t = dictionary of next observations or next observation sequences
        # e_t [Optional] = extra data for timestep t that the agents persist in replay.
        o_tm1, a_tm1, e_tm1, r_t, d_t, o_t, e_t = inputs.data

        # Do forward passes through the networks and calculate the losses
        self.policy_losses = {}
        self.critic_losses = {}
        with tf.GradientTape(persistent=True) as tape:
            o_tm1_trans, o_t_trans = self._transform_observations(o_tm1, o_t)
            a_t = self._target_policy_actions(o_t_trans)

            for agent in self._agents:
                agent_key = self.agent_net_keys[agent]

                # Get critic feed
                o_tm1_feed, o_t_feed, a_tm1_feed, a_t_feed = self._get_critic_feed(
                    o_tm1_trans=o_tm1_trans,
                    o_t_trans=o_t_trans,
                    a_tm1=a_tm1,
                    a_t=a_t,
                    e_tm1=e_tm1,
                    e_t=e_t,
                    agent=agent,
                )

                # Critic learning.
                q_tm1 = self._critic_networks[agent_key](o_tm1_feed,
                                                         a_tm1_feed)
                q_t = self._target_critic_networks[agent_key](o_t_feed,
                                                              a_t_feed)

                # Cast the additional discount to match the environment discount dtype.
                discount = tf.cast(self._discount, dtype=d_t[agent].dtype)

                # Critic loss.
                critic_loss = losses.categorical(q_tm1, r_t[agent],
                                                 discount * d_t[agent], q_t)
                self.critic_losses[agent] = tf.reduce_mean(critic_loss, axis=0)
                # Actor learning.
                o_t_agent_feed = o_t_trans[agent]
                dpg_a_t = self._policy_networks[agent_key](o_t_agent_feed)

                # Get dpg actions
                dpg_a_t_feed = self._get_dpg_feed(a_t, dpg_a_t, agent)

                # Get dpg Q values.
                dpg_z_t = self._critic_networks[agent_key](o_t_feed,
                                                           dpg_a_t_feed)
                dpg_q_t = dpg_z_t.mean()

                # Actor loss. If clipping is true use dqda clipping and clip the norm.
                dqda_clipping = 1.0 if self._max_gradient_norm is not None else None
                clip_norm = True if self._max_gradient_norm is not None else False

                policy_loss = losses.dpg(
                    dpg_q_t,
                    dpg_a_t,
                    tape=tape,
                    dqda_clipping=dqda_clipping,
                    clip_norm=clip_norm,
                )
                self.policy_losses[agent] = tf.reduce_mean(policy_loss, axis=0)
        self.tape = tape
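
All of these trainers hand losses.dpg the critic output, the action that produced it, and the surrounding persistent tape; the loss differentiates Q with respect to the action and turns that into a gradient for the policy parameters. The snippet below sketches that mechanism with toy Keras networks, assuming the surrogate-loss construction acme uses (the real implementation additionally supports dqda clipping and norm clipping).

import tensorflow as tf

policy = tf.keras.Sequential([tf.keras.layers.Dense(2)])  # toy policy head
critic = tf.keras.Sequential([tf.keras.layers.Dense(1)])  # toy critic head
observations = tf.random.normal([8, 3])                   # [B, obs_dim]

with tf.GradientTape(persistent=True) as tape:
    actions = policy(observations)                         # a = pi(o)
    q_values = tf.squeeze(
        critic(tf.concat([observations, actions], axis=-1)), axis=-1)

    # dQ/da, taken inside the same persistent tape that watched the policy.
    dqda = tape.gradient(q_values, actions)

    # Surrogate whose gradient w.r.t. the policy parameters is
    # -dQ/da * da/dtheta, i.e. gradient ascent on the critic value.
    surrogate = tf.stop_gradient(dqda) * actions
    policy_loss = -tf.reduce_mean(tf.reduce_sum(surrogate, axis=-1))

policy_gradients = tape.gradient(policy_loss, policy.trainable_variables)
del tape  # required because of persistent=True

This is why every example wraps the forward passes in a persistent GradientTape and deletes it manually afterwards: the same tape is reused once for dQ/da inside the loss and again for the parameter gradients.
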
Code example #5
  def _step(self, sample) -> Dict[str, tf.Tensor]:
    transitions: types.Transition = sample.data  # Assuming ReverbSample.

    # Cast the additional discount to match the environment discount dtype.
    discount = tf.cast(self._discount, dtype=transitions.discount.dtype)

    with tf.GradientTape(persistent=True) as tape:
      # Maybe transform the observation before feeding into policy and critic.
      # Transforming the observations this way at the start of the learning
      # step effectively means that the policy and critic share observation
      # network weights.
      o_tm1 = self._observation_network(transitions.observation)
      o_t = self._target_observation_network(transitions.next_observation)
      # This stop_gradient prevents gradients from propagating into the target
      # observation network. In addition, since the online policy network is
      # evaluated at o_t, this also means the policy loss does not influence
      # the observation network training.
      o_t = tree.map_structure(tf.stop_gradient, o_t)

      # Critic learning.
      q_tm1 = self._critic_network(o_tm1, transitions.action)
      q_t = self._target_critic_network(o_t, self._target_policy_network(o_t))

      # Critic loss.
      critic_loss = losses.categorical(q_tm1, transitions.reward,
                                       discount * transitions.discount, q_t)
      critic_loss = tf.reduce_mean(critic_loss, axis=[0])

      # Actor learning.
      dpg_a_t = self._policy_network(o_t)
      dpg_z_t = self._critic_network(o_t, dpg_a_t)
      dpg_q_t = dpg_z_t.mean()

      # Actor loss. If clipping is true use dqda clipping and clip the norm.
      dqda_clipping = 1.0 if self._clipping else None
      policy_loss = losses.dpg(
          dpg_q_t,
          dpg_a_t,
          tape=tape,
          dqda_clipping=dqda_clipping,
          clip_norm=self._clipping)
      policy_loss = tf.reduce_mean(policy_loss, axis=[0])

    # Get trainable variables.
    policy_variables = self._policy_network.trainable_variables
    critic_variables = (
        # In this agent, the critic loss trains the observation network.
        self._observation_network.trainable_variables +
        self._critic_network.trainable_variables)

    # Compute gradients.
    replica_context = tf.distribute.get_replica_context()
    policy_gradients = _average_gradients_across_replicas(
        replica_context,
        tape.gradient(policy_loss, policy_variables))
    critic_gradients = _average_gradients_across_replicas(
        replica_context,
        tape.gradient(critic_loss, critic_variables))

    # Delete the tape manually because of the persistent=True flag.
    del tape

    # Maybe clip gradients.
    if self._clipping:
      policy_gradients = tf.clip_by_global_norm(policy_gradients, 40.)[0]
      critic_gradients = tf.clip_by_global_norm(critic_gradients, 40.)[0]

    # Apply gradients.
    self._policy_optimizer.apply(policy_gradients, policy_variables)
    self._critic_optimizer.apply(critic_gradients, critic_variables)

    # Losses to track.
    return {
        'critic_loss': critic_loss,
        'policy_loss': policy_loss,
    }
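
Example #5 is the distributed variant of the D4PG step: the critic is distributional (losses.categorical, dpg_z_t.mean()) and the gradients are averaged across replicas before being applied. The helper _average_gradients_across_replicas is not shown in the snippet; below is a plausible, hypothetical stand-in, assuming it simply all-reduces each gradient with a MEAN inside the replica context.

import tensorflow as tf

def average_gradients_across_replicas(replica_context, gradients):
    # All-reduce every gradient with a MEAN so each replica applies the
    # same averaged update. Hypothetical sketch, not the original helper.
    if replica_context is None:  # not running under a tf.distribute strategy
        return gradients
    return replica_context.all_reduce(tf.distribute.ReduceOp.MEAN, gradients)

Under tf.distribute, _step would be executed via strategy.run, with each replica computing gradients on its local shard of the replay batch and synchronising them with this kind of all-reduce before optimizer.apply.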