Code Example #1
    def create_normalizer_update(
        vector_input: tf.Tensor,
        steps: tf.Tensor,
        running_mean: tf.Tensor,
        running_variance: tf.Tensor,
    ) -> tf.Operation:
        """
        Creates the update operation for the normalizer.
        :param vector_input: Vector observation to use for updating the running mean and variance.
        :param steps: Tensorflow tensor representing the current number of steps that have been normalized.
        :param running_mean: Tensorflow tensor representing the current running mean.
        :param running_variance: Tensorflow tensor representing the current running variance.
        :return: A TF operation that updates the normalization based on vector_input.
        """
        # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        steps_increment = tf.shape(vector_input)[0]
        total_new_steps = tf.add(steps, steps_increment)

        # Compute the incremental update and divide by the number of new steps.
        input_to_old_mean = tf.subtract(vector_input, running_mean)
        new_mean = running_mean + tf.reduce_sum(
            input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32),
            axis=0)
        # Compute difference of input to the new mean for Welford update
        input_to_new_mean = tf.subtract(vector_input, new_mean)
        new_variance = running_variance + tf.reduce_sum(
            input_to_new_mean * input_to_old_mean, axis=0)
        update_mean = tf.assign(running_mean, new_mean)
        update_variance = tf.assign(running_variance, new_variance)
        update_norm_step = tf.assign(steps, total_new_steps)
        return tf.group([update_mean, update_variance, update_norm_step])
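As a quick sanity check of the batch update above, the same recurrence can be written in plain NumPy and compared against statistics computed directly on the full data (a standalone sketch, not taken from the code above; all names are illustrative). Note that the quantity accumulated as running_variance is the sum of squared deviations, so it is divided by the step count before comparing to the variance; the batched update is exact up to floating-point error, so the result does not depend on how the data is split.

    import numpy as np

    # Standalone sketch: check the batch Welford update against direct NumPy stats.
    rng = np.random.default_rng(0)
    data = rng.normal(size=(1000, 3))

    mean = np.zeros(3)    # running_mean
    m2 = np.zeros(3)      # running_variance (sum of squared deviations)
    steps = 0             # number of normalized steps

    for batch in np.array_split(data, 10):
        steps += batch.shape[0]                         # total_new_steps
        delta_old = batch - mean                        # input_to_old_mean
        mean = mean + delta_old.sum(axis=0) / steps     # new_mean
        delta_new = batch - mean                        # input_to_new_mean
        m2 = m2 + (delta_new * delta_old).sum(axis=0)   # new_variance

    assert np.allclose(mean, data.mean(axis=0))
    assert np.allclose(m2 / steps, data.var(axis=0))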
Code Example #2
 def copy_normalization(self, mean, variance, steps):
     """
     Copies the mean, variance, and steps into the normalizers of the
     input of this SACNetwork. Used to copy the normalizer from the policy network
     to the target network.
     :param mean: Tensor containing the mean.
     :param variance: Tensor containing the variance.
     :param steps: Tensor containing the number of steps.
     """
     update_mean = tf.assign(self.running_mean, mean)
     update_variance = tf.assign(self.running_variance, variance)
     update_norm_step = tf.assign(self.normalization_steps, steps)
     return tf.group([update_mean, update_variance, update_norm_step])
Code Example #3
 def create_normalizer_update(self, vector_input):
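     # Treats the mean of the current batch as a single new observation and
     # applies one Welford-style step to the running mean and variance.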
     mean_current_observation = tf.reduce_mean(vector_input, axis=0)
     new_mean = self.running_mean + (
         mean_current_observation - self.running_mean) / tf.cast(
             tf.add(self.normalization_steps, 1), tf.float32)
     new_variance = self.running_variance + (
         mean_current_observation - new_mean) * (mean_current_observation -
                                                 self.running_mean)
     update_mean = tf.assign(self.running_mean, new_mean)
     update_variance = tf.assign(self.running_variance, new_variance)
     update_norm_step = tf.assign(self.normalization_steps,
                                  self.normalization_steps + 1)
     return tf.group([update_mean, update_variance, update_norm_step])
Code Example #4
    def create_normalizer_update(self, vector_input):
        # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        steps_increment = tf.shape(vector_input)[0]
        total_new_steps = tf.add(self.normalization_steps, steps_increment)

        # Compute the incremental update and divide by the number of new steps.
        input_to_old_mean = tf.subtract(vector_input, self.running_mean)
        new_mean = self.running_mean + tf.reduce_sum(
            input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32),
            axis=0)
        # Compute difference of input to the new mean for Welford update
        input_to_new_mean = tf.subtract(vector_input, new_mean)
        new_variance = self.running_variance + tf.reduce_sum(
            input_to_new_mean * input_to_old_mean, axis=0)
        update_mean = tf.assign(self.running_mean, new_mean)
        update_variance = tf.assign(self.running_variance, new_variance)
        update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
        return tf.group([update_mean, update_variance, update_norm_step])
Code Example #5
    def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: Brain parameters used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :return: a sub-class of PPOAgent tailored to the environment.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers
        :param tau: Strength of soft-Q update.
        :param m_size: Size of brain memory.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope(""):
                super().__init__(policy, trainer_params)
                hyperparameters: SACSettings = cast(
                    SACSettings, trainer_params.hyperparameters)
                lr = hyperparameters.learning_rate
                lr_schedule = hyperparameters.learning_rate_schedule
                max_step = trainer_params.max_steps
                self.tau = hyperparameters.tau
                self.init_entcoef = hyperparameters.init_entcoef

                self.policy = policy
                self.act_size = policy.act_size
                policy_network_settings = policy.network_settings
                h_size = policy_network_settings.hidden_units
                num_layers = policy_network_settings.num_layers
                vis_encode_type = policy_network_settings.vis_encode_type

                self.burn_in_ratio = 0.0

                # Non-exposed SAC parameters
                self.discrete_target_entropy_scale = (
                    0.2  # Roughly equal to e-greedy 0.05
                )
                self.continuous_target_entropy_scale = 1.0

                stream_names = list(self.reward_signals.keys())
                # Used to reduce "survivor bonus" when using Curiosity or GAIL.
                self.gammas = [
                    _val.gamma
                    for _val in trainer_params.reward_signals.values()
                ]
                self.use_dones_in_backup = {
                    name: tf.Variable(1.0)
                    for name in stream_names
                }
                self.disable_use_dones = {
                    name: self.use_dones_in_backup[name].assign(0.0)
                    for name in stream_names
                }

                if num_layers < 1:
                    num_layers = 1

                self.target_init_op: List[tf.Tensor] = []
                self.target_update_op: List[tf.Tensor] = []
                self.update_batch_policy: Optional[tf.Operation] = None
                self.update_batch_value: Optional[tf.Operation] = None
                self.update_batch_entropy: Optional[tf.Operation] = None

                self.policy_network = SACPolicyNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 3x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                self.target_network = SACTargetNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 1x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
                self.m_size = 3 * self.policy.m_size
                self._create_inputs_and_outputs()
                self.learning_rate = ModelUtils.create_schedule(
                    lr_schedule,
                    lr,
                    self.policy.global_step,
                    int(max_step),
                    min_value=1e-10,
                )
                self._create_losses(
                    self.policy_network.q1_heads,
                    self.policy_network.q2_heads,
                    lr,
                    int(max_step),
                    stream_names,
                    discrete=not self.policy.use_continuous_act,
                )
                self._create_sac_optimizer_ops()

                self.selected_actions = self.policy.selected_actions  # For GAIL and other reward signals
                if self.policy.normalize:
                    target_update_norm = self.target_network.copy_normalization(
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                    # Update the normalization of the optimizer when the policy does.
                    self.policy.update_normalization_op = tf.group([
                        self.policy.update_normalization_op, target_update_norm
                    ])

        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Q1 Loss": "q1_loss",
            "Losses/Q2 Loss": "q2_loss",
            "Policy/Entropy Coeff": "entropy_coef",
            "Policy/Learning Rate": "learning_rate",
        }

        self.update_dict = {
            "value_loss": self.total_value_loss,
            "policy_loss": self.policy_loss,
            "q1_loss": self.q1_loss,
            "q2_loss": self.q2_loss,
            "entropy_coef": self.ent_coef,
            "update_batch": self.update_batch_policy,
            "update_value": self.update_batch_value,
            "update_entropy": self.update_batch_entropy,
            "learning_rate": self.learning_rate,
        }
Code Example #6
    def __init__(
        self,
        brain,
        lr=1e-4,
        lr_schedule=LearningRateSchedule.CONSTANT,
        h_size=128,
        init_entcoef=0.1,
        max_step=5e6,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        m_size=None,
        seed=0,
        stream_names=None,
        tau=0.005,
        gammas=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: BrainInfo used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :return: a sub-class of PPOAgent tailored to the environment.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers
        :param tau: Strength of soft-Q update.
        :param m_size: Size of brain memory.
        """
        self.tau = tau
        self.gammas = gammas
        self.brain = brain
        self.init_entcoef = init_entcoef
        if stream_names is None:
            stream_names = []
        # Used to reduce "survivor bonus" when using Curiosity or GAIL.
        self.use_dones_in_backup = {
            name: tf.Variable(1.0)
            for name in stream_names
        }
        self.disable_use_dones = {
            name: self.use_dones_in_backup[name].assign(0.0)
            for name in stream_names
        }
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                               seed, stream_names)
        if num_layers < 1:
            num_layers = 1

        self.target_init_op: List[tf.Tensor] = []
        self.target_update_op: List[tf.Tensor] = []
        self.update_batch_policy: Optional[tf.Operation] = None
        self.update_batch_value: Optional[tf.Operation] = None
        self.update_batch_entropy: Optional[tf.Operation] = None

        self.policy_network = SACPolicyNetwork(
            brain=brain,
            m_size=m_size,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.target_network = SACTargetNetwork(
            brain=brain,
            m_size=m_size // 4 if m_size else None,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.create_inputs_and_outputs()
        self.learning_rate = self.create_learning_rate(lr_schedule, lr,
                                                       self.global_step,
                                                       max_step)
        self.create_losses(
            self.policy_network.q1_heads,
            self.policy_network.q2_heads,
            lr,
            max_step,
            stream_names,
            discrete=self.brain.vector_action_space_type == "discrete",
        )

        self.selected_actions = self.policy_network.selected_actions  # For GAIL and other reward signals
        if normalize:
            target_update_norm = self.target_network.copy_normalization(
                self.policy_network.running_mean,
                self.policy_network.running_variance,
                self.policy_network.normalization_steps,
            )
            self.update_normalization = tf.group(
                [self.policy_network.update_normalization, target_update_norm])
            self.running_mean = self.policy_network.running_mean
            self.running_variance = self.policy_network.running_variance
            self.normalization_steps = self.policy_network.normalization_steps