Example #1
    def _setup_stochastic_policy(self, obs, action, reuse=False, scope="pi"):
        """Create the variables of a stochastic policy.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the policy
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            pi_h = obs

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                pi_h = layer(pi_h,
                             layer_size,
                             'fc{}'.format(i),
                             act_fun=self.act_fun,
                             layer_norm=self.layer_norm)

            # create the output mean
            policy_mean = layer(
                pi_h,
                self.ac_space.shape[0],
                'mean',
                act_fun=None,
                kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                 maxval=3e-3))

            # create the output log_std
            log_std = layer(
                pi_h,
                self.ac_space.shape[0],
                'log_std',
                act_fun=None,
            )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac)
        _, policy, _ = apply_squashing_func(policy_mean, policy, logp_pi)

        # Store the output tensors under their respective attributes.
        self.policy = policy
        self.logp_ac = logp_ac
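
The helpers gaussian_likelihood and apply_squashing_func are imported from elsewhere in the codebase and not shown here. As a reference, here is a minimal sketch of what such helpers typically compute in SAC-style implementations; the signatures match the calls above, but the bodies are an assumption based on common practice, not this codebase's exact code:

import numpy as np
import tensorflow as tf

EPS = 1e-6  # assumed small constant for numerical stability


def gaussian_likelihood(input_, mu, log_std):
    """Log-likelihood of a diagonal Gaussian, summed over action dims."""
    pre_sum = -0.5 * (((input_ - mu) / (tf.exp(log_std) + EPS)) ** 2
                      + 2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)


def apply_squashing_func(mu, pi, logp_pi):
    """Squash actions through tanh and correct the log-probability.

    The correction subtracts the log-determinant of the tanh Jacobian,
    sum_i log(1 - tanh(pi_i)^2), from the pre-squash log-probability.
    """
    deterministic_policy = tf.nn.tanh(mu)
    policy = tf.nn.tanh(pi)
    logp_pi -= tf.reduce_sum(tf.math.log(1 - policy ** 2 + EPS), axis=1)
    return deterministic_policy, policy, logp_pi
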
Example #2
    def make_actor(self, obs, reuse=False, scope="pi"):
        """Create an actor tensor.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the actor
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            pi_h = obs

            # zero out the fingerprint observations for the worker policy
            if self.zero_fingerprint:
                pi_h = self._remove_fingerprint(pi_h, self.ob_space.shape[0],
                                                self.fingerprint_dim,
                                                self.co_space.shape[0])

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                pi_h = layer(pi_h,
                             layer_size,
                             'fc{}'.format(i),
                             act_fun=self.act_fun,
                             layer_norm=self.layer_norm)

            # create the output layer
            policy = layer(pi_h,
                           self.ac_space.shape[0],
                           'output',
                           act_fun=tf.nn.tanh,
                           kernel_initializer=tf.random_uniform_initializer(
                               minval=-3e-3, maxval=3e-3))

            # scaling terms to the output from the policy
            ac_means = (self.ac_space.high + self.ac_space.low) / 2.
            ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2.

            policy = ac_means + ac_magnitudes * tf.compat.v1.to_float(policy)

        return policy
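
The two scaling terms map the tanh output from [-1, 1] onto the action space's bounds: ac_means is the midpoint of the box and ac_magnitudes its half-width. A quick self-contained NumPy check of that arithmetic, using a hypothetical Box with low=-2 and high=6:

import numpy as np

low, high = np.array([-2.0]), np.array([6.0])  # hypothetical action bounds

ac_means = (high + low) / 2.       # midpoint: [2.]
ac_magnitudes = (high - low) / 2.  # half-width: [4.]

# a tanh output of 0.5 lands three quarters of the way up the range
assert np.allclose(ac_means + ac_magnitudes * 0.5, [4.0])

# the tanh extremes -1 and 1 map exactly onto the box bounds
assert np.allclose(ac_means + ac_magnitudes * -1.0, low)
assert np.allclose(ac_means + ac_magnitudes * 1.0, high)
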
Example #3
    def make_critic(self, obs, action, reuse=False, scope="qf"):
        """Create a critic tensor.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic

        Returns
        -------
        tf.Variable
            the output from the critic
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # concatenate the observations and actions
            qf_h = tf.concat([obs, action], axis=-1)

            # zero out the fingerprint observations for the worker policy
            if self.zero_fingerprint:
                qf_h = self._remove_fingerprint(
                    qf_h, self.ob_space.shape[0], self.fingerprint_dim,
                    self.co_space.shape[0] + self.ac_space.shape[0])

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                qf_h = layer(qf_h,
                             layer_size,
                             'fc{}'.format(i),
                             act_fun=self.act_fun,
                             layer_norm=self.layer_norm)

            # create the output layer
            qvalue_fn = layer(qf_h,
                              1,
                              'qf_output',
                              kernel_initializer=tf.random_uniform_initializer(
                                  minval=-3e-3, maxval=3e-3))

        return qvalue_fn
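
The critic conditions on the state-action pair by concatenating the two placeholders along the feature axis before the hidden layers. A small self-contained check of that shape arithmetic, with hypothetical 4-dimensional observations and 2-dimensional actions (graph mode assumed, as in the examples):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # build a static graph, as above

obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 4), name="obs")
action_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 2), name="action")

qf_h = tf.concat([obs_ph, action_ph], axis=-1)
print(qf_h.shape)  # (None, 6): observation and action features side by side
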
Example #4
    def _setup_deterministic_policy(self, obs, reuse=False, scope="pi"):
        """Create the variables of deterministic a policy.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the policy
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            pi_h = obs

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                pi_h = layer(pi_h,
                             layer_size,
                             'fc{}'.format(i),
                             act_fun=self.act_fun,
                             layer_norm=self.layer_norm)

            # create the output layer
            policy = layer(pi_h,
                           self.ac_space.shape[0],
                           'output',
                           act_fun=tf.nn.tanh,
                           kernel_initializer=tf.random_uniform_initializer(
                               minval=-3e-3, maxval=3e-3))

            # scaling terms to the output from the policy
            ac_means = (self.ac_space.high + self.ac_space.low) / 2.
            ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2.

            policy = ac_means + ac_magnitudes * tf.compat.v1.to_float(policy)

        # Store the output tensor under its respective attribute.
        self.policy = policy
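
Once stored on self.policy, the deterministic action tensor is evaluated with a session in the usual TF1 pattern. A minimal self-contained sketch, with a single dense tanh layer standing in for the full network and a hypothetical Box(-2, 2) action space:

import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 3), name="obs")

# stand-in for the network built above: one tanh layer, then the scaling
out = tf.compat.v1.layers.dense(obs_ph, 2, activation=tf.nn.tanh, name="output")
policy = 0.0 + 2.0 * out  # ac_means=0, ac_magnitudes=2 for Box(-2, 2)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    actions = sess.run(policy, feed_dict={obs_ph: np.zeros((1, 3), np.float32)})
    print(actions.shape)  # (1, 2): one bounded action per observation
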
Example #5
    def make_critic(self, obs, action, reuse=False, scope="qf"):
        """Create a critic tensor.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic

        Returns
        -------
        tf.Variable
            the output from the critic
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # concatenate the observations and actions
            qf_h = tf.concat([obs, action], axis=-1)

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                qf_h = layer(qf_h,
                             layer_size,
                             'fc{}'.format(i),
                             act_fun=self.act_fun,
                             layer_norm=self.layer_norm)

            # create the output layer
            qvalue_fn = layer(qf_h,
                              1,
                              'qf_output',
                              kernel_initializer=tf.random_uniform_initializer(
                                  minval=-3e-3, maxval=3e-3))

        return qvalue_fn
Example #6
    def make_critic(self,
                    obs,
                    action=None,
                    reuse=False,
                    scope="value_fns",
                    create_qf=True,
                    create_vf=True):
        """Create the critic variables.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic
        create_qf : bool
            whether to create the Q-functions
        create_vf : bool
            whether to create the value function

        Returns
        -------
        tf.Variable
            the output from the first Q-function. Set to None if `create_qf` is
            False.
        tf.Variable
            the output from the second Q-function. Set to None if `create_qf`
            is False.
        tf.Variable
            the output from the value function. Set to None if `create_vf` is
            False.
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # zero out the fingerprint observations for the worker policy
            if self.zero_fingerprint:
                obs = self._remove_fingerprint(
                    obs,
                    self.ob_space.shape[0],
                    self.fingerprint_dim,
                    self.co_space.shape[0]
                )

            # Value function
            if create_vf:
                with tf.compat.v1.variable_scope("vf", reuse=reuse):
                    vf_h = obs

                    # create the hidden layers
                    for i, layer_size in enumerate(self.layers):
                        vf_h = layer(
                            vf_h, layer_size, 'fc{}'.format(i),
                            act_fun=self.act_fun,
                            layer_norm=self.layer_norm
                        )

                    # create the output layer
                    value_fn = layer(
                        vf_h, 1, 'vf_output',
                        kernel_initializer=tf.random_uniform_initializer(
                            minval=-3e-3, maxval=3e-3)
                    )
            else:
                value_fn = None

            # Double Q values to reduce overestimation
            if create_qf:
                with tf.compat.v1.variable_scope('qf1', reuse=reuse):
                    # concatenate the observations and actions
                    qf1_h = tf.concat([obs, action], axis=-1)

                    # create the hidden layers
                    for i, layer_size in enumerate(self.layers):
                        qf1_h = layer(
                            qf1_h, layer_size, 'fc{}'.format(i),
                            act_fun=self.act_fun,
                            layer_norm=self.layer_norm
                        )

                    # create the output layer
                    qf1 = layer(
                        qf1_h, 1, 'qf_output',
                        kernel_initializer=tf.random_uniform_initializer(
                            minval=-3e-3, maxval=3e-3)
                    )

                with tf.compat.v1.variable_scope('qf2', reuse=reuse):
                    # concatenate the observations and actions
                    qf2_h = tf.concat([obs, action], axis=-1)

                    # create the hidden layers
                    for i, layer_size in enumerate(self.layers):
                        qf2_h = layer(
                            qf2_h, layer_size, 'fc{}'.format(i),
                            act_fun=self.act_fun,
                            layer_norm=self.layer_norm
                        )

                    # create the output layer
                    qf2 = layer(
                        qf2_h, 1, 'qf_output',
                        kernel_initializer=tf.random_uniform_initializer(
                            minval=-3e-3, maxval=3e-3)
                    )
            else:
                qf1, qf2 = None, None

        return qf1, qf2, value_fn
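
The pair of Q-functions exists to support clipped double-Q learning: when forming bootstrap targets, the element-wise minimum of the two estimates is used so that a single over-optimistic head cannot inflate the target. A hedged sketch of the standard SAC-style combination follows; the entropy coefficient alpha is an assumed hyperparameter, and this is the textbook form, not necessarily this codebase's exact loss:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

# stand-ins for the tensors make_critic would return on a target network
qf1 = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name="qf1")
qf2 = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name="qf2")
logp_pi = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name="logp_pi")

alpha = 0.2  # assumed entropy coefficient

# clipped double-Q: pessimistic minimum of the two estimates
min_qf = tf.minimum(qf1, qf2)

# SAC value-function target; stop_gradient keeps it fixed during updates
value_target = tf.stop_gradient(min_qf - alpha * logp_pi)
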
Example #7
    def make_actor(self, obs, action, reuse=False, scope="pi"):
        """Create the actor variables.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the deterministic actor
        tf.Variable
            the output from the stochastic actor
        tf.Variable
            the log-probability of the action sampled by the stochastic actor
        tf.Variable
            the log-probability of the input action under the current policy
        """
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            pi_h = obs

            # zero out the fingerprint observations for the worker policy
            if self.zero_fingerprint:
                pi_h = self._remove_fingerprint(
                    pi_h,
                    self.ob_space.shape[0],
                    self.fingerprint_dim,
                    self.co_space.shape[0]
                )

            # create the hidden layers
            for i, layer_size in enumerate(self.layers):
                pi_h = layer(
                    pi_h,  layer_size, 'fc{}'.format(i),
                    act_fun=self.act_fun,
                    layer_norm=self.layer_norm
                )

            # create the output mean
            policy_mean = layer(
                pi_h, self.ac_space.shape[0], 'mean',
                act_fun=None,
                kernel_initializer=tf.random_uniform_initializer(
                    minval=-3e-3, maxval=3e-3)
            )

            # create the output log_std
            log_std = layer(
                pi_h, self.ac_space.shape[0], 'log_std',
                act_fun=None,
            )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(
            policy_mean, action, logp_ac)
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            policy_mean, policy, logp_pi)

        return deterministic_policy, policy, logp_pi, logp_ac
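
LOG_STD_MIN and LOG_STD_MAX are module-level constants defined elsewhere; SAC implementations commonly set them to -20 and 2 (an assumption here). Clipping log_std to that range bounds std = exp(log_std) away from both zero and infinity, avoiding a collapsed or numerically unstable Gaussian:

import numpy as np

LOG_STD_MIN = -20  # assumed: the common SAC choice
LOG_STD_MAX = 2

print(np.exp(LOG_STD_MIN))  # ~2.1e-09: smallest standard deviation allowed
print(np.exp(LOG_STD_MAX))  # ~7.39: largest standard deviation allowed
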
Example #8
    def test_layer(self):
        """Check the functionality of the layer() method.

        This method is tested for the following features:

        1. the number of outputs from the layer equals num_outputs
        2. the name is properly used
        3. the proper activation function applied if requested
        4. layer_norm is applied if requested
        """
        # =================================================================== #
        # test case 1                                                         #
        # =================================================================== #

        # Choose a random number of outputs.
        num_outputs = random.randint(1, 10)

        # Create the layer.
        out_val = layer(
            val=tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, 1),
                name='input_test1',
            ),
            num_outputs=num_outputs,
            name="test1",
        )

        # Test the number of outputs.
        self.assertEqual(out_val.shape[-1], num_outputs)

        # Clear the graph.
        tf.compat.v1.reset_default_graph()

        # =================================================================== #
        # test case 2                                                         #
        # =================================================================== #

        # Create the layer.
        out_val = layer(
            val=tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, 1),
                name='input_test2',
            ),
            num_outputs=num_outputs,
            name="test2",
        )

        # Test the name matches what is expected.
        self.assertEqual(out_val.name, "test2/BiasAdd:0")

        # Clear the graph.
        tf.compat.v1.reset_default_graph()

        # =================================================================== #
        # test case 3                                                         #
        # =================================================================== #

        # Create the layer.
        out_val = layer(
            val=tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, 1),
                name='input_test3',
            ),
            act_fun=tf.nn.relu,
            num_outputs=num_outputs,
            name="test3",
        )

        # Test that the name matches the activation function that was added.
        self.assertEqual(out_val.name, "Relu:0")

        # Clear the graph.
        tf.compat.v1.reset_default_graph()

        # =================================================================== #
        # test case 4                                                         #
        # =================================================================== #

        # Create the layer.
        _ = layer(
            val=tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, 1),
                name='input_test4',
            ),
            layer_norm=True,
            num_outputs=num_outputs,
            name="test4",
        )

        # Test that the LayerNorm layer was added.
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]),
            ['LayerNorm/beta:0',
             'LayerNorm/gamma:0',
             'test4/bias:0',
             'test4/kernel:0']
        )

        # Clear the graph.
        tf.compat.v1.reset_default_graph()
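
For reference, a layer() implementation consistent with everything these tests assert (variables '<name>/kernel:0' and '<name>/bias:0', raw output '<name>/BiasAdd:0', activation output 'Relu:0', and 'LayerNorm/{beta,gamma}:0' when layer_norm is set) could look as follows. This is a hedged reconstruction from the test expectations, assuming TF 1.x where tf.contrib.layers.layer_norm is available:

import tensorflow as tf


def layer(val, num_outputs, name, act_fun=None, layer_norm=False,
          kernel_initializer=None):
    """Dense layer with optional layer normalization and activation."""
    # yields variables '<name>/kernel:0' and '<name>/bias:0' and the raw
    # output tensor '<name>/BiasAdd:0'
    val = tf.compat.v1.layers.dense(
        val, num_outputs, name=name, kernel_initializer=kernel_initializer)

    if layer_norm:
        # creates the 'LayerNorm/beta:0' and 'LayerNorm/gamma:0' variables
        val = tf.contrib.layers.layer_norm(val, center=True, scale=True)

    if act_fun is not None:
        val = act_fun(val)  # e.g. tf.nn.relu names the output 'Relu:0'

    return val
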