Example #1
    def build_graph(self):
        """
        Builds computational graph for policy
        """
        with tf.variable_scope(self.name):
            # build the actual policy network
            self.obs_var, self.mean_var = create_mlp(name='mean_network',
                                                     output_dim=self.action_dim,
                                                     hidden_sizes=self.hidden_sizes,
                                                     hidden_nonlinearity=self.hidden_nonlinearity,
                                                     output_nonlinearity=self.output_nonlinearity,
                                                     input_dim=(None, self.obs_dim,)
                                                     )

            with tf.variable_scope("log_std_network"):
                log_std_var = tf.get_variable(name='log_std_var',
                                              shape=(1, self.action_dim,),
                                              dtype=tf.float32,
                                              initializer=tf.constant_initializer(self.init_log_std),
                                              trainable=self.learn_std
                                              )

                self.log_std_var = tf.maximum(log_std_var, self.min_log_std, name='log_std')

            # symbolically define sampled action and distribution
            # (reparameterized Gaussian sample using the clipped log-std)
            self.action_var = self.mean_var + tf.random_normal(shape=tf.shape(self.mean_var)) * tf.exp(self.log_std_var)
            self._dist = DiagonalGaussian(self.action_dim)

            # save the policy's trainable variables in dicts
            current_scope = tf.get_default_graph().get_name_scope()
            trainable_policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
            self.policy_params = OrderedDict([(remove_scope_from_name(var.name, current_scope), var) for var in trainable_policy_vars])
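The sampled action near the end of this graph is the standard reparameterization of a diagonal Gaussian: standard-normal noise is scaled by exp(log_std) and shifted by the mean. A self-contained NumPy sketch of that sampling step, with illustrative shapes and values (not taken from the original codebase):

import numpy as np

action_dim = 4
mean = np.zeros((1, action_dim))             # output of the mean network (illustrative)
log_std = np.full((1, action_dim), -0.5)     # learned log standard deviation (illustrative)
eps = np.random.randn(*mean.shape)           # standard-normal noise
action = mean + eps * np.exp(log_std)        # reparameterized Gaussian sample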
Example #2
    def build_graph(self):
        """
        Builds computational graph for policy
        """
        with tf.variable_scope(self.name):
            # build the actual policy network
            self.obs_var, self.prob_var = create_mlp(
                name='prob_network',
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_dim=(
                    None,
                    self.obs_dim,
                ))

            # symbolically define sampled action and distribution
            # (tf.random.categorical expects logits, hence the log of the probabilities)
            self.action_var = tf.random.categorical(tf.log(self.prob_var), 1)
            self._dist = Categorical(self.action_dim)

            # save the policy's trainable variables in dicts
            current_scope = tf.get_default_graph().get_name_scope()
            trainable_policy_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
            self.policy_params = OrderedDict([
                (remove_scope_from_name(var.name, current_scope), var)
                for var in trainable_policy_vars
            ])
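In Example #2, tf.random.categorical draws one action index per row of logits, which is why the network's probabilities are passed through tf.log first. A self-contained NumPy sketch of an equivalent per-row sampling step (the probability values are illustrative):

import numpy as np

probs = np.array([[0.1, 0.7, 0.2],
                  [0.3, 0.3, 0.4]])          # output of the prob network (illustrative)
actions = np.array([np.random.choice(probs.shape[1], p=row) for row in probs])
# one sampled action index per observation, drawn from the same distribution
# as tf.random.categorical(tf.log(probs), 1)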
Example #3
    def _create_placeholders_for_vars(self, scope,
                                      graph_keys=tf.GraphKeys.TRAINABLE_VARIABLES):
        # create one placeholder per variable in the given scope, keyed by the
        # variable name with the leading scope prefix stripped
        var_list = tf.get_collection(graph_keys, scope=scope)
        placeholders = []
        for var in var_list:
            var_name = remove_scope_from_name(var.name, scope.split('/')[0])
            placeholders.append((var_name,
                                 tf.placeholder(tf.float32,
                                                shape=var.shape,
                                                name="%s_ph" % var_name)))
        return OrderedDict(placeholders)
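A common reason for building one placeholder per variable (not shown in the snippet) is to feed new parameter values into the graph through assign ops. A minimal self-contained TF1-style sketch of that pattern, with illustrative variable and scope names:

# Illustrative sketch only: names and the assign-op pattern are assumptions,
# not taken from the original codebase.
import tensorflow as tf
from collections import OrderedDict

with tf.variable_scope('toy_policy'):
    w = tf.get_variable('w', shape=(2, 3), dtype=tf.float32)

var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='toy_policy')
placeholders = OrderedDict(
    (var.name.split(':')[0], tf.placeholder(tf.float32, shape=var.shape))
    for var in var_list)
assign_ops = [tf.assign(var, ph) for var, ph in zip(var_list, placeholders.values())]
# running assign_ops with a feed_dict {placeholder: new_value} overwrites the variables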