def make_actor(self, obs, reuse=False, scope="pi"):
    """Create an actor tensor.

    A network computes the mean of the action distribution from the
    observation, and a separate trainable variable (independent of the
    observation) holds the log standard deviation. Actions are sampled by
    adding scaled standard-normal noise to the mean.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the actor

    Returns
    -------
    tf.Tensor
        an action sampled from the policy distribution
    tf.Tensor
        the mean of the policy distribution
    tf.Variable
        the log standard deviation of the policy distribution, with shape
        [1, self.ac_space.shape[0]]
    """
    # Initial image pre-processing (for convolutional policies).
    if self.model_params["model_type"] == "conv":
        pi_h = create_conv(
            obs=obs,
            image_height=self.model_params["image_height"],
            image_width=self.model_params["image_width"],
            image_channels=self.model_params["image_channels"],
            ignore_flat_channels=self.model_params["ignore_flat_channels"],
            ignore_image=self.model_params["ignore_image"],
            filters=self.model_params["filters"],
            kernel_sizes=self.model_params["kernel_sizes"],
            strides=self.model_params["strides"],
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )
    else:
        pi_h = obs

    # Create the output mean.
    policy_mean = create_fcnet(
        obs=pi_h,
        layers=self.model_params["layers"],
        num_output=self.ac_space.shape[0],
        stochastic=False,
        act_fun=self.model_params["act_fun"],
        layer_norm=self.model_params["layer_norm"],
        scope=scope,
        reuse=reuse,
    )

    # Create the output log_std. `scope` and `reuse` are only forwarded to
    # the network builders above; this variable is created in whatever
    # variable scope is active when the method is called. The compat.v1
    # entry point is used (instead of the removed top-level alias) to match
    # the tf.compat.v1 usage elsewhere in this file.
    log_std = tf.compat.v1.get_variable(
        name='logstd',
        shape=[1, self.ac_space.shape[0]],
        initializer=tf.zeros_initializer())

    # Create a method to sample from the distribution.
    std = tf.exp(log_std)
    action = policy_mean + std * tf.random.normal(
        shape=tf.shape(policy_mean), dtype=tf.float32)

    return action, policy_mean, log_std
def make_critic(self, obs, action, reuse=False, scope="qf"):
    """Build the Q-function output for an observation/action pair.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name forwarded to the network builders

    Returns
    -------
    tf.Variable
        the output from the critic
    """
    params = self.model_params

    # The critic consumes the observation and action joined on the last axis.
    critic_input = tf.concat([obs, action], axis=-1)

    # Convolutional models pre-process the joined input before the
    # fully-connected layers; other model types use it as is.
    if params["model_type"] == "conv":
        conv_keys = (
            "image_height",
            "image_width",
            "image_channels",
            "ignore_flat_channels",
            "ignore_image",
            "filters",
            "kernel_sizes",
            "strides",
            "act_fun",
            "layer_norm",
            "batch_norm",
        )
        conv_kwargs = {key: params[key] for key in conv_keys}
        critic_input = create_conv(
            obs=critic_input,
            phase=self.phase_ph,
            dropout=params["dropout"],
            rate=self.rate_ph,
            scope=scope,
            reuse=reuse,
            **conv_kwargs
        )

    q_value = create_fcnet(
        obs=critic_input,
        layers=params["layers"],
        num_output=1,
        stochastic=False,
        act_fun=params["act_fun"],
        layer_norm=params["layer_norm"],
        batch_norm=params["batch_norm"],
        phase=self.phase_ph,
        dropout=params["dropout"],
        rate=self.rate_ph,
        output_pre="qf_",
        scope=scope,
        reuse=reuse,
    )
    return q_value
def make_actor(self, obs, ac_space, reuse=False, scope="pi"):
    """Build a deterministic actor whose output is bounded by `ac_space`.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder of the individual agent
    ac_space : gym.space.*
        the action space of the individual agent
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the actor

    Returns
    -------
    tf.Variable
        the output from the actor
    """
    params = self.model_params

    # Convolutional policies first run the observation through the image
    # pre-processing layers; every other model type consumes it directly.
    features = obs
    if params["model_type"] == "conv":
        conv_keys = (
            "image_height",
            "image_width",
            "image_channels",
            "ignore_flat_channels",
            "ignore_image",
            "filters",
            "kernel_sizes",
            "strides",
            "act_fun",
            "layer_norm",
        )
        conv_kwargs = {key: params[key] for key in conv_keys}
        features = create_conv(
            obs=obs, scope=scope, reuse=reuse, **conv_kwargs)

    # Fully-connected head producing one unbounded value per action
    # dimension.
    raw_output = create_fcnet(
        obs=features,
        layers=params["layers"],
        num_output=ac_space.shape[0],
        stochastic=False,
        act_fun=params["act_fun"],
        layer_norm=params["layer_norm"],
        scope=scope,
        reuse=reuse,
    )

    # Centre and half-width of the action bounds, used to map the tanh
    # output from [-1, 1] onto [ac_space.low, ac_space.high].
    midpoint = (ac_space.high + ac_space.low) / 2.
    half_range = (ac_space.high - ac_space.low) / 2.

    squashed = tf.nn.tanh(raw_output)
    return midpoint + half_range * squashed
def make_critic(self, obs, reuse=False, scope="qf"):
    """Build a critic that maps an observation to a single scalar output.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name forwarded to the network builders

    Returns
    -------
    tf.Variable
        the output from the critic
    """
    params = self.model_params

    # Convolutional models pre-process the observation before the
    # fully-connected layers; other model types use it unchanged.
    features = obs
    if params["model_type"] == "conv":
        conv_keys = (
            "image_height",
            "image_width",
            "image_channels",
            "ignore_flat_channels",
            "ignore_image",
            "filters",
            "kernel_sizes",
            "strides",
            "act_fun",
            "layer_norm",
        )
        conv_kwargs = {key: params[key] for key in conv_keys}
        features = create_conv(
            obs=obs, scope=scope, reuse=reuse, **conv_kwargs)

    value = create_fcnet(
        obs=features,
        layers=params["layers"],
        num_output=1,
        stochastic=False,
        act_fun=params["act_fun"],
        layer_norm=params["layer_norm"],
        scope=scope,
        reuse=reuse,
    )
    return value
def make_critic(self,
                obs,
                action=None,
                reuse=False,
                scope="value_fns",
                create_qf=True,
                create_vf=True):
    """Create the critic variables.

    Depending on the flags, this builds a value function ("vf") and/or two
    Q-functions ("qf1" and "qf2"), all inside the variable scope `scope`.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder. Required when `create_qf` is True,
        since the Q-functions take the concatenated observation and action
        as input.
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the variable scope under which the value functions are created
    create_qf : bool
        whether to create the Q-functions
    create_vf : bool
        whether to create the value function

    Returns
    -------
    tf.Variable
        the output from the first Q-function. Set to None if `create_qf` is
        False.
    tf.Variable
        the output from the second Q-function. Set to None if `create_qf`
        is False.
    tf.Variable
        the output from the value function. Set to None if `create_vf` is
        False.

    Raises
    ------
    ValueError
        if `create_qf` is True but no `action` was provided
    """
    # Fail early with a clear message: `action` defaults to None while
    # `create_qf` defaults to True, and the concatenation below cannot
    # handle a missing action. Checking here also avoids creating the value
    # function variables before the failure.
    if create_qf and action is None:
        raise ValueError(
            "`action` must be provided when `create_qf` is True.")

    # Keyword arguments shared by every create_conv call below.
    conv_params = dict(
        image_height=self.model_params["image_height"],
        image_width=self.model_params["image_width"],
        image_channels=self.model_params["image_channels"],
        ignore_flat_channels=self.model_params["ignore_flat_channels"],
        ignore_image=self.model_params["ignore_image"],
        filters=self.model_params["filters"],
        kernel_sizes=self.model_params["kernel_sizes"],
        strides=self.model_params["strides"],
        act_fun=self.model_params["act_fun"],
        layer_norm=self.model_params["layer_norm"],
        reuse=reuse,
    )

    # Keyword arguments shared by every create_fcnet call below; each
    # network emits a single scalar output.
    fcnet_params = dict(
        layers=self.model_params["layers"],
        num_output=1,
        stochastic=False,
        act_fun=self.model_params["act_fun"],
        layer_norm=self.model_params["layer_norm"],
        reuse=reuse,
    )

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # Value function
        if create_vf:
            if self.model_params["model_type"] == "conv":
                vf_h = create_conv(obs=obs, scope="vf", **conv_params)
            else:
                vf_h = obs

            value_fn = create_fcnet(
                obs=vf_h, scope="vf", output_pre="vf_", **fcnet_params)
        else:
            value_fn = None

        # Double Q values to reduce overestimation
        if create_qf:
            # Concatenate the observations and actions.
            qf_h = tf.concat([obs, action], axis=-1)

            if self.model_params["model_type"] == "conv":
                qf1_h = create_conv(obs=qf_h, scope="qf1", **conv_params)
                qf2_h = create_conv(obs=qf_h, scope="qf2", **conv_params)
            else:
                qf1_h = qf_h
                qf2_h = qf_h

            qf1 = create_fcnet(
                obs=qf1_h, scope="qf1", output_pre="qf_", **fcnet_params)
            qf2 = create_fcnet(
                obs=qf2_h, scope="qf2", output_pre="qf_", **fcnet_params)
        else:
            qf1, qf2 = None, None

    return qf1, qf2, value_fn
def make_actor(self, obs, action, reuse=False, scope="pi"):
    """Build the stochastic actor and its log-probability tensors.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the actor

    Returns
    -------
    tf.Variable
        the output from the deterministic actor
    tf.Variable
        the output from the stochastic actor
    tf.Variable
        the log-probability of the action sampled from the policy
    tf.Variable
        the log-probability of the fixed input action
    """
    params = self.model_params

    # Convolutional policies first run the observation through the image
    # pre-processing layers; every other model type consumes it directly.
    features = obs
    if params["model_type"] == "conv":
        conv_keys = (
            "image_height",
            "image_width",
            "image_channels",
            "ignore_flat_channels",
            "ignore_image",
            "filters",
            "kernel_sizes",
            "strides",
            "act_fun",
            "layer_norm",
        )
        conv_kwargs = {key: params[key] for key in conv_keys}
        features = create_conv(
            obs=obs, scope=scope, reuse=reuse, **conv_kwargs)

    # With stochastic=True the network returns both the mean and the log
    # standard deviation.
    mean, raw_log_std = create_fcnet(
        obs=features,
        layers=params["layers"],
        num_output=self.ac_space.shape[0],
        stochastic=True,
        act_fun=params["act_fun"],
        layer_norm=params["layer_norm"],
        scope=scope,
        reuse=reuse,
    )

    # Cap the log standard deviation (OpenAI variation) before
    # exponentiating.
    bounded_log_std = tf.clip_by_value(raw_log_std, LOG_STD_MIN, LOG_STD_MAX)
    sigma = tf.exp(bounded_log_std)

    # Reparameterization trick: sample as mean + noise * sigma.
    noise = tf.random.normal(tf.shape(mean))
    sample = mean + noise * sigma

    # Log-likelihoods before squashing, for the sampled and the given action.
    sample_logp = gaussian_likelihood(sample, mean, bounded_log_std)
    given_logp = gaussian_likelihood(action, mean, bounded_log_std)

    # Apply squashing and account for it in the probabilities. Only the
    # corrected log-probability is kept for the given action.
    given_logp = apply_squashing_func(mean, action, given_logp)[2]
    squashed_mean, squashed_sample, sample_logp = apply_squashing_func(
        mean, sample, sample_logp)

    return squashed_mean, squashed_sample, sample_logp, given_logp
def make_critic(self,
                obs,
                action=None,
                reuse=False,
                scope="value_fns",
                create_qf=True,
                create_vf=True):
    """Create the critic variables.

    Depending on the flags, this builds a value function ("vf") and/or two
    Q-functions ("qf1" and "qf2"), all inside the variable scope `scope`.
    The value-function branch additionally contains an inline image
    pre-processing path that is driven by attributes on `self`
    (`includes_image`, `image_height`, `filters`, ...) rather than by
    `self.model_params`.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder. It is concatenated with `obs` whenever
        `create_qf` is True, so it must be provided in that case even
        though it defaults to None.
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the variable scope under which the value functions are created
    create_qf : bool
        whether to create the Q-functions
    create_vf : bool
        whether to create the value function

    Returns
    -------
    tf.Variable
        the output from the first Q-function. Set to None if `create_qf` is
        False.
    tf.Variable
        the output from the second Q-function. Set to None if `create_qf`
        is False.
    tf.Variable
        the output from the value function. Set to None if `create_vf` is
        False.
    """
    # Keyword arguments shared by every create_conv call below.
    conv_params = dict(
        image_height=self.model_params["image_height"],
        image_width=self.model_params["image_width"],
        image_channels=self.model_params["image_channels"],
        ignore_flat_channels=self.model_params["ignore_flat_channels"],
        ignore_image=self.model_params["ignore_image"],
        filters=self.model_params["filters"],
        kernel_sizes=self.model_params["kernel_sizes"],
        strides=self.model_params["strides"],
        act_fun=self.model_params["act_fun"],
        layer_norm=self.model_params["layer_norm"],
        reuse=reuse,
    )

    # Keyword arguments shared by every create_fcnet call below; each
    # network emits a single scalar output.
    fcnet_params = dict(
        layers=self.model_params["layers"],
        num_output=1,
        stochastic=False,
        act_fun=self.model_params["act_fun"],
        layer_norm=self.model_params["layer_norm"],
        reuse=reuse,
    )

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # Value function
        if create_vf:
            if self.model_params["model_type"] == "conv":
                vf_h = create_conv(obs=obs, scope="vf", **conv_params)
            else:
                vf_h = obs

            # NOTE(review): the original indentation of this section was
            # lost; it is laid out here as running after the if/else above
            # (i.e. for both model types). Confirm whether it was instead
            # meant to apply only to the non-"conv" branch.
            #
            # if an image is present in the observation
            # extra processing steps are needed
            if self.includes_image:
                batch_size = tf.shape(vf_h)[0]
                # Number of leading entries of each row that are treated as
                # the flattened image.
                image_size = (self.image_height * self.image_width *
                              self.image_channels)

                # Keep the full input so the image part can still be sliced
                # out after vf_h is narrowed to the non-image part.
                original_vf_h = vf_h
                vf_h = original_vf_h[:, image_size:]
                # Drop the ignored channels; the indices are relative to the
                # non-image part of the input.
                vf_h = tf.gather(vf_h, [
                    i for i in range(vf_h.shape[1])
                    if i not in self.ignore_flat_channels
                ], axis=1)

                # ignoring the image is useful for the lower level
                # for creating an abstraction barrier
                if not self.ignore_image:
                    vf_h_image = tf.reshape(
                        original_vf_h[:, :image_size], [
                            batch_size, self.image_height, self.image_width,
                            self.image_channels
                        ])

                    # create the hidden convolutional layers
                    for i, (filters, kernel_size, strides) in enumerate(
                            zip(self.filters, self.kernel_sizes,
                                self.strides)):
                        vf_h_image = self._conv_layer(
                            vf_h_image,
                            filters,
                            kernel_size,
                            strides,
                            'conv{}'.format(i),
                            act_fun=self.act_fun,
                            layer_norm=self.layer_norm)

                    # Flatten the convolutional output, divide it by its
                    # number of elements per sample, and prepend it to the
                    # non-image features.
                    h = vf_h_image.shape[1]
                    w = vf_h_image.shape[2]
                    c = vf_h_image.shape[3]
                    vf_h = tf.concat([
                        tf.reshape(vf_h_image, [batch_size, h * w * c]) /
                        tf.cast(h * w * c, tf.float32), vf_h
                    ], 1)

            value_fn = create_fcnet(
                obs=vf_h, scope="vf", output_pre="vf_", **fcnet_params)
        else:
            value_fn = None

        # Double Q values to reduce overestimation
        if create_qf:
            # Concatenate the observations and actions.
            # NOTE(review): `action` defaults to None but is used
            # unconditionally here; callers must pass it when create_qf is
            # True.
            qf_h = tf.concat([obs, action], axis=-1)

            if self.model_params["model_type"] == "conv":
                qf1_h = create_conv(obs=qf_h, scope="qf1", **conv_params)
                qf2_h = create_conv(obs=qf_h, scope="qf2", **conv_params)
            else:
                qf1_h = qf_h
                qf2_h = qf_h

            qf1 = create_fcnet(
                obs=qf1_h, scope="qf1", output_pre="qf_", **fcnet_params)
            qf2 = create_fcnet(
                obs=qf2_h, scope="qf2", output_pre="qf_", **fcnet_params)
        else:
            qf1, qf2 = None, None

    return qf1, qf2, value_fn