def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, pi_init_scale=1.0,
                                   pi_init_bias=0.0, pi_init_std=1.0, vf_init_scale=1.0,
                                   vf_init_bias=0.0):
    """Build the action distribution, policy mean and Q-values from latent features.

    :param pi_latent_vector: (tf.Tensor) latent features for the policy head
    :param vf_latent_vector: (tf.Tensor) latent features for the value head
    :param pi_init_scale: (float) initialization scale for the policy mean layer
    :param pi_init_bias: (float) initialization bias for the policy mean layer
    :param pi_init_std: (float) initial standard deviation stored in the log-std variable
    :param vf_init_scale: (float) initialization scale for the Q-value layer
    :param vf_init_bias: (float) initialization bias for the Q-value layer
    :return: (ProbabilityDistribution, tf.Tensor, tf.Tensor) the distribution, the policy
        mean, and the Q-values
    """
    mean = linear(pi_latent_vector, 'pi', self.size,
                  init_scale=pi_init_scale, init_bias=pi_init_bias)
    # Fixed (non-trainable) log-std; `mean * 0.0 + logstd` broadcasts it to batch shape.
    logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size],
                             initializer=tf.constant_initializer(np.log(pi_init_std)),
                             trainable=False)
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    q_values = linear(vf_latent_vector, 'q', self.size,
                      init_scale=vf_init_scale, init_bias=vf_init_bias)
    return self.proba_distribution_from_flat(pdparam), mean, q_values
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn,
             feature_extraction="mlp", **kwargs):
    """Feed-forward actor-critic policy with either a CNN or an MLP feature extractor.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) the observation space of the environment
    :param ac_space: (Gym Space) the action space of the environment
    :param n_env: (int) number of environments run in parallel
    :param n_steps: (int) number of steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the TensorFlow variable scope
    :param layers: (list) deprecated — sizes of the shared hidden layers; use net_arch instead
    :param net_arch: (list) network architecture spec (see mlp_extractor)
    :param act_fun: (tf function) activation for the MLP extractor
    :param cnn_extractor: (callable) CNN feature extractor
    :param feature_extraction: (str) "cnn" or "mlp"
    :param kwargs: extra keyword arguments forwarded to the CNN extractor
    """
    super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            reuse=reuse,
                                            scale=(feature_extraction == "cnn"))
    self._pdtype = make_proba_dist_type(ac_space)
    self._kwargs_check(feature_extraction, kwargs)

    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` "
                          "parameter!", DeprecationWarning)

    if net_arch is None:
        if layers is None:
            layers = [64, 64]
        # Shared sizes for both the policy and the value sub-networks.
        net_arch = [dict(vf=layers, pi=layers)]

    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
        else:
            flat_obs = tf.layers.flatten(self.processed_obs)
            pi_latent, vf_latent = mlp_extractor(flat_obs, net_arch, act_fun)

        self._value_fn = linear(vf_latent, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(
                pi_latent, vf_latent, pi_init_scale=1.0, pi_init_bias=0.0,
                pi_init_std=0.125, vf_init_scale=1.0, vf_init_bias=0.0)

    self._setup_init()
def sf_cnn(scaled_images, **kwargs) -> tf.Tensor:
    """Three-layer convolutional feature extractor (Nature-DQN style architecture).

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    activ = tf.nn.relu
    # (scope, n_filters, filter_size, stride) for each convolutional stage.
    conv_specs = [('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1)]
    hidden = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        hidden = activ(conv(hidden, scope, n_filters=n_filters, filter_size=filter_size,
                            stride=stride, init_scale=np.sqrt(2), **kwargs))
    flattened = conv_to_fc(hidden)
    return activ(linear(flattened, 'fc1', n_hidden=FEATURE_SIZE, init_scale=np.sqrt(2)))
def __init__(self, sess: tf.Session, ob_space: ObservSpace, ac_space: ActionSpace, n_env: int,
             n_steps: int, n_batch: int, reuse=False, layers=None, cnn_extractor=sf_cnn,
             feature_extraction="cnn", add_action_ph=True, **kwargs):
    """Feed-forward policy with a successor-feature head and a reconstruction module.

    Extracts CNN features, L2-normalizes them, and builds three heads on top: an
    observation-reconstruction module, a successor-feature estimator, and a value
    function.  An exploration reward bonus is defined as the inverse L2 norm of the
    successor features.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (ObservSpace) the observation space of the environment
    :param ac_space: (ActionSpace) the action space; `ac_space.n` is used, so a discrete
        space is assumed
    :param n_env: (int) number of environments run in parallel
    :param n_steps: (int) number of steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the TensorFlow variable scope
    :param layers: (list) unused on the CNN path; kept for interface compatibility
    :param cnn_extractor: (callable) feature extractor network
    :param feature_extraction: (str) only "cnn" is implemented
    :param add_action_ph: (bool) create the action placeholder consumed by the
        reconstruction module
    :param kwargs: extra keyword arguments forwarded to the CNN extractor
    :raises NotImplementedError: if feature_extraction is not "cnn"
    """
    super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                            n_lstm=256, reuse=reuse,
                                            scale=(feature_extraction == "cnn"),
                                            add_action_ph=add_action_ph)
    if layers is None:
        layers = [64, 64]

    with tf.variable_scope("model", reuse=reuse):
        # Guard clause: only the CNN path is implemented.
        if feature_extraction != "cnn":
            raise NotImplementedError('Not implement reconstruction module yet.')

        extracted_features = cnn_extractor(self.processed_x, **kwargs)
        # Features must be flat (batch, features) before normalization.
        assert len(extracted_features.shape) == 2
        # Project every feature vector onto the unit sphere.
        extracted_features = tf.nn.l2_normalize(extracted_features, axis=1)
        # Reconstruction module conditioned on the taken action.
        recons_mod = reconstruct(extracted_features, 'reconstruct', action_ph=self.action_ph,
                                 num_action_space=ac_space.n)
        successor_feature = sf_estimator(extracted_features)
        value_fn = linear(extracted_features, 'vf', 1)
        # Policy and value heads share the same latent features.
        pi_latent = extracted_features
        vf_latent = extracted_features

        self.proba_distribution, self.policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

    self.value_fn = value_fn
    self.recons_mod = recons_mod
    self.successor_feature = successor_feature
    # Exploration bonus: 1 / ||successor_feature||_2 per batch element.
    # NOTE(review): unbounded when the SF norm approaches zero — consider adding an
    # epsilon to the denominator; confirm against training behavior before changing.
    self.reward_bonus = tf.math.reciprocal(
        tf.linalg.norm(self.successor_feature, 2, axis=1))
    self._feature = extracted_features
    self.initial_state = None
    self._setup_init()