def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False, n_lstm=256):
    """
    Build the recurrent ACER policy graph: CNN features, one LSTM layer, then
    a softmax policy head and a Q-value head.

    The placeholders ``self.obs_ph``, ``self.masks_ph``, ``self.states_ph`` and
    the attribute ``self.n_act`` are read here; they are expected to be set up
    by the parent constructor (confirm against the base class).

    :param sess: forwarded unchanged to the parent constructor
    :param ob_space: forwarded unchanged to the parent constructor
    :param ac_space: forwarded unchanged to the parent constructor
    :param n_env: forwarded to the parent constructor; also used to split the batch into
        sequences and as the number of rows of the initial state
    :param n_steps: forwarded to the parent constructor; also used to split the batch into sequences
    :param n_stack: forwarded unchanged to the parent constructor
    :param reuse: forwarded to the parent constructor and to the "model" variable scope
    :param n_lstm: (int) number of hidden units of the LSTM; the initial state has ``n_lstm * 2`` columns
    """
    super(AcerLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse, n_lstm)

    with tf.variable_scope("model", reuse=reuse):
        cnn_features = nature_cnn(self.obs_ph)

        # Recurrent core: reshape batch -> sequences, run the LSTM, reshape back.
        feature_seq = batch_to_seq(cnn_features, n_env, n_steps)
        mask_seq = batch_to_seq(self.masks_ph, n_env, n_steps)
        lstm_out, self.snew = lstm(feature_seq, mask_seq, self.states_ph, 'lstm1', n_hidden=n_lstm)
        lstm_out = seq_to_batch(lstm_out)

        # Output heads, both with self.n_act units.
        logits = linear(lstm_out, 'pi', self.n_act, init_scale=0.01)
        action_probs = tf.nn.softmax(logits)
        action_values = linear(lstm_out, 'q', self.n_act)

    # Actions are drawn from the raw logits (could be changed to use self.pi instead).
    self.action = sample(logits)
    # All-zero starting state, one row per environment.
    self.initial_state = np.zeros((n_env, n_lstm * 2), dtype=np.float32)
    self.policy = action_probs  # actual policy params now
    self.q_value = action_values
def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, layer_norm=False,
             **kwargs):
    """
    Build the recurrent actor-critic graph: CNN features, one LSTM layer, a
    scalar value head and an action distribution produced by ``self.pdtype``.

    ``self.obs_ph``, ``self.masks_ph``, ``self.states_ph``, ``self.n_env`` and
    ``self.pdtype`` are read here; they are expected to be set up by the parent
    constructor (confirm against the base class).

    :param sess: forwarded unchanged to the parent constructor
    :param ob_space: forwarded unchanged to the parent constructor
    :param ac_space: forwarded unchanged to the parent constructor
    :param n_batch: forwarded unchanged to the parent constructor
    :param n_steps: forwarded to the parent constructor; also used to split the batch into sequences
    :param n_lstm: (int) number of hidden units of the LSTM; the initial state has ``n_lstm * 2`` columns
    :param reuse: forwarded to the parent constructor and to the "model" variable scope
    :param layer_norm: forwarded to ``lstm`` as its ``layer_norm`` argument
    :param kwargs: (dict) extra keyword arguments forwarded to ``nature_cnn``
    """
    super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse)

    with tf.variable_scope("model", reuse=reuse):
        cnn_features = nature_cnn(self.obs_ph, **kwargs)

        # Recurrent core: reshape batch -> sequences, run the LSTM, reshape back.
        feature_seq = batch_to_seq(cnn_features, self.n_env, n_steps)
        mask_seq = batch_to_seq(self.masks_ph, self.n_env, n_steps)
        lstm_out, self.snew = lstm(feature_seq, mask_seq, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=layer_norm)
        lstm_out = seq_to_batch(lstm_out)

        # Value head with a single output unit.
        state_value = linear(lstm_out, 'v', 1)

        # The distribution type builds its own 'pi' layer on top of the LSTM output.
        self.proba_distribution, self.policy = self.pdtype.proba_distribution_from_latent(lstm_out)

    # First (and only) column of the value head output.
    self._value = state_value[:, 0]
    self.action = self.proba_distribution.sample()
    self.neglogp = self.proba_distribution.neglogp(self.action)
    # All-zero starting state, one row per environment.
    self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32)
    self.value_fn = state_value
def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build the probability distribution on top of latent values.

    A linear layer named 'pi' with ``self.n_cat`` outputs maps the latent
    values to the flat distribution parameters, which are then handed to
    ``proba_distribution_from_flat``.

    :param latent_vector: ([float]) the latent values
    :param init_scale: (float) the initial scale, forwarded to the linear layer
    :param init_bias: (float) the initial bias, forwarded to the linear layer
    :return: (tuple) the distribution returned by ``proba_distribution_from_flat``
        and the flat parameters it was built from
    """
    flat_params = linear(latent_vector, 'pi', self.n_cat, init_scale=init_scale, init_bias=init_bias)
    distribution = self.proba_distribution_from_flat(flat_params)
    return distribution, flat_params
def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build the probability distribution on top of latent values.

    The mean comes from a linear layer named 'pi' with ``self.size`` outputs.
    The log standard deviation is a separate zero-initialised variable named
    'logstd' of shape ``[1, self.size]``; it does not depend on the latent values.

    :param latent_vector: ([float]) the latent values
    :param init_scale: (float) the initial scale, forwarded to the linear layer
    :param init_bias: (float) the initial bias, forwarded to the linear layer
    :return: (tuple) the distribution returned by ``proba_distribution_from_flat``
        and the mean (not the full flat parameters)
    """
    action_mean = linear(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
    log_std = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer())

    # Adding the single-row log-std to a zeroed copy of the mean broadcasts it
    # to the mean's shape, so both halves can be concatenated along axis 1.
    tiled_log_std = action_mean * 0.0 + log_std
    flat_params = tf.concat([action_mean, tiled_log_std], axis=1)

    distribution = self.proba_distribution_from_flat(flat_params)
    return distribution, action_mean
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False):
    """
    Build the feed-forward ACER policy graph: CNN features followed by a
    softmax policy head and a Q-value head.

    ``self.obs_ph`` and ``self.n_act`` are read here; they are expected to be
    set up by the parent constructor (confirm against the base class).

    :param sess: forwarded unchanged to the parent constructor
    :param ob_space: forwarded unchanged to the parent constructor
    :param ac_space: forwarded unchanged to the parent constructor
    :param n_env: forwarded unchanged to the parent constructor
    :param n_steps: forwarded unchanged to the parent constructor
    :param n_stack: forwarded unchanged to the parent constructor
    :param reuse: forwarded to the parent constructor and to the "model" variable scope
    """
    super(AcerCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse)

    with tf.variable_scope("model", reuse=reuse):
        cnn_features = nature_cnn(self.obs_ph)

        # Output heads, both with self.n_act units.
        logits = linear(cnn_features, 'pi', self.n_act, init_scale=0.01)
        action_probs = tf.nn.softmax(logits)
        action_values = linear(cnn_features, 'q', self.n_act)

    # Actions are drawn from the raw logits (could be changed to use self.pi instead).
    self.action = sample(logits)
    self.initial_state = []  # not stateful
    self.policy = action_probs  # actual policy params now
    self.q_value = action_values
def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, _type="cnn", **kwargs):
    """
    Build a feed-forward actor-critic graph with either a CNN or an MLP
    feature extractor.

    With ``_type == "cnn"`` the policy and the value head share the output of
    ``nature_cnn``. With any other value, the flattened input goes through two
    separate stacks of two 64-unit tanh layers, one for the policy and one for
    the value.

    ``self.processed_x`` and ``self.pdtype`` are read here; they are expected
    to be set up by the parent constructor (confirm against the base class).

    :param sess: forwarded unchanged to the parent constructor
    :param ob_space: forwarded unchanged to the parent constructor
    :param ac_space: forwarded unchanged to the parent constructor
    :param n_batch: forwarded unchanged to the parent constructor
    :param n_steps: forwarded unchanged to the parent constructor
    :param n_lstm: forwarded unchanged to the parent constructor (not used by this graph)
    :param reuse: forwarded to the parent constructor and to the "model" variable scope
    :param _type: (str) "cnn" selects the CNN extractor; anything else selects the MLP
    :param kwargs: (dict) extra keyword arguments forwarded to ``nature_cnn`` (CNN branch only)
    """
    super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse)

    def tanh_layer(inputs, scope):
        # One 64-unit fully connected layer followed by tanh.
        return tf.tanh(linear(inputs, scope, n_hidden=64, init_scale=np.sqrt(2)))

    with tf.variable_scope("model", reuse=reuse):
        if _type == "cnn":
            policy_latent = nature_cnn(self.processed_x, **kwargs)
            state_value = linear(policy_latent, 'v', 1)[:, 0]
        else:
            flat_input = tf.layers.flatten(self.processed_x)
            # Policy stack first, then value stack: layer creation order is kept as-is.
            policy_hidden = tanh_layer(flat_input, 'pi_fc1')
            policy_hidden = tanh_layer(policy_hidden, 'pi_fc2')
            value_hidden = tanh_layer(flat_input, 'vf_fc1')
            value_hidden = tanh_layer(value_hidden, 'vf_fc2')
            state_value = linear(value_hidden, 'vf', 1)[:, 0]
            policy_latent = policy_hidden

        # The distribution type builds its own 'pi' layer on top of the latent features.
        self.proba_distribution, self.policy = self.pdtype.proba_distribution_from_latent(
            policy_latent, init_scale=0.01)

    self.action = self.proba_distribution.sample()
    self.neglogp = self.proba_distribution.neglogp(self.action)
    self.initial_state = None
    self.value_fn = state_value
def nature_cnn(unscaled_images, **kwargs):
    """
    CNN from Nature paper.

    The input is cast to float32 and divided by 255, passed through three
    ReLU convolutional layers ('c1', 'c2', 'c3'), flattened with
    ``conv_to_fc`` and fed to a 512-unit ReLU fully connected layer ('fc1').

    :param unscaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
        (they are not passed to the final fully connected layer)
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    sqrt_2 = np.sqrt(2)

    # (scope, n_filters, filter_size, stride) for each convolutional layer, in order.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )

    hidden = tf.cast(unscaled_images, tf.float32) / 255.
    for scope, n_filters, filter_size, stride in conv_specs:
        hidden = relu(conv(hidden, scope, n_filters=n_filters, filter_size=filter_size, stride=stride,
                           init_scale=sqrt_2, **kwargs))

    flattened = conv_to_fc(hidden)
    return relu(linear(flattened, 'fc1', n_hidden=512, init_scale=sqrt_2))