def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build the action distribution (mean + log-std parameters) and the Q-value head.

    Behaviour is steered by the ``cfg`` modification flags:

    * ``MOD_PRETRAIN_PI``: the mean layer is initialised from the last weight/bias
      pair returned by ``load_weights()`` instead of being created by ``linear``.
    * ``MOD_BOUND_MEAN``: the mean is squashed with ``tanh``.
    * ``MOD_CONST_EXPLORE``: ``cfg.init_logstd`` is used directly instead of a
      ``pi/logstd`` variable.

    :param pi_latent_vector: (tf.Tensor) Latent input of the policy head
    :param vf_latent_vector: (tf.Tensor) Latent input of the Q-value head
    :param init_scale: (float) Init scale of the Q-value layer; the mean layer uses
        ``cfg.pi_out_init_scale`` instead
    :param init_bias: (float) Initial bias for the ``linear`` layers
    :return: (distribution, tf.Tensor, tf.Tensor) the distribution built from the flat
        parameters, the mean and the Q-values
    """
    if cfg.is_mod(cfg.MOD_PRETRAIN_PI):
        # init the output layer of the policy with the weights of the pretrained policy
        # load_weights() returns [w_hid1, w_hid2, w_out], [b_hid1, b_hid2, b_out]
        ws, bs = load_weights()
        w_out, b_out = ws[-1], bs[-1]
        # check dimensions: pretrained layer must map latent size -> action size
        assert w_out.shape[0] == pi_latent_vector.shape[1]
        assert w_out.shape[1] == self.size
        # construct the linear output layer for mean prediction
        with tf.variable_scope('pi'):
            mean_weight = tf.get_variable(f"w_mean", initializer=w_out)
            mean_bias = tf.get_variable(f"b_mean", initializer=b_out)
            output = tf.matmul(pi_latent_vector, mean_weight) + mean_bias
        mean = output
    else:
        # NOTE: the init_scale argument is not used for the mean layer
        mean = linear(pi_latent_vector, 'pi', self.size, init_scale=cfg.pi_out_init_scale, init_bias=init_bias)
    if cfg.is_mod(cfg.MOD_BOUND_MEAN):
        with tf.variable_scope('pi'):
            mean = tf.tanh(mean)  # squashing mean only
    if cfg.is_mod(cfg.MOD_CONST_EXPLORE):
        logstd = cfg.init_logstd
    else:
        logstd_initializer = tf.constant_initializer(cfg.init_logstd)
        # print(f'Initializing all logstds with: {cfg.init_logstd}')
        logstd = tf.get_variable(name='pi/logstd', shape=(self.size,), initializer=logstd_initializer)
        # clipping of logstd inspired by sac
        # NOTE(review): assumed to apply to the learned logstd only; the flattened
        # source does not show whether the constant branch is clipped too - confirm.
        logstd = tf.clip_by_value(logstd, LOG_STD_MIN, LOG_STD_MAX)
        # log(f'Clipping logstd in range from {LOG_STD_MIN} to {LOG_STD_MAX}')
    # mean * 0.0 + logstd broadcasts logstd to the shape of mean before concatenation
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
    return self.proba_distribution_from_flat(pdparam), mean, q_values
def cnn3D(input_space, **kwargs):
    """
    Custom 3D CNN: two ``conv3d`` layers, a 3D max-pool and three dense layers.

    :param input_space: (TensorFlow Tensor) Input fed to the first ``conv3d`` layer
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) Output of the last dense layer (32 units, ReLU)
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    hidden = input_space
    # Two identical 16-filter 3x3x3 convolutions.
    for scope in ('c1', 'c2'):
        hidden = relu(conv3d(hidden, scope, n_filters=16, filter_size=3, stride=1, init_scale=gain, **kwargs))

    hidden = maxpool3d(hidden, 2, 1, 'VALID')
    hidden = conv_to_fc(hidden)

    # Dense stack: 16 -> 16 -> 32 units.
    for scope, width in (('fc1', 16), ('fc2', 16), ('fc3', 32)):
        hidden = relu(linear(hidden, scope, n_hidden=width, init_scale=gain))
    return hidden
def modified_deep_nature_cnn(scaled_images, **kwargs):
    """
    Reduced variant of the Nature CNN: two convolutions followed by two 128-unit dense layers.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    conv_1 = relu(conv(scaled_images, 'c1', n_filters=8, filter_size=6, stride=1, init_scale=gain, **kwargs))
    conv_2 = relu(conv(conv_1, 'c2', n_filters=16, filter_size=3, stride=1, init_scale=gain, **kwargs))
    flat = conv_to_fc(conv_2)
    dense_1 = relu(linear(flat, 'fc1', n_hidden=128, init_scale=gain))
    dense_2 = relu(linear(dense_1, 'fc2', n_hidden=128, init_scale=gain))
    return dense_2
def mlp_extractor(flat_observations, net_arch, act_fun):
    """
    Build an MLP that maps observations to a latent vector for the policy and one for the value network.

    ``net_arch`` describes the hidden layers and how many of them are shared. It is a list of:

    1. Zero or more integers, each the number of units of a shared layer.
    2. Optionally a dict ``dict(vf=[<value layer sizes>], pi=[<policy layer sizes>])`` describing the
       non-shared layers. A missing key means no non-shared layers for that network. Entries after
       the first dict are ignored.

    Example: ``[55, dict(vf=[255, 255], pi=[128])]`` builds one shared layer of size 55, then two
    value-only layers of size 255 and one policy-only layer of size 128. ``[128, 128]`` builds two
    shared layers and no separate ones.

    :param flat_observations: (tf.Tensor) The observations to base policy and value function on.
    :param net_arch: ([int or dict]) The specification of the policy and value networks.
    :param act_fun: (tf function) The activation function to use for the networks.
    :return: (tf.Tensor, tf.Tensor) latent_policy, latent_value of the specified network.
        If all layers are shared, then ``latent_policy == latent_value``
    """
    gain = np.sqrt(2)
    shared_latent = flat_observations
    pi_sizes = []  # widths of the policy-only layers
    vf_sizes = []  # widths of the value-only layers

    # Shared trunk: consume ints until the first dict is reached.
    for position, spec in enumerate(net_arch):
        if not isinstance(spec, int):
            assert isinstance(spec, dict), "Error: the net_arch list can only contain ints and dicts"
            if 'pi' in spec:
                assert isinstance(spec['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                pi_sizes = spec['pi']
            if 'vf' in spec:
                assert isinstance(spec['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                vf_sizes = spec['vf']
            break  # the network splits into policy and value branches from here on
        shared_latent = act_fun(linear(shared_latent, "shared_fc{}".format(position), spec, init_scale=gain))

    # Separate branches: zip_longest pads the shorter branch with None.
    latent_policy = latent_value = shared_latent
    for depth, (pi_units, vf_units) in enumerate(zip_longest(pi_sizes, vf_sizes)):
        if pi_units is not None:
            assert isinstance(pi_units, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = act_fun(
                linear(latent_policy, "pi/fc{}".format(depth), pi_units, init_scale=gain))
        if vf_units is not None:
            assert isinstance(vf_units, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = act_fun(
                linear(latent_value, "values_fn/vf/fc{}".format(depth), vf_units, init_scale=gain))

    return latent_policy, latent_value
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None, net_arch=None,
             act_fun=tf.tanh, cnn_extractor=nature_cnn, feature_extraction="cnn", **kwargs):
    """
    Feed-forward actor-critic policy with an extra ``vcf`` head next to the value function.

    :param sess: TensorFlow session, forwarded to the parent constructor
    :param ob_space: observation space, forwarded to the parent constructor
    :param ac_space: action space, forwarded to the parent constructor
    :param n_env: number of environments, forwarded to the parent constructor
    :param n_steps: number of steps, forwarded to the parent constructor
    :param n_batch: batch size, forwarded to the parent constructor
    :param reuse: reuse flag for the parent and the ``"model"`` variable scope
    :param layers: (deprecated) layer sizes used for the pi, vf and vcf branches when ``net_arch`` is None
    :param net_arch: network specification passed to ``mlp_extractor_safe``; overrides ``layers``
    :param act_fun: activation function for the MLP extractor
    :param cnn_extractor: callable building the shared CNN features
    :param feature_extraction: ``"cnn"`` selects ``cnn_extractor``, anything else the MLP extractor
    :param kwargs: extra keyword arguments forwarded to ``cnn_extractor``
    """
    use_cnn = feature_extraction == "cnn"
    super(FeedForwardWithSafeValue, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                   reuse=reuse, scale=use_cnn)
    self._kwargs_check(feature_extraction, kwargs)

    if layers is not None:
        warnings.warn(
            "Usage of the `layers` parameter is deprecated! Use net_arch instead "
            "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn(
                "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                DeprecationWarning)

    if net_arch is None:
        # Fall back to two hidden layers of 256 units for every branch.
        hidden_sizes = [256, 256] if layers is None else layers
        net_arch = [dict(vf=hidden_sizes, pi=hidden_sizes, vcf=hidden_sizes)]

    with tf.variable_scope("model", reuse=reuse):
        if use_cnn:
            # All three heads share the same CNN features.
            shared_features = cnn_extractor(self.processed_obs, **kwargs)
            pi_latent = vf_latent = vcf_latent = shared_features
        else:
            flat_obs = tf.layers.flatten(self.processed_obs)
            pi_latent, vf_latent, vcf_latent = mlp_extractor_safe(flat_obs, net_arch, act_fun)

        self._value_fn = linear(vf_latent, 'vf', 1)
        self._vcf = linear(vcf_latent, 'vcf', 1)

        distribution_parts = self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)
        self._proba_distribution, self._policy, self.q_value = distribution_parts

    self._setup_init()
    self._vcf_flat = self.vcf[:, 0]
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build the mean/log-std action distribution with a clipped log-std, plus the Q-value head.

    :param pi_latent_vector: (tf.Tensor) Latent input of the policy head
    :param vf_latent_vector: (tf.Tensor) Latent input of the Q-value head
    :param init_scale: (float) Init scale for both ``linear`` layers
    :param init_bias: (float) Initial bias for both ``linear`` layers
    :return: (distribution, tf.Tensor, tf.Tensor) distribution, mean and Q-values
    """
    head_kwargs = dict(init_scale=init_scale, init_bias=init_bias)

    mean = linear(pi_latent_vector, 'pi', self.size, **head_kwargs)
    if cfg.is_mod(cfg.MOD_BOUND_MEAN):
        with tf.variable_scope('pi'):
            # only the mean is squashed
            mean = tf.tanh(mean)

    raw_logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
    # clipping inspired by SAC
    logstd = tf.clip_by_value(raw_logstd, LOG_STD_MIN, LOG_STD_MAX)

    # mean * 0.0 + logstd expands logstd to the shape of mean
    tiled_logstd = mean * 0.0 + logstd
    pdparam = tf.concat([mean, tiled_logstd], axis=1)

    q_values = linear(vf_latent_vector, 'q', self.size, **head_kwargs)
    return self.proba_distribution_from_flat(pdparam), mean, q_values
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build the action distribution, passing the mean through ``self.act_norm.clip_normalize`` if set.

    The returned mean is the raw output of the ``pi`` layer; only the distribution
    parameters use the normalised mean.

    :param pi_latent_vector: (tf.Tensor) Latent input of the policy head
    :param vf_latent_vector: (tf.Tensor) Latent input of the Q-value head
    :param init_scale: (float) Init scale for both ``linear`` layers
    :param init_bias: (float) Initial bias for both ``linear`` layers
    :return: (distribution, tf.Tensor, tf.Tensor) distribution, raw mean and Q-values
    """
    head_kwargs = dict(init_scale=init_scale, init_bias=init_bias)
    mean = linear(pi_latent_vector, 'pi', self.size, **head_kwargs)

    normaliser = self.act_norm
    norm_mean = mean if normaliser is None else normaliser.clip_normalize(mean)

    logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
    # mean * 0.0 + logstd expands logstd to the shape of mean
    tiled_logstd = mean * 0.0 + logstd
    pdparam = tf.concat([norm_mean, tiled_logstd], axis=1)

    q_values = linear(vf_latent_vector, 'q', self.size, **head_kwargs)
    return self.proba_distribution_from_flat(pdparam), mean, q_values
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, args,
             reuse=tf.compat.v1.AUTO_REUSE, **kwargs):
    """
    Policy whose latent features come from ``graph_extractor`` applied to a ``Graph``.

    The policy latent is used directly as the flat distribution parameters; the
    Q-values are a linear layer with ``args.n_action_slots`` outputs.

    :param sess: TensorFlow session, forwarded to the parent constructor
    :param ob_space: observation space, forwarded to the parent constructor
    :param ac_space: action space, forwarded to the parent constructor
    :param n_env: number of environments, forwarded to the parent constructor
    :param n_steps: number of steps, forwarded to the parent constructor
    :param n_batch: batch size, forwarded to the parent constructor
    :param args: configuration object providing the ``graph_*`` settings and ``n_action_slots``
    :param reuse: forwarded to the parent constructor (the ``"model"`` scope always uses AUTO_REUSE)
    :param kwargs: accepted for interface compatibility; not used here
    """
    super(EmbeddingPolicy, self).__init__(
        sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=False)
    self.args = args
    self.step_counter = 0

    graph_settings = {
        'BATCH_SIZE': 1,
        'NODE_COUNT': args.graph_node_count,
        'UPDATE_ITERATION': args.graph_update_iteration,
        'EMBEDDING_SIZE': args.graph_embedding_size,
        'HIDDEN_LAYERS': args.graph_hidden_layers,
        'HIDDEN_ACTIVATION': tf.nn.relu,
        'END_ACTIVATION': tf.nn.relu,
        'RESIDUAL': True,
        'ROOT_EMBEDDING': False,
    }
    graph_args = AttrDict(graph_settings)

    with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
        self.graph = Graph(graph_args)
        pi_latent, vf_latent = graph_extractor(self.processed_obs, self.graph, args)

        self._value_fn = linear(vf_latent, 'vf', 1)
        # The extractor output is used as-is for the distribution parameters.
        self._policy = pi_latent
        self._proba_distribution = self.pdtype.proba_distribution_from_flat(pi_latent)
        self.q_value = linear(vf_latent, 'q', self.args.n_action_slots, init_scale=0.01)

    self._setup_init()
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build distribution parameters ``[alpha, beta, mu, var]`` from predicted ``mu`` and ``var``.

    ``mu`` is a sigmoid output rescaled to ``0.117 + 0.770 * sigmoid`` and ``var`` is a
    sigmoid output divided by 100. ``alpha`` and ``beta`` are then derived from them.
    The ``mu``/``var`` layers use fixed initial biases (0.3 and -0.3); ``init_bias`` only
    applies to the Q-value layer.

    :param pi_latent_vector: (tf.Tensor) Latent input of the policy head
    :param vf_latent_vector: (tf.Tensor) Latent input of the Q-value head
    :param init_scale: (float) Init scale for all ``linear`` layers
    :param init_bias: (float) Initial bias of the Q-value layer
    :return: (distribution, tf.Tensor, tf.Tensor) distribution, flat parameters and Q-values
    """
    mu_logits = linear(pi_latent_vector, 'pi/dense', self.size, init_scale=init_scale, init_bias=0.3)
    mu = tf.math.sigmoid(mu_logits) * 0.770 + 0.117

    var_logits = linear(pi_latent_vector, 'pi/dense_1', self.size, init_scale=init_scale, init_bias=-0.3)
    var = tf.math.sigmoid(var_logits) / 100

    # Shared factor (var + mu^2 - mu) / var; divide_no_nan yields 0 where var == 0.
    ratio = tf.math.divide_no_nan((var + mu ** 2 - mu), var)
    alpha = -mu * ratio
    beta = (mu - 1) * ratio

    pdparam = tf.concat([alpha, beta, mu, var], axis=1)
    q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
    return self.proba_distribution_from_flat(pdparam), pdparam, q_values
def setup_model(self):
    """
    Create the graph, session, target/predictor networks, loss and training op.

    Both networks read ``self.processed_obs``. The target output is wrapped in
    ``tf.stop_gradient`` in the loss, so only the predictor receives gradients from
    ``self.aux_loss``.

    :raises ValueError: if ``self.network_type`` is neither ``'cnn'`` nor ``'mlp'``
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)
        self.observation_ph, self.processed_obs = observation_input(
            self.venv.observation_space, scale=(self.network_type == "cnn"))

        with tf.variable_scope("target_model"):
            if self.network_type == 'cnn':
                self.target_network = small_convnet(
                    self.processed_obs, tf.nn.leaky_relu)
            elif self.network_type == 'mlp':
                self.target_network = tf_layers.mlp(
                    self.processed_obs, [1024, 512])
                self.target_network = tf_layers.linear(
                    self.target_network, "out", 512)
            else:
                raise ValueError("Unknown network type {}!".format(
                    self.network_type))

        with tf.variable_scope("predictor_model"):
            if self.network_type == 'cnn':
                self.predictor_network = tf.nn.relu(
                    small_convnet(self.processed_obs, tf.nn.leaky_relu))
            elif self.network_type == 'mlp':
                self.predictor_network = tf_layers.mlp(
                    self.processed_obs, [1024, 512])

            # NOTE(review): the two head layers below are assumed to apply to both
            # network types; the flattened source does not show whether they belong
            # to the 'mlp' branch only - confirm.
            self.predictor_network = tf.nn.relu(
                tf_layers.linear(self.predictor_network, "pred_fc1", 512))
            self.predictor_network = tf_layers.linear(
                self.predictor_network, "out", 512)

        with tf.name_scope("loss"):
            # Per-sample mean squared prediction error (reduced over axis 1).
            self.int_reward = tf.reduce_mean(tf.square(
                tf.stop_gradient(self.target_network) - self.predictor_network), axis=1)
            # Scalar mean squared prediction error used as training loss.
            self.aux_loss = tf.reduce_mean(
                tf.square(
                    tf.stop_gradient(self.target_network) - self.predictor_network))

        with tf.name_scope("train"):
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.training_op = self.optimizer.minimize(self.aux_loss)

        self.params = tf.trainable_variables()
        tf.global_variables_initializer().run(session=self.sess)
def build_actor_critic_network_actionsadded(x, layers, action_indices, state_indices, reuse):
    """
    Build an actor-critic network whose value branch sees the sum of all action rows.

    The rows of ``x`` selected by ``action_indices`` are summed along axis 1 (keeping the
    axis), concatenated with the rows selected by ``state_indices``, flattened and fed
    through the value MLP. The policy branch is delegated to ``build_policy``.

    :param x: (tf.Tensor) Input tensor indexed along axis 1 by the action and state indices
    :param layers: ([int]) Hidden layer sizes shared by the value and policy MLPs
    :param action_indices: ([int]) Axis-1 indices of the action rows
    :param state_indices: ([int]) Axis-1 indices of the state rows
    :param reuse: unused; the variable scope always uses ``AUTO_REUSE``
    :return: (tf.Tensor, tf.Tensor) pi_latent, vf_latent
    """
    activ = tf.nn.relu
    # tf.compat.AUTO_REUSE does not exist; the constant lives under tf.compat.v1,
    # which is what every other builder in this module uses.
    with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
        actions = tf.gather(x, action_indices, axis=1)
        actions = tf.reduce_sum(actions, axis=1, keepdims=True)
        state = tf.gather(x, state_indices, axis=1)
        vf_h = tf.layers.flatten(tf.concat([actions, state], axis=1))
        for j, layer_size in enumerate(layers):
            vf_h = activ(linear(vf_h, 'vf_fc' + str(j), n_hidden=layer_size, init_scale=np.sqrt(2)))
        # One output per action slot.
        vf_latent = activ(linear(vf_h, 'vf_head', len(action_indices)))
    pi_latent = build_policy(x, layers, action_indices, state_indices, activ)
    return pi_latent, vf_latent
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0,
                                   stv_from_obs=False):
    """
    Build the mean/log-std distribution with either a state-dependent or a free log-std.

    With ``stv_from_obs`` the std is ``EPS + sigmoid(linear(pi_latent))`` and the log-std its
    logarithm; otherwise the log-std is a zero-initialised variable shared by all samples.

    :param pi_latent_vector: (tf.Tensor) Latent input of the policy head
    :param vf_latent_vector: (tf.Tensor) Latent input of the Q-value head
    :param init_scale: (float) Init scale for the ``linear`` layers
    :param init_bias: (float) Initial bias for the ``linear`` layers (the std layer uses ``init_bias - 0.5``)
    :param stv_from_obs: (bool) Whether the std is predicted from the policy latent
    :return: (distribution, tf.Tensor, tf.Tensor, tf.Tensor) distribution, mean, Q-values and log-std
    """
    mean = linear(pi_latent_vector, 'pi/dense', self.size, init_scale=init_scale, init_bias=init_bias)
    q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)

    if not stv_from_obs:
        print("STD from FIXED VALUE")
        logstd = tf.get_variable(name='pi/dense_1', shape=[1, self.size], initializer=tf.zeros_initializer())
        # mean * 0.0 + logstd expands logstd to the shape of mean
        logstd_part = mean * 0.0 + logstd
    else:
        print("STD from OBSERVATION")
        std_logits = linear(pi_latent_vector, 'pi/dense_1', self.size,
                            init_scale=init_scale, init_bias=init_bias-0.5)
        std = EPS + tf.nn.sigmoid(std_logits)
        logstd = tf.log(std)
        logstd_part = logstd

    pdparam = tf.concat([mean, logstd_part], axis=1)
    return self.proba_distribution_from_flat(pdparam), mean, q_values, logstd
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, act_norm_init=None,
             obs_norm_init=None, net_arch=None, reuse=False, act_fun=tf.tanh):
    """
    MLP policy with optional observation and action normalisers.

    :param sess: TensorFlow session, forwarded to the parent constructor and the normalisers
    :param ob_space: observation space; ``ob_space.shape[0]`` sizes the observation normaliser
    :param ac_space: action space; ``ac_space.shape[0]`` sizes the action normaliser and distribution
    :param n_env: number of environments, forwarded to the parent constructor
    :param n_steps: number of steps, forwarded to the parent constructor
    :param n_batch: batch size, forwarded to the parent constructor
    :param act_norm_init: (dict or None) keyword arguments for the action ``TFNormalizer``; None disables it
    :param obs_norm_init: (dict or None) keyword arguments for the observation ``TFNormalizer``; None disables it
    :param net_arch: network specification for ``mlp_extractor``; defaults to two 64-unit layers per branch
    :param reuse: reuse flag for the parent, the normalisers and the ``"model"`` scope
    :param act_fun: activation function for the MLP extractor
    """
    super(NormalMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse)

    self.obs_norm = None
    if obs_norm_init is not None:
        self.obs_norm = TFNormalizer(sess, 'obs_norm', ob_space.shape[0], reuse=reuse, **obs_norm_init)

    self.act_norm = None
    if act_norm_init is not None:
        self.act_norm = TFNormalizer(sess, 'act_norm', ac_space.shape[0], reuse=reuse, **act_norm_init)

    # Swap the parent's distribution type for one aware of the action normaliser.
    del self._pdtype
    self._pdtype = ActNormGaussProbDistType(ac_space.shape[0], self.act_norm)

    if net_arch is None:
        net_arch = [dict(vf=[64, 64], pi=[64, 64])]

    with tf.variable_scope("model", reuse=reuse):
        extractor_in = tf.layers.flatten(self.processed_obs)
        if self.obs_norm is not None:
            # normalization and clipping of the flattened observations
            extractor_in = self.obs_norm.clip_normalize(extractor_in)

        pi_latent, vf_latent = mlp_extractor(extractor_in, net_arch, act_fun)
        self._value_fn = linear(vf_latent, 'vf', 1)

        distribution_parts = self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)
        self._proba_distribution, self._policy, self.q_value = distribution_parts

    self._setup_init()
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build the distribution from a single linear layer whose output is the flat parameter vector.

    :param pi_latent_vector: (tf.Tensor) Latent input of the policy head
    :param vf_latent_vector: (tf.Tensor) Latent input of the Q-value head
    :param init_scale: (float) Init scale for both ``linear`` layers
    :param init_bias: (float) Initial bias for both ``linear`` layers
    :return: (distribution, tf.Tensor, tf.Tensor) distribution, flat parameters and Q-values
    """
    head_kwargs = dict(init_scale=init_scale, init_bias=init_bias)
    flat_params = linear(pi_latent_vector, 'pi', self.size, **head_kwargs)
    q_values = linear(vf_latent_vector, 'q', self.size, **head_kwargs)
    distribution = self.proba_distribution_from_flat(flat_params)
    return distribution, flat_params, q_values
def __init__(self, tf_session, ob_space, ac_space, num_env, num_steps, num_batch,
             activation_func=tf.nn.tanh, reuse=False, **kwargs):
    """
    MLP actor-critic policy with three 256-unit hidden layers per branch.

    :param tf_session: TensorFlow session, forwarded to the parent constructor
    :param ob_space: observation space, forwarded to the parent constructor
    :param ac_space: action space, forwarded to the parent constructor
    :param num_env: number of environments, forwarded to the parent constructor
    :param num_steps: number of steps, forwarded to the parent constructor
    :param num_batch: batch size, forwarded to the parent constructor
    :param activation_func: activation function for the MLP extractor
    :param reuse: reuse flag for the parent and the ``"model"`` variable scope
    :param kwargs: accepted for interface compatibility; not used here
    """
    super(SafePolicy, self).__init__(
        tf_session, ob_space, ac_space, num_env, num_steps, num_batch, reuse=reuse)

    hidden_sizes = [256, 256, 256]
    net_arch = [dict(vf=hidden_sizes, pi=hidden_sizes)]

    with tf.variable_scope("model", reuse=reuse):
        flat_obs = tf.layers.flatten(self.processed_obs)
        pi_latent, vf_latent = mlp_extractor(flat_obs, net_arch, activation_func)

        self._value_fn = linear(vf_latent, 'vf', 1)

        distribution_parts = self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)
        self._proba_distribution, self._policy, self.q_value = distribution_parts

    self._setup_init()
def Cnn1(image, **kwargs):
    """
    Three 3x3 stride-1 convolutions (32, 64, 64 filters) followed by a 512-unit dense layer.

    :param image: (TensorFlow Tensor) Image input
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers
    :return: (TensorFlow Tensor) Output of the dense layer (ReLU)
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    hidden = image
    for scope, filters in (('c1', 32), ('c2', 64), ('c3', 64)):
        hidden = relu(conv(hidden, scope, n_filters=filters, filter_size=3, stride=1, init_scale=gain, **kwargs))

    flat = conv_to_fc(hidden)
    return relu(linear(flat, 'fc1', n_hidden=512, init_scale=gain))
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, feature_extraction="cnn",
             net_arch=None, act_fun=tf.tanh, **kwargs):
    """
    Actor-critic policy using ``customizedCNN`` for images or ``mlp_extractor`` otherwise.

    :param sess: TensorFlow session, forwarded to the parent constructor
    :param ob_space: observation space, forwarded to the parent constructor
    :param ac_space: action space, forwarded to the parent constructor
    :param n_env: number of environments, forwarded to the parent constructor
    :param n_steps: number of steps, forwarded to the parent constructor
    :param n_batch: batch size, forwarded to the parent constructor
    :param reuse: reuse flag for the parent and the ``"model"`` variable scope
    :param feature_extraction: ``"cnn"`` selects ``customizedCNN``, anything else the MLP extractor
    :param net_arch: network specification for ``mlp_extractor``; defaults to two 64-unit
        layers per branch. Only used on the MLP path.
    :param act_fun: activation function for ``mlp_extractor``. Only used on the MLP path.
    :param kwargs: extra keyword arguments forwarded to ``customizedCNN``
    """
    super(PPOPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                    scale=(feature_extraction == "cnn"))
    self._kwargs_check(feature_extraction, kwargs)

    if net_arch is None:
        net_arch = [dict(vf=[64, 64], pi=[64, 64])]

    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            pi_latent, vf_latent = customizedCNN(self.processed_obs, **kwargs)
        else:
            # mlp_extractor requires the architecture and activation function;
            # calling it with the observations alone raises a TypeError.
            pi_latent, vf_latent = mlp_extractor(
                tf.layers.flatten(self.processed_obs), net_arch, act_fun)

        self._value_fn = linear(vf_latent, 'vf', 1)

        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

    self._setup_init()
def small_convnet(x, activ=tf.nn.relu, **kwargs):
    """
    Three-convolution network followed by a 512-unit linear layer with no output activation.

    :param x: (TensorFlow Tensor) Image input
    :param activ: (tf function) Activation applied after each convolution
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers
    :return: (TensorFlow Tensor) Output of the final linear layer
    """
    gain = np.sqrt(2)
    # (scope, n_filters, filter_size, stride) of each convolution
    conv_specs = (('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1))

    hidden = x
    for scope, filters, size, step in conv_specs:
        hidden = activ(tf_layers.conv(hidden, scope, n_filters=filters, filter_size=size, stride=step,
                                      init_scale=gain, **kwargs))

    flat = tf_layers.conv_to_fc(hidden)
    return tf_layers.linear(flat, 'fc1', n_hidden=512, init_scale=gain)
def custom_extractor(scaled_images, **kwargs):
    """
    Three convolutions (8x8/4, 4x4/2, 3x3/1) followed by a 512-unit ReLU dense layer.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    # (scope, n_filters, filter_size, stride) of each convolution
    conv_specs = (('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1))

    hidden = scaled_images
    for scope, filters, size, step in conv_specs:
        hidden = relu(conv(hidden, scope, n_filters=filters, filter_size=size, stride=step,
                           init_scale=gain, **kwargs))

    flat = conv_to_fc(hidden)
    return relu(linear(flat, 'fc1', n_hidden=512, init_scale=gain))
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, args,
             reuse=tf.compat.v1.AUTO_REUSE, **kwargs):
    """
    Policy that splits the observation into MCTS probabilities and network input.

    The last slice of the final observation axis, restricted to the first
    ``args.n_action_slots`` rows, is read as probabilities and converted to logits for
    ``self.mcts_distribution``. The remaining slices feed one of three network builders
    selected by ``args.value_gets_actions`` and ``args.actions_added``.

    :param sess: TensorFlow session, forwarded to the parent constructor
    :param ob_space: observation space, forwarded to the parent constructor
    :param ac_space: action space, forwarded to the parent constructor
    :param n_env: number of environments, forwarded to the parent constructor
    :param n_steps: number of steps, forwarded to the parent constructor
    :param n_batch: batch size, forwarded to the parent constructor
    :param args: configuration object (``n_action_slots``, ``state_dim``, ``network_layers``, ...)
    :param reuse: forwarded to the parent constructor and the network builders
        (the ``"model"`` scope always uses AUTO_REUSE)
    :param kwargs: accepted for interface compatibility; not used here
    """
    super(EnigmaPolicy, self).__init__(
        sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=False)
    self.args = args
    self.step_counter = 0

    # Logit transform log(p) - log(1 - p), with 1e-5 added inside each log.
    mcts_probs = self.processed_obs[:,:args.n_action_slots,-1]
    mcts_logits = tf.log(mcts_probs + 1e-5) - tf.log(1-mcts_probs+1e-5)
    self.mcts_distribution = self.pdtype.proba_distribution_from_flat(mcts_logits)

    obs = self.processed_obs[:,:,:-1]

    # Action rows come first, the state rows follow directly after them.
    slot_count = self.args.n_action_slots
    action_indices = list(range(slot_count))
    state_indices = [offset + slot_count for offset in range(self.args.state_dim)]

    with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
        if not self.args.value_gets_actions:
            pi_latent, vf_latent = build_actor_critic_network_tri_separate_vf(
                obs, args.network_layers, action_indices, state_indices, args.latent_dim)
        elif self.args.actions_added:
            pi_latent, vf_latent = build_actor_critic_network_actionsadded(
                obs, args.network_layers, action_indices, state_indices, reuse)
        else:
            pi_latent, vf_latent = build_actor_critic_network_tri(
                obs, args.network_layers, action_indices, state_indices, reuse)

        self._value_fn = linear(vf_latent, 'vf', 1)
        # The builder outputs are used directly as policy parameters and Q-values.
        self._policy = pi_latent
        self._proba_distribution = self.pdtype.proba_distribution_from_flat(pi_latent)
        self.q_value = vf_latent

    self._setup_init()
def customizedCNN(scaled_images, **kwargs):
    """
    Small CNN trunk with separate 64-unit tanh heads for the policy and the value function.

    Trunk: three 8-filter convolutions, then dense layers of 256 and 128 units (ReLU).

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor, TensorFlow Tensor) policy latent and value latent
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    # (scope, filter_size, stride) of each 8-filter convolution
    conv_specs = (('c1', 6, 3), ('c2', 3, 2), ('c3', 3, 1))

    trunk = scaled_images
    for scope, size, step in conv_specs:
        trunk = relu(conv(trunk, scope, n_filters=8, filter_size=size, stride=step, init_scale=gain, **kwargs))

    trunk = conv_to_fc(trunk)
    trunk = relu(linear(trunk, 'fc1', n_hidden=256, init_scale=gain))
    trunk = relu(linear(trunk, 'fc2', n_hidden=128, init_scale=gain))

    head_activ = tf.tanh
    pi = head_activ(linear(trunk, "pi_fc{}".format(1), 64, init_scale=gain))
    vf = head_activ(linear(trunk, "vf_fc{}".format(1), 64, init_scale=gain))
    return pi, vf
def modified_cnn(unscaled_images, **kwargs):
    """
    Scale raw images by 1/255 and apply two convolutions plus a 512-unit ReLU dense layer.

    :param unscaled_images: (TensorFlow Tensor) Image input; cast to float32 and divided by 255
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers
    :return: (TensorFlow Tensor) The CNN output layer
    """
    import tensorflow as tf

    relu = tf.nn.relu
    gain = np.sqrt(2)

    as_float = tf.cast(unscaled_images, tf.float32)
    scaled_images = as_float / 255.

    conv_1 = relu(conv(scaled_images, 'c1', n_filters=32, filter_size=1, stride=1, init_scale=gain, **kwargs))
    conv_2 = relu(conv(conv_1, 'c2', n_filters=32, filter_size=2, stride=2, init_scale=gain, **kwargs))
    flat = conv_to_fc(conv_2)
    return relu(linear(flat, 'fc1', n_hidden=512, init_scale=gain))
def build_actor_critic_network_peasant_method(x, layers, action_indices, state_indices, reuse):
    """
    Run every axis-1 row of ``x`` through shared policy/value MLPs, then apply linear heads.

    The per-row MLPs live in the ``"actor_critic"`` scope with ``AUTO_REUSE``, so all rows
    use the same weights. Row outputs are concatenated and mapped to
    ``len(action_indices)`` outputs per head.

    :param x: (tf.Tensor) Input tensor indexed as ``x[:, row, :]``
    :param layers: ([int]) Hidden layer sizes of the per-row MLPs
    :param action_indices: ([int]) Action row indices; only their count is used
    :param state_indices: ([int]) State row indices; only their count is used
    :param reuse: unused; the variable scope always uses ``AUTO_REUSE``
    :return: (tf.Tensor, tf.Tensor) pi_latent, vf_latent
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    head_size = len(action_indices)
    row_count = head_size + len(state_indices)

    policy_rows = []
    value_rows = []
    for row in range(row_count):
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            row_input = x[:, row, :]
            pi_h = row_input
            vf_h = row_input
            for depth, width in enumerate(layers):
                pi_h = relu(linear(pi_h, 'pi_fc' + str(depth), n_hidden=width, init_scale=gain))
                vf_h = relu(linear(vf_h, 'vf_fc' + str(depth), n_hidden=width, init_scale=gain))
            policy_rows.append(pi_h)
            value_rows.append(vf_h)

    pi_h = tf.layers.flatten(tf.concat(policy_rows, axis=1))
    vf_h = tf.layers.flatten(tf.concat(value_rows, axis=1))

    pi_latent = linear(pi_h, 'pi_head', head_size)
    vf_latent = linear(vf_h, 'vf_head', head_size)
    return pi_latent, vf_latent
def build_actor_critic_network(x, layers, num_actions, num_state, reuse):
    """
    Run every axis-1 row of ``x`` through shared policy/value MLPs and concatenate the results.

    The per-row MLPs live in the ``"actor_critic"`` scope with ``AUTO_REUSE``, so all rows
    use the same weights.

    :param x: (tf.Tensor) Input tensor indexed as ``x[:, row, :]``; it must keep its row axis
    :param layers: ([int]) Hidden layer sizes of the per-row MLPs
    :param num_actions: (int) Number of action rows
    :param num_state: (int) Number of state rows
    :param reuse: unused; the variable scope always uses ``AUTO_REUSE``
    :return: (tf.Tensor, tf.Tensor) pi_latent, vf_latent (flattened concatenation of the row outputs)
    """
    activ = tf.nn.relu
    pis = []
    vfs = []
    # x is deliberately NOT flattened here: flattening collapses it to rank 2, which
    # makes the three-axis slice x[:, row, :] below fail at graph construction.
    for row in range(num_actions + num_state):
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            x_prime = x[:, row, :]
            pi_h = x_prime
            vf_h = x_prime
            # Separate index name so the layer counter does not shadow the row index.
            for j, layer_size in enumerate(layers):
                pi_h = activ(linear(pi_h, 'pi_fc' + str(j), n_hidden=layer_size, init_scale=np.sqrt(2)))
                vf_h = activ(linear(vf_h, 'vf_fc' + str(j), n_hidden=layer_size, init_scale=np.sqrt(2)))
            pis.append(pi_h)
            vfs.append(vf_h)
    pi_latent = tf.layers.flatten(tf.concat(pis, axis=1))
    vf_latent = tf.layers.flatten(tf.concat(vfs, axis=1))
    return pi_latent, vf_latent
def build_policy(x, layers, action_indices, state_indices, activ):
    """
    Score every action row together with the state rows using one shared MLP.

    For each action index the action row and all state rows are gathered, flattened and
    passed through the MLP (weights shared via ``AUTO_REUSE``) ending in a single output.
    Where the sum over ``x[:, i]`` along axis 1 is not positive, the score is replaced
    by -1e7.

    :param x: (tf.Tensor) Input tensor indexed along axis 1 by the action and state indices
    :param layers: ([int]) Hidden layer sizes of the policy MLP
    :param action_indices: ([int]) Axis-1 indices of the action rows
    :param state_indices: ([int]) Axis-1 indices of the state rows
    :param activ: (tf function) Activation applied after each hidden layer
    :return: (tf.Tensor) Flattened concatenation of the per-action scores
    """
    gain = np.sqrt(2)
    scores = []
    for action_idx in action_indices:
        # Row selection: this action followed by every state row.
        selected = np.array([action_idx, *state_indices], dtype=np.int32)
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            hidden = tf.layers.flatten(tf.gather(x, selected, axis=1))
            for depth, width in enumerate(layers):
                hidden = activ(linear(hidden, 'pi_fc' + str(depth), n_hidden=width, init_scale=gain))
            score = linear(hidden, 'pi_fc_last', n_hidden=1, init_scale=gain)

            # Rows whose entries sum to <= 0 get a very large negative score.
            flag = tf.reduce_sum(x[:,action_idx], axis=1)
            score = tf.where(flag > 0, score, score * 0 - 1e7)
        scores.append(score)

    return tf.layers.flatten(tf.concat(scores, axis=1))
def build_actor_critic_network_tri(x, layers, action_indices, state_indices, reuse):
    """
    Build a value network over (action row + state rows) groups and the matching policy.

    For each action index the action row and all state rows are gathered, flattened and
    passed through a shared value MLP ending in a 10-unit layer. The per-action outputs
    are concatenated and mapped to ``len(action_indices)`` values. The policy comes from
    ``build_policy``.

    :param x: (tf.Tensor) Input tensor indexed along axis 1 by the action and state indices
    :param layers: ([int]) Hidden layer sizes of the value and policy MLPs
    :param action_indices: ([int]) Axis-1 indices of the action rows
    :param state_indices: ([int]) Axis-1 indices of the state rows
    :param reuse: unused; the variable scope always uses ``AUTO_REUSE``
    :return: (tf.Tensor, tf.Tensor) pi_latent, vf_latent
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    value_parts = []
    for action_idx in action_indices:
        # Row selection: this action followed by every state row.
        selected = np.array([action_idx, *state_indices], dtype=np.int32)
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            hidden = tf.layers.flatten(tf.gather(x, selected, axis=1))
            for depth, width in enumerate(layers):
                hidden = relu(linear(hidden, 'vf_fc' + str(depth), n_hidden=width, init_scale=gain))
            hidden = relu(linear(hidden, 'vf_fc_last', n_hidden=10, init_scale=gain))
        value_parts.append(hidden)

    joined = tf.layers.flatten(tf.concat(value_parts, axis=1))
    vf_latent = relu(linear(joined, 'vf_head', len(action_indices)))

    pi_latent = build_policy(x, layers, action_indices, state_indices, relu)
    return pi_latent, vf_latent
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
    """
    Build the mean/log-std action distribution with a zero-initialised log-std variable.

    :param pi_latent_vector: (tf.Tensor) Latent input of the policy head
    :param vf_latent_vector: (tf.Tensor) Latent input of the Q-value head
    :param init_scale: (float) Init scale for both ``linear`` layers
    :param init_bias: (float) Initial bias for both ``linear`` layers
    :return: (distribution, tf.Tensor, tf.Tensor) distribution, mean and Q-values
    """
    head_kwargs = dict(init_scale=init_scale, init_bias=init_bias)
    mean = linear(pi_latent_vector, 'pi', self.size, **head_kwargs)

    logstd = tf.compat.v1.get_variable(
        name='pi/logstd',
        shape=[1, self.size],
        initializer=tf.compat.v1.zeros_initializer(),
    )
    # mean * 0.0 + logstd expands logstd to the shape of mean
    tiled_logstd = mean * 0.0 + logstd
    pdparam = tf.concat([mean, tiled_logstd], axis=1)

    q_values = linear(vf_latent_vector, 'q', self.size, **head_kwargs)
    return self.proba_distribution_from_flat(pdparam), mean, q_values
def cnn(input_tensor, **kwargs):
    """
    Process the first 49 features as a 7x7 image and append the next 50 features unchanged.

    :param input_tensor: (TensorFlow Tensor) 2-D input; columns 0-48 are reshaped to
        ``(-1, 7, 7, 1)``, columns 49-98 are passed through
    :param kwargs: (dict) Extra keywords parameters for the convolutional layer
    :return: (TensorFlow Tensor) Concatenation of the 49-unit visual features and the passthrough columns
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    image_columns = tf.slice(input_tensor, [0, 0], [-1, 49], name='input_img')
    passthrough = tf.slice(input_tensor, [0, 49], [-1, 50], 'prev_outputs')

    image = tf.reshape(image_columns, (-1, 7, 7, 1))
    conv_out = relu(conv(image, 'c1', n_filters=16, filter_size=3, stride=1, init_scale=gain, **kwargs))
    flat = conv_to_fc(conv_out)
    visual_features = relu(linear(flat, 'fc1', n_hidden=49, init_scale=gain))

    return tf.concat([visual_features, passthrough], 1)
def nature_cnn(scaled_images, **kwargs):
    """
    CNN from Nature paper, with an optional alternating pixel mask on the input.

    If ``view`` is passed in ``kwargs``, the image is multiplied by a 0/1 mask built from
    the row-major pixel index ``i``: ``'even'`` uses ``i % 2`` and ``'odd'`` uses
    ``1 - i % 2``. The ``view`` entry is removed before the remaining kwargs reach the
    convolutions.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN,
        plus the optional ``view`` entry (``'even'`` or ``'odd'``)
    :return: (TensorFlow Tensor) The CNN output layer
    :raises NotImplementedError: if ``view`` is given with any other value
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    if 'view' in kwargs:
        view_type = kwargs.pop('view')
        _, h, w, _ = scaled_images.shape
        if view_type == 'even':
            bits = [idx % 2 for idx in range(h * w)]
        elif view_type == 'odd':
            bits = [1 - idx % 2 for idx in range(h * w)]
        else:
            raise NotImplementedError
        mask = np.array(bits).reshape((1, h, w, 1))
        scaled_images = scaled_images * tf.constant(mask, dtype=tf.float32)

    # (scope, n_filters, filter_size, stride) of each convolution
    conv_specs = (('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1))
    hidden = scaled_images
    for scope, filters, size, step in conv_specs:
        hidden = relu(conv(hidden, scope, n_filters=filters, filter_size=size, stride=step,
                           init_scale=gain, **kwargs))

    flat = conv_to_fc(hidden)
    return relu(linear(flat, 'fc1', n_hidden=512, init_scale=gain))
def augmented_nature_cnn(scaled_images, **kwargs):
    """
    Nature CNN head where the last channel of the image carries direct features.

    The last channel is flattened and its first ``num_direct_features`` entries (a name
    taken from the enclosing scope) are kept. The remaining channels go through the
    Nature CNN, and the direct features are appended to the 512-unit output.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) CNN features concatenated with the direct features
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    # Split off the last channel; entries past num_direct_features are padding.
    direct_channel = tf.contrib.slim.flatten(scaled_images[..., -1])
    other_features = direct_channel[:, :num_direct_features]
    image_channels = scaled_images[..., :-1]

    # (scope, n_filters, filter_size, stride) of each convolution
    conv_specs = (('cnn1', 32, 8, 4), ('cnn2', 64, 4, 2), ('cnn3', 64, 3, 1))
    hidden = image_channels
    for scope, filters, size, step in conv_specs:
        hidden = relu(conv(hidden, scope, n_filters=filters, filter_size=size, stride=step,
                           init_scale=gain, **kwargs))

    flat = conv_to_fc(hidden)
    img_output = relu(linear(flat, 'cnn_fc1', n_hidden=512, init_scale=gain))

    # Append direct features to the final output of the extractor.
    return tf.concat((img_output, other_features), axis=1)