Exemple #1
0
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
     """
     Build the Gaussian policy head and the Q-value head from the two latent vectors.

     Behaviour is switched by the ``cfg`` mods:

     * ``MOD_PRETRAIN_PI``: the mean layer is initialised from the last weight/bias
       pair returned by ``load_weights()`` instead of a fresh ``linear`` layer.
     * ``MOD_BOUND_MEAN``: the mean is squashed with ``tanh``.
     * ``MOD_CONST_EXPLORE``: ``cfg.init_logstd`` is used directly instead of a
       clipped ``pi/logstd`` variable.

     :param pi_latent_vector: latent tensor feeding the policy mean layer
     :param vf_latent_vector: latent tensor feeding the Q-value layer
     :param init_scale: init scale of the Q-value layer only (the fresh policy
         layer uses ``cfg.pi_out_init_scale``)
     :param init_bias: initial bias passed to the ``linear`` layers
     :return: (distribution from ``proba_distribution_from_flat``, mean, q_values)
     """
     if not cfg.is_mod(cfg.MOD_PRETRAIN_PI):
         mean = linear(pi_latent_vector, 'pi', self.size, init_scale=cfg.pi_out_init_scale, init_bias=init_bias)
     else:
         # only the last (output layer) weight/bias pair of the loaded lists is used
         pretrained_ws, pretrained_bs = load_weights()
         out_kernel = pretrained_ws[-1]
         out_offset = pretrained_bs[-1]
         # the pretrained output layer must fit between the latent and the action size
         assert out_kernel.shape[0] == pi_latent_vector.shape[1]
         assert out_kernel.shape[1] == self.size
         # linear output layer predicting the mean
         with tf.variable_scope('pi'):
             mean_weight = tf.get_variable("w_mean", initializer=out_kernel)
             mean_bias = tf.get_variable("b_mean", initializer=out_offset)
             mean = tf.matmul(pi_latent_vector, mean_weight) + mean_bias

     if cfg.is_mod(cfg.MOD_BOUND_MEAN):
         with tf.variable_scope('pi'):
             # only the mean is squashed, the log-std is left untouched
             mean = tf.tanh(mean)

     if cfg.is_mod(cfg.MOD_CONST_EXPLORE):
         logstd = cfg.init_logstd
     else:
         logstd = tf.get_variable(name='pi/logstd', shape=(self.size,),
                                  initializer=tf.constant_initializer(cfg.init_logstd))
         # clipping of the log-std, inspired by SAC
         logstd = tf.clip_by_value(logstd, LOG_STD_MIN, LOG_STD_MAX)

     # "mean * 0.0 + logstd" broadcasts the log-std to the shape of the mean
     broadcast_logstd = mean * 0.0 + logstd
     pdparam = tf.concat([mean, broadcast_logstd], axis=1)
     q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
     return self.proba_distribution_from_flat(pdparam), mean, q_values
Exemple #2
0
def cnn3D(input_space, **kwargs):
    """
    Custom 3D CNN feature extractor.

    Two ``conv3d`` layers (16 filters, filter_size 3, stride 1) are followed by a
    ``maxpool3d`` call, a flattening step and three fully connected layers of
    16, 16 and 32 units. Every conv / dense layer is followed by a ReLU.

    :param input_space: (TensorFlow Tensor) Input tensor handed to the first conv3d layer
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    hidden = input_space
    for scope in ('c1', 'c2'):
        hidden = relu(conv3d(hidden, scope, n_filters=16, filter_size=3, stride=1,
                             init_scale=gain, **kwargs))

    hidden = maxpool3d(hidden, 2, 1, 'VALID')
    hidden = conv_to_fc(hidden)

    for scope, width in (('fc1', 16), ('fc2', 16), ('fc3', 32)):
        hidden = relu(linear(hidden, scope, n_hidden=width, init_scale=gain))
    return hidden
Exemple #3
0
def modified_deep_nature_cnn(scaled_images, **kwargs):
    """
    Modified variant of the Nature-paper CNN.

    Two stride-1 conv layers (8 filters of size 6, then 16 filters of size 3)
    followed by two fully connected layers of 128 units, all with ReLU.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    features = relu(conv(scaled_images, 'c1', n_filters=8, filter_size=6, stride=1,
                         init_scale=gain, **kwargs))
    features = relu(conv(features, 'c2', n_filters=16, filter_size=3, stride=1,
                         init_scale=gain, **kwargs))

    flat = conv_to_fc(features)
    hidden = relu(linear(flat, 'fc1', n_hidden=128, init_scale=gain))
    return relu(linear(hidden, 'fc2', n_hidden=128, init_scale=gain))
Exemple #4
0
def mlp_extractor(flat_observations, net_arch, act_fun):
    """
    Build an MLP that maps observations to one latent vector for the policy and one for the value network.

    ``net_arch`` describes how many layers are shared and how the network splits afterwards:

    1. Any number (possibly zero) of leading integers; each one is the width of a shared layer.
    2. Optionally a dict ``dict(vf=[<value layer sizes>], pi=[<policy layer sizes>])`` describing the
       non-shared layers. A missing key means "no non-shared layers" for that branch. Entries of
       ``net_arch`` after the dict are ignored.

    Example: ``[55, dict(vf=[255, 255], pi=[128])]`` gives one shared layer of size 55, two value-only
    layers of size 255 and one policy-only layer of size 128. ``[128, 128]`` gives a fully shared
    network with two layers of size 128.

    :param flat_observations: (tf.Tensor) The observations to base policy and value function on.
    :param net_arch: ([int or dict]) The specification of the policy and value networks.
        See above for details on its formatting.
    :param act_fun: (tf function) The activation function to use for the networks.
    :return: (tf.Tensor, tf.Tensor) latent_policy, latent_value of the specified network.
        If all layers are shared, both are the same tensor.
    """
    shared_latent = flat_observations
    pi_sizes = []  # widths of the layers that belong to the policy branch only
    vf_sizes = []  # widths of the layers that belong to the value branch only

    # Shared trunk: consume ints until the first dict (or the end of the list)
    for position, spec in enumerate(net_arch):
        if not isinstance(spec, int):
            assert isinstance(spec, dict), "Error: the net_arch list can only contain ints and dicts"
            if 'pi' in spec:
                assert isinstance(spec['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                pi_sizes = spec['pi']

            if 'vf' in spec:
                assert isinstance(spec['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                vf_sizes = spec['vf']
            # The dict marks the point where the network splits; nothing after it is read
            break

        shared_latent = act_fun(
            linear(shared_latent, "shared_fc{}".format(position), spec, init_scale=np.sqrt(2)))

    # Non-shared heads: both branches start from the shared latent and are built in lockstep
    latent_policy = shared_latent
    latent_value = shared_latent
    for depth, (pi_width, vf_width) in enumerate(zip_longest(pi_sizes, vf_sizes)):
        # zip_longest pads the shorter branch with None
        if pi_width is not None:
            assert isinstance(pi_width, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = act_fun(
                linear(latent_policy, "pi/fc{}".format(depth), pi_width, init_scale=np.sqrt(2)))

        if vf_width is not None:
            assert isinstance(vf_width, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = act_fun(
                linear(latent_value, "values_fn/vf/fc{}".format(depth), vf_width, init_scale=np.sqrt(2)))

    return latent_policy, latent_value
Exemple #5
0
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
                 reuse=False, layers=None, net_arch=None, act_fun=tf.tanh,
                 cnn_extractor=nature_cnn, feature_extraction="cnn", **kwargs):
        """
        Feed-forward actor-critic policy with an additional ``vcf`` head next to the value head.

        :param sess: TensorFlow session, forwarded to the parent constructor
        :param ob_space: observation space, forwarded to the parent constructor
        :param ac_space: action space, forwarded to the parent constructor
        :param n_env: number of environments, forwarded to the parent constructor
        :param n_steps: number of steps, forwarded to the parent constructor
        :param n_batch: batch size, forwarded to the parent constructor
        :param reuse: reuse flag for the parent and for the "model" variable scope
        :param layers: deprecated; hidden layer sizes used for all three branches
            when ``net_arch`` is not given (defaults to ``[256, 256]``)
        :param net_arch: network specification handed to ``mlp_extractor_safe``
        :param act_fun: activation function for the MLP extractor
        :param cnn_extractor: feature extractor used when ``feature_extraction == "cnn"``
        :param feature_extraction: "cnn" selects the CNN extractor, anything else the MLP
        :param kwargs: extra keyword arguments passed to the CNN extractor
        """
        super(FeedForwardWithSafeValue, self).__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch,
            reuse=reuse, scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        if net_arch is None:
            # fall back to the same hidden sizes for value, policy and vcf branches
            hidden_sizes = [256, 256] if layers is None else layers  # [64,64]
            net_arch = [dict(vf=hidden_sizes, pi=hidden_sizes, vcf=hidden_sizes)]

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction != "cnn":
                pi_latent, vf_latent, vcf_latent = mlp_extractor_safe(
                    tf.layers.flatten(self.processed_obs), net_arch, act_fun)
            else:
                # with a CNN all three heads share one latent tensor
                shared_latent = cnn_extractor(self.processed_obs, **kwargs)
                pi_latent, vf_latent, vcf_latent = shared_latent, shared_latent, shared_latent

            self._value_fn = linear(vf_latent, 'vf', 1)
            self._vcf = linear(vcf_latent, 'vcf', 1)

            distribution_parts = self.pdtype.proba_distribution_from_latent(
                pi_latent, vf_latent, init_scale=0.01)
            self._proba_distribution, self._policy, self.q_value = distribution_parts

        self._setup_init()
        # first column of the vcf output, i.e. one value per batch row
        self._vcf_flat = self.vcf[:, 0]
Exemple #6
0
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
     """
     Build the Gaussian policy head (mean + clipped log-std) and the Q-value head.

     :param pi_latent_vector: latent tensor feeding the policy mean layer
     :param vf_latent_vector: latent tensor feeding the Q-value layer
     :param init_scale: init scale of both linear layers
     :param init_bias: initial bias of both linear layers
     :return: (distribution from ``proba_distribution_from_flat``, mean, q_values)
     """
     mean = linear(pi_latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
     if cfg.is_mod(cfg.MOD_BOUND_MEAN):
         with tf.variable_scope('pi'):
             # only the mean is squashed
             mean = tf.tanh(mean)

     raw_logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
     # clipping of the log-std, inspired by SAC
     logstd = tf.clip_by_value(raw_logstd, LOG_STD_MIN, LOG_STD_MAX)

     # "mean * 0.0 + logstd" broadcasts the log-std row to the shape of the mean
     broadcast_logstd = mean * 0.0 + logstd
     pdparam = tf.concat([mean, broadcast_logstd], axis=1)
     q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
     return self.proba_distribution_from_flat(pdparam), mean, q_values
Exemple #7
0
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
     """
     Build the Gaussian policy head with an optionally normalized mean, plus the Q-value head.

     When ``self.act_norm`` is set, the distribution parameters use
     ``self.act_norm.clip_normalize(mean)``; the raw ``mean`` is still what is
     returned as the second element.

     :param pi_latent_vector: latent tensor feeding the policy mean layer
     :param vf_latent_vector: latent tensor feeding the Q-value layer
     :param init_scale: init scale of both linear layers
     :param init_bias: initial bias of both linear layers
     :return: (distribution from ``proba_distribution_from_flat``, raw mean, q_values)
     """
     mean = linear(pi_latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
     norm_mean = mean if self.act_norm is None else self.act_norm.clip_normalize(mean)

     logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
     # "mean * 0.0 + logstd" broadcasts the log-std row to the shape of the mean
     broadcast_logstd = mean * 0.0 + logstd
     pdparam = tf.concat([norm_mean, broadcast_logstd], axis=1)

     q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
     return self.proba_distribution_from_flat(pdparam), mean, q_values
Exemple #8
0
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, args,
                 reuse=tf.compat.v1.AUTO_REUSE, **kwargs):
        """
        Policy whose latent vectors come from ``graph_extractor`` applied to a ``Graph`` model.

        The policy latent is used directly as the flat distribution parameters;
        the value and Q heads are linear layers on top of the value latent.

        :param sess: TensorFlow session, forwarded to the parent constructor
        :param ob_space: observation space, forwarded to the parent constructor
        :param ac_space: action space, forwarded to the parent constructor
        :param n_env: number of environments, forwarded to the parent constructor
        :param n_steps: number of steps, forwarded to the parent constructor
        :param n_batch: batch size, forwarded to the parent constructor
        :param args: configuration object; the ``graph_*`` attributes configure the
            ``Graph`` and ``n_action_slots`` sets the width of the Q head
        :param reuse: reuse flag forwarded to the parent constructor (the "model"
            scope below always uses AUTO_REUSE)
        :param kwargs: accepted but not used here
        """
        super(EmbeddingPolicy, self).__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch,
            reuse=reuse, scale=False)
        self.args = args
        self.step_counter = 0

        graph_settings = {
            'BATCH_SIZE': 1,
            'NODE_COUNT': args.graph_node_count,
            'UPDATE_ITERATION': args.graph_update_iteration,
            'EMBEDDING_SIZE': args.graph_embedding_size,
            'HIDDEN_LAYERS': args.graph_hidden_layers,
            'HIDDEN_ACTIVATION': tf.nn.relu,
            'END_ACTIVATION': tf.nn.relu,
            'RESIDUAL': True,
            'ROOT_EMBEDDING': False,
        }
        graph_args = AttrDict(graph_settings)

        with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
            self.graph = Graph(graph_args)

            pi_latent, vf_latent = graph_extractor(self.processed_obs, self.graph, args)

            self._value_fn = linear(vf_latent, 'vf', 1)

            # the policy latent is already the flat parameter vector of the distribution
            self._policy = pi_latent
            self._proba_distribution = self.pdtype.proba_distribution_from_flat(pi_latent)
            self.q_value = linear(vf_latent, 'q', self.args.n_action_slots, init_scale=0.01)

        self._setup_init()
    def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
        """
        Build (alpha, beta, mu, var) distribution parameters and the Q-value head.

        ``mu`` is a sigmoid output rescaled to the interval (0.117, 0.887) and
        ``var`` a sigmoid output divided by 100. ``alpha`` and ``beta`` are then
        derived from them as ``-mu * r`` and ``(mu - 1) * r`` with
        ``r = (var + mu**2 - mu) / var`` (computed with ``divide_no_nan``).

        :param pi_latent_vector: latent tensor feeding the ``mu`` and ``var`` layers
        :param vf_latent_vector: latent tensor feeding the Q-value layer
        :param init_scale: init scale of all three linear layers
        :param init_bias: initial bias of the Q-value layer only (``mu`` and ``var``
            layers use fixed biases of 0.3 and -0.3)
        :return: (distribution from ``proba_distribution_from_flat``, pdparam, q_values)
        """
        mu_logits = linear(pi_latent_vector, 'pi/dense', self.size, init_scale=init_scale, init_bias=0.3)
        mu = tf.math.sigmoid(mu_logits) * 0.770 + 0.117
        var_logits = linear(pi_latent_vector, 'pi/dense_1', self.size, init_scale=init_scale, init_bias=-0.3)
        var = tf.math.sigmoid(var_logits) / 100

        neg_mu = -mu
        alpha = neg_mu * tf.math.divide_no_nan(var + mu ** 2 - mu, var)
        mu_minus_one = mu - 1
        beta = mu_minus_one * tf.math.divide_no_nan(var + mu ** 2 - mu, var)

        pdparam = tf.concat([alpha, beta, mu, var], axis=1)
        q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
        return self.proba_distribution_from_flat(pdparam), pdparam, q_values
    def setup_model(self):
        """
        Build the TensorFlow graph: a target network, a predictor network, the loss and the train op.

        Both networks read ``self.processed_obs`` and end in a 512-unit linear
        layer named "out". The loss is the squared difference between the
        predictor output and the gradient-stopped target output.

        :raises ValueError: if ``self.network_type`` is neither 'cnn' nor 'mlp'
        """
        self.graph = tf.Graph()

        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)
            # observation scaling is only requested for the CNN variant
            self.observation_ph, self.processed_obs = observation_input(
                self.venv.observation_space,
                scale=(self.network_type == "cnn"))

            # Target network: its output is always wrapped in stop_gradient in the loss below
            with tf.variable_scope("target_model"):
                if self.network_type == 'cnn':
                    self.target_network = small_convnet(
                        self.processed_obs, tf.nn.leaky_relu)
                elif self.network_type == 'mlp':
                    self.target_network = tf_layers.mlp(
                        self.processed_obs, [1024, 512])
                    self.target_network = tf_layers.linear(
                        self.target_network, "out", 512)
                else:
                    raise ValueError("Unknown network type {}!".format(
                        self.network_type))

            # Predictor network: same trunk type as the target, plus two extra linear layers
            with tf.variable_scope("predictor_model"):
                if self.network_type == 'cnn':
                    self.predictor_network = tf.nn.relu(
                        small_convnet(self.processed_obs, tf.nn.leaky_relu))
                elif self.network_type == 'mlp':
                    self.predictor_network = tf_layers.mlp(
                        self.processed_obs, [1024, 512])
                # no else branch needed: an unknown network type already raised above

                self.predictor_network = tf.nn.relu(
                    tf_layers.linear(self.predictor_network, "pred_fc1", 512))
                self.predictor_network = tf_layers.linear(
                    self.predictor_network, "out", 512)

            with tf.name_scope("loss"):
                # per-sample value: mean squared error over axis 1
                self.int_reward = tf.reduce_mean(tf.square(
                    tf.stop_gradient(self.target_network) -
                    self.predictor_network),
                                                 axis=1)
                # scalar training loss: the same squared error averaged over all elements
                self.aux_loss = tf.reduce_mean(
                    tf.square(
                        tf.stop_gradient(self.target_network) -
                        self.predictor_network))

            with tf.name_scope("train"):
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.training_op = self.optimizer.minimize(self.aux_loss)

            # collects every trainable variable of this graph (target and predictor alike)
            self.params = tf.trainable_variables()
            tf.global_variables_initializer().run(session=self.sess)
Exemple #11
0
def build_actor_critic_network_actionsadded(x, layers, action_indices, state_indices, reuse):
    """
    Build a value latent from the summed action rows plus the state rows, and a policy latent via ``build_policy``.

    The rows selected by ``action_indices`` are summed into a single row,
    concatenated with the rows selected by ``state_indices``, flattened and fed
    through one ReLU layer per entry of ``layers``. A final ReLU "vf_head"
    layer produces one output per action index.

    :param x: input tensor; rows are gathered along axis 1
    :param layers: hidden layer sizes of the value network (also passed to ``build_policy``)
    :param action_indices: indices along axis 1 holding the action rows
    :param state_indices: indices along axis 1 holding the state rows
    :param reuse: unused; the "actor_critic" scope always uses AUTO_REUSE
    :return: (pi_latent, vf_latent)
    """
    activ = tf.nn.relu
    # AUTO_REUSE lives under tf.compat.v1 (as used by the sibling builders);
    # tf.compat itself has no such attribute.
    with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
        actions = tf.gather(x, action_indices, axis=1)
        # collapse all action rows into one summed row
        actions = tf.reduce_sum(actions, axis=1, keepdims=True)
        state = tf.gather(x, state_indices, axis=1)
        vf_h = tf.layers.flatten(tf.concat([actions, state], axis=1))
        for j, layer_size in enumerate(layers):
            vf_h = activ(linear(vf_h, 'vf_fc' + str(j), n_hidden=layer_size, init_scale=np.sqrt(2)))
    # the head is created outside the "actor_critic" scope, as in the original
    vf_latent = activ(linear(vf_h, 'vf_head', len(action_indices)))

    pi_latent = build_policy(x, layers, action_indices, state_indices, activ)

    return pi_latent, vf_latent
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0, stv_from_obs=False):
     """
     Build the Gaussian policy head and the Q-value head, with a fixed or observation-dependent log-std.

     :param pi_latent_vector: latent tensor feeding the policy layers
     :param vf_latent_vector: latent tensor feeding the Q-value layer
     :param init_scale: init scale of the linear layers
     :param init_bias: initial bias of the linear layers (the std layer uses ``init_bias - 0.5``)
     :param stv_from_obs: if true, the std is ``EPS + sigmoid(linear(pi_latent_vector))``;
         otherwise the log-std is a zero-initialised variable broadcast over the batch
     :return: (distribution from ``proba_distribution_from_flat``, mean, q_values, logstd)
     """
     mean = linear(pi_latent_vector, 'pi/dense', self.size, init_scale=init_scale, init_bias=init_bias)
     q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)

     if not stv_from_obs:
         print("STD from FIXED VALUE")
         logstd = tf.get_variable(name='pi/dense_1', shape=[1, self.size], initializer=tf.zeros_initializer())
         # "mean * 0.0 + logstd" broadcasts the log-std row to the shape of the mean
         broadcast_logstd = mean * 0.0 + logstd
         pdparam = tf.concat([mean, broadcast_logstd], axis=1)
     else:
         print("STD from OBSERVATION")
         # EPS keeps the std strictly positive before taking the log
         std = EPS + tf.nn.sigmoid(
             linear(pi_latent_vector, 'pi/dense_1', self.size, init_scale=init_scale, init_bias=init_bias-0.5))
         logstd = tf.log(std)
         pdparam = tf.concat([mean, logstd], axis=1)

     return self.proba_distribution_from_flat(pdparam), mean, q_values, logstd
Exemple #13
0
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, act_norm_init=None, obs_norm_init=None,
                 net_arch=None, reuse=False, act_fun=tf.tanh):
        """
        MLP actor-critic policy with optional observation and action normalizers.

        :param sess: TensorFlow session, forwarded to the parent and to the normalizers
        :param ob_space: observation space; ``ob_space.shape[0]`` sizes the observation normalizer
        :param ac_space: action space; ``ac_space.shape[0]`` sizes the action normalizer and distribution
        :param n_env: number of environments, forwarded to the parent constructor
        :param n_steps: number of steps, forwarded to the parent constructor
        :param n_batch: batch size, forwarded to the parent constructor
        :param act_norm_init: keyword arguments for the action ``TFNormalizer`` (None disables it)
        :param obs_norm_init: keyword arguments for the observation ``TFNormalizer`` (None disables it)
        :param net_arch: network specification for ``mlp_extractor``
            (defaults to two 64-unit layers per branch)
        :param reuse: reuse flag for the parent, the normalizers and the "model" scope
        :param act_fun: activation function for the MLP extractor
        """
        super(NormalMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse)

        self.obs_norm = (
            TFNormalizer(sess, 'obs_norm', ob_space.shape[0], reuse=reuse, **obs_norm_init)
            if obs_norm_init is not None else None
        )
        self.act_norm = (
            TFNormalizer(sess, 'act_norm', ac_space.shape[0], reuse=reuse, **act_norm_init)
            if act_norm_init is not None else None
        )

        # replace the distribution type set up by the parent with the action-normalizing one
        del self._pdtype
        self._pdtype = ActNormGaussProbDistType(ac_space.shape[0], self.act_norm)

        if net_arch is None:
            net_arch = [dict(vf=[64, 64], pi=[64, 64])]

        with tf.variable_scope("model", reuse=reuse):
            flat_obs = tf.layers.flatten(self.processed_obs)
            # normalization and clipping, only when an observation normalizer exists
            extractor_in = flat_obs if self.obs_norm is None else self.obs_norm.clip_normalize(flat_obs)

            pi_latent, vf_latent = mlp_extractor(extractor_in, net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            distribution_parts = self.pdtype.proba_distribution_from_latent(
                pi_latent, vf_latent, init_scale=0.01)
            self._proba_distribution, self._policy, self.q_value = distribution_parts

        self._setup_init()
Exemple #14
0
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
     """
     Build the distribution parameters and the Q-values as two plain linear layers.

     :param pi_latent_vector: latent tensor feeding the 'pi' layer
     :param vf_latent_vector: latent tensor feeding the 'q' layer
     :param init_scale: init scale of both linear layers
     :param init_bias: initial bias of both linear layers
     :return: (distribution from ``proba_distribution_from_flat``, pdparam, q_values)
     """
     layer_kwargs = dict(init_scale=init_scale, init_bias=init_bias)
     flat_params = linear(pi_latent_vector, 'pi', self.size, **layer_kwargs)
     action_values = linear(vf_latent_vector, 'q', self.size, **layer_kwargs)
     return self.proba_distribution_from_flat(flat_params), flat_params, action_values
Exemple #15
0
    def __init__(self, tf_session, ob_space, ac_space, num_env, num_steps, num_batch,
                 activation_func=tf.nn.tanh, reuse=False, **kwargs):
        """
        MLP actor-critic policy with three 256-unit hidden layers per branch.

        :param tf_session: TensorFlow session, forwarded to the parent constructor
        :param ob_space: observation space, forwarded to the parent constructor
        :param ac_space: action space, forwarded to the parent constructor
        :param num_env: number of environments, forwarded to the parent constructor
        :param num_steps: number of steps, forwarded to the parent constructor
        :param num_batch: batch size, forwarded to the parent constructor
        :param activation_func: activation function for the MLP extractor
        :param reuse: reuse flag for the parent and for the "model" variable scope
        :param kwargs: accepted but not used here
        """
        super(SafePolicy, self).__init__(
            tf_session, ob_space, ac_space, num_env, num_steps, num_batch, reuse=reuse)

        hidden_sizes = [256, 256, 256]
        # value and policy branches share the same layer-size list, but no layers
        net_arch = [dict(vf=hidden_sizes, pi=hidden_sizes)]

        with tf.variable_scope("model", reuse=reuse):
            flat_obs = tf.layers.flatten(self.processed_obs)
            pi_latent, vf_latent = mlp_extractor(flat_obs, net_arch, activation_func)

            self._value_fn = linear(vf_latent, 'vf', 1)

            distribution_parts = self.pdtype.proba_distribution_from_latent(
                pi_latent, vf_latent, init_scale=0.01)
            self._proba_distribution, self._policy, self.q_value = distribution_parts

        self._setup_init()
Exemple #16
0
def Cnn1(image, **kwargs):
    """
    Three stride-1 conv layers (32, 64, 64 filters of size 3) followed by a 512-unit dense layer.

    Every layer is followed by a ReLU.

    :param image: (TensorFlow Tensor) Image input
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    hidden = image
    for scope, n_filters in (('c1', 32), ('c2', 64), ('c3', 64)):
        hidden = relu(conv(hidden, scope, n_filters=n_filters, filter_size=3, stride=1,
                           init_scale=gain, **kwargs))

    flat = conv_to_fc(hidden)
    return relu(linear(flat, 'fc1', n_hidden=512, init_scale=gain))
Exemple #17
0
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
                 reuse=False, feature_extraction="cnn", **kwargs):
        """
        Actor-critic policy using ``customizedCNN`` or ``mlp_extractor`` as feature extractor.

        :param sess: TensorFlow session, forwarded to the parent constructor
        :param ob_space: observation space, forwarded to the parent constructor
        :param ac_space: action space, forwarded to the parent constructor
        :param n_env: number of environments, forwarded to the parent constructor
        :param n_steps: number of steps, forwarded to the parent constructor
        :param n_batch: batch size, forwarded to the parent constructor
        :param reuse: reuse flag for the parent and for the "model" variable scope
        :param feature_extraction: "cnn" selects ``customizedCNN``, anything else the MLP
        :param kwargs: extra keyword arguments passed to the CNN extractor
        """
        uses_cnn = feature_extraction == "cnn"
        super(PPOPolicy, self).__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch,
            reuse=reuse, scale=uses_cnn)

        self._kwargs_check(feature_extraction, kwargs)

        with tf.variable_scope("model", reuse=reuse):
            if uses_cnn:
                pi_latent, vf_latent = customizedCNN(self.processed_obs, **kwargs)
            else:
                # NOTE(review): mlp_extractor is called with the observations only;
                # confirm the imported mlp_extractor accepts a single argument.
                pi_latent, vf_latent = mlp_extractor(tf.layers.flatten(self.processed_obs))

            self._value_fn = linear(vf_latent, 'vf', 1)

            distribution_parts = self.pdtype.proba_distribution_from_latent(
                pi_latent, vf_latent, init_scale=0.01)
            self._proba_distribution, self._policy, self.q_value = distribution_parts

        self._setup_init()
def small_convnet(x, activ=tf.nn.relu, **kwargs):
    """
    Three conv layers followed by a 512-unit linear layer without activation.

    The conv layers use (32 filters, size 8, stride 4), (64, 4, 2) and (64, 3, 1),
    each followed by ``activ``.

    :param x: (TensorFlow Tensor) Input tensor for the first conv layer
    :param activ: activation applied after every conv layer
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers
    :return: (TensorFlow Tensor) Output of the final linear layer
    """
    gain = np.sqrt(2)
    # (scope, n_filters, filter_size, stride) for each conv layer, in order
    conv_specs = (('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1))

    hidden = x
    for scope, n_filters, filter_size, stride in conv_specs:
        hidden = activ(tf_layers.conv(hidden, scope, n_filters=n_filters, filter_size=filter_size,
                                      stride=stride, init_scale=gain, **kwargs))

    flat = tf_layers.conv_to_fc(hidden)
    return tf_layers.linear(flat, 'fc1', n_hidden=512, init_scale=gain)
Exemple #19
0
def custom_extractor(scaled_images, **kwargs):
    """
    Three conv layers followed by a 512-unit dense layer, all with ReLU.

    The conv layers use (32 filters, size 8, stride 4), (64, 4, 2) and (64, 3, 1).

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    # (scope, n_filters, filter_size, stride) for each conv layer, in order
    conv_specs = (('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1))

    hidden = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        hidden = relu(conv(hidden, scope, n_filters=n_filters, filter_size=filter_size,
                           stride=stride, init_scale=gain, **kwargs))

    flat = conv_to_fc(hidden)
    return relu(linear(flat, 'fc1', n_hidden=512, init_scale=gain))
Exemple #20
0
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, args, reuse=tf.compat.v1.AUTO_REUSE, **kwargs):
        """
        Policy that splits the observation into MCTS probabilities and network input.

        The last slice along the final axis of the processed observation is read
        as per-action MCTS probabilities (for the first ``args.n_action_slots``
        rows); the remaining slices are the input of the actor-critic network,
        which is chosen by ``args.value_gets_actions`` / ``args.actions_added``.

        :param sess: TensorFlow session, forwarded to the parent constructor
        :param ob_space: observation space, forwarded to the parent constructor
        :param ac_space: action space, forwarded to the parent constructor
        :param n_env: number of environments, forwarded to the parent constructor
        :param n_steps: number of steps, forwarded to the parent constructor
        :param n_batch: batch size, forwarded to the parent constructor
        :param args: configuration object (``n_action_slots``, ``state_dim``,
            ``network_layers``, ``latent_dim``, ``value_gets_actions``, ``actions_added``)
        :param reuse: reuse flag forwarded to the parent and to the network builders
        :param kwargs: accepted but not used here
        """
        super(EnigmaPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=False)
        self.args = args
        self.step_counter = 0

        # turn the MCTS probabilities into logits; 1e-5 keeps both logs finite
        mcts_probs = self.processed_obs[:, :args.n_action_slots, -1]
        log_p = tf.log(mcts_probs + 1e-5)
        log_not_p = tf.log(1 - mcts_probs + 1e-5)
        mcts_logits = log_p - log_not_p
        self.mcts_distribution = self.pdtype.proba_distribution_from_flat(mcts_logits)

        # everything except the last slice is the network input
        obs = self.processed_obs[:, :, :-1]

        n_slots = self.args.n_action_slots
        action_indices = list(range(n_slots))
        # state rows directly follow the action rows
        state_indices = [offset + self.args.n_action_slots for offset in range(self.args.state_dim)]

        with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
            if not self.args.value_gets_actions:
                pi_latent, vf_latent = build_actor_critic_network_tri_separate_vf(
                    obs, args.network_layers, action_indices, state_indices, args.latent_dim)
            elif self.args.actions_added:
                pi_latent, vf_latent = build_actor_critic_network_actionsadded(
                    obs, args.network_layers, action_indices, state_indices, reuse)
            else:
                pi_latent, vf_latent = build_actor_critic_network_tri(
                    obs, args.network_layers, action_indices, state_indices, reuse)

            self._value_fn = linear(vf_latent, 'vf', 1)

            # the policy latent is already the flat parameter vector of the distribution
            self._policy = pi_latent
            self._proba_distribution = self.pdtype.proba_distribution_from_flat(pi_latent)
            self.q_value = vf_latent

        self._setup_init()
Exemple #21
0
def customizedCNN(scaled_images, **kwargs):
    """
    Small CNN trunk with separate policy and value heads.

    Three 8-filter conv layers (size 6 / stride 3, size 3 / stride 2,
    size 3 / stride 1) and two dense layers (256, 128) with ReLU form the
    shared trunk. Two separate 64-unit tanh layers on top of it give the
    policy and value latents.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor, TensorFlow Tensor) policy latent, value latent
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    # (scope, filter_size, stride) for each 8-filter conv layer, in order
    conv_specs = (('c1', 6, 3), ('c2', 3, 2), ('c3', 3, 1))

    trunk = scaled_images
    for scope, filter_size, stride in conv_specs:
        trunk = relu(conv(trunk, scope, n_filters=8, filter_size=filter_size, stride=stride,
                          init_scale=gain, **kwargs))

    trunk = conv_to_fc(trunk)
    for scope, width in (('fc1', 256), ('fc2', 128)):
        trunk = relu(linear(trunk, scope, n_hidden=width, init_scale=gain))

    # the heads use tanh instead of ReLU
    head_activ = tf.tanh
    pi = head_activ(linear(trunk, "pi_fc{}".format(1), 64, init_scale=gain))
    vf = head_activ(linear(trunk, "vf_fc{}".format(1), 64, init_scale=gain))
    return pi, vf
Exemple #22
0
def modified_cnn(unscaled_images, **kwargs):
	"""
	Two-layer CNN that scales its input itself.

	The input is cast to float32 and divided by 255, then passed through two
	32-filter conv layers (size 1 / stride 1, size 2 / stride 2) and a
	512-unit dense layer, all with ReLU.

	:param unscaled_images: (TensorFlow Tensor) Image input, not yet scaled
	:param kwargs: (dict) Extra keywords parameters for the convolutional layers
	:return: (TensorFlow Tensor) The CNN output layer
	"""
	import tensorflow as tf
	relu = tf.nn.relu
	images = tf.cast(unscaled_images, tf.float32) / 255.
	hidden = relu(conv(images, 'c1', n_filters=32, filter_size=1, stride=1, init_scale=np.sqrt(2), **kwargs))
	hidden = relu(conv(hidden, 'c2', n_filters=32, filter_size=2, stride=2, init_scale=np.sqrt(2), **kwargs))
	flat = conv_to_fc(hidden)
	return relu(linear(flat, 'fc1', n_hidden=512, init_scale=np.sqrt(2)))
Exemple #23
0
def build_actor_critic_network_peasant_method(x, layers, action_indices, state_indices, reuse):
    """
    Apply the same small policy and value MLPs to every row of ``x`` and combine the results.

    Each slice ``x[:, i, :]`` goes through one ReLU layer per entry of
    ``layers`` for the policy and for the value branch. The variables are
    shared across rows through AUTO_REUSE. The per-row outputs are
    concatenated, flattened and mapped by linear heads to one output per
    action index.

    :param x: input tensor, indexed as ``x[:, i, :]``
    :param layers: hidden layer sizes of the per-row MLPs
    :param action_indices: only its length is used (rows covered, head width)
    :param state_indices: only its length is used (rows covered)
    :param reuse: unused; the "actor_critic" scope always uses AUTO_REUSE
    :return: (pi_latent, vf_latent)
    """
    relu = tf.nn.relu
    policy_feats = []
    value_feats = []

    n_rows = len(action_indices) + len(state_indices)
    for row in range(n_rows):
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            row_input = x[:, row, :]
            pi_h = row_input
            vf_h = row_input
            for depth, width in enumerate(layers):
                pi_h = relu(linear(pi_h, 'pi_fc' + str(depth), n_hidden=width, init_scale=np.sqrt(2)))
                vf_h = relu(linear(vf_h, 'vf_fc' + str(depth), n_hidden=width, init_scale=np.sqrt(2)))
            policy_feats.append(pi_h)
            value_feats.append(vf_h)

    pi_h = tf.layers.flatten(tf.concat(policy_feats, axis=1))
    vf_h = tf.layers.flatten(tf.concat(value_feats, axis=1))

    n_outputs = len(action_indices)
    pi_latent = linear(pi_h, 'pi_head', n_outputs)
    vf_latent = linear(vf_h, 'vf_head', n_outputs)
    return pi_latent, vf_latent
Exemple #24
0
def build_actor_critic_network(x, layers, num_actions, num_state, reuse):
    """
    Apply the same small policy and value MLPs to every row of ``x`` and concatenate the results.

    Each slice ``x[:, i, :]`` goes through one ReLU layer per entry of
    ``layers`` for the policy and for the value branch. The variables are
    shared across rows through AUTO_REUSE. The per-row outputs are
    concatenated and flattened into the two latents.

    :param x: input tensor, indexed as ``x[:, i, :]``, so it must keep its row axis
    :param layers: hidden layer sizes of the per-row MLPs
    :param num_actions: number of action rows in ``x``
    :param num_state: number of state rows in ``x``
    :param reuse: unused; the "actor_critic" scope always uses AUTO_REUSE
    :return: (pi_latent, vf_latent)
    """
    activ = tf.nn.relu
    pis = []
    vfs = []
    # x is deliberately NOT flattened here: flattening collapses it to two
    # axes, and the three-index slice x[:, i, :] below would no longer be valid.
    for i in range(num_actions + num_state):
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            x_prime = x[:, i, :]
            pi_h = x_prime
            vf_h = x_prime
            # separate index name so the row index i is not shadowed
            for j, layer_size in enumerate(layers):
                pi_h = activ(linear(pi_h, 'pi_fc' + str(j), n_hidden=layer_size, init_scale=np.sqrt(2)))
                vf_h = activ(linear(vf_h, 'vf_fc' + str(j), n_hidden=layer_size, init_scale=np.sqrt(2)))
            pis.append(pi_h)
            vfs.append(vf_h)
    pi_latent = tf.layers.flatten(tf.concat(pis, axis=1))
    vf_latent = tf.layers.flatten(tf.concat(vfs, axis=1))
    return pi_latent, vf_latent
Exemple #25
0
def build_policy(x, layers, action_indices, state_indices, activ):
    """
    Score every action slot with a network applied to that slot gathered
    together with all state slots.

    The original comment describes the gathered input as
    ``(action_i, goal, path)`` triples -- presumably ``state_indices`` selects
    the goal and path slots; confirm against the caller.

    :param x: (TensorFlow Tensor) Input whose axis 1 is indexed by slot
    :param layers: ([int]) Hidden layer sizes of the scoring network
    :param action_indices: (iterable of int) Slots to score
    :param state_indices: (iterable of int) Slots appended to every action slot
    :param activ: (callable) Activation applied after each hidden layer
    :return: (TensorFlow Tensor) Flattened concatenation of the per-action scores
    """
    scores = []
    for action_idx in action_indices:
        gather_idx = np.array([action_idx, *state_indices], dtype=np.int32)
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            hidden = tf.layers.flatten(tf.gather(x, gather_idx, axis=1))
            for depth, width in enumerate(layers):
                hidden = activ(
                    linear(hidden, 'pi_fc' + str(depth), n_hidden=width, init_scale=np.sqrt(2)))
            score = linear(hidden, 'pi_fc_last', n_hidden=1, init_scale=np.sqrt(2))

            # A slot whose features do not sum to a positive value gets a
            # score of -1e7 instead of the network output.
            is_active = tf.reduce_sum(x[:, action_idx], axis=1) > 0
            masked_score = score * 0 - 1e7
            scores.append(tf.where(is_active, score, masked_score))
    return tf.layers.flatten(tf.concat(scores, axis=1))
Exemple #26
0
def build_actor_critic_network_tri(x, layers, action_indices, state_indices, reuse):
    """
    Build the value latent from per-action towers and the policy latent from
    ``build_policy``.

    Each value tower sees one action slot gathered together with all state
    slots and ends in a 10-unit ReLU layer. The tower outputs are concatenated
    and passed through a ReLU linear head with one unit per action slot.

    :param x: (TensorFlow Tensor) Input whose axis 1 is indexed by slot
    :param layers: ([int]) Hidden layer sizes of each tower
    :param action_indices: (sequence of int) Action slots
    :param state_indices: (iterable of int) Slots appended to every action slot
    :param reuse: Unused by this function
    :return: (TensorFlow Tensor, TensorFlow Tensor) policy latent and value latent
    """
    relu = tf.nn.relu
    value_towers = []
    for action_idx in action_indices:
        gather_idx = np.array([action_idx, *state_indices], dtype=np.int32)
        with tf.variable_scope("actor_critic", reuse=tf.compat.v1.AUTO_REUSE):
            hidden = tf.layers.flatten(tf.gather(x, gather_idx, axis=1))
            for depth, width in enumerate(layers):
                hidden = relu(
                    linear(hidden, 'vf_fc' + str(depth), n_hidden=width, init_scale=np.sqrt(2)))
            value_towers.append(
                relu(linear(hidden, 'vf_fc_last', n_hidden=10, init_scale=np.sqrt(2))))
    value_flat = tf.layers.flatten(tf.concat(value_towers, axis=1))
    vf_latent = relu(linear(value_flat, 'vf_head', len(action_indices)))

    # The policy branch uses the same activation as the value branch.
    pi_latent = build_policy(x, layers, action_indices, state_indices, relu)

    return pi_latent, vf_latent
Exemple #27
0
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector,
                                    init_scale=1.0, init_bias=0.0):
     """
     Build the distribution, its mean and the q-values from the latent vectors.

     :param pi_latent_vector: (TensorFlow Tensor) Latent input of the 'pi' linear layer
     :param vf_latent_vector: (TensorFlow Tensor) Latent input of the 'q' linear layer
     :param init_scale: (float) Initial scale passed to both linear layers
     :param init_bias: (float) Initial bias passed to both linear layers
     :return: (distribution, TensorFlow Tensor, TensorFlow Tensor) the result of
         ``proba_distribution_from_flat``, the mean and the q-values
     """
     head_kwargs = dict(init_scale=init_scale, init_bias=init_bias)

     mean = linear(pi_latent_vector, 'pi', self.size, **head_kwargs)
     logstd = tf.compat.v1.get_variable(name='pi/logstd',
                                        shape=[1, self.size],
                                        initializer=tf.compat.v1.zeros_initializer())
     # Adding to a zeroed copy of the mean expands the single logstd row to
     # the mean's shape, so the two can be concatenated.
     expanded_logstd = mean * 0.0 + logstd
     pdparam = tf.concat([mean, expanded_logstd], axis=1)

     q_values = linear(vf_latent_vector, 'q', self.size, **head_kwargs)
     return self.proba_distribution_from_flat(pdparam), mean, q_values
def cnn(input_tensor, **kwargs):
    """
    Small CNN over the image part of the input; the remaining columns bypass it.

    Columns 0-48 of the input are reshaped to a 7x7x1 image and passed through
    one convolution and one 49-unit fully connected layer. Columns 49-98 are
    concatenated unchanged to that output.

    :param input_tensor: (TensorFlow Tensor) 2D input with at least 99 columns
    :param kwargs: (dict) Extra keyword parameters for the convolutional layer
    :return: (TensorFlow Tensor) Visual features followed by the bypassed columns
    """
    relu = tf.nn.relu

    image_flat = tf.slice(input_tensor, [0, 0], [-1, 49], name='input_img')
    prev_output = tf.slice(input_tensor, [0, 49], [-1, 50], 'prev_outputs')
    image = tf.reshape(image_flat, (-1, 7, 7, 1))

    conv_out = relu(
        conv(image, 'c1', n_filters=16, filter_size=3, stride=1, init_scale=np.sqrt(2), **kwargs))
    visual_output = relu(
        linear(conv_to_fc(conv_out), 'fc1', n_hidden=49, init_scale=np.sqrt(2)))
    return tf.concat([visual_output, prev_output], 1)
Exemple #29
0
def nature_cnn(scaled_images, **kwargs):
    """
    CNN from the Nature paper, with an optional parity mask on the input.

    If ``view`` is given in ``kwargs`` the image is first multiplied by a 0/1
    mask built over the row-major flat pixel index ``i``: ``'even'`` uses
    ``i % 2`` and ``'odd'`` uses ``1 - i % 2``. The ``view`` entry is not
    forwarded to the convolutional layers.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keyword parameters for the convolutional
        layers of the CNN, plus the optional ``view`` entry
    :return: (TensorFlow Tensor) The CNN output layer
    :raises NotImplementedError: if ``view`` is neither ``'even'`` nor ``'odd'``
    """
    activ = tf.nn.relu

    if 'view' in kwargs:
        view_type = kwargs.pop('view')
        _, height, width, _ = scaled_images.shape
        if view_type == 'even':
            parity = [idx % 2 for idx in range(height * width)]
        elif view_type == 'odd':
            parity = [1 - idx % 2 for idx in range(height * width)]
        else:
            raise NotImplementedError
        mask = np.array(parity).reshape((1, height, width, 1))
        scaled_images = scaled_images * tf.constant(mask, dtype=tf.float32)

    # (scope, n_filters, filter_size, stride) for each convolution, in order.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )
    hidden = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        hidden = activ(
            conv(hidden,
                 scope,
                 n_filters=n_filters,
                 filter_size=filter_size,
                 stride=stride,
                 init_scale=np.sqrt(2),
                 **kwargs))
    flattened = conv_to_fc(hidden)
    return activ(linear(flattened, 'fc1', n_hidden=512, init_scale=np.sqrt(2)))
    def augmented_nature_cnn(scaled_images, **kwargs):
        """
        Nature CNN head for images whose last channel carries direct features.

        The last channel is flattened and cut to its first
        ``num_direct_features`` values (a name taken from the enclosing
        scope). The remaining channels go through the three-convolution
        stack and a 512-unit fully connected layer. The direct features are
        appended to that output.

        :param scaled_images: (TensorFlow Tensor) Image input placeholder
        :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
        :return: (TensorFlow Tensor) Image features followed by the direct features
        """
        activ = tf.nn.relu

        # Last channel holds the direct features; anything past the known
        # amount is padding zeros.
        direct_channel = tf.contrib.slim.flatten(scaled_images[..., -1])
        other_features = direct_channel[:, :num_direct_features]

        # (scope, n_filters, filter_size, stride) for each convolution, in order.
        conv_specs = (
            ('cnn1', 32, 8, 4),
            ('cnn2', 64, 4, 2),
            ('cnn3', 64, 3, 1),
        )
        hidden = scaled_images[..., :-1]
        for scope, n_filters, filter_size, stride in conv_specs:
            hidden = activ(
                conv(hidden,
                     scope,
                     n_filters=n_filters,
                     filter_size=filter_size,
                     stride=stride,
                     init_scale=np.sqrt(2),
                     **kwargs))
        flattened = conv_to_fc(hidden)

        img_output = activ(
            linear(flattened, 'cnn_fc1', n_hidden=512, init_scale=np.sqrt(2)))
        # Append direct features to the final output of the extractor.
        return tf.concat((img_output, other_features), axis=1)