Example #1
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
        super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=True)

        with tf.compat.v1.variable_scope("model", reuse=reuse):
            activ = tf.nn.sigmoid

            pi_latent2, vf_latent2 = mlp_extractor(self.processed_obs,
                                                   net_arch=[128, dict(vf=[156, 156], pi=[128])],
                                                   act_fun=tf.nn.relu, **kwargs)
            # final policy projection over the discrete action space, squashed with the sigmoid above
            actionSpace = tf.compat.v1.layers.dense(pi_latent2, ac_space.n, activation=activ, name='pf')
            value_fn = tf.compat.v1.layers.dense(vf_latent2, 1, name='vf')
            vf_latent = vf_latent2

            # pi_h = extracted_features
            # for i, layer_size in enumerate([128, 128, 128]):
            #     pi_h = activ(tf.compat.v1.layers.dense(pi_h, layer_size, name='pi_fc' + str(i)))
            # pi_latent = pi_h
            #
            # vf_h = extracted_features
            # for i, layer_size in enumerate([32, 32]):
            #     vf_h = activ(tf.compat.v1.layers.dense(vf_h, layer_size, name='vf_fc' + str(i)))
            # value_fn = tf.compat.v1.layers.dense(vf_h, 1, name='vf')
            # vf_latent = vf_h

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(actionSpace, vf_latent, init_scale=0.01)

        self._value_fn = value_fn
        self._setup_init()
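A note on the `net_arch` argument used above: in stable-baselines' `mlp_extractor`, leading integers describe layers shared by the policy and value networks, and the trailing dict lists the layer sizes of the separate heads. A minimal sketch of the architecture requested in this example (sizes taken from the call above):

net_arch = [128,                     # one shared fully connected layer with 128 units
            dict(vf=[156, 156],      # then two 156-unit layers for the value head
                 pi=[128])]          # and one 128-unit layer for the policy head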
Example #2
    def __init__(self,
                 tf_session,
                 ob_space,
                 ac_space,
                 num_env,
                 num_steps,
                 num_batch,
                 activation_func=tf.nn.tanh,
                 reuse=False,
                 **kwargs):
        super(SafePolicy, self).__init__(tf_session,
                                         ob_space,
                                         ac_space,
                                         num_env,
                                         num_steps,
                                         num_batch,
                                         reuse=reuse)
        layers = [256, 256, 256]
        net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            pi_latent, vf_latent = mlp_extractor(
                tf.layers.flatten(self.processed_obs), net_arch,
                activation_func)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()
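For context, a custom policy class like the ones in these examples is passed directly to a stable-baselines algorithm constructor. A minimal usage sketch, assuming stable-baselines 2.x; the environment id below is only illustrative and does not appear in the example itself:

from stable_baselines import PPO2

# SafePolicy is the class defined above; any environment whose observation and
# action spaces fit the policy can be used here.
model = PPO2(SafePolicy, 'CartPole-v1', verbose=1)
model.learn(total_timesteps=10000)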
Example #3
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, act_norm_init=None, obs_norm_init=None,
                 net_arch=None, reuse=False, act_fun=tf.tanh):
        super(NormalMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse)

        if obs_norm_init is not None:
            self.obs_norm = TFNormalizer(sess, 'obs_norm', ob_space.shape[0], reuse=reuse, **obs_norm_init)
        else:
            self.obs_norm = None

        if act_norm_init is not None:
            self.act_norm = TFNormalizer(sess, 'act_norm', ac_space.shape[0], reuse=reuse, **act_norm_init)
        else:
            self.act_norm = None

        del self._pdtype
        self._pdtype = ActNormGaussProbDistType(ac_space.shape[0], self.act_norm)

        if net_arch is None:
            net_arch = [dict(vf=[64, 64], pi=[64, 64])]

        with tf.variable_scope("model", reuse=reuse):
            # normalization and clipping
            if self.obs_norm is not None:
                extractor_in = self.obs_norm.clip_normalize(tf.layers.flatten(self.processed_obs))
            else:
                extractor_in = tf.layers.flatten(self.processed_obs)

            pi_latent, vf_latent = mlp_extractor(extractor_in, net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()
Example #4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 feature_extraction="cnn",
                 **kwargs):
        super(BSSPolicy, self).__init__(sess,
                                        ob_space,
                                        ac_space,
                                        n_env,
                                        n_steps,
                                        n_batch,
                                        reuse=reuse,
                                        scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                pi_latent = vf_latent = cnn_extractor(self.processed_obs,
                                                      **kwargs)
            else:
                pi_latent, vf_latent = mlp_extractor(
                    tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            self.pi_feature_m = pi_latent
            self.vf_feature_m = vf_latent

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()
Example #5
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=True,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.nn.relu,
                 feature_extraction="mlp",
                 **kwargs):
        super(TransformerPolicy, self).__init__(sess,
                                                ob_space,
                                                ac_space,
                                                n_env,
                                                n_steps,
                                                n_batch,
                                                reuse=reuse)
        encoder = TransformerPolicy.get_common_police_network()

        self._kwargs_check(feature_extraction, kwargs)

        if net_arch is None:
            if layers is None:
                layers = [
                    128,
                ]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            transformer_encoded = encoder(self.processed_obs['backbone'], None)

            with_energy = tf.layers.flatten(transformer_encoded)
            with_energy = tf.keras.layers.Concatenate()(
                [with_energy, self.processed_obs['step_to_end']])
            pi_latent, vf_latent = mlp_extractor(with_energy, net_arch,
                                                 act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()
Example #6
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
        super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=True)
        self._is_graph_network = True

        with tf.compat.v1.variable_scope("model", reuse=reuse):
            activ = tf.nn.relu

            # pi_latent2, vf_latent2 = mlp_extractor(self.processed_obs,net_arch = [128, dict(vf=[156, 156], pi=[128])], act_fun = tf.nn.relu, **kwargs)
            # actionSpace = tf.compat.v1.layers.dense(pi_latent2, ac_space.n, activation= 'sigmoid', name = 'pf')
            # value_fn = tf.compat.v1.layers.dense(vf_latent2, 1, name='vf')
            # vf_latent = vf_latent2

            shapesShared = [256]
            extracted_features = mlp_extractor(self.processed_obs, shapesShared, activ)
            # extracted_features = mlp_extractor(extracted_features, shapesShared, activ)

            pi_h = extracted_features[0]
            shapesp = [128, 64]
            for i, layer_size in enumerate(shapesp):
                if i == len(shapesp) - 1:
                    # the final policy layer is squashed with a sigmoid, earlier layers use relu
                    pi_h = tf.nn.sigmoid(tf.compat.v1.layers.dense(pi_h, layer_size, name='pi_fc' + str(i)))
                else:
                    pi_h = activ(tf.compat.v1.layers.dense(pi_h, layer_size, name='pi_fc' + str(i)))
            pi_latent = pi_h

            vf_h = extracted_features[1]
            shapesv = [64, 64]
            for i, layer_size in enumerate(shapesv):
                vf_h = activ(tf.compat.v1.layers.dense(vf_h, layer_size, name='vf_fc' + str(i)))
            value_fn = tf.compat.v1.layers.dense(vf_h, 1, name='vf')
            vf_latent = vf_h

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._value_fn = value_fn
        self._setup_init()
Example #7
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=[dict(vf=[128, 128, 128], pi=[128, 128, 128])],
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 feature_extraction="gnn",
                 layer_size=64,
                 layer_count=2,
                 network_graphs=None,
                 dm_memory_length=None,
                 iterations=10,
                 vf_arch="mlp",
                 **kwargs):
        super(FeedForwardPolicyWithGnn,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse=reuse,
                             scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                pi_latent = vf_latent = cnn_extractor(self.processed_obs,
                                                      **kwargs)
            elif feature_extraction == "gnn":
                pi_latent, vf_latent = gnn_extractor(tf.layers.flatten(
                    self.processed_obs),
                                                     act_fun,
                                                     network_graphs,
                                                     dm_memory_length,
                                                     layer_size=layer_size,
                                                     layer_count=layer_count,
                                                     iterations=iterations,
                                                     vf_arch=vf_arch)
            elif feature_extraction == "gnn_iter":
                pi_latent, vf_latent = gnn_iter_extractor(
                    tf.layers.flatten(self.processed_obs),
                    act_fun,
                    network_graphs,
                    dm_memory_length,
                    layer_size=layer_size,
                    layer_count=layer_count,
                    iterations=iterations,
                    vf_arch=vf_arch)
            else:  # Assume mlp feature extraction
                pi_latent, vf_latent = mlp_extractor(
                    tf.layers.flatten(self.processed_obs), net_arch, act_fun)
                # Need this here as it was removed from proba_distribution.
                # It is OK to choose the first graph, since the MLP can only run on one graph anyway.
                pi_latent = linear(pi_latent,
                                   'pi',
                                   network_graphs[0].number_of_edges() + 1,
                                   init_scale=0.01,
                                   init_bias=0.0)

            self._value_fn = linear(vf_latent, 'vf', 1)

            # self._proba_distribution, self._policy, self.q_value = \
            #     self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
            #                                                init_scale=0.01)
            self._proba_distribution, self._policy, self.q_value = \
                self.proba_distribution_no_pi_linear(pi_latent, vf_latent,
                                                     init_scale=0.01)

        self._setup_init()
Example #8
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 feature_extraction="attention_mlp",
                 n_object=2,
                 **kwargs):
        super(AttentionPolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse=reuse,
                             scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [256, 256]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            # assert feature_extraction == 'attention_mlp'
            if feature_extraction == 'attention_mlp':
                latent = attention_mlp_extractor2(tf.layers.flatten(
                    self.processed_obs),
                                                  n_object=n_object,
                                                  n_units=128)
                pi_latent, vf_latent = mlp_extractor(latent, net_arch, act_fun)
            elif feature_extraction == 'attention_mlp_particle':
                latent = attention_mlp_extractor_particle(tf.layers.flatten(
                    self.processed_obs),
                                                          n_object=n_object,
                                                          n_units=128)
                pi_latent, vf_latent = mlp_extractor(latent, net_arch, act_fun)
            elif feature_extraction == 'self_attention_mlp':
                pi_latent, vf_latent = self_attention_mlp_extractor(
                    tf.layers.flatten(self.processed_obs), n_object=n_object)
            else:
                raise NotImplementedError
            # if feature_extraction == "cnn":
            #     pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
            # else:
            #     pi_latent, vf_latent = mlp_extractor(tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()
Example #9
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 feature_extraction="cnn",
                 **kwargs):
        super(FeedForwardPolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse=reuse,
                             scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                activ = tf.nn.tanh

                observation_features = self.processed_obs[:, :, -8:]
                observation_features_flat = tf.layers.flatten(
                    observation_features)

                visual_features = self.processed_obs[:, :, :-8]
                visual_features = tf.reshape(visual_features,
                                             [-1, 128, 128, 15])

                vis_pi_latent = vis_vf_latent = cnn_extractor(
                    visual_features, **kwargs)
                vis_pi_latent = tf.reshape(vis_pi_latent, [-1, 1, 512])
                vis_vf_latent = tf.reshape(vis_vf_latent, [-1, 1, 512])

                meas_pi_h = activ(
                    linear(observation_features_flat,
                           "pi_meas_fc",
                           512,
                           init_scale=np.sqrt(2)))
                meas_pi_latent = tf.reshape(meas_pi_h, [-1, 1, 512])
                features = tf.layers.flatten(
                    tf.concat([vis_pi_latent, meas_pi_latent], axis=2))
                pi_latent = activ(
                    linear(features, "pi_fc", 128, init_scale=np.sqrt(2)))

                meas_vf_h = activ(
                    linear(observation_features_flat,
                           "vf_meas_fc",
                           512,
                           init_scale=np.sqrt(2)))
                meas_vf_latent = tf.reshape(meas_vf_h, [-1, 1, 512])
                features = tf.layers.flatten(
                    tf.concat([vis_vf_latent, meas_vf_latent], axis=2))
                vf_latent = activ(
                    linear(features, "vf_fc", 128, init_scale=np.sqrt(2)))

            else:
                pi_latent, vf_latent = mlp_extractor(
                    tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()
Example #10
def observation_input(ob_space,
                      batch_size=None,
                      name='Ob',
                      scale=False,
                      reuse=False):
    """
    Build observation input with encoding depending on the observation space type

    When using a Box ob_space, the input will be normalized to [0, 1] using the bounds ob_space.low and ob_space.high.

    :param ob_space: (Gym Space) The observation space
    :param batch_size: (int) batch size for input
                       (default is None, so that resulting input placeholder can take tensors with any batch size)
    :param name: (str) tensorflow variable name for input placeholder
    :param scale: (bool) whether or not to scale the input
    :param reuse: (bool) whether or not to reuse TensorFlow variables (only used by the Dict-space embedding below)
    :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor
    """
    if isinstance(ob_space, Discrete):
        observation_ph = tf.placeholder(shape=(batch_size, ),
                                        dtype=tf.int32,
                                        name=name)
        processed_observations = tf.cast(
            tf.one_hot(observation_ph, ob_space.n), tf.float32)
        return observation_ph, processed_observations

    elif isinstance(ob_space, Box):
        observation_ph = tf.placeholder(shape=(batch_size, ) + ob_space.shape,
                                        dtype=ob_space.dtype,
                                        name=name)
        processed_observations = tf.cast(observation_ph, tf.float32)
        # rescale to [0, 1] if the bounds are defined
        if (scale and not np.any(np.isinf(ob_space.low))
                and not np.any(np.isinf(ob_space.high))
                and np.any((ob_space.high - ob_space.low) != 0)):

            # equivalent to processed_observations / 255.0 when bounds are set to [0, 255]
            processed_observations = ((processed_observations - ob_space.low) /
                                      (ob_space.high - ob_space.low))
        return observation_ph, processed_observations

    elif isinstance(ob_space, MultiBinary):
        observation_ph = tf.placeholder(shape=(batch_size, ob_space.n),
                                        dtype=tf.int32,
                                        name=name)
        processed_observations = tf.cast(observation_ph, tf.float32)
        return observation_ph, processed_observations

    elif isinstance(ob_space, MultiDiscrete):
        observation_ph = tf.placeholder(shape=(batch_size, len(ob_space.nvec)),
                                        dtype=tf.int32,
                                        name=name)
        processed_observations = tf.concat(
            [
                tf.cast(tf.one_hot(input_split, ob_space.nvec[i]), tf.float32)
                for i, input_split in enumerate(
                    tf.split(observation_ph, len(ob_space.nvec), axis=-1))
            ],
            axis=-1)
        return observation_ph, processed_observations

    elif isinstance(ob_space, Dict):
        ob_space_dict = list(OrderedDict(ob_space.spaces))
        ob_space_length = np.array(
            [np.prod(np.array(ob_space[key].shape)) for key in ob_space_dict])

        observation_ph = tf.placeholder(shape=(batch_size,
                                               np.sum(ob_space_length)),
                                        dtype=tf.float32,
                                        name=name)

        observation_day_ph = observation_ph[:, :ob_space_length[1]]
        processed_observation_day = tf.cast(observation_day_ph, tf.float32)

        # observation_board_ph = observation_ph[:, (ob_space_length[1]+1):(ob_space_length[1]+ob_space_length[0])]
        # processed_observation_board = tf.cast(observation_board_ph, tf.float32)
        # # rescale to [1, 0] if the bounds are defined
        # if (scale and
        #         not np.any(np.isinf(ob_space["board_config"].low)) and
        #         not np.any(np.isinf(ob_space["board_config"].high)) and
        #         np.any((ob_space["board_config"].high - ob_space["board_config"].low) != 0)):
        #     # equivalent to processed_observations / 255.0 when bounds are set to [255, 0]
        #     processed_observation_board = ((processed_observation_board - ob_space["board_config"].low) /
        #                                     (ob_space["board_config"].high - ob_space["board_config"].low))

        observation_prevsales_ph = observation_ph[:, -ob_space_length[-1]:]
        processed_observation_prevsales = tf.cast(observation_prevsales_ph,
                                                  tf.float32)
        # rescale to [0, 1] if the bounds are defined
        if (scale and not np.any(np.isinf(ob_space["prev_sales"].low))
                and not np.any(np.isinf(ob_space["prev_sales"].high))
                and np.any((ob_space["prev_sales"].high -
                            ob_space["prev_sales"].low) != 0)):
            # equivalent to processed_observations / 255.0 when bounds are set to [0, 255]
            processed_observation_prevsales = (
                (processed_observation_prevsales - ob_space["prev_sales"].low)
                / (ob_space["prev_sales"].high - ob_space["prev_sales"].low))

        # TODO: these should be in params
        net_arch = None
        act_fun = tf.tanh

        if net_arch is None:
            net_arch = [32, 16]

        with tf.variable_scope("input_embedding", reuse=reuse):
            # with tf.variable_scope("board_embed", reuse=reuse):
            #     board_latent, _ = mlp_extractor(tf.layers.flatten(processed_observation_board), net_arch, act_fun)
            with tf.variable_scope("prevsales_embed", reuse=reuse):
                prevsales, _ = mlp_extractor(
                    tf.layers.flatten(processed_observation_prevsales),
                    net_arch, act_fun)
            processed_observations = tf.concat(
                [
                    processed_observation_day,
                    # board_latent,
                    prevsales
                ],
                axis=-1,
                name="final_obs")
        # TODO: watch out! the processed observation is passed as observation_ph
        return observation_ph, processed_observations
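As a concrete illustration of the Box branch above, here is what the function produces for an image-like space (the space below is hypothetical, not taken from the example):

import numpy as np
from gym.spaces import Box

image_space = Box(low=0, high=255, shape=(84, 84, 3), dtype=np.uint8)
obs_ph, processed_obs = observation_input(image_space, batch_size=None, scale=True)
# obs_ph is a uint8 placeholder of shape (None, 84, 84, 3); processed_obs is the
# same tensor cast to float32 and rescaled to [0, 1], i.e. divided by 255.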
Example #11
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 feature_extraction="mlp",
                 **kwargs):
        super(FeedForwardPolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse=reuse,
                             scale=(feature_extraction == "cnn"))
        print("OB_SPACE = ", ob_space)
        print("AC_SPACE = ", ac_space)
        print("N_ENV = ", n_env)
        print("N_STEPS = ", n_steps)
        print("N_BATCH = ", n_batch)
        print("REUSE = ", reuse)
        print("LAYERS = ", layers)
        print("NET_ARCH = ", net_arch)
        print("KWARGS = ", kwargs)

        self._pdtype = make_proba_dist_type(ac_space)

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]
        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                pi_latent = vf_latent = cnn_extractor(self.processed_obs,
                                                      **kwargs)
            else:
                pi_latent, vf_latent = mlp_extractor(
                    tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                           pi_init_scale=1.0, pi_init_bias=0.0, pi_init_std=0.125,
                                                           vf_init_scale=1.0, vf_init_bias=0.0)

        self._setup_init()
        return
Example #12
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 goal_num=1,
                 goal_net_arch=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 goal_encoder='mlp',
                 feature_extraction="mlp",
                 **kwargs):
        super(GoalsConditionedMLPPolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse=reuse,
                             scale=(feature_extraction == "mlp"))

        self.goal_encoder = goal_encoder
        # self._kwargs_check(feature_extraction, kwargs)
        self.name = "mlp_policy_" + goal_encoder

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]
        if goal_net_arch is None:
            goal_net_arch = [[64, 32], 2]

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            self.obs_goals = tf.placeholder(dtype=ob_space.dtype,
                                            shape=(None, ob_space.shape[0]),
                                            name='goal_states')
            obs_goals_reshape = self.obs_goals  #tf.reshape(tensor=self.obs_goals, shape=(-1, self.goal_num * ob_space.shape[0]))

            if goal_encoder == "mlp_sample":
                logging.info('mlp encoder with z sampling')
                self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(
                    obs_goals_reshape, goal_net_arch, act_fun)
                eps = tf.random_normal(shape=tf.shape(self.z_log_sigma_sq),
                                       mean=0,
                                       stddev=1,
                                       dtype=tf.float32)
                self.z_goal_sample = self.z_mu + tf.sqrt(
                    tf.exp(self.z_log_sigma_sq)) * eps
            if goal_encoder == "mlp":
                logging.info('mlp encoder with z mu')
                self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(
                    obs_goals_reshape, goal_net_arch, act_fun)
                self.z_goal_sample = self.z_mu
            if goal_encoder == "no_encoder" or goal_encoder == 'no_goal_proposing':
                logging.info('no encoder for goal obs')
                self.z_goal_sample = tf.stop_gradient(self.obs_goals)

            # self.z_goal_input = tf.placeholder(dtype=ob_space.dtype, shape=self.z_mu.shape, name='input_z_goal')
            self.z_goal_input = tf.placeholder(dtype=ob_space.dtype,
                                               shape=self.z_goal_sample.shape,
                                               name='input_z_goal')
            self.use_input_z = tf.placeholder_with_default(False,
                                                           shape=(),
                                                           name='use_input_z')

            def use_sample():
                return self.z_goal_sample

            def use_input():
                return self.z_goal_input

            self.z_goal = tf.cond(self.use_input_z, use_input, use_sample)

            if goal_encoder == 'no_goal_proposing':
                latent = tf.layers.flatten(self.processed_obs)
            else:
                latent = tf.concat(
                    [tf.layers.flatten(self.processed_obs), self.z_goal], 1)
            logging_info = 'latent shape' + str(latent.shape)
            logging.info(logging_info)

            if feature_extraction == "cnn":
                pi_latent = vf_latent = cnn_extractor(self.processed_obs,
                                                      **kwargs)
            else:
                pi_latent, vf_latent = mlp_extractor(latent, net_arch, act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        if goal_encoder == "mlp_sample":
            kl_coef = 0.01
            latent_loss = -0.5 * tf.reduce_sum(
                1 + self.z_log_sigma_sq - tf.square(self.z_mu) -
                tf.exp(self.z_log_sigma_sq),
                axis=1)

            self.latent_loss = tf.reduce_mean(latent_loss) * kl_coef
        else:
            self.latent_loss = 0

        self._setup_init()
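The `latent_loss` above is the standard VAE KL term: with `z_log_sigma_sq = log(sigma^2)`, the per-sample divergence KL(N(mu, sigma^2) || N(0, 1)) = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2), which is exactly what the `tf.reduce_sum` computes before it is averaged over the batch and scaled by `kl_coef`.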
Example #13
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.tanh,
                 cnn_extractor=nature_cnn,
                 feature_extraction="cnn",
                 **kwargs):

        source_policy_paths, SDW, no_bias = get_master_config(kwargs)

        super(AggregatePolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse=reuse,
                             scale=(feature_extraction == "cnn"))

        if isinstance(ac_space, spaces.Box):
            n_actions = self.ac_space.shape[0]
            action_dtype = tf.float32

        elif isinstance(ac_space, spaces.Discrete):
            n_actions = ac_space.n
            action_dtype = tf.int64

        else:
            raise NotImplementedError(
                "Multipolar is not implemented for the required action space")

        sources_actions = get_sources_actions(self.obs_ph, source_policy_paths,
                                              n_batch, n_actions, ac_space,
                                              action_dtype)
        self.pdtype = make_multipolar_proba_dist_type(ac_space,
                                                      sources_actions,
                                                      no_bias,
                                                      SDW,
                                                      summary=reuse)

        self._kwargs_check(feature_extraction, kwargs)

        if layers is not None:
            warnings.warn(
                "Usage of the `layers` parameter is deprecated! Use net_arch instead "
                "(it has a different semantics though).", DeprecationWarning)
            if net_arch is not None:
                warnings.warn(
                    "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                    DeprecationWarning)

        if net_arch is None:
            if layers is None:
                layers = [64, 64]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                pi_latent = vf_latent = cnn_extractor(self.processed_obs,
                                                      **kwargs)
            else:
                pi_latent, vf_latent = mlp_extractor(
                    tf.layers.flatten(self.processed_obs), net_arch, act_fun)

            self.value_fn = linear(vf_latent, 'vf', 1)

            self.proba_distribution, self.policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self.initial_state = None
        self._setup_init()
Example #14
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 layers=None,
                 net_arch=None,
                 act_fun=tf.nn.relu,
                 feature_extraction="mlp",
                 **kwargs):
        super(LstmCustomPolicy, self).__init__(sess,
                                               ob_space,
                                               ac_space,
                                               n_env,
                                               n_steps,
                                               n_batch,
                                               reuse=reuse)

        # extracted_features = tf.keras.layers.Dense(128, activation='relu')(self.processed_obs)
        # extracted_features = tf.keras.layers.MaxPooling1D(pool_size=2)(extracted_features)
        # extracted_features = tf.keras.layers.Conv1D(128, kernel_size=3, padding='same')(extracted_features)
        # extracted_features = tf.keras.layers.MaxPooling1D(pool_size=2)(extracted_features)

        # lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=128)
        # extracted_features = nature_cnn(self.processed_obs, **kwargs)

        self._kwargs_check(feature_extraction, kwargs)

        if net_arch is None:
            if layers is None:
                layers = [128, 64]
            net_arch = [dict(vf=layers, pi=layers)]

        with tf.variable_scope("model", reuse=reuse):
            # x_image = tf.keras.layers.Reshape((-1, 3, 1))(self.processed_obs['residue_chain'])  # batch_size  x board_x x board_y x 1
            activ = tf.nn.relu

            # encoded_chain = tf.keras.layers.LSTM(16)(self.processed_obs["residue_chain"])
            # x_image = tf.keras.layers.Reshape((64, 3, 1))(self.processed_obs['residue_chain'])  # batch_size  x board_x x board_y x 1
            #
            # embeded = tf.keras.layers.Conv2D(64,  kernel_size=3, padding='same', use_bias=False)(x_image)
            # embeded = tf.keras.layers.MaxPool2D(pool_size=(2, 1), strides= (2, 1))(embeded)
            # embeded = tf.keras.layers.Conv2D(64,  kernel_size=3, padding='same', use_bias=False)(embeded)
            # embeded = tf.keras.layers.Conv2D(64,  kernel_size=3, padding='valid', use_bias=False)(embeded)

            with_energy = tf.layers.flatten(self.processed_obs['backbone'])

            # with_energy = tf.keras.layers.Dense(64)(with_energy)
            with_energy = tf.keras.layers.Concatenate()([
                with_energy, self.processed_obs['protein_name'],
                self.processed_obs['residue_number'],
                self.processed_obs['step_to_end']
            ])
            pi_latent, vf_latent = mlp_extractor(with_energy, net_arch,
                                                 act_fun)

            self._value_fn = linear(vf_latent, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self._setup_init()