Example #1
import tensorflow as tf
import tensorflow.contrib.layers as layers  # TF1.x contrib layers used below


def build_q_func(network,
                 hiddens=(128, 128),
                 dueling=True,
                 layer_norm=False,
                 **network_kwargs):
    if isinstance(network, str):
        from common.models import get_network_builder
        network = get_network_builder(network)(**network_kwargs)

    def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
        with tf.variable_scope(scope, reuse=reuse):
            latent = network(input_placeholder)
            if isinstance(latent, tuple):
                if latent[1] is not None:
                    raise NotImplementedError(
                        "DQN is not compatible with recurrent policies yet")
                latent = latent[0]

            latent = layers.flatten(latent)

            with tf.variable_scope("action_value"):
                action_out = latent
                for hidden in hiddens:
                    action_out = layers.fully_connected(action_out,
                                                        num_outputs=hidden,
                                                        activation_fn=None)
                    if layer_norm:
                        action_out = layers.layer_norm(action_out,
                                                       center=True,
                                                       scale=True)
                    action_out = tf.nn.relu(action_out)
                action_scores = layers.fully_connected(action_out,
                                                       num_outputs=num_actions,
                                                       activation_fn=None)

            if dueling:
                with tf.variable_scope("state_value"):
                    state_out = latent
                    for hidden in hiddens:
                        state_out = layers.fully_connected(state_out,
                                                           num_outputs=hidden,
                                                           activation_fn=None)
                        if layer_norm:
                            state_out = layers.layer_norm(state_out,
                                                          center=True,
                                                          scale=True)
                        state_out = tf.nn.relu(state_out)
                    state_score = layers.fully_connected(state_out,
                                                         num_outputs=1,
                                                         activation_fn=None)
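                # dueling head: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)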
                action_scores_mean = tf.reduce_mean(action_scores, 1)
                action_scores_centered = action_scores - tf.expand_dims(
                    action_scores_mean, 1)
                q_out = state_score + action_scores_centered
            else:
                q_out = action_scores
            return q_out

    return q_func_builder
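A hedged usage sketch for this TF1 variant (not part of the original example): the 'mlp' network name, the 4-dimensional observation placeholder, and the 2-action output are illustrative assumptions.

import tensorflow as tf

q_func = build_q_func('mlp', hiddens=(128, 128), dueling=True)
obs_ph = tf.placeholder(tf.float32, shape=(None, 4), name='observation')  # illustrative shape
q_values = q_func(obs_ph, num_actions=2, scope='deepq')  # tensor of shape (batch, 2)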
Example #2
import tensorflow as tf


def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
    if isinstance(network, str):
        from common.models import get_network_builder
        network = get_network_builder(network)(**network_kwargs)

    def q_func_builder(input_shape, num_actions):
        # the sub Functional model, which does not include the top (output) layer.
        model = network(input_shape)

        # wrap the sub-model's output with layers that compute action scores, forming another Functional model.
        latent = model.outputs
        if len(latent) > 1:
            if latent[1] is not None:
                raise NotImplementedError("DQN is not compatible with recurrent policies yet")
        latent = latent[0]

        latent = tf.keras.layers.Flatten()(latent)

        with tf.name_scope("action_value"):
            action_out = latent
            for hidden in hiddens:
                action_out = tf.keras.layers.Dense(units=hidden, activation=None)(action_out)
                if layer_norm:
                    action_out = tf.keras.layers.LayerNormalization(center=True, scale=True)(action_out)
                action_out = tf.nn.relu(action_out)
            action_scores = tf.keras.layers.Dense(units=num_actions, activation=None)(action_out)

        if dueling:
            with tf.name_scope("state_value"):
                state_out = latent
                for hidden in hiddens:
                    state_out = tf.keras.layers.Dense(units=hidden, activation=None)(state_out)
                    if layer_norm:
                        state_out = tf.keras.layers.LayerNormalization(center=True, scale=True)(state_out)
                    state_out = tf.nn.relu(state_out)
                state_score = tf.keras.layers.Dense(units=1, activation=None)(state_out)
            action_scores_mean = tf.reduce_mean(action_scores, 1)
            action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
            q_out = state_score + action_scores_centered
        else:
            q_out = action_scores
        return tf.keras.Model(inputs=model.inputs, outputs=[q_out])

    return q_func_builder
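A hedged usage sketch for the Keras variant, assuming the 'mlp' builder in common.models returns a Functional feature model when given an input shape; the shapes and action count are illustrative.

import numpy as np

q_func = build_q_func('mlp', hiddens=[256], dueling=True)
q_model = q_func(input_shape=(4,), num_actions=2)        # a tf.keras.Model
q_values = q_model(np.zeros((1, 4), dtype=np.float32))   # output of shape (1, 2)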
Example #3
    def __init__(self, name, network='mlp', **network_kwargs):
        self.name = name
        self.network_builder = get_network_builder(network)(**network_kwargs)
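This constructor is shown without its enclosing class. Purely for illustration, a minimal sketch of the kind of wrapper it typically belongs to; the class name FeatureModel and the __call__ method are assumptions, only the constructor comes from the example above.

import tensorflow as tf
from common.models import get_network_builder


class FeatureModel:
    def __init__(self, name, network='mlp', **network_kwargs):
        self.name = name
        self.network_builder = get_network_builder(network)(**network_kwargs)

    def __call__(self, obs):
        # apply the stored network builder under a reusable variable scope (assumed usage)
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            return self.network_builder(obs)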
Example #4
def build_policy(env,
                 policy_network,
                 value_network=None,
                 lyapunove_network=None,
                 normalize_observations=False,
                 estimate_q=False,
                 **policy_kwargs):
    if isinstance(policy_network, str):
        network_type = policy_network
        policy_network = get_network_builder(network_type)(**policy_kwargs)

    def policy_fn(nbatch=None,
                  nsteps=None,
                  sess=None,
                  observ_placeholder=None,
                  observ_placeholder_=None):
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
            ob_space, batch_size=nbatch)
        X_ = observ_placeholder_ if observ_placeholder_ is not None else observation_placeholder_(
            ob_space, batch_size=nbatch)
        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        if normalize_observations and X_.dtype == tf.float32:
            encoded_x_, rms_ = _normalize_clip_observation(X_)
            extra_tensors['rms_'] = rms_
        else:
            encoded_x_ = X_
        encoded_x_ = encode_observation(ob_space, encoded_x_)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                        nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(
                        encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)
        _l_net = lyapunove_network

        if _l_net is None or _l_net == 'shared':
            lf_latent = policy_latent
        else:
            if _l_net == 'copy':
                _l_net = policy_network
            else:
                assert callable(_l_net)

            with tf.variable_scope('lf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with lyapunove_network=copy yet
                lf_latent = _l_net(encoded_x)

        policy = PolicyWithValue(env=env,
                                 observations=X,
                                 observations_=X_,
                                 latent=policy_latent,
                                 latent_=policy_latent,
                                 vf_latent=vf_latent,
                                 lf_latent=lf_latent,
                                 sess=sess,
                                 estimate_q=estimate_q,
                                 **extra_tensors)
        return policy

    return policy_fn
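A hedged sketch of how this builder might be instantiated. The environment, network name, and session handling are illustrative assumptions, and the observation/Lyapunov placeholder helpers are assumed to be defined in the same module as build_policy.

import gym
import tensorflow as tf

env = gym.make('CartPole-v1')
policy_fn = build_policy(env, 'mlp', value_network='copy', lyapunove_network='copy')

with tf.Session() as sess:
    policy = policy_fn(nbatch=None, nsteps=None, sess=sess)
    sess.run(tf.global_variables_initializer())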
Example #5
File: policies.py  Project: oidelima/ppo
def build_policy(
    env,
    policy_network,
    value_network=None,
    normalize_observations=False,
    estimate_q=False,
    **policy_kwargs
):
    if isinstance(policy_network, str):
        network_type = policy_network
        policy_network = get_network_builder(network_type)(**policy_kwargs)

    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
        ob_space = env.observation_space

        X = (
            observ_placeholder
            if observ_placeholder is not None
            else observation_placeholder(ob_space, batch_size=nbatch)
        )

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors["rms"] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope("pi", reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert (
                        nenv > 0
                    ), "Bad input for recurrent policy: batch size {} smaller than nsteps {}".format(
                        nbatch, nsteps
                    )
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == "shared":
            vf_latent = policy_latent
        else:
            if _v_net == "copy":
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope("vf", reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )
        return policy

    return policy_fn
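Finally, a hedged usage sketch for the standard variant, assuming PolicyWithValue exposes the usual Baselines step() interface; the environment and the zero observation are illustrative.

import gym
import numpy as np
import tensorflow as tf

env = gym.make('CartPole-v1')
policy_fn = build_policy(env, 'mlp', value_network='shared')

with tf.Session() as sess:
    policy = policy_fn(nbatch=None, nsteps=None, sess=sess)
    sess.run(tf.global_variables_initializer())
    obs = np.zeros(env.observation_space.shape, dtype=np.float32)
    actions, values, states, neglogp = policy.step(obs)  # single illustrative step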