import tensorflow as tf
import tensorflow.contrib.layers as layers


def build_q_func(network, hiddens=(128, 128), dueling=True, layer_norm=False, **network_kwargs):
    if isinstance(network, str):
        from common.models import get_network_builder
        network = get_network_builder(network)(**network_kwargs)

    def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
        with tf.variable_scope(scope, reuse=reuse):
            latent = network(input_placeholder)
            if isinstance(latent, tuple):
                if latent[1] is not None:
                    raise NotImplementedError("DQN is not compatible with recurrent policies yet")
                latent = latent[0]

            latent = layers.flatten(latent)

            with tf.variable_scope("action_value"):
                action_out = latent
                for hidden in hiddens:
                    action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)
                    if layer_norm:
                        action_out = layers.layer_norm(action_out, center=True, scale=True)
                    action_out = tf.nn.relu(action_out)
                action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)

            if dueling:
                with tf.variable_scope("state_value"):
                    state_out = latent
                    for hidden in hiddens:
                        state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)
                        if layer_norm:
                            state_out = layers.layer_norm(state_out, center=True, scale=True)
                        state_out = tf.nn.relu(state_out)
                    state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
                # dueling head: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
                action_scores_mean = tf.reduce_mean(action_scores, 1)
                action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
                q_out = state_score + action_scores_centered
            else:
                q_out = action_scores

            return q_out

    return q_func_builder
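
# Hypothetical usage sketch (not part of the original module): how the returned
# q_func_builder is typically wired up in TF1 graph mode. The observation shape,
# number of actions, and the 'mlp' network name are illustrative assumptions.
def _example_q_values_tf1():
    q_func = build_q_func('mlp', hiddens=(128, 128), dueling=True)
    obs_ph = tf.placeholder(tf.float32, shape=[None, 4], name='observation')
    # shape [None, num_actions]; with dueling=True this is V(s) plus centered advantages
    return q_func(obs_ph, num_actions=2, scope='deepq', reuse=False)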

import tensorflow as tf


def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
    if isinstance(network, str):
        from common.models import get_network_builder
        network = get_network_builder(network)(**network_kwargs)

    def q_func_builder(input_shape, num_actions):
        # the sub Functional model which does not include the top layer.
        model = network(input_shape)

        # wrap the sub Functional model with layers that compute action scores into another Functional model.
        latent = model.outputs
        if len(latent) > 1:
            if latent[1] is not None:
                raise NotImplementedError("DQN is not compatible with recurrent policies yet")
        latent = latent[0]

        latent = tf.keras.layers.Flatten()(latent)

        with tf.name_scope("action_value"):
            action_out = latent
            for hidden in hiddens:
                action_out = tf.keras.layers.Dense(units=hidden, activation=None)(action_out)
                if layer_norm:
                    action_out = tf.keras.layers.LayerNormalization(center=True, scale=True)(action_out)
                action_out = tf.nn.relu(action_out)
            action_scores = tf.keras.layers.Dense(units=num_actions, activation=None)(action_out)

        if dueling:
            with tf.name_scope("state_value"):
                state_out = latent
                for hidden in hiddens:
                    state_out = tf.keras.layers.Dense(units=hidden, activation=None)(state_out)
                    if layer_norm:
                        state_out = tf.keras.layers.LayerNormalization(center=True, scale=True)(state_out)
                    state_out = tf.nn.relu(state_out)
                state_score = tf.keras.layers.Dense(units=1, activation=None)(state_out)
            action_scores_mean = tf.reduce_mean(action_scores, 1)
            action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
            q_out = state_score + action_scores_centered
        else:
            q_out = action_scores

        return tf.keras.Model(inputs=model.inputs, outputs=[q_out])

    return q_func_builder
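
# Hypothetical usage sketch (not part of the original module): in the Keras version
# the builder returns a tf.keras.Model that maps observations to Q-values. The
# observation shape, number of actions, and the 'mlp' network name are illustrative
# assumptions about what common.models registers.
def _example_q_model_tf2():
    q_func = build_q_func('mlp', hiddens=[256], dueling=True)
    q_model = q_func(input_shape=(4,), num_actions=2)  # tf.keras.Model
    return q_model(tf.zeros([1, 4]))  # Q-values of shape [1, 2]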

def __init__(self, name, network='mlp', **network_kwargs):
    self.name = name
    self.network_builder = get_network_builder(network)(**network_kwargs)

def build_policy(env, policy_network, value_network=None, lyapunove_network=None,
                 normalize_observations=False, estimate_q=False, **policy_kwargs):
    if isinstance(policy_network, str):
        network_type = policy_network
        policy_network = get_network_builder(network_type)(**policy_kwargs)

    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, observ_placeholder_=None):
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)
        X_ = observ_placeholder_ if observ_placeholder_ is not None else observation_placeholder_(ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        if normalize_observations and X_.dtype == tf.float32:
            encoded_x_, rms_ = _normalize_clip_observation(X_)
            extra_tensors['rms_'] = rms_
        else:
            encoded_x_ = X_

        encoded_x = encode_observation(ob_space, encoded_x)
        encoded_x_ = encode_observation(ob_space, encoded_x_)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        _l_net = lyapunove_network

        if _l_net is None or _l_net == 'shared':
            lf_latent = policy_latent
        else:
            if _l_net == 'copy':
                _l_net = policy_network
            else:
                assert callable(_l_net)

            with tf.variable_scope('lf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with lyapunove_network=copy yet
                lf_latent = _l_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            observations_=X_,
            latent=policy_latent,
            latent_=policy_latent,
            vf_latent=vf_latent,
            lf_latent=lf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )

        return policy

    return policy_fn
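
# Hypothetical usage sketch (not part of the original module): with
# lyapunove_network='copy' the Lyapunov critic gets its own tower under the 'lf'
# scope, while leaving value_network=None keeps the value head on the shared policy
# latent. The env, the 'mlp' network name, and the default placeholder resolution
# are illustrative assumptions about the surrounding module.
def _example_lyapunov_policy(env, sess):
    policy_fn = build_policy(env, policy_network='mlp', lyapunove_network='copy')
    return policy_fn(nbatch=None, nsteps=1, sess=sess)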

def build_policy(
    env,
    policy_network,
    value_network=None,
    normalize_observations=False,
    estimate_q=False,
    **policy_kwargs
):
    if isinstance(policy_network, str):
        network_type = policy_network
        policy_network = get_network_builder(network_type)(**policy_kwargs)

    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
        ob_space = env.observation_space

        X = (
            observ_placeholder
            if observ_placeholder is not None
            else observation_placeholder(ob_space, batch_size=nbatch)
        )

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors["rms"] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope("pi", reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert (
                        nenv > 0
                    ), "Bad input for recurrent policy: batch size {} smaller than nsteps {}".format(
                        nbatch, nsteps
                    )
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == "shared":
            vf_latent = policy_latent
        else:
            if _v_net == "copy":
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope("vf", reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )

        return policy

    return policy_fn
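
# Hypothetical usage sketch (not part of the original module): any callable can be
# passed as value_network, exercising the `assert callable(_v_net)` branch so the
# critic gets its own tower under the 'vf' scope. The env, layer width, and the
# 'mlp' network name are illustrative assumptions.
def _example_custom_value_network(env, sess):
    def value_net(encoded_obs):
        return tf.layers.dense(encoded_obs, units=64, activation=tf.nn.tanh)

    policy_fn = build_policy(env, policy_network="mlp", value_network=value_net)
    return policy_fn(nbatch=None, nsteps=1, sess=sess)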