import numpy as np
import tensorflow as tf


def rollout_step(s, a, n_actions):
    # One step of a learned model rollout: predict a per-action reward,
    # transition the latent state (vi_trans is assumed to be defined
    # elsewhere), and predict a per-action value.
    state_shape = s.shape.as_list()[1:]
    op_trans_state = tf.make_template('trans_state', vi_trans, n_actions=n_actions)
    r = tf.layers.dense(s, n_actions, activation=None, name='reward',
                        kernel_initializer=ortho_init(1))
    s = op_trans_state(s)
    v = tf.reshape(
        tf.layers.dense(s, 1, activation=None, name='value',
                        kernel_initializer=ortho_init(1)),
        [-1, n_actions])
    if a is not None:
        # Gather the entries for the chosen action a from each per-action tensor.
        idx = tf.expand_dims(tf.range(0, tf.shape(r)[0]), 1)
        idx = tf.concat([idx, tf.cast(tf.reshape(a, [-1, 1]), tf.int32)], axis=1)
        r = tf.gather_nd(r, idx)
        s = tf.gather_nd(tf.reshape(s, [-1, n_actions] + state_shape), idx)
        v = tf.gather_nd(v, idx)
    return r, v, s
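
# Every snippet in this file calls ortho_init, which is not defined here. The
# sketch below is an assumption, matching the orthogonal initializer used in
# OpenAI Baselines.
def ortho_init(scale=1.0):
    def _ortho_init(shape, dtype, partition_info=None):
        shape = tuple(shape)
        if len(shape) == 2:        # dense weight matrix
            flat_shape = shape
        elif len(shape) == 4:      # conv filter, assumes NHWC
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v  # pick the factor with the right shape
        q = q.reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init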

def gru(xs, ms, s, scope, nh, init_scale=1.0, activ='tanh'):
    """Implements a gated recurrent unit, unrolled over a list of timesteps.

    xs: list of input tensors, one [nbatch, nin] tensor per step
    ms: list of episode-done masks; a 1 resets the hidden state
    s:  initial hidden state, [nbatch, nh]
    """
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    nsteps = len(xs)
    with tf.variable_scope(scope):
        wx1 = tf.get_variable("wx1", [nin, nh * 2], initializer=ortho_init(init_scale))
        wh1 = tf.get_variable("wh1", [nh, nh * 2], initializer=ortho_init(init_scale))
        b1 = tf.get_variable("b1", [nh * 2], initializer=tf.constant_initializer(0.0))
        wx2 = tf.get_variable("wx2", [nin, nh], initializer=ortho_init(init_scale))
        wh2 = tf.get_variable("wh2", [nh, nh], initializer=ortho_init(init_scale))
        b2 = tf.get_variable("b2", [nh], initializer=tf.constant_initializer(0.0))
    for idx, (x, m) in enumerate(zip(xs, ms)):
        s = s * (1 - m)  # reset the hidden state at episode boundaries
        y = tf.matmul(x, wx1) + tf.matmul(s, wh1) + b1
        z, r = tf.split(axis=1, num_or_size_splits=2, value=y)
        z = tf.nn.sigmoid(z)  # update gate
        r = tf.nn.sigmoid(r)  # reset gate
        h = tf.matmul(x, wx2) + tf.matmul(s * r, wh2) + b2
        if activ == 'tanh':
            h = tf.tanh(h)
        elif activ == 'relu':
            h = tf.nn.relu(h)
        else:
            raise ValueError(activ)
        s = (1 - z) * h + z * s
        xs[idx] = s
    return xs, s
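
# Usage sketch for gru (the shapes here are illustrative assumptions).
def _example_gru():
    nsteps, nbatch, nin, nh = 5, 4, 8, 16
    xs = [tf.placeholder(tf.float32, [nbatch, nin]) for _ in range(nsteps)]
    ms = [tf.placeholder(tf.float32, [nbatch, 1]) for _ in range(nsteps)]
    s0 = tf.zeros([nbatch, nh])
    hs, s_final = gru(xs, ms, s0, scope='gru1', nh=nh)
    return hs, s_final  # per-step hidden states and the final state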

def nature_cnn(input_shape, **conv_kwargs):
    """CNN from the Nature paper, built as a Keras model."""
    print('input shape is {}'.format(input_shape))
    x_input = tf.keras.Input(shape=input_shape, dtype=tf.uint8)
    h = x_input
    h = tf.cast(h, tf.float32) / 255.
    h = conv('c1', nf=32, rf=8, stride=4, activation='relu', init_scale=np.sqrt(2))(h)
    h2 = conv('c2', nf=64, rf=4, stride=2, activation='relu', init_scale=np.sqrt(2))(h)
    h3 = conv('c3', nf=64, rf=3, stride=1, activation='relu', init_scale=np.sqrt(2))(h2)
    h3 = tf.keras.layers.Flatten()(h3)
    h3 = tf.keras.layers.Dense(units=512, kernel_initializer=ortho_init(np.sqrt(2)),
                               name='fc1', activation='relu')(h3)
    network = tf.keras.Model(inputs=[x_input], outputs=[h3])
    return network
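
# nature_cnn assumes a Keras-style conv factory, distinct from the graph-mode
# conv(x, scope, ...) defined further down (in the original codebase the two
# live in separate modules, so the names do not collide). A minimal sketch,
# modeled on the tf2 branch of OpenAI Baselines:
def conv(scope, *, nf, rf, stride, activation, padding='valid', init_scale=1.0):
    return tf.keras.layers.Conv2D(
        filters=nf, kernel_size=rf, strides=stride, padding=padding,
        activation=activation, kernel_initializer=ortho_init(init_scale),
        name=scope)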

def deconv(x, scope, *, nf, rf, stride, output_shape, pad='VALID', init_scale=1.0,
           data_format='NHWC'):
    if data_format == 'NHWC':
        channel_ax = 3
        strides = [1, stride, stride, 1]
        bshape = [1, 1, 1, nf]
    elif data_format == 'NCHW':
        channel_ax = 1
        strides = [1, 1, stride, stride]
        bshape = [1, nf, 1, 1]
    else:
        raise NotImplementedError
    nin = x.get_shape()[channel_ax].value
    wshape = [rf, rf, nf, nin]
    with tf.variable_scope(scope):
        w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [1, nf, 1, 1], initializer=tf.constant_initializer(0.0))
        if data_format == 'NHWC':
            b = tf.reshape(b, bshape)
        return b + tf.nn.conv2d_transpose(x, w, output_shape, strides=strides,
                                          padding=pad, data_format=data_format)
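
# Usage sketch for deconv: with pad='VALID', conv2d_transpose produces
# out = (in - 1) * stride + rf per spatial axis, so 8 -> 18 here.
def _example_deconv():
    x = tf.placeholder(tf.float32, [16, 8, 8, 64])
    return deconv(x, 'd1', nf=32, rf=4, stride=2, output_shape=[16, 18, 18, 32])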

def ppo_cnn_model(state):
    kernel_initializer = ortho_init(np.sqrt(2))
    conv_kwargs = {
        "activation": tf.nn.relu,
        "kernel_initializer": kernel_initializer,
        # "padding": "same",
    }
    pool_kwargs = {
        # "padding": "same",
    }
    c1 = tf.layers.conv2d(state, filters=32, kernel_size=8, strides=1, name="c1", **conv_kwargs)
    p1 = tf.layers.max_pooling2d(c1, pool_size=2, strides=2, name="p1", **pool_kwargs)
    c2 = tf.layers.conv2d(p1, filters=64, kernel_size=4, strides=1, name="c2", **conv_kwargs)
    p2 = tf.layers.max_pooling2d(c2, pool_size=2, strides=2, name="p2", **pool_kwargs)
    c3 = tf.layers.conv2d(p2, filters=64, kernel_size=3, strides=1, name="c3", **conv_kwargs)
    p3 = tf.layers.max_pooling2d(c3, pool_size=2, strides=2, name="p3", **pool_kwargs)
    f = tf.layers.flatten(p3)
    return tf.layers.dense(f, units=512, activation=tf.nn.relu,
                           kernel_initializer=kernel_initializer)
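
# Shape sketch for ppo_cnn_model (assuming an 84x84x4 input): with VALID
# padding the spatial size goes 84 -> 77 -> 38 -> 35 -> 17 -> 15 -> 7, so the
# flatten sees 7 * 7 * 64 = 3136 features before the 512-unit dense layer.
def _example_ppo_cnn():
    state = tf.placeholder(tf.float32, [None, 84, 84, 4])
    return ppo_cnn_model(state)  # -> [None, 512]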

def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
        z = tf.matmul(x, w) + b
        h = act(z)
        return h

def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0, collections=None, trainable=True):
    # Variant of fc that returns the pre-activation and, for trainable
    # variables outside custom collections, registers L2 penalties in the
    # "l2_losses" collection. (In this flat file it shadows the fc above.)
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale),
                            collections=collections, trainable=trainable)
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias),
                            collections=collections, trainable=trainable)
        if trainable and not collections:
            tf.add_to_collection("l2_losses", tf.contrib.layers.l2_regularizer(1.0)(w))
            tf.add_to_collection("l2_losses", tf.contrib.layers.l2_regularizer(1.0)(b))
        return tf.matmul(x, w) + b
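
# Usage sketch for the fc variant above (summing "l2_losses" into a weight
# decay term is an assumption about how the collection is consumed):
def _example_fc():
    x = tf.placeholder(tf.float32, [None, 10])
    h = tf.nn.relu(fc(x, 'h1', nh=64, init_scale=np.sqrt(2)))
    l2_loss = tf.add_n(tf.get_collection('l2_losses'))
    return h, l2_loss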

def network_fn(input_shape):
    print('input shape is {}'.format(input_shape))
    x_input = tf.keras.Input(shape=input_shape)
    # h = tf.keras.layers.Flatten(x_input)
    h = x_input
    for i in range(num_layers):
        h = tf.keras.layers.Dense(units=num_hidden,
                                  kernel_initializer=ortho_init(np.sqrt(2)),
                                  name='mlp_fc{}'.format(i), activation=activation)(h)
    network = tf.keras.Model(inputs=[x_input], outputs=[h])
    return network
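
# network_fn above closes over num_layers, num_hidden and activation. A
# sketch of the enclosing builder (an assumption, modeled on baselines' mlp):
def mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
    def network_fn(input_shape):
        x_input = tf.keras.Input(shape=input_shape)
        h = x_input
        for i in range(num_layers):
            h = tf.keras.layers.Dense(units=num_hidden,
                                      kernel_initializer=ortho_init(np.sqrt(2)),
                                      name='mlp_fc{}'.format(i),
                                      activation=activation)(h)
        return tf.keras.Model(inputs=[x_input], outputs=[h])
    return network_fn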

def deconv_unequ_size(x, scope, *, nf, rf, stride, output_shape, pad='VALID',
                      init_scale=1.0, data_format='NHWC'):
    """Transposed convolution with a non-square kernel.

    rf: a 2-element list, [kernel_h, kernel_w]
    stride: a 2-element list, [stride_h, stride_w]
    """
    if data_format == 'NHWC':
        channel_ax = 3
        strides = [1, stride[0], stride[1], 1]
        bshape = [1, 1, 1, nf]
    elif data_format == 'NCHW':
        channel_ax = 1
        strides = [1, 1, stride[0], stride[1]]
        bshape = [1, nf, 1, 1]
    else:
        raise NotImplementedError
    nin = x.get_shape()[channel_ax].value
    wshape = [rf[0], rf[1], nf, nin]
    with tf.variable_scope(scope):
        w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [1, nf, 1, 1], initializer=tf.constant_initializer(0.0))
        if data_format == 'NHWC':
            b = tf.reshape(b, bshape)
        return b + tf.nn.conv2d_transpose(x, w, output_shape, strides=strides,
                                          padding=pad, data_format=data_format)
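
# Usage sketch for deconv_unequ_size: per axis, VALID padding gives
# out = (in - 1) * stride + kernel, so height 8 -> 17 and width 8 -> 10 here.
def _example_deconv_unequ():
    x = tf.placeholder(tf.float32, [16, 8, 8, 64])
    return deconv_unequ_size(x, 'd2', nf=32, rf=[3, 3], stride=[2, 1],
                             output_shape=[16, 17, 10, 32])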

def context_read(nm, s_flat, r, c_dim, initializer):
    # Input: neural map (nm), flattened state (s_flat), read vector (r).
    # Output: c-dimensional context read vector (c).
    scope = 'cr'
    with tf.variable_scope(scope):
        inp = tf.concat([s_flat, r], 1)
        no_rows_W = inp.get_shape()[1].value
        if initializer == 'ortho_init':
            W = tf.get_variable("W", [no_rows_W, c_dim],
                                initializer=ortho_init(np.sqrt(2)))
        elif initializer == 'random_normal':
            W = tf.get_variable("W", [no_rows_W, c_dim],
                                initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
        elif initializer == 'glorot_uniform':
            W = tf.get_variable("W", [no_rows_W, c_dim],
                                initializer=tf.glorot_uniform_initializer())
        else:
            raise ValueError(initializer)
        batch_size = s_flat.shape[0]
        nm_reshaped = tf.reshape(nm, [batch_size, -1, c_dim])
        q = tf.matmul(inp, W, name='cr_matmul')
        # Attention over map positions: score each map cell against the query q.
        a = tf.keras.backend.batch_dot(nm_reshaped, q, (2, 1))
        # Manual softmax, kept from the original for reference:
        # a_exp = tf.math.exp(a)
        # norm_fac = tf.reduce_sum(a_exp, 1)
        # norm_fac_expanded = tf.expand_dims(norm_fac, -1)
        # alpha = tf.math.divide(a_exp, norm_fac_expanded)
        alpha = tf.nn.softmax(a, name='cr_softmax')
        alpha_expanded = tf.expand_dims(alpha, -1)
        nm_scored = tf.math.multiply(alpha_expanded, nm_reshaped, name='cr_multiply')
        c = tf.reduce_sum(nm_scored, 1, name='cr_reduce_sum')
    return c
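
# Usage sketch for context_read (shapes are assumptions; the reshape requires
# a static batch size): a 10x10 neural map with c_dim channels.
def _example_context_read():
    batch, c_dim = 4, 32
    nm = tf.placeholder(tf.float32, [batch, 10, 10, c_dim])
    s_flat = tf.placeholder(tf.float32, [batch, 128])
    r = tf.placeholder(tf.float32, [batch, c_dim])
    return context_read(nm, s_flat, r, c_dim, initializer='ortho_init')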

def network_fn(X):
    assert X.get_shape()[1].value == 45
    batch_size = X.get_shape()[0].value
    # Split the 45 input features into nine 5-feature edge slices, the
    # trailing cloud feature, and the remaining features.
    h = list()
    for i in range(1, 10):
        h.append(tf.layers.flatten(X[:, 5 * (i - 1):5 * i]))
    h.append(X[:, -1:])
    h.append(tf.layers.flatten(X[:, 40:-1]))
    nin_edge = h[0].get_shape()[1].value
    # nin_cloud = h[8].get_shape()[1].value
    nh = 4
    with tf.variable_scope("forMEC_fc_edge") as scope:
        w = tf.get_variable("w", [nin_edge, nh], initializer=ortho_init(np.sqrt(2)))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
    # Apply the same (shared) edge layer to the first eight slices.
    for i in range(8):
        with tf.variable_scope("forMEC_fc_edge") as scope:
            scope.reuse_variables()
            w = tf.get_variable("w", [nin_edge, nh])
            b = tf.get_variable("b", [nh])
        h[i] = tf.reshape(tf.matmul(h[i], w) + b, [batch_size, -1])
    h_e = tf.concat(axis=1, values=h[:9])
    h_e = fc(h_e, 'forMEC_fc_edge2', nh=int(num_hidden) // 2, init_scale=np.sqrt(2))
    h_e = activation(h_e)
    h_c = fc(h[9], 'forMEC_fc_cloud', nh=nh, init_scale=np.sqrt(2))
    h_c = tf.reshape(h_c, [batch_size, -1])
    h = tf.concat(axis=1, values=[h_e, h_c])
    h = activation(h)
    h = fc(h, 'forMEC_fc2', nh=num_hidden, init_scale=np.sqrt(2))
    h = activation(h)
    return h
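
# Usage sketch (an assumption: num_hidden and activation come from the
# enclosing builder, and the batch size must be static for the reshapes):
def _example_forMEC():
    X = tf.placeholder(tf.float32, [32, 45])
    return network_fn(X)  # -> [32, num_hidden]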

def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC',
         collections=None, trainable=True):
    if data_format == 'NHWC':
        channel_ax = 3
        strides = [1, stride, stride, 1]
        bshape = [1, 1, 1, nf]
    elif data_format == 'NCHW':
        channel_ax = 1
        strides = [1, 1, stride, stride]
        bshape = [1, nf, 1, 1]
    else:
        raise NotImplementedError
    nin = x.get_shape()[channel_ax].value
    wshape = [rf, rf, nin, nf]
    with tf.variable_scope(scope):
        w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale),
                            collections=collections, trainable=trainable)
        b = tf.get_variable("b", [1, nf, 1, 1], initializer=tf.constant_initializer(0.0),
                            collections=collections, trainable=trainable)
        if trainable and not collections:
            tf.add_to_collection("l2_losses", tf.contrib.layers.l2_regularizer(1.0)(w))
            tf.add_to_collection("l2_losses", tf.contrib.layers.l2_regularizer(1.0)(b))
        if data_format == 'NHWC':
            b = tf.reshape(b, bshape)
        return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format)
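
# Usage sketch for the graph-mode conv (it returns the pre-activation, so the
# nonlinearity is applied by the caller):
def _example_conv():
    x = tf.placeholder(tf.float32, [None, 84, 84, 4])
    return tf.nn.relu(conv(x, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))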

def policy_fn(nbatch=None, nsteps=None, sess=None, state_placeholder=None,
              goal_placeholder=None, summary_stats=False):
    assert state_placeholder is not None
    X = state_placeholder
    extra_tensors = {}
    if normalize_observations and X.dtype == tf.float32:
        encoded_x, rms = _normalize_clip_observation(X)
        extra_tensors['rms'] = rms
    else:
        encoded_x = X
    encoded_x = tf.to_float(encoded_x)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(encoded_x)
        if goal_placeholder is not None:
            logger.info("concat obs and goals on latent")
            addition_layers = True
            if addition_layers:
                nin = policy_latent.get_shape()[1].value
                nh = 64
                activ = tf.tanh
                w = tf.get_variable("addition_fc_w", [nin, nh],
                                    initializer=ortho_init(np.sqrt(2)))
                b = tf.get_variable("addition_fc_b", [nh],
                                    initializer=tf.constant_initializer(0.))
                policy_latent = activ(tf.matmul(policy_latent, w) + b)
                logger.info('additional mlp on policy latent')
            if summary_stats:
                variable_summaries(policy_latent, 'policy_latent')
                variable_summaries(goal_placeholder, 'goal_placeholder')
            policy_latent = tf.concat([policy_latent, goal_placeholder],
                                      axis=-1, name="concat_latent")
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent
            if recurrent_tensors is not None:
                # Recurrent architecture: rebuild the network given the
                # number of parallel environments.
                nenv = nbatch // nsteps
                assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                extra_tensors.update(recurrent_tensors)

    _v_net = value_network
    if _v_net is None or _v_net == 'shared':
        vf_latent = policy_latent
    else:
        if _v_net == 'copy':
            _v_net = policy_network
        else:
            assert callable(_v_net)
        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO: recurrent architectures are not supported with value_network=copy yet
            vf_latent = _v_net(encoded_x)

    policy = PolicyWithValue(env=env, observations=state_placeholder,
                             goals=goal_placeholder, latent=policy_latent,
                             vf_latent=vf_latent, sess=sess,
                             estimate_q=estimate_q, **extra_tensors)
    return policy
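
# policy_fn closes over policy_network, value_network, env, estimate_q,
# normalize_observations and PolicyWithValue from its enclosing builder (as in
# baselines' build_policy). A usage sketch with assumed names and shapes:
def _example_policy(sess):
    X = tf.placeholder(tf.float32, [None, 11])  # 11 is an arbitrary obs dim
    return policy_fn(nbatch=256, nsteps=1, sess=sess, state_placeholder=X)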