def conv_nn(input, dims1, dims2, size1, size2, k_size=3):
    """Run two reflect-padded conv + ELU stages, then resize the result.

    Args:
        input: 4-D tensor. Only axes 1 and 2 are padded, so this presumably
            expects a channels-last image batch -- confirm against callers.
        dims1: number of filters of the first convolution.
        dims2: number of filters of the second convolution.
        size1: first component of the target size passed to
            ``tf.image.resize``.
        size2: second component of the target size.
        k_size: side length of the square convolution kernel.

    Returns:
        The second stage's activation, resized to ``(size1, size2)`` with
        nearest-neighbour interpolation.
    """
    # The reflect padding used to be fixed at 1 pixel, which only matches
    # the 'VALID' convolution for k_size == 3.  Deriving it from k_size keeps
    # the default behaviour identical and preserves the spatial size for any
    # odd kernel.
    pad = (k_size - 1) // 2
    paddings = [[0, 0], [pad, pad], [pad, pad], [0, 0]]

    pp = tf.pad(tensor=input, paddings=paddings, mode="REFLECT")
    L1 = layers.conv2d(pp, dims1, [k_size, k_size], strides=[1, 1],
                       padding='VALID', activation=None)
    L1 = tf.nn.elu(L1)

    pp = tf.pad(tensor=L1, paddings=paddings, mode="REFLECT")
    L2 = layers.conv2d(pp, dims2, [k_size, k_size], strides=[1, 1],
                       padding='VALID', activation=None)
    L2 = tf.nn.elu(L2)

    L2 = tf.image.resize(L2, (size1, size2),
                         method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    return L2
def __call__(self, x, reuse=True):
    """Build the convolutional network on ``x`` inside ``self.name``'s scope.

    Args:
        x: input tensor; it is reshaped to ``[-1, 56, 56, 3]`` first.
        reuse: when truthy, the variable scope is switched to reuse mode
            before any layer is created.

    Returns:
        Output of the final dense layer, which has ``self.num_at - 1`` units
        and uses ``self.act_at`` as activation.
    """
    with tf.variable_scope(self.name) as scope:
        if reuse:
            scope.reuse_variables()

        net = tf.reshape(x, shape=[-1, 56, 56, 3])

        # Three strided conv stages named h1..h3; the filter count doubles
        # at each stage.  No normalisation is applied between convolution
        # and activation.
        stage_specs = (
            ('h1', self.nfilt),
            ('h2', self.nfilt * 2),
            ('h3', self.nfilt * 4),
        )
        for stage_name, n_filters in stage_specs:
            net = layers.conv2d(
                net,
                filters=n_filters,
                kernel_initializer=tf.keras.initializers.glorot_normal(),
                kernel_size=self.k,
                padding='same',
                strides=[self.s, self.s],
                activation=None,
                name=stage_name)
            net = tf.nn.leaky_relu(net)

        net = layers.flatten(net)
        net = layers.dense(
            net,
            self.num_at - 1,
            activation=self.act_at,
            kernel_initializer=tf.keras.initializers.glorot_normal())
        return net
def reward_prediction_mid(input_images):
    """Encode a list of feature maps into a single 16-unit feature vector.

    Every element of ``input_images`` goes through its own stack of three
    3x3 ReLU convolutions (16, 8 and 4 filters; the last two with stride 2),
    is flattened and projected to 8 units.  The per-input codes are stacked,
    flattened and passed through two dense ReLU layers (32 then 16 units).

    Args:
      input_images: iterable of image-like tensors (typically intermediate
        conv outputs); their spatial sizes may differ from one another.

    Returns:
      The output of the final 16-unit dense layer.
    """
    conv_plan = ((16, (1, 1)), (8, (2, 2)), (4, (2, 2)))

    codes = []
    for idx, feature_map in enumerate(input_images):
        hidden = feature_map
        for n_filters, stride in conv_plan:
            hidden = tfl.conv2d(hidden, n_filters, [3, 3], strides=stride,
                                activation=tf.nn.relu)
        hidden = tfl.flatten(hidden)
        codes.append(
            tfl.dense(hidden, 8, activation=tf.nn.relu,
                      name="rew_enc_%d" % idx))

    merged = tfl.flatten(tf.stack(codes, axis=1))
    merged = tfl.dense(merged, 32, activation=tf.nn.relu, name="rew_dense1")
    return tfl.dense(merged, 16, activation=tf.nn.relu, name="rew_dense2")
def reward_prediction_video_conv(frames, rewards, prediction_len):
    """Predict a reward vector from a list of frames and past rewards.

    The frames are concatenated on the last axis and reduced by four
    stride-2 3x3 ReLU convolutions; the rewards are concatenated on the last
    axis and passed through three dense ReLU layers.  Both branches are
    joined and mapped to ``prediction_len`` linear outputs.

    Args:
      frames: list of input image tensors.
      rewards: list of previously observed reward tensors.
      prediction_len: number of reward values to predict.

    Returns:
      The predictions with a trailing axis of size 1 appended.
    """
    # Image branch.
    image_feat = tf.concat(frames, axis=-1)
    for n_filters in (32, 32, 16, 8):
        image_feat = tfl.conv2d(image_feat, n_filters, [3, 3],
                                strides=(2, 2), activation=tf.nn.relu)
    image_feat = tfl.flatten(image_feat)

    # Reward branch.
    reward_feat = tf.concat(rewards, axis=-1)
    for n_units in (32, 16, 8):
        reward_feat = tfl.dense(reward_feat, n_units, activation=tf.nn.relu)

    # Joint head.
    joint = tf.concat([image_feat, reward_feat], axis=-1)
    for n_units in (32, 16):
        joint = tfl.dense(joint, n_units, activation=tf.nn.relu)
    joint = tfl.dense(joint, prediction_len, activation=None)
    return tf.expand_dims(joint, axis=-1)
def discriminator_L(input, reuse, name):
    """Build a four-stage convolutional discriminator with one linear output.

    Each stage reflect-pads by 2 pixels, applies a stride-2 5x5 'VALID'
    convolution (64, 128, 256, then 512 filters) and a leaky ReLU.  The final
    feature map is flattened and fed to a single-unit dense layer.

    Args:
        input: 4-D image tensor; axes 1 and 2 are the padded ones.
        reuse: whether the variables of scope ``name`` are reused.  When
            falsy, the scope is asserted not to be in reuse mode already.
        name: variable scope name.

    Returns:
        The output of the single-unit dense layer (no activation).
    """
    with tf.compat.v1.variable_scope(name):
        scope = tf.compat.v1.get_variable_scope()
        if reuse:
            scope.reuse_variables()
        else:
            assert scope.reuse is False

        net = input
        for n_filters in (64, 128, 256, 512):
            net = tf.pad(tensor=net,
                         paddings=[[0, 0], [2, 2], [2, 2], [0, 0]],
                         mode="REFLECT")
            net = layers.conv2d(net, n_filters, [5, 5], strides=2,
                                padding='VALID', activation=None)
            net = tf.nn.leaky_relu(net)

        net = layers.flatten(net)
        return tf.compat.v1.layers.dense(net, 1)
def reward_prediction_big(input_images, input_reward, action, latent,
                          action_injection, small_mode):
    """Build the convolutional trunk of the large reward predictor.

    The context frames are concatenated on the channel axis and
    layer-normalised.  Optional action, reward and latent inputs are injected
    into the feature map before two further stride-2 convolutions.

    Args:
      input_images: context frames.
      input_reward: context rewards, or None to skip the injection.
      action: next action, or None to skip the injection.
      latent: predicted latent for this frame, or None to skip the injection.
      action_injection: injection method forwarded for the action input.
      small_mode: when truthy, the first convolution (and its layer norm) is
        skipped and ``common.tinyify`` is told to shrink the channel counts.

    Returns:
      The output of the ``reward_conv3`` convolution.
    """
    conv_size = common.tinyify([32, 32, 16, 8], False, small_mode)

    net = tfcl.layer_norm(tf.concat(input_images, axis=3))

    if not small_mode:
        net = tfl.conv2d(net, conv_size[1], [3, 3], strides=(2, 2),
                         activation=tf.nn.relu, name="reward_conv1")
        net = tfcl.layer_norm(net)

    # Optional side inputs, injected in a fixed order.
    if action is not None:
        net = layers.inject_additional_input(
            net, action, "action_enc", action_injection)
    if input_reward is not None:
        net = layers.inject_additional_input(net, input_reward, "reward_enc")
    if latent is not None:
        # Flatten, then insert two singleton axes after the batch axis.
        latent_vec = tfl.flatten(latent)
        latent_vec = tf.expand_dims(latent_vec, axis=1)
        latent_vec = tf.expand_dims(latent_vec, axis=1)
        net = layers.inject_additional_input(net, latent_vec, "latent_enc")

    net = tfl.conv2d(net, conv_size[2], [3, 3], strides=(2, 2),
                     activation=tf.nn.relu, name="reward_conv2")
    net = tfcl.layer_norm(net)
    net = tfl.conv2d(net, conv_size[3], [3, 3], strides=(2, 2),
                     activation=tf.nn.relu, name="reward_conv3")
    return net
def decoder(input, size1, size2, reuse, name):
    """Decode a feature map into a 3-channel image clipped to [-1, 1].

    Four ``conv_nn`` stages progressively resize the features to a quarter,
    a half, and then twice the full ``(size1, size2)`` resolution while the
    filter count drops 128 -> 64 -> 32 -> 16.  A final 3x3 'SAME'
    convolution produces three channels.

    Args:
        input: feature tensor to decode.
        size1: first component of the full output size.
        size2: second component of the full output size.
        reuse: whether the variables of scope ``name`` are reused.  When
            falsy, the scope is asserted not to be in reuse mode already.
        name: variable scope name.

    Returns:
        The 3-channel output, clipped to the range [-1.0, 1.0].
    """
    with tf.compat.v1.variable_scope(name):
        scope = tf.compat.v1.get_variable_scope()
        if reuse:
            scope.reuse_variables()
        else:
            assert scope.reuse is False

        # (filters, target size 1, target size 2) for each conv_nn stage.
        stages = (
            (128, int(size1 / 4), int(size2 / 4)),
            (64, int(size1 / 2), int(size2 / 2)),
            (32, int(size1), int(size2)),
            (16, int(size1), int(size2)),
        )
        net = input
        for n_filters, target1, target2 in stages:
            net = conv_nn(net, n_filters, n_filters, target1, target2)

        image = layers.conv2d(net, 3, [3, 3], strides=[1, 1],
                              padding='SAME', activation=None)
        return tf.clip_by_value(image, -1.0, 1.0)
def atari_model(img_in, num_actions, scope, reuse=False):
    """Build a three-conv, two-dense network with one output per action.

    Args:
        img_in: input image batch tensor.
        num_actions: number of units of the final linear layer.
        scope: name of the enclosing variable scope.
        reuse: forwarded to ``tf.variable_scope`` to reuse existing
            variables.

    Returns:
        Tensor with ``num_actions`` outputs per input (no activation).
    """
    # The former ``print(tf.shape(out))`` debug calls were removed: at graph
    # construction time they only print a symbolic tensor and leave dead
    # ``Shape`` ops in the graph.  Variable names are unaffected.
    with tf.variable_scope(scope, reuse=reuse):
        out = img_in
        with tf.variable_scope("convnet"):
            out = layers.conv2d(out, filters=32, kernel_size=8,
                                strides=(4, 4), activation=tf.nn.relu)
            out = layers.conv2d(out, filters=64, kernel_size=4,
                                strides=(2, 2), activation=tf.nn.relu)
            out = layers.conv2d(out, filters=64, kernel_size=3,
                                strides=(1, 1), activation=tf.nn.relu)
            out = layers.flatten(out)
        with tf.variable_scope("action_value"):
            out = layers.dense(out, units=512, activation=tf.nn.relu)
            out = layers.dense(out, units=num_actions, activation=None)
        return out
def detection_layer(input, num_classes, img_size, anchors):
    """Apply the last convolutional layer and decode its raw predictions.

    A 1x1 convolution produces ``num_anchors * (5 + num_classes)`` channels
    per grid cell.  These are split into box centre, box size, confidence and
    class scores.  Centres are passed through a sigmoid, offset by the cell
    position and scaled by ``img_size / grid_size``; sizes are exponentiated
    and multiplied by the anchors; confidence and classes are passed through
    a sigmoid.

    Args:
        input: feature map whose static shape provides the grid size on
            axes 1 and 2.
        num_classes: number of object classes.
        img_size: network input size used to scale the box centres (a pair
            or anything broadcastable against the last axis).  ``None``
            falls back to ``(416, 416)``.
        anchors: sequence of anchor sizes, reshaped to ``[1, -1, 2]``.

    Returns:
        Tensor of shape ``[-1, grid cells * num_anchors, 5 + num_classes]``.
    """
    # The argument used to be unconditionally overwritten with (416, 416),
    # so callers could never decode for another input resolution.
    if img_size is None:
        img_size = (416, 416)

    num_anchors = len(anchors)
    predict = layers.conv2d(input, num_anchors * (5 + num_classes), 1,
                            strides=1)

    shape = predict.get_shape().as_list()
    grid_size = shape[1:3]
    grids_num = grid_size[0] * grid_size[1]
    bboxes = 5 + num_classes

    predict = tf.reshape(predict, [-1, grids_num, num_anchors, bboxes])
    box_centers, box_sizes, confidence, classes = tf.split(
        predict, [2, 2, 1, num_classes], axis=-1)

    box_centers = tf.nn.sigmoid(box_centers)
    confidence = tf.nn.sigmoid(confidence)

    # Cells are flattened row-major from axes (1, 2), so for flat index k the
    # column is k % grid_size[1] and the row is k // grid_size[1].  The
    # previous code swapped the two extents, which is only right for square
    # grids; results for square grids are unchanged.
    cols = tf.range(grid_size[1], dtype=tf.float32)
    rows = tf.range(grid_size[0], dtype=tf.float32)

    x_offset = tf.reshape(cols, (-1, 1))
    x_offset = tf.tile(x_offset, [grid_size[0], 1])

    y_offset = tf.reshape(rows, (1, -1))
    y_offset = tf.reshape(
        tf.transpose(tf.tile(y_offset, [grid_size[1], 1]), [1, 0]),
        [grids_num, 1])

    x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
    x_y_offset = tf.tile(tf.reshape(x_y_offset, [1, -1, 1, 2]),
                         [1, 1, num_anchors, 1])

    box_centers = (box_centers + x_y_offset) * (img_size) / grid_size

    anchors = tf.tile(tf.reshape(anchors, [1, -1, 2]), [grids_num, 1, 1])
    anchors = tf.cast(anchors, dtype=tf.float32)
    box_sizes = tf.exp(box_sizes) * anchors

    classes = tf.nn.sigmoid(classes)

    result_detect_result = tf.concat(
        [box_centers, box_sizes, confidence, classes], axis=-1)
    result_detect_result = tf.reshape(result_detect_result, [
        -1, grids_num * num_anchors,
        result_detect_result.get_shape().as_list()[-1]
    ])
    return result_detect_result
def DBL(input, filters, kernel_size, strides=1, training=False):
    """Convolution -> batch normalisation -> leaky ReLU.

    Args:
        input: input feature tensor.
        filters: number of convolution filters.
        kernel_size: convolution kernel size.
        strides: convolution stride.  For strides > 1 the input is first
            padded with the file's ``pad`` helper and the convolution uses
            'valid' padding; otherwise 'same' padding is used.
        training: forwarded to the batch-normalisation layer.  The default
            ``False`` matches the previous behaviour, where the argument was
            left at its default.  When set to True, the layer's update ops
            must also be run by the caller for the moving statistics to
            change.

    Returns:
        The activated feature tensor.
    """
    padding = 'same'
    if strides > 1:
        padding = 'valid'
        input = pad(input, kernel_size)

    # use_bias is a boolean option; ``None`` only worked because it is
    # falsy.  The bias is omitted because batch normalisation follows.
    input = layers.conv2d(input, filters, kernel_size, strides=strides,
                          padding=padding, use_bias=False)
    input = layers.batch_normalization(input, momentum=DECAY_BATCH_NORM,
                                       epsilon=EPSILON, training=training)
    input = tf.nn.leaky_relu(input, alpha=LEAKY_RELU)
    return input
def conv_latent_tower(images, time_axis, latent_channels=1, min_logvar=-5,
                      is_training=False, random_latent=False,
                      tiny_mode=False, small_mode=False):
    """Builds convolutional latent tower for stochastic model.

    At training time this tower generates a latent distribution (mean and
    log-variance) conditioned on the entire video.  This latent variable will
    be fed to the main tower as an extra variable to be used for future
    frames prediction.  At inference time the tower's output is discarded and
    zeros are returned for both mean and log-variance, i.e. a standard
    Gaussian.

    Args:
      images: tensor of ground truth image sequences
      time_axis: the time axis in images tensor
      latent_channels: number of latent channels
      min_logvar: minimum value for log_var
      is_training: whether or not it is training mode
      random_latent: whether to return the zero (standard Gaussian) latent
        instead of the predicted one during training.  May be a boolean
        tensor or a plain Python bool.
      tiny_mode: whether or not it is tiny_mode. tiny_mode sets the number
        of conv channels to 1 at each layer. useful for testing the
        integration tests.
      small_mode: whether or not it is small_mode. small mode is the same
        model with less conv and lstm layers and also lower number of
        channels. suitable for videos with less complexity and testing.

    Returns:
      latent_mean: predicted latent mean
      latent_logvar: predicted latent log variance
    """
    conv_size = common.tinyify([32, 64, 64], tiny_mode, small_mode)
    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
        images = common.to_float(images)
        # Fold the time axis into the channel axis.
        images = tf.unstack(images, axis=time_axis)
        images = tf.concat(images, axis=3)

        x = images
        x = make_even_size(x)
        x = tfl.conv2d(x, conv_size[0], [3, 3], strides=(2, 2),
                       padding="SAME", activation=tf.nn.relu,
                       name="latent_conv1")
        x = tfcl.layer_norm(x)
        if not small_mode:
            x = tfl.conv2d(x, conv_size[1], [3, 3], strides=(2, 2),
                           padding="SAME", activation=tf.nn.relu,
                           name="latent_conv2")
            x = tfcl.layer_norm(x)
        x = tfl.conv2d(x, conv_size[2], [3, 3], strides=(1, 1),
                       padding="SAME", activation=tf.nn.relu,
                       name="latent_conv3")
        x = tfcl.layer_norm(x)

        nc = latent_channels
        mean = tfl.conv2d(x, nc, [3, 3], strides=(2, 2), padding="SAME",
                          activation=None, name="latent_mean")
        logv = tfl.conv2d(x, nc, [3, 3], strides=(2, 2), padding="SAME",
                          activation=tf.nn.relu, name="latent_std")
        # The ReLU output is non-negative, so min_logvar is a lower bound.
        logvar = logv + min_logvar

        # No latent tower at inference time, just standard gaussian.
        if not is_training:
            return tf.zeros_like(mean), tf.zeros_like(logvar)

        # tf.cond rejects a Python bool predicate when building a graph, so
        # the documented default (random_latent=False) is resolved here in
        # Python instead.
        if isinstance(random_latent, bool):
            if random_latent:
                return tf.zeros_like(mean), tf.zeros_like(logvar)
            return mean, logvar

        # No latent in the first phase
        ret_mean, ret_logvar = tf.cond(
            random_latent,
            lambda: (tf.zeros_like(mean), tf.zeros_like(logvar)),
            lambda: (mean, logvar))
        return ret_mean, ret_logvar
def conv2d(input_, output_dim, ks=4, s=2, stddev=0.02, padding='SAME',
           name="conv2d"):
    """Linear, bias-free ``slim.conv2d`` inside its own variable scope.

    Args:
        input_: input tensor.
        output_dim: number of output channels.
        ks: kernel size.
        s: stride.
        stddev: standard deviation of the truncated-normal weight
            initializer.
        padding: padding mode forwarded to ``slim.conv2d``.
        name: variable scope name.

    Returns:
        The convolution output, with no activation applied.
    """
    with tf.variable_scope(name):
        weight_init = tf.truncated_normal_initializer(stddev=stddev)
        return slim.conv2d(
            input_,
            output_dim,
            ks,
            s,
            padding=padding,
            activation_fn=None,
            weights_initializer=weight_init,
            biases_initializer=None)
def __init__(self, myScope, h_size, agent, env, trace_length, batch_size, reuse=None, step=False):
    """Build the graph of a convolutional + LSTM network with value and policy heads.

    NOTE(review): the original indentation of this method was lost; the
    nesting below (what lives inside the `myScope` / 'input_proc' scopes and
    inside `if not step`) was reconstructed from the data flow -- confirm
    against the upstream file.

    Args:
        myScope: name of the outer variable scope; also used to select the
            trainable variables collected into `self.parameters`.
        h_size: forwarded to `lstm` as `nh`; the LSTM state tensors are
            `h_size * 2` wide.
        agent: not used by this method.
        env: must provide `ob_space_shape` (concatenated to the batch
            dimension, so presumably a list) and `NUM_ACTIONS`.
        trace_length: number of time steps per sequence; forced to 1 when
            `step` is true.
        batch_size: number of sequences per batch.
        reuse: forwarded to both variable scopes.
        step: when true, builds the single-step variant that is fed an
            explicit LSTM state and skips the training ops.
    """
    if step:
        trace_length = 1
    else:
        trace_length = trace_length
    with tf.variable_scope(myScope, reuse=reuse):
        self.batch_size = batch_size
        zero_state = tf.zeros((batch_size, h_size * 2), dtype=tf.float32)
        self.gamma_array = tf.placeholder(shape=[1, trace_length], dtype=tf.float32, name='gamma_array')
        self.gamma_array_inverse = tf.placeholder(shape=[1, trace_length], dtype=tf.float32, name='gamma_array_inv')
        self.lstm_state = tf.placeholder(shape=[batch_size, h_size * 2], dtype=tf.float32, name='lstm_state')
        if step:
            # Single-step mode: one observation per batch entry, and the
            # recurrent state is supplied through the placeholder.
            self.state_input = tf.placeholder(shape=[self.batch_size] + env.ob_space_shape, dtype=tf.float32, name='state_input')
            lstm_state = self.lstm_state
        else:
            # Sequence mode: all time steps are folded into the batch axis
            # and the recurrence starts from an all-zero state.
            self.state_input = tf.placeholder(
                shape=[batch_size * trace_length] + env.ob_space_shape, dtype=tf.float32, name='state_input')
            lstm_state = zero_state
        self.sample_return = tf.placeholder(shape=[None, trace_length], dtype=tf.float32, name='sample_return')
        self.sample_reward = tf.placeholder(shape=[None, trace_length], dtype=tf.float32, name='sample_reward')
        # Variables created in this scope carry 'input_proc' in their name
        # and are the ones later selected into `self.value_params`.
        with tf.variable_scope('input_proc', reuse=reuse):
            output = layers.conv2d(self.state_input, kernel_size=(3, 3), filters=20, activation=tf.nn.relu, padding='same')
            output = layers.conv2d(output, kernel_size=(3, 3), filters=20, activation=tf.nn.relu, padding='same')
            output = layers.flatten(output)
            print('values', output.get_shape())
            # One scalar per input row, regrouped to one row per sequence.
            self.value = tf.reshape(layers.dense(tf.nn.relu(output), 1), [-1, trace_length])
        # Both branches are equivalent because trace_length is 1 when
        # `step` is true.
        if step:
            output_seq = batch_to_seq(output, self.batch_size, 1)
        else:
            output_seq = batch_to_seq(output, self.batch_size, trace_length)
        output_seq, state_output = lstm(output_seq, lstm_state, scope='rnn', nh=h_size)
        output = seq_to_batch(output_seq)
        output = layers.dense(output, units=env.NUM_ACTIONS, activation=None)
        self.log_pi = tf.nn.log_softmax(output)
        self.lstm_state_output = state_output
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name='actions')
        self.actions_onehot = tf.one_hot(self.actions, env.NUM_ACTIONS, dtype=tf.float32)
        # Draw one action index per row from the logits in `self.log_pi`.
        predict = tf.multinomial(self.log_pi, 1)
        self.predict = tf.squeeze(predict)
        self.next_value = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='next_value')
        # [None, 1] x [1, trace_length]: scales the fed next value by each
        # entry of `gamma_array_inverse`.
        self.next_v = tf.matmul(self.next_value, self.gamma_array_inverse)
        self.target = self.sample_return + self.next_v
        self.td_error = tf.square(self.target - self.value) / 2
        self.loss = tf.reduce_mean(self.td_error)
    self.parameters = []
    self.value_params = []
    for i in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=myScope):
        if not ('value_params' in i.name):
            self.parameters.append(i)  # append i.name instead to keep only the name
        if 'input_proc' in i.name:
            self.value_params.append(i)
    if not step:
        # NOTE(review): this averages log_pi * one-hot over the action axis
        # (a sum would give the selected action's log-probability) -- confirm
        # the mean is intended.
        self.log_pi_action = tf.reduce_mean(tf.multiply(
            self.log_pi, self.actions_onehot), reduction_indices=1)
        self.log_pi_action_bs = tf.reduce_sum(
            tf.reshape(self.log_pi_action, [-1, trace_length]), 1)
        self.log_pi_action_bs_t = tf.reshape(
            self.log_pi_action, [self.batch_size, trace_length])
        # Plain gradient descent on the loss, restricted to the variables
        # collected in `self.value_params`.
        self.trainer = tf.train.GradientDescentOptimizer(learning_rate=1)
        self.updateModel = self.trainer.minimize(
            self.loss, var_list=self.value_params)
    self.setparams = SetFromFlat(self.parameters)
    self.getparams = GetFlat(self.parameters)
    self.param_len = len(self.parameters)
    for var in self.parameters:
        print(var.name, var.get_shape())
def encoder(input, reuse, name):
    """Encode an image with six plain and four dilated conv + ELU stages.

    Every stage reflect-pads its input so that the following 'VALID'
    convolution behaves like a same-size one (before striding).  Three of
    the plain stages use stride 2.  The dilated stages all have 256 filters
    with dilation rates 2, 4, 8 and 16.

    Args:
        input: 4-D image tensor; axes 1 and 2 are the padded ones.
        reuse: whether the variables of scope ``name`` are reused.  When
            falsy, the scope is asserted not to be in reuse mode already.
        name: variable scope name.

    Returns:
        The activation of the last dilated stage.
    """
    def reflect_pad(tensor, amount):
        # Pad axes 1 and 2 symmetrically by `amount` pixels.
        return tf.pad(
            tensor=tensor,
            paddings=[[0, 0], [amount, amount], [amount, amount], [0, 0]],
            mode="REFLECT")

    with tf.compat.v1.variable_scope(name):
        scope = tf.compat.v1.get_variable_scope()
        if reuse:
            scope.reuse_variables()
        else:
            assert scope.reuse is False

        # (filters, kernel side, stride) of the non-dilated stages.
        plain_stages = (
            (32, 5, 1),
            (64, 3, 2),
            (64, 3, 1),
            (128, 3, 2),
            (128, 3, 1),
            (256, 3, 2),
        )
        net = input
        for n_filters, kernel, stride in plain_stages:
            net = reflect_pad(net, kernel // 2)
            net = layers.conv2d(net, n_filters, [kernel, kernel],
                                strides=[stride, stride], padding='VALID',
                                activation=None)
            net = tf.nn.elu(net)

        # Dilated 3x3 stages: padding equal to the rate keeps the size.
        for rate in (2, 4, 8, 16):
            net = reflect_pad(net, rate)
            net = layers.conv2d(net, 256, [3, 3], dilation_rate=rate,
                                strides=[1, 1], padding='VALID',
                                activation=None)
            net = tf.nn.elu(net)

        return net
def contextual_block(bg_in, fg_in, mask, k_size, lamda, name, stride=1):
    """Fill the unmasked part of ``bg_in`` from its patches, guided by ``fg_in``.

    Patches of the masked background are compared with the patch around
    every position of ``fg_in``.  The normalised distances are turned into
    attention weights, and the weighted patches are pasted back with a
    transposed convolution.  The result is combined with the background
    through the mask, concatenated with ``bg_in`` and fused by a 1x1
    convolution followed by ELU.

    Args:
        bg_in: feature tensor with a fully static 4-D shape (the batch size
            is iterated in Python).
        fg_in: feature tensor compared against the background patches.
        mask: mask tensor; it is resized to ``bg_in``'s spatial size and only
            its first channel is used.
        k_size: side length of the square patches.
        lamda: scale applied to the scores before ``softmax``.
        name: variable scope name.
        stride: step between extracted background patches.

    Returns:
        Tensor with the same number of channels as ``bg_in``.
    """
    with tf.compat.v1.variable_scope(name):
        b, h, w, dims = [i for i in bg_in.get_shape()]
        temp = tf.image.resize(
            mask, (h, w), method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
        temp = tf.expand_dims(temp[:, :, :, 0], 3)
        mask_r = tf.tile(temp, [1, 1, 1, dims])
        bg = bg_in * mask_r

        # Number of patch centres along each axis.  This used to be counted
        # with an O(h * w) nested Python loop over the same two ranges.
        kn = int((k_size - 1) / 2)
        c = len(range(kn, h - kn, stride)) * len(range(kn, w - kn, stride))

        patch_len = k_size * k_size * dims
        patch1 = tf.image.extract_patches(
            bg, [1, k_size, k_size, 1], [1, stride, stride, 1],
            [1, 1, 1, 1], 'VALID')
        patch1 = tf.reshape(patch1, (b, 1, 1, c, patch_len))
        # -> (b, 1, 1, patch_len, c): per sample, a 1x1 conv filter bank.
        patch1 = tf.transpose(a=patch1, perm=[0, 1, 2, 4, 3])

        patch2 = tf.image.extract_patches(
            fg_in, [1, k_size, k_size, 1], [1, 1, 1, 1],
            [1, 1, 1, 1], 'SAME')

        per_sample = []
        for ib in range(b):
            k1 = patch1[ib, :, :, :, :]
            k1d = tf.reduce_sum(input_tensor=tf.square(k1), axis=2)
            k2 = tf.reshape(k1, (k_size, k_size, dims, c))

            ww = patch2[ib, :, :, :]
            wwd = tf.reduce_sum(input_tensor=tf.square(ww), axis=2,
                                keepdims=True)
            ft = tf.expand_dims(ww, 0)

            # Squared distance expanded as |a|^2 + |b|^2 - 2 * a.b, with the
            # dot products computed by the 1x1 convolution.
            CS = tf.nn.conv2d(input=ft, filters=k1, strides=[1, 1, 1, 1],
                              padding='SAME')
            tt = k1d + wwd
            DS1 = tf.expand_dims(tt, 0) - 2 * CS

            # Standardise over the patch axis, squash, and negate so that
            # smaller distances get larger scores.
            DS2 = (DS1 - tf.reduce_mean(
                input_tensor=DS1, axis=3, keepdims=True)) / reduce_std(
                    DS1, 3, True)
            DS2 = -1 * tf.nn.tanh(DS2)
            CA = softmax(lamda * DS2)

            ACLt = tf.nn.conv2d_transpose(
                CA, k2, output_shape=[1, h, w, dims],
                strides=[1, 1, 1, 1], padding='SAME')
            per_sample.append(ACLt / (k_size**2))

        # One concatenation instead of growing the tensor once per sample.
        ACL = tf.concat(per_sample, 0)
        ACL = bg + ACL * (1.0 - mask_r)

        con1 = tf.concat([bg_in, ACL], 3)
        ACL2 = layers.conv2d(con1, dims, [1, 1], strides=[1, 1],
                             padding='VALID', activation=None, name='ML')
        ACL2 = tf.nn.elu(ACL2)
        return ACL2
def __init__(self, state_size, action_size, learning_rate, name='DQLearner'):
    """Build the Q-network graph, its loss and its training op.

    Args:
        state_size: shape of one state, unpacked after the batch dimension
            of the input placeholder.
        action_size: number of actions; it sets the width of the action
            placeholder and of the output layer.
        learning_rate: learning rate of the RMSProp optimizer.
        name: variable scope that contains every op created here.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.learning_rate = learning_rate

    with v1.variable_scope(name):
        # Placeholders.  *state_size unpacks the tuple, e.g. (84, 84, 4)
        # gives [None, 84, 84, 4].
        self.inputs_ = v1.placeholder(tf.float32, [None, *state_size],
                                      name="inputs")
        # The action dimension used to be hard-coded to 3 here and in the
        # output layer, ignoring action_size.
        self.actions_ = v1.placeholder(tf.float32, [None, action_size],
                                       name="actions_")

        # Remember that target_Q is the R(s,a) + ymax Qhat(s', a')
        self.target_Q = v1.placeholder(tf.float32, [None], name="target")

        # First convnet: CNN -> BatchNormalization -> ELU
        # (an 84x84x4 input gives [20, 20, 32])
        self.conv1 = v1l.conv2d(
            inputs=self.inputs_, filters=32, kernel_size=[8, 8],
            strides=[4, 4], padding="VALID",
            kernel_initializer=v1.initializers.glorot_uniform(),
            name="conv1")
        # NOTE(review): training=True is fixed, so batch statistics are
        # used on every run of the graph -- confirm this is intended.
        self.conv1_batchnorm = v1l.batch_normalization(
            self.conv1, training=True, epsilon=1e-5, name='batch_norm1')
        self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out")

        # Second convnet: CNN -> BatchNormalization -> ELU  --> [9, 9, 64]
        self.conv2 = v1l.conv2d(
            inputs=self.conv1_out, filters=64, kernel_size=[4, 4],
            strides=[2, 2], padding="VALID",
            kernel_initializer=v1.initializers.glorot_uniform(),
            name="conv2")
        self.conv2_batchnorm = v1l.batch_normalization(
            self.conv2, training=True, epsilon=1e-5, name='batch_norm2')
        self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")

        # Third convnet: CNN -> BatchNormalization -> ELU  --> [3, 3, 128]
        self.conv3 = v1l.conv2d(
            inputs=self.conv2_out, filters=128, kernel_size=[4, 4],
            strides=[2, 2], padding="VALID",
            kernel_initializer=v1.initializers.glorot_uniform(),
            name="conv3")
        self.conv3_batchnorm = v1l.batch_normalization(
            self.conv3, training=True, epsilon=1e-5, name='batch_norm3')
        self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")

        self.flatten = v1l.flatten(self.conv3_out)  # --> [1152]

        self.fc = v1l.dense(
            inputs=self.flatten, units=512, activation=tf.nn.elu,
            kernel_initializer=v1.initializers.glorot_uniform(),
            name="fc1")

        # One linear Q-value per action.
        self.output = v1l.dense(
            inputs=self.fc,
            kernel_initializer=v1.initializers.glorot_uniform(),
            units=action_size, activation=None)

        # Q is our predicted Q value: the output masked by the fed action
        # vector and summed over the action axis.
        self.Q = tf.math.reduce_sum(
            tf.math.multiply(self.output, self.actions_), axis=1)

        # The loss is the mean squared difference between the target and
        # the predicted Q value.
        self.loss = tf.math.reduce_mean(
            tf.math.square(self.target_Q - self.Q))

        self.optimizer = v1.train.RMSPropOptimizer(
            self.learning_rate).minimize(self.loss)