Example #1
    def __init__(self,
                 sess,
                 height,
                 width,
                 phi_length,
                 n_actions,
                 name,
                 gamma=0.99,
                 copy_interval=4,
                 optimizer='RMS',
                 learning_rate=0.00025,
                 epsilon=0.01,
                 decay=0.95,
                 momentum=0.,
                 l2_decay=0.0001,
                 error_clip=1.0,
                 slow=False,
                 tau=0.01,
                 verbose=False,
                 path='',
                 folder='_networks',
                 decay_learning_rate=False,
                 transfer=False):
        """ Initialize network """
        Network.__init__(self, sess, name=name)
        self.gamma = gamma
        self.slow = slow
        self.tau = tau
        self.name = name
        self.sess = sess
        self.path = path
        self.folder = folder
        self.copy_interval = copy_interval
        self.update_counter = 0
        self.decay_learning_rate = decay_learning_rate

        self.observation = tf.placeholder(tf.float32,
                                          [None, height, width, phi_length],
                                          name=self.name + '_observation')
        self.actions = tf.placeholder(tf.float32,
                                      shape=[None, n_actions],
                                      name=self.name +
                                      "_actions")  # one-hot matrix
        self.next_observation = tf.placeholder(
            tf.float32, [None, height, width, phi_length],
            name=self.name + '_t_next_observation')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=[None],
                                      name=self.name + "_rewards")
        self.terminals = tf.placeholder(tf.float32,
                                        shape=[None],
                                        name=self.name + "_terminals")

        self.slow_learnrate_vars = []
        self.fast_learnrate_vars = []

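        # Scale raw pixel observations from [0, 255] to [0, 1].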
        self.observation_n = tf.div(self.observation, 255.)
        self.next_observation_n = tf.div(self.next_observation, 255.)

        # q network model:
        self.is_training = tf.placeholder(tf.bool, [])

        with tf.name_scope("Conv1") as scope:
            kernel_shape = [8, 8, phi_length, 32]
            self.W_conv1 = self.weight_variable(phi_length, kernel_shape,
                                                'conv1')
            #self.b_conv1 = self.bias_variable(kernel_shape, 'conv1')
            self.h_conv1_bn = batch_norm(self.conv2d(self.observation_n,
                                                     self.W_conv1, 4),
                                         32,
                                         self.is_training,
                                         self.sess,
                                         slow=self.slow,
                                         tau=self.tau)
            self.h_conv1 = tf.nn.relu(self.h_conv1_bn.bnorm,
                                      name=self.name + '_conv1_activations')
            tf.add_to_collection('conv_weights', self.W_conv1)
            tf.add_to_collection('conv_output', self.h_conv1)
            if transfer:
                self.slow_learnrate_vars.append(self.W_conv1)
                self.slow_learnrate_vars.append(self.h_conv1_bn.scale)
                self.slow_learnrate_vars.append(self.h_conv1_bn.beta)

        with tf.name_scope("Conv2") as scope:
            kernel_shape = [4, 4, 32, 64]
            self.W_conv2 = self.weight_variable(32, kernel_shape, 'conv2')
            #self.b_conv2 = self.bias_variable(kernel_shape, 'conv2')
            self.h_conv2_bn = batch_norm(self.conv2d(self.h_conv1,
                                                     self.W_conv2, 2),
                                         64,
                                         self.is_training,
                                         self.sess,
                                         slow=self.slow,
                                         tau=self.tau)
            self.h_conv2 = tf.nn.relu(self.h_conv2_bn.bnorm,
                                      name=self.name + '_conv2_activations')
            tf.add_to_collection('conv_weights', self.W_conv2)
            tf.add_to_collection('conv_output', self.h_conv2)
            if transfer:
                self.slow_learnrate_vars.append(self.W_conv2)
                self.slow_learnrate_vars.append(self.h_conv2_bn.scale)
                self.slow_learnrate_vars.append(self.h_conv2_bn.beta)

        with tf.name_scope("Conv3") as scope:
            kernel_shape = [3, 3, 64, 64]
            self.W_conv3 = self.weight_variable(64, kernel_shape, 'conv3')
            #self.b_conv3 = self.bias_variable(kernel_shape, 'conv3')
            self.h_conv3_bn = batch_norm(self.conv2d(self.h_conv2,
                                                     self.W_conv3, 1),
                                         64,
                                         self.is_training,
                                         self.sess,
                                         slow=self.slow,
                                         tau=self.tau)
            self.h_conv3 = tf.nn.relu(self.h_conv3_bn.bnorm,
                                      name=self.name + '_conv3_activations')
            tf.add_to_collection('conv_weights', self.W_conv3)
            tf.add_to_collection('conv_output', self.h_conv3)
            if transfer:
                self.slow_learnrate_vars.append(self.W_conv3)
                self.slow_learnrate_vars.append(self.h_conv3_bn.scale)
                self.slow_learnrate_vars.append(self.h_conv3_bn.beta)

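        # Flatten the conv output; 3136 = 7 * 7 * 64, which assumes 84x84 inputs.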
        self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 3136])

        with tf.name_scope("FullyConnected1") as scope:
            kernel_shape = [3136, 512]
            self.W_fc1 = self.weight_variable_linear(kernel_shape, 'fc1')
            #self.b_fc1 = self.bias_variable(kernel_shape, 'fc1')
            self.h_fc1_bn = batch_norm(tf.matmul(self.h_conv3_flat,
                                                 self.W_fc1),
                                       512,
                                       self.is_training,
                                       self.sess,
                                       slow=self.slow,
                                       tau=self.tau,
                                       linear=True)
            self.h_fc1 = tf.nn.relu(self.h_fc1_bn.bnorm,
                                    name=self.name + '_fc1_activations')
            if transfer:
                self.fast_learnrate_vars.append(self.W_fc1)
                self.fast_learnrate_vars.append(self.h_fc1_bn.scale)
                self.fast_learnrate_vars.append(self.h_fc1_bn.beta)

        with tf.name_scope("FullyConnected2") as scope:
            kernel_shape = [512, n_actions]
            self.W_fc2 = self.weight_variable_linear(kernel_shape, 'fc2')
            self.b_fc2 = self.bias_variable_linear(kernel_shape, 'fc2')
            self.q_value = tf.add(tf.matmul(self.h_fc1, self.W_fc2),
                                  self.b_fc2,
                                  name=self.name + '_fc2_outputs')
            if transfer:
                self.fast_learnrate_vars.append(self.W_fc2)
                self.fast_learnrate_vars.append(self.b_fc2)

        if transfer:
            self.load_transfer_model(optimizer=optimizer.lower())
            # Scale down the last layer
            W_fc2_scaled = tf.scalar_mul(0.01, self.W_fc2)
            b_fc2_scaled = tf.scalar_mul(0.01, self.b_fc2)
            self.sess.run([
                self.W_fc2.assign(W_fc2_scaled),
                self.b_fc2.assign(b_fc2_scaled)
            ])

        if verbose:
            self.init_verbosity()

        # target q network model:
        self.t_is_training = tf.placeholder(tf.bool, [])
        with tf.name_scope("TConv1") as scope:
            kernel_shape = [8, 8, phi_length, 32]
            self.t_W_conv1 = self.weight_variable(phi_length, kernel_shape,
                                                  't_conv1')
            #self.t_b_conv1 = self.bias_variable(kernel_shape, 't_conv1')
            self.t_h_conv1_bn = batch_norm(self.conv2d(self.next_observation_n,
                                                       self.t_W_conv1, 4),
                                           32,
                                           self.t_is_training,
                                           self.sess,
                                           parForTarget=self.h_conv1_bn,
                                           slow=self.slow,
                                           tau=self.tau)
            self.t_h_conv1 = tf.nn.relu(self.t_h_conv1_bn.bnorm,
                                        name=self.name +
                                        '_t_conv1_activations')

        with tf.name_scope("TConv2") as scope:
            kernel_shape = [4, 4, 32, 64]
            self.t_W_conv2 = self.weight_variable(32, kernel_shape, 't_conv2')
            #self.t_b_conv2 = self.bias_variable(kernel_shape, 't_conv2')
            self.t_h_conv2_bn = batch_norm(self.conv2d(self.t_h_conv1,
                                                       self.t_W_conv2, 2),
                                           64,
                                           self.t_is_training,
                                           self.sess,
                                           parForTarget=self.h_conv2_bn,
                                           slow=self.slow,
                                           tau=self.tau)
            self.t_h_conv2 = tf.nn.relu(self.t_h_conv2_bn.bnorm,
                                        name=self.name +
                                        '_t_conv2_activations')

        with tf.name_scope("TConv3") as scope:
            kernel_shape = [3, 3, 64, 64]
            self.t_W_conv3 = self.weight_variable(64, kernel_shape, 't_conv3')
            #self.t_b_conv3 = self.bias_variable(kernel_shape, 't_conv3')
            self.t_h_conv3_bn = batch_norm(self.conv2d(self.t_h_conv2,
                                                       self.t_W_conv3, 1),
                                           64,
                                           self.t_is_training,
                                           self.sess,
                                           parForTarget=self.h_conv3_bn,
                                           slow=self.slow,
                                           tau=self.tau)
            self.t_h_conv3 = tf.nn.relu(self.t_h_conv3_bn.bnorm,
                                        name=self.name +
                                        '_t_conv3_activations')

        self.t_h_conv3_flat = tf.reshape(self.t_h_conv3, [-1, 3136])

        with tf.name_scope("TFullyConnected1") as scope:
            kernel_shape = [3136, 512]
            self.t_W_fc1 = self.weight_variable_linear(kernel_shape, 't_fc1')
            #self.t_b_fc1 = self.bias_variable(kernel_shape, 't_fc1')
            self.t_h_fc1_bn = batch_norm(tf.matmul(self.t_h_conv3_flat,
                                                   self.t_W_fc1),
                                         512,
                                         self.t_is_training,
                                         self.sess,
                                         parForTarget=self.h_fc1_bn,
                                         slow=self.slow,
                                         tau=self.tau,
                                         linear=True)
            self.t_h_fc1 = tf.nn.relu(self.t_h_fc1_bn.bnorm,
                                      name=self.name + '_t_fc1_activations')

        with tf.name_scope("TFullyConnected2") as scope:
            kernel_shape = [512, n_actions]
            self.t_W_fc2 = self.weight_variable_linear(kernel_shape, 't_fc2')
            self.t_b_fc2 = self.bias_variable_linear(kernel_shape, 't_fc2')
            self.t_q_value = tf.add(tf.matmul(self.t_h_fc1, self.t_W_fc2),
                                    self.t_b_fc2,
                                    name=self.name + '_t_fc2_outputs')

        if transfer:
            # only initialize tensor variables that are not loaded from the transfer model
            #self.sess.run(tf.variables_initializer(fast_learnrate_vars))
            self._global_vars_temp = set(tf.global_variables())

        # cost of q network
        #self.l2_regularizer_loss = l2_decay * (tf.reduce_sum(tf.pow(self.W_conv1, 2)) + tf.reduce_sum(tf.pow(self.W_conv2, 2)) + tf.reduce_sum(tf.pow(self.W_conv3, 2))  + tf.reduce_sum(tf.pow(self.W_fc1, 2)) + tf.reduce_sum(tf.pow(self.W_fc2, 2)))
        self.cost = self.build_loss(error_clip,
                                    n_actions)  #+ self.l2_regularizer_loss
        # self.parameters = [
        #     self.W_conv1, self.h_conv1_bn.scale, self.h_conv1_bn.beta,
        #     self.W_conv2, self.h_conv2_bn.scale, self.h_conv2_bn.beta,
        #     self.W_conv3, self.h_conv3_bn.scale, self.h_conv3_bn.beta,
        #     self.W_fc1, self.h_fc1_bn.scale, self.h_fc1_bn.beta,
        #     self.W_fc2, self.b_fc2,
        # ]
        with tf.name_scope("Train") as scope:
            if optimizer == "Graves":
                # Graves-style RMSProp, as used in the Nature DQN paper
                self.train_step, self.grads_vars = graves_rmsprop_optimizer(
                    self.cost, learning_rate, decay, epsilon, 1)
            else:
                if optimizer == "Adam":
                    self.opt = tf.train.AdamOptimizer(
                        learning_rate=learning_rate, epsilon=epsilon)
                elif optimizer == "RMS":
                    # TensorFlow's built-in RMSProp optimizer
                    self.opt = tf.train.RMSPropOptimizer(learning_rate,
                                                         decay=decay,
                                                         momentum=momentum,
                                                         epsilon=epsilon)
                else:
                    print(colored("Unknown Optimizer!", "red"))
                    sys.exit()

                self.grads_vars = self.opt.compute_gradients(self.cost)
                grads = []
                params = []
                for p in self.grads_vars:
                    if p[0] is None:
                        continue
                    grads.append(p[0])
                    params.append(p[1])
                #grads = tf.clip_by_global_norm(grads, 1)[0]
                self.grads_vars_updates = zip(grads, params)
                self.train_step = self.opt.apply_gradients(
                    self.grads_vars_updates)

            # for grad, var in self.grads_vars:
            #     if grad == None:
            #         continue
            #     tf.summary.histogram(var.op.name + '/gradients', grad)

        if transfer:
            vars_diff = set(tf.global_variables()) - self._global_vars_temp
            self.sess.run(tf.variables_initializer(vars_diff))
            self.sess.run(
                tf.variables_initializer([
                    self.t_h_conv1_bn.pop_mean, self.t_h_conv1_bn.pop_var,
                    self.t_h_conv2_bn.pop_mean, self.t_h_conv2_bn.pop_var,
                    self.t_h_conv3_bn.pop_mean, self.t_h_conv3_bn.pop_var,
                    self.t_h_fc1_bn.pop_mean, self.t_h_fc1_bn.pop_var
                ]))
        else:
            # initialize all tensor variable parameters
            self.sess.run(tf.global_variables_initializer())

        # Make sure the q and target networks start with the same parameters by copying them over.
        self.sess.run([
            self.t_W_conv1.assign(
                self.W_conv1),  #self.t_b_conv1.assign(self.b_conv1),
            self.t_W_conv2.assign(
                self.W_conv2),  #self.t_b_conv2.assign(self.b_conv2),
            self.t_W_conv3.assign(
                self.W_conv3),  #self.t_b_conv3.assign(self.b_conv3),
            self.t_W_fc1.assign(self.W_fc1),  #self.t_b_fc1.assign(self.b_fc1),
            self.t_W_fc2.assign(self.W_fc2),
            self.t_b_fc2.assign(self.b_fc2),
            self.t_h_conv1_bn.scale.assign(self.h_conv1_bn.scale),
            self.t_h_conv1_bn.beta.assign(self.h_conv1_bn.beta),
            self.t_h_conv2_bn.scale.assign(self.h_conv2_bn.scale),
            self.t_h_conv2_bn.beta.assign(self.h_conv2_bn.beta),
            self.t_h_conv3_bn.scale.assign(self.h_conv3_bn.scale),
            self.t_h_conv3_bn.beta.assign(self.h_conv3_bn.beta),
            self.t_h_fc1_bn.scale.assign(self.h_fc1_bn.scale),
            self.t_h_fc1_bn.beta.assign(self.h_fc1_bn.beta)
        ])

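        # Target update op: a soft/Polyak update (target <- tau*q + (1-tau)*target)
        # when slow=True, otherwise a hard parameter copy.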
        if self.slow:
            self.update_target_op = [
                self.t_W_conv1.assign(self.tau * self.W_conv1 +
                                      (1 - self.tau) * self.t_W_conv1
                                      ),  #self.t_b_conv1.assign(self.b_conv1),
                self.t_W_conv2.assign(self.tau * self.W_conv2 +
                                      (1 - self.tau) * self.t_W_conv2
                                      ),  #self.t_b_conv2.assign(self.b_conv2),
                self.t_W_conv3.assign(self.tau * self.W_conv3 +
                                      (1 - self.tau) * self.t_W_conv3
                                      ),  #self.t_b_conv3.assign(self.b_conv3),
                self.t_W_fc1.assign(self.tau * self.W_fc1 +
                                    (1 - self.tau) * self.t_W_fc1
                                    ),  #self.t_b_fc1.assign(self.b_fc1),
                self.t_W_fc2.assign(self.tau * self.W_fc2 +
                                    (1 - self.tau) * self.t_W_fc2),
                self.t_b_fc2.assign(self.tau * self.b_fc2 +
                                    (1 - self.tau) * self.t_b_fc2),
                self.t_h_conv1_bn.updateTarget,
                self.t_h_conv2_bn.updateTarget,
                self.t_h_conv3_bn.updateTarget,
                self.t_h_fc1_bn.updateTarget
            ]
        else:
            self.update_target_op = [
                self.t_W_conv1.assign(
                    self.W_conv1),  #self.t_b_conv1.assign(self.b_conv1),
                self.t_W_conv2.assign(
                    self.W_conv2),  #self.t_b_conv2.assign(self.b_conv2),
                self.t_W_conv3.assign(
                    self.W_conv3),  #self.t_b_conv3.assign(self.b_conv3),
                self.t_W_fc1.assign(
                    self.W_fc1),  #self.t_b_fc1.assign(self.b_fc1),
                self.t_W_fc2.assign(self.W_fc2),
                self.t_b_fc2.assign(self.b_fc2),
                self.t_h_conv1_bn.updateTarget,
                self.t_h_conv2_bn.updateTarget,
                self.t_h_conv3_bn.updateTarget,
                self.t_h_fc1_bn.updateTarget
            ]

        self.saver = tf.train.Saver()
        self.merged = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(
            self.path + self.folder + '/log_tb', self.sess.graph)
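A minimal, self-contained sketch of the soft ("slow") target update built in update_target_op above, assuming TensorFlow 1.x; the variables online_w and target_w are illustrative stand-ins, not names from the example:

import tensorflow as tf

tau = 0.01
online_w = tf.Variable(tf.random_normal([3, 3]), name='online_w')
target_w = tf.Variable(tf.zeros([3, 3]), name='target_w')
# target <- tau * online + (1 - tau) * target, as in update_target_op
soft_update = target_w.assign(tau * online_w + (1.0 - tau) * target_w)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(soft_update)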
Example #2
    def __init__(self,
                 sess,
                 height,
                 width,
                 phi_length,
                 n_actions,
                 name,
                 gamma=0.99,
                 optimizer='RMS',
                 learning_rate=0.00025,
                 epsilon=0.01,
                 decay=0.95,
                 momentum=0.,
                 l2_decay=0.0001,
                 slow=False,
                 tau=0.01,
                 verbose=False,
                 folder='_networks',
                 transfer=False,
                 transfer_folder='',
                 not_transfer_conv2=False,
                 not_transfer_conv3=False,
                 not_transfer_fc1=False,
                 not_transfer_fc2=False,
                 device="/cpu:0",
                 transformed_bellman=False,
                 target_consistency_loss=False,
                 clip_norm=None,
                 weight_decay=None):
        """ Initialize network """
        Network.__init__(self, sess, name=name)
        self.gamma = gamma
        self.slow = slow
        self.tau = tau
        self.name = name
        self.sess = sess
        self.folder = folder
        self._device = device
        self.transformed_bellman = transformed_bellman
        self.target_consistency_loss = target_consistency_loss
        self.verbose = verbose

        self.observation = tf.placeholder(tf.float32,
                                          [None, height, width, phi_length],
                                          name='observation')
        self.observation_n = tf.div(self.observation, 255.)

        with tf.device(self._device), tf.variable_scope('net_-1') as scope:
            # q network model:
            self.W_conv1, self.b_conv1 = self.conv_variable(
                [8, 8, phi_length, 32], layer_name='conv1', gain=np.sqrt(2))
            self.h_conv1 = tf.nn.relu(tf.add(
                self.conv2d(self.observation_n, self.W_conv1, 4),
                self.b_conv1),
                                      name=self.name + '_conv1_activations')
            tf.add_to_collection('conv_weights', self.W_conv1)
            tf.add_to_collection('conv_output', self.h_conv1)

            self.W_conv2, self.b_conv2 = self.conv_variable([4, 4, 32, 64],
                                                            layer_name='conv2',
                                                            gain=np.sqrt(2))
            self.h_conv2 = tf.nn.relu(tf.add(
                self.conv2d(self.h_conv1, self.W_conv2, 2), self.b_conv2),
                                      name=self.name + '_conv2_activations')
            tf.add_to_collection('conv_weights', self.W_conv2)
            tf.add_to_collection('conv_output', self.h_conv2)

            self.W_conv3, self.b_conv3 = self.conv_variable([3, 3, 64, 64],
                                                            layer_name='conv3',
                                                            gain=np.sqrt(2))
            self.h_conv3 = tf.nn.relu(tf.add(
                self.conv2d(self.h_conv2, self.W_conv3, 1), self.b_conv3),
                                      name=self.name + '_conv3_activations')
            tf.add_to_collection('conv_weights', self.W_conv3)
            tf.add_to_collection('conv_output', self.h_conv3)

            self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 3136])

            self.W_fc1, self.b_fc1 = self.fc_variable([3136, 512],
                                                      layer_name='fc1',
                                                      gain=np.sqrt(2))
            self.h_fc1 = tf.nn.relu(tf.add(
                tf.matmul(self.h_conv3_flat, self.W_fc1), self.b_fc1),
                                    name=self.name + '_fc1_activations')

            self.W_fc2, self.b_fc2 = self.fc_variable([512, n_actions],
                                                      layer_name='fc2')
            self.q_value = tf.add(tf.matmul(self.h_fc1, self.W_fc2),
                                  self.b_fc2,
                                  name=self.name + '_fc2_outputs')

        if self.target_consistency_loss:
            self.tc_observation = tf.placeholder(
                tf.float32, [None, height, width, phi_length],
                name='observation_tc')
            self.tc_observation_n = tf.div(self.tc_observation, 255.)

            with tf.device(self._device), tf.variable_scope(
                    'net_-1', reuse=True) as scope:
                # Same weights as 'net_-1' (reuse=True): a second forward pass for the target-consistency loss.
                tc_W_conv1, tc_b_conv1 = self.conv_variable(
                    [8, 8, phi_length, 32],
                    layer_name='conv1',
                    gain=np.sqrt(2))
                tc_h_conv1 = tf.nn.relu(tf.add(
                    self.conv2d(self.tc_observation_n, tc_W_conv1, 4),
                    tc_b_conv1),
                                        name=self.name + '_conv1_activations')

                tc_W_conv2, tc_b_conv2 = self.conv_variable([4, 4, 32, 64],
                                                            layer_name='conv2',
                                                            gain=np.sqrt(2))
                tc_h_conv2 = tf.nn.relu(tf.add(
                    self.conv2d(tc_h_conv1, tc_W_conv2, 2), tc_b_conv2),
                                        name=self.name + '_conv2_activations')

                tc_W_conv3, tc_b_conv3 = self.conv_variable([3, 3, 64, 64],
                                                            layer_name='conv3',
                                                            gain=np.sqrt(2))
                tc_h_conv3 = tf.nn.relu(tf.add(
                    self.conv2d(tc_h_conv2, tc_W_conv3, 1), tc_b_conv3),
                                        name=self.name + '_conv3_activations')

                tc_h_conv3_flat = tf.reshape(tc_h_conv3, [-1, 3136])

                tc_W_fc1, tc_b_fc1 = self.fc_variable([3136, 512],
                                                      layer_name='fc1',
                                                      gain=np.sqrt(2))
                tc_h_fc1 = tf.nn.relu(tf.add(
                    tf.matmul(tc_h_conv3_flat, tc_W_fc1), tc_b_fc1),
                                      name=self.name + '_fc1_activations')

                tc_W_fc2, tc_b_fc2 = self.fc_variable([512, n_actions],
                                                      layer_name='fc2')
                self.tc_q_value = tf.add(tf.matmul(tc_h_fc1, tc_W_fc2),
                                         tc_b_fc2,
                                         name=self.name + '_fc2_outputs')

        if transfer:
            self.load_transfer_model(self.sess,
                                     folder=transfer_folder,
                                     not_transfer_fc2=not_transfer_fc2,
                                     not_transfer_fc1=not_transfer_fc1,
                                     not_transfer_conv3=not_transfer_conv3,
                                     not_transfer_conv2=not_transfer_conv2)

        if self.verbose:
            self.init_verbosity()

        self.next_observation = tf.placeholder(
            tf.float32, [None, height, width, phi_length],
            name='t_next_observation')
        self.next_observation_n = tf.div(self.next_observation, 255.)

        with tf.device(
                self._device), tf.variable_scope('net_-1-target') as scope:
            # target q network model:
            kernel_shape = [8, 8, phi_length, 32]
            self.t_W_conv1, self.t_b_conv1 = self.conv_variable(
                kernel_shape, layer_name='t_conv1')
            self.t_h_conv1 = tf.nn.relu(
                tf.add(self.conv2d(self.next_observation_n, self.t_W_conv1, 4),
                       self.t_b_conv1),
                name=self.name + '_t_conv1_activations')

            kernel_shape = [4, 4, 32, 64]
            self.t_W_conv2, self.t_b_conv2 = self.conv_variable(
                kernel_shape, layer_name='t_conv2')
            self.t_h_conv2 = tf.nn.relu(
                tf.add(self.conv2d(self.t_h_conv1, self.t_W_conv2, 2),
                       self.t_b_conv2),
                name=self.name + '_t_conv2_activations')

            kernel_shape = [3, 3, 64, 64]
            self.t_W_conv3, self.t_b_conv3 = self.conv_variable(
                kernel_shape, layer_name='t_conv3')
            self.t_h_conv3 = tf.nn.relu(
                tf.add(self.conv2d(self.t_h_conv2, self.t_W_conv3, 1),
                       self.t_b_conv3),
                name=self.name + '_t_conv3_activations')

            self.t_h_conv3_flat = tf.reshape(self.t_h_conv3, [-1, 3136])

            kernel_shape = [3136, 512]
            self.t_W_fc1, self.t_b_fc1 = self.fc_variable(kernel_shape,
                                                          layer_name='t_fc1')
            self.t_h_fc1 = tf.nn.relu(tf.add(
                tf.matmul(self.t_h_conv3_flat, self.t_W_fc1), self.t_b_fc1),
                                      name=self.name + '_t_fc1_activations')

            kernel_shape = [512, n_actions]
            self.t_W_fc2, self.t_b_fc2 = self.fc_variable(kernel_shape,
                                                          layer_name='t_fc2')
            self.t_q_value = tf.add(tf.matmul(self.t_h_fc1, self.t_W_fc2),
                                    self.t_b_fc2,
                                    name=self.name + '_t_fc2_outputs')

        with tf.device(self._device):
            # cost of q network
            self.cost = self.build_loss(n_actions)  #+ self.l2_regularizer_loss

            with tf.name_scope("Train") as scope:
                if optimizer == "Adam":
                    self.opt = tf.train.AdamOptimizer(
                        learning_rate=learning_rate, epsilon=epsilon)
                elif optimizer == "AdamW":
                    assert weight_decay is not None
                    self.opt = tf.contrib.opt.AdamWOptimizer(
                        weight_decay=weight_decay,
                        learning_rate=learning_rate,
                        epsilon=epsilon)
                elif optimizer == "RMS":
                    # TensorFlow's built-in RMSProp optimizer
                    if weight_decay is None:
                        self.opt = tf.train.RMSPropOptimizer(
                            learning_rate=learning_rate,
                            decay=decay,
                            momentum=momentum,
                            epsilon=epsilon)
                    else:
                        RMSPropW = tf.contrib.opt.extend_with_decoupled_weight_decay(
                            tf.train.RMSPropOptimizer)
                        self.opt = RMSPropW(weight_decay=weight_decay,
                                            learning_rate=learning_rate,
                                            decay=decay,
                                            momentum=momentum,
                                            epsilon=epsilon)
                else:
                    logger.error("Unknown Optimizer!")
                    sys.exit()

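                # Note: Variable._ref() is a private TF1 API returning the
                # variable's reference tensor; gradients are taken w.r.t. it.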
                var_refs = [v._ref() for v in self.get_vars()]
                gradients = tf.gradients(self.cost, var_refs)
                if clip_norm is not None:
                    gradients, grad_norm = tf.clip_by_global_norm(
                        gradients, clip_norm)
                gradients = list(zip(gradients, self.get_vars()))
                self.train_step = self.opt.apply_gradients(gradients)

        def initialize_uninitialized(sess):
            global_vars = tf.global_variables()
            is_not_initialized = sess.run(
                [tf.is_variable_initialized(var) for var in global_vars])
            not_initialized_vars = [
                v for (v, f) in zip(global_vars, is_not_initialized) if not f
            ]

            if len(not_initialized_vars):
                sess.run(tf.variables_initializer(not_initialized_vars))

        if transfer:
            initialize_uninitialized(self.sess)
        else:
            # initialize all tensor variable parameters
            self.sess.run(tf.global_variables_initializer())

        # Make sure the q and target networks start with the same parameters by copying them over.
        self.update_target_network(slow=False)
        logger.info("target model assigned the same parameters as q model")

        self.saver = tf.train.Saver()
        if self.folder is not None:
            self.summary_op = tf.summary.merge_all()
            self.writer = tf.summary.FileWriter(
                'results/log/dqn/{}/'.format(self.name.replace('-', '_')) +
                self.folder[12:], self.sess.graph)
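A minimal, self-contained sketch of the clip_norm path in Example #2, assuming TensorFlow 1.x: gradients are computed, clipped by global norm, then applied. The variable x and the constants are illustrative, not from the example:

import tensorflow as tf

x = tf.Variable(2.0)
loss = tf.square(x)
opt = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.95,
                                momentum=0.0, epsilon=0.01)
grads = tf.gradients(loss, [x])
# Rescale all gradients together so their global norm is at most 1.0.
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_step = opt.apply_gradients(zip(clipped, [x]))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_step)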