def __init__(self, sess, height, width, phi_length, n_actions, name,
             gamma=0.99, copy_interval=4, optimizer='RMS',
             learning_rate=0.00025, epsilon=0.01, decay=0.95, momentum=0.,
             l2_decay=0.0001, error_clip=1.0, slow=False, tau=0.01,
             verbose=False, path='', folder='_networks',
             decay_learning_rate=False, transfer=False):
    """Build the Q-network, its target network, and the training ops."""
    Network.__init__(self, sess, name=name)
    self.gamma = gamma
    self.slow = slow
    self.tau = tau
    self.name = name
    self.sess = sess
    self.path = path
    self.folder = folder
    self.copy_interval = copy_interval
    self.update_counter = 0
    self.decay_learning_rate = decay_learning_rate

    self.observation = tf.placeholder(
        tf.float32, [None, height, width, phi_length],
        name=self.name + '_observation')
    # one-hot matrix of the actions taken
    self.actions = tf.placeholder(
        tf.float32, shape=[None, n_actions], name=self.name + "_actions")
    self.next_observation = tf.placeholder(
        tf.float32, [None, height, width, phi_length],
        name=self.name + '_t_next_observation')
    self.rewards = tf.placeholder(
        tf.float32, shape=[None], name=self.name + "_rewards")
    self.terminals = tf.placeholder(
        tf.float32, shape=[None], name=self.name + "_terminals")

    self.slow_learnrate_vars = []
    self.fast_learnrate_vars = []

    # scale pixel values to [0, 1]
    self.observation_n = tf.div(self.observation, 255.)
    self.next_observation_n = tf.div(self.next_observation, 255.)

    # q network model:
    self.is_training = tf.placeholder(tf.bool, [])

    with tf.name_scope("Conv1") as scope:
        kernel_shape = [8, 8, phi_length, 32]
        self.W_conv1 = self.weight_variable(phi_length, kernel_shape, 'conv1')
        #self.b_conv1 = self.bias_variable(kernel_shape, 'conv1')
        self.h_conv1_bn = batch_norm(
            self.conv2d(self.observation_n, self.W_conv1, 4), 32,
            self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv1 = tf.nn.relu(self.h_conv1_bn.bnorm,
                                  name=self.name + '_conv1_activations')
        tf.add_to_collection('conv_weights', self.W_conv1)
        tf.add_to_collection('conv_output', self.h_conv1)
        if transfer:
            self.slow_learnrate_vars.append(self.W_conv1)
            self.slow_learnrate_vars.append(self.h_conv1_bn.scale)
            self.slow_learnrate_vars.append(self.h_conv1_bn.beta)

    with tf.name_scope("Conv2") as scope:
        kernel_shape = [4, 4, 32, 64]
        self.W_conv2 = self.weight_variable(32, kernel_shape, 'conv2')
        #self.b_conv2 = self.bias_variable(kernel_shape, 'conv2')
        self.h_conv2_bn = batch_norm(
            self.conv2d(self.h_conv1, self.W_conv2, 2), 64,
            self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv2 = tf.nn.relu(self.h_conv2_bn.bnorm,
                                  name=self.name + '_conv2_activations')
        tf.add_to_collection('conv_weights', self.W_conv2)
        tf.add_to_collection('conv_output', self.h_conv2)
        if transfer:
            self.slow_learnrate_vars.append(self.W_conv2)
            self.slow_learnrate_vars.append(self.h_conv2_bn.scale)
            self.slow_learnrate_vars.append(self.h_conv2_bn.beta)

    with tf.name_scope("Conv3") as scope:
        kernel_shape = [3, 3, 64, 64]
        self.W_conv3 = self.weight_variable(64, kernel_shape, 'conv3')
        #self.b_conv3 = self.bias_variable(kernel_shape, 'conv3')
        self.h_conv3_bn = batch_norm(
            self.conv2d(self.h_conv2, self.W_conv3, 1), 64,
            self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv3 = tf.nn.relu(self.h_conv3_bn.bnorm,
                                  name=self.name + '_conv3_activations')
        tf.add_to_collection('conv_weights', self.W_conv3)
        tf.add_to_collection('conv_output', self.h_conv3)
        if transfer:
            self.slow_learnrate_vars.append(self.W_conv3)
            self.slow_learnrate_vars.append(self.h_conv3_bn.scale)
            self.slow_learnrate_vars.append(self.h_conv3_bn.beta)

    self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 3136])

    with tf.name_scope("FullyConnected1") as scope:
        kernel_shape = [3136, 512]
        self.W_fc1 = self.weight_variable_linear(kernel_shape, 'fc1')
        #self.b_fc1 = self.bias_variable(kernel_shape, 'fc1')
        self.h_fc1_bn = batch_norm(
            tf.matmul(self.h_conv3_flat, self.W_fc1), 512,
            self.is_training, self.sess, slow=self.slow, tau=self.tau,
            linear=True)
        self.h_fc1 = tf.nn.relu(self.h_fc1_bn.bnorm,
                                name=self.name + '_fc1_activations')
        if transfer:
            self.fast_learnrate_vars.append(self.W_fc1)
            self.fast_learnrate_vars.append(self.h_fc1_bn.scale)
            self.fast_learnrate_vars.append(self.h_fc1_bn.beta)

    with tf.name_scope("FullyConnected2") as scope:
        kernel_shape = [512, n_actions]
        self.W_fc2 = self.weight_variable_linear(kernel_shape, 'fc2')
        self.b_fc2 = self.bias_variable_linear(kernel_shape, 'fc2')
        self.q_value = tf.add(tf.matmul(self.h_fc1, self.W_fc2), self.b_fc2,
                              name=self.name + '_fc2_outputs')
        if transfer:
            self.fast_learnrate_vars.append(self.W_fc2)
            self.fast_learnrate_vars.append(self.b_fc2)

    if transfer:
        self.load_transfer_model(optimizer=optimizer.lower())
        # Scale down the last layer so the transferred Q-head starts small
        W_fc2_scaled = tf.scalar_mul(0.01, self.W_fc2)
        b_fc2_scaled = tf.scalar_mul(0.01, self.b_fc2)
        self.sess.run([
            self.W_fc2.assign(W_fc2_scaled),
            self.b_fc2.assign(b_fc2_scaled)
        ])

    if verbose:
        self.init_verbosity()

    # target q network model:
    self.t_is_training = tf.placeholder(tf.bool, [])

    with tf.name_scope("TConv1") as scope:
        kernel_shape = [8, 8, phi_length, 32]
        self.t_W_conv1 = self.weight_variable(phi_length, kernel_shape,
                                              't_conv1')
        #self.t_b_conv1 = self.bias_variable(kernel_shape, 't_conv1')
        self.t_h_conv1_bn = batch_norm(
            self.conv2d(self.next_observation_n, self.t_W_conv1, 4), 32,
            self.t_is_training, self.sess, parForTarget=self.h_conv1_bn,
            slow=self.slow, tau=self.tau)
        self.t_h_conv1 = tf.nn.relu(self.t_h_conv1_bn.bnorm,
                                    name=self.name + '_t_conv1_activations')

    with tf.name_scope("TConv2") as scope:
        kernel_shape = [4, 4, 32, 64]
        self.t_W_conv2 = self.weight_variable(32, kernel_shape, 't_conv2')
        #self.t_b_conv2 = self.bias_variable(kernel_shape, 't_conv2')
        self.t_h_conv2_bn = batch_norm(
            self.conv2d(self.t_h_conv1, self.t_W_conv2, 2), 64,
            self.t_is_training, self.sess, parForTarget=self.h_conv2_bn,
            slow=self.slow, tau=self.tau)
        self.t_h_conv2 = tf.nn.relu(self.t_h_conv2_bn.bnorm,
                                    name=self.name + '_t_conv2_activations')

    with tf.name_scope("TConv3") as scope:
        kernel_shape = [3, 3, 64, 64]
        self.t_W_conv3 = self.weight_variable(64, kernel_shape, 't_conv3')
        #self.t_b_conv3 = self.bias_variable(kernel_shape, 't_conv3')
        self.t_h_conv3_bn = batch_norm(
            self.conv2d(self.t_h_conv2, self.t_W_conv3, 1), 64,
            self.t_is_training, self.sess, parForTarget=self.h_conv3_bn,
            slow=self.slow, tau=self.tau)
        self.t_h_conv3 = tf.nn.relu(self.t_h_conv3_bn.bnorm,
                                    name=self.name + '_t_conv3_activations')

    self.t_h_conv3_flat = tf.reshape(self.t_h_conv3, [-1, 3136])

    with tf.name_scope("TFullyConnected1") as scope:
        kernel_shape = [3136, 512]
        self.t_W_fc1 = self.weight_variable_linear(kernel_shape, 't_fc1')
        #self.t_b_fc1 = self.bias_variable(kernel_shape, 't_fc1')
        self.t_h_fc1_bn = batch_norm(
            tf.matmul(self.t_h_conv3_flat, self.t_W_fc1), 512,
            self.t_is_training, self.sess, parForTarget=self.h_fc1_bn,
            slow=self.slow, tau=self.tau, linear=True)
        self.t_h_fc1 = tf.nn.relu(self.t_h_fc1_bn.bnorm,
                                  name=self.name + '_t_fc1_activations')

    with tf.name_scope("TFullyConnected2") as scope:
        kernel_shape = [512, n_actions]
        self.t_W_fc2 = self.weight_variable_linear(kernel_shape, 't_fc2')
        self.t_b_fc2 = self.bias_variable_linear(kernel_shape, 't_fc2')
        self.t_q_value = tf.add(tf.matmul(self.t_h_fc1, self.t_W_fc2),
                                self.t_b_fc2,
                                name=self.name + '_t_fc2_outputs')

    if transfer:
        # only initialize tensor variables that are not loaded from the
        # transfer model
        #self.sess.run(tf.variables_initializer(fast_learnrate_vars))
        self._global_vars_temp = set(tf.global_variables())

    # cost of q network
    #self.l2_regularizer_loss = l2_decay * (
    #    tf.reduce_sum(tf.pow(self.W_conv1, 2)) +
    #    tf.reduce_sum(tf.pow(self.W_conv2, 2)) +
    #    tf.reduce_sum(tf.pow(self.W_conv3, 2)) +
    #    tf.reduce_sum(tf.pow(self.W_fc1, 2)) +
    #    tf.reduce_sum(tf.pow(self.W_fc2, 2)))
    self.cost = self.build_loss(error_clip, n_actions)  #+ self.l2_regularizer_loss

    # self.parameters = [
    #     self.W_conv1, self.h_conv1_bn.scale, self.h_conv1_bn.beta,
    #     self.W_conv2, self.h_conv2_bn.scale, self.h_conv2_bn.beta,
    #     self.W_conv3, self.h_conv3_bn.scale, self.h_conv3_bn.beta,
    #     self.W_fc1, self.h_fc1_bn.scale, self.h_fc1_bn.beta,
    #     self.W_fc2, self.b_fc2,
    # ]

    with tf.name_scope("Train") as scope:
        if optimizer == "Graves":
            # RMSProp variant from the DQN Nature paper (Graves)
            self.train_step, self.grads_vars = graves_rmsprop_optimizer(
                self.cost, learning_rate, decay, epsilon, 1)
        else:
            if optimizer == "Adam":
                self.opt = tf.train.AdamOptimizer(
                    learning_rate=learning_rate, epsilon=epsilon)
            elif optimizer == "RMS":
                # TensorFlow's built-in RMSProp
                self.opt = tf.train.RMSPropOptimizer(
                    learning_rate, decay=decay, momentum=momentum,
                    epsilon=epsilon)
            else:
                print(colored("Unknown Optimizer!", "red"))
                sys.exit()
            self.grads_vars = self.opt.compute_gradients(self.cost)
            grads = []
            params = []
            for p in self.grads_vars:
                # skip variables with no gradient
                if p[0] is None:
                    continue
                grads.append(p[0])
                params.append(p[1])
            #grads = tf.clip_by_global_norm(grads, 1)[0]
            self.grads_vars_updates = zip(grads, params)
            self.train_step = self.opt.apply_gradients(
                self.grads_vars_updates)
        # for grad, var in self.grads_vars:
        #     if grad is None:
        #         continue
        #     tf.summary.histogram(var.op.name + '/gradients', grad)

    if transfer:
        # initialize only the variables created after the transfer model
        # was loaded
        vars_diff = set(tf.global_variables()) - self._global_vars_temp
        self.sess.run(tf.variables_initializer(vars_diff))
        self.sess.run(tf.variables_initializer([
            self.t_h_conv1_bn.pop_mean, self.t_h_conv1_bn.pop_var,
            self.t_h_conv2_bn.pop_mean, self.t_h_conv2_bn.pop_var,
            self.t_h_conv3_bn.pop_mean, self.t_h_conv3_bn.pop_var,
            self.t_h_fc1_bn.pop_mean, self.t_h_fc1_bn.pop_var
        ]))
    else:
        # initialize all tensor variable parameters
        self.sess.run(tf.global_variables_initializer())

    # Make sure the q and target networks start with the same parameters:
    # copy the q-network parameters into the target network.
    self.sess.run([
        self.t_W_conv1.assign(self.W_conv1),
        #self.t_b_conv1.assign(self.b_conv1),
        self.t_W_conv2.assign(self.W_conv2),
        #self.t_b_conv2.assign(self.b_conv2),
        self.t_W_conv3.assign(self.W_conv3),
        #self.t_b_conv3.assign(self.b_conv3),
        self.t_W_fc1.assign(self.W_fc1),
        #self.t_b_fc1.assign(self.b_fc1),
        self.t_W_fc2.assign(self.W_fc2),
        self.t_b_fc2.assign(self.b_fc2),
        self.t_h_conv1_bn.scale.assign(self.h_conv1_bn.scale),
        self.t_h_conv1_bn.beta.assign(self.h_conv1_bn.beta),
        self.t_h_conv2_bn.scale.assign(self.h_conv2_bn.scale),
        self.t_h_conv2_bn.beta.assign(self.h_conv2_bn.beta),
        self.t_h_conv3_bn.scale.assign(self.h_conv3_bn.scale),
        self.t_h_conv3_bn.beta.assign(self.h_conv3_bn.beta),
        self.t_h_fc1_bn.scale.assign(self.h_fc1_bn.scale),
        self.t_h_fc1_bn.beta.assign(self.h_fc1_bn.beta)
    ])

    if self.slow:
        # soft target update: theta_t <- tau*theta + (1 - tau)*theta_t
        self.update_target_op = [
            self.t_W_conv1.assign(
                self.tau * self.W_conv1 + (1 - self.tau) * self.t_W_conv1),
            #self.t_b_conv1.assign(self.b_conv1),
            self.t_W_conv2.assign(
                self.tau * self.W_conv2 + (1 - self.tau) * self.t_W_conv2),
            #self.t_b_conv2.assign(self.b_conv2),
            self.t_W_conv3.assign(
                self.tau * self.W_conv3 + (1 - self.tau) * self.t_W_conv3),
            #self.t_b_conv3.assign(self.b_conv3),
            self.t_W_fc1.assign(
                self.tau * self.W_fc1 + (1 - self.tau) * self.t_W_fc1),
            #self.t_b_fc1.assign(self.b_fc1),
            self.t_W_fc2.assign(
                self.tau * self.W_fc2 + (1 - self.tau) * self.t_W_fc2),
            self.t_b_fc2.assign(
                self.tau * self.b_fc2 + (1 - self.tau) * self.t_b_fc2),
            self.t_h_conv1_bn.updateTarget,
            self.t_h_conv2_bn.updateTarget,
            self.t_h_conv3_bn.updateTarget,
            self.t_h_fc1_bn.updateTarget
        ]
    else:
        # hard target update: copy q-network parameters verbatim
        self.update_target_op = [
            self.t_W_conv1.assign(self.W_conv1),
            #self.t_b_conv1.assign(self.b_conv1),
            self.t_W_conv2.assign(self.W_conv2),
            #self.t_b_conv2.assign(self.b_conv2),
            self.t_W_conv3.assign(self.W_conv3),
            #self.t_b_conv3.assign(self.b_conv3),
            self.t_W_fc1.assign(self.W_fc1),
            #self.t_b_fc1.assign(self.b_fc1),
            self.t_W_fc2.assign(self.W_fc2),
            self.t_b_fc2.assign(self.b_fc2),
            self.t_h_conv1_bn.updateTarget,
            self.t_h_conv2_bn.updateTarget,
            self.t_h_conv3_bn.updateTarget,
            self.t_h_fc1_bn.updateTarget
        ]

    self.saver = tf.train.Saver()
    self.merged = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter(
        self.path + self.folder + '/log_tb', self.sess.graph)
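
# The `slow` branch above performs a soft (Polyak) target update,
# theta_target <- tau * theta + (1 - tau) * theta_target, while the hard
# branch copies the q-network weights verbatim every `copy_interval`
# updates. A minimal, generic sketch of the same soft-update pattern
# (hypothetical standalone helper; this class builds its ops inline):
def make_soft_update_ops(online_vars, target_vars, tau=0.01):
    """Return assign ops that each move a target variable a fraction
    `tau` toward its online counterpart."""
    return [t.assign(tau * o + (1. - tau) * t)
            for o, t in zip(online_vars, target_vars)]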
def __init__(self, sess, height, width, phi_length, n_actions, name,
             gamma=0.99, optimizer='RMS', learning_rate=0.00025,
             epsilon=0.01, decay=0.95, momentum=0., l2_decay=0.0001,
             slow=False, tau=0.01, verbose=False, folder='_networks',
             transfer=False, transfer_folder='', not_transfer_conv2=False,
             not_transfer_conv3=False, not_transfer_fc1=False,
             not_transfer_fc2=False, device="/cpu:0",
             transformed_bellman=False, target_consistency_loss=False,
             clip_norm=None, weight_decay=None):
    """Build the Q-network, its target network, and the training ops."""
    Network.__init__(self, sess, name=name)
    self.gamma = gamma
    self.slow = slow
    self.tau = tau
    self.name = name
    self.sess = sess
    self.folder = folder
    self._device = device
    self.transformed_bellman = transformed_bellman
    self.target_consistency_loss = target_consistency_loss
    self.verbose = verbose

    self.observation = tf.placeholder(
        tf.float32, [None, height, width, phi_length], name='observation')
    # scale pixel values to [0, 1]
    self.observation_n = tf.div(self.observation, 255.)

    with tf.device(self._device), tf.variable_scope('net_-1') as scope:
        # q network model:
        self.W_conv1, self.b_conv1 = self.conv_variable(
            [8, 8, phi_length, 32], layer_name='conv1', gain=np.sqrt(2))
        self.h_conv1 = tf.nn.relu(
            tf.add(self.conv2d(self.observation_n, self.W_conv1, 4),
                   self.b_conv1),
            name=self.name + '_conv1_activations')
        tf.add_to_collection('conv_weights', self.W_conv1)
        tf.add_to_collection('conv_output', self.h_conv1)

        self.W_conv2, self.b_conv2 = self.conv_variable(
            [4, 4, 32, 64], layer_name='conv2', gain=np.sqrt(2))
        self.h_conv2 = tf.nn.relu(
            tf.add(self.conv2d(self.h_conv1, self.W_conv2, 2), self.b_conv2),
            name=self.name + '_conv2_activations')
        tf.add_to_collection('conv_weights', self.W_conv2)
        tf.add_to_collection('conv_output', self.h_conv2)

        self.W_conv3, self.b_conv3 = self.conv_variable(
            [3, 3, 64, 64], layer_name='conv3', gain=np.sqrt(2))
        self.h_conv3 = tf.nn.relu(
            tf.add(self.conv2d(self.h_conv2, self.W_conv3, 1), self.b_conv3),
            name=self.name + '_conv3_activations')
        tf.add_to_collection('conv_weights', self.W_conv3)
        tf.add_to_collection('conv_output', self.h_conv3)

        self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 3136])

        self.W_fc1, self.b_fc1 = self.fc_variable(
            [3136, 512], layer_name='fc1', gain=np.sqrt(2))
        self.h_fc1 = tf.nn.relu(
            tf.add(tf.matmul(self.h_conv3_flat, self.W_fc1), self.b_fc1),
            name=self.name + '_fc1_activations')

        self.W_fc2, self.b_fc2 = self.fc_variable([512, n_actions],
                                                  layer_name='fc2')
        self.q_value = tf.add(tf.matmul(self.h_fc1, self.W_fc2), self.b_fc2,
                              name=self.name + '_fc2_outputs')

    if self.target_consistency_loss:
        self.tc_observation = tf.placeholder(
            tf.float32, [None, height, width, phi_length],
            name='observation_tc')
        self.tc_observation_n = tf.div(self.tc_observation, 255.)

        # second forward pass through the same (reused) q-network weights
        with tf.device(self._device), \
                tf.variable_scope('net_-1', reuse=True) as scope:
            # q network model:
            tc_W_conv1, tc_b_conv1 = self.conv_variable(
                [8, 8, phi_length, 32], layer_name='conv1', gain=np.sqrt(2))
            tc_h_conv1 = tf.nn.relu(
                tf.add(self.conv2d(self.tc_observation_n, tc_W_conv1, 4),
                       tc_b_conv1),
                name=self.name + '_conv1_activations')

            tc_W_conv2, tc_b_conv2 = self.conv_variable(
                [4, 4, 32, 64], layer_name='conv2', gain=np.sqrt(2))
            tc_h_conv2 = tf.nn.relu(
                tf.add(self.conv2d(tc_h_conv1, tc_W_conv2, 2), tc_b_conv2),
                name=self.name + '_conv2_activations')

            tc_W_conv3, tc_b_conv3 = self.conv_variable(
                [3, 3, 64, 64], layer_name='conv3', gain=np.sqrt(2))
            tc_h_conv3 = tf.nn.relu(
                tf.add(self.conv2d(tc_h_conv2, tc_W_conv3, 1), tc_b_conv3),
                name=self.name + '_conv3_activations')

            tc_h_conv3_flat = tf.reshape(tc_h_conv3, [-1, 3136])

            tc_W_fc1, tc_b_fc1 = self.fc_variable([3136, 512],
                                                  layer_name='fc1',
                                                  gain=np.sqrt(2))
            tc_h_fc1 = tf.nn.relu(
                tf.add(tf.matmul(tc_h_conv3_flat, tc_W_fc1), tc_b_fc1),
                name=self.name + '_fc1_activations')

            tc_W_fc2, tc_b_fc2 = self.fc_variable([512, n_actions],
                                                  layer_name='fc2')
            self.tc_q_value = tf.add(tf.matmul(tc_h_fc1, tc_W_fc2), tc_b_fc2,
                                     name=self.name + '_fc2_outputs')

    if transfer:
        self.load_transfer_model(self.sess, folder=transfer_folder,
                                 not_transfer_fc2=not_transfer_fc2,
                                 not_transfer_fc1=not_transfer_fc1,
                                 not_transfer_conv3=not_transfer_conv3,
                                 not_transfer_conv2=not_transfer_conv2)

    if self.verbose:
        self.init_verbosity()

    self.next_observation = tf.placeholder(
        tf.float32, [None, height, width, phi_length],
        name='t_next_observation')
    self.next_observation_n = tf.div(self.next_observation, 255.)

    with tf.device(self._device), \
            tf.variable_scope('net_-1-target') as scope:
        # target q network model:
        kernel_shape = [8, 8, phi_length, 32]
        self.t_W_conv1, self.t_b_conv1 = self.conv_variable(
            kernel_shape, layer_name='t_conv1')
        self.t_h_conv1 = tf.nn.relu(
            tf.add(self.conv2d(self.next_observation_n, self.t_W_conv1, 4),
                   self.t_b_conv1),
            name=self.name + '_t_conv1_activations')

        kernel_shape = [4, 4, 32, 64]
        self.t_W_conv2, self.t_b_conv2 = self.conv_variable(
            kernel_shape, layer_name='t_conv2')
        self.t_h_conv2 = tf.nn.relu(
            tf.add(self.conv2d(self.t_h_conv1, self.t_W_conv2, 2),
                   self.t_b_conv2),
            name=self.name + '_t_conv2_activations')

        kernel_shape = [3, 3, 64, 64]
        self.t_W_conv3, self.t_b_conv3 = self.conv_variable(
            kernel_shape, layer_name='t_conv3')
        self.t_h_conv3 = tf.nn.relu(
            tf.add(self.conv2d(self.t_h_conv2, self.t_W_conv3, 1),
                   self.t_b_conv3),
            name=self.name + '_t_conv3_activations')

        self.t_h_conv3_flat = tf.reshape(self.t_h_conv3, [-1, 3136])

        kernel_shape = [3136, 512]
        self.t_W_fc1, self.t_b_fc1 = self.fc_variable(kernel_shape,
                                                      layer_name='t_fc1')
        self.t_h_fc1 = tf.nn.relu(
            tf.add(tf.matmul(self.t_h_conv3_flat, self.t_W_fc1),
                   self.t_b_fc1),
            name=self.name + '_t_fc1_activations')

        kernel_shape = [512, n_actions]
        self.t_W_fc2, self.t_b_fc2 = self.fc_variable(kernel_shape,
                                                      layer_name='t_fc2')
        self.t_q_value = tf.add(tf.matmul(self.t_h_fc1, self.t_W_fc2),
                                self.t_b_fc2,
                                name=self.name + '_t_fc2_outputs')

    with tf.device(self._device):
        # cost of q network
        self.cost = self.build_loss(n_actions)  #+ self.l2_regularizer_loss

        with tf.name_scope("Train") as scope:
            if optimizer == "Adam":
                self.opt = tf.train.AdamOptimizer(
                    learning_rate=learning_rate, epsilon=epsilon)
            elif optimizer == "AdamW":
                assert weight_decay is not None
                self.opt = tf.contrib.opt.AdamWOptimizer(
                    weight_decay=weight_decay, learning_rate=learning_rate,
                    epsilon=epsilon)
            elif optimizer == "RMS":
                # TensorFlow's built-in RMSProp
                if weight_decay is None:
                    self.opt = tf.train.RMSPropOptimizer(
                        learning_rate=learning_rate, decay=decay,
                        momentum=momentum, epsilon=epsilon)
                else:
                    # RMSProp with decoupled weight decay (AdamW-style)
                    RMSPropW = tf.contrib.opt.extend_with_decoupled_weight_decay(
                        tf.train.RMSPropOptimizer)
                    self.opt = RMSPropW(weight_decay=weight_decay,
                                        learning_rate=learning_rate,
                                        decay=decay, momentum=momentum,
                                        epsilon=epsilon)
            else:
                logger.error("Unknown Optimizer!")
                sys.exit()

            var_refs = [v._ref() for v in self.get_vars()]
            gradients = tf.gradients(self.cost, var_refs)
            if clip_norm is not None:
                gradients, grad_norm = tf.clip_by_global_norm(
                    gradients, clip_norm)
            gradients = list(zip(gradients, self.get_vars()))
            self.train_step = self.opt.apply_gradients(gradients)

    def initialize_uninitialized(sess):
        """Run the initializer only for variables not yet initialized."""
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]
        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    if transfer:
        # initialize only the variables that were not loaded from the
        # transfer model
        initialize_uninitialized(self.sess)
    else:
        # initialize all tensor variable parameters
        self.sess.run(tf.global_variables_initializer())

    # Make sure the q and target networks start with the same parameters:
    # copy the q-network parameters into the target network.
    self.update_target_network(slow=False)
    logger.info("target model assigned the same parameters as q model")

    self.saver = tf.train.Saver()

    if self.folder is not None:
        self.summary_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(
            'results/log/dqn/{}/'.format(self.name.replace('-', '_')) +
            self.folder[12:], self.sess.graph)
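
# The `transformed_bellman` flag suggests build_loss rescales TD targets
# with the transformed Bellman operator of Pohlen et al. (2018),
# h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x. A minimal sketch of that
# transform and its closed-form inverse (an assumption about build_loss,
# whose body is not shown in this excerpt):
def transform_h(x, eps=1e-2):
    """h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x."""
    return tf.sign(x) * (tf.sqrt(tf.abs(x) + 1.) - 1.) + eps * x


def transform_h_inv(x, eps=1e-2):
    """Closed-form inverse of transform_h."""
    return tf.sign(x) * (tf.square(
        (tf.sqrt(1. + 4. * eps * (tf.abs(x) + 1. + eps)) - 1.)
        / (2. * eps)) - 1.)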