def kurtosis(data_matrix, k=3, Lambda=10.0, Num_iter=30, gamma=1.0,
             BATCH_SIZE=1000, number_of_moments=1000, C=10.0):
    """Learns a rank-k projection from `data_matrix` (points x dimensions).

    Fits a random-feature model of the empirical characteristic function and
    returns `O`, a (dimensionality, k) matrix whose columns are the top-k
    left singular vectors of the accumulated gradient samples.
    """
    dimensionality = data_matrix.shape[1]
    number_of_points = data_matrix.shape[0]
    number_of_neurons = number_of_points
    Data = np.float32(np.transpose(data_matrix))
    max_grad_norm = 1000
    lr_decay = 0.3
    lambda1 = 1.0
    lambda2 = 1.0
    lambda3 = 1.0
    lambda4 = 1.0

    def model(inputs):
        with tf.variable_scope('Characteristic', reuse=tf.AUTO_REUSE):
            w = tf.get_variable('w', initializer=tf.random_normal(
                [number_of_neurons, 1], stddev=0.0))
            B = tf.get_variable('B', initializer=tf.convert_to_tensor(Data))
            out = (2 / number_of_neurons) * tf.matmul(
                tf.math.cos(tf.matmul(inputs, B)), tf.nn.sigmoid(w))
            return out

    def gradients(inputs):
        with tf.variable_scope('Characteristic', reuse=tf.AUTO_REUSE):
            w = tf.get_variable('w', initializer=tf.random_normal(
                [number_of_neurons, 1], stddev=0.0))
            B = tf.get_variable('B', initializer=tf.convert_to_tensor(Data))
            out = (2 / number_of_neurons) * tf.matmul(
                tf.math.cos(tf.matmul(inputs, B)), tf.nn.sigmoid(w))
            grads = tf.reshape(
                tf.gradients(out, [inputs])[0], [BATCH_SIZE, dimensionality])
            return grads

    def old_gradients(inputs, old_weights):
        old_B, old_w = old_weights
        old = (2 / number_of_neurons) * tf.matmul(
            tf.math.cos(tf.matmul(inputs, old_B)), tf.nn.sigmoid(old_w))
        return tf.reshape(
            tf.gradients(old, [inputs])[0], [BATCH_SIZE, dimensionality])

    new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    Dataplace = tf.placeholder(tf.float32, shape=(dimensionality, None))
    tf_data_x = 0.0001 * tf.random_normal(
        [BATCH_SIZE, dimensionality]
    )  # tf.placeholder(tf.float32, shape=(1, dimensionality))
    # tf.zeros(shape=(1, dimensionality), dtype=tf.dtypes.float32)
    tf_data_y = tf.reduce_mean(tf.math.cos(tf.matmul(tf_data_x, Dataplace)))
    tf_data_w = tf.placeholder(tf.float32, shape=(number_of_neurons, 1))
    tf_data_B = tf.placeholder(tf.float32,
                               shape=(dimensionality, number_of_neurons))
    tf_data_P = tf.placeholder(tf.float32,
                               shape=(dimensionality, dimensionality))
    tf_data_second = gamma * tf.random_normal([BATCH_SIZE, dimensionality])
    Lbd = tf.placeholder(tf.float32, shape=[], name="lambda")

    prediction = model(tf_data_x)
    with tf.variable_scope('Characteristic', reuse=tf.AUTO_REUSE):
        w = tf.get_variable('w')
    # Penalize deviation of the normalization term from 1.
    penalty = tf.square(
        (2 / number_of_neurons) * tf.reduce_sum(tf.nn.sigmoid(w)) - 1.0)

    # Random coordinate tuples for the sampled mixed derivatives (moments).
    rand_index1 = tf.random.uniform(shape=[number_of_moments, 1], minval=0,
                                    maxval=dimensionality, dtype=tf.int32,
                                    seed=10)
    rand_index2 = tf.random.uniform(shape=[number_of_moments, 2], minval=0,
                                    maxval=dimensionality, dtype=tf.int32,
                                    seed=10)
    rand_index3 = tf.random.uniform(shape=[number_of_moments, 3], minval=0,
                                    maxval=dimensionality, dtype=tf.int32,
                                    seed=10)
    rand_index4 = tf.random.uniform(shape=[number_of_moments, 4], minval=0,
                                    maxval=dimensionality, dtype=tf.int32,
                                    seed=10)

    phi = prediction - tf_data_y
    collect = 0.0
    for i in range(0, number_of_moments):
        a = rand_index1[i][0]
        b = rand_index2[i]
        c = rand_index3[i]
        d = rand_index4[i]
        # First-order derivative term.
        r = tf.gradients(phi, [tf_data_x[0, a]],
                         unconnected_gradients='zero')[0]
        collect = collect + lambda1 * r * r
        # Second-order mixed derivative term.
        r = tf.gradients(phi, [tf_data_x[0, b[0]]],
                         unconnected_gradients='zero')[0]
        r = tf.gradients(r, [tf_data_x[0, b[1]]],
                         unconnected_gradients='zero')[0]
        collect = collect + lambda2 * r * r
        # Third-order mixed derivative term.
        r = tf.gradients(phi, [tf_data_x[0, c[0]]],
                         unconnected_gradients='zero')[0]
        r = tf.gradients(r, [tf_data_x[0, c[1]]],
                         unconnected_gradients='zero')[0]
        r = tf.gradients(r, [tf_data_x[0, c[2]]],
                         unconnected_gradients='zero')[0]
        collect = collect + lambda3 * r * r
        # Fourth-order mixed derivative term.
        r = tf.gradients(phi, [tf_data_x[0, d[0]]],
                         unconnected_gradients='zero')[0]
        r = tf.gradients(r, [tf_data_x[0, d[1]]],
                         unconnected_gradients='zero')[0]
        r = tf.gradients(r, [tf_data_x[0, d[2]]],
                         unconnected_gradients='zero')[0]
        r = tf.gradients(r, [tf_data_x[0, d[3]]],
                         unconnected_gradients='zero')[0]
        collect = collect + lambda4 * r * r

    out_loss = collect / number_of_moments + C * penalty
    new_part = gradients(tf_data_second)
    tf_data_grad_psi = old_gradients(tf_data_second, [tf_data_B, tf_data_w])
    old_part = tf.matmul(tf_data_grad_psi, tf_data_P)
    # Keep the new gradient field close to the projection of the old one.
    loss = out_loss + Lbd * tf.reduce_mean(
        tf.reduce_sum(tf.square(tf.subtract(new_part, old_part)), axis=1))

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                      max_grad_norm)
    optimizer = RAdamOptimizer(learning_rate=new_lr, beta1=0.5, beta2=0.9)
    target = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    cur_w = np.random.normal(0, 0.35, (number_of_neurons, 1))
    cur_B = np.random.normal(0, 0.35, (dimensionality, number_of_neurons))
    cur_P = np.zeros((dimensionality, dimensionality))
    O = np.zeros((dimensionality, k))
    lr = 0.0001
    prev_run_res = 1000000000

    # default session
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    for s in range(0, Num_iter):
        lr = 0.001
        epoch = 0
        while lr > 0.0000001:  # iterations: epoch < 500000 and
            sess.run(target,
                     feed_dict={
                         new_lr: lr,
                         Dataplace: Data,
                         Lbd: Lambda,
                         tf_data_w: cur_w,
                         tf_data_B: cur_B,
                         tf_data_P: cur_P,
                     })
            if epoch % 100 == 0:
                # Average the loss over 100 fresh random batches.
                run_res = 0.0
                for times in range(100):
                    run_res = run_res + sess.run(loss,
                                                 feed_dict={
                                                     Dataplace: Data,
                                                     Lbd: Lambda,
                                                     tf_data_w: cur_w,
                                                     tf_data_B: cur_B,
                                                     tf_data_P: cur_P
                                                 })
                run_res = run_res / 100.0
                print("epoch = %d, train error = %.8f" % (epoch + 1, run_res))
                if run_res > prev_run_res:
                    lr *= lr_decay
                prev_run_res = run_res
            epoch += 1

        # Snapshot the trained weights; they become the "old" model next round.
        with tf.variable_scope('Characteristic', reuse=tf.AUTO_REUSE):
            w1 = tf.get_variable('w')
            B1 = tf.get_variable('B')
        cur_w, cur_B = sess.run([w1, B1])

        # Accumulate gradient samples of the fitted model.
        third_grad_psi = np.reshape(sess.run([new_part]),
                                    (BATCH_SIZE, dimensionality))
        for r in range(1000):
            sess.run([tf_data_second])
            third_grad_psi = np.concatenate(
                (third_grad_psi,
                 np.reshape(sess.run([new_part]),
                            (BATCH_SIZE, dimensionality))),
                axis=0)
        # Top-k left singular vectors of the gradient second-moment matrix.
        u, s, vh = np.linalg.svd(np.matmul(np.transpose(third_grad_psi),
                                           third_grad_psi),
                                 full_matrices=True)
        O = u[:, 0:k:1]
        # print(s)
        # print(u)
        # print(O)
        cur_P = np.matmul(O, np.transpose(O))
        # print(cur_P)
        print(np.sum(np.square(third_grad_psi)))
        print(np.sum(np.square(third_grad_psi -
                               np.matmul(third_grad_psi, cur_P))))

    sess.close()
    return O
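

# Illustrative call (a sketch, not part of the original file): `kurtosis`
# builds its graph in the default TF1 graph and manages its own session, and
# it assumes the module-level imports used above (`numpy as np`,
# `tensorflow as tf`, `RAdamOptimizer`). The array shapes below are made up
# for illustration only:
#
#   X = np.random.randn(500, 8).astype(np.float32)  # 500 samples, 8 features
#   O = kurtosis(X, k=3, Num_iter=5)                # (8, 3) orthonormal basis
#   X_proj = X.dot(O)                               # project onto the subspace
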
def _train_deeplab_model(iterator, num_of_classes, ignore_label):
  """Trains the deeplab model.

  Args:
    iterator: An iterator of type tf.data.Iterator for images and labels.
    num_of_classes: Number of classes for the dataset.
    ignore_label: Ignore label for the dataset.

  Returns:
    train_tensor: A tensor to update the model variables.
    summary_op: An operation to log the summaries.
  """
  global_step = tf.train.get_or_create_global_step()

  learning_rate = train_utils.get_model_learning_rate(
      FLAGS.learning_policy, FLAGS.base_learning_rate,
      FLAGS.learning_rate_decay_step, FLAGS.learning_rate_decay_factor,
      FLAGS.training_number_of_steps, FLAGS.learning_power,
      FLAGS.slow_start_step, FLAGS.slow_start_learning_rate)
  tf.summary.scalar('learning_rate', learning_rate)

  # optimizer = tf.train.MomentumOptimizer(learning_rate, FLAGS.momentum)
  optimizer = RAdamOptimizer(learning_rate=learning_rate)
  # NadamWOptimizer = tf.contrib.opt.extend_with_decoupled_weight_decay(
  #     tf.contrib.opt.NadamOptimizer)
  # optimizer = NadamWOptimizer(weight_decay=0.00004,
  #                             learning_rate=learning_rate)

  tower_losses = []
  tower_grads = []
  for i in range(FLAGS.num_clones):
    with tf.device('/gpu:%d' % i):
      # First tower has default name scope.
      name_scope = ('clone_%d' % i) if i else ''
      with tf.name_scope(name_scope) as scope:
        loss = _tower_loss(
            iterator=iterator,
            num_of_classes=num_of_classes,
            ignore_label=ignore_label,
            scope=scope,
            reuse_variable=(i != 0))
        tower_losses.append(loss)

  if FLAGS.quantize_delay_step >= 0:
    if FLAGS.num_clones > 1:
      raise ValueError('Quantization doesn\'t support multi-clone yet.')
    tf.contrib.quantize.create_training_graph(
        quant_delay=FLAGS.quantize_delay_step)

  # backbonebn_var_list = [t for t in tf.trainable_variables()
  #                        if FLAGS.model_variant in t.name
  #                        and 'BatchNorm' in t.name]
  # freeze_backbonebn_var_list = [t for t in tf.trainable_variables()
  #                               if t not in backbonebn_var_list]

  for i in range(FLAGS.num_clones):
    with tf.device('/gpu:%d' % i):
      name_scope = ('clone_%d' % i) if i else ''
      with tf.name_scope(name_scope) as scope:
        grads = optimizer.compute_gradients(tower_losses[i])
        tower_grads.append(grads)

  with tf.device('/cpu:0'):
    grads_and_vars = _average_gradients(tower_grads)

    # Modify the gradients for biases and last layer variables.
    last_layers = model.get_extra_layer_scopes(
        FLAGS.last_layers_contain_logits_only)
    grad_mult = train_utils.get_model_gradient_multipliers(
        last_layers, FLAGS.last_layer_gradient_multiplier)
    if grad_mult:
      grads_and_vars = tf.contrib.training.multiply_gradients(
          grads_and_vars, grad_mult)

    # Create gradient update op.
    grad_updates = optimizer.apply_gradients(
        grads_and_vars, global_step=global_step)

    # Gather update_ops. These contain, for example,
    # the updates for the batch_norm variables created by model_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    update_ops.append(grad_updates)
    update_op = tf.group(*update_ops)

    total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

    # Print total loss to the terminal.
    # This implementation is mirrored from tf.slim.summaries.
    should_log = math_ops.equal(math_ops.mod(global_step, FLAGS.log_steps), 0)
    total_loss = tf.cond(
        should_log,
        lambda: tf.Print(total_loss, [total_loss], 'Total loss is :'),
        lambda: total_loss)
    tf.summary.scalar('total_loss', total_loss)

    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Excludes summaries from towers other than the first one.
    summary_op = tf.summary.merge_all(scope='(?!clone_)')

  return train_tensor, summary_op
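

# Illustrative wiring (a sketch, not part of the original file): one way a
# TF1 training entry point might consume the returned ops. The `dataset`
# object, its attributes, and FLAGS.train_logdir are assumptions here, not
# defined in this section:
#
#   train_tensor, summary_op = _train_deeplab_model(
#       dataset.get_one_shot_iterator(), dataset.num_of_classes,
#       dataset.ignore_label)
#   with tf.train.MonitoredTrainingSession(
#       checkpoint_dir=FLAGS.train_logdir) as sess:
#     while not sess.should_stop():
#       sess.run([train_tensor])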