def model(hparams, X, past=None, scope='model', reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        results = {}
        batch, sequence = shape_list(X)

        wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                              initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                              initializer=tf.random_normal_initializer(stddev=0.02))
        past_length = 0 if past is None else tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))

        # Transformer
        presents = []
        pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer
        assert len(pasts) == hparams.n_layer
        for layer, past in enumerate(pasts):
            h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
            if layer == 10:
                tf.add_to_collection('checkpoints', h)
            presents.append(present)
        results['present'] = tf.stack(presents, axis=1)
        h = norm(h, 'ln_f')

        # Language model loss. Do tokens <n predict token n?
        h_flat = tf.reshape(h, [batch * sequence, hparams.n_embd])
        logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
        results['logits'] = logits
        return results
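# A minimal usage sketch for model(), not part of the original file. The HParams
# values below follow the published GPT-2 "small" configuration; shape_list,
# positions_for, block and norm are assumed to be defined alongside model().
from tensorflow.contrib.training import HParams

hparams = HParams(n_vocab=50257, n_ctx=1024, n_embd=768, n_head=12, n_layer=12)
tokens = tf.placeholder(tf.int32, [1, None], name='tokens')
lm_output = model(hparams=hparams, X=tokens)
# lm_output['logits'] has shape [batch, sequence, n_vocab];
# lm_output['present'] carries the per-layer keys/values for incremental decoding.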
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable('w', [1, nx, nf],
                            initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
                       start + [nf])
        return c
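# Note: despite the name, conv1d above with width-1 filters acts as a
# position-wise dense layer: every position's feature vector is multiplied by
# the same [nx, nf] matrix. A quick shape check (sizes are made up; shape_list
# is assumed to be the helper used throughout this file):
h = tf.zeros([2, 10, 768])         # [batch, sequence, nx]
h_proj = conv1d(h, 'c_fc', 3072)   # -> [2, 10, 3072]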
def mask_attn_weights(w):
    # w has shape [batch, heads, dst_sequence, src_sequence], where information
    # flows from src to dst.
    _, _, nd, ns = shape_list(w)
    b = attention_mask(nd, ns, dtype=w.dtype)
    b = tf.reshape(b, [1, 1, nd, ns])
    w = w * b - tf.cast(1e10, w.dtype) * (1 - b)
    return w
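# mask_attn_weights relies on an attention_mask helper that is not defined in
# this snippet. A minimal sketch consistent with the call above: 1's in the
# lower triangle, counting from the lower-right corner, so each dst position
# may attend to every src position up to and including itself.
def attention_mask(nd, ns, *, dtype):
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    m = i >= j - ns + nd
    return tf.cast(m, dtype)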
def __init__(self):
    self.embedding = self.getEmb()
    self.embSize = self.embedding.shape[1]
    self.vocabSize = self.embedding.shape[0]
    # Each input is a window of 5 token ids.
    self.x = tf.placeholder(tf.int32, [None, 5])
    with tf.variable_scope("training_variable"):
        self.weights = {
            "MLP1": tf.Variable(tf.truncated_normal(shape=[self.embSize, int(self.embSize / 2)],
                                                    stddev=0.08)),
            "MLP2": tf.Variable(tf.truncated_normal(shape=[int(self.embSize / 2), 1],
                                                    stddev=0.08))
        }
        self.biases = {
            "MLP1": tf.Variable(tf.constant(0.01, shape=[int(self.embSize / 2)],
                                            dtype=tf.float32)),
            "MLP2": tf.Variable(tf.constant(0.01, shape=[1], dtype=tf.float32))
        }
    self.inputEmb = tf.nn.embedding_lookup(self.embedding, self.x)
    # Two-layer MLP produces one attention score per token ...
    p1 = tf.matmul(tf.reshape(self.inputEmb, [-1, self.embSize]),
                   self.weights["MLP1"]) + self.biases["MLP1"]
    p1 = tf.matmul(tf.nn.relu(p1), self.weights["MLP2"]) + self.biases["MLP2"]
    p1 = tf.reshape(p1, [-1, 5])
    # ... softmax over the 5 positions yields the attention weights ...
    p1 = tf.reshape(tf.nn.softmax(p1), [-1, 1, 5])
    # ... and the final state is the attention-weighted sum of the embeddings.
    self.finalState = tf.reshape(tf.matmul(p1, self.inputEmb), [-1, self.embSize])
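# What __init__ above computes, shown in isolation: one score per token, a
# softmax over the 5 positions, then a weighted sum of the embeddings. A
# standalone sketch with made-up sizes (batch=2, embSize=4):
emb = tf.random_normal([2, 5, 4])       # [batch, tokens, embSize]
scores = tf.random_normal([2, 5])       # stand-in for the two-layer MLP output
attn = tf.reshape(tf.nn.softmax(scores), [-1, 1, 5])
pooled = tf.reshape(tf.matmul(attn, emb), [-1, 4])  # [batch, embSize]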
def pca(x, dim=2):
    '''
    x: input matrix of shape [m, n]
    dim: number of dimensions to keep after the reduction
    '''
    with tf.name_scope("PCA"):
        m, n = x.get_shape().as_list()
        assert dim < n, "dim must be smaller than the number of input features"
        # Center each feature (column) at zero mean.
        mean = tf.reduce_mean(x, axis=0)
        x_new = x - mean
        # Sample covariance matrix of the features, shape [n, n].
        cov = tf.matmul(x_new, x_new, transpose_a=True) / (m - 1)
        e, v = tf.linalg.eigh(cov, name="eigh")
        # Indices of the `dim` largest eigenvalues.
        e_index_sort = tf.math.top_k(e, sorted=True, k=dim)[1]
        # eigh returns eigenvectors as the *columns* of v, so gather along axis 1.
        v_new = tf.gather(v, indices=e_index_sort, axis=1)
        # Project the centered data onto the top eigenvectors.
        pca = tf.matmul(x_new, v_new)
        return pca
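# A minimal usage sketch for pca() (TF1 graph mode; the data values are made up):
data = tf.constant([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0]],
                   dtype=tf.float32)
reduced = pca(data, dim=1)  # shape [5, 1]
with tf.Session() as sess:
    print(sess.run(reduced))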
def __init__(self, input_width=227, input_height=227, input_channels=3,
             num_classes=1000, learning_rate=0.01, momentum=0.9, keep_prob=0.5):

    # From article: The learning rate was initialized at 0.01.
    # From article: We trained our models using stochastic gradient descent with a batch size of 128
    # examples, momentum of 0.9, and weight decay of 0.0005.
    # From article: We initialized the weights in each layer from a zero-mean Gaussian distribution
    # with standard deviation 0.01.

    self.input_width = input_width
    self.input_height = input_height
    self.input_channels = input_channels
    self.num_classes = num_classes
    self.learning_rate = learning_rate
    self.momentum = momentum
    self.keep_prob = keep_prob

    self.random_mean = 0
    self.random_stddev = 0.01

    # ----------------------------------------------------------------------------------------------------
    # From article: We initialized the neuron biases in the second, fourth, and fifth convolutional
    # layers, as well as in the fully-connected hidden layers, with the constant 1. ... We initialized
    # the neuron biases in the remaining layers with the constant 0.

    # Input: 227x227x3.
    with tf.name_scope('input'):
        self.X = tf.placeholder(dtype=tf.float32,
                                shape=[None, self.input_height, self.input_width,
                                       self.input_channels],
                                name='X')

    # Labels: 1000.
    with tf.name_scope('labels'):
        self.Y = tf.placeholder(dtype=tf.float32, shape=[None, self.num_classes], name='Y')

    # Dropout keep prob.
    with tf.name_scope('dropout'):
        self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, shape=(),
                                                name='dropout_keep_prob')

    # Layer 1.
    # [Input] ==> 227x227x3
    # --> 227x227x3 ==> [Convolution: size=(11x11x3)x96, strides=4, padding=valid] ==> 55x55x96
    # --> 55x55x96 ==> [ReLU] ==> 55x55x96
    # --> 55x55x96 ==> [Local Response Normalization] ==> 55x55x96
    # --> 55x55x96 ==> [Max-Pool: size=3x3, strides=2, padding=valid] ==> 27x27x96
    # --> [Output] ==> 27x27x96
    # Note: 48*2=96, One GPU runs the layer-parts at the top while the other runs the layer-parts at the bottom.
    with tf.name_scope('layer1'):
        layer1_activations = self.__conv(input=self.X, filter_width=11, filter_height=11,
                                         filters_count=96, stride_x=4, stride_y=4,
                                         padding='VALID',
                                         init_biases_with_the_constant_1=False)
        layer1_lrn = self.__local_response_normalization(input=layer1_activations)
        layer1_pool = self.__max_pool(input=layer1_lrn, filter_width=3, filter_height=3,
                                      stride_x=2, stride_y=2, padding='VALID')

    # Layer 2.
    # [Input] ==> 27x27x96
    # --> 27x27x96 ==> [Convolution: size=(5x5x96)x256, strides=1, padding=same] ==> 27x27x256
    # --> 27x27x256 ==> [ReLU] ==> 27x27x256
    # --> 27x27x256 ==> [Local Response Normalization] ==> 27x27x256
    # --> 27x27x256 ==> [Max-Pool: size=3x3, strides=2, padding=valid] ==> 13x13x256
    # --> [Output] ==> 13x13x256
    # Note: 128*2=256, One GPU runs the layer-parts at the top while the other runs the layer-parts at the bottom.
    with tf.name_scope('layer2'):
        layer2_activations = self.__conv(input=layer1_pool, filter_width=5, filter_height=5,
                                         filters_count=256, stride_x=1, stride_y=1,
                                         padding='SAME',
                                         init_biases_with_the_constant_1=True)
        layer2_lrn = self.__local_response_normalization(input=layer2_activations)
        layer2_pool = self.__max_pool(input=layer2_lrn, filter_width=3, filter_height=3,
                                      stride_x=2, stride_y=2, padding='VALID')

    # Layer 3.
    # [Input] ==> 13x13x256
    # --> 13x13x256 ==> [Convolution: size=(3x3x256)x384, strides=1, padding=same] ==> 13x13x384
    # --> 13x13x384 ==> [ReLU] ==> 13x13x384
    # --> [Output] ==> 13x13x384
    # Note: 192*2=384, One GPU runs the layer-parts at the top while the other runs the layer-parts at the bottom.
    with tf.name_scope('layer3'):
        layer3_activations = self.__conv(input=layer2_pool, filter_width=3, filter_height=3,
                                         filters_count=384, stride_x=1, stride_y=1,
                                         padding='SAME',
                                         init_biases_with_the_constant_1=False)

    # Layer 4.
    # [Input] ==> 13x13x384
    # --> 13x13x384 ==> [Convolution: size=(3x3x384)x384, strides=1, padding=same] ==> 13x13x384
    # --> 13x13x384 ==> [ReLU] ==> 13x13x384
    # --> [Output] ==> 13x13x384
    # Note: 192*2=384, One GPU runs the layer-parts at the top while the other runs the layer-parts at the bottom.
    with tf.name_scope('layer4'):
        layer4_activations = self.__conv(input=layer3_activations, filter_width=3,
                                         filter_height=3, filters_count=384,
                                         stride_x=1, stride_y=1, padding='SAME',
                                         init_biases_with_the_constant_1=True)

    # Layer 5.
    # [Input] ==> 13x13x384
    # --> 13x13x384 ==> [Convolution: size=(3x3x384)x256, strides=1, padding=same] ==> 13x13x256
    # --> 13x13x256 ==> [ReLU] ==> 13x13x256
    # --> 13x13x256 ==> [Max-Pool: size=3x3, strides=2, padding=valid] ==> 6x6x256
    # --> [Output] ==> 6x6x256
    # Note: 128*2=256, One GPU runs the layer-parts at the top while the other runs the layer-parts at the bottom.
    with tf.name_scope('layer5'):
        layer5_activations = self.__conv(input=layer4_activations, filter_width=3,
                                         filter_height=3, filters_count=256,
                                         stride_x=1, stride_y=1, padding='SAME',
                                         init_biases_with_the_constant_1=True)
        layer5_pool = self.__max_pool(input=layer5_activations, filter_width=3,
                                      filter_height=3, stride_x=2, stride_y=2,
                                      padding='VALID')

    # Layer 6.
    # [Input] ==> 6x6x256=9216
    # --> 9216 ==> [Fully Connected: neurons=4096] ==> 4096
    # --> 4096 ==> [ReLU] ==> 4096
    # --> 4096 ==> [Dropout] ==> 4096
    # --> [Output] ==> 4096
    # Note: 2048*2=4096, One GPU runs the layer-parts at the top while the other runs the layer-parts at the bottom.
    with tf.name_scope('layer6'):
        pool5_shape = layer5_pool.get_shape().as_list()
        flattened_input_size = pool5_shape[1] * pool5_shape[2] * pool5_shape[3]
        layer6_fc = self.__fully_connected(input=tf.reshape(layer5_pool,
                                                            shape=[-1, flattened_input_size]),
                                           inputs_count=flattened_input_size,
                                           outputs_count=4096, relu=True,
                                           init_biases_with_the_constant_1=True)
        layer6_dropout = self.__dropout(input=layer6_fc)

    # Layer 7.
    # [Input] ==> 4096
    # --> 4096 ==> [Fully Connected: neurons=4096] ==> 4096
    # --> 4096 ==> [ReLU] ==> 4096
    # --> 4096 ==> [Dropout] ==> 4096
    # --> [Output] ==> 4096
    # Note: 2048*2=4096, One GPU runs the layer-parts at the top while the other runs the layer-parts at the bottom.
    with tf.name_scope('layer7'):
        layer7_fc = self.__fully_connected(input=layer6_dropout, inputs_count=4096,
                                           outputs_count=4096, relu=True,
                                           init_biases_with_the_constant_1=True)
        layer7_dropout = self.__dropout(input=layer7_fc)

    # Layer 8.
    # [Input] ==> 4096
    # --> 4096 ==> [Logits: neurons=1000] ==> 1000
    # --> [Output] ==> 1000
    with tf.name_scope('layer8'):
        layer8_logits = self.__fully_connected(input=layer7_dropout, inputs_count=4096,
                                               outputs_count=self.num_classes, relu=False,
                                               name='logits')

    # Cross Entropy.
    with tf.name_scope('cross_entropy'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer8_logits,
                                                                   labels=self.Y,
                                                                   name='cross_entropy')
        self.__variable_summaries(cross_entropy)

    # Training.
    with tf.name_scope('training'):
        loss_operation = tf.reduce_mean(cross_entropy, name='loss_operation')
        tf.summary.scalar(name='loss', tensor=loss_operation)

        optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate,
                                               momentum=self.momentum)

        # self.training_operation = optimizer.minimize(loss_operation, name='training_operation')
        grads_and_vars = optimizer.compute_gradients(loss_operation)
        self.training_operation = optimizer.apply_gradients(grads_and_vars,
                                                            name='training_operation')

        for grad, var in grads_and_vars:
            if grad is not None:
                with tf.name_scope(var.op.name + '/gradients'):
                    self.__variable_summaries(grad)

    # Accuracy.
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(layer8_logits, 1), tf.argmax(self.Y, 1),
                                      name='correct_prediction')
        self.accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                                 name='accuracy_operation')
        tf.summary.scalar(name='accuracy', tensor=self.accuracy_operation)
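# A minimal training-step sketch for the class built above. The enclosing class
# declaration is not part of this snippet, so the name AlexNet and the numpy
# arrays batch_images / batch_labels are assumptions for illustration only.
net = AlexNet()  # hypothetical class name
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, acc = sess.run([net.training_operation, net.accuracy_operation],
                      feed_dict={net.X: batch_images,
                                 net.Y: batch_labels,
                                 net.dropout_keep_prob: net.keep_prob})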
def max_pool_2x2(x):  # signature restored from the call sites below
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')


mnist = input_data.read_data_sets("data/", one_hot=True)

g = tf.Graph()
with g.as_default():
    x = tf.placeholder("float", shape=[None, 784])
    y_ = tf.placeholder("float", shape=[None, 10])

    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    b_fc1 = bias_variable([1024])
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    keep_prob = tf.placeholder("float")
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
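# weight_variable, bias_variable and conv2d are referenced above but not shown
# in this snippet; conventional definitions consistent with those calls are:
def weight_variable(shape):
    # Small random initialization breaks symmetry between units.
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    # Slightly positive bias keeps ReLUs active at the start of training.
    return tf.Variable(tf.constant(0.1, shape=shape))


def conv2d(x, W):
    # Stride-1 convolution with SAME padding preserves spatial size.
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')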
    tf.Variable(tf.truncated_normal(fc_connection_shapes["f3_shape"][3]),
                name="f3_biases")
}

dataset_dict["total_image_size"] = dataset_dict["image_size"] * dataset_dict["image_size"]

# Declare the input and output placeholders
input_img = tf.placeholder(tf.float32,
                           shape=[BATCH_SIZE,
                                  dataset_dict["image_size"],
                                  dataset_dict["image_size"],
                                  dataset_dict["num_channels"]])
img_4d_shaped = tf.reshape(input_img, [-1,
                                       dataset_dict["image_size"],
                                       dataset_dict["image_size"],
                                       dataset_dict["num_channels"]])
labels = tf.placeholder(tf.float32, shape=[None, dataset_dict["num_labels"]])

# Convolution Layer 1 | Response Normalization | Max Pooling | ReLU
c_layer_1 = tf.nn.conv2d(img_4d_shaped, conv_weights["c1_weights"],
                         strides=[1, 4, 4, 1], padding="SAME", name="c_layer_1")
c_layer_1 += conv_biases["c1_biases"]
c_layer_1 = tf.nn.relu(c_layer_1)
c_layer_1 = tf.nn.lrn(c_layer_1, depth_radius=N_DEPTH_RADIUS, bias=K_BIAS, alpha=ALPHA,
                      beta=BETA)  # assumption: the snippet was truncated after alpha=ALPHA
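# For reference, tf.nn.lrn computes (per the TensorFlow documentation):
#   sqr_sum[a, b, c, d] = sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
#   output = input / (bias + alpha * sqr_sum) ** beta
# so N_DEPTH_RADIUS sets the normalization window over channels, K_BIAS the
# additive constant, and ALPHA / BETA the scaling and the exponent.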
def main(trainModel=True, buildConfusionMatrix=True, restore=False, buildClassifiedMatrix=True):
    tf.disable_v2_behavior()

    input_images = tf.placeholder(tf.float32, [None, 28, 28], name="Input")
    real = tf.placeholder(tf.float32, [None, CLASSES], name="real_classes")

    layer1 = create_conv_layer(tf.reshape(input_images, [-1, 28, 28, 1]), 1, 28, [5, 5], [2, 2],
                               name="conv_no_pool")
    layer2 = create_conv_layer(layer1, 28, 56, [5, 5], [2, 2], name='conv_with_pool')
    conv_result = tf.reshape(layer2, [-1, 7 * 7 * 56])

    relu_layer_weight = tf.Variable(tf.truncated_normal([7 * 7 * 56, 1000], stddev=STDDEV * 2),
                                    name='relu_layer_weight')
    relu_layer_bias = tf.Variable(tf.truncated_normal([1000], stddev=STDDEV / 2),
                                  name='relu_layer_bias')
    relu_layer = tf.matmul(conv_result, relu_layer_weight) + relu_layer_bias
    relu_layer = tf.nn.relu(relu_layer)
    relu_layer = tf.nn.dropout(relu_layer, DROPOUT)

    final_layer_weight = tf.Variable(tf.truncated_normal([1000, CLASSES], stddev=STDDEV * 2),
                                     name='final_layer_weight')
    final_layer_bias = tf.Variable(tf.truncated_normal([CLASSES], stddev=STDDEV / 2),
                                   name='final_layer_bias')
    final_layer = tf.matmul(relu_layer, final_layer_weight) + final_layer_bias

    predicts = tf.nn.softmax(final_layer)
    predicts_for_log = tf.clip_by_value(predicts, 1e-9, 0.999999999)

    # Binary cross-entropy summed over the classes, averaged over the batch:
    # crossEntropy = -tf.reduce_mean(tf.reduce_sum(y * tf.log(y_clipped) + (1 - y) * tf.log(1 - y_clipped), axis=1))
    loss = -tf.reduce_mean(
        tf.reduce_sum(real * tf.log(predicts_for_log) +
                      (1 - real) * tf.log(1 - predicts_for_log), axis=1),
        axis=0)
    # Alternative: loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=final_layer, labels=real))

    optimiser = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

    correct_prediction = tf.equal(tf.argmax(real, axis=1), tf.argmax(predicts, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    confusion_matrix = tf.confusion_matrix(labels=tf.argmax(real, axis=1),
                                           predictions=tf.argmax(predicts, axis=1),
                                           num_classes=CLASSES)

    saver = tf.train.Saver()
    # dataset = get_mnist_dataset()
    dataset = get_fashion_dataset()

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        if restore:
            saver.restore(session, SAVE_PATH)
        if trainModel:
            train(input_images, real, session, optimiser, loss, accuracy, saver, dataset)
        if buildConfusionMatrix:
            test_cm = session.run(confusion_matrix,
                                  feed_dict={input_images: dataset.test_x,
                                             real: dataset.test_y})
            draw_confusion_matrix(test_cm)
        if buildClassifiedMatrix:
            all_probs = session.run(predicts,
                                    feed_dict={input_images: dataset.test_x,
                                               real: dataset.test_y})
            max_failure_picture_index = [[(-1, -1.0)] * CLASSES for _ in range(CLASSES)]
            for i in range(len(all_probs)):
                # True class of example i; renamed from `real` so the placeholder is not shadowed.
                real_class = np.argmax(dataset.test_y[i])
                for j in range(CLASSES):
                    if max_failure_picture_index[real_class][j][1] < all_probs[i][j]:
                        max_failure_picture_index[real_class][j] = (i, all_probs[i][j])
            draw_max_failure_pictures(dataset.test_x, max_failure_picture_index)
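# A typical entry point for the script above (assumed; not shown in the snippet):
if __name__ == '__main__':
    main()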
def merge_states(x):
    """Smash the last two dimensions of x into a single dimension."""
    *start, a, b = shape_list(x)
    return tf.reshape(x, start + [a * b])
def split_states(x, n):
    """Reshape the last dimension of x into [n, x.shape[-1]/n]."""
    *start, m = shape_list(x)
    return tf.reshape(x, start + [n, m // n])
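# A round-trip sketch for the two helpers above, as used for multi-head
# attention (sizes and the head count are made up; shape_list is assumed to be
# the helper used throughout this file):
x = tf.zeros([2, 10, 64])          # [batch, sequence, features]
heads = 8
per_head = split_states(x, heads)  # -> [2, 10, 8, 8], one slice per head
restored = merge_states(per_head)  # -> back to [2, 10, 64]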