def __conv(self, input, filter_width, filter_height, filters_count, stride_x, stride_y,
           padding='VALID', init_biases_with_the_constant_1=False, name='conv'):
    with tf.name_scope(name):
        input_channels = input.get_shape().as_list()[-1]
        filters = tf.Variable(
            self.__random_values(shape=[filter_height, filter_width, input_channels, filters_count]),
            name='filters')
        convs = tf.nn.conv2d(input=input, filter=filters, strides=[1, stride_y, stride_x, 1],
                             padding=padding, name='convs')
        if init_biases_with_the_constant_1:
            biases = tf.Variable(tf.ones(shape=[filters_count], dtype=tf.float32), name='biases')
        else:
            biases = tf.Variable(tf.zeros(shape=[filters_count], dtype=tf.float32), name='biases')
        preactivations = tf.nn.bias_add(convs, biases, name='preactivations')
        activations = tf.nn.relu(preactivations, name='activations')

        with tf.name_scope('filter_summaries'):
            self.__variable_summaries(filters)
        with tf.name_scope('bias_summaries'):
            self.__variable_summaries(biases)
        with tf.name_scope('preactivations_histogram'):
            tf.summary.histogram('preactivations', preactivations)
        with tf.name_scope('activations_histogram'):
            tf.summary.histogram('activations', activations)

        return activations
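# A quick sanity check on the shape arithmetic used in the AlexNet layer comments below:
# with 'VALID' padding the spatial output size is (in - filter) // stride + 1, and with
# 'SAME' padding it is ceil(in / stride). A minimal sketch (the helper name is ours, for
# illustration only):
import math

def conv_output_size(in_size, filter_size, stride, padding='VALID'):
    """Spatial output size of a conv/pool layer, matching tf.nn.conv2d/tf.nn.max_pool."""
    if padding == 'VALID':
        return (in_size - filter_size) // stride + 1
    return int(math.ceil(in_size / float(stride)))  # 'SAME' ignores the filter size

assert conv_output_size(227, 11, 4) == 55          # layer 1 conv: 227 -> 55
assert conv_output_size(55, 3, 2) == 27            # layer 1 max-pool: 55 -> 27
assert conv_output_size(27, 5, 1, 'SAME') == 27    # layer 2 conv keeps 27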
def sample_sequence(*, hparams, length, start_token=None, batch_size=None,
                    context=None, temperature=1, top_k=0, top_p=0.0):
    if start_token is None:
        assert context is not None, 'Specify exactly one of start_token and context!'
    else:
        assert context is None, 'Specify exactly one of start_token and context!'
        context = tf.fill([batch_size, 1], start_token)

    def step(hparams, tokens, past=None):
        lm_output = model.model(hparams=hparams, X=tokens, past=past, reuse=tf.AUTO_REUSE)

        logits = lm_output['logits'][:, :, :hparams.n_vocab]
        presents = lm_output['present']
        presents.set_shape(model.past_shape(hparams=hparams, batch_size=batch_size))
        return {
            'logits': logits,
            'presents': presents,
        }

    with tf.name_scope('sample_sequence'):
        # Don't feed the last context token -- leave that to the loop below.
        # TODO: Would be slightly faster if we called step on the entire context,
        # rather than leaving the last token transformer calculation to the while loop.
        context_output = step(hparams, context[:, :-1])

        def body(past, prev, output):
            next_outputs = step(hparams, prev[:, tf.newaxis], past=past)
            logits = next_outputs['logits'][:, -1, :] / tf.to_float(temperature)
            if top_p > 0.0:
                logits = top_p_logits(logits, p=top_p)
            else:
                logits = top_k_logits(logits, k=top_k)
            samples = tf.multinomial(logits, num_samples=1, output_dtype=tf.int32)
            return [
                tf.concat([past, next_outputs['presents']], axis=-2),
                tf.squeeze(samples, axis=[1]),
                tf.concat([output, samples], axis=1),
            ]

        def cond(*args):
            return True

        _, _, tokens = tf.while_loop(
            cond=cond, body=body,
            maximum_iterations=length,
            loop_vars=[
                context_output['presents'],
                context[:, -1],
                context,
            ],
            shape_invariants=[
                tf.TensorShape(model.past_shape(hparams=hparams, batch_size=batch_size)),
                tf.TensorShape([batch_size]),
                tf.TensorShape([batch_size, None]),
            ],
            back_prop=False,
        )

        return tokens
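# For context, a hedged sketch of how this sampler is typically wired up. The names
# `model.default_hparams`, the `encoder` loader, and the checkpoint path are assumptions
# borrowed from the GPT-2-style repo layout this function comes from, not defined here.
import tensorflow as tf

hparams = model.default_hparams()            # assumed helper on the external `model` module
enc = encoder.get_encoder('124M', 'models')  # hypothetical encoder loader

with tf.Session() as sess:
    context_tokens = tf.placeholder(tf.int32, [1, None])
    output = sample_sequence(
        hparams=hparams, length=40,
        context=context_tokens,              # exactly one of context / start_token
        batch_size=1, temperature=0.8, top_k=40)

    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('models/124M'))

    tokens = sess.run(output, feed_dict={context_tokens: [enc.encode('Hello, world')]})
    print(enc.decode(tokens[0]))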
def __local_response_normalization(self, input, name='lrn'):
    # From article: Local Response Normalization: we used k=2, n=5, α=10^−4, and β=0.75.
    # Note: tf.nn.local_response_normalization sums over a window of depth_radius*2+1
    # channels, so depth_radius=2 corresponds to the article's n=5; the article's k maps
    # to the `bias` argument (left at its default here).
    with tf.name_scope(name):
        lrn = tf.nn.local_response_normalization(input=input,
                                                 depth_radius=2,
                                                 alpha=10 ** -4,
                                                 beta=0.75,
                                                 name='local_response_normalization')
        return lrn
def __fully_connected(self, input, inputs_count, outputs_count, relu=True,
                      init_biases_with_the_constant_1=False, name='fully_connected'):
    with tf.name_scope(name):
        weights = tf.Variable(self.__random_values(shape=[inputs_count, outputs_count]),
                              name='weights')
        if init_biases_with_the_constant_1:
            biases = tf.Variable(tf.ones(shape=[outputs_count], dtype=tf.float32), name='biases')
        else:
            biases = tf.Variable(tf.zeros(shape=[outputs_count], dtype=tf.float32), name='biases')
        preactivations = tf.nn.bias_add(tf.matmul(input, weights), biases, name='preactivations')
        if relu:
            activations = tf.nn.relu(preactivations, name='activations')

        with tf.name_scope('weight_summaries'):
            self.__variable_summaries(weights)
        with tf.name_scope('bias_summaries'):
            self.__variable_summaries(biases)
        with tf.name_scope('preactivations_histogram'):
            tf.summary.histogram('preactivations', preactivations)
        if relu:
            with tf.name_scope('activations_histogram'):
                tf.summary.histogram('activations', activations)

        if relu:
            return activations
        else:
            return preactivations
def __max_pool(self, input, filter_width, filter_height, stride_x, stride_y,
               padding='VALID', name='pool'):
    with tf.name_scope(name):
        pool = tf.nn.max_pool(input,
                              ksize=[1, filter_height, filter_width, 1],
                              strides=[1, stride_y, stride_x, 1],
                              padding=padding,
                              name='pool')
        return pool
@contextlib.contextmanager
def capture_ops():
    """Context manager to capture ops created in the block.

    with capture_ops() as ops:
        # create some ops
    print(ops)  # => prints ops created.
    """
    micros = int(time.time() * 10 ** 6)
    scope_name = str(micros)
    op_list = []
    with tf.name_scope(scope_name):
        yield op_list

    g = tf.get_default_graph()
    op_list.extend(ge.select_ops(scope_name + "/.*", graph=g))
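# A minimal runnable sketch of using capture_ops. The imports below (time, contextlib,
# and tf.contrib's graph_editor bound to `ge`) are the ones the function body relies on,
# under TF 1.x; the op names are just examples.
import contextlib
import time

import tensorflow as tf
from tensorflow.contrib import graph_editor as ge  # TF 1.x only

with capture_ops() as ops:
    a = tf.constant(1.0, name='a')
    b = tf.constant(2.0, name='b')
    c = tf.add(a, b, name='c')

# Ops were created under a unique timestamp scope, then collected by name pattern.
print([op.name for op in ops])  # e.g. ['<micros>/a', '<micros>/b', '<micros>/c']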
def pca(x, dim=2):
    '''
    x: input matrix (samples in rows, features in columns)
    dim: number of dimensions to keep after reduction
    '''
    with tf.name_scope("PCA"):
        m, n = x.get_shape().as_list()
        # `dim` must not exceed the feature dimension; checked statically since the
        # shape is known at graph-construction time.
        assert dim <= n, 'dim must be <= the number of input features'
        # Center each feature (column) before forming the covariance matrix.
        mean = tf.reduce_mean(x, axis=0)
        x_new = x - mean
        cov = tf.matmul(x_new, x_new, transpose_a=True) / (m - 1)
        e, v = tf.linalg.eigh(cov, name="eigh")
        e_index_sort = tf.math.top_k(e, sorted=True, k=dim)[1]
        # tf.linalg.eigh returns eigenvectors as the *columns* of v, so gather the
        # top-k columns and project onto them.
        v_new = tf.gather(v, indices=e_index_sort, axis=1)
        pca = tf.matmul(x_new, v_new)
    return pca
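# A quick usage sketch for pca; the data is arbitrary, just to exercise the graph.
import numpy as np
import tensorflow as tf

x = tf.constant(np.random.randn(100, 5), dtype=tf.float32)  # 100 samples, 5 features
reduced = pca(x, dim=2)

with tf.Session() as sess:
    result = sess.run(reduced)
    print(result.shape)  # (100, 2)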
def __init__(self, input_width=227, input_height=227, input_channels=3, num_classes=1000,
             learning_rate=0.01, momentum=0.9, keep_prob=0.5):

    # From article: The learning rate was initialized at 0.01.

    # From article: We trained our models using stochastic gradient descent with a batch size
    # of 128 examples, momentum of 0.9, and weight decay of 0.0005.

    # From article: We initialized the weights in each layer from a zero-mean Gaussian
    # distribution with standard deviation 0.01.

    self.input_width = input_width
    self.input_height = input_height
    self.input_channels = input_channels
    self.num_classes = num_classes
    self.learning_rate = learning_rate
    self.momentum = momentum
    self.keep_prob = keep_prob

    self.random_mean = 0
    self.random_stddev = 0.01

    # ----------------------------------------------------------------------------------------

    # From article: We initialized the neuron biases in the second, fourth, and fifth
    # convolutional layers, as well as in the fully-connected hidden layers, with the
    # constant 1. ... We initialized the neuron biases in the remaining layers with the
    # constant 0.

    # Input: 227x227x3.
    with tf.name_scope('input'):
        self.X = tf.placeholder(dtype=tf.float32,
                                shape=[None, self.input_height, self.input_width,
                                       self.input_channels],
                                name='X')

    # Labels: 1000.
    with tf.name_scope('labels'):
        self.Y = tf.placeholder(dtype=tf.float32, shape=[None, self.num_classes], name='Y')

    # Dropout keep prob.
    with tf.name_scope('dropout'):
        self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, shape=(),
                                                name='dropout_keep_prob')

    # Layer 1.
    # [Input] ==> 227x227x3
    # --> 227x227x3 ==> [Convolution: size=(11x11x3)x96, strides=4, padding=valid] ==> 55x55x96
    # --> 55x55x96 ==> [ReLU] ==> 55x55x96
    # --> 55x55x96 ==> [Local Response Normalization] ==> 55x55x96
    # --> 55x55x96 ==> [Max-Pool: size=3x3, strides=2, padding=valid] ==> 27x27x96
    # --> [Output] ==> 27x27x96
    # Note: 48*2=96. One GPU runs the layer-parts at the top while the other runs the
    # layer-parts at the bottom.
    with tf.name_scope('layer1'):
        layer1_activations = self.__conv(input=self.X, filter_width=11, filter_height=11,
                                         filters_count=96, stride_x=4, stride_y=4,
                                         padding='VALID', init_biases_with_the_constant_1=False)
        layer1_lrn = self.__local_response_normalization(input=layer1_activations)
        layer1_pool = self.__max_pool(input=layer1_lrn, filter_width=3, filter_height=3,
                                      stride_x=2, stride_y=2, padding='VALID')

    # Layer 2.
    # [Input] ==> 27x27x96
    # --> 27x27x96 ==> [Convolution: size=(5x5x96)x256, strides=1, padding=same] ==> 27x27x256
    # --> 27x27x256 ==> [ReLU] ==> 27x27x256
    # --> 27x27x256 ==> [Local Response Normalization] ==> 27x27x256
    # --> 27x27x256 ==> [Max-Pool: size=3x3, strides=2, padding=valid] ==> 13x13x256
    # --> [Output] ==> 13x13x256
    # Note: 128*2=256. One GPU runs the layer-parts at the top while the other runs the
    # layer-parts at the bottom.
    with tf.name_scope('layer2'):
        layer2_activations = self.__conv(input=layer1_pool, filter_width=5, filter_height=5,
                                         filters_count=256, stride_x=1, stride_y=1,
                                         padding='SAME', init_biases_with_the_constant_1=True)
        layer2_lrn = self.__local_response_normalization(input=layer2_activations)
        layer2_pool = self.__max_pool(input=layer2_lrn, filter_width=3, filter_height=3,
                                      stride_x=2, stride_y=2, padding='VALID')

    # Layer 3.
    # [Input] ==> 13x13x256
    # --> 13x13x256 ==> [Convolution: size=(3x3x256)x384, strides=1, padding=same] ==> 13x13x384
    # --> 13x13x384 ==> [ReLU] ==> 13x13x384
    # --> [Output] ==> 13x13x384
    # Note: 192*2=384. One GPU runs the layer-parts at the top while the other runs the
    # layer-parts at the bottom.
    with tf.name_scope('layer3'):
        layer3_activations = self.__conv(input=layer2_pool, filter_width=3, filter_height=3,
                                         filters_count=384, stride_x=1, stride_y=1,
                                         padding='SAME', init_biases_with_the_constant_1=False)

    # Layer 4.
    # [Input] ==> 13x13x384
    # --> 13x13x384 ==> [Convolution: size=(3x3x384)x384, strides=1, padding=same] ==> 13x13x384
    # --> 13x13x384 ==> [ReLU] ==> 13x13x384
    # --> [Output] ==> 13x13x384
    # Note: 192*2=384. One GPU runs the layer-parts at the top while the other runs the
    # layer-parts at the bottom.
    with tf.name_scope('layer4'):
        layer4_activations = self.__conv(input=layer3_activations, filter_width=3,
                                         filter_height=3, filters_count=384, stride_x=1,
                                         stride_y=1, padding='SAME',
                                         init_biases_with_the_constant_1=True)

    # Layer 5.
    # [Input] ==> 13x13x384
    # --> 13x13x384 ==> [Convolution: size=(3x3x384)x256, strides=1, padding=same] ==> 13x13x256
    # --> 13x13x256 ==> [ReLU] ==> 13x13x256
    # --> 13x13x256 ==> [Max-Pool: size=3x3, strides=2, padding=valid] ==> 6x6x256
    # --> [Output] ==> 6x6x256
    # Note: 128*2=256. One GPU runs the layer-parts at the top while the other runs the
    # layer-parts at the bottom.
    with tf.name_scope('layer5'):
        layer5_activations = self.__conv(input=layer4_activations, filter_width=3,
                                         filter_height=3, filters_count=256, stride_x=1,
                                         stride_y=1, padding='SAME',
                                         init_biases_with_the_constant_1=True)
        layer5_pool = self.__max_pool(input=layer5_activations, filter_width=3, filter_height=3,
                                      stride_x=2, stride_y=2, padding='VALID')

    # Layer 6.
    # [Input] ==> 6x6x256=9216
    # --> 9216 ==> [Fully Connected: neurons=4096] ==> 4096
    # --> 4096 ==> [ReLU] ==> 4096
    # --> 4096 ==> [Dropout] ==> 4096
    # --> [Output] ==> 4096
    # Note: 2048*2=4096. One GPU runs the layer-parts at the top while the other runs the
    # layer-parts at the bottom.
    with tf.name_scope('layer6'):
        pool5_shape = layer5_pool.get_shape().as_list()
        flattened_input_size = pool5_shape[1] * pool5_shape[2] * pool5_shape[3]
        layer6_fc = self.__fully_connected(
            input=tf.reshape(layer5_pool, shape=[-1, flattened_input_size]),
            inputs_count=flattened_input_size, outputs_count=4096, relu=True,
            init_biases_with_the_constant_1=True)
        layer6_dropout = self.__dropout(input=layer6_fc)

    # Layer 7.
    # [Input] ==> 4096
    # --> 4096 ==> [Fully Connected: neurons=4096] ==> 4096
    # --> 4096 ==> [ReLU] ==> 4096
    # --> 4096 ==> [Dropout] ==> 4096
    # --> [Output] ==> 4096
    # Note: 2048*2=4096. One GPU runs the layer-parts at the top while the other runs the
    # layer-parts at the bottom.
    with tf.name_scope('layer7'):
        layer7_fc = self.__fully_connected(input=layer6_dropout, inputs_count=4096,
                                           outputs_count=4096, relu=True,
                                           init_biases_with_the_constant_1=True)
        layer7_dropout = self.__dropout(input=layer7_fc)

    # Layer 8.
    # [Input] ==> 4096
    # --> 4096 ==> [Logits: neurons=1000] ==> 1000
    # --> [Output] ==> 1000
    with tf.name_scope('layer8'):
        layer8_logits = self.__fully_connected(input=layer7_dropout, inputs_count=4096,
                                               outputs_count=self.num_classes, relu=False,
                                               name='logits')

    # Cross Entropy.
    with tf.name_scope('cross_entropy'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer8_logits,
                                                                   labels=self.Y,
                                                                   name='cross_entropy')
        self.__variable_summaries(cross_entropy)

    # Training.
    with tf.name_scope('training'):
        loss_operation = tf.reduce_mean(cross_entropy, name='loss_operation')
        tf.summary.scalar(name='loss', tensor=loss_operation)

        optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate,
                                               momentum=self.momentum)

        # self.training_operation = optimizer.minimize(loss_operation, name='training_operation')

        grads_and_vars = optimizer.compute_gradients(loss_operation)
        self.training_operation = optimizer.apply_gradients(grads_and_vars,
                                                            name='training_operation')

        for grad, var in grads_and_vars:
            if grad is not None:
                with tf.name_scope(var.op.name + '/gradients'):
                    self.__variable_summaries(grad)

    # Accuracy.
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(layer8_logits, 1), tf.argmax(self.Y, 1),
                                      name='correct_prediction')
        self.accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                                 name='accuracy_operation')
        tf.summary.scalar(name='accuracy', tensor=self.accuracy_operation)
def __dropout(self, input, name='dropout'):
    with tf.name_scope(name):
        return tf.nn.dropout(input, keep_prob=self.dropout_keep_prob, name='dropout')
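# To tie the pieces together, a hedged sketch of driving this model for one training step.
# The class name `AlexNet` and the random image/label arrays are assumptions for
# illustration; the surrounding class definition is not shown in this section.
import numpy as np
import tensorflow as tf

model = AlexNet(num_classes=1000, learning_rate=0.01, momentum=0.9, keep_prob=0.5)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Arbitrary data, just to exercise the graph (batch of 128 per the article).
    images = np.random.rand(128, 227, 227, 3).astype(np.float32)
    labels = np.eye(1000)[np.random.randint(0, 1000, size=128)].astype(np.float32)

    # One SGD-with-momentum step; dropout keep prob of 0.5 during training, per the article.
    sess.run(model.training_operation,
             feed_dict={model.X: images, model.Y: labels,
                        model.dropout_keep_prob: model.keep_prob})

    # Evaluation: disable dropout by keeping every unit.
    acc = sess.run(model.accuracy_operation,
                   feed_dict={model.X: images, model.Y: labels,
                              model.dropout_keep_prob: 1.0})
    print('accuracy:', acc)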