def _build_graph(self, inputs):
    input, label = inputs
    fw, fa, fg = get_dorefa(FLAGS.bit_w, FLAGS.bit_a, 32)
    old_get_variable = tf.get_variable

    # monkey-patch tf.get_variable to apply fw
    def new_get_variable(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'fc0' in name or 'fct' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x):
        if FLAGS.bit_a == 32 and not FLAGS.use_clip:
            return tf.nn.relu(x)    # still use relu for 32bit cases
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    activations = []
    with remap_variables(new_get_variable), \
            argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
        curr_layer = LinearWrap(input)
        for i in range(FLAGS.n_layers):
            curr_layer = (curr_layer
                          .FullyConnected('fc' + str(i), FLAGS.state_size)
                          .apply(activate))
            activations.append(curr_layer.tensor())
            curr_layer = (curr_layer
                          .BatchNorm('bn_fc' + str(i))
                          .Dropout('dropout', FLAGS.dropout))
        logits = (curr_layer
                  .FullyConnected('fc' + str(FLAGS.n_layers), 256)
                  .apply(nonlin)
                  .BatchNorm('bnfc' + str(FLAGS.n_layers))
                  .FullyConnected('fct', self.n_spks, use_bias=True)())

    print_all_tf_vars()

    prob = tf.nn.softmax(logits, name='output')

    # used for validation accuracy of utterance
    identity_guesses = flatten(tf.argmax(prob, axis=1))
    uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
    idx_to_identity_with_most_votes = tf.argmax(count)
    chosen_identity = tf.gather(uniq_identities, idx_to_identity_with_most_votes)
    wrong = tf.expand_dims(tf.not_equal(chosen_identity, tf.cast(label[0], tf.int64)),
                           axis=0, name='utt-wrong')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost')
    add_param_summary(('.*/W', ['histogram', 'rms']))
    self.cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, self.cost)

    for activation in activations:
        add_activation_summary(activation)
        tf.summary.histogram(activation.name, activation)
def build_graph(self, image, label):
    is_training = get_current_tower_context().is_training

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fc' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def cabs(x):
        return tf.minimum(1.0, tf.abs(x), name='cabs')

    def activate(x):
        return fa(cabs(x))

    image = image / 256.0

    with remap_variables(binarize_weight), \
            argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                  .MaxPooling('pool0', 2, padding='SAME')
                  .apply(activate)
                  # 18
                  .Conv2D('conv1', 64, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn1').apply(activate)
                  .Conv2D('conv2', 64, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool1', 2, padding='SAME')
                  .apply(activate)
                  # 9
                  .Conv2D('conv3', 128, 3, padding='VALID')
                  .apply(fg)
                  .BatchNorm('bn3').apply(activate)
                  # 7
                  .Conv2D('conv4', 128, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn4').apply(activate)
                  .Conv2D('conv5', 128, 3, padding='VALID')
                  .apply(fg)
                  .BatchNorm('bn5').apply(activate)
                  # 5
                  .tf.nn.dropout(0.5 if is_training else 1.0)
                  .Conv2D('conv6', 512, 5, padding='VALID')
                  .apply(fg).BatchNorm('bn6')
                  .apply(cabs)
                  .FullyConnected('fc1', 10)())
    tf.nn.softmax(logits, name='output')

    # compute the number of failed samples
    wrong = prediction_incorrect(logits, label)
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')
    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
    add_param_summary(('.*/W', ['histogram', 'rms']))

    total_cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, total_cost)
    return total_cost
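
# Every snippet in this collection calls get_dorefa() from DoReFa-Net's dorefa.py.
# For reference, below is a minimal sketch of the three quantizers it returns. It
# assumes the usual straight-through-estimator formulation (identity gradient
# through tf.round); it is illustrative, not the exact upstream implementation,
# and the gradient quantizer fg is left as a pass-through here.
def get_dorefa_sketch(bitW, bitA, bitG):
    def quantize(x, k):
        n = float(2 ** k - 1)

        @tf.custom_gradient
        def _quantize(x):
            # round x in [0, 1] to k-bit fixed point; gradient passes straight through
            return tf.round(x * n) / n, lambda dy: dy

        return _quantize(x)

    def fw(x):
        if bitW == 32:
            return x
        x = tf.tanh(x)
        x = x / tf.reduce_max(tf.abs(x)) * 0.5 + 0.5   # map weights into [0, 1]
        return 2 * quantize(x, bitW) - 1               # quantize, map back to [-1, 1]

    def fa(x):
        # activations are assumed already clipped into [0, 1] by the caller
        # (cabs() or clip_by_value in the snippets above)
        return x if bitA == 32 else quantize(x, bitA)

    def fg(x):
        return x  # gradient quantization omitted in this sketch

    return fw, fa, fg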
def _build_graph(self, inputs):
    image, label = inputs
    # add a single (grayscale) channel dimension
    image = tf.expand_dims(image, 3)

    image = image * 256
    image = tf.round(image)

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)
    old_get_variable = tf.get_variable

    def monitor(x, name):
        if MONITOR == 1:
            return tf.Print(x, [x], message='\n\n' + name + ': ',
                            summarize=1000, name=name)
        else:
            return x

    def new_get_variable(v):
        name = v.op.name
        # don't quantize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fc1' in name:
            return v
        else:
            logger.info("Quantizing weight {}".format(v.op.name))
            if MONITOR == 1:
                return tf.Print(fw(v), [fw(v)],
                                message='\n\n' + v.name + ', Quantized weights are:',
                                summarize=100)
            else:
                return fw(v)

    def activate(x):
        if BITA == 32:
            return tf.nn.relu(x)
        else:
            return fa(tf.nn.relu(x))

    with remap_variables(new_get_variable), \
            argscope(Conv2D, kernel_shape=3, use_bias=False,
                     nl=tf.identity, out_channel=32):
        logits = (LinearWrap(image)
                  .apply(monitor, 'image_out')
                  .Conv2D('conv0')
                  .apply(fg).BatchNorm('bn0').apply(activate)
                  .apply(monitor, 'conv0_out')
                  .MaxPooling('pool0', 2)
                  .apply(monitor, 'pool0_out')
                  .Conv2D('conv1')
                  .apply(fg).BatchNorm('bn1').apply(activate)
                  .apply(monitor, 'conv1_out')
                  .Conv2D('conv2')
                  .apply(fg).BatchNorm('bn2').apply(activate)
                  .apply(monitor, 'conv2_out')
                  .MaxPooling('pool1', 2)
                  .apply(monitor, 'pool1_out')
                  .Conv2D('conv3')
                  .apply(fg).BatchNorm('bn3').apply(activate)
                  .apply(monitor, 'conv3_out')
                  .FullyConnected('fc0', use_bias=False, out_dim=20, nl=tf.identity)
                  .apply(activate)
                  .apply(monitor, 'fc0_out')
                  .FullyConnected('fc1', use_bias=False, out_dim=10, nl=tf.identity)
                  .apply(monitor, 'fc1_out')())

    prob = tf.nn.softmax(logits, name='prob')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = symbf.prediction_incorrect(logits, label, name='incorrect')
    accuracy = symbf.accuracy(logits, label, name='accuracy')

    train_error = tf.reduce_mean(wrong, name='train_error')
    summary.add_moving_summary(train_error, accuracy)

    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    self.cost = tf.add_n([wd_cost, cost], name='total_cost')
    summary.add_moving_summary(cost, wd_cost, self.cost)
def _build_graph(self, input_vars, is_training):
    image, label = input_vars

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    old_get_variable = tf.get_variable

    def new_get_variable(name, shape=None, **kwargs):
        v = old_get_variable(name, shape, **kwargs)
        # don't binarize first and last layer
        if name != 'W' or 'conv0' in v.op.name or 'fc' in v.op.name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)
    tf.get_variable = new_get_variable

    def cabs(x):
        return tf.minimum(1.0, tf.abs(x), name='cabs')

    def activate(x):
        return fa(cabs(x))

    image = image / 256.0

    with argscope(BatchNorm, decay=0.9, epsilon=1e-4, use_local_stat=is_training), \
            argscope(Conv2D, use_bias=False, nl=tf.identity):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                  .MaxPooling('pool0', 2, padding='SAME')
                  .apply(activate)
                  # 18
                  .Conv2D('conv1', 64, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn1').apply(activate)
                  .Conv2D('conv2', 64, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool1', 2, padding='SAME')
                  .apply(activate)
                  # 9
                  .Conv2D('conv3', 128, 3, padding='VALID')
                  .apply(fg)
                  .BatchNorm('bn3').apply(activate)
                  # 7
                  .Conv2D('conv4', 128, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn4').apply(activate)
                  .Conv2D('conv5', 128, 3, padding='VALID')
                  .apply(fg)
                  .BatchNorm('bn5').apply(activate)
                  # 5
                  .tf.nn.dropout(0.5 if is_training else 1.0)
                  .Conv2D('conv6', 512, 5, padding='VALID')
                  .apply(fg).BatchNorm('bn6')
                  .apply(cabs)
                  .FullyConnected('fc1', 10, nl=tf.identity)())
    tf.get_variable = old_get_variable

    prob = tf.nn.softmax(logits, name='output')

    # compute the number of failed samples
    wrong = prediction_incorrect(logits, label)
    nr_wrong = tf.reduce_sum(wrong, name='wrong')
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')
    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
    add_moving_summary(cost, wd_cost)

    add_param_summary([('.*/W', ['histogram', 'rms'])])
    self.cost = tf.add_n([cost, wd_cost], name='cost')
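
# prediction_incorrect(), used by most snippets here, comes from tensorpack's
# symbolic function utilities. A minimal equivalent under the usual top-k
# definition (an assumed sketch, for readers without tensorpack at hand):
def prediction_incorrect_sketch(logits, label, topk=1, name='incorrect_vector'):
    # 1.0 for every sample whose true label is NOT among the top-k predictions
    return tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, topk)),
                   tf.float32, name=name)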
def _build_graph(self, input_vars, is_training):
    image, label = input_vars
    image = image / 255.0

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    old_get_variable = tf.get_variable

    def new_get_variable(name, shape=None, **kwargs):
        v = old_get_variable(name, shape, **kwargs)
        # don't binarize first and last layer
        if name != 'W' or 'conv0' in v.op.name or 'fct' in v.op.name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)
    tf.get_variable = new_get_variable

    def nonlin(x):
        if BITA == 32:
            return tf.nn.relu(x)    # still use relu for 32bit cases
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    with argscope(BatchNorm, decay=0.9, epsilon=1e-4, use_local_stat=is_training), \
            argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 96, 12, stride=4, padding='VALID')
                  .apply(activate)
                  .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                  .apply(fg)
                  .BatchNorm('bn1')
                  .MaxPooling('pool1', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv2', 384, 3)
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool2', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv3', 384, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn3')
                  .apply(activate)
                  .Conv2D('conv4', 256, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn4')
                  .MaxPooling('pool4', 3, 2, padding='VALID')
                  .apply(activate)
                  .FullyConnected('fc0', 4096)
                  .apply(fg)
                  .BatchNorm('bnfc0')
                  .apply(activate)
                  .FullyConnected('fc1', 4096)
                  .apply(fg)
                  .BatchNorm('bnfc1')
                  .apply(nonlin)
                  .FullyConnected('fct', 1000, use_bias=True)())
    tf.get_variable = old_get_variable

    prob = tf.nn.softmax(logits, name='output')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label, 1)
    nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train_error_top1'))
    wrong = prediction_incorrect(logits, label, 5)
    nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
    add_moving_summary(tf.reduce_mean(wrong, name='train_error_top5'))

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
    add_moving_summary(cost, wd_cost)
    add_param_summary([('.*/W', ['histogram', 'rms'])])
    self.cost = tf.add_n([cost, wd_cost], name='cost')
def build_graph(self, image, label): """This function should build the model which takes the input variables (defined above) and return cost at the end.""" is_training = get_current_tower_context().is_training fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def binarize_weight(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'fc0' in name or 'fc_out' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) #FIXMEreturn tf.clip_by_value(x, 0.0, 1.0) return tf.clip_by_value(x, -1.0, 1.0) def activate(x): return fa(nonlin(x)) # The context manager `argscope` sets the default option for all the layers under # this context. Here we use 32 channel convolution with shape 3x3 # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html with remap_variables(binarize_weight), \ argscope(FullyConnected, use_bias=False), \ argscope(BatchNorm, momentum=0.9, epsilon=1e-4): # LinearWrap is just a syntax sugar. # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html logits = ( LinearWrap(image).Dropout('dropout_in', rate=0.2 if is_training else 0.0) # hidden 0 .FullyConnected( 'fc0', n_units).BatchNorm('bn0').apply(activate).Dropout( 'dropout_hidden0', rate=0.5 if is_training else 0.0) # hidden 1 .FullyConnected( 'fc1', n_units).BatchNorm('bn1').apply(activate).Dropout( 'dropout_hidden1', rate=0.5 if is_training else 0.0) # hidden 2 .FullyConnected( 'fc2', n_units).BatchNorm('bn2').apply(activate).Dropout( 'dropout_hidden2', rate=0.5 if is_training else 0.0) # output layer .FullyConnected('fc_out', 10, activation=tf.identity)()) # a vector of length B with loss of each sample cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean( cost, name='cross_entropy_loss') # the average cross-entropy loss correct = tf.cast(tf.nn.in_top_k(predictions=logits, targets=label, k=1), tf.float32, name='correct') accuracy = tf.reduce_mean(correct, name='accuracy') # This will monitor training error & accuracy (in a moving average fashion). The value will be automatically # 1. written to tensosrboard # 2. written to stat.json # 3. printed after each epoch # You can also just call `tf.summary.scalar`. But moving summary has some other benefits. # See tutorial at https://tensorpack.readthedocs.io/tutorial/summary.html train_error = tf.reduce_mean(1 - correct, name='train_error') summary.add_moving_summary(train_error, accuracy) # Use a regex to find parameters to apply weight decay. # Here we apply a weight decay on all W (weight matrix) of all fc layers # If you don't like regex, you can certainly define the cost in any other methods. wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss), name='regularize_loss') total_cost = tf.add_n([wd_cost, cost], name='total_cost') summary.add_moving_summary(cost, wd_cost, total_cost) # monitor histogram of all weight (of conv and fc layers) in tensorboard summary.add_param_summary(('.*/W', ['histogram', 'rms'])) # the function should return the total cost to be optimized return total_cost
def _conv_bn_layer(self,
                   layer_name,
                   inputs,
                   filters,
                   size,
                   stride,
                   padding='SAME',
                   use_bias=False,
                   freeze=False,
                   xavier=False,
                   relu=True,
                   activation_fn=tf.nn.relu,
                   stddev=0.001,
                   kernel_name='kernels',
                   bias_name='biases'):
    """Convolutional layer operation constructor.

    Args:
      layer_name: layer name.
      inputs: input tensor.
      filters: number of output filters.
      size: kernel size.
      stride: stride.
      padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description.
      use_bias: whether to add a bias term.
      freeze: if true, then do not train the parameters in this layer.
      xavier: whether to use xavier weight initializer or not.
      relu: whether to apply the activation function or not.
      activation_fn: activation function to apply when relu is true.
      stddev: standard deviation used for random weight initializer.
      kernel_name: variable name for the kernel.
      bias_name: variable name for the bias.
    Returns:
      A convolutional layer operation.
    """
    mc = self.mc
    use_pretrained_param = False
    if mc.LOAD_PRETRAINED_MODEL:
        cw = self.caffemodel_weight
        if layer_name in cw:
            # re-order the caffe kernel with shape [out, in, h, w]
            # -> tf kernel with shape [h, w, in, out]
            kernel_val = np.transpose(cw[layer_name][0], [2, 3, 1, 0])
            bias_val = cw[layer_name][1]
            # check the shape
            if (kernel_val.shape ==
                    (size, size, inputs.get_shape().as_list()[-1], filters)) \
                    and (bias_val.shape == (filters, )):
                use_pretrained_param = True
            else:
                print('Shape of the pretrained parameter of {} does not match, '
                      'use randomly initialized parameter'.format(layer_name))
        else:
            print('Cannot find {} in the pretrained model. Use randomly initialized '
                  'parameters'.format(layer_name))

    if mc.DEBUG_MODE:
        print('Input tensor shape to {}: {}'.format(layer_name, inputs.get_shape()))

    with tf.variable_scope(layer_name) as scope:
        channels = inputs.get_shape()[3]

        if use_pretrained_param:
            if mc.DEBUG_MODE:
                print('Using pretrained model for {}'.format(layer_name))
            kernel_init = tf.constant(kernel_val, dtype=tf.float32)
            bias_init = tf.constant(bias_val, dtype=tf.float32)
        elif xavier:
            kernel_init = tf.contrib.layers.xavier_initializer_conv2d()
            bias_init = tf.constant_initializer(0.0)
        else:
            kernel_init = tf.truncated_normal_initializer(stddev=stddev,
                                                          dtype=tf.float32)
            bias_init = tf.constant_initializer(0.0)

        kernel = _variable_with_weight_decay(
            kernel_name,
            shape=[size, size, int(channels), filters],
            wd=mc.WEIGHT_DECAY,
            initializer=kernel_init,
            trainable=(not freeze))
        # kernel_binary = binarize(kernel)

        if use_bias:
            biases = _variable_on_device(bias_name, [filters], bias_init,
                                         trainable=(not freeze))
            self.model_params += [kernel, biases]

        if mc.bDoreFa:
            fw, fa, fg = get_dorefa(mc.BITW, mc.BITA, mc.BITG)
            kernel = fw(kernel)
            if mc.BITA != 32:
                # inputs = tf.clip_by_value(inputs, 0.0, 1.0)
                inputs = inputs / tf.reduce_max(inputs)
                inputs = fa(inputs)

        if mc.bQuant and mc.bQuantWeights:
            kernel = self._quant_kernel_v1(mc, kernel)
        if mc.bQuant and mc.bQuantActivations:
            inputs = self._quant_activations(mc, inputs)

        conv = tf.nn.conv2d(inputs, kernel, [1, stride, stride, 1],
                            padding=padding, name='convolution')
        if use_bias:
            out0 = tf.nn.bias_add(conv, biases, name='bias_add')
        else:
            out0 = conv

        out0 = slim.batch_norm(out0, scope='BatchNorm')

        if relu:
            out = activation_fn(out0, 'relu')
        else:
            out = out0

        self.model_size_counter.append(
            (layer_name, (1 + size * size * int(channels)) * filters))
        out_shape = out.get_shape().as_list()
        num_flops = \
            (1 + 2 * int(channels) * size * size) * filters * out_shape[1] * out_shape[2]
        if relu:
            num_flops += 2 * filters * out_shape[1] * out_shape[2]
        self.flop_counter.append((layer_name, num_flops))

        self.activation_counter.append(
            (layer_name, out_shape[1] * out_shape[2] * out_shape[3]))

        return out
def build_graph(self, image, label):
    image = image / 256.0

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    def new_get_variable(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv1' in name or 'fct' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x):
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    def resblock(x, channel, stride):
        def get_stem_full(x):
            return (LinearWrap(x)
                    .Conv2D('c3x3a', channel, 3)
                    .BatchNorm('stembn')
                    .apply(activate)
                    .Conv2D('c3x3b', channel, 3)())

        channel_mismatch = channel != x.get_shape().as_list()[3]
        if stride != 1 or channel_mismatch or 'pool1' in x.name:
            # handling pool1 is to work around an architecture bug in our model
            if stride != 1 or 'pool1' in x.name:
                x = AvgPooling('pool', x, stride, stride)
            x = BatchNorm('bn', x)
            x = activate(x)
            shortcut = Conv2D('shortcut', x, channel, 1)
            stem = get_stem_full(x)
        else:
            shortcut = x
            x = BatchNorm('bn', x)
            x = activate(x)
            stem = get_stem_full(x)
        return shortcut + stem

    def group(x, name, channel, nr_block, stride):
        with tf.variable_scope(name + 'blk1'):
            x = resblock(x, channel, stride)
        for i in range(2, nr_block + 1):
            with tf.variable_scope(name + 'blk{}'.format(i)):
                x = resblock(x, channel, 1)
        return x

    with remap_variables(new_get_variable), \
            argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False, nl=tf.identity):
        logits = (LinearWrap(image)
                  # use explicit padding here, because our private training framework has
                  # different padding mechanisms from TensorFlow
                  .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]])
                  .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True)
                  .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC')
                  .MaxPooling('pool1', 3, 2, padding='VALID')
                  .apply(group, 'conv2', 64, 2, 1)
                  .apply(group, 'conv3', 128, 2, 2)
                  .apply(group, 'conv4', 256, 2, 2)
                  .apply(group, 'conv5', 512, 2, 2)
                  .BatchNorm('lastbn')
                  .apply(nonlin)
                  .GlobalAvgPooling('gap')
                  .tf.multiply(49)  # this is due to a bug in our model design
                  .FullyConnected('fct', 1000)())
    tf.nn.softmax(logits, name='output')
    # build_graph should return the training cost; compute_loss_and_error returns it
    return ImageNetModel.compute_loss_and_error(logits, label)
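
# remap_variables(fn), used throughout these snippets, arranges for fn to be applied
# to every variable at creation time, which is how the weight quantizer is spliced in
# without touching layer code. A minimal sketch via a TF1 custom getter (assumed
# equivalent in spirit to tensorpack's tfutils.varreplace.remap_variables):
from contextlib import contextmanager

@contextmanager
def remap_variables_sketch(fn):
    def custom_getter(getter, *args, **kwargs):
        # create the variable normally, then run it through fn (e.g. fw)
        return fn(getter(*args, **kwargs))

    with tf.variable_scope(tf.get_variable_scope(), custom_getter=custom_getter):
        yield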
def _build_graph(self, inputs):
    image, label = inputs
    image = tf.expand_dims(image, 3)
    image = image * 2 - 1   # center the pixels values at zero

    is_training = get_current_tower_context().is_training
    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fc' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def cabs(x):
        return tf.minimum(1.0, tf.abs(x), name='cabs')

    def activate(x):
        return fa(cabs(x))

    with remap_variables(binarize_weight), \
            argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
        logits = (LinearWrap(image)
                  .Conv2D('conv0')
                  .MaxPooling('pool0', 2)
                  .apply(activate)
                  .Conv2D('conv1')
                  .apply(fg)
                  .Conv2D('conv2')
                  .apply(fg)
                  .MaxPooling('pool1', 2)
                  .apply(activate)
                  .Conv2D('conv3')
                  .apply(fg)
                  .apply(cabs)
                  .FullyConnected('fc0', 512, activation=tf.nn.relu)
                  .Dropout('dropout', 0.5)
                  .FullyConnected('fc1', 10, activation=tf.identity)())
    tf.nn.softmax(logits, name='output')

    # a vector of length B with loss of each sample
    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss

    correct = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32, name='correct')
    accuracy = tf.reduce_mean(correct, name='accuracy')

    # This will monitor training error (in a moving_average fashion):
    # 1. write the value to tensorboard
    # 2. write the value to stat.json
    # 3. print the value after each epoch
    train_error = tf.reduce_mean(1 - correct, name='train_error')
    summary.add_moving_summary(train_error, accuracy)

    # Use a regex to find parameters to apply weight decay.
    # Here we apply a weight decay on all W (weight matrix) of all fc layers
    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    self.cost = tf.add_n([wd_cost, cost], name='total_cost')
    summary.add_moving_summary(cost, wd_cost, self.cost)

    # monitor histogram of all weights (of conv and fc layers) in tensorboard
    summary.add_param_summary(('.*/W', ['histogram', 'rms']))
import math

import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.training import moving_averages

from dorefa import get_dorefa

BITW = 1
BITA = 3
BITG = 32

fw, fa, fg = get_dorefa(BITW, BITA, BITG)


def cabs(x):
    return tf.minimum(1.0, tf.abs(x), name='cabs')


def DoReFa_Convolution_w(nOutputPlane, kW, kH, dW=1, dH=1,
                         padding='VALID', bias=True, reuse=None,
                         name='DoReFa_Convolution_w'):
    def b_conv2d(x, is_training=True):
        nInputPlane = x.get_shape().as_list()[3]
        # NOTE: tf.variable_op_scope is deprecated in later TF releases.
        with tf.variable_op_scope([x], None, name, reuse=reuse):
            w = tf.get_variable(
                'weight', [kH, kW, nInputPlane, nOutputPlane],
                # The original snippet is truncated after this call; the lines
                # below are an assumed minimal continuation, not the original code.
                initializer=tf.truncated_normal_initializer(stddev=0.05))
            w = fw(w)  # quantize the kernel with the DoReFa weight quantizer
            out = tf.nn.conv2d(x, w, strides=[1, dH, dW, 1], padding=padding)
            if bias:
                b = tf.get_variable('bias', [nOutputPlane],
                                    initializer=tf.constant_initializer(0.0))
                out = tf.nn.bias_add(out, b)
            return out

    return b_conv2d
def _build_graph(self, inputs):
    inp, label = inputs
    is_training = get_current_tower_context().is_training

    fw, fa = get_dorefa(BITW, BITA)

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        name = v.op.name
        if not (name.endswith('W') or name.endswith('b')) \
                or 'linear0' in name or 'last_linear' in name:
            print("Not quantizing", name)
            return v
        else:
            logger.info("Quantizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x, name="activate"):
        return fa(tf.clip_by_value(BNWithTrackedMults(x), 0.0, 1.0))

    with remap_variables(binarize_weight), \
            argscope([FullyConnectedWithTrackedMults], network_complexity=self.network_complexity), \
            argscope([BNReLUWithTrackedMults], network_complexity=self.network_complexity), \
            argscope([BNWithTrackedMults], network_complexity=self.network_complexity), \
            argscope(BatchNorm, decay=0.9, epsilon=1e-4):
        l = self.net_fn(inp, nonlin, self.n_context)
        logits = FullyConnectedWithTrackedMults('last_linear', l,
                                                out_dim=self.n_spks, nl=tf.identity)

    prob = tf.nn.softmax(logits, name='output')

    # used for validation accuracy of utterance
    identity_guesses = flatten(tf.argmax(prob, axis=1))
    uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
    idx_to_identity_with_most_votes = tf.argmax(count)
    chosen_identity = tf.gather(uniq_identities, idx_to_identity_with_most_votes)
    wrong = tf.expand_dims(tf.not_equal(chosen_identity, tf.cast(label[0], tf.int64)),
                           axis=0, name='utt-wrong')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')
    add_moving_summary(cost)

    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

    with tf.name_scope('original-weight-summaries'):
        add_param_summary(('.*/W', ['rms', 'histogram']))
        add_param_summary(('.*/b', ['rms', 'histogram']))

    with tf.name_scope('activation-summaries'):
        def fn(name):
            return (name.endswith('output') or name.endswith('output:0')) \
                and "Inference" not in name and 'quantized' not in name
        tensors = get_tensors_from_graph(tf.get_default_graph(), fn)
        print("Adding activation tensors to summary:", tensors)
        for tensor in tensors:
            add_tensor_summary(tensor, ['rms', 'histogram'])

    if self.regularize:
        # decreasing regularization on all W of fc layers
        wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
                                          480000, 0.2, True)
        wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss),
                              name='wd_cost')
        add_moving_summary(wd_cost)
        self.cost = tf.add_n([cost, wd_cost], name='cost')
    else:
        self.cost = tf.identity(cost, name='cost')

    tf.constant([self.network_complexity['mults']], name='TotalMults')
    tf.constant([self.network_complexity['weights']], name='TotalWeights')
    logger.info("Parameter count: {}".format(self.network_complexity))
def _build_graph(self, input_vars, is_training):
    is_training = bool(is_training)
    keep_prob = tf.constant(0.5 if is_training else 1.0)

    image, label = input_vars
    image = tf.expand_dims(image, 3)    # add a single channel

    fw, fa, fg = get_dorefa(1, 2, 7)

    # monkey-patch tf.get_variable to apply fw (weight update)
    old_get_variable = tf.get_variable
    nl = PReLU.f
    image = image * 2 - 1

    def new_get_variable(name, shape=None, **kwargs):
        v = old_get_variable(name, shape, **kwargs)
        # don't binarize first and last layer
        if name != 'W' or 'conv0' in v.op.name or 'fct' in v.op.name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)
    tf.get_variable = new_get_variable

    def nonlin(x):
        if BITA == 32:
            return tf.nn.relu(x)    # still use relu for 32bit cases
        return tf.clip_by_value(x, 0.0, 1.0)

    def cabs(x):
        return tf.minimum(1.0, tf.abs(x), name='cabs')

    def activate(x):
        return fa(cabs(x))  # clip the activation output, then quantize

    with argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope(Conv2D, kernel_shape=3, nl=nl, out_channel=32):
        logits = (LinearWrap(image)  # the starting brace is only for line-breaking
                  .Conv2D('conv0', padding='VALID')
                  # .apply(fg).BatchNorm('bn1', use_local_stat=is_training)
                  .MaxPooling('pool0', 2)
                  .apply(activate)
                  .Conv2D('conv1', padding='SAME')
                  .apply(fg).BatchNorm('bn2').apply(activate)
                  .Conv2D('conv2', padding='VALID')
                  .apply(fg).BatchNorm('bn3')
                  .MaxPooling('pool1', 2)
                  .apply(activate)
                  .Conv2D('conv3', padding='VALID')
                  .apply(fg).BatchNorm('bn4').apply(activate)
                  .FullyConnected('fc0', 512)
                  .apply(fg).BatchNorm('bn5')
                  # .tf.nn.dropout(keep_prob)
                  .apply(cabs)
                  .FullyConnected('fc1', out_dim=10, nl=tf.identity)())

    prob = tf.nn.softmax(logits, name='output')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    # compute the number of failed samples, for ClassificationError to use at test time
    wrong = symbolic_functions.prediction_incorrect(logits, label)
    nr_wrong = tf.reduce_sum(wrong, name='wrong')
    # monitor training error
    summary.add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    # weight decay on all W of fc layers
    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    summary.add_moving_summary(cost, wd_cost)

    summary.add_param_summary([('.*/W', ['histogram'])])  # monitor histogram of all W
    self.cost = tf.add_n([wd_cost, cost], name='cost')
def get_logits(self, image):
    if BITW == 't':
        fw, fa, fg = get_dorefa(32, 32, 32)
        fw = ternarize
    else:
        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def new_get_variable(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fct' in name:
            return v
        else:
            logger.info("Quantizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x):
        if BITA == 32:
            return tf.nn.relu(x)    # still use relu for 32bit cases
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    with remap_variables(new_get_variable), \
            argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \
            argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 96, 12, strides=4, padding='VALID', use_bias=True)
                  .apply(activate)
                  .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                  .apply(fg)
                  .BatchNorm('bn1')
                  .MaxPooling('pool1', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv2', 384, 3)
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool2', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv3', 384, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn3')
                  .apply(activate)
                  .Conv2D('conv4', 256, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn4')
                  .MaxPooling('pool4', 3, 2, padding='VALID')
                  .apply(activate)
                  .FullyConnected('fc0', 4096)
                  .apply(fg)
                  .BatchNorm('bnfc0')
                  .apply(activate)
                  .FullyConnected('fc1', 4096, use_bias=False)
                  .apply(fg)
                  .BatchNorm('bnfc1')
                  .apply(nonlin)
                  .FullyConnected('fct', 1000, use_bias=True)())
    add_param_summary(('.*/W', ['histogram', 'rms']))
    tf.nn.softmax(logits, name='output')  # for prediction
    return logits
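
# When BITW == 't', the AlexNet snippets swap fw for ternarize(). As a stand-in for
# readers, here is a simple magnitude-threshold ternarizer with a straight-through
# gradient; this is an assumed simplification (tensorpack's ternarize implements
# trained ternary quantization with learned positive/negative scales):
def ternarize_sketch(x, thresh=0.05):
    t = thresh * tf.reduce_max(tf.abs(x))

    @tf.custom_gradient
    def _ternarize(x):
        # map each weight to {-1, 0, +1}; gradient passes straight through
        w = tf.where(tf.abs(x) < t, tf.zeros_like(x), tf.sign(x))
        return w, lambda dy: dy

    return _ternarize(x)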
def _build_graph(self, input_vars):
    image, label = input_vars
    image = image / 255.0

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    old_get_variable = tf.get_variable

    def new_get_variable(name, shape=None, **kwargs):
        v = old_get_variable(name, shape, **kwargs)
        # don't binarize first and last layer
        if name != 'W' or 'conv0' in v.op.name or 'fct' in v.op.name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)
    tf.get_variable = new_get_variable

    def nonlin(x):
        if BITA == 32:
            return tf.nn.relu(x)    # still use relu for 32bit cases
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    with argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 96, 12, stride=4, padding='VALID')
                  .apply(activate)
                  .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                  .apply(fg)
                  .BatchNorm('bn1')
                  .MaxPooling('pool1', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv2', 384, 3)
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool2', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv3', 384, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn3')
                  .apply(activate)
                  .Conv2D('conv4', 256, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn4')
                  .MaxPooling('pool4', 3, 2, padding='VALID')
                  .apply(activate)
                  .FullyConnected('fc0', 4096)
                  .apply(fg)
                  .BatchNorm('bnfc0')
                  .apply(activate)
                  .FullyConnected('fc1', 4096)
                  .apply(fg)
                  .BatchNorm('bnfc1')
                  .apply(nonlin)
                  .FullyConnected('fct', 1000, use_bias=True)())
    tf.get_variable = old_get_variable

    prob = tf.nn.softmax(logits, name='output')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
    wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
    add_moving_summary(cost, wd_cost)
    add_param_summary([('.*/W', ['histogram', 'rms'])])
    self.cost = tf.add_n([cost, wd_cost], name='cost')
def build_graph(self, image, label):
    is_training = get_current_tower_context().is_training

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'weak' in name or 'fc' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)
            # return ternarize(v)

    def cabs(x):
        return tf.minimum(1.0, tf.abs(x), name='cabs')

    def activate(x):
        return fa(cabs(x))

    def merge(x, y):
        # return x + y
        # return x - y
        return tf.concat([x, y], axis=3)

    image = image / 256.0
    k = 3
    zp = 0.25
    zp2 = zp / 1

    # scale = tf.train.exponential_decay(learning_rate=1.0, global_step=get_global_step_var(),
    #                                    decay_steps=4721 * 5, decay_rate=0.5, staircase=True, name='scale')
    # scale = tf.where(scale > 0.001, scale, tf.zeros_like(scale))
    scale = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(),
                                  decay_steps=4721 * 50, alpha=0.0)
    tf.summary.scalar('scale', scale)
    endconv = []
    endweak = []
    # scale2 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721 * 50, alpha=0.0)
    # scale3 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721 * 80, alpha=0.0)

    with remap_variables(binarize_weight), \
            argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False):
        net = Conv2D('conv0', image, np.round(48 * zp), 5, padding='VALID', use_bias=True)
        net = MaxPooling('pool0', net, 2, padding='SAME')
        net = activate(net)

        net1 = Conv2D('conv1', net, np.round(64 * zp), 3, padding='SAME')
        net1 = BatchNorm('bn1', net1)
        endconv.append(net1)
        net2 = Conv2D('weak1', net, np.round(64 * zp2), k, padding='SAME')
        net2 = BatchNorm('bn12', net2)
        endweak.append(net2)
        # net2 = tf.nn.relu(net2)
        net = merge(activate(net1), scale * net2)
        # net = activate(net1)

        net1 = Conv2D('conv2', net, np.round(64 * zp), 3, padding='SAME')
        net1 = BatchNorm('bn2', net1)
        endconv.append(net1)
        net2 = Conv2D('weak2', net, np.round(64 * zp2), k, padding='SAME')
        net2 = BatchNorm('bn22', net2)
        endweak.append(net2)
        # net2 = tf.nn.relu(net2)
        net1 = MaxPooling('pool1', net1, 2, padding='SAME')
        net2 = MaxPooling('pool12', net2, 2, padding='SAME')
        net = merge(activate(net1), scale * net2)
        net = activate(net1)  # NOTE: overrides the merge above; likely a leftover toggle
                              # (cf. the commented-out variants in the other blocks)

        net1 = Conv2D('conv3', net, np.round(128 * zp), 3, padding='VALID')
        net1 = BatchNorm('bn3', net1)
        endconv.append(net1)
        net2 = Conv2D('weak3', net, np.round(128 * zp2), k, padding='VALID')
        net2 = BatchNorm('bn32', net2)
        endweak.append(net2)
        # net2 = tf.nn.relu(net2)
        net = merge(activate(net1), scale * net2)
        # net = activate(net1)

        net1 = Conv2D('conv4', net, np.round(128 * zp), 3, padding='SAME')
        net1 = BatchNorm('bn4', net1)
        endconv.append(net1)
        net2 = Conv2D('weak4', net, np.round(128 * zp2), k, padding='SAME')
        net2 = BatchNorm('bn42', net2)
        endweak.append(net2)
        # net2 = tf.nn.relu(net2)
        net = merge(activate(net1), scale * net2)
        # net = activate(net1)

        net1 = Conv2D('conv5', net, np.round(128 * zp), 3, padding='VALID')
        net1 = BatchNorm('bn5', net1)
        endconv.append(net1)
        net2 = Conv2D('weak5', net, np.round(128 * zp2), k, padding='VALID')
        net2 = BatchNorm('bn52', net2)
        endweak.append(net2)
        # net2 = tf.nn.relu(net2)
        net = merge(activate(net1), scale * net2)
        # net = activate(net1)

        net = tf.nn.dropout(net, 0.5 if is_training else 1.0)

        net1 = Conv2D('conv6', net, np.round(512 * zp), 5, padding='VALID')
        net1 = BatchNorm('bn6', net1)
        endconv.append(net1)
        net2 = Conv2D('weak6', net, np.round(512 * zp2), 5, padding='VALID')
        net2 = BatchNorm('bn62', net2)
        endweak.append(net2)
        # net2 = tf.nn.relu(net2)
        net = merge(cabs(net1), scale * net2)
        # net = cabs(net1)

        logits = FullyConnected('fc1', net, 10)

    tf.nn.softmax(logits, name='output')

    # compute the number of failed samples
    wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)),
                    tf.float32, name='wrong_tensor')
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')
    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
    add_param_summary(('.*/W', ['histogram', 'rms']))

    total_cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, total_cost)

    for i in range(len(endweak)):
        add_moving_summary(tf.reduce_mean(tf.abs(endconv[i]), name='mean_conv_' + str(i + 1)))
        add_moving_summary(tf.reduce_mean(tf.abs(endweak[i]), name='mean_weak_' + str(i + 1)))

    return total_cost
def _build_graph(self, inputs):
    image, label, ious, valids, bndboxes = inputs
    image = tf.round(image)

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)
    old_get_variable = tf.get_variable

    def monitor(x, name):
        if MONITOR == 1:
            return tf.Print(x, [x], message='\n\n' + name + ': ',
                            summarize=1000, name=name)
        else:
            return x

    def new_get_variable(v):
        name = v.op.name
        # don't quantize first and last layers
        if not name.endswith('W') or 'conv1' in name \
                or 'conv_obj' in name or 'conv_box' in name:
            return v
        else:
            logger.info("Quantizing weight {}".format(v.op.name))
            if MONITOR == 1:
                return tf.Print(fw(v), [fw(v)],
                                message='\n\n' + v.name + ', Quantized weights are:',
                                summarize=100)
            else:
                return fw(v)

    def activate(x):
        if BITA == 32:
            return tf.nn.relu(x)
        else:
            return fa(tf.nn.relu(x))

    def bn_activate(name, x):
        x = BatchNorm(name, x)
        x = monitor(x, name + '_noact_out')
        return activate(x)

    def halffire(name, x, num_squeeze_filters, num_expand_3x3_filters, skip):
        out_squeeze = Conv2D('squeeze_conv_' + name, x,
                             out_channel=num_squeeze_filters,
                             kernel_shape=1, stride=1, padding='SAME')
        out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
        out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze,
                                out_channel=num_expand_3x3_filters,
                                kernel_shape=3, stride=1, padding='SAME')
        out_expand_3x3 = bn_activate('bn_expand_3x3_' + name, out_expand_3x3)
        if skip == 0:
            return out_expand_3x3
        else:
            return tf.add(x, out_expand_3x3)

    def halffire_noact(name, x, num_squeeze_filters, num_expand_3x3_filters):
        out_squeeze = Conv2D('squeeze_conv_' + name, x,
                             out_channel=num_squeeze_filters,
                             kernel_shape=1, stride=1, padding='SAME')
        out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
        out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze,
                                out_channel=num_expand_3x3_filters,
                                kernel_shape=3, stride=1, padding='SAME')
        return out_expand_3x3

    with remap_variables(new_get_variable), \
            argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity), \
            argscope(BatchNorm, decay=0.9, epsilon=1e-4):

        image = monitor(image, 'image_out')

        l = Conv2D('conv1', image, out_channel=32, kernel_shape=3,
                   stride=2, padding='SAME')
        l = bn_activate('bn1', l)
        l = monitor(l, 'conv1_out')

        l = MaxPooling('pool1', l, shape=3, stride=2, padding='SAME')
        l = monitor(l, 'pool1_out')

        l = halffire('fire1', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
        l = monitor(l, 'fire1_out')

        l = MaxPooling('pool2', l, shape=3, stride=2, padding='SAME')
        l = monitor(l, 'pool2_out')

        l = halffire('fire2', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
        l = monitor(l, 'fire2_out')

        l = MaxPooling('pool3', l, shape=3, stride=2, padding='SAME')
        l = monitor(l, 'pool3_out')

        l = halffire('fire3', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
        l = monitor(l, 'fire3_out')

        l = halffire('fire4', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
        l = monitor(l, 'fire4_out')

        l = halffire('fire5', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
        l = monitor(l, 'fire5_out')

        l = halffire('fire6', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
        l = monitor(l, 'fire6_out')

        l = halffire('fire7', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
        l = monitor(l, 'fire7_out')

        # Classification
        classify = Conv2D('conv_class', l, out_channel=12, kernel_shape=1,
                          stride=1, padding='SAME')
        classify = bn_activate('bn_class', classify)
        classify = monitor(classify, 'conv_class_out')
        logits = GlobalAvgPooling('pool_class', classify)

        class_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=label)
        class_loss = tf.reduce_mean(class_loss, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

        # Object Detection
        l = tf.concat([l, classify], axis=3)

        objdetect = Conv2D('conv_obj', l, out_channel=1, kernel_shape=1,
                           stride=1, padding='SAME')
        objdetect = tf.identity(objdetect, name='objdetect_out')
        objdetect_loss = tf.losses.hinge_loss(labels=ious, logits=objdetect)

        bndbox = Conv2D('conv_box', l, out_channel=4, kernel_shape=1,
                        stride=1, padding='SAME')
        bndbox = tf.identity(bndbox, name='bndbox_out')
        bndbox = tf.multiply(bndbox, valids, name='mult0')
        bndbox_loss = tf.losses.mean_squared_error(labels=bndboxes,
                                                   predictions=bndbox)

    # weight decay on all W of fc layers
    # reg_cost = regularize_cost('(fire7|conv_obj|conv_box).*/W',
    #                            l2_regularizer(1e-5), name='regularize_cost')
    # cost = class_loss * objdetect_loss * bndbox_loss
    # cost = class_loss + objdetect_loss + bndbox_loss + reg_cost
    cost = class_loss + 10 * objdetect_loss + bndbox_loss

    add_moving_summary(class_loss, objdetect_loss, bndbox_loss, cost)
    self.cost = cost

    tf.get_variable = old_get_variable
def _build_graph(self, inputs):
    input, label = inputs
    fw, fa, fg = get_dorefa(FLAGS.bit_w, FLAGS.bit_a, 32)
    logger.info("Using {}-bit activations and {}-bit weights".format(FLAGS.bit_a, FLAGS.bit_w))
    logger.info("Using trn_cache: {}".format(FLAGS.trn_cache_dir))
    logger.info("Using host: {}".format(socket.gethostname()))
    old_get_variable = tf.get_variable

    # monkey-patch tf.get_variable to apply fw
    def new_get_variable(v):
        name = v.op.name
        logger.info("Binarizing weight {}".format(v.op.name))
        return fw(v, FLAGS.force_quantization)

    def nonlin(x):
        if FLAGS.bit_a == 32 and not FLAGS.use_clip:
            return tf.nn.relu(x)    # still use relu for 32bit cases
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    activations = []
    with remap_variables(new_get_variable), \
            argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
        curr_layer = LinearWrap(input)
        if model_type == 'fc':
            for i in range(FLAGS.n_layers):
                curr_layer = (curr_layer
                              .FullyConnected('fc' + str(i), FLAGS.state_size)
                              .LayerNorm('ln_fc' + str(i))
                              .apply(activate))
                activations.append(curr_layer.tensor())
                curr_layer = (curr_layer
                              .Dropout('dropout', FLAGS.dropout))
        elif model_type == 'cnn':
            # (the cnn branch is empty in this snippet)
            pass
        logits = curr_layer.FullyConnected('fct', self.n_spks, use_bias=True)()

    print_all_tf_vars()

    prob = tf.nn.softmax(logits, name='output')

    # used for validation accuracy of utterance
    identity_guesses = flatten(tf.argmax(prob, axis=1))
    uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
    idx_to_identity_with_most_votes = tf.argmax(count)
    chosen_identity = tf.gather(uniq_identities, idx_to_identity_with_most_votes)
    wrong = tf.expand_dims(tf.not_equal(chosen_identity, tf.cast(label[0], tf.int64)),
                           axis=0, name='utt-wrong')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost')
    add_param_summary(('.*/W', ['histogram', 'rms']))
    self.cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, self.cost)

    for activation in activations:
        add_activation_summary(activation)
        tf.summary.histogram(activation.name, activation)

def _get_optimizer(self):
    lr = get_scalar_var('learning_rate', FLAGS.learning_rate, summary=True)
    return tf.train.AdamOptimizer(lr, epsilon=1e-5)
def _build_graph(self, inputs):
    image, label = inputs
    image = image / 256.0

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    def new_get_variable(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv1' in name or 'fct' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x):
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    def resblock(x, channel, stride):
        def get_stem_full(x):
            return (LinearWrap(x)
                    .Conv2D('c3x3a', channel, 3)
                    .BatchNorm('stembn')
                    .apply(activate)
                    .Conv2D('c3x3b', channel, 3)())

        channel_mismatch = channel != x.get_shape().as_list()[3]
        if stride != 1 or channel_mismatch or 'pool1' in x.name:
            # handling pool1 is to work around an architecture bug in our model
            if stride != 1 or 'pool1' in x.name:
                x = AvgPooling('pool', x, stride, stride)
            x = BatchNorm('bn', x)
            x = activate(x)
            shortcut = Conv2D('shortcut', x, channel, 1)
            stem = get_stem_full(x)
        else:
            shortcut = x
            x = BatchNorm('bn', x)
            x = activate(x)
            stem = get_stem_full(x)
        return shortcut + stem

    def group(x, name, channel, nr_block, stride):
        with tf.variable_scope(name + 'blk1'):
            x = resblock(x, channel, stride)
        for i in range(2, nr_block + 1):
            with tf.variable_scope(name + 'blk{}'.format(i)):
                x = resblock(x, channel, 1)
        return x

    with remap_variables(new_get_variable), \
            argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False, nl=tf.identity):
        logits = (LinearWrap(image)
                  # use explicit padding here, because our training framework has
                  # different padding mechanisms from TensorFlow
                  .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]])
                  .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True)
                  .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC')
                  .MaxPooling('pool1', 3, 2, padding='VALID')
                  .apply(group, 'conv2', 64, 2, 1)
                  .apply(group, 'conv3', 128, 2, 2)
                  .apply(group, 'conv4', 256, 2, 2)
                  .apply(group, 'conv5', 512, 2, 2)
                  .BatchNorm('lastbn')
                  .apply(nonlin)
                  .GlobalAvgPooling('gap')
                  .tf.multiply(49)  # this is due to a bug in our model design
                  .FullyConnected('fct', 1000)())
    tf.nn.softmax(logits, name='output')
    ImageNetModel.compute_loss_and_error(logits, label)
def build_graph(self, image, label):
    image = image / 255.0

    if BITW == 't':
        fw, fa, fg = get_dorefa(32, 32, 32)
        fw = ternarize
    else:
        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def new_get_variable(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fct' in name:
            return v
        else:
            logger.info("Quantizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x):
        if BITA == 32:
            return tf.nn.relu(x)    # still use relu for 32bit cases
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        return fa(nonlin(x))

    with remap_variables(new_get_variable), \
            argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 96, 12, strides=4, padding='VALID')
                  .apply(activate)
                  .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                  .apply(fg)
                  .BatchNorm('bn1')
                  .MaxPooling('pool1', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv2', 384, 3)
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool2', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv3', 384, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn3')
                  .apply(activate)
                  .Conv2D('conv4', 256, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn4')
                  .MaxPooling('pool4', 3, 2, padding='VALID')
                  .apply(activate)
                  .FullyConnected('fc0', 4096)
                  .apply(fg)
                  .BatchNorm('bnfc0')
                  .apply(activate)
                  .FullyConnected('fc1', 4096, use_bias=False)
                  .apply(fg)
                  .BatchNorm('bnfc1')
                  .apply(nonlin)
                  .FullyConnected('fct', 1000, use_bias=True)())
    tf.nn.softmax(logits, name='output')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
    wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
    add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost')
    add_param_summary(('.*/W', ['histogram', 'rms']))
    total_cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, total_cost)
    return total_cost
def _build_graph(self, inputs):
    image, label = inputs
    is_training = get_current_tower_context().is_training

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fc' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def cabs(x):
        return tf.minimum(1.0, tf.abs(x), name='cabs')

    def activate(x):
        return fa(cabs(x))

    image = image / 256.0

    with remap_variables(binarize_weight), \
            argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False, nl=tf.identity):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                  .MaxPooling('pool0', 2, padding='SAME')
                  .apply(activate)
                  # 18
                  .Conv2D('conv1', 64, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn1').apply(activate)
                  .Conv2D('conv2', 64, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool1', 2, padding='SAME')
                  .apply(activate)
                  # 9
                  .Conv2D('conv3', 128, 3, padding='VALID')
                  .apply(fg)
                  .BatchNorm('bn3').apply(activate)
                  # 7
                  .Conv2D('conv4', 128, 3, padding='SAME')
                  .apply(fg)
                  .BatchNorm('bn4').apply(activate)
                  .Conv2D('conv5', 128, 3, padding='VALID')
                  .apply(fg)
                  .BatchNorm('bn5').apply(activate)
                  # 5
                  .tf.nn.dropout(0.5 if is_training else 1.0)
                  .Conv2D('conv6', 512, 5, padding='VALID')
                  .apply(fg).BatchNorm('bn6')
                  .apply(cabs)
                  .FullyConnected('fc1', 10, nl=tf.identity)())
    tf.nn.softmax(logits, name='output')

    # compute the number of failed samples
    wrong = prediction_incorrect(logits, label)
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')
    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
    add_param_summary(('.*/W', ['histogram', 'rms']))
    self.cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, self.cost)
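
# For context, these ModelDesc subclasses are driven by tensorpack's standard
# training boilerplate along the following lines. This is an assumed sketch kept
# as comments: Model, get_data, and the hyperparameters are placeholders, not
# taken from the snippets above.
#
# config = TrainConfig(
#     model=Model(),
#     data=QueueInput(get_data('train')),
#     callbacks=[
#         ModelSaver(),
#         InferenceRunner(get_data('test'),
#                         [ScalarStats('cross_entropy_loss'),
#                          ClassificationError('incorrect_vector', 'val-error')]),
#     ],
#     max_epoch=200,
# )
# launch_train_with_config(config, SimpleTrainer())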
def build_graph(self, image, label): """This function should build the model which takes the input variables (defined above) and return cost at the end.""" is_training = get_current_tower_context().is_training fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def binarize_weight(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'fc0' in name or 'fc_out' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) #FIXMEreturn tf.clip_by_value(x, 0.0, 1.0) return tf.clip_by_value(x, -1.0, 1.0) def activate(x): return fa(nonlin(x)) # The context manager `argscope` sets the default option for all the layers under # this context. Here we use 32 channel convolution with shape 3x3 # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html with remap_variables(binarize_weight), \ argscope(FullyConnected, use_bias=False), \ argscope(BatchNorm, momentum=0.1, epsilon=1e-4): # LinearWrap is just a syntax sugar. # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html logits = ( LinearWrap(image).Dropout('dropout_in', rate=0.2 if is_training else 0.0) # hidden 0 .FullyConnected( 'fc0', n_units).BatchNorm('bn0').apply(activate).Dropout( 'dropout_hidden0', rate=0.5 if is_training else 0.0) # hidden 1 .FullyConnected( 'fc1', n_units).BatchNorm('bn1').apply(activate).Dropout( 'dropout_hidden1', rate=0.5 if is_training else 0.0) # hidden 2 .FullyConnected( 'fc2', n_units).BatchNorm('bn2').apply(activate).Dropout( 'dropout_hidden2', rate=0.5 if is_training else 0.0) # output layer .FullyConnected('fc_out', 10, activation=tf.identity)()) # a vector of length B with loss of each sample cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean( cost, name='cross_entropy_loss') # the average cross-entropy loss correct = tf.cast(tf.nn.in_top_k(predictions=logits, targets=label, k=1), tf.float32, name='correct') accuracy = tf.reduce_mean(correct, name='accuracy')