def resnet18_imagenet(image):
    # `new_get_variable` and `activate` are assumed to be defined in the
    # enclosing scope (e.g. a weight remapper and an activation quantizer).
    with remap_variables(new_get_variable), \
            argscope(Conv2D, use_bias=False,
                     kernel_initializer=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')):
        # Note that this pads the image by [2, 3] instead of [3, 2].
        # Similar things happen in later stride=2 layers as well.
        l = Conv2D('conv0', image, 64, 7, strides=2, activation=BNReLU)
        l = MaxPooling('pool0', l, pool_size=3, strides=2, padding='SAME')
        l = resnet_group('group0', l, resnet_basicblock, 64, 2, 1)
        l = activate(l)
        l = resnet_group('group1', l, resnet_basicblock, 128, 2, 2)
        l = activate(l)
        l = resnet_group('group2', l, resnet_basicblock, 256, 2, 2)
        l = activate(l)
        l = resnet_group('group3', l, resnet_basicblock, 512, 2, 2)
        l = GlobalAvgPooling('gap', l)
        logits = FullyConnected('linear', l, 1000,
                                kernel_initializer=tf.random_normal_initializer(stddev=0.01))
    # tmp = tf.trainable_variables()
    return logits
@contextmanager  # from contextlib; required so this generator can be used in a `with` statement
def weight_standardization_context(enable=True):
    """
    Implement Centered Weight Normalization
    (http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Centered_Weight_Normalization_ICCV_2017_paper.pdf)
    or Weight Standardization (https://arxiv.org/abs/1903.10520).

    Usage:

        with weight_standardization_context():
            l = Conv2D('conv', l)
            ...
    """
    if enable:
        def weight_standardization(v):
            # only touch 4-D convolution kernels named */W
            if (not v.name.endswith('/W:0')) or v.shape.ndims != 4:
                return v
            mean, var = tf.nn.moments(v, [0, 1, 2], keep_dims=True)
            v = (v - mean) / (tf.sqrt(var) + 1e-5)
            return v

        with remap_variables(weight_standardization):
            yield
    else:
        yield
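# Quick check of what the context above does. This is a minimal sketch, assuming
# TF1 graph mode with tensorpack installed; the kernel shape and scope name are
# illustrative, not taken from any project in this collection.
import tensorflow as tf

with weight_standardization_context(enable=True):
    with tf.variable_scope('demo'):
        # the returned tensor is (W - mean) / (sqrt(var) + 1e-5), reduced over [0, 1, 2]
        w = tf.get_variable('W', shape=[3, 3, 16, 32])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(w).std(axis=(0, 1, 2)))   # close to 1 for each of the 32 output channels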
def get_logits(self, image): if BITW == 't': fw, fa, fg = get_dorefa(32, 32, 32) fw = ternarize else: fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fct' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) # still use relu for 32bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) with remap_variables(new_get_variable), \ argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \ argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False): logits = (LinearWrap(image).Conv2D( 'conv0', 96, 12, strides=4, padding='VALID', use_bias=True).apply(activate).Conv2D( 'conv1', 256, 5, padding='SAME', split=2).apply(fg).BatchNorm('bn1').MaxPooling( 'pool1', 3, 2, padding='SAME').apply(activate).Conv2D( 'conv2', 384, 3).apply(fg).BatchNorm('bn2').MaxPooling( 'pool2', 3, 2, padding='SAME').apply(activate).Conv2D( 'conv3', 384, 3, split=2).apply(fg). BatchNorm('bn3').apply(activate).Conv2D( 'conv4', 256, 3, split=2).apply(fg).BatchNorm('bn4').MaxPooling( 'pool4', 3, 2, padding='VALID').apply(activate).FullyConnected( 'fc0', 4096).apply(fg).BatchNorm('bnfc0'). apply(activate).FullyConnected( 'fc1', 4096, use_bias=False).apply(fg).BatchNorm('bnfc1').apply( nonlin).FullyConnected('fct', 1000, use_bias=True)()) add_param_summary(('.*/W', ['histogram', 'rms'])) tf.nn.softmax(logits, name='output') # for prediction return logits
def get_logits(self, image):
    def weight_standardization(v):
        if not self.use_WS:
            return v
        if (not v.name.endswith('/W:0')) or v.shape.ndims != 4:
            return v
        mean, var = tf.nn.moments(v, [0, 1, 2], keep_dims=True)
        v = (v - mean) / (tf.sqrt(var) + 1e-5)
        return v

    num_blocks = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3]}[self.depth]
    block_func = resnet_bottleneck
    with argscope([Conv2D, MaxPooling, GlobalAvgPooling], data_format=self.data_format), \
            varreplace.remap_variables(weight_standardization):
        return resnet_backbone(image, num_blocks, resnet_group, block_func)
@contextmanager  # from contextlib; required so this generator can be used in a `with` statement
def weight_standardization_context(enable):
    if enable:
        def weight_standardization(v):
            if (not v.name.endswith('/W:0')) or v.shape.ndims != 4:
                return v
            print("WS on " + v.name)
            # tf.nn.moments returns (mean, variance); take the square root to standardize
            mean, var = tf.nn.moments(v, [0, 1, 2], keep_dims=True)
            v = (v - mean) / (tf.sqrt(var) + 1e-5)
            return v

        with remap_variables(weight_standardization):
            yield
    else:
        yield
def alexnet(image):
    # `new_get_variable`, `fg`, `activate`, `nonlin` and `self.class_num` are
    # expected to be provided by the enclosing (quantization) scope.
    with remap_variables(new_get_variable), \
            argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \
            argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False):
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 96, 12, strides=4, padding='VALID', use_bias=True)
                  .apply(fg)
                  .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                  .apply(fg)
                  .BatchNorm('bn1')
                  .MaxPooling('pool1', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv2', 384, 3)
                  .apply(fg)
                  .BatchNorm('bn2')
                  .MaxPooling('pool2', 3, 2, padding='SAME')
                  .apply(activate)
                  .Conv2D('conv3', 384, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn3')
                  .apply(activate)
                  .Conv2D('conv4', 256, 3, split=2)
                  .apply(fg)
                  .BatchNorm('bn4')
                  .MaxPooling('pool4', 3, 2, padding='VALID')
                  .apply(activate)
                  .FullyConnected('fc0', 4096)
                  .apply(fg)
                  .BatchNorm('bnfc0')
                  .apply(activate)
                  .FullyConnected('fc1', 4096, use_bias=False)
                  .apply(fg)
                  .BatchNorm('bnfc1')
                  .apply(nonlin)
                  .FullyConnected('fct', self.class_num, use_bias=True)())
    return logits
def build_model(input_quant_wei_layer, intput_quant_wei_lambda, input_quant_wei_delta, input_quant_wei_levels): global id_target_quant_layer global q_lambda global q_delta global num_quant_levels id_target_quant_layer = input_quant_wei_layer q_lambda = intput_quant_wei_lambda q_delta = input_quant_wei_delta num_quant_levels = input_quant_wei_levels with tf.name_scope('main_params'): global_step = tf.Variable(initial_value=0, trainable=False, name='global_step') learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate') with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope( 'conv1') as scope: conv1 = tf.layers.conv2d( inputs=data_x, filters=32, kernel_size=[1, 64], padding='VALID', use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY)) bn1 = tf.layers.batch_normalization(conv1, training=isTraining) relu1 = tf.nn.relu(bn1) with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope( 'conv2') as scope: conv2 = tf.layers.conv2d( inputs=relu1, filters=64, kernel_size=[1, 32], padding='VALID', use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY)) bn2 = tf.layers.batch_normalization(conv2, training=isTraining) relu2 = tf.nn.relu(bn2) with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope( 'conv3') as scope: conv3 = tf.layers.conv2d( inputs=relu2, filters=128, kernel_size=[1, 16], padding='VALID', use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY)) bn3 = tf.layers.batch_normalization(conv3, training=isTraining) relu3 = tf.nn.relu(bn3) with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope( 'fully_connected') as scope: flat = tf.layers.flatten(relu3) logits = tf.layers.dense( inputs=flat, units=NUM_CLASSES, name=scope.name, use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY)) y_pred_cls = tf.argmax(logits, axis=1) gtlabel = tf.one_hot(label_y, NUM_CLASSES) # LOSS AND OPTIMIZER cross_entropy_loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=gtlabel)) loss = cross_entropy_loss + tf.add_n( tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-3).minimize( loss, global_step=global_step) # PREDICTION AND ACCURACY CALCULATION def get_eval_op(preds, labels): correct_prediction = tf.equal(preds, labels) return tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) eval_op = get_eval_op(y_pred_cls, label_y) return loss, optimizer, eval_op, global_step, learning_rate
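# The dead-zone quantizer itself (quant_wei_uni_dead_zone) is not shown in this
# collection. The sketch below is a hypothetical stand-in only: a uniform weight
# quantizer with a dead zone around zero and a straight-through gradient
# estimator, in the shape remap_variables expects. The function name, default
# thresholds and level count are assumptions, not the project's actual code.
import tensorflow as tf

def uniform_dead_zone_quantize(w, dead_zone=0.01, step=0.05, num_levels=16):
    half_range = (num_levels // 2) * step
    q = tf.round(w / step) * step                              # snap to uniform levels
    q = tf.clip_by_value(q, -half_range, half_range)           # cap the number of levels
    q = tf.where(tf.abs(w) < dead_zone, tf.zeros_like(w), q)   # dead zone -> 0
    # forward pass uses q, backward pass sees the identity (straight-through)
    return w + tf.stop_gradient(q - w)

# In practice the remapping function would also filter by variable name (as the
# other examples here do) so that biases and BatchNorm parameters stay untouched.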
def _build_graph(self, inputs): image, label, ious, ious_weights, valids, bndboxes = inputs image = tf.round(image) fw, fa, fg = get_dorefa(BITW, BITA, BITG) old_get_variable = tf.get_variable def monitor(x, name): if MONITOR == 1: return tf.Print(x, [x], message='\n\n' + name + ': ', summarize=1000, name=name) else: return x def new_get_variable(v): name = v.op.name # if not name.endswith('W') or 'conv1' in name or 'conv_obj' in name or 'conv_box' in name: if not name.endswith( 'W') or 'conv_obj' in name or 'conv_box' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) if MONITOR == 1: return tf.Print(fw(v), [fw(v)], message='\n\n' + v.name + ', Quantized weights are:', summarize=100) else: return fw(v) def activate(x): if BITA == 32: return tf.nn.relu(x) else: return fa(tf.nn.relu(x)) def bn_activate(name, x): x = BatchNorm(name, x) x = monitor(x, name + '_noact_out') return activate(x) def halffire(name, x, num_squeeze_filters, num_expand_3x3_filters, skip): out_squeeze = Conv2D('squeeze_conv_' + name, x, out_channel=num_squeeze_filters, kernel_shape=1, stride=1, padding='SAME') out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze) out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze, out_channel=num_expand_3x3_filters, kernel_shape=3, stride=1, padding='SAME') out_expand_3x3 = bn_activate('bn_expand_3x3_' + name, out_expand_3x3) if skip == 0: return out_expand_3x3 else: return tf.add(x, out_expand_3x3) def halffire_noact(name, x, num_squeeze_filters, num_expand_3x3_filters): out_squeeze = Conv2D('squeeze_conv_' + name, x, out_channel=num_squeeze_filters, kernel_shape=1, stride=1, padding='SAME') out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze) out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze, out_channel=num_expand_3x3_filters, kernel_shape=3, stride=1, padding='SAME') return out_expand_3x3 def halffire_final(l, name): l = halffire('fire4' + name, l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = halffire('fire5' + name, l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = halffire('fire6' + name, l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = halffire('fire7' + name, l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) return l def decision(l, name): classify = Conv2D('conv_class' + name, l, out_channel=12, kernel_shape=1, stride=1, padding='SAME') classify = bn_activate('bn_class' + name, classify) classify = monitor(classify, 'conv_class_out' + name) logits = GlobalAvgPooling('pool_class' + name, classify) l = tf.concat([l, classify], axis=3) objdetect = Conv2D('conv_obj' + name, l, out_channel=1, kernel_shape=1, stride=1, padding='SAME') bndbox = Conv2D('conv_box' + name, l, out_channel=4, kernel_shape=1, stride=1, padding='SAME') return logits, objdetect, bndbox def first_layer(x): l = Conv2D('conv1', x, out_channel=16, kernel_shape=3, stride=1, padding='SAME') l = bn_activate('bn1', l) l = monitor(l, 'conv1_out') return l with remap_variables(new_get_variable), \ argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4): image = monitor(image, 'image_out') if DEMO_DATASET == 0: l = first_layer(image) else: l = tf.stop_gradient(first_layer(image)) l = MaxPooling('pool1', l, shape=3, stride=2, padding='SAME') l = monitor(l, 'pool1_out') l = halffire('fire1', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire1_out') l = MaxPooling('pool2', l, shape=3, stride=2, padding='SAME') l = monitor(l, 'pool2_out') l = halffire('fire2', l, 
NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire2_out') l = MaxPooling('pool3', l, shape=3, stride=2, padding='SAME') l = monitor(l, 'pool3_out') l = halffire('fire3', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire3_out') l = MaxPooling('pool4', l, shape=3, stride=2, padding='SAME') l = monitor(l, 'pool4_out') l1 = halffire_final(l, '1') l1 = monitor(l1, 'final1_out') l2 = halffire_final(l, '2') l2 = monitor(l2, 'final2_out') l3 = halffire_final(l, '3') l3 = monitor(l3, 'final3_out') l4 = halffire_final(l, '4') l4 = monitor(l4, 'final4_out') logits1, objdetect1, bndbox1 = decision(l1, '1') logits2, objdetect2, bndbox2 = decision(l2, '2') logits3, objdetect3, bndbox3 = decision(l3, '3') logits4, objdetect4, bndbox4 = decision(l4, '4') # Classification logits = (logits1 + logits2 + logits3 + logits4) / 4 class_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label), name='cross_entropy_loss') wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) # Object Detection objdetect = (objdetect1 + objdetect2 + objdetect3 + objdetect4) / 4 objdetect = tf.identity(objdetect, name='objdetect_out') objdetect_loss = tf.losses.hinge_loss(labels=ious, logits=objdetect, weights=ious_weights) bndbox = (bndbox1 + bndbox2 + bndbox3 + bndbox4) / 4 bndbox = tf.identity(bndbox, name='bndbox_out') bndbox_loss = tf.losses.mean_squared_error(labels=bndboxes, predictions=tf.multiply( bndbox, valids, name='mult0')) if DEMO_DATASET == 0: cost = class_loss + 5 * objdetect_loss + bndbox_loss else: cost = 1000 * objdetect_loss + bndbox_loss add_moving_summary(class_loss, objdetect_loss, bndbox_loss, cost) self.cost = cost tf.get_variable = old_get_variable
def _build_graph(self, inputs): image, label = inputs is_training = get_current_tower_context().is_training fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def binarize_weight(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fc' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def cabs(x): return tf.minimum(1.0, tf.abs(x), name='cabs') def activate(x): return fa(cabs(x)) image = image / 256.0 with remap_variables(binarize_weight), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False, nl=tf.identity): logits = ( LinearWrap(image).Conv2D('conv0', 48, 5, padding='VALID', use_bias=True).MaxPooling( 'pool0', 2, padding='SAME').apply(activate) # 18 .Conv2D('conv1', 64, 3, padding='SAME').apply(fg).BatchNorm( 'bn1').apply(activate).Conv2D( 'conv2', 64, 3, padding='SAME').apply(fg).BatchNorm('bn2').MaxPooling( 'pool1', 2, padding='SAME').apply(activate) # 9 .Conv2D( 'conv3', 128, 3, padding='VALID').apply(fg).BatchNorm('bn3').apply(activate) # 7 .Conv2D('conv4', 128, 3, padding='SAME').apply(fg). BatchNorm('bn4').apply(activate).Conv2D( 'conv5', 128, 3, padding='VALID').apply(fg).BatchNorm('bn5').apply(activate) # 5 .tf.nn.dropout(0.5 if is_training else 1.0).Conv2D( 'conv6', 512, 5, padding='VALID').apply(fg).BatchNorm( 'bn6').apply(cabs).FullyConnected('fc1', 10, nl=tf.identity)()) tf.nn.softmax(logits, name='output') # compute the number of failed samples wrong = prediction_incorrect(logits, label) # monitor training error add_moving_summary(tf.reduce_mean(wrong, name='train_error')) cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7)) add_param_summary(('.*/W', ['histogram', 'rms'])) self.cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, self.cost)
def resnet18_cifar(input_tensor, is_training=True, pooling_and_fc=True, reuse=False, kernel_initializer=tf.contrib.layers. variance_scaling_initializer()): with remap_variables(new_get_variable): x = tf.layers.conv2d(input_tensor, 64, (3, 3), strides=(1, 1), kernel_initializer=kernel_initializer, use_bias=False, padding='SAME', name='conv1_1/3x3_s1', reuse=reuse) x = tf.layers.batch_normalization(x, training=is_training, name='bn1_1/3x3_s1', reuse=reuse) x = tf.nn.relu(x) x1 = identity_block2d(x, 3, [48, 64, 64], stage=2, block='1b', is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) x1 = identity_block2d(x1, 3, [48, 64, 64], stage=3, block='1c', is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) x2 = conv_block_2d(x1, 3, [96, 128, 128], stage=3, block='2a', strides=(2, 2), is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) x2 = activate(x2) x2 = identity_block2d(x2, 3, [96, 128, 128], stage=3, block='2b', is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) x3 = conv_block_2d(x2, 3, [128, 256, 256], stage=4, block='3a', strides=(2, 2), is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) x3 = activate(x3) x3 = identity_block2d(x3, 3, [128, 256, 256], stage=4, block='3b', is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) x4 = conv_block_2d(x3, 3, [256, 512, 512], stage=5, block='4a', strides=(2, 2), is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) x4 = activate(x4) x4 = identity_block2d(x4, 3, [256, 512, 512], stage=5, block='4b', is_training=is_training, reuse=reuse, kernel_initializer=kernel_initializer) print('before gap: ', x4) x4 = tf.reduce_mean(x4, [1, 2]) print('after gap: ', x4) # flatten = tf.contrib.layers.flatten(x4) prob = tf.layers.dense( x4, self.class_num, reuse=reuse, kernel_initializer=tf.contrib.layers.xavier_initializer()) # tmp = tf.trainable_variables() # prob = tf.layers.batch_normalization(prob, training=is_training, name='fbn', reuse=reuse) print('prob', prob) return prob
def _build_graph(self, inputs): conf = Config() is_training = get_current_tower_context().is_training input, nextinput = inputs initializer = tf.random_uniform_initializer(-conf.init_scale, conf.init_scale) def get_basic_cell(): # cell = rnn.BasicLSTMCell(num_units=conf.hidden_size, forget_bias=0.0, reuse=tf.get_variable_scope().reuse) cell = ttq_rnn.TtqLSTMCell( num_units=conf.hidden_size, thre=0.05, #) forget_bias=1.0, reuse=tf.get_variable_scope().reuse) if is_training and conf.keep_prob < 1: cell = rnn.DropoutWrapper(cell, output_keep_prob=conf.keep_prob) return cell cell = rnn.MultiRNNCell( [get_basic_cell() for _ in range(conf.num_layers)]) def get_v(n): return tf.get_variable( n, [conf.batch_size, conf.hidden_size], #,[BATCH, HIDDEN_SIZE], trainable=False, initializer=tf.constant_initializer()) def replace_w(x): if x.op.name.endswith('W'): print("\nBefore quantize name: " + x.op.name) return tw_ternarize(x, 0.05) # tanh to round to [-1,+1] #return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit) elif x.op.name.endswith('b'): print("\nBefore quantize name: " + x.op.name) return tw_ternarize_bias(x, 0.05) else: print("\nNOT Quantizing:" + x.op.name) print(x.shape) print(type(x)) tf.summary.histogram(x.name, x) return x # Parameters of gates are concatenated into one multiply for efficiency. # with bit_utils.replace_variable(replace_w): self.state = state_var = \ (rnn.LSTMStateTuple(get_v('c0'), get_v('h0')), rnn.LSTMStateTuple(get_v('c1'), get_v('h1'))) embeddingW = tf.get_variable( 'embedding', [conf.vocab_size, conf.hidden_size], initializer=initializer) #tf.random_uniform_initializer()) input_feature = tf.nn.embedding_lookup( embeddingW, input) # B x seqlen x hiddensize if is_training and conf.keep_prob < 1: input_feature = Dropout(input_feature, conf.keep_prob) print("\n\nThe STATE:") print(self.state) with tf.variable_scope('LSTM', initializer=initializer): input_list = tf.unstack(input_feature, num=conf.num_steps, axis=1) # seqlen x (Bxhidden) outputs, last_state = rnn.static_rnn(cell, input_list, state_var, scope='rnn') update_state_ops = [] for k in range(conf.num_layers): update_state_ops.extend([ tf.assign(state_var[k].c, last_state[k].c), tf.assign(state_var[k].h, last_state[k].h) ]) # seqlen x (Bxrnnsize) output = tf.reshape(tf.concat(outputs, 1), [-1, conf.hidden_size]) # (Bxseqlen) x hidden with varreplace.remap_variables(replace_w): logits = FullyConnected('fc', output, conf.vocab_size, nl=tf.identity, W_init=initializer, b_init=initializer) xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=tf.reshape(nextinput, [-1])) with tf.control_dependencies(update_state_ops): self.cost = tf.truediv(tf.reduce_sum(xent_loss), tf.cast(conf.batch_size, tf.float32), name='cost') # log-perplexity perpl = tf.exp(self.cost / conf.num_steps, name='perplexity') summary.add_moving_summary(perpl, self.cost)
def _build_graph(self, inputs): image, label = inputs is_training = get_current_tower_context().is_training fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def binarize_weight(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fc' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def cabs(x): return tf.minimum(1.0, tf.abs(x), name='cabs') def activate(x): return fa(cabs(x)) image = image / 256.0 with remap_variables(binarize_weight), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False, nl=tf.identity): logits = (LinearWrap(image) .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True) .MaxPooling('pool0', 2, padding='SAME') .apply(activate) # 18 .Conv2D('conv1', 64, 3, padding='SAME') .apply(fg) .BatchNorm('bn1').apply(activate) .Conv2D('conv2', 64, 3, padding='SAME') .apply(fg) .BatchNorm('bn2') .MaxPooling('pool1', 2, padding='SAME') .apply(activate) # 9 .Conv2D('conv3', 128, 3, padding='VALID') .apply(fg) .BatchNorm('bn3').apply(activate) # 7 .Conv2D('conv4', 128, 3, padding='SAME') .apply(fg) .BatchNorm('bn4').apply(activate) .Conv2D('conv5', 128, 3, padding='VALID') .apply(fg) .BatchNorm('bn5').apply(activate) # 5 .tf.nn.dropout(0.5 if is_training else 1.0) .Conv2D('conv6', 512, 5, padding='VALID') .apply(fg).BatchNorm('bn6') .apply(cabs) .FullyConnected('fc1', 10, nl=tf.identity)()) tf.nn.softmax(logits, name='output') # compute the number of failed samples wrong = prediction_incorrect(logits, label) # monitor training error add_moving_summary(tf.reduce_mean(wrong, name='train_error')) cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7)) add_param_summary(('.*/W', ['histogram', 'rms'])) self.cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, self.cost)
def _build_graph(self, inputs):
    image, label = inputs
    image = tf.expand_dims(image, 3)
    image = image * 2 - 1   # center the pixel values at zero
    is_training = get_current_tower_context().is_training

    fw, fa, fg = get_dorefa(BITW, BITA, BITG)

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fc' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def cabs(x):
        return tf.minimum(1.0, tf.abs(x), name='cabs')

    def activate(x):
        return fa(cabs(x))

    with remap_variables(binarize_weight), \
            argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
        logits = (LinearWrap(image)
                  .Conv2D('conv0')
                  .MaxPooling('pool0', 2)
                  .apply(activate)
                  .Conv2D('conv1')
                  .apply(fg)
                  .Conv2D('conv2')
                  .apply(fg)
                  .MaxPooling('pool1', 2)
                  .apply(activate)
                  .Conv2D('conv3')
                  .apply(fg)
                  .apply(cabs)
                  .FullyConnected('fc0', 512, activation=tf.nn.relu)
                  .Dropout('dropout', 0.5)
                  .FullyConnected('fc1', 10, activation=tf.identity)())
    tf.nn.softmax(logits, name='output')

    # a vector of length B with the loss of each sample
    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss

    correct = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32, name='correct')
    accuracy = tf.reduce_mean(correct, name='accuracy')

    # This will monitor training error (in a moving-average fashion):
    # 1. write the value to tensorboard
    # 2. write the value to stat.json
    # 3. print the value after each epoch
    train_error = tf.reduce_mean(1 - correct, name='train_error')
    summary.add_moving_summary(train_error, accuracy)

    # Use a regex to find parameters to apply weight decay.
    # Here we apply a weight decay on all W (weight matrix) of all fc layers
    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    self.cost = tf.add_n([wd_cost, cost], name='total_cost')
    summary.add_moving_summary(cost, wd_cost, self.cost)

    # monitor histograms of all weights (of conv and fc layers) in tensorboard
    summary.add_param_summary(('.*/W', ['histogram', 'rms']))
def _build_graph(self, inputs): image, label = inputs image = image / 256.0 fw, fa, fg = get_dorefa(BITW, BITA, BITG) def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv1' in name or 'fct' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) def resblock(x, channel, stride): def get_stem_full(x): return (LinearWrap(x) .Conv2D('c3x3a', channel, 3) .BatchNorm('stembn') .apply(activate) .Conv2D('c3x3b', channel, 3)()) channel_mismatch = channel != x.get_shape().as_list()[3] if stride != 1 or channel_mismatch or 'pool1' in x.name: # handling pool1 is to work around an architecture bug in our model if stride != 1 or 'pool1' in x.name: x = AvgPooling('pool', x, stride, stride) x = BatchNorm('bn', x) x = activate(x) shortcut = Conv2D('shortcut', x, channel, 1) stem = get_stem_full(x) else: shortcut = x x = BatchNorm('bn', x) x = activate(x) stem = get_stem_full(x) return shortcut + stem def group(x, name, channel, nr_block, stride): with tf.variable_scope(name + 'blk1'): x = resblock(x, channel, stride) for i in range(2, nr_block + 1): with tf.variable_scope(name + 'blk{}'.format(i)): x = resblock(x, channel, 1) return x with remap_variables(new_get_variable), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False, nl=tf.identity): logits = (LinearWrap(image) # use explicit padding here, because our training framework has # different padding mechanisms from TensorFlow .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]]) .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True) .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC') .MaxPooling('pool1', 3, 2, padding='VALID') .apply(group, 'conv2', 64, 2, 1) .apply(group, 'conv3', 128, 2, 2) .apply(group, 'conv4', 256, 2, 2) .apply(group, 'conv5', 512, 2, 2) .BatchNorm('lastbn') .apply(nonlin) .GlobalAvgPooling('gap') .tf.multiply(49) # this is due to a bug in our model design .FullyConnected('fct', 1000)()) tf.nn.softmax(logits, name='output') ImageNetModel.compute_loss_and_error(logits, label)
def _build_graph(self, inputs): image, label = inputs image = image / 256.0 fw, fa = get_quantize(BITW, BITA) old_get_variable = tf.get_variable def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv1' in name or 'fct' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x * 0.1)) def resblock(x, channel, stride, bottleneck_dividend=4, stem_type='full'): bottleneck_channel_s = channel // bottleneck_dividend def get_stem_bottleneck(x): return (LinearWrap(x).Conv2D( 'c1x1shrink', bottleneck_channel_s, 1).BatchNorm('stembn1').apply(activate).Conv2D( 'c3x3', bottleneck_channel_s, 3).BatchNorm('stembn2').apply(activate).Conv2D( 'c1x1expand', channel, 1)()) def get_stem_full(x): return (LinearWrap(x).Conv2D( 'c3x3a', channel, 3).BatchNorm('stembn').apply(activate).Conv2D( 'c3x3b', channel, 3)()) get_stem = dict(bottleneck=get_stem_bottleneck, full=get_stem_full)[stem_type] channel_mismatch = channel != x.get_shape().as_list()[3] if stride != 1 or channel_mismatch or 'pool1' in x.name: # handling pool1 is to work around an architecture bug in our model if stride != 1 or 'pool1' in x.name: x = AvgPooling('pool', x, stride, stride) x = BatchNorm('bn', x) x = activate(x) shortcut = Conv2D('shortcut', x, channel, 1) stem = get_stem(x) else: shortcut = x x = BatchNorm('bn', x) x = activate(x) stem = get_stem(x) return shortcut + stem def group(x, name, channel, nr_block, stride): with tf.variable_scope(name + 'blk1'): x = resblock(x, channel, stride) for i in range(2, nr_block + 1): with tf.variable_scope(name + 'blk{}'.format(i)): x = resblock(x, channel, 1) return x with remap_variables(new_get_variable), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False, nl=tf.identity): logits = ( LinearWrap(image) # use explicit padding here, because our training framework has # different padding mechanisms from TensorFlow .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]]).Conv2D( 'conv1', 64, 7, stride=2, padding='VALID', use_bias=True).tf.pad( [[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC').MaxPooling( 'pool1', 3, 2, padding='VALID').apply( group, 'conv2', 64, 2, 1).apply(group, 'conv3', 128, 2, 2).apply( group, 'conv4', 256, 2, 2).apply(group, 'conv5', 512, 2, 2).BatchNorm('lastbn'). apply(nonlin).GlobalAvgPooling('gap').tf.multiply( 49) # this is due to a bug in our model design .FullyConnected('fct', 1000)()) prob = tf.nn.softmax(logits, name='output') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) wrong = prediction_incorrect(logits, label, 5, name='wrong-top5') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5')) wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost') add_param_summary(('.*/W', ['histogram', 'rms'])) self.cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, self.cost)
def _build_graph(self, inputs): image, label = inputs image = image / 255.0 fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fct' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) # still use relu for 32bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) with remap_variables(new_get_variable), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity): logits = (LinearWrap(image) .Conv2D('conv0', 96, 12, stride=4, padding='VALID') .apply(activate) .Conv2D('conv1', 256, 5, padding='SAME', split=2) .apply(fg) .BatchNorm('bn1') .MaxPooling('pool1', 3, 2, padding='SAME') .apply(activate) .Conv2D('conv2', 384, 3) .apply(fg) .BatchNorm('bn2') .MaxPooling('pool2', 3, 2, padding='SAME') .apply(activate) .Conv2D('conv3', 384, 3, split=2) .apply(fg) .BatchNorm('bn3') .apply(activate) .Conv2D('conv4', 256, 3, split=2) .apply(fg) .BatchNorm('bn4') .MaxPooling('pool4', 3, 2, padding='VALID') .apply(activate) .FullyConnected('fc0', 4096) .apply(fg) .BatchNorm('bnfc0') .apply(activate) .FullyConnected('fc1', 4096) .apply(fg) .BatchNorm('bnfc1') .apply(nonlin) .FullyConnected('fct', 1000, use_bias=True)()) tf.nn.softmax(logits, name='output') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) wrong = prediction_incorrect(logits, label, 5, name='wrong-top5') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5')) # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost') add_param_summary(('.*/W', ['histogram', 'rms'])) self.cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, self.cost)
def _build_graph(self, inputs): image, label = inputs image = image / 255.0 fw, fa = get_quantize(BITW, BITA) old_get_variable = tf.get_variable # monkey-patch tf.get_variable to apply fw def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fct' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) # still use relu for 32bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x * 0.1)) def inception_bn(name, x, nr_c0_conv_1x1, nr_c1_conv_1x1, nr_c1_conv_3x3, nr_c2_conv_1x1, nr_c2_conv_5x5, nr_c3_conv_1x1, nonlinearity=tf.nn.relu, internal_nonlin=None, do_proc=True): if internal_nonlin is None: internal_nonlin = nonlinearity outputs = [] with tf.variable_scope(name) as scope: c0 = Conv2D('column_0_conv_1x1', x, nr_c0_conv_1x1, 1) c0 = BatchNorm('bn_0_1x1', c0) if do_proc: c0 = activate(c0) outputs.append(c0) c1_1x1 = Conv2D('column_1_conv_1x1', x, nr_c1_conv_1x1, 1) c1_1x1 = BatchNorm('bn_1_1x1', c1_1x1) c1_1x1 = activate(c1_1x1) c1_3x3 = Conv2D('column_1_conv_3x3', c1_1x1, nr_c1_conv_3x3, 3) c1_3x3 = BatchNorm('bn_1_3x3', c1_3x3) if do_proc: c1_3x3 = activate(c1_3x3) outputs.append(c1_3x3) c2_1x1 = Conv2D('column_2_conv_1x1', x, nr_c2_conv_1x1, 1) c2_1x1 = BatchNorm('bn_2_1x1', c2_1x1) c2_1x1 = activate(c2_1x1) c2_5x5 = Conv2D('column_2_conv_5x5', c2_1x1, nr_c2_conv_5x5, 5) c2_5x5 = BatchNorm('bn_2_5x5', c2_5x5) if do_proc: c2_5x5 = activate(c2_5x5) outputs.append(c2_5x5) c3_maxpool = MaxPooling('column_3_maxpool', x, 3, 1, padding='SAME') c3_1x1 = Conv2D('column_3_conv_1x1', c3_maxpool, nr_c3_conv_1x1, 1) c3_1x1 = BatchNorm('bn_3_1x1', c3_1x1) if do_proc: c3_1x1 = activate(c3_1x1) outputs.append(c3_1x1) return tf.concat(outputs, 3, name='concat') with remap_variables(new_get_variable), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity): nl = tf.identity l = (LinearWrap(image).Conv2D( 'conv1_1', 64, 7, stride=2, padding='SAME').BatchNorm('bn1_1').MaxPooling( 'pool1', 3, 2, padding='SAME').apply(activate).Conv2D( 'conv2_1', 64, 1, padding='SAME').BatchNorm( 'bn2_1').apply(activate).Conv2D( 'conv2_2', 192, 3, padding='SAME').BatchNorm('bn2_2').MaxPooling( 'pool2', 3, 2, padding='SAME').apply(activate)()) l = inception_bn('inception_3_1', l, 64, 96, 128, 16, 32, 32, nl) l = inception_bn('inception_3_2', l, 128, 128, 192, 32, 96, 64, nl) l = MaxPooling('pool3', l, 3, 2, padding='SAME') l = inception_bn('inception_4_1', l, 192, 96, 208, 16, 48, 64, nl) l = inception_bn('inception_4_2', l, 160, 112, 224, 24, 64, 64, nl) l = inception_bn('inception_4_3', l, 128, 128, 256, 24, 64, 64, nl) l = inception_bn('inception_4_4', l, 112, 144, 288, 32, 64, 64, nl) l = inception_bn('inception_4_5', l, 256, 160, 320, 32, 128, 128, nl) l = MaxPooling('pool4', l, 3, 2, padding='SAME') l = inception_bn('inception_5_1', l, 256, 160, 320, 32, 128, 128, nl) l = inception_bn('inception_5_2', l, 384, 192, 384, 48, 128, 128, nl, do_proc=False) l = GlobalAvgPooling('gap', l) l = activate(l) l = FullyConnected('fct', l, 1000, use_bias=True) logits = l prob = tf.nn.softmax(logits, name='output') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) wrong = 
prediction_incorrect(logits, label, 5, name='wrong-top5') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5')) # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost') add_param_summary(('.*/W', ['histogram', 'rms'])) self.cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, self.cost)
def get_logits(self, image): if BITW == 't': fw, fa, fg = get_dorefa(32, 32, 32) fw = ternarize else: fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fct' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) # still use relu for 32bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) with remap_variables(new_get_variable), \ argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \ argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False): logits = (LinearWrap(image) .Conv2D('conv0', 96, 12, strides=4, padding='VALID', use_bias=True) .apply(activate) .Conv2D('conv1', 256, 5, padding='SAME', split=2) .apply(fg) .BatchNorm('bn1') .MaxPooling('pool1', 3, 2, padding='SAME') .apply(activate) .Conv2D('conv2', 384, 3) .apply(fg) .BatchNorm('bn2') .MaxPooling('pool2', 3, 2, padding='SAME') .apply(activate) .Conv2D('conv3', 384, 3, split=2) .apply(fg) .BatchNorm('bn3') .apply(activate) .Conv2D('conv4', 256, 3, split=2) .apply(fg) .BatchNorm('bn4') .MaxPooling('pool4', 3, 2, padding='VALID') .apply(activate) .FullyConnected('fc0', 4096) .apply(fg) .BatchNorm('bnfc0') .apply(activate) .FullyConnected('fc1', 4096, use_bias=False) .apply(fg) .BatchNorm('bnfc1') .apply(nonlin) .FullyConnected('fct', 1000, use_bias=True)()) add_param_summary(('.*/W', ['histogram', 'rms'])) tf.nn.softmax(logits, name='output') # for prediction return logits
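# The get_logits / _build_graph examples above all share one weight-remapping
# pattern: wrap fw around every kernel named '*/W' except the first and last
# layer. A minimal, self-contained sketch of just that mechanism follows,
# assuming TF1 graph mode with tensorpack; the sign() stand-in below is
# illustrative only and is NOT the DoReFa fw returned by get_dorefa.
import tensorflow as tf
from tensorpack.tfutils.varreplace import remap_variables

def toy_fw(v):
    # straight-through sign "quantizer" (illustrative only)
    return v + tf.stop_gradient(tf.sign(v) - v)

def new_get_variable(v):
    name = v.op.name
    # keep the first and last layer at full precision, as in the examples above
    if not name.endswith('W') or 'conv0' in name or 'fct' in name:
        return v
    return toy_fw(v)

with remap_variables(new_get_variable):
    with tf.variable_scope('conv0'):
        w0 = tf.get_variable('W', shape=[3, 3, 3, 16])    # first layer: left untouched
    with tf.variable_scope('conv1'):
        w1 = tf.get_variable('W', shape=[3, 3, 16, 32])   # remapped through toy_fw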
def build_graph(self, image, label):
    is_training = get_current_tower_context().is_training
    fw, fa, fg = get_dorefa(BITW, BITA, BITG)  # get the quantizers for weights, activations and gradients

    # monkey-patch tf.get_variable to apply fw
    def binarize_weight(v):
        # note: the first and last layers of a model are usually left unquantized
        name = v.op.name
        # don't binarize first and last layer
        if not name.endswith('W') or 'conv0' in name or 'fc' in name:
            return v
        else:
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v)

    def nonlin(x):
        # clipped ReLU
        if BITA == 32:
            return tf.nn.relu(x)
        return tf.clip_by_value(x, 0.0, 1.0)

    def activate(x):
        # clipped ReLU first, then quantize the activations
        return fa(nonlin(x))

    image = image / 256.0

    # pass every variable created below through binarize_weight and set the BN/Conv defaults
    with remap_variables(binarize_weight), \
            argscope(BatchNorm, momentum=0.9, epsilon=1e-4, center=True, scale=True), \
            argscope(Conv2D, use_bias=False):
        # LinearWrap builds a sequential model; apply() takes a function handle
        # and can forward extra arguments to it
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)  # conv0: [None,40,40,3] -> [None,36,36,48]
                  .MaxPooling('pool0', 2, padding='SAME')                  # [None,36,36,48] -> [None,18,18,48]
                  .apply(activate)      # quantize the activations
                  # 18
                  .Conv2D('conv1', 64, 3, padding='SAME')                  # [None,18,18,48] -> [None,18,18,64]
                  .apply(fg)            # quantize the gradients
                  .BatchNorm('bn1').apply(activate)
                  .Conv2D('conv2', 64, 3, padding='SAME')                  # [None,18,18,64] -> [None,18,18,64]
                  .apply(fg).BatchNorm('bn2')
                  .MaxPooling('pool1', 2, padding='SAME')                  # [None,18,18,64] -> [None,9,9,64]
                  .apply(activate)
                  # 9
                  .Conv2D('conv3', 128, 3, padding='VALID')                # [None,9,9,64] -> [None,7,7,128]
                  .apply(fg).BatchNorm('bn3').apply(activate)
                  # 7
                  .Conv2D('conv4', 128, 3, padding='SAME')                 # [None,7,7,128] -> [None,7,7,128]
                  .apply(fg).BatchNorm('bn4').apply(activate)
                  .Conv2D('conv5', 128, 3, padding='VALID')                # [None,7,7,128] -> [None,5,5,128]
                  .apply(fg).BatchNorm('bn5').apply(activate)
                  # 5
                  .Dropout(rate=0.5 if is_training else 0.0)
                  .Conv2D('conv6', 512, 5, padding='VALID')                # [None,5,5,128] -> [None,1,1,512]
                  .apply(fg).BatchNorm('bn6')
                  .apply(nonlin)        # only the clipped ReLU here, no quantization
                  .FullyConnected('fc1', 10)())                            # fc1: [None,10]

    tf.nn.softmax(logits, name='output')

    # compute the number of failed samples
    wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_tensor')
    # monitor training error
    add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
    add_param_summary(('.*/W', ['histogram', 'rms']))

    total_cost = tf.add_n([cost, wd_cost], name='cost')
    add_moving_summary(cost, wd_cost, total_cost)
    return total_cost
def _build_graph(self, inputs): image, label, ious, valids, bndboxes = inputs image = tf.round(image) fw, fa, fg = get_dorefa(BITW, BITA, BITG) old_get_variable = tf.get_variable def monitor(x, name): if MONITOR == 1: return tf.Print(x, [x], message='\n\n' + name + ': ', summarize=1000, name=name) else: return x def new_get_variable(v): name = v.op.name if not name.endswith('W') or 'conv1' in name or 'conv_obj' in name or 'conv_box' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) if MONITOR == 1: return tf.Print(fw(v), [fw(v)], message='\n\n' + v.name + ', Quantized weights are:', summarize=100) else: return fw(v) def activate(x): if BITA == 32: return tf.nn.relu(x) else: return fa(tf.nn.relu(x)) def bn_activate(name, x): x = BatchNorm(name, x) x = monitor(x, name + '_noact_out') return activate(x) def halffire(name, x, num_squeeze_filters, num_expand_3x3_filters, skip): out_squeeze = Conv2D('squeeze_conv_' + name, x, out_channel=num_squeeze_filters, kernel_shape=1, stride=1, padding='SAME') out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze) out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze, out_channel=num_expand_3x3_filters, kernel_shape=3, stride=1, padding='SAME') out_expand_3x3 = bn_activate('bn_expand_3x3_' + name, out_expand_3x3) if skip == 0: return out_expand_3x3 else: return tf.add(x, out_expand_3x3) def halffire_noact(name, x, num_squeeze_filters, num_expand_3x3_filters): out_squeeze = Conv2D('squeeze_conv_' + name, x, out_channel=num_squeeze_filters, kernel_shape=1, stride=1, padding='SAME') out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze) out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze, out_channel=num_expand_3x3_filters, kernel_shape=3, stride=1, padding='SAME') return out_expand_3x3 with remap_variables(new_get_variable), \ argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4): image = monitor(image, 'image_out') l = Conv2D('conv1', image, out_channel=32, kernel_shape=3, stride=2, padding='SAME') l = bn_activate('bn1', l) l = monitor(l, 'conv1_out') l = MaxPooling('pool1', l, shape=3, stride=2, padding='SAME') l = monitor(l, 'pool1_out') l = halffire('fire1', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire1_out') l = MaxPooling('pool2', l, shape=3, stride=2, padding='SAME') l = monitor(l, 'pool2_out') l = halffire('fire2', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire2_out') l = MaxPooling('pool3', l, shape=3, stride=2, padding='SAME') l = monitor(l, 'pool3_out') l = halffire('fire3', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire3_out') l = halffire('fire4', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire4_out') l = halffire('fire5', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire5_out') l = halffire('fire6', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire6_out') l = halffire('fire7', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0) l = monitor(l, 'fire7_out') # Classification classify = Conv2D('conv_class', l, out_channel=12, kernel_shape=1, stride=1, padding='SAME') classify = bn_activate('bn_class', classify) classify = monitor(classify, 'conv_class_out') logits = GlobalAvgPooling('pool_class', classify) class_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) class_loss = tf.reduce_mean(class_loss, name='cross_entropy_loss') wrong = prediction_incorrect(logits, label, 1, 
name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) # Object Detection l = tf.concat([l, classify], axis=3) objdetect = Conv2D('conv_obj', l, out_channel=1, kernel_shape=1, stride=1, padding='SAME') objdetect = tf.identity(objdetect, name='objdetect_out') objdetect_loss = tf.losses.hinge_loss(labels=ious, logits=objdetect) bndbox = Conv2D('conv_box', l, out_channel=4, kernel_shape=1, stride=1, padding='SAME') bndbox = tf.identity(bndbox, name='bndbox_out') bndbox = tf.multiply(bndbox, valids, name='mult0') bndbox_loss = tf.losses.mean_squared_error(labels=bndboxes, predictions=bndbox) # weight decay on all W of fc layers # reg_cost = regularize_cost('(fire7|conv_obj|conv_box).*/W', l2_regularizer(1e-5), name='regularize_cost') # cost = class_loss*objdetect_loss*bndbox_loss # cost = class_loss + objdetect_loss + bndbox_loss + reg_cost cost = class_loss + 10*objdetect_loss + bndbox_loss add_moving_summary(class_loss, objdetect_loss, bndbox_loss, cost) self.cost = cost tf.get_variable = old_get_variable
def build_graph(self, image, label): is_training = get_current_tower_context().is_training fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def binarize_weight(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'weak' in name or 'fc' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) #return ternarize(v) def cabs(x): return tf.minimum(1.0, tf.abs(x), name='cabs') def activate(x): return fa(cabs(x)) def merge(x, y): #return x + y #return x - y return tf.concat([x,y], axis=3) image = image / 256.0; k=3; zp=0.25; zp2=zp / 1 #scale = tf.train.exponential_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*5, decay_rate=0.5, staircase=True, name='scale') #scale = tf.where(scale>0.001, scale, tf.zeros_like(scale)) scale = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*50, alpha=0.0) tf.summary.scalar('scale', scale); endconv=[]; endweak=[] #scale2 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*50, alpha=0.0) #scale3 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*80, alpha=0.0) with remap_variables(binarize_weight), \ argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False): net=Conv2D('conv0', image, np.round(48*zp), 5, padding='VALID', use_bias=True) net=MaxPooling('pool0', net, 2, padding='SAME'); net=activate(net) net1=Conv2D('conv1', net, np.round(64*zp), 3, padding='SAME'); net1=BatchNorm('bn1', net1); endconv.append(net1) net2=Conv2D('weak1', net, np.round(64*zp2), k, padding='SAME'); net2=BatchNorm('bn12', net2); endweak.append(net2); # net2=tf.nn.relu(net2) net=merge(activate(net1), scale*net2) #net=activate(net1) net1=Conv2D('conv2', net, np.round(64*zp), 3, padding='SAME'); net1=BatchNorm('bn2', net1); endconv.append(net1) net2=Conv2D('weak2', net, np.round(64*zp2), k, padding='SAME'); net2=BatchNorm('bn22', net2); endweak.append(net2); # net2=tf.nn.relu(net2) net1=MaxPooling('pool1', net1, 2, padding='SAME'); net2=MaxPooling('pool12', net2, 2, padding='SAME'); net=merge(activate(net1), scale*net2) net=activate(net1) net1=Conv2D('conv3', net, np.round(128*zp), 3, padding='VALID'); net1=BatchNorm('bn3', net1); endconv.append(net1) net2=Conv2D('weak3', net, np.round(128*zp2), k, padding='VALID'); net2=BatchNorm('bn32', net2); endweak.append(net2); # net2=tf.nn.relu(net2) net=merge(activate(net1), scale*net2) #net=activate(net1) net1=Conv2D('conv4', net, np.round(128*zp), 3, padding='SAME'); net1=BatchNorm('bn4', net1); endconv.append(net1) net2=Conv2D('weak4', net, np.round(128*zp2), k, padding='SAME'); net2=BatchNorm('bn42', net2); endweak.append(net2); # net2=tf.nn.relu(net2) net=merge(activate(net1), scale*net2) # net=activate(net1) net1=Conv2D('conv5', net, np.round(128*zp), 3, padding='VALID'); net1=BatchNorm('bn5', net1); endconv.append(net1) net2=Conv2D('weak5', net, np.round(128*zp2), k, padding='VALID'); net2=BatchNorm('bn52', net2); endweak.append(net2); # net2=tf.nn.relu(net2) net=merge(activate(net1), scale*net2) #net=activate(net1) net=tf.nn.dropout(net, 0.5 if is_training else 1.0) net1=Conv2D('conv6', net, np.round(512*zp), 5, padding='VALID'); net1=BatchNorm('bn6', net1); endconv.append(net1) net2=Conv2D('weak6', net, np.round(512*zp2), 5, padding='VALID'); net2=BatchNorm('bn62', net2); endweak.append(net2); # net2=tf.nn.relu(net2) 
net=merge(cabs(net1), scale*net2) # net=cabs(net1) logits=FullyConnected('fc1', net, 10) tf.nn.softmax(logits, name='output') # compute the number of failed samples wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_tensor') # monitor training error add_moving_summary(tf.reduce_mean(wrong, name='train_error')) cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7)) add_param_summary(('.*/W', ['histogram', 'rms'])) total_cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, total_cost) for i in range(len(endweak)): add_moving_summary(tf.reduce_mean(tf.abs(endconv[i]), name='mean_conv_'+str(i+1) ) ) add_moving_summary(tf.reduce_mean(tf.abs(endweak[i]), name='mean_weak_'+str(i+1) ) ) return total_cost
def ResNet18(image, label, scope, is_training, dataset='cifar', reuse=False, Distill=None, bit_a=32, bit_w=32, bit_g=32): end_points = {} nChannels = [] if 'cifar' in dataset or 'svhn' in dataset: nChannels = [64, 64, 128, 256, 512] elif 'imagenet' in dataset: nChannels = [64, 256, 512, 1024, 2048] assert len(nChannels) > 0, "empty channels!!" stride = [1, 2, 2, 2] # 4 (stride) * n * 2 + 2. shortcut is not involved. # 32 -> 16 -> 8 -> 4 n = 2 if scope == 'Teacher': with tf.variable_scope(scope): std = tf.contrib.layers.conv2d(image, nChannels[0], [3, 3], 1, scope='base_conv', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn0', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) for i in range(len(stride)): std = NetworkBlock(std, ResBlock, n, nChannels[i + 1], stride[i], is_training=is_training, reuse=reuse, name='Resblock%d' % i) fc = tf.reduce_mean(std, [1, 2]) logits = tf.contrib.layers.fully_connected( fc, label.get_shape().as_list()[-1], weights_initializer=tf.contrib.layers.xavier_initializer(), biases_initializer=tf.zeros_initializer(), trainable=True, scope='full', reuse=reuse) end_points['Logits'] = logits elif scope == 'Student': fw, fa, fg = get_dorefa(bit_w, bit_a, bit_g) # monkey-patch tf.get_variable to apply fw def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith( 'weights') or 'base_conv' in name or 'full' in name: return v else: tf.logging.info("Quantizing weight {} at bits {}".format( v.op.name, bit_w)) return fw(v) def nonlin(x): if bit_a == 32: return tf.nn.relu(x) # still use relu for 32-bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): tf.logging.info("Quantizing activations {} at bits {}".format( x.name, bit_a)) return fa(nonlin(x)) with tf.variable_scope(scope), remap_variables(new_get_variable): std = tf.contrib.layers.conv2d(image, nChannels[0], [3, 3], 1, scope='base_conv', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn0', trainable=True, is_training=is_training, reuse=reuse) for i in range(len(stride)): std = NetworkBlock(std, ResBlock, n, nChannels[i + 1], stride[i], activate, is_training=is_training, reuse=reuse, name='Resblock%d' % i, scope=scope) fc = tf.reduce_mean(std, [1, 2]) logits = tf.contrib.layers.fully_connected( fc, label.get_shape().as_list()[-1], weights_initializer=tf.contrib.layers.xavier_initializer(), biases_initializer=tf.zeros_initializer(), trainable=True, scope='full', reuse=reuse) end_points['Logits'] = logits if Distill is not None: if Distill == 'DML': teacher_train = True else: is_training = False teacher_train = False with tf.variable_scope('Teacher'): with tf.contrib.framework.arg_scope( [tf.contrib.layers.conv2d, tf.contrib.layers.fully_connected], variables_collections=[ tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher' ]): with tf.contrib.framework.arg_scope( [tf.contrib.layers.batch_norm], variables_collections=[ tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher' ]): tch = tf.contrib.layers.conv2d(image, nChannels[0], [3, 3], 1, scope='base_conv', trainable=teacher_train, reuse=reuse) tch = tf.contrib.layers.batch_norm(tch, scope='bn0', trainable=teacher_train, is_training=is_training, reuse=reuse) tch = tf.nn.relu(tch) for i in range(len(stride)): tch = NetworkBlock(tch, ResBlock, n, nChannels[i + 1], stride[i], is_training=is_training, reuse=reuse, name='Resblock%d' % i) fc = tf.reduce_mean(tch, [1, 2]) logits_tch = tf.contrib.layers.fully_connected( fc, label.get_shape().as_list()[-1], 
weights_initializer=tf.contrib.layers. xavier_initializer(), biases_initializer=tf.zeros_initializer(), trainable=teacher_train, scope='full', reuse=reuse) end_points['Logits_tch'] = logits_tch with tf.variable_scope('Distillation'): feats = tf.get_collection('feat') student_feats = feats[:len(feats) // 2] teacher_feats = feats[len(feats) // 2:] feats_noact = tf.get_collection('feat_noact') student_feats_noact = feats[:len(feats_noact) // 2] teacher_feats_noact = feats[len(feats_noact) // 2:] if Distill == 'Soft_logits': tf.add_to_collection( 'dist', Response.Soft_logits(logits, logits_tch, 3)) elif Distill == 'DML': tf.add_to_collection('dist', Response.DML(logits, logits_tch)) elif Distill == 'FT': tf.add_to_collection( 'dist', Response.Factor_Transfer(student_feats_noact[-1], teacher_feats_noact[-1])) elif Distill == 'FitNet': tf.add_to_collection( 'dist', Multiple.FitNet(student_feats, teacher_feats)) elif Distill == 'AT': tf.add_to_collection( 'dist', Multiple.Attention_transfer(student_feats, teacher_feats)) elif Distill == 'AB': tf.add_to_collection( 'dist', Multiple.AB_distillation(student_feats, teacher_feats, 1., 3e-3)) elif Distill == 'FSP': tf.add_to_collection('dist', Shared.FSP(student_feats, teacher_feats)) elif Distill[:3] == 'KD-': tf.add_to_collection( 'dist', Shared.KD_SVD(student_feats, teacher_feats, Distill[-3:])) elif Distill == 'RKD': tf.add_to_collection( 'dist', Relation.RKD(logits, logits_tch, l=[5e1, 1e2])) elif Distill == 'MHGD': tf.add_to_collection( 'dist', Relation.MHGD(student_feats, teacher_feats)) elif Distill == 'MHGD-RKD': tf.add_to_collection( 'dist', Relation.MHGD(student_feats, teacher_feats) + Relation.RKD(logits, logits_tch, l=[5e1, 1e2])) elif Distill == 'MHGD-RKD-SVD': tf.add_to_collection( 'dist', Relation.MHGD(student_feats, teacher_feats) + Relation.RKD(logits, logits_tch, l=[5e1, 1e2]) + Shared.KD_SVD(student_feats, teacher_feats, "SVD")) return end_points
def build_graph(self, image, label): image = image / 256.0 is_training = get_current_tower_context().is_training fw, fa, fg = get_dorefa(BITW, BITA, BITG) def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv1' in name or 'fct' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) def resblock(x, channel, stride): def get_stem_full(x): return (LinearWrap(x) .Conv2D('c3x3a', channel, 3) .quan_all_L2norm('stembn') .apply(activate) .Conv2D('c3x3b', channel, 3)()) channel_mismatch = channel != x.get_shape().as_list()[3] if stride != 1 or channel_mismatch or 'pool1' in x.name: # handling pool1 is to work around an architecture bug in our model if stride != 1 or 'pool1' in x.name: x = AvgPooling('pool', x, stride, stride) x = quan_all_L2norm('bn', x) x = activate(x) shortcut = Conv2D('shortcut', x, channel, 1) stem = get_stem_full(x) else: shortcut = x x = quan_all_L2norm('bn', x) x = activate(x) stem = get_stem_full(x) return shortcut + stem def group(x, name, channel, nr_block, stride): with tf.variable_scope(name + 'blk1'): x = resblock(x, channel, stride) for i in range(2, nr_block + 1): with tf.variable_scope(name + 'blk{}'.format(i)): x = resblock(x, channel, 1) return x with remap_variables(new_get_variable), \ argscope(quan_all_L2norm, momentum=0.9, eps=1e-4,train=is_training), \ argscope(Conv2D, use_bias=False, nl=tf.identity): logits = (LinearWrap(image) # use explicit padding here, because our private training framework has # different padding mechanisms from TensorFlow .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]]) .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True) .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC') .MaxPooling('pool1', 3, 2, padding='VALID') .apply(group, 'conv2', 64, 2, 1) .apply(group, 'conv3', 128, 2, 2) .apply(group, 'conv4', 256, 2, 2) .apply(group, 'conv5', 512, 2, 2) .quan_all_L2norm('lastbn') .apply(nonlin) .GlobalAvgPooling('gap') #.tf.multiply(49) # this is due to a bug in our model design .FullyConnected('fct', 10)()) tf.nn.softmax(logits, name='output') # compute the number of failed samples cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_vector') # monitor training error add_moving_summary(tf.reduce_mean(wrong, name='train_error')) # weight decay on all W of fc layers #wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),480000, 0.2, True) #wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost') #add_moving_summary(cost, wd_cost) add_param_summary(('.*/W', ['histogram'])) # monitor W return tf.add_n([cost], name='cost')
def build_graph(self, image, label): # get quantization function # quantize weights qw = quantize_weight(int(self.quantizer_config['BITW']), self.quantizer_config['name'], self.quantizer_config['W_opts'], self.quantizer_config) # quantize activation if self.quantizer_config['BITA'] in ['32', 32]: qa = tf.identity else: qa = quantize_activation(int(self.quantizer_config['BITA']), self.quantizer_config['name'], self.quantizer_config) # quantize gradient qg = quantize_gradient(int(self.quantizer_config['BITG'])) def new_get_variable(v): name = v.op.name # don't quantize first and last layer if not name.endswith('/W') or 'conv1' in name or 'fct' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) return qw(v) def activate(x): return qa(self.activation(x)) @layer_register(use_scope=True) def DWConv2D(inputs, channel, kernel_size=3, stride=1, padding='SAME', data_format=None, dilations=None): #output = tf.keras.layers.DepthwiseConv2D(kernel_size, strides=(stride,stride), padding='same', use_bias=False)(inputs) #print(output.name, ': ', inputs.shape, ' --> ', output.shape) #return output curr_channel = inputs.get_shape().as_list()[3] var = tf.get_variable( name='dwconv_kernel', shape=[kernel_size, kernel_size, curr_channel, 1], initializer=tf.glorot_uniform_initializer) output = tf.nn.depthwise_conv2d(inputs, var, strides=(1, stride, stride, 1), padding=padding) print(output.name, ': ', inputs.shape, ' --> ', output.shape) return output @layer_register(use_scope=True) def SE_block(input_feature, ratio=8): kernel_initializer = tf.contrib.layers.variance_scaling_initializer( ) bias_initializer = tf.constant_initializer(value=0.0) channel = input_feature.get_shape()[-1] # Global average pooling squeeze = tf.reduce_mean(input_feature, axis=[1, 2], keepdims=True) excitation = tf.layers.dense(inputs=squeeze, units=channel // ratio, activation=tf.nn.relu, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name='bottleneck_fc') excitation = tf.layers.dense(inputs=excitation, units=channel, activation=tf.nn.sigmoid, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name='recover_fc') scale = input_feature * excitation return scale def SepConv(x, name, nr_block, channel, kernel_size, stride): with tf.variable_scope(name, reuse=tf.AUTO_REUSE): x = DWConv2D('dwconv', x, channel, kernel_size, stride) x = Conv2D('pwconv', x, channel, 1, strides=(stride, stride)) return x def block(x, channel, kernel_size, stride, extension, SE): channel_match = channel == x.get_shape().as_list()[3] shortcut = x if x.get_shape().as_list()[3] < 20: x = Conv2D('pwconv_a', x, channel * extension, 1, strides=(1, 1)) else: x = Conv2D('pwconv_a1', x, 20, 1, strides=(1, 1)) x = Conv2D('pwconv_a2', x, channel * extension, 1, strides=(1, 1)) x = BatchNorm('bn_a', x) x = activate(x) x = DWConv2D('dwconv_b', x, channel * extension, kernel_size, stride) x = BatchNorm('bn_b', x) x = activate(x) if SE: x = SE_block('se_block', x) if channel < 20: x = Conv2D('pwconv_c', x, channel, 1, strides=(1, 1)) else: x = Conv2D('pwconv_c1', x, 20, 1, strides=(1, 1)) x = Conv2D('pwconv_c2', x, channel, 1, strides=(1, 1)) x = BatchNorm('bn_c', x) if stride == 1 and channel_match: x = x + shortcut return x def group(x, name, nr_block, channel, kernel_size, stride, extension, SE): with tf.variable_scope(name + 'blk1', reuse=tf.AUTO_REUSE): x = block(x, channel, kernel_size, stride, extension, SE) for i in range(2, nr_block + 1): with tf.variable_scope(name + 'blk{}'.format(i), 
reuse=tf.AUTO_REUSE): x = block(x, channel, kernel_size, 1, extension, SE) return x with remap_variables(new_get_variable), \ argscope(BatchNorm, decay=0.99, epsilon=1e-3), \ argscope(Conv2D, use_bias=False, nl=tf.identity, kernel_initializer=tf.variance_scaling_initializer(scale=float(self.initializer_config['scale']), mode=self.initializer_config['mode'])): logits = ( LinearWrap(image).Conv2D('conv1', 32, 3) # size=32 .apply(group, 'mbconv2', 1, 16, 3, 1, 6, False) # size=32 .apply(group, 'mbconv3', 2, 24, 3, 1, 6, False) # size=16 .apply(group, 'mbconv4', 3, 32, 3, 2, 6, False) # size=8 .apply(group, 'mbconv5', 4, 64, 3, 2, 6, False) # size=4 .apply(group, 'mbconv6', 3, 96, 3, 1, 6, False) # size=4 .apply(group, 'mbconv7', 3, 160, 3, 2, 6, False) # size=2 .apply(group, 'mbconv8', 1, 320, 3, 1, 6, False) # size=2 .Conv2D('conv9/pwconv_a1', 20, 1, strides=(1, 1)).Conv2D( 'conv9/pwconv_a2', 1280, 1, strides=(1, 1)).BatchNorm('last_bn').apply( activate).GlobalAvgPooling('gap').FullyConnected( 'fct', self.nb_classes)()) prob = tf.nn.softmax(logits, name='output') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') # regularization if self.regularizer_config['name'] not in [None, 'None']: reg_func = getattr(regularizers, self.regularizer_config['name'])().get_func( self.regularizer_config, self.quantizer_config) reg_cost = tf.multiply(float(self.regularizer_config['lmbd']), regularize_cost('.*/W', reg_func), name='reg_cost') #reg_cost = tf.multiply(float(self.regularizer_config['lmbd']), regularize_cost_from_collection(), name='reg_cost') total_cost = tf.add_n([cost, reg_cost], name='total_cost') else: total_cost = cost # summary def add_summary(logits, cost): err_top1 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='err_top1') add_moving_summary( tf.reduce_mean(err_top1, name='train_error_top1')) err_top5 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 5)), tf.float32, name='err_top5') add_moving_summary( tf.reduce_mean(err_top5, name='train_error_top5')) add_moving_summary(cost) add_param_summary(('.*/W', ['histogram'])) # monitor W add_summary(logits, cost) return total_cost
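# SE_block above wires the squeeze-and-excitation gate with tf.layers.dense; the NumPy
# sketch below restates just the math (global average pool -> bottleneck FC + ReLU ->
# recover FC + sigmoid -> channel-wise rescale). Weight shapes here are illustrative.
import numpy as np

def se_gate(x, w1, b1, w2, b2):
    # x: NHWC feature map; w1: (C, C // ratio); w2: (C // ratio, C)
    squeeze = x.mean(axis=(1, 2), keepdims=True)          # (N, 1, 1, C) global average pool
    excite = np.maximum(squeeze @ w1 + b1, 0.0)           # bottleneck_fc + ReLU
    excite = 1.0 / (1.0 + np.exp(-(excite @ w2 + b2)))    # recover_fc + sigmoid gate
    return x * excite                                     # per-channel rescaling

rng = np.random.RandomState(0)
N, H, W, C, ratio = 2, 4, 4, 16, 8
x = rng.randn(N, H, W, C)
w1, b1 = 0.1 * rng.randn(C, C // ratio), np.zeros(C // ratio)
w2, b2 = 0.1 * rng.randn(C // ratio, C), np.zeros(C)
print(se_gate(x, w1, b1, w2, b2).shape)                   # (2, 4, 4, 16)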
def _build_graph(self, inputs): inp, label = inputs is_training = get_current_tower_context().is_training fw, fa = get_dorefa(self.bitw, self.bita) def binarize_weight(v): name = v.op.name if not (name.endswith('W') or name.endswith('b')): logger.info("Not quantizing {}".format(name)) return v elif not self.quant_ends and 'conv0' in name: logger.info("Not quantizing {}".format(name)) return v elif not self.quant_ends and 'last_linear' in name: logger.info("Not quantizing {}".format(name)) return v elif not self.quant_ends and (self.net_fn == fcn1_net or self.net_fn == fcn2_net) and 'linear0' in name: logger.info("Not quantizing {}".format(name)) return v else: logger.info("Quantizing weight {}".format(name)) return fw(v) def nonlin(x, name="activate"): if self.bita == 32: return fa(tf.nn.relu(BNWithTrackedMults(x))) else: return fa(tf.clip_by_value(BNWithTrackedMults(x), 0.0, 1.0)) with remap_variables(binarize_weight), \ argscope([FullyConnectedWithTrackedMults], network_complexity=self.network_complexity), \ argscope([Conv2DWithTrackedMults], network_complexity=self.network_complexity), \ argscope([BNReLUWithTrackedMults], network_complexity=self.network_complexity), \ argscope([BNWithTrackedMults], network_complexity=self.network_complexity), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4): l = self.net_fn(inp, nonlin, self.n_context) logits = FullyConnectedWithTrackedMults('last_linear', l, out_dim=self.n_spks, nl=tf.identity) prob = tf.nn.softmax(logits, name='output') # used for validation accuracy of utterance identity_guesses = flatten(tf.argmax(prob, axis=1)) uniq_identities, _, count = tf.unique_with_counts(identity_guesses) idx_to_identity_with_most_votes = tf.argmax(count) chosen_identity = tf.gather(uniq_identities, idx_to_identity_with_most_votes) wrong = tf.expand_dims(tf.not_equal(chosen_identity, tf.cast(label[0], tf.int64)), axis=0, name='utt-wrong') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') add_moving_summary(cost) wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) with tf.name_scope('original-weight-summaries'): add_param_summary(('.*/W', ['rms', 'histogram'])) add_param_summary(('.*/b', ['rms', 'histogram'])) with tf.name_scope('activation-summaries'): def fn(name): return (name.endswith('output') or name.endswith('output:0')) and "Inference" not in name and 'quantized' not in name tensors = get_tensors_from_graph(tf.get_default_graph(), fn) logger.info("Adding activation tensors to summary: {}".format(tensors)) for tensor in tensors: add_tensor_summary(tensor, ['rms', 'histogram']) wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(), 480000, 0.2, True) wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost') add_moving_summary(wd_cost) self.cost = tf.add_n([cost, wd_cost], name='cost') tf.constant([self.network_complexity['mults']], name='TotalMults') tf.constant([self.network_complexity['weights']], name='TotalWeights') logger.info("Parameter count: {}".format(self.network_complexity))
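# The utterance-level check above picks the speaker identity that wins a majority vote
# over per-frame argmax predictions (unique_with_counts + argmax of the counts). A small
# NumPy restatement of that voting rule, for reference:
import numpy as np

def utterance_vote(frame_logits):
    frame_ids = frame_logits.argmax(axis=1)               # per-frame identity guesses
    ids, counts = np.unique(frame_ids, return_counts=True)
    return ids[counts.argmax()]                           # identity with the most votes

frame_logits = np.random.RandomState(0).randn(50, 8)      # 50 frames, 8 speakers
print(utterance_vote(frame_logits))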
def _build_graph(self, inputs): image, label = inputs """Add a single channel here""" image = tf.expand_dims(image, 3) image = image * 256 image = tf.round(image) fw, fa, fg = get_dorefa(BITW, BITA, BITG) old_get_variable = tf.get_variable def monitor(x, name): if MONITOR == 1: return tf.Print(x, [x], message='\n\n' + name + ': ', summarize=1000, name=name) else: return x def new_get_variable(v): name = v.op.name if not name.endswith('W') or 'conv0' in name or 'fc1' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) if MONITOR == 1: return tf.Print(fw(v), [fw(v)], message='\n\n' + v.name + ', Quantized weights are:', summarize=100) else: return fw(v) def activate(x): if BITA == 32: return tf.nn.relu(x) else: return fa(tf.nn.relu(x)) with remap_variables(new_get_variable), \ argscope(Conv2D, kernel_shape=3, use_bias=False, nl=tf.identity, out_channel=32): logits = (LinearWrap(image).apply(monitor, 'image_out').Conv2D( 'conv0').apply(fg).BatchNorm('bn0').apply(activate).apply( monitor, 'conv0_out').MaxPooling('pool0', 2).apply( monitor, 'pool0_out').Conv2D('conv1').apply( fg).BatchNorm('bn1').apply(activate).apply( monitor, 'conv1_out').Conv2D('conv2').apply( fg).BatchNorm('bn2').apply(activate).apply( monitor, 'conv2_out').MaxPooling( 'pool1', 2).apply( monitor, 'pool1_out').Conv2D('conv3'). apply(fg).BatchNorm('bn3').apply(activate).apply( monitor, 'conv3_out').FullyConnected( 'fc0', use_bias=False, out_dim=20, nl=tf.identity).apply(activate).apply( monitor, 'fc0_out').FullyConnected( 'fc1', use_bias=False, out_dim=10, nl=tf.identity).apply( monitor, 'fc1_out')()) prob = tf.nn.softmax(logits, name='prob') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = symbf.prediction_incorrect(logits, label, name='incorrect') accuracy = symbf.accuracy(logits, label, name='accuracy') train_error = tf.reduce_mean(wrong, name='train_error') summary.add_moving_summary(train_error, accuracy) wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss), name='regularize_loss') self.cost = tf.add_n([wd_cost, cost], name='total_cost') summary.add_moving_summary(cost, wd_cost, self.cost)
def build_graph(self, image, label): image = image / 255.0 fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fct' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) # still use relu for 32bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) with remap_variables(new_get_variable), \ argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False): logits = (LinearWrap(image).Conv2D( 'conv0', 96, 12, strides=4, padding='VALID').apply(activate).Conv2D( 'conv1', 256, 5, padding='SAME', split=2).apply(fg).BatchNorm('bn1').MaxPooling( 'pool1', 3, 2, padding='SAME').apply(activate).Conv2D( 'conv2', 384, 3).apply(fg).BatchNorm('bn2').MaxPooling( 'pool2', 3, 2, padding='SAME').apply(activate).Conv2D( 'conv3', 384, 3, split=2).apply(fg). BatchNorm('bn3').apply(activate).Conv2D( 'conv4', 256, 3, split=2).apply(fg).BatchNorm('bn4').MaxPooling( 'pool4', 3, 2, padding='VALID').apply(activate).FullyConnected( 'fc0', 4096).apply(fg).BatchNorm('bnfc0'). apply(activate).FullyConnected( 'fc1', 4096, use_bias=False).apply(fg).BatchNorm('bnfc1').apply( nonlin).FullyConnected('fct', 1000, use_bias=True)()) tf.nn.softmax(logits, name='output') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) wrong = prediction_incorrect(logits, label, 5, name='wrong-top5') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5')) # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost') add_param_summary(('.*/W', ['histogram', 'rms'])) total_cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, total_cost) return total_cost
def build_graph(self, image, label): is_training = get_current_tower_context().is_training fw, fa, fg = get_dorefa(BITW, BITA, BITG) # monkey-patch tf.get_variable to apply fw def binarize_weight(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv0' in name or 'fc' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): if BITA == 32: return tf.nn.relu(x) return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) image = image / 256.0 with remap_variables(binarize_weight), \ argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False): logits = ( LinearWrap(image).Conv2D('conv0', 48, 5, padding='VALID', use_bias=True).MaxPooling( 'pool0', 2, padding='SAME').apply(activate) # 18 .Conv2D('conv1', 64, 3, padding='SAME').apply(fg).BatchNorm( 'bn1').apply(activate).Conv2D( 'conv2', 64, 3, padding='SAME').apply(fg).BatchNorm('bn2').MaxPooling( 'pool1', 2, padding='SAME').apply(activate) # 9 .Conv2D( 'conv3', 128, 3, padding='VALID').apply(fg).BatchNorm('bn3').apply(activate) # 7 .Conv2D('conv4', 128, 3, padding='SAME').apply(fg). BatchNorm('bn4').apply(activate).Conv2D( 'conv5', 128, 3, padding='VALID').apply(fg).BatchNorm('bn5').apply(activate) # 5 .Dropout(rate=0.5 if is_training else 0.0).Conv2D( 'conv6', 512, 5, padding='VALID').apply(fg).BatchNorm( 'bn6').apply(nonlin).FullyConnected('fc1', 10)()) tf.nn.softmax(logits, name='output') # compute the number of failed samples wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_tensor') # monitor training error add_moving_summary(tf.reduce_mean(wrong, name='train_error')) cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7)) add_param_summary(('.*/W', ['histogram', 'rms'])) total_cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, total_cost) return total_cost
def AlexNetCifar(image, label, scope, is_training, dataset='cifar', reuse=False, Distill=None, bit_a=32, bit_w=32, bit_g=32): end_points = {} if scope == 'Teacher': with tf.variable_scope(scope): image = tf.pad(image, [[0, 0], [5, 5], [5, 5], [0, 0]]) std = tf.contrib.layers.conv2d(image, 64, [11, 11], 1, scope='conv0', padding='VALID', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn0', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') std = tf.contrib.layers.conv2d(std, 192, [5, 5], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn1', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 384, [3, 3], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn2', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 256, [3, 3], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn3', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 256, [3, 3], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn4', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') tf.add_to_collection('feat', std) fc = tf.layers.flatten(std, name='fc_flat') fc1 = tf.contrib.layers.fully_connected(fc, 4096, scope='fc0', trainable=True, reuse=reuse) fc1 = tf.contrib.layers.batch_norm(fc1, scope='bn_fc0', trainable=True, is_training=is_training, reuse=reuse) fc1 = tf.nn.relu(fc1) fc2 = tf.contrib.layers.fully_connected(fc1, 4096, scope='fc1', trainable=True, reuse=reuse) fc2 = tf.contrib.layers.batch_norm(fc2, scope='bn_fc1', trainable=True, is_training=is_training, reuse=reuse) fc2 = tf.nn.relu(fc2) logits = tf.contrib.layers.fully_connected( fc2, label.get_shape().as_list()[-1], scope='fct', trainable=True, reuse=reuse) end_points['Logits'] = logits else: fw, fa, fg = get_dorefa(bit_w, bit_a, bit_g) def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith( 'weights') or 'conv0' in name or 'fct' in name: return v else: tf.logging.info("Quantizing weight {} at bits {}".format( v.op.name, bit_w)) return fw(v) def nonlin(x): if bit_a == 32: return tf.nn.relu(x) # still use relu for 32-bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): tf.logging.info("Quantizing activations {} at bits {}".format( x.name, bit_a)) return fa(nonlin(x)) with tf.variable_scope(scope), remap_variables(new_get_variable): image = tf.pad(image, [[0, 0], [5, 5], [5, 5], [0, 0]]) std = tf.contrib.layers.conv2d(image, 64, [11, 11], 1, scope='conv0', padding='VALID', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn0', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = activate(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') std = tf.contrib.layers.conv2d(std, 192, [5, 5], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn1', trainable=True, is_training=is_training, 
reuse=reuse) std = tf.nn.relu(std) std = activate(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 384, [3, 3], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn2', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = activate(std) tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 256, [3, 3], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn3', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = activate(std) tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 256, [3, 3], padding='SAME', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn4', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = activate(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') tf.add_to_collection('feat', std) fc = tf.layers.flatten(std, name='fc_flat') fc1 = tf.contrib.layers.fully_connected(fc, 4096, scope='fc0', trainable=True, reuse=reuse) fc1 = tf.contrib.layers.batch_norm(fc1, scope='bn_fc0', trainable=True, is_training=is_training, reuse=reuse) fc1 = tf.nn.relu(fc1) fc1 = activate(fc1) fc2 = tf.contrib.layers.fully_connected(fc1, 4096, scope='fc1', trainable=True, reuse=reuse) fc2 = tf.contrib.layers.batch_norm(fc2, scope='bn_fc1', trainable=True, is_training=is_training, reuse=reuse) fc2 = tf.nn.relu(fc2) logits = tf.contrib.layers.fully_connected( fc2, label.get_shape().as_list()[-1], scope='fct', trainable=True, reuse=reuse) end_points['Logits'] = logits if Distill is not None: if Distill == 'DML': teacher_train = True else: is_training = False teacher_train = False with tf.variable_scope('Teacher'): with tf.contrib.framework.arg_scope( [tf.contrib.layers.conv2d, tf.contrib.layers.fully_connected], variables_collections=[ tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher' ]): with tf.contrib.framework.arg_scope( [tf.contrib.layers.batch_norm], variables_collections=[ tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher' ]): std = tf.contrib.layers.conv2d(image, 64, [11, 11], 1, scope='conv0', padding='VALID', trainable=True, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn0', trainable=True, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') # tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 192, [5, 5], padding='SAME', trainable=teacher_train, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn1', trainable=teacher_train, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 384, [3, 3], padding='SAME', trainable=teacher_train, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn2', trainable=teacher_train, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 256, [3, 3], padding='SAME', trainable=teacher_train, reuse=reuse) std = tf.contrib.layers.batch_norm(std, scope='bn3', trainable=teacher_train, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) tf.add_to_collection('feat', std) std = tf.contrib.layers.conv2d(std, 256, [3, 3], padding='SAME', trainable=teacher_train, reuse=reuse) std = tf.contrib.layers.batch_norm(std, 
scope='bn4', trainable=teacher_train, is_training=is_training, reuse=reuse) std = tf.nn.relu(std) std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME') tf.add_to_collection('feat', std) fc_tch = tf.layers.flatten(std, name='fc_flat') fc1 = tf.contrib.layers.fully_connected( fc_tch, 4096, scope='fc0', trainable=teacher_train, reuse=reuse) fc1 = tf.contrib.layers.batch_norm(fc1, scope='bn_fc0', trainable=teacher_train, is_training=is_training, reuse=reuse) fc1 = tf.nn.relu(fc1) fc2 = tf.contrib.layers.fully_connected( fc1, 4096, scope='fc1', trainable=teacher_train, reuse=reuse) fc2 = tf.contrib.layers.batch_norm(fc2, scope='bn_fc1', trainable=teacher_train, is_training=is_training, reuse=reuse) fc2 = tf.nn.relu(fc2) logits_tch = tf.contrib.layers.fully_connected( fc2, label.get_shape().as_list()[-1], scope='fct', trainable=teacher_train, reuse=reuse) end_points['Logits_tch'] = logits_tch with tf.variable_scope('Distillation'): feats = tf.get_collection('feat') student_feats = feats[:len(feats) // 2] teacher_feats = feats[len(feats) // 2:] feats_noact = tf.get_collection('feat_noact') student_feats_noact = feats[:len(feats_noact) // 2] teacher_feats_noact = feats[len(feats_noact) // 2:] if Distill == 'Soft_logits': tf.add_to_collection( 'dist', Response.Soft_logits(logits, logits_tch, 3)) elif Distill == 'DML': tf.add_to_collection('dist', Response.DML(logits, logits_tch)) elif Distill == 'FT': tf.add_to_collection( 'dist', Response.Factor_Transfer(student_feats_noact[-1], teacher_feats_noact[-1])) elif Distill == 'FitNet': tf.add_to_collection( 'dist', Multiple.FitNet(student_feats, teacher_feats)) elif Distill == 'AT': tf.add_to_collection( 'dist', Multiple.Attention_transfer(student_feats, teacher_feats)) elif Distill == 'AB': tf.add_to_collection( 'dist', Multiple.AB_distillation(student_feats, teacher_feats, 1., 3e-3)) elif Distill == 'FSP': tf.add_to_collection('dist', Shared.FSP(student_feats, teacher_feats)) elif Distill[:3] == 'KD-': tf.add_to_collection( 'dist', Shared.KD_SVD(student_feats, teacher_feats, Distill[-3:])) elif Distill == 'RKD': tf.add_to_collection( 'dist', Relation.RKD(logits, logits_tch, l=[5e1, 1e2])) elif Distill == 'MHGD': tf.add_to_collection( 'dist', Relation.MHGD(student_feats, teacher_feats)) elif Distill == 'MHGD-RKD': tf.add_to_collection( 'dist', Relation.MHGD(student_feats, teacher_feats) + Relation.RKD(logits, logits_tch, l=[5e1, 1e2])) elif Distill == 'MHGD-RKD-SVD': tf.add_to_collection( 'dist', Relation.MHGD(student_feats, teacher_feats) + Relation.RKD(logits, logits_tch, l=[5e1, 1e2]) + Shared.KD_SVD(student_feats, teacher_feats, "SVD")) return end_points
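# The elif chain above only *stores* the chosen distillation loss in the 'dist'
# collection; how that collection is folded into the training objective is not shown in
# this file. A plausible (assumed) aggregation in the same TF1 graph style would be:
import tensorflow as tf  # TF1.x graph mode, as elsewhere in this file

def total_loss_with_distillation(logits, label):
    xent = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label),
        name='cross_entropy_loss')
    dist_losses = tf.get_collection('dist')   # filled by AlexNetCifar(..., Distill=...)
    if dist_losses:
        return tf.add_n([xent] + dist_losses, name='total_loss')
    return xent

# example wiring (hypothetical shapes):
#   logits_ph = tf.placeholder(tf.float32, [None, 100])
#   labels_ph = tf.placeholder(tf.int64, [None])
#   loss = total_loss_with_distillation(logits_ph, labels_ph)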
def build_graph(self, image, label): # get quantization function # quantize weights qw = quantize_weight(int(self.quantizer_config['BITW']), self.quantizer_config['name'], self.quantizer_config['W_opts'], self.quantizer_config) # quantize activation if self.quantizer_config['BITA'] in ['32', 32]: qa = tf.identity else: qa = quantize_activation(int(self.quantizer_config['BITA']), self.quantizer_config['name'], self.quantizer_config) # quantize gradient qg = quantize_gradient(int(self.quantizer_config['BITG'])) def new_get_variable(v): name = v.op.name # don't quantize first and last layer if not name.endswith('/W') or 'conv1' in name or 'fct' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) return qw(v) def activate(x): return qa(self.activation(x)) def resblock(x, channel, stride): def get_stem_full(x): return (LinearWrap(x).Conv2D( 'stem_conv_a', channel, 3).BatchNorm('stem_bn').apply(activate).Conv2D( 'stem_conv_b', channel, 3)()) channel_mismatch = channel != x.get_shape().as_list()[3] if stride != 1 or channel_mismatch: if stride != 1: x = AvgPooling('avgpool', x, stride, stride) x = BatchNorm('bn', x) x = activate(x) shortcut = Conv2D('shortcut', x, channel, 1) stem = get_stem_full(x) else: shortcut = x x = BatchNorm('bn', x) x = activate(x) stem = get_stem_full(x) return shortcut + stem def group(x, name, channel, nr_block, stride): with tf.variable_scope(name + 'blk1', reuse=tf.AUTO_REUSE): x = resblock(x, channel, stride) for i in range(2, nr_block + 1): with tf.variable_scope(name + 'blk{}'.format(i), reuse=tf.AUTO_REUSE): x = resblock(x, channel, 1) return x def resblock_idt(x, channel, stride, first): def get_r(x): if 'InferenceTower' in x.op.name: idx = x.op.name.index('/') n = x.op.name[idx + 1::] elif 'tower' in x.op.name: idx = x.op.name.index('/') n = x.op.name[idx + 1::] else: n = x.op.name n0 = n.split('blk')[0] n1 = n0 + 'blk1/shortcut/maxW' n2 = n.split('/output')[0] + '/maxW' if int( self.quantizer_config['BITW'] ) != 32: # and eval(self.quantizer_config['W_opts']['fix_max']): n1 += '_stop_grad' n2 += '_stop_grad' maxs = tf.get_collection('maxs') for tensor in maxs: tn = tensor.op.name if n1 == tn: m1 = tensor elif n2 == tn: m2 = tensor r = m2 / m1 temp = self.quantizer_config['mulR'] if temp == '2R': r2 = (1 / r) * (2.0**tf.floor(tf.log(r) / tf.log(2.0))) elif temp == 'R': r2 = 1 / r return r2 def get_stem_full(x): return (LinearWrap(x).Conv2D( 'stem_conv_a', channel, 1, strides=(1, 1)).BatchNorm('stem_bn1').apply(activate).Conv2D( 'stem_conv_b', channel, 3, strides=(stride, stride)).BatchNorm('stem_bn2').apply( activate).Conv2D('stem_conv_c', channel * 4, 1, strides=(1, 1))()) #channel_mismatch = channel != x.get_shape().as_list()[3] #if stride != 1 or channel_mismatch: if first: #shortcut = tf.concat([x[::, 0::2, 0::2, ::], x[::, 1::2, 1::2, ::]], -1) x = BatchNorm('bn', x) x = activate(x) #if stride != 1: # shortcut = Conv2D('shortcut', x, channel, 1, strides=(stride, stride)) #else: # shortcut = Conv2D('shortcut', x, channel, 1) shortcut = Conv2D('shortcut', x, channel * 4, 1, strides=(stride, stride)) stem = get_stem_full(x) else: shortcut = x x = BatchNorm('bn', x) x = activate(x) stem = get_stem_full(x) if self.quantizer_config['mulR'] in ['2R', 'R']: r = get_r(stem) stem = stem * r return shortcut + stem def group_v2(x, name, channel, nr_block, stride): with tf.variable_scope(name + 'blk1', reuse=tf.AUTO_REUSE): x = resblock_idt(x, channel, stride, True) for i in range(2, nr_block + 1): with tf.variable_scope(name + 
'blk{}'.format(i), reuse=tf.AUTO_REUSE): x = resblock_idt(x, channel, 1, False) return x with remap_variables(new_get_variable), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False, nl=tf.identity, kernel_initializer=tf.variance_scaling_initializer(scale=float(self.initializer_config['scale']), mode=self.initializer_config['mode'])): logits = ( LinearWrap(image).Conv2D('conv1', 64, 7, strides=2) # size=112 .MaxPooling('pool1', pool_size=3, strides=2, padding="SAME") # size=56 #.BatchNorm('bn1') #.apply(activate) .apply(group_v2, 'res1', 64, 3, 1) # size=56 .apply(group_v2, 'res2', 128, 4, 2) # size=28 .apply(group_v2, 'res3', 256, 6, 2) # size=14 .apply(group_v2, 'res4', 512, 3, 2) # size=7 .BatchNorm('last_bn').apply(activate).GlobalAvgPooling( 'gap').FullyConnected('fct', self.nb_classes)()) prob = tf.nn.softmax(logits, name='output') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') # regularization if self.regularizer_config['name'] not in [None, 'None']: reg_func = getattr(regularizers, self.regularizer_config['name'])().get_func( self.regularizer_config, self.quantizer_config) reg_cost = tf.multiply(float(self.regularizer_config['lmbd']), regularize_cost('.*/W', reg_func), name='reg_cost') total_cost = tf.add_n([cost, reg_cost], name='total_cost') else: total_cost = cost # summary def add_summary(logits, cost): err_top1 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='err_top1') add_moving_summary( tf.reduce_mean(err_top1, name='train_error_top1')) err_top5 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 5)), tf.float32, name='err_top5') add_moving_summary( tf.reduce_mean(err_top5, name='train_error_top5')) add_moving_summary(cost) add_param_summary(('.*/W', ['histogram'])) # monitor W add_summary(logits, cost) return total_cost
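# resblock_idt above rescales the residual branch by r2, derived from the ratio
# r = m2 / m1 of the two tracked weight maxima. With mulR == '2R' the reciprocal is
# combined with the largest power of two <= r, so r2 always lands in (0.5, 1]; with 'R'
# it is the exact reciprocal. A NumPy restatement of that formula (m1, m2 are just
# example scalars here):
import numpy as np

def mulR_scale(m1, m2, mode='2R'):
    r = m2 / m1
    if mode == '2R':
        return (1.0 / r) * 2.0 ** np.floor(np.log2(r))    # (largest power of two <= r) / r
    return 1.0 / r                                        # mode == 'R': exact reciprocal

print(mulR_scale(1.0, 5.0, '2R'))   # 0.8  (= 4/5)
print(mulR_scale(1.0, 5.0, 'R'))    # 0.2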
def _build_graph(self, inputs): image, label = inputs fw, fa, fg = get_dorefa(BITW, BITA, BITG) old_get_variable = tf.get_variable def monitor(x, name): if MONITOR == 1: return tf.Print(x, [x], message='\n\n' + name + ': ', summarize=1000, name=name) else: return x def new_get_variable(v): name = v.op.name if not name.endswith('W') or 'conv1_1' in name or 'fc8' in name: return v else: logger.info("Quantizing weight {}".format(v.op.name)) if MONITOR == 1: return tf.Print(fw(v), [fw(v)], message='\n\n' + v.name + ', Quantized weights are:', summarize=100) else: return fw(v) def bn_activate(name, x): X = BatchNorm(name, x) x = monitor(x, name + '_noact_out') return activate(x) def activate(x): if BITA == 32: return tf.nn.relu(x) else: return fa(tf.nn.relu(x)) # VGG 16 with remap_variables(new_get_variable), \ argscope(Conv2D, kernel_shape=3, use_bias=False, nl = tf.identity): logits = ( LinearWrap(image).apply(monitor, 'image_out').Conv2D( 'conv1_1', 64).apply(fg).BatchNorm('bn1_1').apply(activate).apply( monitor, 'conv1_1_out').Conv2D('conv1_2', 64).apply( fg).BatchNorm('bn1_2').apply(activate).apply( monitor, 'conv1_2_out').MaxPooling('pool1', 2).apply( monitor, 'pool1_out') # 112 .Conv2D( 'conv2_1', 128).apply(fg).BatchNorm('bn2_1').apply(activate).apply( monitor, 'conv2_1_out').Conv2D('conv2_2', 128).apply( fg).BatchNorm('bn2_2').apply(activate).apply( monitor, 'conv2_2_out').MaxPooling( 'pool2', 2).apply(monitor, 'pool2_out') # 56 .Conv2D( 'conv3_1', 256).apply(fg).BatchNorm('bn3_1').apply(activate).apply( monitor, 'conv3_1_out').Conv2D( 'conv3_2', 256).apply(fg).BatchNorm('bn3_2'). apply(activate).apply(monitor, 'conv3_2_out').Conv2D( 'conv3_3', 256).apply(fg).BatchNorm('bn3_3').apply(activate).apply( monitor, 'conv3_3_out').MaxPooling('pool3', 2).apply( monitor, 'pool3_out') # 28 .Conv2D( 'conv4_1', 512).apply(fg).BatchNorm('bn4_1').apply(activate).apply( monitor, 'conv4_1_out').Conv2D( 'conv4_2', 512).apply(fg).BatchNorm('bn4_2'). apply(activate).apply(monitor, 'conv4_2_out').Conv2D( 'conv4_3', 512).apply(fg).BatchNorm('bn4_3').apply(activate).apply( monitor, 'conv4_3_out').MaxPooling('pool4', 2).apply( monitor, 'pool4_out') # 14 .Conv2D( 'conv5_1', 512).apply(fg).BatchNorm('bn5_1').apply(activate).apply( monitor, 'conv5_1_out').Conv2D( 'conv5_2', 512).apply(fg).BatchNorm('bn5_2'). apply(activate).apply(monitor, 'conv5_2_out').Conv2D( 'conv5_3', 512).apply(fg).BatchNorm('bn5_3').apply(activate).apply( monitor, 'conv5_3_out').MaxPooling('pool5', 2).apply( monitor, 'pool5_out').FullyConnected( 'fc6', use_bias=False, out_dim=512).apply(activate).apply( monitor, 'fc6_out').FullyConnected( 'fc7', use_bias=False, out_dim=512).apply(activate).apply( monitor, 'fc7_out').FullyConnected( 'fc8', use_bias=False, out_dim=self.cifar_classnum, nl=tf.identity).apply( monitor, 'fc8_out')()) prob = tf.nn.softmax(logits, name='prob') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = symbf.prediction_incorrect(logits, label, name='incorrect') accuracy = symbf.accuracy(logits, label, name='accuracy') train_error = tf.reduce_mean(wrong, name='train_error') summary.add_moving_summary(train_error, accuracy) wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss), name='regularize_loss') self.cost = tf.add_n([wd_cost, cost], name='total_cost') summary.add_moving_summary(cost, wd_cost, self.cost)
def _build_graph(self, inputs): image, label = inputs image = image / 256.0 fw, fa, fg = get_dorefa(BITW, BITA, BITG) old_get_variable = tf.get_variable def new_get_variable(v): name = v.op.name # don't binarize first and last layer if not name.endswith('W') or 'conv1' in name or 'fct' in name: return v else: logger.info("Binarizing weight {}".format(v.op.name)) return fw(v) def nonlin(x): return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) def resblock(x, channel, stride): def get_stem_full(x): return (LinearWrap(x) .Conv2D('c3x3a', channel, 3) .BatchNorm('stembn') .apply(activate) .Conv2D('c3x3b', channel, 3)()) channel_mismatch = channel != x.get_shape().as_list()[3] if stride != 1 or channel_mismatch or 'pool1' in x.name: # handling pool1 is to work around an architecture bug in our model if stride != 1 or 'pool1' in x.name: x = AvgPooling('pool', x, stride, stride) x = BatchNorm('bn', x) x = activate(x) shortcut = Conv2D('shortcut', x, channel, 1) stem = get_stem_full(x) else: shortcut = x x = BatchNorm('bn', x) x = activate(x) stem = get_stem_full(x) return shortcut + stem def group(x, name, channel, nr_block, stride): with tf.variable_scope(name + 'blk1'): x = resblock(x, channel, stride) for i in range(2, nr_block + 1): with tf.variable_scope(name + 'blk{}'.format(i)): x = resblock(x, channel, 1) return x with remap_variables(new_get_variable), \ argscope(BatchNorm, decay=0.9, epsilon=1e-4), \ argscope(Conv2D, use_bias=False, nl=tf.identity): logits = (LinearWrap(image) # use explicit padding here, because our training framework has # different padding mechanisms from TensorFlow .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]]) .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True) .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC') .MaxPooling('pool1', 3, 2, padding='VALID') .apply(group, 'conv2', 64, 2, 1) .apply(group, 'conv3', 128, 2, 2) .apply(group, 'conv4', 256, 2, 2) .apply(group, 'conv5', 512, 2, 2) .BatchNorm('lastbn') .apply(nonlin) .GlobalAvgPooling('gap') .tf.multiply(49) # this is due to a bug in our model design .FullyConnected('fct', 1000)()) prob = tf.nn.softmax(logits, name='output') wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
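# The .tf.multiply(49) after GlobalAvgPooling above (flagged as a workaround for a bug
# in the original model design) simply turns the 7x7 spatial average back into a spatial
# sum, assuming a 224x224 input so the final feature map is 7x7. Quick NumPy check:
import numpy as np

x = np.random.RandomState(0).randn(1, 7, 7, 512)
print(np.allclose(x.mean(axis=(1, 2)) * 49, x.sum(axis=(1, 2))))   # True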
def _build_graph(self, inputs): input, label = inputs fw, fa, fg = get_dorefa(FLAGS.bit_w, FLAGS.bit_a, 32) old_get_variable = tf.get_variable # monkey-patch tf.get_variable to apply fw def new_get_variable(v): name = v.op.name logger.info("Binarizing weight {}".format(v.op.name)) return fw(v, FLAGS.force_quantization) def nonlin(x): if FLAGS.bit_a == 32 and not FLAGS.use_clip: return tf.nn.relu(x) # still use relu for 32bit cases return tf.clip_by_value(x, 0.0, 1.0) def activate(x): return fa(nonlin(x)) activations = [] with remap_variables(new_get_variable), \ argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity): curr_layer = LinearWrap(input) for i in range(FLAGS.n_layers): curr_layer = (curr_layer.FullyConnected( 'fc' + str(i), FLAGS.state_size).LayerNorm('ln_fc' + str(i)).apply(activate)) activations.append(curr_layer.tensor()) curr_layer = (curr_layer.Dropout('dropout', FLAGS.dropout)) logits = (curr_layer.FullyConnected( 'fc' + str(FLAGS.n_layers), 256).LayerNorm('lnfc' + str(FLAGS.n_layers)).apply( nonlin).FullyConnected('fct', self.n_spks, use_bias=True)()) print_all_tf_vars() prob = tf.nn.softmax(logits, name='output') # used for validation accuracy of utterance identity_guesses = flatten(tf.argmax(prob, axis=1)) uniq_identities, _, count = tf.unique_with_counts(identity_guesses) idx_to_identity_with_most_votes = tf.argmax(count) chosen_identity = tf.gather(uniq_identities, idx_to_identity_with_most_votes) wrong = tf.expand_dims(tf.not_equal(chosen_identity, tf.cast(label[0], tf.int64)), axis=0, name='utt-wrong') cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label) cost = tf.reduce_mean(cost, name='cross_entropy_loss') wrong = prediction_incorrect(logits, label, 1, name='wrong-top1') add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1')) # weight decay on all W of fc layers wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost') add_param_summary(('.*/W', ['histogram', 'rms'])) self.cost = tf.add_n([cost, wd_cost], name='cost') add_moving_summary(cost, wd_cost, self.cost) for activation in activations: add_activation_summary(activation) tf.summary.histogram(activation.name, activation)
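# FLAGS.* and the LayerNorm layer come from outside this snippet. For reference, the
# per-sample normalization applied after each fully connected layer above is essentially
# the following (the learned scale/offset of the real layer are omitted in this sketch):
import numpy as np

def layer_norm(x, eps=1e-5):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

print(layer_norm(np.random.RandomState(0).randn(4, 16)).std(axis=-1))   # ~1 per row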
def build_graph(self, image, label):
    # get quantization functions
    # quantize weights
    qw = quantize_weight(int(self.quantizer_config['BITW']),
                         self.quantizer_config['name'],
                         self.quantizer_config['W_opts'],
                         self.quantizer_config)
    # quantize activation
    if self.quantizer_config['BITA'] in ['32', 32]:
        qa = tf.identity
    else:
        qa = quantize_activation(int(self.quantizer_config['BITA']))
    # quantize gradient
    qg = quantize_gradient(int(self.quantizer_config['BITG']))

    def new_get_variable(v):
        name = v.op.name
        # don't quantize first and last layer
        if not name.endswith('/W') or 'conv1' in name or 'fct' in name:
            return v
        else:
            logger.info("Quantizing weight {}".format(v.op.name))
            return qw(v)

    def activate(x):
        return qa(self.activation(x))

    with remap_variables(new_get_variable), \
            argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope(Conv2D, use_bias=False, nl=tf.identity,
                     kernel_initializer=tf.variance_scaling_initializer(
                         scale=float(self.initializer_config['scale']),
                         mode=self.initializer_config['mode'])):
        logits = (LinearWrap(image)
                  .Conv2D('conv1', 96, 3)
                  .BatchNorm('bn1')
                  .apply(activate)
                  .Conv2D('conv2', 256, 3, padding='SAME', split=2)
                  .BatchNorm('bn2')
                  .apply(activate)
                  .MaxPooling('pool2', 2, 2, padding='VALID')  # size=16
                  .Conv2D('conv3', 384, 3)
                  .BatchNorm('bn3')
                  .apply(activate)
                  .MaxPooling('pool3', 2, 2, padding='VALID')  # size=8
                  .Conv2D('conv4', 384, 3, split=2)
                  .BatchNorm('bn4')
                  .apply(activate)
                  .Conv2D('conv5', 256, 3, split=2)
                  .BatchNorm('bn5')
                  .apply(activate)
                  .MaxPooling('pool5', 2, 2, padding='VALID')  # size=4
                  .FullyConnected('fc1', 4096, use_bias=False)
                  .BatchNorm('bnfc1')
                  .apply(activate)
                  .FullyConnected('fc2', 4096, use_bias=False)
                  .BatchNorm('bnfc2')
                  .apply(activate)
                  .FullyConnected('fct', self.nb_classes, use_bias=True)())
    prob = tf.nn.softmax(logits, name='output')

    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')

    # regularization
    if self.regularizer_config['name'] not in [None, 'None']:
        reg_func = getattr(regularizers,
                           self.regularizer_config['name'])().get_func(self.regularizer_config)
        reg_cost = tf.multiply(float(self.regularizer_config['lmbd']),
                               regularize_cost('.*/W', reg_func),
                               name='reg_cost')
        total_cost = tf.add_n([cost, reg_cost], name='total_cost')
    else:
        total_cost = cost

    # summary
    def add_summary(logits, cost):
        err_top1 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)),
                           tf.float32, name='err_top1')
        add_moving_summary(tf.reduce_mean(err_top1, name='train_error_top1'))
        err_top5 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 5)),
                           tf.float32, name='err_top5')
        add_moving_summary(tf.reduce_mean(err_top5, name='train_error_top5'))
        add_moving_summary(cost)
        add_param_summary(('.*/W', ['histogram']))  # monitor W

    add_summary(logits, cost)
    return total_cost
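# The new_get_variable filters throughout this file follow the same pattern: quantize
# kernel weights (names ending in '/W' or 'W') while skipping the first conv and the
# final classifier, with the excluded names varying per model. A tiny self-contained
# restatement of the variant used directly above:
def should_quantize(var_name):
    return var_name.endswith('/W') and 'conv1' not in var_name and 'fct' not in var_name

for name in ['conv1/W', 'conv2/W', 'bn2/beta', 'fc1/W', 'fct/W']:
    print(name, '->', 'quantize' if should_quantize(name) else 'keep full precision')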