Example 1
        def resnet18_imagenet(image):
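            # new_get_variable (the mapping passed to remap_variables below) and activate are
            # defined outside this snippet; resnet_group / resnet_basicblock are the usual
            # tensorpack ResNet-example helpers.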
            with remap_variables(new_get_variable), \
                 argscope(Conv2D, use_bias=False,
                          kernel_initializer=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')):
                # Note that this pads the image by [2, 3] instead of [3, 2].
                # Similar things happen in later stride=2 layers as well.
                l = Conv2D('conv0', image, 64, 7, strides=2, activation=BNReLU)
                l = MaxPooling('pool0',
                               l,
                               pool_size=3,
                               strides=2,
                               padding='SAME')
                l = resnet_group('group0', l, resnet_basicblock, 64, 2, 1)
                l = activate(l)
                l = resnet_group('group1', l, resnet_basicblock, 128, 2, 2)
                l = activate(l)
                l = resnet_group('group2', l, resnet_basicblock, 256, 2, 2)
                l = activate(l)
                l = resnet_group('group3', l, resnet_basicblock, 512, 2, 2)
                l = GlobalAvgPooling('gap', l)
                logits = FullyConnected(
                    'linear',
                    l,
                    1000,
                    kernel_initializer=tf.random_normal_initializer(
                        stddev=0.01))

            # tmp = tf.trainable_variables()
            return logits
Example 2
@contextmanager  # from contextlib; required so this generator function can be used in a with statement
def weight_standardization_context(enable=True):
    """
    Implement Centered Weight Normalization
    (http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Centered_Weight_Normalization_ICCV_2017_paper.pdf)
    or Weight Standardization (https://arxiv.org/abs/1903.10520)

    Usage:

    with weight_standardization_context():
        l = Conv2D('conv', l)
        ...
    """
    if enable:

        def weight_standardization(v):
            if (not v.name.endswith('/W:0')) or v.shape.ndims != 4:
                return v
            mean, var = tf.nn.moments(v, [0, 1, 2], keep_dims=True)
            v = (v - mean) / (tf.sqrt(var) + 1e-5)
            return v

        with remap_variables(weight_standardization):
            yield

    else:
        yield
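
The pattern shared by all of these examples is the same: remap_variables(fn) installs a custom variable getter, so every variable created through tf.get_variable inside the context is passed to fn, which returns either the original variable or a transformed tensor that the layer then uses. Below is a minimal, self-contained sketch of that pattern (assuming tensorpack and TF1-style graph mode; the mapping function and names are purely illustrative, not taken from any of the examples):

import tensorflow as tf
from tensorpack.tfutils.varreplace import remap_variables


def halve_conv_kernels(v):
    # Illustrative mapping: scale 4-D conv kernels named '.../W', pass everything else through.
    if not v.name.endswith('/W:0') or v.shape.ndims != 4:
        return v
    return 0.5 * v


with remap_variables(halve_conv_kernels):
    with tf.variable_scope('conv0'):
        # The variable is created as usual, but the tensor returned here is 0.5 * W.
        w = tf.get_variable('W', [3, 3, 3, 16])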
Example 3
    def get_logits(self, image):
        if BITW == 't':
            fw, fa, fg = get_dorefa(32, 32, 32)
            fw = ternarize
        else:
            fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fct' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with remap_variables(new_get_variable), \
                argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
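            # DoReFa-style AlexNet: weights are quantized by fw (through remap_variables),
            # activations by fa inside activate(); fg is applied to intermediate feature maps
            # and, in DoReFa, quantizes their gradients during backprop.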
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 96, 12, strides=4, padding='VALID', use_bias=True)
                      .apply(activate)

                      .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                      .apply(fg)
                      .BatchNorm('bn1')
                      .MaxPooling('pool1', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv2', 384, 3)
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool2', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv3', 384, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn3')
                      .apply(activate)

                      .Conv2D('conv4', 256, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn4')
                      .MaxPooling('pool4', 3, 2, padding='VALID')
                      .apply(activate)

                      .FullyConnected('fc0', 4096)
                      .apply(fg)
                      .BatchNorm('bnfc0')
                      .apply(activate)

                      .FullyConnected('fc1', 4096, use_bias=False)
                      .apply(fg)
                      .BatchNorm('bnfc1')
                      .apply(nonlin)
                      .FullyConnected('fct', 1000, use_bias=True)())
        add_param_summary(('.*/W', ['histogram', 'rms']))
        tf.nn.softmax(logits, name='output')  # for prediction
        return logits
Example 4
    def get_logits(self, image):
        def weight_standardization(v):
            if not self.use_WS:
                return v
            if (not v.name.endswith('/W:0')) or v.shape.ndims != 4:
                return v
            mean, var = tf.nn.moments(v, [0, 1, 2], keep_dims=True)
            v = (v - mean) / (tf.sqrt(var) + 1e-5)
            return v

        num_blocks = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3]}[self.depth]
        block_func = resnet_bottleneck
        with argscope([Conv2D, MaxPooling, GlobalAvgPooling], data_format=self.data_format), \
                varreplace.remap_variables(weight_standardization):
            return resnet_backbone(image, num_blocks, resnet_group, block_func)
Example 5
@contextmanager  # from contextlib; required so this generator function can be used in a with statement
def weight_standardization_context(enable):
    if enable:
        def weight_standardization(v):
            if (not v.name.endswith('/W:0')) or v.shape.ndims != 4:
                return v
            print("WS on " + v.name)
            mean, var = tf.nn.moments(v, [0, 1, 2], keep_dims=True)
            v = (v - mean) / (tf.sqrt(var) + 1e-5)
            return v

        with remap_variables(weight_standardization):
            yield

    else:
        yield
Example 6
        def alexnet(image):
            with remap_variables(new_get_variable), \
                 argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \
                 argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                 argscope(Conv2D, use_bias=False):
                logits = (LinearWrap(image)
                          .Conv2D('conv0', 96, 12, strides=4, padding='VALID', use_bias=True)
                          .apply(fg)

                          .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                          .apply(fg)
                          .BatchNorm('bn1')
                          .MaxPooling('pool1', 3, 2, padding='SAME')
                          .apply(activate)

                          .Conv2D('conv2', 384, 3)
                          .apply(fg)
                          .BatchNorm('bn2')
                          .MaxPooling('pool2', 3, 2, padding='SAME')
                          .apply(activate)

                          .Conv2D('conv3', 384, 3, split=2)
                          .apply(fg)
                          .BatchNorm('bn3')
                          .apply(activate)

                          .Conv2D('conv4', 256, 3, split=2)
                          .apply(fg)
                          .BatchNorm('bn4')
                          .MaxPooling('pool4', 3, 2, padding='VALID')
                          .apply(activate)

                          .FullyConnected('fc0', 4096)
                          .apply(fg)
                          .BatchNorm('bnfc0')
                          .apply(activate)

                          .FullyConnected('fc1', 4096, use_bias=False)
                          .apply(fg)
                          .BatchNorm('bnfc1')
                          .apply(nonlin)
                          .FullyConnected('fct', self.class_num, use_bias=True)())

            return logits
Example 7
def build_model(input_quant_wei_layer, input_quant_wei_lambda,
                input_quant_wei_delta, input_quant_wei_levels):
    global id_target_quant_layer
    global q_lambda
    global q_delta
    global num_quant_levels

    id_target_quant_layer = input_quant_wei_layer
    q_lambda = input_quant_wei_lambda
    q_delta = input_quant_wei_delta
    num_quant_levels = input_quant_wei_levels

    with tf.name_scope('main_params'):
        global_step = tf.Variable(initial_value=0,
                                  trainable=False,
                                  name='global_step')
        learning_rate = tf.placeholder(tf.float32,
                                       shape=[],
                                       name='learning_rate')

    with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope(
            'conv1') as scope:
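        # quant_wei_uni_dead_zone (defined elsewhere in the module) is applied, through
        # remap_variables, to every variable created in this and the following blocks,
        # i.e. the conv/dense kernels built by tf.layers.* via tf.get_variable.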
        conv1 = tf.layers.conv2d(
            inputs=data_x,
            filters=32,
            kernel_size=[1, 64],
            padding='VALID',
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY))
        bn1 = tf.layers.batch_normalization(conv1, training=isTraining)
        relu1 = tf.nn.relu(bn1)

    with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope(
            'conv2') as scope:
        conv2 = tf.layers.conv2d(
            inputs=relu1,
            filters=64,
            kernel_size=[1, 32],
            padding='VALID',
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY))
        bn2 = tf.layers.batch_normalization(conv2, training=isTraining)
        relu2 = tf.nn.relu(bn2)

    with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope(
            'conv3') as scope:
        conv3 = tf.layers.conv2d(
            inputs=relu2,
            filters=128,
            kernel_size=[1, 16],
            padding='VALID',
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY))
        bn3 = tf.layers.batch_normalization(conv3, training=isTraining)
        relu3 = tf.nn.relu(bn3)

    with remap_variables(quant_wei_uni_dead_zone), tf.variable_scope(
            'fully_connected') as scope:
        flat = tf.layers.flatten(relu3)
        logits = tf.layers.dense(
            inputs=flat,
            units=NUM_CLASSES,
            name=scope.name,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(WEIGHT_DECAY))

    y_pred_cls = tf.argmax(logits, axis=1)
    gtlabel = tf.one_hot(label_y, NUM_CLASSES)
    # LOSS AND OPTIMIZER
    cross_entropy_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                   labels=gtlabel))
    loss = cross_entropy_loss + tf.add_n(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           epsilon=1e-3).minimize(
                                               loss, global_step=global_step)

    # PREDICTION AND ACCURACY CALCULATION
    def get_eval_op(preds, labels):
        correct_prediction = tf.equal(preds, labels)
        return tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    eval_op = get_eval_op(y_pred_cls, label_y)

    return loss, optimizer, eval_op, global_step, learning_rate
Example 8
    def _build_graph(self, inputs):
        image, label, ious, ious_weights, valids, bndboxes = inputs
        image = tf.round(image)

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        old_get_variable = tf.get_variable
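        # Saved here and restored on the last line of this method; tf.get_variable itself is
        # never re-assigned in between -- remap_variables below does the actual remapping.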

        def monitor(x, name):
            if MONITOR == 1:
                return tf.Print(x, [x],
                                message='\n\n' + name + ': ',
                                summarize=1000,
                                name=name)
            else:
                return x

        def new_get_variable(v):
            name = v.op.name
            # if not name.endswith('W') or 'conv1' in name or 'conv_obj' in name or 'conv_box' in name:
            if not name.endswith(
                    'W') or 'conv_obj' in name or 'conv_box' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                if MONITOR == 1:
                    return tf.Print(fw(v), [fw(v)],
                                    message='\n\n' + v.name +
                                    ', Quantized weights are:',
                                    summarize=100)
                else:
                    return fw(v)

        def activate(x):
            if BITA == 32:
                return tf.nn.relu(x)
            else:
                return fa(tf.nn.relu(x))

        def bn_activate(name, x):
            x = BatchNorm(name, x)
            x = monitor(x, name + '_noact_out')
            return activate(x)

        def halffire(name, x, num_squeeze_filters, num_expand_3x3_filters,
                     skip):
            out_squeeze = Conv2D('squeeze_conv_' + name,
                                 x,
                                 out_channel=num_squeeze_filters,
                                 kernel_shape=1,
                                 stride=1,
                                 padding='SAME')
            out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
            out_expand_3x3 = Conv2D('expand_3x3_conv_' + name,
                                    out_squeeze,
                                    out_channel=num_expand_3x3_filters,
                                    kernel_shape=3,
                                    stride=1,
                                    padding='SAME')
            out_expand_3x3 = bn_activate('bn_expand_3x3_' + name,
                                         out_expand_3x3)
            if skip == 0:
                return out_expand_3x3
            else:
                return tf.add(x, out_expand_3x3)

        def halffire_noact(name, x, num_squeeze_filters,
                           num_expand_3x3_filters):
            out_squeeze = Conv2D('squeeze_conv_' + name,
                                 x,
                                 out_channel=num_squeeze_filters,
                                 kernel_shape=1,
                                 stride=1,
                                 padding='SAME')
            out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
            out_expand_3x3 = Conv2D('expand_3x3_conv_' + name,
                                    out_squeeze,
                                    out_channel=num_expand_3x3_filters,
                                    kernel_shape=3,
                                    stride=1,
                                    padding='SAME')
            return out_expand_3x3

        def halffire_final(l, name):
            l = halffire('fire4' + name, l, NUM_SQUEEZE_FILTERS,
                         NUM_EXPAND_FILTERS, 0)
            l = halffire('fire5' + name, l, NUM_SQUEEZE_FILTERS,
                         NUM_EXPAND_FILTERS, 0)
            l = halffire('fire6' + name, l, NUM_SQUEEZE_FILTERS,
                         NUM_EXPAND_FILTERS, 0)
            l = halffire('fire7' + name, l, NUM_SQUEEZE_FILTERS,
                         NUM_EXPAND_FILTERS, 0)
            return l

        def decision(l, name):
            classify = Conv2D('conv_class' + name,
                              l,
                              out_channel=12,
                              kernel_shape=1,
                              stride=1,
                              padding='SAME')
            classify = bn_activate('bn_class' + name, classify)
            classify = monitor(classify, 'conv_class_out' + name)
            logits = GlobalAvgPooling('pool_class' + name, classify)

            l = tf.concat([l, classify], axis=3)

            objdetect = Conv2D('conv_obj' + name,
                               l,
                               out_channel=1,
                               kernel_shape=1,
                               stride=1,
                               padding='SAME')
            bndbox = Conv2D('conv_box' + name,
                            l,
                            out_channel=4,
                            kernel_shape=1,
                            stride=1,
                            padding='SAME')

            return logits, objdetect, bndbox

        def first_layer(x):
            l = Conv2D('conv1',
                       x,
                       out_channel=16,
                       kernel_shape=3,
                       stride=1,
                       padding='SAME')
            l = bn_activate('bn1', l)
            l = monitor(l, 'conv1_out')
            return l

        with remap_variables(new_get_variable), \
          argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity), \
          argscope(BatchNorm, decay=0.9, epsilon=1e-4):

            image = monitor(image, 'image_out')

            if DEMO_DATASET == 0:
                l = first_layer(image)
            else:
                l = tf.stop_gradient(first_layer(image))

            l = MaxPooling('pool1', l, shape=3, stride=2, padding='SAME')
            l = monitor(l, 'pool1_out')

            l = halffire('fire1', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire1_out')

            l = MaxPooling('pool2', l, shape=3, stride=2, padding='SAME')
            l = monitor(l, 'pool2_out')

            l = halffire('fire2', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire2_out')

            l = MaxPooling('pool3', l, shape=3, stride=2, padding='SAME')
            l = monitor(l, 'pool3_out')

            l = halffire('fire3', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire3_out')

            l = MaxPooling('pool4', l, shape=3, stride=2, padding='SAME')
            l = monitor(l, 'pool4_out')

            l1 = halffire_final(l, '1')
            l1 = monitor(l1, 'final1_out')
            l2 = halffire_final(l, '2')
            l2 = monitor(l2, 'final2_out')
            l3 = halffire_final(l, '3')
            l3 = monitor(l3, 'final3_out')
            l4 = halffire_final(l, '4')
            l4 = monitor(l4, 'final4_out')

            logits1, objdetect1, bndbox1 = decision(l1, '1')
            logits2, objdetect2, bndbox2 = decision(l2, '2')
            logits3, objdetect3, bndbox3 = decision(l3, '3')
            logits4, objdetect4, bndbox4 = decision(l4, '4')

            # Classification
            logits = (logits1 + logits2 + logits3 + logits4) / 4
            class_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                               labels=label),
                name='cross_entropy_loss')

            wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
            add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

            # Object Detection
            objdetect = (objdetect1 + objdetect2 + objdetect3 + objdetect4) / 4
            objdetect = tf.identity(objdetect, name='objdetect_out')
            objdetect_loss = tf.losses.hinge_loss(labels=ious,
                                                  logits=objdetect,
                                                  weights=ious_weights)

            bndbox = (bndbox1 + bndbox2 + bndbox3 + bndbox4) / 4
            bndbox = tf.identity(bndbox, name='bndbox_out')
            bndbox_loss = tf.losses.mean_squared_error(labels=bndboxes,
                                                       predictions=tf.multiply(
                                                           bndbox,
                                                           valids,
                                                           name='mult0'))

            if DEMO_DATASET == 0:
                cost = class_loss + 5 * objdetect_loss + bndbox_loss
            else:
                cost = 1000 * objdetect_loss + bndbox_loss

            add_moving_summary(class_loss, objdetect_loss, bndbox_loss, cost)

        self.cost = cost

        tf.get_variable = old_get_variable
Example 9
    def _build_graph(self, inputs):
        image, label = inputs
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))

        image = image / 256.0

        with remap_variables(binarize_weight), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                      .MaxPooling('pool0', 2, padding='SAME')
                      .apply(activate)
                      # 18
                      .Conv2D('conv1', 64, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn1')
                      .apply(activate)
                      .Conv2D('conv2', 64, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool1', 2, padding='SAME')
                      .apply(activate)
                      # 9
                      .Conv2D('conv3', 128, 3, padding='VALID')
                      .apply(fg)
                      .BatchNorm('bn3')
                      .apply(activate)
                      # 7
                      .Conv2D('conv4', 128, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn4')
                      .apply(activate)
                      .Conv2D('conv5', 128, 3, padding='VALID')
                      .apply(fg)
                      .BatchNorm('bn5')
                      .apply(activate)
                      # 5
                      .tf.nn.dropout(0.5 if is_training else 1.0)
                      .Conv2D('conv6', 512, 5, padding='VALID')
                      .apply(fg)
                      .BatchNorm('bn6')
                      .apply(cabs)
                      .FullyConnected('fc1', 10, nl=tf.identity)())
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = prediction_incorrect(logits, label)
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)
Example 10
        def resnet18_cifar(input_tensor,
                           is_training=True,
                           pooling_and_fc=True,
                           reuse=False,
                           kernel_initializer=tf.contrib.layers.variance_scaling_initializer()):
            with remap_variables(new_get_variable):
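                # new_get_variable and activate are defined outside this snippet, as in the
                # other quantization examples on this page.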
                x = tf.layers.conv2d(input_tensor,
                                     64, (3, 3),
                                     strides=(1, 1),
                                     kernel_initializer=kernel_initializer,
                                     use_bias=False,
                                     padding='SAME',
                                     name='conv1_1/3x3_s1',
                                     reuse=reuse)
                x = tf.layers.batch_normalization(x,
                                                  training=is_training,
                                                  name='bn1_1/3x3_s1',
                                                  reuse=reuse)
                x = tf.nn.relu(x)

                x1 = identity_block2d(x,
                                      3, [48, 64, 64],
                                      stage=2,
                                      block='1b',
                                      is_training=is_training,
                                      reuse=reuse,
                                      kernel_initializer=kernel_initializer)
                x1 = identity_block2d(x1,
                                      3, [48, 64, 64],
                                      stage=3,
                                      block='1c',
                                      is_training=is_training,
                                      reuse=reuse,
                                      kernel_initializer=kernel_initializer)

                x2 = conv_block_2d(x1,
                                   3, [96, 128, 128],
                                   stage=3,
                                   block='2a',
                                   strides=(2, 2),
                                   is_training=is_training,
                                   reuse=reuse,
                                   kernel_initializer=kernel_initializer)
                x2 = activate(x2)
                x2 = identity_block2d(x2,
                                      3, [96, 128, 128],
                                      stage=3,
                                      block='2b',
                                      is_training=is_training,
                                      reuse=reuse,
                                      kernel_initializer=kernel_initializer)

                x3 = conv_block_2d(x2,
                                   3, [128, 256, 256],
                                   stage=4,
                                   block='3a',
                                   strides=(2, 2),
                                   is_training=is_training,
                                   reuse=reuse,
                                   kernel_initializer=kernel_initializer)
                x3 = activate(x3)
                x3 = identity_block2d(x3,
                                      3, [128, 256, 256],
                                      stage=4,
                                      block='3b',
                                      is_training=is_training,
                                      reuse=reuse,
                                      kernel_initializer=kernel_initializer)

                x4 = conv_block_2d(x3,
                                   3, [256, 512, 512],
                                   stage=5,
                                   block='4a',
                                   strides=(2, 2),
                                   is_training=is_training,
                                   reuse=reuse,
                                   kernel_initializer=kernel_initializer)
                x4 = activate(x4)
                x4 = identity_block2d(x4,
                                      3, [256, 512, 512],
                                      stage=5,
                                      block='4b',
                                      is_training=is_training,
                                      reuse=reuse,
                                      kernel_initializer=kernel_initializer)

                print('before gap: ', x4)
                x4 = tf.reduce_mean(x4, [1, 2])
                print('after gap: ', x4)
                # flatten = tf.contrib.layers.flatten(x4)
                prob = tf.layers.dense(
                    x4,
                    self.class_num,
                    reuse=reuse,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())

                # tmp = tf.trainable_variables()
                # prob = tf.layers.batch_normalization(prob, training=is_training, name='fbn', reuse=reuse)
                print('prob', prob)

            return prob
Example 11
    def _build_graph(self, inputs):
        conf = Config()

        is_training = get_current_tower_context().is_training
        input, nextinput = inputs
        initializer = tf.random_uniform_initializer(-conf.init_scale,
                                                    conf.init_scale)

        def get_basic_cell():
            # cell = rnn.BasicLSTMCell(num_units=conf.hidden_size, forget_bias=0.0, reuse=tf.get_variable_scope().reuse)
            cell = ttq_rnn.TtqLSTMCell(
                num_units=conf.hidden_size,
                thre=0.05,
                forget_bias=1.0,
                reuse=tf.get_variable_scope().reuse)
            if is_training and conf.keep_prob < 1:
                cell = rnn.DropoutWrapper(cell,
                                          output_keep_prob=conf.keep_prob)
            return cell

        cell = rnn.MultiRNNCell(
            [get_basic_cell() for _ in range(conf.num_layers)])

        def get_v(n):
            return tf.get_variable(
                n,
                [conf.batch_size, conf.hidden_size],  #,[BATCH, HIDDEN_SIZE],
                trainable=False,
                initializer=tf.constant_initializer())

        def replace_w(x):
            if x.op.name.endswith('W'):
                print("\nBefore quantize name: " + x.op.name)
                return tw_ternarize(x, 0.05)  # tanh to round to [-1,+1]
                #return bit_utils.quantize_w(tf.tanh(x), bit=self._w_bit)
            elif x.op.name.endswith('b'):
                print("\nBefore quantize name: " + x.op.name)
                return tw_ternarize_bias(x, 0.05)
            else:
                print("\nNOT Quantizing:" + x.op.name)
                print(x.shape)
                print(type(x))
                tf.summary.histogram(x.name, x)
                return x

        # Parameters of gates are concatenated into one multiply for efficiency.
        # with bit_utils.replace_variable(replace_w):

        self.state = state_var = \
            (rnn.LSTMStateTuple(get_v('c0'), get_v('h0')),
             rnn.LSTMStateTuple(get_v('c1'), get_v('h1')))
        embeddingW = tf.get_variable(
            'embedding', [conf.vocab_size, conf.hidden_size],
            initializer=initializer)  #tf.random_uniform_initializer())
        input_feature = tf.nn.embedding_lookup(
            embeddingW, input)  # B x seqlen x hiddensize
        if is_training and conf.keep_prob < 1:
            input_feature = Dropout(input_feature, conf.keep_prob)

        print("\n\nThe STATE:")
        print(self.state)

        with tf.variable_scope('LSTM', initializer=initializer):
            input_list = tf.unstack(input_feature, num=conf.num_steps,
                                    axis=1)  # seqlen x (Bxhidden)
            outputs, last_state = rnn.static_rnn(cell,
                                                 input_list,
                                                 state_var,
                                                 scope='rnn')

        update_state_ops = []
        for k in range(conf.num_layers):
            update_state_ops.extend([
                tf.assign(state_var[k].c, last_state[k].c),
                tf.assign(state_var[k].h, last_state[k].h)
            ])

        # seqlen x (Bxrnnsize)
        output = tf.reshape(tf.concat(outputs, 1),
                            [-1, conf.hidden_size])  # (Bxseqlen) x hidden
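        # Only the output projection below is created under remap_variables, so replace_w
        # sees just that layer's W and b; the LSTM cells above are not remapped here.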
        with varreplace.remap_variables(replace_w):
            logits = FullyConnected('fc',
                                    output,
                                    conf.vocab_size,
                                    nl=tf.identity,
                                    W_init=initializer,
                                    b_init=initializer)

        xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.reshape(nextinput, [-1]))
        with tf.control_dependencies(update_state_ops):
            self.cost = tf.truediv(tf.reduce_sum(xent_loss),
                                   tf.cast(conf.batch_size, tf.float32),
                                   name='cost')  # log-perplexity

        perpl = tf.exp(self.cost / conf.num_steps, name='perplexity')
        summary.add_moving_summary(perpl, self.cost)
Example 12
    def _build_graph(self, inputs):
        image, label = inputs
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))

        image = image / 256.0

        with remap_variables(binarize_weight), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                      .MaxPooling('pool0', 2, padding='SAME')
                      .apply(activate)
                      # 18
                      .Conv2D('conv1', 64, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn1').apply(activate)

                      .Conv2D('conv2', 64, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool1', 2, padding='SAME')
                      .apply(activate)
                      # 9
                      .Conv2D('conv3', 128, 3, padding='VALID')
                      .apply(fg)
                      .BatchNorm('bn3').apply(activate)
                      # 7

                      .Conv2D('conv4', 128, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn4').apply(activate)

                      .Conv2D('conv5', 128, 3, padding='VALID')
                      .apply(fg)
                      .BatchNorm('bn5').apply(activate)
                      # 5
                      .tf.nn.dropout(0.5 if is_training else 1.0)
                      .Conv2D('conv6', 512, 5, padding='VALID')
                      .apply(fg).BatchNorm('bn6')
                      .apply(cabs)
                      .FullyConnected('fc1', 10, nl=tf.identity)())
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = prediction_incorrect(logits, label)
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)
Example 13
    def _build_graph(self, inputs):
        image, label = inputs
        image = tf.expand_dims(image, 3)
        image = image * 2 - 1  # center the pixels values at zero

        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))


        with remap_variables(binarize_weight), \
                argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
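            # Small convnet: conv0 and the fully connected layers keep full-precision weights
            # (binarize_weight skips them); the convolutions in between use binarized weights,
            # with activations quantized by fa after the clipped-abs nonlinearity.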
            logits = (LinearWrap(image)
                      .Conv2D('conv0')
                      .MaxPooling('pool0', 2)
                      .apply(activate)
                      .Conv2D('conv1')
                      .apply(fg)
                      .Conv2D('conv2')
                      .apply(fg)
                      .MaxPooling('pool1', 2)
                      .apply(activate)
                      .Conv2D('conv3')
                      .apply(fg)
                      .apply(cabs)
                      .FullyConnected('fc0', 512, activation=tf.nn.relu)
                      .Dropout('dropout', 0.5)
                      .FullyConnected('fc1', 10, activation=tf.identity)())

        tf.nn.softmax(logits, name='output')

        # a vector of length B with loss of each sample
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(
            cost, name='cross_entropy_loss')  # the average cross-entropy loss

        correct = tf.cast(tf.nn.in_top_k(logits, label, 1),
                          tf.float32,
                          name='correct')
        accuracy = tf.reduce_mean(correct, name='accuracy')

        # This will monitor training error (in a moving_average fashion):
        # 1. write the value to tensosrboard
        # 2. write the value to stat.json
        # 3. print the value after each epoch
        train_error = tf.reduce_mean(1 - correct, name='train_error')
        summary.add_moving_summary(train_error, accuracy)

        # Use a regex to find parameters to apply weight decay.
        # Here we apply a weight decay on all W (weight matrix) of all fc layers
        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),
                              name='regularize_loss')
        self.cost = tf.add_n([wd_cost, cost], name='total_cost')
        summary.add_moving_summary(cost, wd_cost, self.cost)

        # monitor histogram of all weight (of conv and fc layers) in tensorboard
        summary.add_param_summary(('.*/W', ['histogram', 'rms']))
Example 14
    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 256.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        def resblock(x, channel, stride):
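            # Pre-activation residual block: BatchNorm and the quantized activation come before
            # each conv, and the shortcut becomes a 1x1 conv (after an optional AvgPooling)
            # whenever the stride or the channel count changes.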
            def get_stem_full(x):
                return (LinearWrap(x)
                        .Conv2D('c3x3a', channel, 3)
                        .BatchNorm('stembn')
                        .apply(activate)
                        .Conv2D('c3x3b', channel, 3)())
            channel_mismatch = channel != x.get_shape().as_list()[3]
            if stride != 1 or channel_mismatch or 'pool1' in x.name:
                # handling pool1 is to work around an architecture bug in our model
                if stride != 1 or 'pool1' in x.name:
                    x = AvgPooling('pool', x, stride, stride)
                x = BatchNorm('bn', x)
                x = activate(x)
                shortcut = Conv2D('shortcut', x, channel, 1)
                stem = get_stem_full(x)
            else:
                shortcut = x
                x = BatchNorm('bn', x)
                x = activate(x)
                stem = get_stem_full(x)
            return shortcut + stem

        def group(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1'):
                x = resblock(x, channel, stride)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i)):
                    x = resblock(x, channel, 1)
            return x

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      # use explicit padding here, because our training framework has
                      # different padding mechanisms from TensorFlow
                      .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]])
                      .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True)
                      .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC')
                      .MaxPooling('pool1', 3, 2, padding='VALID')
                      .apply(group, 'conv2', 64, 2, 1)
                      .apply(group, 'conv3', 128, 2, 2)
                      .apply(group, 'conv4', 256, 2, 2)
                      .apply(group, 'conv5', 512, 2, 2)
                      .BatchNorm('lastbn')
                      .apply(nonlin)
                      .GlobalAvgPooling('gap')
                      .tf.multiply(49)  # this is due to a bug in our model design
                      .FullyConnected('fct', 1000)())
        tf.nn.softmax(logits, name='output')
        ImageNetModel.compute_loss_and_error(logits, label)
Example 15
    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 256.0

        fw, fa = get_quantize(BITW, BITA)
        old_get_variable = tf.get_variable

        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x * 0.1))

        def resblock(x,
                     channel,
                     stride,
                     bottleneck_dividend=4,
                     stem_type='full'):
            bottleneck_channel_s = channel // bottleneck_dividend

            def get_stem_bottleneck(x):
                return (LinearWrap(x)
                        .Conv2D('c1x1shrink', bottleneck_channel_s, 1)
                        .BatchNorm('stembn1')
                        .apply(activate)
                        .Conv2D('c3x3', bottleneck_channel_s, 3)
                        .BatchNorm('stembn2')
                        .apply(activate)
                        .Conv2D('c1x1expand', channel, 1)())

            def get_stem_full(x):
                return (LinearWrap(x)
                        .Conv2D('c3x3a', channel, 3)
                        .BatchNorm('stembn')
                        .apply(activate)
                        .Conv2D('c3x3b', channel, 3)())

            get_stem = dict(bottleneck=get_stem_bottleneck,
                            full=get_stem_full)[stem_type]
            channel_mismatch = channel != x.get_shape().as_list()[3]
            if stride != 1 or channel_mismatch or 'pool1' in x.name:
                # handling pool1 is to work around an architecture bug in our model
                if stride != 1 or 'pool1' in x.name:
                    x = AvgPooling('pool', x, stride, stride)
                x = BatchNorm('bn', x)
                x = activate(x)
                shortcut = Conv2D('shortcut', x, channel, 1)
                stem = get_stem(x)
            else:
                shortcut = x
                x = BatchNorm('bn', x)
                x = activate(x)
                stem = get_stem(x)
            return shortcut + stem

        def group(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1'):
                x = resblock(x, channel, stride)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i)):
                    x = resblock(x, channel, 1)
            return x

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      # use explicit padding here, because our training framework has
                      # different padding mechanisms from TensorFlow
                      .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]])
                      .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True)
                      .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC')
                      .MaxPooling('pool1', 3, 2, padding='VALID')
                      .apply(group, 'conv2', 64, 2, 1)
                      .apply(group, 'conv3', 128, 2, 2)
                      .apply(group, 'conv4', 256, 2, 2)
                      .apply(group, 'conv5', 512, 2, 2)
                      .BatchNorm('lastbn')
                      .apply(nonlin)
                      .GlobalAvgPooling('gap')
                      .tf.multiply(49)  # this is due to a bug in our model design
                      .FullyConnected('fct', 1000)())
        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

        wd_cost = regularize_cost('fc.*/W',
                                  l2_regularizer(5e-6),
                                  name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)
Example 16
    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 255.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)    # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 96, 12, stride=4, padding='VALID')
                      .apply(activate)
                      .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                      .apply(fg)
                      .BatchNorm('bn1')
                      .MaxPooling('pool1', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv2', 384, 3)
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool2', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv3', 384, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn3')
                      .apply(activate)

                      .Conv2D('conv4', 256, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn4')
                      .MaxPooling('pool4', 3, 2, padding='VALID')
                      .apply(activate)

                      .FullyConnected('fc0', 4096)
                      .apply(fg)
                      .BatchNorm('bnfc0')
                      .apply(activate)

                      .FullyConnected('fc1', 4096)
                      .apply(fg)
                      .BatchNorm('bnfc1')
                      .apply(nonlin)
                      .FullyConnected('fct', 1000, use_bias=True)())

        tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)
Example 17
    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 255.0

        fw, fa = get_quantize(BITW, BITA)

        old_get_variable = tf.get_variable

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x * 0.1))

        def inception_bn(name,
                         x,
                         nr_c0_conv_1x1,
                         nr_c1_conv_1x1,
                         nr_c1_conv_3x3,
                         nr_c2_conv_1x1,
                         nr_c2_conv_5x5,
                         nr_c3_conv_1x1,
                         nonlinearity=tf.nn.relu,
                         internal_nonlin=None,
                         do_proc=True):
            if internal_nonlin is None:
                internal_nonlin = nonlinearity
            outputs = []
            with tf.variable_scope(name) as scope:
                c0 = Conv2D('column_0_conv_1x1', x, nr_c0_conv_1x1, 1)
                c0 = BatchNorm('bn_0_1x1', c0)
                if do_proc:
                    c0 = activate(c0)
                outputs.append(c0)
                c1_1x1 = Conv2D('column_1_conv_1x1', x, nr_c1_conv_1x1, 1)
                c1_1x1 = BatchNorm('bn_1_1x1', c1_1x1)
                c1_1x1 = activate(c1_1x1)
                c1_3x3 = Conv2D('column_1_conv_3x3', c1_1x1, nr_c1_conv_3x3, 3)
                c1_3x3 = BatchNorm('bn_1_3x3', c1_3x3)
                if do_proc:
                    c1_3x3 = activate(c1_3x3)
                outputs.append(c1_3x3)
                c2_1x1 = Conv2D('column_2_conv_1x1', x, nr_c2_conv_1x1, 1)
                c2_1x1 = BatchNorm('bn_2_1x1', c2_1x1)
                c2_1x1 = activate(c2_1x1)
                c2_5x5 = Conv2D('column_2_conv_5x5', c2_1x1, nr_c2_conv_5x5, 5)
                c2_5x5 = BatchNorm('bn_2_5x5', c2_5x5)
                if do_proc:
                    c2_5x5 = activate(c2_5x5)
                outputs.append(c2_5x5)
                c3_maxpool = MaxPooling('column_3_maxpool',
                                        x,
                                        3,
                                        1,
                                        padding='SAME')
                c3_1x1 = Conv2D('column_3_conv_1x1', c3_maxpool,
                                nr_c3_conv_1x1, 1)
                c3_1x1 = BatchNorm('bn_3_1x1', c3_1x1)
                if do_proc:
                    c3_1x1 = activate(c3_1x1)
                outputs.append(c3_1x1)
                return tf.concat(outputs, 3, name='concat')

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
            nl = tf.identity
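            # nl is forwarded to inception_bn as its nonlinearity argument, but the function
            # body calls activate() directly, so nonlinearity/internal_nonlin are effectively
            # unused there.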
            l = (LinearWrap(image)
                 .Conv2D('conv1_1', 64, 7, stride=2, padding='SAME')
                 .BatchNorm('bn1_1')
                 .MaxPooling('pool1', 3, 2, padding='SAME')
                 .apply(activate)
                 .Conv2D('conv2_1', 64, 1, padding='SAME')
                 .BatchNorm('bn2_1')
                 .apply(activate)
                 .Conv2D('conv2_2', 192, 3, padding='SAME')
                 .BatchNorm('bn2_2')
                 .MaxPooling('pool2', 3, 2, padding='SAME')
                 .apply(activate)())
            l = inception_bn('inception_3_1', l, 64, 96, 128, 16, 32, 32, nl)
            l = inception_bn('inception_3_2', l, 128, 128, 192, 32, 96, 64, nl)
            l = MaxPooling('pool3', l, 3, 2, padding='SAME')
            l = inception_bn('inception_4_1', l, 192, 96, 208, 16, 48, 64, nl)
            l = inception_bn('inception_4_2', l, 160, 112, 224, 24, 64, 64, nl)
            l = inception_bn('inception_4_3', l, 128, 128, 256, 24, 64, 64, nl)
            l = inception_bn('inception_4_4', l, 112, 144, 288, 32, 64, 64, nl)
            l = inception_bn('inception_4_5', l, 256, 160, 320, 32, 128, 128,
                             nl)
            l = MaxPooling('pool4', l, 3, 2, padding='SAME')
            l = inception_bn('inception_5_1', l, 256, 160, 320, 32, 128, 128,
                             nl)
            l = inception_bn('inception_5_2', l, 384, 192, 384, 48, 128, 128,
                             nl, do_proc=False)
            l = GlobalAvgPooling('gap', l)
            l = activate(l)
            l = FullyConnected('fct', l, 1000, use_bias=True)
            logits = l

        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W',
                                  l2_regularizer(5e-6),
                                  name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)
Example 18
    def get_logits(self, image):
        if BITW == 't':
            fw, fa, fg = get_dorefa(32, 32, 32)
            fw = ternarize
        else:
            fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fct' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)    # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with remap_variables(new_get_variable), \
                argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 96, 12, strides=4, padding='VALID', use_bias=True)
                      .apply(activate)
                      .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                      .apply(fg)
                      .BatchNorm('bn1')
                      .MaxPooling('pool1', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv2', 384, 3)
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool2', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv3', 384, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn3')
                      .apply(activate)

                      .Conv2D('conv4', 256, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn4')
                      .MaxPooling('pool4', 3, 2, padding='VALID')
                      .apply(activate)

                      .FullyConnected('fc0', 4096)
                      .apply(fg)
                      .BatchNorm('bnfc0')
                      .apply(activate)

                      .FullyConnected('fc1', 4096, use_bias=False)
                      .apply(fg)
                      .BatchNorm('bnfc1')
                      .apply(nonlin)
                      .FullyConnected('fct', 1000, use_bias=True)())
        add_param_summary(('.*/W', ['histogram', 'rms']))
        tf.nn.softmax(logits, name='output')  # for prediction
        return logits
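For reference, the fw/fa pair returned by get_dorefa implements the uniform k-bit quantizers from the DoReFa-Net paper (for k > 1; the real implementation also wires in a straight-through estimator for the gradient, which is omitted here). A minimal NumPy sketch of the forward pass, with illustrative function names that are not part of the snippet above:

import numpy as np

def quantize_k(x, k):
    """Uniform k-bit quantization of x in [0, 1] (forward pass only)."""
    n = float(2 ** k - 1)
    return np.round(x * n) / n

def quantize_weight(w, k):
    """DoReFa weight quantization: map weights into [0, 1] via tanh, quantize, rescale to [-1, 1]."""
    w = np.tanh(w)
    w = w / (2.0 * np.max(np.abs(w))) + 0.5
    return 2.0 * quantize_k(w, k) - 1.0

def quantize_activation(x, k):
    """DoReFa activation quantization: assumes x was already clipped to [0, 1] (see nonlin above)."""
    return quantize_k(x, k)

w = np.random.randn(3, 3, 16, 32).astype(np.float32)
print(np.unique(quantize_weight(w, 2)).size)  # at most 2**2 = 4 distinct weight values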
Esempio n. 19
0
    def build_graph(self, image, label):
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)  # get the quantization functions for weights, activations and gradients

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):  # note: the first and last layers of a model are usually left unquantized
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):  # clipped ReLU
            if BITA == 32:
                return tf.nn.relu(x)
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):  # apply the clipped ReLU to the activation, then quantize it
            return fa(nonlin(x))

        image = image / 256.0

        with remap_variables(binarize_weight), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4, center=True, scale=True), \
                argscope(Conv2D, use_bias=False):  # remap every variable through binarize_weight and set default BatchNorm/Conv2D arguments

            logits = (
                LinearWrap(image)  # LinearWrap builds a sequential model; .apply() takes a function handle plus extra arguments
                # conv0: input [none,40,40,3] -> output [none,36,36,48]
                .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                # pool0: input [none,36,36,48] -> output [none,18,18,48]
                .MaxPooling('pool0', 2, padding='SAME')
                .apply(activate)  # quantize the activations
                # 18
                # conv1: input [none,18,18,48] -> output [none,18,18,64]
                .Conv2D('conv1', 64, 3, padding='SAME')
                .apply(fg)  # quantize the gradients
                .BatchNorm('bn1').apply(activate)
                # conv2: input [none,18,18,64] -> output [none,18,18,64]
                .Conv2D('conv2', 64, 3, padding='SAME')
                .apply(fg).BatchNorm('bn2')
                # pool1: input [none,18,18,64] -> output [none,9,9,64]
                .MaxPooling('pool1', 2, padding='SAME')
                .apply(activate)
                # 9
                # conv3: input [none,9,9,64] -> output [none,7,7,128]
                .Conv2D('conv3', 128, 3, padding='VALID')
                .apply(fg).BatchNorm('bn3').apply(activate)
                # 7
                # conv4: input [none,7,7,128] -> output [none,7,7,128]
                .Conv2D('conv4', 128, 3, padding='SAME')
                .apply(fg).BatchNorm('bn4').apply(activate)
                # conv5: input [none,7,7,128] -> output [none,5,5,128]
                .Conv2D('conv5', 128, 3, padding='VALID')
                .apply(fg).BatchNorm('bn5').apply(activate)
                # 5
                .Dropout(rate=0.5 if is_training else 0.0)
                # conv6: input [none,5,5,128] -> output [none,1,1,512]
                .Conv2D('conv6', 512, 5, padding='VALID')
                .apply(fg).BatchNorm('bn6')
                .apply(nonlin)  # only the clipped ReLU here, no quantization
                # fc1: output [none,10]
                .FullyConnected('fc1', 10)())
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)),
                        tf.float32,
                        name='wrong_tensor')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        total_cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, total_cost)
        return total_cost
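The new_get_variable / binarize_weight hooks in these examples all share one pattern: decide from the variable name whether to quantize, and always skip the first and last layers. A standalone sketch of that filter (should_quantize is a hypothetical helper, using the substrings from the snippet above) can be unit-tested without building a graph:

def should_quantize(var_name, skip_substrings=('conv0', 'fc')):
    """Return True if a variable named like 'conv2/W' should be quantized.
    Mirrors the checks above: only kernels ending in 'W', and never
    anything belonging to the first or last layer."""
    if not var_name.endswith('W'):
        return False
    return not any(s in var_name for s in skip_substrings)

assert should_quantize('conv2/W') is True
assert should_quantize('conv0/W') is False      # first layer kept full precision
assert should_quantize('fc1/W') is False        # last layer kept full precision
assert should_quantize('bn1/beta') is False     # not a conv/fc kernel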
Esempio n. 20
0
	def _build_graph(self, inputs):
		image, label, ious, valids, bndboxes = inputs
		image = tf.round(image)

		fw, fa, fg = get_dorefa(BITW, BITA, BITG)

		old_get_variable = tf.get_variable

		def monitor(x, name):
			if MONITOR == 1:
				return tf.Print(x, [x], message='\n\n' + name + ': ', summarize=1000, name=name)
			else:
				return x

		def new_get_variable(v):
			name = v.op.name
			if not name.endswith('W') or 'conv1' in name or 'conv_obj' in name or 'conv_box' in name:
				return v
			else:
				logger.info("Quantizing weight {}".format(v.op.name))
				if MONITOR == 1:
					return tf.Print(fw(v), [fw(v)], message='\n\n' + v.name + ', Quantized weights are:', summarize=100)
				else:
					return fw(v)

		def activate(x):
			if BITA == 32:
				return tf.nn.relu(x)
			else:
				return fa(tf.nn.relu(x))

		def bn_activate(name, x):
			x = BatchNorm(name, x)
			x = monitor(x, name + '_noact_out')
			return activate(x)

		def halffire(name, x, num_squeeze_filters, num_expand_3x3_filters, skip):
			out_squeeze = Conv2D('squeeze_conv_' + name, x, out_channel=num_squeeze_filters, kernel_shape=1, stride=1, padding='SAME')
			out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
			out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze, out_channel=num_expand_3x3_filters, kernel_shape=3, stride=1, padding='SAME')
			out_expand_3x3 = bn_activate('bn_expand_3x3_' + name, out_expand_3x3)
			if skip == 0:
				return out_expand_3x3
			else:
				return tf.add(x, out_expand_3x3)

		def halffire_noact(name, x, num_squeeze_filters, num_expand_3x3_filters):
			out_squeeze = Conv2D('squeeze_conv_' + name, x, out_channel=num_squeeze_filters, kernel_shape=1, stride=1, padding='SAME')
			out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
			out_expand_3x3 = Conv2D('expand_3x3_conv_' + name, out_squeeze, out_channel=num_expand_3x3_filters, kernel_shape=3, stride=1, padding='SAME')
			return out_expand_3x3

		with 	remap_variables(new_get_variable), \
				argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity), \
				argscope(BatchNorm, decay=0.9, epsilon=1e-4):

			image = monitor(image, 'image_out')

			l = Conv2D('conv1', image, out_channel=32, kernel_shape=3, stride=2, padding='SAME')
			l = bn_activate('bn1', l)
			l = monitor(l, 'conv1_out')

			l = MaxPooling('pool1', l, shape=3, stride=2, padding='SAME')
			l = monitor(l, 'pool1_out')

			l = halffire('fire1', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
			l = monitor(l, 'fire1_out')

			l = MaxPooling('pool2', l, shape=3, stride=2, padding='SAME')
			l = monitor(l, 'pool2_out')

			l = halffire('fire2', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
			l = monitor(l, 'fire2_out')

			l = MaxPooling('pool3', l, shape=3, stride=2, padding='SAME')
			l = monitor(l, 'pool3_out')

			l = halffire('fire3', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
			l = monitor(l, 'fire3_out')

			l = halffire('fire4', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
			l = monitor(l, 'fire4_out')			

			l = halffire('fire5', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
			l = monitor(l, 'fire5_out')

			l = halffire('fire6', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
			l = monitor(l, 'fire6_out')

			l = halffire('fire7', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS, 0)
			l = monitor(l, 'fire7_out')

			# Classification
			classify = Conv2D('conv_class', l, out_channel=12, kernel_shape=1, stride=1, padding='SAME')
			classify = bn_activate('bn_class', classify)
			classify = monitor(classify, 'conv_class_out')
			logits = GlobalAvgPooling('pool_class', classify)

			class_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
			class_loss = tf.reduce_mean(class_loss, name='cross_entropy_loss')

			wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
			add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

			# Object Detection
			l = tf.concat([l, classify], axis=3)

			objdetect = Conv2D('conv_obj', l, out_channel=1, kernel_shape=1, stride=1, padding='SAME')
			objdetect = tf.identity(objdetect, name='objdetect_out')
			objdetect_loss = tf.losses.hinge_loss(labels=ious, logits=objdetect)

			bndbox = Conv2D('conv_box', l, out_channel=4, kernel_shape=1, stride=1, padding='SAME')
			bndbox = tf.identity(bndbox, name='bndbox_out')
			bndbox = tf.multiply(bndbox, valids, name='mult0')
			bndbox_loss = tf.losses.mean_squared_error(labels=bndboxes, predictions=bndbox)

			# weight decay on all W of fc layers
			# reg_cost = regularize_cost('(fire7|conv_obj|conv_box).*/W', l2_regularizer(1e-5), name='regularize_cost')

			# cost = class_loss*objdetect_loss*bndbox_loss
			# cost = class_loss + objdetect_loss + bndbox_loss + reg_cost
			cost = class_loss + 10*objdetect_loss + bndbox_loss

			add_moving_summary(class_loss, objdetect_loss, bndbox_loss, cost)

		self.cost = cost

		tf.get_variable = old_get_variable
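The detection head above combines a classification loss, a hinge loss on the objectness map (fed with IoU values as soft labels) and an MSE box-regression loss, with the objectness term weighted 10x. The sketch below is only a rough NumPy equivalent: it assumes the standard hinge formulation in which {0, 1} labels are mapped to {-1, +1}, and the exact reductions inside tf.losses may differ.

import numpy as np

def hinge_loss(labels, logits):
    """Mean hinge loss; labels in [0, 1] are mapped to [-1, +1]."""
    signs = 2.0 * labels - 1.0
    return np.mean(np.maximum(0.0, 1.0 - signs * logits))

def mse_loss(targets, preds):
    return np.mean((targets - preds) ** 2)

def detection_cost(class_loss, ious, objdetect, bndboxes, bndbox, obj_weight=10.0):
    """Weighted sum used above: class + 10 * objectness + box regression."""
    objdetect_loss = hinge_loss(ious, objdetect)
    bndbox_loss = mse_loss(bndboxes, bndbox)
    return class_loss + obj_weight * objdetect_loss + bndbox_loss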
Esempio n. 21
0
    def build_graph(self, image, label):
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'weak' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)
                #return ternarize(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))
        
        def merge(x, y):
            #return x + y
            #return x - y
            return tf.concat([x,y], axis=3)

        image = image / 256.0
        k = 3
        zp = 0.25
        zp2 = zp / 1
        #scale = tf.train.exponential_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*5, decay_rate=0.5, staircase=True, name='scale')
        #scale = tf.where(scale>0.001, scale, tf.zeros_like(scale))
        scale = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*50, alpha=0.0)
        tf.summary.scalar('scale', scale)
        endconv = []
        endweak = []
        #scale2 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*50, alpha=0.0)
        #scale3 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*80, alpha=0.0)
        with remap_variables(binarize_weight), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            net=Conv2D('conv0', image, np.round(48*zp), 5, padding='VALID', use_bias=True)
            net=MaxPooling('pool0', net, 2, padding='SAME');            net=activate(net)

            net1=Conv2D('conv1', net, np.round(64*zp), 3, padding='SAME');      net1=BatchNorm('bn1', net1);     endconv.append(net1)
            net2=Conv2D('weak1', net, np.round(64*zp2), k, padding='SAME');      net2=BatchNorm('bn12', net2);      endweak.append(net2);  #  net2=tf.nn.relu(net2)
            net=merge(activate(net1), scale*net2)
            #net=activate(net1)
            
            net1=Conv2D('conv2', net, np.round(64*zp), 3, padding='SAME');      net1=BatchNorm('bn2', net1);     endconv.append(net1)
            net2=Conv2D('weak2', net, np.round(64*zp2), k, padding='SAME');      net2=BatchNorm('bn22', net2);      endweak.append(net2);   # net2=tf.nn.relu(net2)
            net1=MaxPooling('pool1', net1, 2, padding='SAME');   net2=MaxPooling('pool12', net2, 2, padding='SAME');
            net=merge(activate(net1), scale*net2)
            #net=activate(net1)

            net1=Conv2D('conv3', net, np.round(128*zp), 3, padding='VALID');      net1=BatchNorm('bn3', net1);     endconv.append(net1)
            net2=Conv2D('weak3', net, np.round(128*zp2), k, padding='VALID');      net2=BatchNorm('bn32', net2);      endweak.append(net2);  #  net2=tf.nn.relu(net2)
            net=merge(activate(net1), scale*net2)
            #net=activate(net1)

            net1=Conv2D('conv4', net, np.round(128*zp), 3, padding='SAME');      net1=BatchNorm('bn4', net1);     endconv.append(net1)
            net2=Conv2D('weak4', net, np.round(128*zp2), k, padding='SAME');      net2=BatchNorm('bn42', net2);      endweak.append(net2);  #  net2=tf.nn.relu(net2)
            net=merge(activate(net1), scale*net2)
            # net=activate(net1)

            net1=Conv2D('conv5', net, np.round(128*zp), 3, padding='VALID');      net1=BatchNorm('bn5', net1);     endconv.append(net1)
            net2=Conv2D('weak5', net, np.round(128*zp2), k, padding='VALID');      net2=BatchNorm('bn52', net2);      endweak.append(net2);  #  net2=tf.nn.relu(net2)
            net=merge(activate(net1), scale*net2)
            #net=activate(net1)

            net=tf.nn.dropout(net, 0.5 if is_training else 1.0)
            net1=Conv2D('conv6', net, np.round(512*zp), 5, padding='VALID');       net1=BatchNorm('bn6', net1);     endconv.append(net1)
            net2=Conv2D('weak6', net, np.round(512*zp2), 5, padding='VALID');       net2=BatchNorm('bn62', net2);      endweak.append(net2);  #  net2=tf.nn.relu(net2)
            net=merge(cabs(net1), scale*net2)
            # net=cabs(net1)
            logits=FullyConnected('fc1', net, 10)
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_tensor')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        total_cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, total_cost)
        for i in range(len(endweak)):
            add_moving_summary(tf.reduce_mean(tf.abs(endconv[i]), name='mean_conv_'+str(i+1) )  )
            add_moving_summary(tf.reduce_mean(tf.abs(endweak[i]), name='mean_weak_'+str(i+1) )  )

        return total_cost
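The scale tensor above fades the full-precision "weak" branches out over training via tf.train.cosine_decay. The following NumPy sketch reproduces that schedule (same formula as the TensorFlow op, to the best of my understanding); the "steps per epoch" interpretation of 4721 is an assumption.

import numpy as np

def cosine_decay(initial_value, step, decay_steps, alpha=0.0):
    """Cosine decay from initial_value down to alpha * initial_value."""
    step = min(step, decay_steps)
    cosine = 0.5 * (1.0 + np.cos(np.pi * step / decay_steps))
    return initial_value * ((1.0 - alpha) * cosine + alpha)

total = 4721 * 50  # decay_steps used above (presumably ~4721 steps per epoch, 50 epochs)
for s in (0, total // 4, total // 2, total):
    print(s, round(cosine_decay(1.0, s, total), 3))
# prints 1.0, ~0.854, 0.5, 0.0: the weak branch contribution shrinks to zero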
Esempio n. 22
0
def ResNet18(image,
             label,
             scope,
             is_training,
             dataset='cifar',
             reuse=False,
             Distill=None,
             bit_a=32,
             bit_w=32,
             bit_g=32):
    end_points = {}

    nChannels = []
    if 'cifar' in dataset or 'svhn' in dataset:
        nChannels = [64, 64, 128, 256, 512]
    elif 'imagenet' in dataset:
        nChannels = [64, 256, 512, 1024, 2048]

    assert len(nChannels) > 0, "empty channels!!"

    stride = [1, 2, 2, 2]

    # 4 (stride) * n * 2 + 2. shortcut is not involved.
    # 32 -> 16 -> 8 -> 4
    n = 2
    if scope == 'Teacher':
        with tf.variable_scope(scope):
            std = tf.contrib.layers.conv2d(image,
                                           nChannels[0], [3, 3],
                                           1,
                                           scope='base_conv',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn0',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)
            for i in range(len(stride)):
                std = NetworkBlock(std,
                                   ResBlock,
                                   n,
                                   nChannels[i + 1],
                                   stride[i],
                                   is_training=is_training,
                                   reuse=reuse,
                                   name='Resblock%d' % i)
            fc = tf.reduce_mean(std, [1, 2])
            logits = tf.contrib.layers.fully_connected(
                fc,
                label.get_shape().as_list()[-1],
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.zeros_initializer(),
                trainable=True,
                scope='full',
                reuse=reuse)
            end_points['Logits'] = logits
    elif scope == 'Student':
        fw, fa, fg = get_dorefa(bit_w, bit_a, bit_g)

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith(
                    'weights') or 'base_conv' in name or 'full' in name:
                return v
            else:
                tf.logging.info("Quantizing weight {} at bits {}".format(
                    v.op.name, bit_w))
                return fw(v)

        def nonlin(x):
            if bit_a == 32:
                return tf.nn.relu(x)  # still use relu for 32-bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            tf.logging.info("Quantizing activations {} at bits {}".format(
                x.name, bit_a))
            return fa(nonlin(x))

        with tf.variable_scope(scope), remap_variables(new_get_variable):
            std = tf.contrib.layers.conv2d(image,
                                           nChannels[0], [3, 3],
                                           1,
                                           scope='base_conv',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn0',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            for i in range(len(stride)):
                std = NetworkBlock(std,
                                   ResBlock,
                                   n,
                                   nChannels[i + 1],
                                   stride[i],
                                   activate,
                                   is_training=is_training,
                                   reuse=reuse,
                                   name='Resblock%d' % i,
                                   scope=scope)
            fc = tf.reduce_mean(std, [1, 2])
            logits = tf.contrib.layers.fully_connected(
                fc,
                label.get_shape().as_list()[-1],
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.zeros_initializer(),
                trainable=True,
                scope='full',
                reuse=reuse)
            end_points['Logits'] = logits

    if Distill is not None:
        if Distill == 'DML':
            teacher_train = True
        else:
            is_training = False
            teacher_train = False
        with tf.variable_scope('Teacher'):
            with tf.contrib.framework.arg_scope(
                [tf.contrib.layers.conv2d, tf.contrib.layers.fully_connected],
                    variables_collections=[
                        tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher'
                    ]):
                with tf.contrib.framework.arg_scope(
                    [tf.contrib.layers.batch_norm],
                        variables_collections=[
                            tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher'
                        ]):
                    tch = tf.contrib.layers.conv2d(image,
                                                   nChannels[0], [3, 3],
                                                   1,
                                                   scope='base_conv',
                                                   trainable=teacher_train,
                                                   reuse=reuse)
                    tch = tf.contrib.layers.batch_norm(tch,
                                                       scope='bn0',
                                                       trainable=teacher_train,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    tch = tf.nn.relu(tch)
                    for i in range(len(stride)):
                        tch = NetworkBlock(tch,
                                           ResBlock,
                                           n,
                                           nChannels[i + 1],
                                           stride[i],
                                           is_training=is_training,
                                           reuse=reuse,
                                           name='Resblock%d' % i)
                    fc = tf.reduce_mean(tch, [1, 2])
                    logits_tch = tf.contrib.layers.fully_connected(
                        fc,
                        label.get_shape().as_list()[-1],
                        weights_initializer=tf.contrib.layers.
                        xavier_initializer(),
                        biases_initializer=tf.zeros_initializer(),
                        trainable=teacher_train,
                        scope='full',
                        reuse=reuse)
                    end_points['Logits_tch'] = logits_tch

        with tf.variable_scope('Distillation'):
            feats = tf.get_collection('feat')
            student_feats = feats[:len(feats) // 2]
            teacher_feats = feats[len(feats) // 2:]
            feats_noact = tf.get_collection('feat_noact')
            student_feats_noact = feats_noact[:len(feats_noact) // 2]
            teacher_feats_noact = feats_noact[len(feats_noact) // 2:]

            if Distill == 'Soft_logits':
                tf.add_to_collection(
                    'dist', Response.Soft_logits(logits, logits_tch, 3))
            elif Distill == 'DML':
                tf.add_to_collection('dist', Response.DML(logits, logits_tch))
            elif Distill == 'FT':
                tf.add_to_collection(
                    'dist',
                    Response.Factor_Transfer(student_feats_noact[-1],
                                             teacher_feats_noact[-1]))

            elif Distill == 'FitNet':
                tf.add_to_collection(
                    'dist', Multiple.FitNet(student_feats, teacher_feats))
            elif Distill == 'AT':
                tf.add_to_collection(
                    'dist',
                    Multiple.Attention_transfer(student_feats, teacher_feats))
            elif Distill == 'AB':
                tf.add_to_collection(
                    'dist',
                    Multiple.AB_distillation(student_feats, teacher_feats, 1.,
                                             3e-3))

            elif Distill == 'FSP':
                tf.add_to_collection('dist',
                                     Shared.FSP(student_feats, teacher_feats))
            elif Distill[:3] == 'KD-':
                tf.add_to_collection(
                    'dist',
                    Shared.KD_SVD(student_feats, teacher_feats, Distill[-3:]))

            elif Distill == 'RKD':
                tf.add_to_collection(
                    'dist', Relation.RKD(logits, logits_tch, l=[5e1, 1e2]))
            elif Distill == 'MHGD':
                tf.add_to_collection(
                    'dist', Relation.MHGD(student_feats, teacher_feats))
            elif Distill == 'MHGD-RKD':
                tf.add_to_collection(
                    'dist',
                    Relation.MHGD(student_feats, teacher_feats) +
                    Relation.RKD(logits, logits_tch, l=[5e1, 1e2]))
            elif Distill == 'MHGD-RKD-SVD':
                tf.add_to_collection(
                    'dist',
                    Relation.MHGD(student_feats, teacher_feats) +
                    Relation.RKD(logits, logits_tch, l=[5e1, 1e2]) +
                    Shared.KD_SVD(student_feats, teacher_feats, "SVD"))

    return end_points
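Response.Soft_logits is not shown in this listing; a common formulation of Hinton-style soft-logit distillation, which the call above appears to request (the third argument, 3, looks like a temperature), is sketched below in NumPy. This is an assumption about that helper, not its actual code.

import numpy as np

def softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)

def soft_logits_loss(student_logits, teacher_logits, T=3.0):
    """Cross-entropy between temperature-softened teacher and student distributions.
    Often additionally scaled by T**2 when mixed with the hard-label loss."""
    p_teacher = softmax(teacher_logits / T)
    p_student = softmax(student_logits / T)
    return -np.mean(np.sum(p_teacher * np.log(p_student + 1e-8), axis=-1))

student = np.random.randn(8, 10)
teacher = np.random.randn(8, 10)
print(soft_logits_loss(student, teacher))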
Esempio n. 23
0
    def build_graph(self, image, label):
        image = image / 256.0
        is_training = get_current_tower_context().is_training
        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        def resblock(x, channel, stride):
            def get_stem_full(x):
                return (LinearWrap(x)
                        .Conv2D('c3x3a', channel, 3)
                        .quan_all_L2norm('stembn')
                        .apply(activate)
                        .Conv2D('c3x3b', channel, 3)())
            channel_mismatch = channel != x.get_shape().as_list()[3]
            if stride != 1 or channel_mismatch or 'pool1' in x.name:
                # handling pool1 is to work around an architecture bug in our model
                if stride != 1 or 'pool1' in x.name:
                    x = AvgPooling('pool', x, stride, stride)
                x = quan_all_L2norm('bn', x)
                x = activate(x)
                shortcut = Conv2D('shortcut', x, channel, 1)
                stem = get_stem_full(x)
            else:
                shortcut = x
                x = quan_all_L2norm('bn', x)
                x = activate(x)
                stem = get_stem_full(x)
            return shortcut + stem

        def group(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1'):
                x = resblock(x, channel, stride)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i)):
                    x = resblock(x, channel, 1)
            return x

        with remap_variables(new_get_variable), \
                argscope(quan_all_L2norm, momentum=0.9, eps=1e-4,train=is_training), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      # use explicit padding here, because our private training framework has
                      # different padding mechanisms from TensorFlow
                      .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]])
                      .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True)
                      .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC')
                      .MaxPooling('pool1', 3, 2, padding='VALID')
                      .apply(group, 'conv2', 64, 2, 1)
                      .apply(group, 'conv3', 128, 2, 2)
                      .apply(group, 'conv4', 256, 2, 2)
                      .apply(group, 'conv5', 512, 2, 2)
                      .quan_all_L2norm('lastbn')
                      .apply(nonlin)
                      .GlobalAvgPooling('gap')
                      #.tf.multiply(49)  # this is due to a bug in our model design
                      .FullyConnected('fct', 10)())
        tf.nn.softmax(logits, name='output')
        # compute the number of failed samples
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_vector')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        # weight decay on all W of fc layers
        #wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),480000, 0.2, True)
        #wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
        #add_moving_summary(cost, wd_cost)

        add_param_summary(('.*/W', ['histogram']))   # monitor W
        return tf.add_n([cost], name='cost')
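The explicit tf.pad calls above reproduce the asymmetric [3, 2] padding of the private framework mentioned in the comments. A small NumPy sketch of what that padding does to the spatial dimensions (the 224x224 input size is purely illustrative):

import numpy as np

x = np.zeros((1, 224, 224, 3), dtype=np.float32)  # NHWC feature map
# pad height and width by 3 on one side and 2 on the other, as in the example
padded = np.pad(x, [[0, 0], [3, 2], [3, 2], [0, 0]], mode='constant')
print(padded.shape)  # (1, 229, 229, 3); a 7x7 stride-2 VALID conv then yields 112x112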
Esempio n. 24
0
    def build_graph(self, image, label):
        # get quantization function
        # quantize weights
        qw = quantize_weight(int(self.quantizer_config['BITW']),
                             self.quantizer_config['name'],
                             self.quantizer_config['W_opts'],
                             self.quantizer_config)
        # quantize activation
        if self.quantizer_config['BITA'] in ['32', 32]:
            qa = tf.identity
        else:
            qa = quantize_activation(int(self.quantizer_config['BITA']),
                                     self.quantizer_config['name'],
                                     self.quantizer_config)
        # quantize gradient
        qg = quantize_gradient(int(self.quantizer_config['BITG']))

        def new_get_variable(v):
            name = v.op.name
            # don't quantize first and last layer
            if not name.endswith('/W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return qw(v)

        def activate(x):
            return qa(self.activation(x))

        @layer_register(use_scope=True)
        def DWConv2D(inputs,
                     channel,
                     kernel_size=3,
                     stride=1,
                     padding='SAME',
                     data_format=None,
                     dilations=None):
            #output = tf.keras.layers.DepthwiseConv2D(kernel_size, strides=(stride,stride), padding='same', use_bias=False)(inputs)
            #print(output.name, ': ', inputs.shape, ' --> ', output.shape)
            #return output
            curr_channel = inputs.get_shape().as_list()[3]
            var = tf.get_variable(
                name='dwconv_kernel',
                shape=[kernel_size, kernel_size, curr_channel, 1],
                initializer=tf.glorot_uniform_initializer)

            output = tf.nn.depthwise_conv2d(inputs,
                                            var,
                                            strides=(1, stride, stride, 1),
                                            padding=padding)
            print(output.name, ': ', inputs.shape, ' --> ', output.shape)
            return output

        @layer_register(use_scope=True)
        def SE_block(input_feature, ratio=8):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(
            )
            bias_initializer = tf.constant_initializer(value=0.0)

            channel = input_feature.get_shape()[-1]
            # Global average pooling
            squeeze = tf.reduce_mean(input_feature, axis=[1, 2], keepdims=True)
            excitation = tf.layers.dense(inputs=squeeze,
                                         units=channel // ratio,
                                         activation=tf.nn.relu,
                                         kernel_initializer=kernel_initializer,
                                         bias_initializer=bias_initializer,
                                         name='bottleneck_fc')
            excitation = tf.layers.dense(inputs=excitation,
                                         units=channel,
                                         activation=tf.nn.sigmoid,
                                         kernel_initializer=kernel_initializer,
                                         bias_initializer=bias_initializer,
                                         name='recover_fc')
            scale = input_feature * excitation
            return scale

        def SepConv(x, name, nr_block, channel, kernel_size, stride):
            with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
                x = DWConv2D('dwconv', x, channel, kernel_size, stride)
                x = Conv2D('pwconv', x, channel, 1, strides=(stride, stride))
            return x

        def block(x, channel, kernel_size, stride, extension, SE):
            channel_match = channel == x.get_shape().as_list()[3]

            shortcut = x

            if x.get_shape().as_list()[3] < 20:
                x = Conv2D('pwconv_a',
                           x,
                           channel * extension,
                           1,
                           strides=(1, 1))
            else:
                x = Conv2D('pwconv_a1', x, 20, 1, strides=(1, 1))
                x = Conv2D('pwconv_a2',
                           x,
                           channel * extension,
                           1,
                           strides=(1, 1))
            x = BatchNorm('bn_a', x)
            x = activate(x)

            x = DWConv2D('dwconv_b', x, channel * extension, kernel_size,
                         stride)
            x = BatchNorm('bn_b', x)
            x = activate(x)

            if SE:
                x = SE_block('se_block', x)

            if channel < 20:
                x = Conv2D('pwconv_c', x, channel, 1, strides=(1, 1))
            else:
                x = Conv2D('pwconv_c1', x, 20, 1, strides=(1, 1))
                x = Conv2D('pwconv_c2', x, channel, 1, strides=(1, 1))
            x = BatchNorm('bn_c', x)

            if stride == 1 and channel_match:
                x = x + shortcut
            return x

        def group(x, name, nr_block, channel, kernel_size, stride, extension,
                  SE):
            with tf.variable_scope(name + 'blk1', reuse=tf.AUTO_REUSE):
                x = block(x, channel, kernel_size, stride, extension, SE)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    x = block(x, channel, kernel_size, 1, extension, SE)
            return x

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.99, epsilon=1e-3), \
                argscope(Conv2D, use_bias=False, nl=tf.identity,
                         kernel_initializer=tf.variance_scaling_initializer(scale=float(self.initializer_config['scale']),
                                                                            mode=self.initializer_config['mode'])):
            logits = (LinearWrap(image)
                      .Conv2D('conv1', 32, 3)                           # size=32
                      .apply(group, 'mbconv2', 1, 16, 3, 1, 6, False)   # size=32
                      .apply(group, 'mbconv3', 2, 24, 3, 1, 6, False)   # size=16
                      .apply(group, 'mbconv4', 3, 32, 3, 2, 6, False)   # size=8
                      .apply(group, 'mbconv5', 4, 64, 3, 2, 6, False)   # size=4
                      .apply(group, 'mbconv6', 3, 96, 3, 1, 6, False)   # size=4
                      .apply(group, 'mbconv7', 3, 160, 3, 2, 6, False)  # size=2
                      .apply(group, 'mbconv8', 1, 320, 3, 1, 6, False)  # size=2
                      .Conv2D('conv9/pwconv_a1', 20, 1, strides=(1, 1))
                      .Conv2D('conv9/pwconv_a2', 1280, 1, strides=(1, 1))
                      .BatchNorm('last_bn')
                      .apply(activate)
                      .GlobalAvgPooling('gap')
                      .FullyConnected('fct', self.nb_classes)())
        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        # regularization
        if self.regularizer_config['name'] not in [None, 'None']:
            reg_func = getattr(regularizers,
                               self.regularizer_config['name'])().get_func(
                                   self.regularizer_config,
                                   self.quantizer_config)
            reg_cost = tf.multiply(float(self.regularizer_config['lmbd']),
                                   regularize_cost('.*/W', reg_func),
                                   name='reg_cost')
            #reg_cost = tf.multiply(float(self.regularizer_config['lmbd']), regularize_cost_from_collection(), name='reg_cost')
            total_cost = tf.add_n([cost, reg_cost], name='total_cost')
        else:
            total_cost = cost

        # summary
        def add_summary(logits, cost):
            err_top1 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label,
                                                             1)),
                               tf.float32,
                               name='err_top1')
            add_moving_summary(
                tf.reduce_mean(err_top1, name='train_error_top1'))
            err_top5 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label,
                                                             5)),
                               tf.float32,
                               name='err_top5')
            add_moving_summary(
                tf.reduce_mean(err_top5, name='train_error_top5'))

            add_moving_summary(cost)
            add_param_summary(('.*/W', ['histogram']))  # monitor W

        add_summary(logits, cost)

        return total_cost
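The SE_block above implements squeeze-and-excitation gating with a reduction ratio of 8. As a sanity check on the math, here is a NumPy sketch of the same forward computation; W1, b1, W2, b2 are stand-ins for the weights of the two tf.layers.dense calls and are randomly initialized here.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def se_block(x, W1, b1, W2, b2):
    """x: [N, H, W, C]; squeeze to [N, 1, 1, C], excite through two FCs, rescale x."""
    squeeze = x.mean(axis=(1, 2), keepdims=True)   # global average pool
    hidden = np.maximum(0.0, squeeze @ W1 + b1)    # bottleneck FC + ReLU
    gate = sigmoid(hidden @ W2 + b2)               # recover FC + sigmoid, values in (0, 1)
    return x * gate                                # channel-wise rescaling

N, H, W, C, ratio = 2, 4, 4, 16, 8
x = np.random.randn(N, H, W, C).astype(np.float32)
W1 = np.random.randn(C, C // ratio).astype(np.float32)
b1 = np.zeros(C // ratio, dtype=np.float32)
W2 = np.random.randn(C // ratio, C).astype(np.float32)
b2 = np.zeros(C, dtype=np.float32)
print(se_block(x, W1, b1, W2, b2).shape)  # (2, 4, 4, 16)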
Esempio n. 25
0
    def _build_graph(self, inputs):
        inp, label = inputs
        is_training = get_current_tower_context().is_training

        fw, fa = get_dorefa(self.bitw, self.bita)

        def binarize_weight(v):
            name = v.op.name
            if not (name.endswith('W') or name.endswith('b')):
                logger.info("Not quantizing {}".format(name))
                return v
            elif not self.quant_ends and 'conv0' in name:
                logger.info("Not quantizing {}".format(name))
                return v
            elif not self.quant_ends and 'last_linear' in name:
                logger.info("Not quantizing {}".format(name))
                return v
            elif not self.quant_ends and (self.net_fn == fcn1_net or self.net_fn == fcn2_net) and 'linear0' in name:
                logger.info("Not quantizing {}".format(name))
                return v
            else:
                logger.info("Quantizing weight {}".format(name))
                return fw(v)

        def nonlin(x, name="activate"):
            if self.bita == 32:
                return fa(tf.nn.relu(BNWithTrackedMults(x)))
            else:
                return fa(tf.clip_by_value(BNWithTrackedMults(x), 0.0, 1.0))

        with remap_variables(binarize_weight), \
                argscope([FullyConnectedWithTrackedMults], network_complexity=self.network_complexity), \
                argscope([Conv2DWithTrackedMults], network_complexity=self.network_complexity), \
                argscope([BNReLUWithTrackedMults], network_complexity=self.network_complexity), \
                argscope([BNWithTrackedMults], network_complexity=self.network_complexity), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4):
            l = self.net_fn(inp, nonlin, self.n_context)
            logits = FullyConnectedWithTrackedMults('last_linear', l, out_dim=self.n_spks, nl=tf.identity)

        prob = tf.nn.softmax(logits, name='output')

        # used for validation accuracy of utterance
        identity_guesses = flatten(tf.argmax(prob, axis=1))
        uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
        idx_to_identity_with_most_votes = tf.argmax(count)
        chosen_identity = tf.gather(uniq_identities, idx_to_identity_with_most_votes)
        wrong = tf.expand_dims(tf.not_equal(chosen_identity, tf.cast(label[0], tf.int64)), axis=0, name='utt-wrong')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        add_moving_summary(cost)

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

        with tf.name_scope('original-weight-summaries'):
            add_param_summary(('.*/W', ['rms', 'histogram']))
            add_param_summary(('.*/b', ['rms', 'histogram']))

        with tf.name_scope('activation-summaries'):
            def fn(name):
                return (name.endswith('output') or name.endswith('output:0')) and "Inference" not in name and 'quantized' not in name
            tensors = get_tensors_from_graph(tf.get_default_graph(), fn) 
            logger.info("Adding activation tensors to summary: {}".format(tensors))
            for tensor in tensors:
                add_tensor_summary(tensor, ['rms', 'histogram'])

        wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(), 480000, 0.2, True)
        wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
        add_moving_summary(wd_cost)
        self.cost = tf.add_n([cost, wd_cost], name='cost')

        tf.constant([self.network_complexity['mults']], name='TotalMults')
        tf.constant([self.network_complexity['weights']], name='TotalWeights')
        logger.info("Parameter count: {}".format(self.network_complexity))
Esempio n. 26
0
    def _build_graph(self, inputs):
        image, label = inputs
        """Add a single channel here"""
        image = tf.expand_dims(image, 3)

        image = image * 256
        image = tf.round(image)

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        old_get_variable = tf.get_variable

        def monitor(x, name):
            if MONITOR == 1:
                return tf.Print(x, [x],
                                message='\n\n' + name + ': ',
                                summarize=1000,
                                name=name)
            else:
                return x

        def new_get_variable(v):
            name = v.op.name
            if not name.endswith('W') or 'conv0' in name or 'fc1' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                if MONITOR == 1:
                    return tf.Print(fw(v), [fw(v)],
                                    message='\n\n' + v.name +
                                    ', Quantized weights are:',
                                    summarize=100)
                else:
                    return fw(v)

        def activate(x):
            if BITA == 32:
                return tf.nn.relu(x)
            else:
                return fa(tf.nn.relu(x))

        with remap_variables(new_get_variable), \
             argscope(Conv2D, kernel_shape=3, use_bias=False, nl=tf.identity, out_channel=32):
            logits = (LinearWrap(image)
                      .apply(monitor, 'image_out')
                      .Conv2D('conv0')
                      .apply(fg).BatchNorm('bn0').apply(activate)
                      .apply(monitor, 'conv0_out')
                      .MaxPooling('pool0', 2)
                      .apply(monitor, 'pool0_out')
                      .Conv2D('conv1')
                      .apply(fg).BatchNorm('bn1').apply(activate)
                      .apply(monitor, 'conv1_out')
                      .Conv2D('conv2')
                      .apply(fg).BatchNorm('bn2').apply(activate)
                      .apply(monitor, 'conv2_out')
                      .MaxPooling('pool1', 2)
                      .apply(monitor, 'pool1_out')
                      .Conv2D('conv3')
                      .apply(fg).BatchNorm('bn3').apply(activate)
                      .apply(monitor, 'conv3_out')
                      .FullyConnected('fc0', use_bias=False, out_dim=20, nl=tf.identity)
                      .apply(activate)
                      .apply(monitor, 'fc0_out')
                      .FullyConnected('fc1', use_bias=False, out_dim=10, nl=tf.identity)
                      .apply(monitor, 'fc1_out')())

        prob = tf.nn.softmax(logits, name='prob')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = symbf.prediction_incorrect(logits, label, name='incorrect')
        accuracy = symbf.accuracy(logits, label, name='accuracy')

        train_error = tf.reduce_mean(wrong, name='train_error')
        summary.add_moving_summary(train_error, accuracy)

        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),
                              name='regularize_loss')
        self.cost = tf.add_n([wd_cost, cost], name='total_cost')
        summary.add_moving_summary(cost, wd_cost, self.cost)
Esempio n. 27
0
    def build_graph(self, image, label):
        image = image / 255.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 96, 12, strides=4, padding='VALID')
                      .apply(activate)
                      .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                      .apply(fg).BatchNorm('bn1')
                      .MaxPooling('pool1', 3, 2, padding='SAME')
                      .apply(activate)
                      .Conv2D('conv2', 384, 3)
                      .apply(fg).BatchNorm('bn2')
                      .MaxPooling('pool2', 3, 2, padding='SAME')
                      .apply(activate)
                      .Conv2D('conv3', 384, 3, split=2)
                      .apply(fg).BatchNorm('bn3')
                      .apply(activate)
                      .Conv2D('conv4', 256, 3, split=2)
                      .apply(fg).BatchNorm('bn4')
                      .MaxPooling('pool4', 3, 2, padding='VALID')
                      .apply(activate)
                      .FullyConnected('fc0', 4096)
                      .apply(fg).BatchNorm('bnfc0')
                      .apply(activate)
                      .FullyConnected('fc1', 4096, use_bias=False)
                      .apply(fg).BatchNorm('bnfc1')
                      .apply(nonlin)
                      .FullyConnected('fct', 1000, use_bias=True)())

        tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W',
                                  l2_regularizer(5e-6),
                                  name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        total_cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, total_cost)
        return total_cost
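regularize_cost('fc.*/W', l2_regularizer(5e-6)) applies L2 weight decay only to the fully connected kernels. A NumPy/regex sketch of the selection and the penalty (the variable names and shapes below are illustrative, and l2_regularizer is assumed to compute scale * 0.5 * ||w||^2):

import re
import numpy as np

def l2_weight_decay(named_weights, pattern=r'fc.*/W', scale=5e-6):
    """Sum of scale * 0.5 * ||w||^2 over variables whose name matches pattern."""
    total = 0.0
    for name, w in named_weights.items():
        if re.match(pattern, name):
            total += scale * 0.5 * np.sum(w ** 2)
    return total

weights = {
    'conv0/W': np.random.randn(12, 12, 3, 96),
    'fc0/W': np.random.randn(4096, 4096),
    'fct/W': np.random.randn(4096, 1000),
}
print(l2_weight_decay(weights))  # only fc0/W and fct/W contribute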
Esempio n. 28
0
    def build_graph(self, image, label):
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        image = image / 256.0

        with remap_variables(binarize_weight), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                      .MaxPooling('pool0', 2, padding='SAME')
                      .apply(activate)
                      # 18
                      .Conv2D('conv1', 64, 3, padding='SAME')
                      .apply(fg).BatchNorm('bn1').apply(activate)
                      .Conv2D('conv2', 64, 3, padding='SAME')
                      .apply(fg).BatchNorm('bn2')
                      .MaxPooling('pool1', 2, padding='SAME')
                      .apply(activate)
                      # 9
                      .Conv2D('conv3', 128, 3, padding='VALID')
                      .apply(fg).BatchNorm('bn3').apply(activate)
                      # 7
                      .Conv2D('conv4', 128, 3, padding='SAME')
                      .apply(fg).BatchNorm('bn4').apply(activate)
                      .Conv2D('conv5', 128, 3, padding='VALID')
                      .apply(fg).BatchNorm('bn5').apply(activate)
                      # 5
                      .Dropout(rate=0.5 if is_training else 0.0)
                      .Conv2D('conv6', 512, 5, padding='VALID')
                      .apply(fg).BatchNorm('bn6')
                      .apply(nonlin)
                      .FullyConnected('fc1', 10)())
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)),
                        tf.float32,
                        name='wrong_tensor')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        total_cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, total_cost)
        return total_cost
Esempio n. 29
0
def AlexNetCifar(image,
                 label,
                 scope,
                 is_training,
                 dataset='cifar',
                 reuse=False,
                 Distill=None,
                 bit_a=32,
                 bit_w=32,
                 bit_g=32):
    end_points = {}

    if scope == 'Teacher':
        with tf.variable_scope(scope):
            image = tf.pad(image, [[0, 0], [5, 5], [5, 5], [0, 0]])
            std = tf.contrib.layers.conv2d(image,
                                           64, [11, 11],
                                           1,
                                           scope='conv0',
                                           padding='VALID',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn0',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)

            std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME')
            std = tf.contrib.layers.conv2d(std,
                                           192, [5, 5],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn1',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)
            std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME')
            tf.add_to_collection('feat', std)

            std = tf.contrib.layers.conv2d(std,
                                           384, [3, 3],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn2',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)
            tf.add_to_collection('feat', std)

            std = tf.contrib.layers.conv2d(std,
                                           256, [3, 3],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn3',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)
            tf.add_to_collection('feat', std)

            std = tf.contrib.layers.conv2d(std,
                                           256, [3, 3],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn4',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)
            std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME')
            tf.add_to_collection('feat', std)

            fc = tf.layers.flatten(std, name='fc_flat')

            fc1 = tf.contrib.layers.fully_connected(fc,
                                                    4096,
                                                    scope='fc0',
                                                    trainable=True,
                                                    reuse=reuse)
            fc1 = tf.contrib.layers.batch_norm(fc1,
                                               scope='bn_fc0',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            fc1 = tf.nn.relu(fc1)
            fc2 = tf.contrib.layers.fully_connected(fc1,
                                                    4096,
                                                    scope='fc1',
                                                    trainable=True,
                                                    reuse=reuse)
            fc2 = tf.contrib.layers.batch_norm(fc2,
                                               scope='bn_fc1',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            fc2 = tf.nn.relu(fc2)
            logits = tf.contrib.layers.fully_connected(
                fc2,
                label.get_shape().as_list()[-1],
                scope='fct',
                trainable=True,
                reuse=reuse)

            end_points['Logits'] = logits
    else:
        fw, fa, fg = get_dorefa(bit_w, bit_a, bit_g)

        def new_get_variable(v):
            name = v.op.name
            # don't quantize the first and last layers
            if not name.endswith(
                    'weights') or 'conv0' in name or 'fct' in name:
                return v
            else:
                tf.logging.info("Quantizing weight {} at bits {}".format(
                    v.op.name, bit_w))
                return fw(v)

        def nonlin(x):
            if bit_a == 32:
                return tf.nn.relu(x)  # still use relu for 32-bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            tf.logging.info("Quantizing activations {} at bits {}".format(
                x.name, bit_a))
            return fa(nonlin(x))

        with tf.variable_scope(scope), remap_variables(new_get_variable):
            image = tf.pad(image, [[0, 0], [5, 5], [5, 5], [0, 0]])
            std = tf.contrib.layers.conv2d(image,
                                           64, [11, 11],
                                           1,
                                           scope='conv0',
                                           padding='VALID',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn0',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)

            std = activate(std)

            std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME')
            std = tf.contrib.layers.conv2d(std,
                                           192, [5, 5],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn1',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)

            std = activate(std)

            std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME')
            tf.add_to_collection('feat', std)

            std = tf.contrib.layers.conv2d(std,
                                           384, [3, 3],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn2',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)

            std = activate(std)
            tf.add_to_collection('feat', std)

            std = tf.contrib.layers.conv2d(std,
                                           256, [3, 3],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn3',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)

            std = activate(std)
            tf.add_to_collection('feat', std)

            std = tf.contrib.layers.conv2d(std,
                                           256, [3, 3],
                                           padding='SAME',
                                           trainable=True,
                                           reuse=reuse)
            std = tf.contrib.layers.batch_norm(std,
                                               scope='bn4',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            std = tf.nn.relu(std)

            std = activate(std)

            std = tf.layers.max_pooling2d(std, 2, strides=2, padding='SAME')
            tf.add_to_collection('feat', std)

            fc = tf.layers.flatten(std, name='fc_flat')

            fc1 = tf.contrib.layers.fully_connected(fc,
                                                    4096,
                                                    scope='fc0',
                                                    trainable=True,
                                                    reuse=reuse)
            fc1 = tf.contrib.layers.batch_norm(fc1,
                                               scope='bn_fc0',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            fc1 = tf.nn.relu(fc1)
            fc1 = activate(fc1)

            fc2 = tf.contrib.layers.fully_connected(fc1,
                                                    4096,
                                                    scope='fc1',
                                                    trainable=True,
                                                    reuse=reuse)
            fc2 = tf.contrib.layers.batch_norm(fc2,
                                               scope='bn_fc1',
                                               trainable=True,
                                               is_training=is_training,
                                               reuse=reuse)
            fc2 = tf.nn.relu(fc2)

            logits = tf.contrib.layers.fully_connected(
                fc2,
                label.get_shape().as_list()[-1],
                scope='fct',
                trainable=True,
                reuse=reuse)

            end_points['Logits'] = logits

    if Distill is not None:
        if Distill == 'DML':
            teacher_train = True
        else:
            is_training = False
            teacher_train = False
        with tf.variable_scope('Teacher'):
            with tf.contrib.framework.arg_scope(
                [tf.contrib.layers.conv2d, tf.contrib.layers.fully_connected],
                    variables_collections=[
                        tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher'
                    ]):
                with tf.contrib.framework.arg_scope(
                    [tf.contrib.layers.batch_norm],
                        variables_collections=[
                            tf.GraphKeys.GLOBAL_VARIABLES, 'Teacher'
                        ]):
                    std = tf.contrib.layers.conv2d(image,
                                                   64, [11, 11],
                                                   1,
                                                   scope='conv0',
                                                   padding='VALID',
                                                   trainable=True,
                                                   reuse=reuse)
                    std = tf.contrib.layers.batch_norm(std,
                                                       scope='bn0',
                                                       trainable=True,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    std = tf.nn.relu(std)

                    std = tf.layers.max_pooling2d(std,
                                                  2,
                                                  strides=2,
                                                  padding='SAME')
                    # tf.add_to_collection('feat', std)
                    std = tf.contrib.layers.conv2d(std,
                                                   192, [5, 5],
                                                   padding='SAME',
                                                   trainable=teacher_train,
                                                   reuse=reuse)
                    std = tf.contrib.layers.batch_norm(std,
                                                       scope='bn1',
                                                       trainable=teacher_train,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    std = tf.nn.relu(std)

                    std = tf.layers.max_pooling2d(std,
                                                  2,
                                                  strides=2,
                                                  padding='SAME')
                    tf.add_to_collection('feat', std)
                    std = tf.contrib.layers.conv2d(std,
                                                   384, [3, 3],
                                                   padding='SAME',
                                                   trainable=teacher_train,
                                                   reuse=reuse)
                    std = tf.contrib.layers.batch_norm(std,
                                                       scope='bn2',
                                                       trainable=teacher_train,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    std = tf.nn.relu(std)
                    tf.add_to_collection('feat', std)

                    std = tf.contrib.layers.conv2d(std,
                                                   256, [3, 3],
                                                   padding='SAME',
                                                   trainable=teacher_train,
                                                   reuse=reuse)
                    std = tf.contrib.layers.batch_norm(std,
                                                       scope='bn3',
                                                       trainable=teacher_train,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    std = tf.nn.relu(std)
                    tf.add_to_collection('feat', std)

                    std = tf.contrib.layers.conv2d(std,
                                                   256, [3, 3],
                                                   padding='SAME',
                                                   trainable=teacher_train,
                                                   reuse=reuse)
                    std = tf.contrib.layers.batch_norm(std,
                                                       scope='bn4',
                                                       trainable=teacher_train,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    std = tf.nn.relu(std)
                    std = tf.layers.max_pooling2d(std,
                                                  2,
                                                  strides=2,
                                                  padding='SAME')
                    tf.add_to_collection('feat', std)
                    fc_tch = tf.layers.flatten(std, name='fc_flat')

                    fc1 = tf.contrib.layers.fully_connected(
                        fc_tch,
                        4096,
                        scope='fc0',
                        trainable=teacher_train,
                        reuse=reuse)
                    fc1 = tf.contrib.layers.batch_norm(fc1,
                                                       scope='bn_fc0',
                                                       trainable=teacher_train,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    fc1 = tf.nn.relu(fc1)
                    fc2 = tf.contrib.layers.fully_connected(
                        fc1,
                        4096,
                        scope='fc1',
                        trainable=teacher_train,
                        reuse=reuse)
                    fc2 = tf.contrib.layers.batch_norm(fc2,
                                                       scope='bn_fc1',
                                                       trainable=teacher_train,
                                                       is_training=is_training,
                                                       reuse=reuse)
                    fc2 = tf.nn.relu(fc2)
                    logits_tch = tf.contrib.layers.fully_connected(
                        fc2,
                        label.get_shape().as_list()[-1],
                        scope='fct',
                        trainable=teacher_train,
                        reuse=reuse)

                    end_points['Logits_tch'] = logits_tch

        with tf.variable_scope('Distillation'):
            feats = tf.get_collection('feat')
            student_feats = feats[:len(feats) // 2]
            teacher_feats = feats[len(feats) // 2:]
            feats_noact = tf.get_collection('feat_noact')
            student_feats_noact = feats_noact[:len(feats_noact) // 2]
            teacher_feats_noact = feats_noact[len(feats_noact) // 2:]

            if Distill == 'Soft_logits':
                tf.add_to_collection(
                    'dist', Response.Soft_logits(logits, logits_tch, 3))
            elif Distill == 'DML':
                tf.add_to_collection('dist', Response.DML(logits, logits_tch))
            elif Distill == 'FT':
                tf.add_to_collection(
                    'dist',
                    Response.Factor_Transfer(student_feats_noact[-1],
                                             teacher_feats_noact[-1]))

            elif Distill == 'FitNet':
                tf.add_to_collection(
                    'dist', Multiple.FitNet(student_feats, teacher_feats))
            elif Distill == 'AT':
                tf.add_to_collection(
                    'dist',
                    Multiple.Attention_transfer(student_feats, teacher_feats))
            elif Distill == 'AB':
                tf.add_to_collection(
                    'dist',
                    Multiple.AB_distillation(student_feats, teacher_feats, 1.,
                                             3e-3))

            elif Distill == 'FSP':
                tf.add_to_collection('dist',
                                     Shared.FSP(student_feats, teacher_feats))
            elif Distill[:3] == 'KD-':
                tf.add_to_collection(
                    'dist',
                    Shared.KD_SVD(student_feats, teacher_feats, Distill[-3:]))

            elif Distill == 'RKD':
                tf.add_to_collection(
                    'dist', Relation.RKD(logits, logits_tch, l=[5e1, 1e2]))
            elif Distill == 'MHGD':
                tf.add_to_collection(
                    'dist', Relation.MHGD(student_feats, teacher_feats))
            elif Distill == 'MHGD-RKD':
                tf.add_to_collection(
                    'dist',
                    Relation.MHGD(student_feats, teacher_feats) +
                    Relation.RKD(logits, logits_tch, l=[5e1, 1e2]))
            elif Distill == 'MHGD-RKD-SVD':
                tf.add_to_collection(
                    'dist',
                    Relation.MHGD(student_feats, teacher_feats) +
                    Relation.RKD(logits, logits_tch, l=[5e1, 1e2]) +
                    Shared.KD_SVD(student_feats, teacher_feats, "SVD"))

    return end_points
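
Response.Soft_logits(logits, logits_tch, 3) is imported from elsewhere and not shown in this snippet; assuming it follows the usual Hinton-style knowledge-distillation loss with temperature T (some implementations also scale the result by T**2, which this sketch omits), the computation it is expected to perform looks roughly like:

import numpy as np

def softmax(x, T=1.0):
    z = x / T
    z = z - z.max(axis=1, keepdims=True)  # subtract the row max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def soft_logits_loss(student_logits, teacher_logits, T=3.0):
    # cross-entropy between temperature-softened teacher and student distributions
    p_t = softmax(teacher_logits, T)
    p_s = softmax(student_logits, T)
    return float(np.mean(-np.sum(p_t * np.log(p_s + 1e-8), axis=1)))

student = np.array([[1.0, 2.0, 0.5]])
teacher = np.array([[0.8, 2.5, 0.3]])
print(soft_logits_loss(student, teacher, T=3.0))
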
Esempio n. 30
0
    def build_graph(self, image, label):
        # get quantization function
        # quantize weights
        qw = quantize_weight(int(self.quantizer_config['BITW']),
                             self.quantizer_config['name'],
                             self.quantizer_config['W_opts'],
                             self.quantizer_config)
        # quantize activation
        if self.quantizer_config['BITA'] in ['32', 32]:
            qa = tf.identity
        else:
            qa = quantize_activation(int(self.quantizer_config['BITA']),
                                     self.quantizer_config['name'],
                                     self.quantizer_config)
        # quantize gradient
        qg = quantize_gradient(int(self.quantizer_config['BITG']))

        def new_get_variable(v):
            name = v.op.name
            # don't quantize first and last layer
            if not name.endswith('/W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return qw(v)

        def activate(x):
            return qa(self.activation(x))

        def resblock(x, channel, stride):
            def get_stem_full(x):
                return (LinearWrap(x)
                        .Conv2D('stem_conv_a', channel, 3)
                        .BatchNorm('stem_bn')
                        .apply(activate)
                        .Conv2D('stem_conv_b', channel, 3)())

            channel_mismatch = channel != x.get_shape().as_list()[3]
            if stride != 1 or channel_mismatch:
                if stride != 1:
                    x = AvgPooling('avgpool', x, stride, stride)
                x = BatchNorm('bn', x)
                x = activate(x)
                shortcut = Conv2D('shortcut', x, channel, 1)
                stem = get_stem_full(x)
            else:
                shortcut = x
                x = BatchNorm('bn', x)
                x = activate(x)
                stem = get_stem_full(x)
            return shortcut + stem

        def group(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1', reuse=tf.AUTO_REUSE):
                x = resblock(x, channel, stride)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    x = resblock(x, channel, 1)
            return x

        def resblock_idt(x, channel, stride, first):
            def get_r(x):
                if 'InferenceTower' in x.op.name:
                    idx = x.op.name.index('/')
                    n = x.op.name[idx + 1::]
                elif 'tower' in x.op.name:
                    idx = x.op.name.index('/')
                    n = x.op.name[idx + 1::]
                else:
                    n = x.op.name
                n0 = n.split('blk')[0]
                n1 = n0 + 'blk1/shortcut/maxW'
                n2 = n.split('/output')[0] + '/maxW'

                if int(
                        self.quantizer_config['BITW']
                ) != 32:  # and eval(self.quantizer_config['W_opts']['fix_max']):
                    n1 += '_stop_grad'
                    n2 += '_stop_grad'

                maxs = tf.get_collection('maxs')
                for tensor in maxs:
                    tn = tensor.op.name
                    if n1 == tn:
                        m1 = tensor
                    elif n2 == tn:
                        m2 = tensor

                r = m2 / m1

                temp = self.quantizer_config['mulR']

                if temp == '2R':
                    r2 = (1 / r) * (2.0**tf.floor(tf.log(r) / tf.log(2.0)))
                elif temp == 'R':
                    r2 = 1 / r

                return r2

            def get_stem_full(x):
                return (LinearWrap(x)
                        .Conv2D('stem_conv_a', channel, 1, strides=(1, 1))
                        .BatchNorm('stem_bn1')
                        .apply(activate)
                        .Conv2D('stem_conv_b', channel, 3, strides=(stride, stride))
                        .BatchNorm('stem_bn2')
                        .apply(activate)
                        .Conv2D('stem_conv_c', channel * 4, 1, strides=(1, 1))())

            #channel_mismatch = channel != x.get_shape().as_list()[3]
            #if stride != 1 or channel_mismatch:
            if first:
                #shortcut = tf.concat([x[::, 0::2, 0::2, ::], x[::, 1::2, 1::2, ::]], -1)
                x = BatchNorm('bn', x)
                x = activate(x)
                #if stride != 1:
                #    shortcut = Conv2D('shortcut', x, channel, 1, strides=(stride, stride))
                #else:
                #    shortcut = Conv2D('shortcut', x, channel, 1)
                shortcut = Conv2D('shortcut',
                                  x,
                                  channel * 4,
                                  1,
                                  strides=(stride, stride))
                stem = get_stem_full(x)
            else:
                shortcut = x
                x = BatchNorm('bn', x)
                x = activate(x)
                stem = get_stem_full(x)

            if self.quantizer_config['mulR'] in ['2R', 'R']:
                r = get_r(stem)
                stem = stem * r

            return shortcut + stem

        def group_v2(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1', reuse=tf.AUTO_REUSE):
                x = resblock_idt(x, channel, stride, True)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    x = resblock_idt(x, channel, 1, False)
            return x

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity,
                         kernel_initializer=tf.variance_scaling_initializer(scale=float(self.initializer_config['scale']),
                                                                            mode=self.initializer_config['mode'])):
            logits = (LinearWrap(image)
                      .Conv2D('conv1', 64, 7, strides=2)  # size=112
                      .MaxPooling('pool1', pool_size=3, strides=2, padding="SAME")  # size=56
                      #.BatchNorm('bn1')
                      #.apply(activate)
                      .apply(group_v2, 'res1', 64, 3, 1)  # size=56
                      .apply(group_v2, 'res2', 128, 4, 2)  # size=28
                      .apply(group_v2, 'res3', 256, 6, 2)  # size=14
                      .apply(group_v2, 'res4', 512, 3, 2)  # size=7
                      .BatchNorm('last_bn')
                      .apply(activate)
                      .GlobalAvgPooling('gap')
                      .FullyConnected('fct', self.nb_classes)())
        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        # regularization
        if self.regularizer_config['name'] not in [None, 'None']:
            reg_func = getattr(regularizers,
                               self.regularizer_config['name'])().get_func(
                                   self.regularizer_config,
                                   self.quantizer_config)
            reg_cost = tf.multiply(float(self.regularizer_config['lmbd']),
                                   regularize_cost('.*/W', reg_func),
                                   name='reg_cost')
            total_cost = tf.add_n([cost, reg_cost], name='total_cost')
        else:
            total_cost = cost

        # summary
        def add_summary(logits, cost):
            err_top1 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label,
                                                             1)),
                               tf.float32,
                               name='err_top1')
            add_moving_summary(
                tf.reduce_mean(err_top1, name='train_error_top1'))
            err_top5 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label,
                                                             5)),
                               tf.float32,
                               name='err_top5')
            add_moving_summary(
                tf.reduce_mean(err_top5, name='train_error_top5'))

            add_moving_summary(cost)
            add_param_summary(('.*/W', ['histogram']))  # monitor W

        add_summary(logits, cost)

        return total_cost
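
The mulR branch of resblock_idt multiplies the residual stem by a factor derived from the ratio r = m2 / m1 of the two maxW tensors collected in 'maxs': either the exact inverse 1/r ('R') or 1/r times the largest power of two not exceeding r ('2R'). A small NumPy illustration of that arithmetic (m1 and m2 are arbitrary stand-in values for the collected max tensors):

import numpy as np

def rescale_factor(m1, m2, mode='2R'):
    # mirrors get_r(): r = m2 / m1, returned as 1/r ('R') or as
    # (1/r) * 2**floor(log2(r)) ('2R')
    r = m2 / m1
    if mode == '2R':
        return (1.0 / r) * 2.0 ** np.floor(np.log2(r))
    if mode == 'R':
        return 1.0 / r
    raise ValueError(mode)

print(rescale_factor(1.0, 5.0, '2R'))  # (1/5) * 4 = 0.8
print(rescale_factor(1.0, 5.0, 'R'))   # 0.2
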
Esempio n. 31
0
    def _build_graph(self, inputs):
        image, label = inputs

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        old_get_variable = tf.get_variable

        def monitor(x, name):
            if MONITOR == 1:
                return tf.Print(x, [x],
                                message='\n\n' + name + ': ',
                                summarize=1000,
                                name=name)
            else:
                return x

        def new_get_variable(v):
            name = v.op.name
            if not name.endswith('W') or 'conv1_1' in name or 'fc8' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                if MONITOR == 1:
                    return tf.Print(fw(v), [fw(v)],
                                    message='\n\n' + v.name +
                                    ', Quantized weights are:',
                                    summarize=100)
                else:
                    return fw(v)

        def bn_activate(name, x):
            x = BatchNorm(name, x)
            x = monitor(x, name + '_noact_out')
            return activate(x)

        def activate(x):
            if BITA == 32:
                return tf.nn.relu(x)
            else:
                return fa(tf.nn.relu(x))

        # VGG 16
        with remap_variables(new_get_variable), \
             argscope(Conv2D, kernel_shape=3, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      .apply(monitor, 'image_out')
                      .Conv2D('conv1_1', 64).apply(fg).BatchNorm('bn1_1').apply(activate).apply(monitor, 'conv1_1_out')
                      .Conv2D('conv1_2', 64).apply(fg).BatchNorm('bn1_2').apply(activate).apply(monitor, 'conv1_2_out')
                      .MaxPooling('pool1', 2).apply(monitor, 'pool1_out')
                      # 112
                      .Conv2D('conv2_1', 128).apply(fg).BatchNorm('bn2_1').apply(activate).apply(monitor, 'conv2_1_out')
                      .Conv2D('conv2_2', 128).apply(fg).BatchNorm('bn2_2').apply(activate).apply(monitor, 'conv2_2_out')
                      .MaxPooling('pool2', 2).apply(monitor, 'pool2_out')
                      # 56
                      .Conv2D('conv3_1', 256).apply(fg).BatchNorm('bn3_1').apply(activate).apply(monitor, 'conv3_1_out')
                      .Conv2D('conv3_2', 256).apply(fg).BatchNorm('bn3_2').apply(activate).apply(monitor, 'conv3_2_out')
                      .Conv2D('conv3_3', 256).apply(fg).BatchNorm('bn3_3').apply(activate).apply(monitor, 'conv3_3_out')
                      .MaxPooling('pool3', 2).apply(monitor, 'pool3_out')
                      # 28
                      .Conv2D('conv4_1', 512).apply(fg).BatchNorm('bn4_1').apply(activate).apply(monitor, 'conv4_1_out')
                      .Conv2D('conv4_2', 512).apply(fg).BatchNorm('bn4_2').apply(activate).apply(monitor, 'conv4_2_out')
                      .Conv2D('conv4_3', 512).apply(fg).BatchNorm('bn4_3').apply(activate).apply(monitor, 'conv4_3_out')
                      .MaxPooling('pool4', 2).apply(monitor, 'pool4_out')
                      # 14
                      .Conv2D('conv5_1', 512).apply(fg).BatchNorm('bn5_1').apply(activate).apply(monitor, 'conv5_1_out')
                      .Conv2D('conv5_2', 512).apply(fg).BatchNorm('bn5_2').apply(activate).apply(monitor, 'conv5_2_out')
                      .Conv2D('conv5_3', 512).apply(fg).BatchNorm('bn5_3').apply(activate).apply(monitor, 'conv5_3_out')
                      .MaxPooling('pool5', 2).apply(monitor, 'pool5_out')
                      .FullyConnected('fc6', use_bias=False, out_dim=512).apply(activate).apply(monitor, 'fc6_out')
                      .FullyConnected('fc7', use_bias=False, out_dim=512).apply(activate).apply(monitor, 'fc7_out')
                      .FullyConnected('fc8', use_bias=False, out_dim=self.cifar_classnum, nl=tf.identity).apply(monitor, 'fc8_out')())

        prob = tf.nn.softmax(logits, name='prob')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = symbf.prediction_incorrect(logits, label, name='incorrect')
        accuracy = symbf.accuracy(logits, label, name='accuracy')

        train_error = tf.reduce_mean(wrong, name='train_error')
        summary.add_moving_summary(train_error, accuracy)

        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),
                              name='regularize_loss')
        self.cost = tf.add_n([wd_cost, cost], name='total_cost')
        summary.add_moving_summary(cost, wd_cost, self.cost)
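
get_dorefa(BITW, BITA, BITG) is imported rather than defined in this snippet. Assuming it follows the standard DoReFa-Net formulation, the forward pass of the weight quantizer fw and the activation quantizer fa can be sketched as below (the straight-through gradient estimator that makes these trainable is omitted, since plain NumPy has no notion of gradients):

import numpy as np

def quantize_k(x, k):
    # uniform k-bit quantization of values assumed to lie in [0, 1]
    n = float(2 ** k - 1)
    return np.round(x * n) / n

def dorefa_weight(w, k):
    # k-bit weight quantization, forward pass only
    if k == 32:
        return w
    if k == 1:
        return np.sign(w) * np.mean(np.abs(w))  # binary weights: sign times mean magnitude
    t = np.tanh(w)
    t = t / (2.0 * np.max(np.abs(t))) + 0.5     # map into [0, 1]
    return 2.0 * quantize_k(t, k) - 1.0         # back to [-1, 1]

def dorefa_activation(x, k):
    # activations reach fa already clipped to [0, 1] by nonlin() when BITA < 32
    return x if k == 32 else quantize_k(np.clip(x, 0.0, 1.0), k)

print(dorefa_weight(np.array([-0.7, -0.1, 0.05, 0.9]), 2))
print(dorefa_activation(np.array([0.0, 0.33, 0.8, 1.2]), 2))
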
Esempio n. 32
0
    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 256.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)
        old_get_variable = tf.get_variable

        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        def resblock(x, channel, stride):
            def get_stem_full(x):
                return (LinearWrap(x)
                        .Conv2D('c3x3a', channel, 3)
                        .BatchNorm('stembn')
                        .apply(activate)
                        .Conv2D('c3x3b', channel, 3)())
            channel_mismatch = channel != x.get_shape().as_list()[3]
            if stride != 1 or channel_mismatch or 'pool1' in x.name:
                # handling pool1 is to work around an architecture bug in our model
                if stride != 1 or 'pool1' in x.name:
                    x = AvgPooling('pool', x, stride, stride)
                x = BatchNorm('bn', x)
                x = activate(x)
                shortcut = Conv2D('shortcut', x, channel, 1)
                stem = get_stem_full(x)
            else:
                shortcut = x
                x = BatchNorm('bn', x)
                x = activate(x)
                stem = get_stem_full(x)
            return shortcut + stem

        def group(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1'):
                x = resblock(x, channel, stride)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i)):
                    x = resblock(x, channel, 1)
            return x

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      # use explicit padding here, because our training framework has
                      # different padding mechanisms from TensorFlow
                      .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]])
                      .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True)
                      .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC')
                      .MaxPooling('pool1', 3, 2, padding='VALID')
                      .apply(group, 'conv2', 64, 2, 1)
                      .apply(group, 'conv3', 128, 2, 2)
                      .apply(group, 'conv4', 256, 2, 2)
                      .apply(group, 'conv5', 512, 2, 2)
                      .BatchNorm('lastbn')
                      .apply(nonlin)
                      .GlobalAvgPooling('gap')
                      .tf.multiply(49)  # this is due to a bug in our model design
                      .FullyConnected('fct', 1000)())
        prob = tf.nn.softmax(logits, name='output')
        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
Esempio n. 33
0
    def _build_graph(self, inputs):
        input, label = inputs

        fw, fa, fg = get_dorefa(FLAGS.bit_w, FLAGS.bit_a, 32)

        old_get_variable = tf.get_variable

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v, FLAGS.force_quantization)

        def nonlin(x):
            if FLAGS.bit_a == 32 and not FLAGS.use_clip:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        activations = []
        with remap_variables(new_get_variable), \
                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
            curr_layer = LinearWrap(input)
            for i in range(FLAGS.n_layers):
                curr_layer = (curr_layer
                              .FullyConnected('fc' + str(i), FLAGS.state_size)
                              .LayerNorm('ln_fc' + str(i))
                              .apply(activate))
                activations.append(curr_layer.tensor())
                curr_layer = curr_layer.Dropout('dropout', FLAGS.dropout)
            logits = (curr_layer
                      .FullyConnected('fc' + str(FLAGS.n_layers), 256)
                      .LayerNorm('lnfc' + str(FLAGS.n_layers))
                      .apply(nonlin)
                      .FullyConnected('fct', self.n_spks, use_bias=True)())

        print_all_tf_vars()

        prob = tf.nn.softmax(logits, name='output')

        # used for validation accuracy of utterance
        identity_guesses = flatten(tf.argmax(prob, axis=1))
        uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
        idx_to_identity_with_most_votes = tf.argmax(count)
        chosen_identity = tf.gather(uniq_identities,
                                    idx_to_identity_with_most_votes)
        wrong = tf.expand_dims(tf.not_equal(chosen_identity,
                                            tf.cast(label[0], tf.int64)),
                               axis=0,
                               name='utt-wrong')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W',
                                  l2_regularizer(5e-6),
                                  name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)

        for activation in activations:
            add_activation_summary(activation)
            tf.summary.histogram(activation.name, activation)
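
The utt-wrong check above is a majority vote: each frame's argmax prediction counts as one vote and the utterance is assigned the most frequent identity. A NumPy sketch of the same idea with made-up per-frame logits:

import numpy as np

def utterance_vote(frame_logits):
    # majority vote over per-frame argmax predictions, as in the unique_with_counts/argmax code above
    frame_preds = np.argmax(frame_logits, axis=1)
    identities, counts = np.unique(frame_preds, return_counts=True)
    return int(identities[np.argmax(counts)])

frame_logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.2, 0.8]])
print(utterance_vote(frame_logits))  # 1: three of the four frames vote for speaker 1
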
Esempio n. 34
0
    def build_graph(self, image, label):
        # get quantization function
        # quantize weights
        qw = quantize_weight(int(self.quantizer_config['BITW']), self.quantizer_config['name'], self.quantizer_config['W_opts'], self.quantizer_config)
        # quantize activation
        if self.quantizer_config['BITA'] in ['32', 32]:
            qa = tf.identity
        else:
            qa = quantize_activation(int(self.quantizer_config['BITA']))
        # quantize gradient
        qg = quantize_gradient(int(self.quantizer_config['BITG']))

        def new_get_variable(v):
            name = v.op.name
            # don't quantize first and last layer
            if not name.endswith('/W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return qw(v)

        def activate(x):
            return qa(self.activation(x))

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity,
                         kernel_initializer=tf.variance_scaling_initializer(scale=float(self.initializer_config['scale']),
                                                                            mode=self.initializer_config['mode'])):
            logits = (LinearWrap(image)
                      .Conv2D('conv1', 96, 3)
                      .BatchNorm('bn1')
                      .apply(activate)
                      .Conv2D('conv2', 256, 3, padding='SAME', split=2)
                      .BatchNorm('bn2')
                      .apply(activate)
                      .MaxPooling('pool2', 2, 2, padding='VALID')  # size=16

                      .Conv2D('conv3', 384, 3)
                      .BatchNorm('bn3')
                      .apply(activate)
                      .MaxPooling('pool3', 2, 2, padding='VALID')  # size=8

                      .Conv2D('conv4', 384, 3, split=2)
                      .BatchNorm('bn4')
                      .apply(activate)

                      .Conv2D('conv5', 256, 3, split=2)
                      .BatchNorm('bn5')
                      .apply(activate)
                      .MaxPooling('pool5', 2, 2, padding='VALID')  # size=4

                      .FullyConnected('fc1', 4096, use_bias=False)
                      .BatchNorm('bnfc1')
                      .apply(activate)

                      .FullyConnected('fc2', 4096, use_bias=False)
                      .BatchNorm('bnfc2')
                      .apply(activate)

                      .FullyConnected('fct', self.nb_classes, use_bias=True)())
        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        # regularization
        if self.regularizer_config['name'] not in [None, 'None']:
            reg_func = getattr(regularizers, self.regularizer_config['name'])().get_func(self.regularizer_config)
            reg_cost = tf.multiply(float(self.regularizer_config['lmbd']), regularize_cost('.*/W', reg_func), name='reg_cost')
            total_cost = tf.add_n([cost, reg_cost], name='total_cost')
        else:
            total_cost = cost

        # summary
        def add_summary(logits, cost):
            err_top1 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='err_top1')
            add_moving_summary(tf.reduce_mean(err_top1, name='train_error_top1'))
            err_top5 = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 5)), tf.float32, name='err_top5')
            add_moving_summary(tf.reduce_mean(err_top5, name='train_error_top5'))

            add_moving_summary(cost)
            add_param_summary(('.*/W', ['histogram']))  # monitor W
        add_summary(logits, cost)

        return total_cost
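
A pattern shared by several of these examples is the filter inside new_get_variable: only convolution and fully-connected kernels (variable names ending in /W or weights) are quantized, and the first and last layers are always left at full precision. A standalone sketch of that name-based filter (the layer names passed in are illustrative, not taken from any one example):

def should_quantize(var_name, first_layer='conv1', last_layer='fct'):
    # True when the variable is a conv/FC kernel that is not in a protected layer
    is_kernel = var_name.endswith('/W') or var_name.endswith('weights')
    is_protected = first_layer in var_name or last_layer in var_name
    return is_kernel and not is_protected

for name in ['conv1/W', 'res1blk1/stem_conv_a/W', 'bnfc1/beta', 'fct/W']:
    print(name, should_quantize(name))
# conv1/W False, res1blk1/stem_conv_a/W True, bnfc1/beta False, fct/W False
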