Example #1
    def _build_graph(self, inputs):
        input, label = inputs

        fw, fa, fg = get_dorefa(FLAGS.bit_w, FLAGS.bit_a, 32)

        old_get_variable = tf.get_variable

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'fc0' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if FLAGS.bit_a == 32 and not FLAGS.use_clip:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        activations = []
        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
            curr_layer = LinearWrap(input)
            for i in range(FLAGS.n_layers):
                curr_layer = (curr_layer.FullyConnected(
                    'fc' + str(i), FLAGS.state_size).apply(activate))
                activations.append(curr_layer.tensor())
                curr_layer = (curr_layer.BatchNorm('bn_fc' + str(i)).Dropout(
                    'dropout', FLAGS.dropout))
            logits = (curr_layer.FullyConnected(
                'fc' + str(FLAGS.n_layers), 256).apply(nonlin).BatchNorm(
                    'bnfc' + str(FLAGS.n_layers)).FullyConnected(
                        'fct', self.n_spks, use_bias=True)())

        print_all_tf_vars()

        prob = tf.nn.softmax(logits, name='output')

        # used for validation accuracy of utterance
        identity_guesses = flatten(tf.argmax(prob, axis=1))
        uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
        idx_to_identity_with_most_votes = tf.argmax(count)
        chosen_identity = tf.gather(uniq_identities,
                                    idx_to_identity_with_most_votes)
        wrong = tf.expand_dims(tf.not_equal(chosen_identity,
                                            tf.cast(label[0], tf.int64)),
                               axis=0,
                               name='utt-wrong')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W',
                                  l2_regularizer(5e-6),
                                  name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)

        for activation in activations:
            add_activation_summary(activation)
            tf.summary.histogram(activation.name, activation)
Example #2
    def build_graph(self, image, label):
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))

        image = image / 256.0

        with remap_variables(binarize_weight), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            logits = (
                LinearWrap(image).Conv2D('conv0',
                                         48,
                                         5,
                                         padding='VALID',
                                         use_bias=True).MaxPooling(
                                             'pool0', 2,
                                             padding='SAME').apply(activate)
                # 18
                .Conv2D('conv1', 64, 3, padding='SAME').apply(fg).BatchNorm(
                    'bn1').apply(activate).Conv2D(
                        'conv2', 64, 3,
                        padding='SAME').apply(fg).BatchNorm('bn2').MaxPooling(
                            'pool1', 2, padding='SAME').apply(activate)
                # 9
                .Conv2D(
                    'conv3', 128, 3,
                    padding='VALID').apply(fg).BatchNorm('bn3').apply(activate)
                # 7
                .Conv2D('conv4', 128, 3, padding='SAME').apply(fg).
                BatchNorm('bn4').apply(activate).Conv2D(
                    'conv5', 128, 3,
                    padding='VALID').apply(fg).BatchNorm('bn5').apply(activate)
                # 5
                .tf.nn.dropout(0.5 if is_training else 1.0).Conv2D(
                    'conv6', 512, 5, padding='VALID').apply(fg).BatchNorm(
                        'bn6').apply(cabs).FullyConnected('fc1', 10)())
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = prediction_incorrect(logits, label)
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        total_cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, total_cost)
        return total_cost
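
Note: every example in this listing follows the same recipe: get_dorefa(BITW, BITA, BITG) from tensorpack's DoReFa-Net example returns three functions fw, fa, fg that quantize weights, activations, and gradients to the requested bit widths. A minimal standalone sketch of that recipe (shapes and bit widths below are made up; it assumes tensorpack's dorefa.py is importable):

import tensorflow as tf
from dorefa import get_dorefa

fw, fa, fg = get_dorefa(1, 2, 32)     # 1-bit weights, 2-bit activations, full-precision gradients

w = tf.get_variable('W', [3, 3, 16, 32])
w_q = fw(w)                           # quantized weights used in the forward pass

x = tf.placeholder(tf.float32, [None, 28, 28, 16])
a = tf.clip_by_value(x, 0.0, 1.0)     # fa expects activations already squashed into [0, 1]
a_q = fa(a)                           # quantized activations
g_q = fg(a_q)                         # identity in the forward pass; quantizes gradients on the way back
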
Example #3
    def _build_graph(self, inputs):
        image, label = inputs
        """Add a single channel here"""
        image = tf.expand_dims(image, 3)

        image = image * 256
        image = tf.round(image)

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        old_get_variable = tf.get_variable

        def monitor(x, name):
            if MONITOR == 1:
                return tf.Print(x, [x],
                                message='\n\n' + name + ': ',
                                summarize=1000,
                                name=name)
            else:
                return x

        def new_get_variable(v):
            name = v.op.name
            if not name.endswith('W') or 'conv0' in name or 'fc1' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                if MONITOR == 1:
                    return tf.Print(fw(v), [fw(v)],
                                    message='\n\n' + v.name +
                                    ', Quantized weights are:',
                                    summarize=100)
                else:
                    return fw(v)

        def activate(x):
            if BITA == 32:
                return tf.nn.relu(x)
            else:
                return fa(tf.nn.relu(x))

        with remap_variables(new_get_variable), \
             argscope(Conv2D, kernel_shape=3, use_bias=False, nl=tf.identity, out_channel=32):
            logits = (LinearWrap(image).apply(monitor, 'image_out').Conv2D(
                'conv0').apply(fg).BatchNorm('bn0').apply(activate).apply(
                    monitor, 'conv0_out').MaxPooling('pool0', 2).apply(
                        monitor, 'pool0_out').Conv2D('conv1').apply(
                            fg).BatchNorm('bn1').apply(activate).apply(
                                monitor, 'conv1_out').Conv2D('conv2').apply(
                                    fg).BatchNorm('bn2').apply(activate).apply(
                                        monitor, 'conv2_out').MaxPooling(
                                            'pool1', 2).apply(
                                                monitor,
                                                'pool1_out').Conv2D('conv3').
                      apply(fg).BatchNorm('bn3').apply(activate).apply(
                          monitor, 'conv3_out').FullyConnected(
                              'fc0',
                              use_bias=False,
                              out_dim=20,
                              nl=tf.identity).apply(activate).apply(
                                  monitor, 'fc0_out').FullyConnected(
                                      'fc1',
                                      use_bias=False,
                                      out_dim=10,
                                      nl=tf.identity).apply(
                                          monitor, 'fc1_out')())

        prob = tf.nn.softmax(logits, name='prob')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = symbf.prediction_incorrect(logits, label, name='incorrect')
        accuracy = symbf.accuracy(logits, label, name='accuracy')

        train_error = tf.reduce_mean(wrong, name='train_error')
        summary.add_moving_summary(train_error, accuracy)

        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),
                              name='regularize_loss')
        self.cost = tf.add_n([wd_cost, cost], name='total_cost')
        summary.add_moving_summary(cost, wd_cost, self.cost)
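
The tensorpack remap_variables context used in Examples #1, #3, #6 and others routes every variable returned by tf.get_variable through a user-supplied function (here the fw-based quantizer). The same idea can be approximated in plain TF 1.x with a custom getter; a rough, hypothetical sketch of the concept (not tensorpack's actual implementation):

def quantizing_getter(getter, *args, **kwargs):
    v = getter(*args, **kwargs)        # create (or reuse) the real variable
    return new_get_variable(v)         # e.g. the remap function defined in the examples above

with tf.variable_scope(tf.get_variable_scope(), custom_getter=quantizing_getter):
    pass  # layers built here get their variables routed through quantizing_getter
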
Example #4
    def _build_graph(self, input_vars, is_training):
        image, label = input_vars

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)
        # monkey-patch tf.get_variable to apply fw
        old_get_variable = tf.get_variable

        def new_get_variable(name, shape=None, **kwargs):
            v = old_get_variable(name, shape, **kwargs)
            # don't binarize first and last layer
            if name != 'W' or 'conv0' in v.op.name or 'fc' in v.op.name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        tf.get_variable = new_get_variable

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))

        image = image / 256.0

        with argscope(BatchNorm, decay=0.9, epsilon=1e-4, use_local_stat=is_training), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (
                LinearWrap(image).Conv2D('conv0',
                                         48,
                                         5,
                                         padding='VALID',
                                         use_bias=True).MaxPooling(
                                             'pool0', 2,
                                             padding='SAME').apply(activate)
                # 18
                .Conv2D('conv1', 64, 3, padding='SAME').apply(fg).BatchNorm(
                    'bn1').apply(activate).Conv2D(
                        'conv2', 64, 3,
                        padding='SAME').apply(fg).BatchNorm('bn2').MaxPooling(
                            'pool1', 2, padding='SAME').apply(activate)
                # 9
                .Conv2D(
                    'conv3', 128, 3,
                    padding='VALID').apply(fg).BatchNorm('bn3').apply(activate)
                # 7
                .Conv2D('conv4', 128, 3, padding='SAME').apply(fg).
                BatchNorm('bn4').apply(activate).Conv2D(
                    'conv5', 128, 3,
                    padding='VALID').apply(fg).BatchNorm('bn5').apply(activate)
                # 5
                .tf.nn.dropout(0.5 if is_training else 1.0).Conv2D(
                    'conv6', 512, 5, padding='VALID').apply(fg).BatchNorm(
                        'bn6').apply(cabs).FullyConnected('fc1',
                                                          10,
                                                          nl=tf.identity)())
        tf.get_variable = old_get_variable
        prob = tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = prediction_incorrect(logits, label)
        nr_wrong = tf.reduce_sum(wrong, name='wrong')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
        add_moving_summary(cost, wd_cost)

        add_param_summary([('.*/W', ['histogram', 'rms'])])
        self.cost = tf.add_n([cost, wd_cost], name='cost')
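
The older examples in this listing (#4, #5, #12, #14) monkey-patch tf.get_variable directly and restore it by hand after the graph is built. A hedged sketch of a slightly safer variant of that pattern, which restores the original getter even if graph construction raises (this helper is not part of the original code):

import contextlib
import tensorflow as tf

@contextlib.contextmanager
def patched_get_variable(replacement):
    """Temporarily replace tf.get_variable with `replacement`, restoring it afterwards."""
    original = tf.get_variable
    tf.get_variable = replacement
    try:
        yield
    finally:
        tf.get_variable = original
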
Example #5
    def _build_graph(self, input_vars, is_training):
        image, label = input_vars
        image = image / 255.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)
        # monkey-patch tf.get_variable to apply fw
        old_get_variable = tf.get_variable
        def new_get_variable(name, shape=None, **kwargs):
            v = old_get_variable(name, shape, **kwargs)
            # don't binarize first and last layer
            if name != 'W' or 'conv0' in v.op.name or 'fct' in v.op.name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)
        tf.get_variable = new_get_variable

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)    # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with argscope(BatchNorm, decay=0.9, epsilon=1e-4, use_local_stat=is_training), \
                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                .Conv2D('conv0', 96, 12, stride=4, padding='VALID')
                .apply(activate)

                .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                .apply(fg)
                .BatchNorm('bn1')
                .MaxPooling('pool1', 3, 2, padding='SAME')
                .apply(activate)

                .Conv2D('conv2', 384, 3)
                .apply(fg)
                .BatchNorm('bn2')
                .MaxPooling('pool2', 3, 2, padding='SAME')
                .apply(activate)

                .Conv2D('conv3', 384, 3, split=2)
                .apply(fg)
                .BatchNorm('bn3')
                .apply(activate)

                .Conv2D('conv4', 256, 3, split=2)
                .apply(fg)
                .BatchNorm('bn4')
                .MaxPooling('pool4', 3, 2, padding='VALID')
                .apply(activate)

                .FullyConnected('fc0', 4096)
                .apply(fg)
                .BatchNorm('bnfc0')
                .apply(activate)

                .FullyConnected('fc1', 4096)
                .apply(fg)
                .BatchNorm('bnfc1')
                .apply(nonlin)
                .FullyConnected('fct', 1000, use_bias=True)())
        tf.get_variable = old_get_variable

        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1)
        nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train_error_top1'))
        wrong = prediction_incorrect(logits, label, 5)
        nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
        add_moving_summary(tf.reduce_mean(wrong, name='train_error_top5'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
        add_moving_summary(cost, wd_cost)

        add_param_summary([('.*/W', ['histogram', 'rms'])])
        self.cost = tf.add_n([cost, wd_cost], name='cost')
Example #6
    def build_graph(self, image, label):
        """This function should build the model which takes the input variables (defined above)
        and return cost at the end."""

        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'fc0' in name or 'fc_out' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)
            # FIXME: return tf.clip_by_value(x, 0.0, 1.0)
            return tf.clip_by_value(x, -1.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        # The context manager `argscope` sets the default option for all the layers under
        # this context. Here we use 32 channel convolution with shape 3x3
        # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html
        with remap_variables(binarize_weight), \
                argscope(FullyConnected, use_bias=False), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4):
            # LinearWrap is just a syntax sugar.
            # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html
            logits = (
                LinearWrap(image).Dropout('dropout_in',
                                          rate=0.2 if is_training else 0.0)
                # hidden 0
                .FullyConnected(
                    'fc0', n_units).BatchNorm('bn0').apply(activate).Dropout(
                        'dropout_hidden0', rate=0.5 if is_training else 0.0)
                # hidden 1
                .FullyConnected(
                    'fc1', n_units).BatchNorm('bn1').apply(activate).Dropout(
                        'dropout_hidden1', rate=0.5 if is_training else 0.0)
                # hidden 2
                .FullyConnected(
                    'fc2', n_units).BatchNorm('bn2').apply(activate).Dropout(
                        'dropout_hidden2', rate=0.5 if is_training else 0.0)
                # output layer
                .FullyConnected('fc_out', 10, activation=tf.identity)())

        # a vector of length B with loss of each sample
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(
            cost, name='cross_entropy_loss')  # the average cross-entropy loss

        correct = tf.cast(tf.nn.in_top_k(predictions=logits,
                                         targets=label,
                                         k=1),
                          tf.float32,
                          name='correct')
        accuracy = tf.reduce_mean(correct, name='accuracy')

        # This will monitor training error & accuracy (in a moving average fashion). The value will be automatically
        # 1. written to tensorboard
        # 2. written to stat.json
        # 3. printed after each epoch
        # You can also just call `tf.summary.scalar`. But moving summary has some other benefits.
        # See tutorial at https://tensorpack.readthedocs.io/tutorial/summary.html
        train_error = tf.reduce_mean(1 - correct, name='train_error')
        summary.add_moving_summary(train_error, accuracy)

        # Use a regex to find parameters to apply weight decay.
        # Here we apply a weight decay on all W (weight matrix) of all fc layers
        # If you don't like regex, you can certainly define the cost in any other methods.
        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),
                              name='regularize_loss')
        total_cost = tf.add_n([wd_cost, cost], name='total_cost')
        summary.add_moving_summary(cost, wd_cost, total_cost)

        # monitor histogram of all weight (of conv and fc layers) in tensorboard
        summary.add_param_summary(('.*/W', ['histogram', 'rms']))
        # the function should return the total cost to be optimized
        return total_cost
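
In all of these models, activate(x) = fa(nonlin(x)) only works because nonlin (or cabs) first squashes the activation into a bounded range: DoReFa's k-bit activation quantizer is a uniform quantizer on [0, 1]. A minimal sketch of that quantizer as described in the DoReFa-Net paper (forward pass only; the straight-through gradient estimator used by the real fa is omitted):

def quantize_k(x, k):
    # x is assumed to already lie in [0, 1]; k == 32 is normally passed through unchanged
    n = float(2 ** k - 1)
    return tf.round(x * n) / n
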
Example #7
    def _conv_bn_layer(self,
                       layer_name,
                       inputs,
                       filters,
                       size,
                       stride,
                       padding='SAME',
                       use_bias=False,
                       freeze=False,
                       xavier=False,
                       relu=True,
                       activation_fn=tf.nn.relu,
                       stddev=0.001,
                       kernel_name='kernels',
                       bias_name='biases'):
        """Convolutional layer operation constructor.

    Args:
      layer_name: layer name.
      inputs: input tensor
      filters: number of output filters.
      size: kernel size.
      stride: stride of the convolution.
      padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description.
      use_bias: whether to add a bias term after the convolution.
      freeze: if true, then do not train the parameters in this layer.
      xavier: whether to use xavier weight initializer or not.
      relu: whether to apply the activation function or not.
      activation_fn: activation function applied when `relu` is true.
      stddev: standard deviation used for random weight initializer.
      kernel_name: variable name for the convolution kernel.
      bias_name: variable name for the bias.
    Returns:
      A convolutional layer operation.
    """

        mc = self.mc
        use_pretrained_param = False
        if mc.LOAD_PRETRAINED_MODEL:
            cw = self.caffemodel_weight
            if layer_name in cw:
                kernel_val = np.transpose(cw[layer_name][0], [2, 3, 1, 0])
                bias_val = cw[layer_name][1]
                # check the shape
                if (kernel_val.shape ==
                      (size, size, inputs.get_shape().as_list()[-1], filters)) \
                   and (bias_val.shape == (filters, )):
                    use_pretrained_param = True
                else:
                    print(
                        'Shape of the pretrained parameter of {} does not match, '
                        'use randomly initialized parameter'.format(
                            layer_name))
            else:
                print(
                    'Cannot find {} in the pretrained model. Use randomly initialized '
                    'parameters'.format(layer_name))

        if mc.DEBUG_MODE:
            print('Input tensor shape to {}: {}'.format(
                layer_name, inputs.get_shape()))

        with tf.variable_scope(layer_name) as scope:
            channels = inputs.get_shape()[3]

            # re-order the caffe kernel with shape [out, in, h, w] -> tf kernel with
            # shape [h, w, in, out]
            if use_pretrained_param:
                if mc.DEBUG_MODE:
                    print('Using pretrained model for {}'.format(layer_name))
                kernel_init = tf.constant(kernel_val, dtype=tf.float32)
                bias_init = tf.constant(bias_val, dtype=tf.float32)
            elif xavier:
                kernel_init = tf.contrib.layers.xavier_initializer_conv2d()
                bias_init = tf.constant_initializer(0.0)
            else:
                kernel_init = tf.truncated_normal_initializer(stddev=stddev,
                                                              dtype=tf.float32)
                bias_init = tf.constant_initializer(0.0)

            kernel = _variable_with_weight_decay(
                kernel_name,
                shape=[size, size, int(channels), filters],
                wd=mc.WEIGHT_DECAY,
                initializer=kernel_init,
                trainable=(not freeze))

            #kernel_binary = binarize(kernel)
            if use_bias:
                biases = _variable_on_device(bias_name, [filters],
                                             bias_init,
                                             trainable=(not freeze))
                self.model_params += [kernel, biases]

            if mc.bDoreFa:
                fw, fa, fg = get_dorefa(mc.BITW, mc.BITA, mc.BITG)
                kernel = fw(kernel)
                if mc.BITA != 32:
                    #inputs = tf.clip_by_value(inputs, 0.0, 1.0)
                    inputs = inputs / tf.reduce_max(inputs)
                    inputs = fa(inputs)

            if mc.bQuant and mc.bQuantWeights:
                kernel = self._quant_kernel_v1(mc, kernel)

            if mc.bQuant and mc.bQuantActivations:
                inputs = self._quant_activations(mc, inputs)

            conv = tf.nn.conv2d(inputs,
                                kernel, [1, stride, stride, 1],
                                padding=padding,
                                name='convolution')

            if use_bias:
                out0 = tf.nn.bias_add(conv, biases, name='bias_add')
            else:
                out0 = conv

            out0 = slim.batch_norm(out0, scope='BatchNorm')
            if relu:
                out = activation_fn(out0, 'relu')
            else:
                out = out0

            self.model_size_counter.append(
                (layer_name, (1 + size * size * int(channels)) * filters))
            out_shape = out.get_shape().as_list()
            num_flops = \
              (1+2*int(channels)*size*size)*filters*out_shape[1]*out_shape[2]
            if relu:
                num_flops += 2 * filters * out_shape[1] * out_shape[2]
            self.flop_counter.append((layer_name, num_flops))

            self.activation_counter.append(
                (layer_name, out_shape[1] * out_shape[2] * out_shape[3]))

            return out
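
Example #7 calls a _variable_with_weight_decay helper whose definition is not shown in this snippet. For context, a hypothetical sketch of what such a helper commonly looks like (modeled on the classic TensorFlow CIFAR-10 tutorial; the actual implementation in the source repository may differ):

def _variable_with_weight_decay(name, shape, wd, initializer, trainable=True):
    # create the variable, then optionally attach an L2 penalty to a 'losses' collection
    var = tf.get_variable(name, shape, dtype=tf.float32,
                          initializer=initializer, trainable=trainable)
    if wd is not None and wd > 0:
        weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
    return var
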
Example #8
    def build_graph(self, image, label):
        image = image / 256.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        def resblock(x, channel, stride):
            def get_stem_full(x):
                return (LinearWrap(x).Conv2D(
                    'c3x3a', channel,
                    3).BatchNorm('stembn').apply(activate).Conv2D(
                        'c3x3b', channel, 3)())

            channel_mismatch = channel != x.get_shape().as_list()[3]
            if stride != 1 or channel_mismatch or 'pool1' in x.name:
                # handling pool1 is to work around an architecture bug in our model
                if stride != 1 or 'pool1' in x.name:
                    x = AvgPooling('pool', x, stride, stride)
                x = BatchNorm('bn', x)
                x = activate(x)
                shortcut = Conv2D('shortcut', x, channel, 1)
                stem = get_stem_full(x)
            else:
                shortcut = x
                x = BatchNorm('bn', x)
                x = activate(x)
                stem = get_stem_full(x)
            return shortcut + stem

        def group(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1'):
                x = resblock(x, channel, stride)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i)):
                    x = resblock(x, channel, 1)
            return x

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (
                LinearWrap(image)
                # use explicit padding here, because our private training framework has
                # different padding mechanisms from TensorFlow
                .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]]).Conv2D(
                    'conv1', 64, 7, stride=2, padding='VALID',
                    use_bias=True).tf.pad(
                        [[0, 0], [1, 1], [1, 1], [0, 0]],
                        'SYMMETRIC').MaxPooling(
                            'pool1',
                            3, 2, padding='VALID').apply(
                                group, 'conv2', 64,
                                2, 1).apply(group, 'conv3', 128, 2, 2).apply(
                                    group, 'conv4', 256, 2,
                                    2).apply(group, 'conv5', 512, 2,
                                             2).BatchNorm('lastbn').
                apply(nonlin).GlobalAvgPooling('gap').tf.multiply(
                    49)  # this is due to a bug in our model design
                .FullyConnected('fct', 1000)())
        tf.nn.softmax(logits, name='output')
        ImageNetModel.compute_loss_and_error(logits, label)
Example #9
    def _build_graph(self, inputs):
        image, label = inputs
        image = tf.expand_dims(image, 3)
        image = image * 2 - 1  # center the pixels values at zero

        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))


        with remap_variables(binarize_weight), \
                argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
            logits = (LinearWrap(image).Conv2D('conv0').MaxPooling(
                'pool0', 2).apply(activate).Conv2D('conv1').apply(fg).Conv2D(
                    'conv2').apply(fg).MaxPooling('pool1', 2).apply(activate).
                      Conv2D('conv3').apply(fg).apply(cabs).FullyConnected(
                          'fc0', 512, activation=tf.nn.relu).Dropout(
                              'dropout',
                              0.5).FullyConnected('fc1',
                                                  10,
                                                  activation=tf.identity)())

        tf.nn.softmax(logits, name='output')

        # a vector of length B with loss of each sample
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(
            cost, name='cross_entropy_loss')  # the average cross-entropy loss

        correct = tf.cast(tf.nn.in_top_k(logits, label, 1),
                          tf.float32,
                          name='correct')
        accuracy = tf.reduce_mean(correct, name='accuracy')

        # This will monitor training error (in a moving_average fashion):
        # 1. write the value to tensorboard
        # 2. write the value to stat.json
        # 3. print the value after each epoch
        train_error = tf.reduce_mean(1 - correct, name='train_error')
        summary.add_moving_summary(train_error, accuracy)

        # Use a regex to find parameters to apply weight decay.
        # Here we apply a weight decay on all W (weight matrix) of all fc layers
        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),
                              name='regularize_loss')
        self.cost = tf.add_n([wd_cost, cost], name='total_cost')
        summary.add_moving_summary(cost, wd_cost, self.cost)

        # monitor histogram of all weight (of conv and fc layers) in tensorboard
        summary.add_param_summary(('.*/W', ['histogram', 'rms']))
Example #10
import tensorflow as tf
import math
from tensorflow.python.training import moving_averages
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.framework import ops
from dorefa import get_dorefa

BITW = 1
BITA = 3
BITG = 32
fw, fa, fg = get_dorefa(BITW, BITA, BITG)


def cabs(x):
    return tf.minimum(1.0, tf.abs(x), name='cabs')


def DoReFa_Convolution_w(nOutputPlane,
                         kW,
                         kH,
                         dW=1,
                         dH=1,
                         padding='VALID',
                         bias=True,
                         reuse=None,
                         name='DoReFa_Convolution_w'):
    def b_conv2d(x, is_training=True):
        nInputPlane = x.get_shape().as_list()[3]
        with tf.variable_op_scope([x], None, name, reuse=reuse):
            w = tf.get_variable(
                'weight', [kH, kW, nInputPlane, nOutputPlane],
Example #11
    def _build_graph(self, inputs):
        inp, label = inputs
        is_training = get_current_tower_context().is_training

        fw, fa = get_dorefa(BITW, BITA)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            if not (name.endswith('W') or name.endswith('b')
                    ) or 'linear0' in name or 'last_linear' in name:
                print("Not quantizing", name)
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x, name="activate"):
            return fa(tf.clip_by_value(BNWithTrackedMults(x), 0.0, 1.0))

        with remap_variables(binarize_weight), \
                argscope([FullyConnectedWithTrackedMults], network_complexity=self.network_complexity), \
                argscope([BNReLUWithTrackedMults], network_complexity=self.network_complexity), \
                argscope([BNWithTrackedMults], network_complexity=self.network_complexity), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4):
            l = self.net_fn(inp, nonlin, self.n_context)
            logits = FullyConnectedWithTrackedMults('last_linear',
                                                    l,
                                                    out_dim=self.n_spks,
                                                    nl=tf.identity)

        prob = tf.nn.softmax(logits, name='output')

        # used for validation accuracy of utterance
        identity_guesses = flatten(tf.argmax(prob, axis=1))
        uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
        idx_to_identity_with_most_votes = tf.argmax(count)
        chosen_identity = tf.gather(uniq_identities,
                                    idx_to_identity_with_most_votes)
        wrong = tf.expand_dims(tf.not_equal(chosen_identity,
                                            tf.cast(label[0], tf.int64)),
                               axis=0,
                               name='utt-wrong')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        add_moving_summary(cost)

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

        with tf.name_scope('original-weight-summaries'):
            add_param_summary(('.*/W', ['rms', 'histogram']))
            add_param_summary(('.*/b', ['rms', 'histogram']))

        with tf.name_scope('activation-summaries'):

            def fn(name):
                return (
                    name.endswith('output') or name.endswith('output:0')
                ) and "Inference" not in name and 'quantized' not in name

            tensors = get_tensors_from_graph(tf.get_default_graph(), fn)
            print("Adding activation tensors to summary:", tensors)
            for tensor in tensors:
                add_tensor_summary(tensor, ['rms', 'histogram'])

        if self.regularize:
            # decreasing regularization on all W of fc layers
            wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
                                              480000, 0.2, True)
            wd_cost = tf.multiply(wd_w,
                                  regularize_cost('.*/W', tf.nn.l2_loss),
                                  name='wd_cost')
            add_moving_summary(wd_cost)
            self.cost = tf.add_n([cost, wd_cost], name='cost')
        else:
            self.cost = tf.identity(cost, name='cost')

        tf.constant([self.network_complexity['mults']], name='TotalMults')
        tf.constant([self.network_complexity['weights']], name='TotalWeights')
        logger.info("Parameter count: {}".format(self.network_complexity))
Example #12
    def _build_graph(self, input_vars, is_training):
        is_training = bool(is_training)
        keep_prob = tf.constant(0.5 if is_training else 1.0)

        image, label = input_vars
        image = tf.expand_dims(image, 3)  # add a single channel

        fw, fa, fg = get_dorefa(1, 2, 7)
        # monkey-patch tf.get_variable to apply fw
        old_get_variable = tf.get_variable  # keep the original for the weight update

        nl = PReLU.f
        image = image * 2 - 1

        def new_get_variable(name, shape=None, **kwargs):
            v = old_get_variable(name, shape, **kwargs)
            # don't binarize first and last layer
            if name != 'W' or 'conv0' in v.op.name or 'fct' in v.op.name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        tf.get_variable = new_get_variable

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))  # clip the output of the activation function


        with argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
            argscope(Conv2D, kernel_shape=3, nl=nl, out_channel=32):
            logits = (
                LinearWrap(
                    image)  # the starting brace is only for line-breaking
                .Conv2D(
                    'conv0', padding='VALID'
                )  #.apply(fg).BatchNorm('bn1',use_local_stat=is_training)
                .MaxPooling('pool0', 2).apply(activate).Conv2D('conv1',
                                                               padding='SAME').
                apply(fg).BatchNorm('bn2').apply(activate).Conv2D(
                    'conv2',
                    padding='VALID').apply(fg).BatchNorm('bn3').MaxPooling(
                        'pool1', 2).apply(activate).Conv2D('conv3',
                                                           padding='VALID').
                apply(fg).BatchNorm('bn4').apply(activate).FullyConnected(
                    'fc0',
                    512).apply(fg).BatchNorm('bn5')  #.tf.nn.dropout(keep_prob)
                .apply(cabs).FullyConnected('fc1', out_dim=10,
                                            nl=tf.identity)())

        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        # compute the number of failed samples, for ClassificationError to use at test time
        wrong = symbolic_functions.prediction_incorrect(logits, label)
        nr_wrong = tf.reduce_sum(wrong, name='wrong')
        # monitor training error
        summary.add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        # weight decay on all W of fc layers
        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),
                              name='regularize_loss')
        summary.add_moving_summary(cost, wd_cost)

        summary.add_param_summary([('.*/W', ['histogram'])
                                   ])  # monitor histogram of all W
        self.cost = tf.add_n([wd_cost, cost], name='cost')
Example #13
    def get_logits(self, image):
        if BITW == 't':
            fw, fa, fg = get_dorefa(32, 32, 32)
            fw = ternarize
        else:
            fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fct' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)    # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with remap_variables(new_get_variable), \
                argscope([Conv2D, BatchNorm, MaxPooling], data_format='channels_first'), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 96, 12, strides=4, padding='VALID', use_bias=True)
                      .apply(activate)
                      .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                      .apply(fg)
                      .BatchNorm('bn1')
                      .MaxPooling('pool1', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv2', 384, 3)
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool2', 3, 2, padding='SAME')
                      .apply(activate)

                      .Conv2D('conv3', 384, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn3')
                      .apply(activate)

                      .Conv2D('conv4', 256, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn4')
                      .MaxPooling('pool4', 3, 2, padding='VALID')
                      .apply(activate)

                      .FullyConnected('fc0', 4096)
                      .apply(fg)
                      .BatchNorm('bnfc0')
                      .apply(activate)

                      .FullyConnected('fc1', 4096, use_bias=False)
                      .apply(fg)
                      .BatchNorm('bnfc1')
                      .apply(nonlin)
                      .FullyConnected('fct', 1000, use_bias=True)())
        add_param_summary(('.*/W', ['histogram', 'rms']))
        tf.nn.softmax(logits, name='output')  # for prediction
        return logits
Example #14
    def _build_graph(self, input_vars):
        image, label = input_vars
        image = image / 255.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)
        # monkey-patch tf.get_variable to apply fw
        old_get_variable = tf.get_variable

        def new_get_variable(name, shape=None, **kwargs):
            v = old_get_variable(name, shape, **kwargs)
            # don't binarize first and last layer
            if name != 'W' or 'conv0' in v.op.name or 'fct' in v.op.name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        tf.get_variable = new_get_variable

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image).Conv2D(
                'conv0', 96, 12, stride=4,
                padding='VALID').apply(activate).Conv2D(
                    'conv1', 256, 5, padding='SAME',
                    split=2).apply(fg).BatchNorm('bn1').MaxPooling(
                        'pool1', 3, 2, padding='SAME').apply(activate).Conv2D(
                            'conv2', 384,
                            3).apply(fg).BatchNorm('bn2').MaxPooling(
                                'pool2', 3, 2,
                                padding='SAME').apply(activate).Conv2D(
                                    'conv3', 384, 3, split=2).apply(fg).
                      BatchNorm('bn3').apply(activate).Conv2D(
                          'conv4', 256, 3,
                          split=2).apply(fg).BatchNorm('bn4').MaxPooling(
                              'pool4', 3, 2,
                              padding='VALID').apply(activate).FullyConnected(
                                  'fc0', 4096).apply(fg).
                      BatchNorm('bnfc0').apply(activate).FullyConnected(
                          'fc1', 4096).apply(fg).BatchNorm('bnfc1').apply(
                              nonlin).FullyConnected('fct',
                                                     1000,
                                                     use_bias=True)())
        tf.get_variable = old_get_variable

        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
        add_moving_summary(cost, wd_cost)

        add_param_summary([('.*/W', ['histogram', 'rms'])])
        self.cost = tf.add_n([cost, wd_cost], name='cost')
Example #15
    def build_graph(self, image, label):
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'weak' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)
                #return ternarize(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))
        
        def merge(x, y):
            #return x + y
            #return x - y
            return tf.concat([x,y], axis=3)

        image = image / 256.0
        k = 3
        zp = 0.25
        zp2 = zp / 1
        #scale = tf.train.exponential_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*5, decay_rate=0.5, staircase=True, name='scale')
        #scale = tf.where(scale>0.001, scale, tf.zeros_like(scale))
        scale = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*50, alpha=0.0)
        tf.summary.scalar('scale', scale)
        endconv = []
        endweak = []
        #scale2 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*50, alpha=0.0)
        #scale3 = tf.train.cosine_decay(learning_rate=1.0, global_step=get_global_step_var(), decay_steps=4721*80, alpha=0.0)
        with remap_variables(binarize_weight), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            net = Conv2D('conv0', image, np.round(48 * zp), 5, padding='VALID', use_bias=True)
            net = MaxPooling('pool0', net, 2, padding='SAME')
            net = activate(net)

            net1 = Conv2D('conv1', net, np.round(64 * zp), 3, padding='SAME')
            net1 = BatchNorm('bn1', net1)
            endconv.append(net1)
            net2 = Conv2D('weak1', net, np.round(64 * zp2), k, padding='SAME')
            net2 = BatchNorm('bn12', net2)
            endweak.append(net2)  # net2 = tf.nn.relu(net2)
            net = merge(activate(net1), scale * net2)
            #net = activate(net1)

            net1 = Conv2D('conv2', net, np.round(64 * zp), 3, padding='SAME')
            net1 = BatchNorm('bn2', net1)
            endconv.append(net1)
            net2 = Conv2D('weak2', net, np.round(64 * zp2), k, padding='SAME')
            net2 = BatchNorm('bn22', net2)
            endweak.append(net2)  # net2 = tf.nn.relu(net2)
            net1 = MaxPooling('pool1', net1, 2, padding='SAME')
            net2 = MaxPooling('pool12', net2, 2, padding='SAME')
            net = merge(activate(net1), scale * net2)
            net = activate(net1)

            net1 = Conv2D('conv3', net, np.round(128 * zp), 3, padding='VALID')
            net1 = BatchNorm('bn3', net1)
            endconv.append(net1)
            net2 = Conv2D('weak3', net, np.round(128 * zp2), k, padding='VALID')
            net2 = BatchNorm('bn32', net2)
            endweak.append(net2)  # net2 = tf.nn.relu(net2)
            net = merge(activate(net1), scale * net2)
            #net = activate(net1)

            net1 = Conv2D('conv4', net, np.round(128 * zp), 3, padding='SAME')
            net1 = BatchNorm('bn4', net1)
            endconv.append(net1)
            net2 = Conv2D('weak4', net, np.round(128 * zp2), k, padding='SAME')
            net2 = BatchNorm('bn42', net2)
            endweak.append(net2)  # net2 = tf.nn.relu(net2)
            net = merge(activate(net1), scale * net2)
            # net = activate(net1)

            net1 = Conv2D('conv5', net, np.round(128 * zp), 3, padding='VALID')
            net1 = BatchNorm('bn5', net1)
            endconv.append(net1)
            net2 = Conv2D('weak5', net, np.round(128 * zp2), k, padding='VALID')
            net2 = BatchNorm('bn52', net2)
            endweak.append(net2)  # net2 = tf.nn.relu(net2)
            net = merge(activate(net1), scale * net2)
            #net = activate(net1)

            net = tf.nn.dropout(net, 0.5 if is_training else 1.0)
            net1 = Conv2D('conv6', net, np.round(512 * zp), 5, padding='VALID')
            net1 = BatchNorm('bn6', net1)
            endconv.append(net1)
            net2 = Conv2D('weak6', net, np.round(512 * zp2), 5, padding='VALID')
            net2 = BatchNorm('bn62', net2)
            endweak.append(net2)  # net2 = tf.nn.relu(net2)
            net = merge(cabs(net1), scale * net2)
            # net = cabs(net1)
            logits = FullyConnected('fc1', net, 10)
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_tensor')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        total_cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, total_cost)
        for i in range(len(endweak)):
            add_moving_summary(tf.reduce_mean(tf.abs(endconv[i]), name='mean_conv_'+str(i+1) )  )
            add_moving_summary(tf.reduce_mean(tf.abs(endweak[i]), name='mean_weak_'+str(i+1) )  )

        return total_cost
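
The scale tensor in Example #15 fades the full-precision 'weak' branches out over training: tf.train.cosine_decay anneals its value from learning_rate down toward alpha along a half cosine. A sketch of the schedule it follows, written out in plain Python (matching the TF 1.x documentation of cosine_decay):

import math

def cosine_decay_value(step, decay_steps, initial=1.0, alpha=0.0):
    # decayed = initial * ((1 - alpha) * 0.5 * (1 + cos(pi * step / decay_steps)) + alpha)
    step = min(step, decay_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
    return initial * ((1.0 - alpha) * cosine + alpha)
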
Example #16
    def _build_graph(self, inputs):
        image, label, ious, valids, bndboxes = inputs
        image = tf.round(image)

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        old_get_variable = tf.get_variable

        def monitor(x, name):
            if MONITOR == 1:
                return tf.Print(x, [x],
                                message='\n\n' + name + ': ',
                                summarize=1000,
                                name=name)
            else:
                return x

        def new_get_variable(v):
            name = v.op.name
            if not name.endswith(
                    'W'
            ) or 'conv1' in name or 'conv_obj' in name or 'conv_box' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                if MONITOR == 1:
                    return tf.Print(fw(v), [fw(v)],
                                    message='\n\n' + v.name +
                                    ', Quantized weights are:',
                                    summarize=100)
                else:
                    return fw(v)

        def activate(x):
            if BITA == 32:
                return tf.nn.relu(x)
            else:
                return fa(tf.nn.relu(x))

        def bn_activate(name, x):
            x = BatchNorm(name, x)
            x = monitor(x, name + '_noact_out')
            return activate(x)

        def halffire(name, x, num_squeeze_filters, num_expand_3x3_filters,
                     skip):
            out_squeeze = Conv2D('squeeze_conv_' + name,
                                 x,
                                 out_channel=num_squeeze_filters,
                                 kernel_shape=1,
                                 stride=1,
                                 padding='SAME')
            out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
            out_expand_3x3 = Conv2D('expand_3x3_conv_' + name,
                                    out_squeeze,
                                    out_channel=num_expand_3x3_filters,
                                    kernel_shape=3,
                                    stride=1,
                                    padding='SAME')
            out_expand_3x3 = bn_activate('bn_expand_3x3_' + name,
                                         out_expand_3x3)
            if skip == 0:
                return out_expand_3x3
            else:
                return tf.add(x, out_expand_3x3)

        def halffire_noact(name, x, num_squeeze_filters,
                           num_expand_3x3_filters):
            out_squeeze = Conv2D('squeeze_conv_' + name,
                                 x,
                                 out_channel=num_squeeze_filters,
                                 kernel_shape=1,
                                 stride=1,
                                 padding='SAME')
            out_squeeze = bn_activate('bn_squeeze_' + name, out_squeeze)
            out_expand_3x3 = Conv2D('expand_3x3_conv_' + name,
                                    out_squeeze,
                                    out_channel=num_expand_3x3_filters,
                                    kernel_shape=3,
                                    stride=1,
                                    padding='SAME')
            return out_expand_3x3

        with  remap_variables(new_get_variable), \
          argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity), \
          argscope(BatchNorm, decay=0.9, epsilon=1e-4):

            image = monitor(image, 'image_out')

            l = Conv2D('conv1',
                       image,
                       out_channel=32,
                       kernel_shape=3,
                       stride=2,
                       padding='SAME')
            l = bn_activate('bn1', l)
            l = monitor(l, 'conv1_out')

            l = MaxPooling('pool1', l, shape=3, stride=2, padding='SAME')
            l = monitor(l, 'pool1_out')

            l = halffire('fire1', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire1_out')

            l = MaxPooling('pool2', l, shape=3, stride=2, padding='SAME')
            l = monitor(l, 'pool2_out')

            l = halffire('fire2', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire2_out')

            l = MaxPooling('pool3', l, shape=3, stride=2, padding='SAME')
            l = monitor(l, 'pool3_out')

            l = halffire('fire3', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire3_out')

            l = halffire('fire4', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire4_out')

            l = halffire('fire5', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire5_out')

            l = halffire('fire6', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire6_out')

            l = halffire('fire7', l, NUM_SQUEEZE_FILTERS, NUM_EXPAND_FILTERS,
                         0)
            l = monitor(l, 'fire7_out')

            # Classification
            classify = Conv2D('conv_class',
                              l,
                              out_channel=12,
                              kernel_shape=1,
                              stride=1,
                              padding='SAME')
            classify = bn_activate('bn_class', classify)
            classify = monitor(classify, 'conv_class_out')
            logits = GlobalAvgPooling('pool_class', classify)

            class_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=label)
            class_loss = tf.reduce_mean(class_loss, name='cross_entropy_loss')

            wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
            add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

            # Object Detection
            l = tf.concat([l, classify], axis=3)

            objdetect = Conv2D('conv_obj',
                               l,
                               out_channel=1,
                               kernel_shape=1,
                               stride=1,
                               padding='SAME')
            objdetect = tf.identity(objdetect, name='objdetect_out')
            objdetect_loss = tf.losses.hinge_loss(labels=ious,
                                                  logits=objdetect)

            bndbox = Conv2D('conv_box',
                            l,
                            out_channel=4,
                            kernel_shape=1,
                            stride=1,
                            padding='SAME')
            bndbox = tf.identity(bndbox, name='bndbox_out')
            bndbox = tf.multiply(bndbox, valids, name='mult0')
            bndbox_loss = tf.losses.mean_squared_error(labels=bndboxes,
                                                       predictions=bndbox)

            # weight decay on all W of fc layers
            # reg_cost = regularize_cost('(fire7|conv_obj|conv_box).*/W', l2_regularizer(1e-5), name='regularize_cost')

            # cost = class_loss*objdetect_loss*bndbox_loss
            # cost = class_loss + objdetect_loss + bndbox_loss + reg_cost
            cost = class_loss + 10 * objdetect_loss + bndbox_loss

            add_moving_summary(class_loss, objdetect_loss, bndbox_loss, cost)

        self.cost = cost

        tf.get_variable = old_get_variable
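# A minimal sketch of the `monitor` helper used throughout the example above. Its
# real definition is not part of this snippet and may differ; this version assumes
# the same module-level MONITOR flag that the weight-quantization getter checks,
# and simply passes the tensor through tf.Print when monitoring is enabled.
import tensorflow as tf

MONITOR = 0  # illustrative; the real flag is defined elsewhere in the example


def monitor(x, name):
    if MONITOR == 1:
        # tf.Print returns `x` unchanged and logs the listed tensors at run time.
        return tf.Print(x, [x], message='\n\n' + name + ', values are:',
                        summarize=100)
    return tf.identity(x, name=name)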
Example #17
0
    def _build_graph(self, inputs):
        input, label = inputs

        fw, fa, fg = get_dorefa(FLAGS.bit_w, FLAGS.bit_a, 32)
        logger.info("Using {}-bit activations and {}-bit weights".format(FLAGS.bit_a, FLAGS.bit_w))
        logger.info("Using trn_cache: {}".format(FLAGS.trn_cache_dir))
        logger.info("Using host: {}".format(socket.gethostname()))

        old_get_variable = tf.get_variable

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            logger.info("Binarizing weight {}".format(v.op.name))
            return fw(v, FLAGS.force_quantization)

        def nonlin(x):
            if FLAGS.bit_a == 32 and not FLAGS.use_clip:
                return tf.nn.relu(x)    # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        activations = []
        with remap_variables(new_get_variable), \
                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
            curr_layer = LinearWrap(input)
            if model_type == 'fc':
                for i in range(FLAGS.n_layers):
                    curr_layer = (curr_layer
                                .FullyConnected('fc' + str(i), FLAGS.state_size)
                                .LayerNorm('ln_fc' + str(i))
                                .apply(activate))
                    activations.append(curr_layer.tensor())
                    curr_layer = (curr_layer
                                .Dropout('dropout', FLAGS.dropout))
            elif model_type == 'cnn':
                # The CNN branch is not included in this snippet; raise so the gap
                # is explicit instead of silently building an unquantized model.
                raise NotImplementedError("model_type 'cnn' is not shown here")
            logits = curr_layer.FullyConnected('fct', self.n_spks, use_bias=True)()

        print_all_tf_vars()

        prob = tf.nn.softmax(logits, name='output')

        # used for validation accuracy of utterance
        identity_guesses = flatten(tf.argmax(prob, axis=1))
        uniq_identities, _, count = tf.unique_with_counts(identity_guesses)
        idx_to_identity_with_most_votes = tf.argmax(count)
        chosen_identity = tf.gather(uniq_identities, idx_to_identity_with_most_votes)
        wrong = tf.expand_dims(tf.not_equal(chosen_identity, tf.cast(label[0], tf.int64)), axis=0, name='utt-wrong')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6), name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)

        for activation in activations:
            add_activation_summary(activation)
            tf.summary.histogram(activation.name, activation)

    def _get_optimizer(self):
        lr = get_scalar_var('learning_rate', FLAGS.learning_rate, summary=True)
        return tf.train.AdamOptimizer(lr, epsilon=1e-5)
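# Standalone sketch of the remap_variables + get_dorefa pattern these examples are
# built on (TF1.x graph mode). The `dorefa` import is assumed to be the quantizer
# factory shipped alongside the DoReFa-Net example code, and the layer name 'fc/W'
# is purely illustrative.
import tensorflow as tf
from tensorpack.tfutils.varreplace import remap_variables
from dorefa import get_dorefa

fw, fa, fg = get_dorefa(1, 2, 32)  # 1-bit weights, 2-bit activations, fp32 gradients


def quantize_weight(v):
    # Quantize only kernel variables named '.../W'; leave everything else alone.
    return fw(v) if v.op.name.endswith('W') else v


with remap_variables(quantize_weight):
    x = tf.placeholder(tf.float32, [None, 64])
    w = tf.get_variable('fc/W', [64, 32])      # returned already mapped through fw
    h = fa(tf.clip_by_value(tf.matmul(x, w), 0.0, 1.0))  # quantized activation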
Example #18
0
    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 256.0

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv1' in name or 'fct' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        def resblock(x, channel, stride):
            def get_stem_full(x):
                return (LinearWrap(x)
                        .Conv2D('c3x3a', channel, 3)
                        .BatchNorm('stembn')
                        .apply(activate)
                        .Conv2D('c3x3b', channel, 3)())
            channel_mismatch = channel != x.get_shape().as_list()[3]
            if stride != 1 or channel_mismatch or 'pool1' in x.name:
                # handling pool1 is to work around an architecture bug in our model
                if stride != 1 or 'pool1' in x.name:
                    x = AvgPooling('pool', x, stride, stride)
                x = BatchNorm('bn', x)
                x = activate(x)
                shortcut = Conv2D('shortcut', x, channel, 1)
                stem = get_stem_full(x)
            else:
                shortcut = x
                x = BatchNorm('bn', x)
                x = activate(x)
                stem = get_stem_full(x)
            return shortcut + stem

        def group(x, name, channel, nr_block, stride):
            with tf.variable_scope(name + 'blk1'):
                x = resblock(x, channel, stride)
            for i in range(2, nr_block + 1):
                with tf.variable_scope(name + 'blk{}'.format(i)):
                    x = resblock(x, channel, 1)
            return x

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      # use explicit padding here, because our training framework has
                      # different padding mechanisms from TensorFlow
                      .tf.pad([[0, 0], [3, 2], [3, 2], [0, 0]])
                      .Conv2D('conv1', 64, 7, stride=2, padding='VALID', use_bias=True)
                      .tf.pad([[0, 0], [1, 1], [1, 1], [0, 0]], 'SYMMETRIC')
                      .MaxPooling('pool1', 3, 2, padding='VALID')
                      .apply(group, 'conv2', 64, 2, 1)
                      .apply(group, 'conv3', 128, 2, 2)
                      .apply(group, 'conv4', 256, 2, 2)
                      .apply(group, 'conv5', 512, 2, 2)
                      .BatchNorm('lastbn')
                      .apply(nonlin)
                      .GlobalAvgPooling('gap')
                      .tf.multiply(49)  # this is due to a bug in our model design
                      .FullyConnected('fct', 1000)())
        tf.nn.softmax(logits, name='output')
        ImageNetModel.compute_loss_and_error(logits, label)
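# Sketch of the k-bit weight quantizer that fw approximately computes, following
# the DoReFa-Net formulation. The real get_dorefa additionally attaches a
# straight-through estimator so tf.round passes gradients through; that detail is
# omitted here.
import tensorflow as tf


def quantize(x, k):
    # uniform quantization of a tensor in [0, 1] to k bits
    n = float(2 ** k - 1)
    return tf.round(x * n) / n


def fw_sketch(w, k):
    if k == 32:          # full precision: identity
        return w
    if k == 1:           # 1 bit: sign of w, scaled by its mean magnitude
        scale = tf.reduce_mean(tf.abs(w))
        return tf.sign(w) * scale
    w = tf.tanh(w)
    w = w / (2.0 * tf.reduce_max(tf.abs(w))) + 0.5   # squash into [0, 1]
    return 2.0 * quantize(w, k) - 1.0                # map back into [-1, 1]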
Example #19
0
    def build_graph(self, image, label):
        image = image / 255.0

        if BITW == 't':
            fw, fa, fg = get_dorefa(32, 32, 32)
            fw = ternarize
        else:
            fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def new_get_variable(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fct' in name:
                return v
            else:
                logger.info("Quantizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)  # still use relu for 32bit cases
            return tf.clip_by_value(x, 0.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        with remap_variables(new_get_variable), \
                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 96, 12, strides=4, padding='VALID')
                      .apply(activate)
                      .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                      .apply(fg)
                      .BatchNorm('bn1')
                      .MaxPooling('pool1', 3, 2, padding='SAME')
                      .apply(activate)
                      .Conv2D('conv2', 384, 3)
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool2', 3, 2, padding='SAME')
                      .apply(activate)
                      .Conv2D('conv3', 384, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn3')
                      .apply(activate)
                      .Conv2D('conv4', 256, 3, split=2)
                      .apply(fg)
                      .BatchNorm('bn4')
                      .MaxPooling('pool4', 3, 2, padding='VALID')
                      .apply(activate)
                      .FullyConnected('fc0', 4096)
                      .apply(fg)
                      .BatchNorm('bnfc0')
                      .apply(activate)
                      .FullyConnected('fc1', 4096, use_bias=False)
                      .apply(fg)
                      .BatchNorm('bnfc1')
                      .apply(nonlin)
                      .FullyConnected('fct', 1000, use_bias=True)())

        tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W',
                                  l2_regularizer(5e-6),
                                  name='regularize_cost')

        add_param_summary(('.*/W', ['histogram', 'rms']))
        total_cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, total_cost)
        return total_cost
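# The BITW == 't' branch above swaps fw for a `ternarize` function defined with the
# example code. That implementation (trained ternary quantization) learns separate
# positive and negative scales; the version below is a heavily simplified,
# illustrative ternarizer using a fixed threshold and one shared scale.
import tensorflow as tf


def ternarize_sketch(w, thresh_ratio=0.05):
    thresh = thresh_ratio * tf.reduce_max(tf.abs(w))
    mask = tf.cast(tf.abs(w) > thresh, tf.float32)
    # single scale: mean magnitude of the weights that survive the threshold
    scale = tf.reduce_sum(tf.abs(w) * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)
    return scale * tf.sign(w) * mask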
Example #20
0
    def _build_graph(self, inputs):
        image, label = inputs
        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'conv0' in name or 'fc' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def cabs(x):
            return tf.minimum(1.0, tf.abs(x), name='cabs')

        def activate(x):
            return fa(cabs(x))

        image = image / 256.0

        with remap_variables(binarize_weight), \
                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
                argscope(Conv2D, use_bias=False, nl=tf.identity):
            logits = (LinearWrap(image)
                      .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                      .MaxPooling('pool0', 2, padding='SAME')
                      .apply(activate)
                      # 18
                      .Conv2D('conv1', 64, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn1').apply(activate)

                      .Conv2D('conv2', 64, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn2')
                      .MaxPooling('pool1', 2, padding='SAME')
                      .apply(activate)
                      # 9
                      .Conv2D('conv3', 128, 3, padding='VALID')
                      .apply(fg)
                      .BatchNorm('bn3').apply(activate)
                      # 7

                      .Conv2D('conv4', 128, 3, padding='SAME')
                      .apply(fg)
                      .BatchNorm('bn4').apply(activate)

                      .Conv2D('conv5', 128, 3, padding='VALID')
                      .apply(fg)
                      .BatchNorm('bn5').apply(activate)
                      # 5
                      .tf.nn.dropout(0.5 if is_training else 1.0)
                      .Conv2D('conv6', 512, 5, padding='VALID')
                      .apply(fg).BatchNorm('bn6')
                      .apply(cabs)
                      .FullyConnected('fc1', 10, nl=tf.identity)())
        tf.nn.softmax(logits, name='output')

        # compute the number of failed samples
        wrong = prediction_incorrect(logits, label)
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))

        add_param_summary(('.*/W', ['histogram', 'rms']))
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        add_moving_summary(cost, wd_cost, self.cost)
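# Sketch of the activation path used in the example above: cabs clips magnitudes
# into [0, 1] and fa then quantizes uniformly to BITA bits. The real fa from
# get_dorefa also attaches a straight-through gradient to the rounding step, which
# this standalone version omits; BITA = 2 is just an illustrative setting.
import tensorflow as tf

BITA = 2


def cabs(x):
    return tf.minimum(1.0, tf.abs(x), name='cabs')


def fa_sketch(x):
    n = float(2 ** BITA - 1)
    return tf.round(x * n) / n


def activate(x):
    return fa_sketch(cabs(x))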
Example #21
0
    def build_graph(self, image, label):
        """This function should build the model which takes the input variables (defined above)
        and return cost at the end."""

        is_training = get_current_tower_context().is_training

        fw, fa, fg = get_dorefa(BITW, BITA, BITG)

        # monkey-patch tf.get_variable to apply fw
        def binarize_weight(v):
            name = v.op.name
            # don't binarize first and last layer
            if not name.endswith('W') or 'fc0' in name or 'fc_out' in name:
                return v
            else:
                logger.info("Binarizing weight {}".format(v.op.name))
                return fw(v)

        def nonlin(x):
            if BITA == 32:
                return tf.nn.relu(x)
            # FIXME: return tf.clip_by_value(x, 0.0, 1.0)
            return tf.clip_by_value(x, -1.0, 1.0)

        def activate(x):
            return fa(nonlin(x))

        # The context manager `argscope` sets the default options for all the layers under
        # this context. Here the FullyConnected layers are made bias-free and BatchNorm
        # uses momentum=0.1 and epsilon=1e-4.
        # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html
        with remap_variables(binarize_weight), \
                argscope(FullyConnected, use_bias=False), \
                argscope(BatchNorm, momentum=0.1, epsilon=1e-4):
            # LinearWrap is just a syntax sugar.
            # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html
            logits = (
                LinearWrap(image).Dropout('dropout_in',
                                          rate=0.2 if is_training else 0.0)
                # hidden 0
                .FullyConnected(
                    'fc0', n_units).BatchNorm('bn0').apply(activate).Dropout(
                        'dropout_hidden0', rate=0.5 if is_training else 0.0)
                # hidden 1
                .FullyConnected(
                    'fc1', n_units).BatchNorm('bn1').apply(activate).Dropout(
                        'dropout_hidden1', rate=0.5 if is_training else 0.0)
                # hidden 2
                .FullyConnected(
                    'fc2', n_units).BatchNorm('bn2').apply(activate).Dropout(
                        'dropout_hidden2', rate=0.5 if is_training else 0.0)
                # output layer
                .FullyConnected('fc_out', 10, activation=tf.identity)())

        # a vector of length B with loss of each sample
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                              labels=label)
        cost = tf.reduce_mean(
            cost, name='cross_entropy_loss')  # the average cross-entropy loss

        correct = tf.cast(tf.nn.in_top_k(predictions=logits,
                                         targets=label,
                                         k=1),
                          tf.float32,
                          name='correct')
        accuracy = tf.reduce_mean(correct, name='accuracy')