Example #1
def crop_pyr(im,
             rect,
             im_size,
             scales,
             pad_value=0,
             feather=False,
             feather_margin=0.05,
             name='crop_pyr'):
    '''
    Args:
        im: [b, h, w, 3]
        rect: [b, 4]
        im_size: (height, width)
        scales: [s]
        pad_value: Either a scalar constant or a tf.Tensor that is
            broadcast-compatible with the image.
        feather: If true, feather the image towards pad_value before cropping.
        feather_margin: Margin used by feather_image.

    Returns:
        Tuple of (crops, rects), where crops has shape [b, s, h, w, 3] and
        rects contains the scaled rectangles used for cropping.
    '''
    with tf.name_scope(name) as scope:
        if tf.contrib.framework.is_tensor(pad_value):
            # TODO: This operation seems slow!
            im -= pad_value
            crop_ims, rects = crop_pyr(im,
                                       rect,
                                       im_size,
                                       scales,
                                       pad_value=0,
                                       feather=feather,
                                       feather_margin=feather_margin,
                                       name=name)
            crop_ims += tf.expand_dims(pad_value, 1)
            return crop_ims, rects

        if feather:
            im = feather_image(im,
                               margin=feather_margin,
                               background_value=pad_value)
        # [b, s, 4]
        rects = geom.grow_rect(tf.expand_dims(scales, -1),
                               tf.expand_dims(rect, -2))
        # Extract multiple rectangles from each image.
        batch_len = tf.shape(im)[0]
        num_scales, = tf.unstack(tf.shape(scales))
        box_ind = tf.tile(tf.expand_dims(tf.range(batch_len), 1),
                          [1, num_scales])
        # [b, s, ...] -> [b*s, ...]
        rects, restore = merge_dims(rects, 0, 2)
        box_ind, _ = merge_dims(box_ind, 0, 2)
        crop_ims = tf.image.crop_and_resize(
            im,
            geom.rect_to_tf_box(rects),
            box_ind=box_ind,
            crop_size=n_positive_integers(2, im_size),
            extrapolation_value=pad_value)
        # [b*s, ...] -> [b, s, ...]
        crop_ims = restore(crop_ims, 0)
        return crop_ims, rects
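
Every example on this page routes tensors through helpers.merge_dims, which flattens a contiguous run of dimensions into one and returns a function that undoes the reshape. The helper itself is not shown; the following is a minimal sketch of its contract, inferred from the call sites on this page (including None bounds and negative indices):

import tensorflow as tf

def merge_dims(x, begin=None, end=None):
    '''Flattens dims [begin, end) of `x` into one. Returns (merged, restore_fn).'''
    x = tf.convert_to_tensor(x)
    rank = x.shape.ndims
    # Treat None as "from the start" / "to the end"; normalize negative indices.
    begin = 0 if begin is None else (begin + rank if begin < 0 else begin)
    end = rank if end is None else (end + rank if end < 0 else end)
    dynamic = tf.shape(x)
    merged = tf.reshape(
        x, tf.concat([dynamic[:begin], [-1], dynamic[end:]], axis=0))

    def restore(y, axis):
        '''Splits dimension `axis` of `y` back into the original dims.'''
        y_shape = tf.shape(y)
        return tf.reshape(y, tf.concat(
            [y_shape[:axis], dynamic[begin:end], y_shape[axis + 1:]], axis=0))

    return merged, restore

For instance, merge_dims(x, 0, 2) maps [b, s, h, w, c] to [b * s, h, w, c], and restore(y, 0) maps [b * s, ...] back to [b, s, ...].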
Example #2
def _motion_net(x, output_shapes, is_training, weight_decay=0):
    '''
    Args:
        x: [b, t, h, w, c]
        output_shapes: Dict that maps string to iterable of ints.
            e.g. {'response': [5, 17, 17, 1]} for a score-map
            e.g. {'translation': [2]} for translation regression

    Returns:
        Dictionary of outputs with shape [b] + output_shape.
    '''
    assert len(x.shape) == 5
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        normalizer_fn=slim.batch_norm,
                        weights_regularizer=slim.l2_regularizer(weight_decay)):
        with slim.arg_scope([slim.batch_norm], is_training=is_training):
            # https://github.com/tensorflow/models/blob/master/research/slim/nets/alexnet.py
            x, unmerge = helpers.merge_dims(x, 0, 2)  # Merge time into batch.
            # To determine size, work backwards from the output.
            # Input must be 103 = 11 + (47 - 1) * 2 (for stride 2) or
            # 195 = 11 + (47 - 1) * 4 (for stride 4).
            x = slim.conv2d(x, 64, [11, 11], 4, padding='VALID', scope='conv1')
            # 47 = 3 + (23 - 1) * 2
            x = slim.max_pool2d(x, [3, 3], 2, scope='pool1')
            x = unmerge(x, axis=0)  # Un-merge time from batch.
            # Concatenate the images over time.
            x = tf.concat(tf.unstack(x, axis=1), axis=-1)
            x = slim.conv2d(x, 192, [5, 5], scope='conv2')
            # 23 = 3 + (11 - 1) * 2
            x = slim.max_pool2d(x, [3, 3], 2, scope='pool2')
            x = slim.conv2d(x, 384, [3, 3], scope='conv3')
            x = slim.conv2d(x, 384, [3, 3], scope='conv4')
            x = slim.conv2d(x, 256, [3, 3], scope='conv5')
            # 11 = 3 + (5 - 1) * 2
            x = slim.max_pool2d(x, [3, 3], 2, scope='pool5')
            # 5

            # Add fully-connected layer.
            x = slim.conv2d(x, 4096, [5, 5], padding='VALID', scope='fc6')
            x = tf.squeeze(x, axis=(-3, -2))
            x = slim.fully_connected(x, 4096, scope='fc7')
            # Regress to score map.
            y = {}
            for k in output_shapes.keys():
                with tf.variable_scope('head_{}'.format(k)):
                    y[k] = x
                    # int() instead of np.asscalar, which recent NumPy removed.
                    output_dim = int(np.prod(output_shapes[k]))
                    y[k] = slim.fully_connected(y[k],
                                                output_dim,
                                                scope='fc8',
                                                activation_fn=None,
                                                normalizer_fn=None)
                    if len(output_shapes[k]) > 1:
                        y[k] = helpers.split_dims(y[k],
                                                  axis=-1,
                                                  shape=output_shapes[k])

            return y
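
The size comments in _motion_net (and in _output_net below) come from inverting the 'VALID'-padding convolution formula out = (in - kernel) // stride + 1. A quick sanity check of that arithmetic (the helper name is ours):

def min_valid_input_size(output_size, kernel, stride):
    # Smallest input that yields `output_size` outputs under 'VALID' padding.
    return kernel + (output_size - 1) * stride

assert min_valid_input_size(47, 11, 2) == 103  # conv1 at stride 2
assert min_valid_input_size(47, 11, 4) == 195  # conv1 at stride 4
assert min_valid_input_size(23, 3, 2) == 47    # pool1
assert min_valid_input_size(11, 3, 2) == 23    # pool2
assert min_valid_input_size(5, 3, 2) == 11     # pool5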
Example #3
def merge_batch_dims(x):
    '''Merges all dimensions except the last three.

    Returns:
        (merged, restore_fn)
    '''
    x = as_tensor(x)
    ndim = len(x.value.shape)
    assert ndim >= 4
    # Merge all dimensions except last three.
    value, restore_fn = helpers.merge_dims(x.value, None, -3)
    y = Tensor(value, x.fields)
    return y, partial_pixelwise(restore_fn, axis=0)
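
A hedged usage sketch, assuming cnn.as_tensor accepts a plain tf.Tensor and that the returned restore function is applied directly to the op's output (shapes and the pixelwise op are illustrative):

x = tf.zeros([8, 3, 64, 64, 16])   # [b, s, h, w, c]: two batch-like dims
y, restore = merge_batch_dims(x)   # y.value has shape [8 * 3, 64, 64, 16]
y = some_pixelwise_op(y)           # hypothetical op on [n, h, w, c] tensors
x = restore(y)                     # batch dims [8, 3] are re-expanded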
Example #4
def _embed_net(
        x,
        is_training,
        trainable,
        variables_collections,
        weight_decay=0,
        name='embed',
        # Additional arguments:
        arch='alexnet',
        arch_params=None,
        extra_conv_enable=False,
        extra_conv_params=None):
    '''
    Args:
        x: Image of which to compute features. Shape [..., h, w, c]

    Returns:
        Output of network, intermediate layers, variable scope of feature net.
        The variables in the feature scope can be loaded from a pre-trained model.
    '''
    with tf.name_scope(name) as scope:
        arch_params = arch_params or {}
        extra_conv_params = extra_conv_params or {}
        weight_decay = float(weight_decay)

        try:
            func = feature_nets.BY_NAME[arch]
        except KeyError:
            raise ValueError('unknown architecture: {}'.format(arch))

        x = cnn.as_tensor(x)
        num_dims = len(x.value.shape)
        if num_dims > 4:
            merged, unmerge = helpers.merge_dims(x.value, 0, num_dims - 3)
            x = cnn.Tensor(merged, x.fields)

        with tf.variable_scope('feature') as feature_vs:
            x, end_points = func(x,
                                 is_training,
                                 trainable,
                                 variables_collections,
                                 weight_decay=weight_decay,
                                 **arch_params)

        if extra_conv_enable:
            with tf.variable_scope('extra'):
                x = _extra_conv(x, is_training, trainable,
                                variables_collections, **extra_conv_params)
        if num_dims > 4:
            x = cnn.Tensor(unmerge(x.value, 0), x.fields)
        return x, end_points, feature_vs
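
A hedged call sketch (argument values are illustrative; 'alexnet' is the default key looked up in feature_nets.BY_NAME above):

images = tf.placeholder(tf.float32, [None, 2, 255, 255, 3])  # [b, t, h, w, c]
feat, end_points, feature_scope = _embed_net(
    images,
    is_training=True,
    trainable=True,
    variables_collections=None,
    weight_decay=1e-4,
    arch='alexnet')
# Variables under `feature_scope` can be restored from a pre-trained model.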
Example #5
def _draw_rectangles(im, gt, gt_is_valid=None, pred=None, name='draw_rectangles'):
    '''
    Args:
        im: Tensor [..., h, w, c]
        gt: Tensor [..., 4]
        gt_is_valid: Tensor [...] or None
        pred: Tensor [..., 4] or None
    '''
    with tf.name_scope(name) as scope:
        im = tf.convert_to_tensor(im)
        num_batch_dims = len(im.shape) - 3
        assert len(gt.shape) == num_batch_dims + 1
        if gt_is_valid is not None:
            assert len(gt_is_valid.shape) == num_batch_dims
        if pred is not None:
            assert len(pred.shape) == num_batch_dims + 1

        if im.dtype != tf.float32:
            im = tf.image.convert_image_dtype(im, tf.float32)
        if gt_is_valid is not None:
            gt = tf.where(tf.broadcast_to(tf.expand_dims(gt_is_valid, -1), gt.shape),
                          gt,
                          tf.broadcast_to(geom.unit_rect(), gt.shape))
        rects = [gt]
        if pred is not None:
            rects.append(pred)
        rects = tf.stack(rects, axis=-2)  # [..., num_rects, 4]

        # Flatten batch to draw boxes.
        if num_batch_dims > 1:
            im, restore_im = helpers.merge_dims(im, 0, num_batch_dims)
            rects, _ = helpers.merge_dims(rects, 0, num_batch_dims)
        im = tf.image.draw_bounding_boxes(im, geom.rect_to_tf_box(rects))
        if num_batch_dims > 1:
            im = restore_im(im, axis=0)

        im = tf.image.convert_image_dtype(im, tf.uint8, saturate=True)
        return im
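
An illustrative call with a two-level batch (all values are placeholders):

frames = tf.zeros([2, 8, 240, 320, 3], tf.uint8)  # [b, t, h, w, c]
gt = tf.reshape(tf.tile(tf.constant([[0.2, 0.2, 0.8, 0.8]]), [16, 1]),
                [2, 8, 4])                        # [b, t, 4] rectangles
valid = tf.ones([2, 8], tf.bool)                  # marks annotated frames
annotated = _draw_rectangles(frames, gt, gt_is_valid=valid)  # uint8, same shape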
Example #6
def load_and_resize_images(image_files, resize=False, size=None, method='bilinear',
                           name='load_and_resize_images'):
    '''
    Args:
        image_files: Tensor of type string with shape `[k[0], ..., k[n-1]]`.
        resize: If true, images are resized to `size`. If false, all image
            files must have the same size.
        size: Tuple (h, w). If `resize` is false and `size` is specified,
            the static shape is set with `set_shape`; otherwise the image
            shape is left unspecified.
        method: Interpolation method used for resizing.

    Returns:
        Tensor of type float32 with shape `[k[0], ..., k[n-1], h, w, 3]`.
    '''
    with tf.name_scope(name) as scope:
        image_files, restore_fn = helpers.merge_dims(image_files, None, None)
        images = tf.map_fn(
            functools.partial(_load_and_resize_image, resize=resize, size=size, method=method),
            image_files,
            dtype=tf.float32)
        images = restore_fn(images, 0)
        return images
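
An illustrative call (file paths are placeholders):

files = tf.constant([['seq_a/0.jpg', 'seq_a/1.jpg'],
                     ['seq_b/0.jpg', 'seq_b/1.jpg']])  # shape [2, 2]
images = load_and_resize_images(files, resize=True, size=(240, 320))
# images has shape [2, 2, 240, 320, 3].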
Example #7
def _output_net(r,
                v,
                output_shapes,
                is_training,
                weight_decay=0,
                use_response=True,
                use_images=True):
    '''
    Args:
        r: Response. [b, s, hr, wr, c]
        v: Motion. [b, t, hv, wv, c]
        output_shapes: Dict that maps string to iterable of ints.
            e.g. {'response': [5, 17, 17, 1]} for a score-map
            e.g. {'translation': [2], 'scale': [1]} for translation regression

    Returns:
        Dictionary of outputs with shape [b] + output_shape.
    '''
    assert len(r.shape) == 5
    # with slim.arg_scope([slim.conv2d, slim.fully_connected, _res_conv2d, _res_fc],
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=dict(is_training=is_training),
                        weights_regularizer=slim.l2_regularizer(weight_decay)):
        x = []

        if use_response:
            with tf.variable_scope('preproc_response'):
                # Perform same operation on each scale.
                # Merge scale into batch.
                r, unmerge = helpers.merge_dims(r, 0, 2)
                # Spatial dim 17
                r = slim.conv2d(r, 32, 3, padding='SAME', scope='conv1')
                r = slim.max_pool2d(r,
                                    3,
                                    stride=2,
                                    padding='SAME',
                                    scope='pool1')
                # Spatial dim 9
                r = slim.conv2d(r, 64, 3, padding='SAME', scope='conv2')
                r = slim.max_pool2d(r,
                                    3,
                                    stride=2,
                                    padding='SAME',
                                    scope='pool2')
                # Spatial dim 5
                r = unmerge(r, axis=0)  # Unmerge scale from batch.
                # Concatenate channels of all scales.
                r = tf.concat(tf.unstack(r, axis=1), axis=-1)
                x.append(r)

        if use_images:
            with tf.variable_scope('preproc_motion'):
                # https://github.com/tensorflow/models/blob/master/research/slim/nets/alexnet.py
                # Merge time into batch.
                v, unmerge = helpers.merge_dims(v, 0, 2)
                # To determine size, work backwards from output.
                # Input size must be
                # 103 = 11 + (47 - 1) * 2 (for stride 2) or
                # 195 = 11 + (47 - 1) * 4 (for stride 4)
                v = slim.conv2d(v,
                                32, [11, 11],
                                4,
                                padding='VALID',
                                scope='conv1')  # was 64
                # Must be 47 = 3 + (23 - 1) * 2
                v = slim.max_pool2d(v, [3, 3],
                                    2,
                                    padding='VALID',
                                    scope='pool1')
                v = unmerge(v, axis=0)  # Un-merge time from batch.
                # Concatenate the images over time.
                v = tf.concat(tf.unstack(v, axis=1), axis=-1)
                v = slim.conv2d(v, 48, [5, 5], padding='SAME',
                                scope='conv2')  # was 192
                # Must be 23 = 3 + (11 - 1) * 2
                v = slim.max_pool2d(v, [3, 3],
                                    2,
                                    padding='VALID',
                                    scope='pool2')
                # TODO: Use residual connections here? Be careful with relu and bnorm.
                v = slim.conv2d(v, 64, [3, 3], padding='SAME',
                                scope='conv3')  # was 384
                # v = _res_conv2d(v, [3, 3], scope='conv4')  # was 384
                # v = _res_conv2d(v, [3, 3], scope='conv5')  # was 256
                v = slim.conv2d(v, 64, [3, 3], scope='conv4')  # was 384
                v = slim.conv2d(v, 64, [3, 3], scope='conv5')  # was 256
                # Must be 11 = 3 + (5 - 1) * 2
                v = slim.max_pool2d(v, [3, 3],
                                    2,
                                    padding='VALID',
                                    scope='pool5')
                # Spatial dim 5
                x.append(v)

        # Concatenate appearance response and/or motion description.
        x = tf.concat(x, axis=-1)

        with tf.variable_scope('output'):
            x = slim.conv2d(x, 512, [5, 5], padding='VALID', scope='fc1')
            # Spatial dim 1
            x = tf.squeeze(x, axis=(-2, -3))
            # x = _res_fc(x, scope='fc2')
            # x = _res_fc(x, scope='fc3')
            x = slim.fully_connected(x, 512, scope='fc2')

        # Regress to each output.
        y = {}
        for k in output_shapes.keys():
            with tf.variable_scope('head_{}'.format(k)):
                y[k] = x
                # int() instead of np.asscalar, which recent NumPy removed.
                output_dim = int(np.prod(output_shapes[k]))
                y[k] = slim.fully_connected(y[k],
                                            output_dim,
                                            scope='fc1',
                                            activation_fn=None,
                                            normalizer_fn=None)
                if len(output_shapes[k]) > 1:
                    y[k] = helpers.split_dims(y[k],
                                              axis=-1,
                                              shape=output_shapes[k])
        return y
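
Both _motion_net and _output_net finish each head with helpers.split_dims, the inverse of merge_dims along a single axis. A minimal sketch consistent with how it is called above:

import tensorflow as tf

def split_dims(x, axis, shape):
    '''Reshapes dimension `axis` of `x` into the dimensions in `shape`.'''
    x = tf.convert_to_tensor(x)
    axis = axis % x.shape.ndims  # normalize a negative axis
    x_shape = tf.shape(x)
    return tf.reshape(x, tf.concat(
        [x_shape[:axis], shape, x_shape[axis + 1:]], axis=0))

# e.g. a 'response' head with output_shape [5, 17, 17, 1]:
# [b, 5 * 17 * 17 * 1] -> [b, 5, 17, 17, 1]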
Example #8
def all_pixel_pairs(template, search, is_training,
                    trainable=True,
                    operation='mul',
                    reduce_channels=True,
                    use_mean=True,
                    use_batch_norm=False,
                    learn_gain=False,
                    gain_init=1,
                    scope='all_pixel_pairs'):
    '''
    Args:
        template: cnn.Tensor with shape [n, h_t, w_t, c]
        search: cnn.Tensor with shape [n, s, h_s, w_s, c]

    Returns:
        cnn.Tensor with shape [n, s, h_s - h_t + 1, w_s - w_t + 1, 1]
        (the pairwise features are reduced to one channel by a VALID
        convolution with kernel size h_t x w_t).
    '''
    with tf.variable_scope(scope, 'all_pixel_pairs'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)
        template_size = template.value.shape[-3:-1].as_list()
        num_channels = template.value.shape[-1].value

        # Break template into 1x1 patches.
        # Then "convolve" (multiply) each with the search image.
        t = template.value
        s = search.value
        # template becomes: [n, 1, ...,   1,   1, h_t, w_t, c]
        # search becomes:   [n, s, ..., h_s, w_s,   1,   1, c]
        t = tf.expand_dims(t, 1)
        t = helpers.expand_dims_n(t, -4, 2)
        s = helpers.expand_dims_n(s, -2, 2)
        if operation == 'mul':
            p = t * s
        elif operation == 'abs_diff':
            p = tf.abs(t - s)
        else:
            raise ValueError('unknown operation: "{}"'.format(operation))

        # if reduce_channels:
        #     if use_mean:
        #         p = tf.reduce_mean(p, axis=-1, keepdims=True)
        #     else:
        #         p = tf.reduce_sum(p, axis=-1, keepdims=True)
        # Merge the spatial dimensions of the template into features.
        # response becomes: [n, ..., h_s, w_s, h_t * w_t * c]
        p, _ = helpers.merge_dims(p, -3, None)
        pairs = cnn.Tensor(p, search.fields)

        # TODO: This initialization could be too small?
        normalizer = 1 / (np.prod(template_size) ** 2 * num_channels) if use_mean else 1
        weights_shape = template_size + [np.prod(template_size) * num_channels, 1]
        weights = tf.get_variable('weights', weights_shape, tf.float32,
                                  initializer=tf.constant_initializer(normalizer),
                                  trainable=trainable)
        # TODO: Support depthwise_conv2d (keep channels).
        pairs, restore = cnn.merge_batch_dims(pairs)
        response = cnn.nn_conv2d(pairs, weights, strides=[1, 1, 1, 1], padding='VALID')
        response = restore(response)

        return _calibrate(response, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
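
The pairwise construction in all_pixel_pairs relies purely on broadcasting; a small NumPy check of the shape arithmetic (sizes are illustrative):

import numpy as np

t = np.zeros((2, 1, 1, 1, 4, 4, 8))    # template: [n, 1, 1, 1, h_t, w_t, c]
s = np.zeros((2, 3, 16, 16, 1, 1, 8))  # search:   [n, s, h_s, w_s, 1, 1, c]
p = t * s                              # pairs:    [n, s, h_s, w_s, h_t, w_t, c]
assert p.shape == (2, 3, 16, 16, 4, 4, 8)
# merge_dims(p, -3, None) then flattens the trailing [h_t, w_t, c] into
# h_t * w_t * c = 128 features at each search position.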