def crop_pyr(im, rect, im_size, scales, pad_value=0,
             feather=False, feather_margin=0.05, name='crop_pyr'):
    '''
    Args:
        im: [b, h, w, 3]
        rect: [b, 4]
        im_size: (height, width)
        scales: [s]
        pad_value: Either scalar constant or tf.Tensor that is
            broadcast-compatible with the image.

    Returns:
        Tuple of (crop_ims, rects), where crop_ims has shape [b, s, h, w, 3].
    '''
    with tf.name_scope(name) as scope:
        if tf.contrib.framework.is_tensor(pad_value):
            # Subtract the pad value so that the recursive call can pad with
            # zeros, then add it back to the crops.
            # TODO: This operation seems slow!
            im -= pad_value
            crop_ims, rects = crop_pyr(im, rect, im_size, scales, pad_value=0,
                                       feather=feather, feather_margin=feather_margin,
                                       name=name)
            crop_ims += tf.expand_dims(pad_value, 1)
            return crop_ims, rects

        if feather:
            im = feather_image(im, margin=feather_margin, background_value=pad_value)
        # [b, s, 4]
        rects = geom.grow_rect(tf.expand_dims(scales, -1), tf.expand_dims(rect, -2))
        # Extract multiple rectangles from each image.
        batch_len = tf.shape(im)[0]
        num_scales, = tf.unstack(tf.shape(scales))
        box_ind = tf.tile(tf.expand_dims(tf.range(batch_len), 1), [1, num_scales])
        # [b, s, ...] -> [b*s, ...]
        rects, restore = merge_dims(rects, 0, 2)
        box_ind, _ = merge_dims(box_ind, 0, 2)
        crop_ims = tf.image.crop_and_resize(
            im, geom.rect_to_tf_box(rects), box_ind=box_ind,
            crop_size=n_positive_integers(2, im_size),
            extrapolation_value=pad_value)
        # [b*s, ...] -> [b, s, ...]
        crop_ims = restore(crop_ims, 0)
        return crop_ims, rects
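

# Usage sketch (not from the original source): builds a pyramid of crops at
# three scales around a box. The shapes, scale values and rectangle convention
# (normalized min/max coordinates) here are assumptions for illustration.
def _example_crop_pyr():
    im = tf.zeros([2, 240, 320, 3], tf.float32)      # [b, h, w, 3]
    rect = tf.constant([[0.2, 0.2, 0.6, 0.6]] * 2)   # [b, 4]
    scales = tf.constant([0.5, 1.0, 2.0])            # [s]
    crop_ims, rects = crop_pyr(im, rect, im_size=(127, 127), scales=scales)
    # crop_ims: [2, 3, 127, 127, 3]
    return crop_ims, rects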
def _motion_net(x, output_shapes, is_training, weight_decay=0):
    '''
    Args:
        x: [b, t, h, w, c]
        output_shapes: Dict that maps string to iterable of ints.
            e.g. {'response': [5, 17, 17, 1]} for a score-map
            e.g. {'translation': [2]} for translation regression

    Returns:
        Dictionary of outputs with shape [b] + output_shape.
    '''
    assert len(x.shape) == 5
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        normalizer_fn=slim.batch_norm,
                        weights_regularizer=slim.l2_regularizer(weight_decay)):
        with slim.arg_scope([slim.batch_norm], is_training=is_training):
            # https://github.com/tensorflow/models/blob/master/research/slim/nets/alexnet.py
            x, unmerge = helpers.merge_dims(x, 0, 2)  # Merge time into batch.
            # Input size must be 103 = 11 + (47 - 1) * 2 (for stride 2) or
            # 195 = 11 + (47 - 1) * 4 (for stride 4).
            x = slim.conv2d(x, 64, [11, 11], 4, padding='VALID', scope='conv1')
            # 47 = 3 + (23 - 1) * 2
            x = slim.max_pool2d(x, [3, 3], 2, scope='pool1')
            x = unmerge(x, axis=0)  # Un-merge time from batch.
            # Concatenate the images over time.
            x = tf.concat(tf.unstack(x, axis=1), axis=-1)
            x = slim.conv2d(x, 192, [5, 5], scope='conv2')
            # 23 = 3 + (11 - 1) * 2
            x = slim.max_pool2d(x, [3, 3], 2, scope='pool2')
            x = slim.conv2d(x, 384, [3, 3], scope='conv3')
            x = slim.conv2d(x, 384, [3, 3], scope='conv4')
            x = slim.conv2d(x, 256, [3, 3], scope='conv5')
            # 11 = 3 + (5 - 1) * 2
            x = slim.max_pool2d(x, [3, 3], 2, scope='pool5')
            # Spatial dim 5
            # Add fully-connected layer.
            x = slim.conv2d(x, 4096, [5, 5], padding='VALID', scope='fc6')
            x = tf.squeeze(x, axis=(-3, -2))
            x = slim.fully_connected(x, 4096, scope='fc7')

            # Regress to score map.
            y = {}
            for k in output_shapes.keys():
                with tf.variable_scope('head_{}'.format(k)):
                    y[k] = x
                    output_dim = np.asscalar(np.prod(output_shapes[k]))
                    y[k] = slim.fully_connected(y[k], output_dim, scope='fc8',
                                                activation_fn=None, normalizer_fn=None)
                    if len(output_shapes[k]) > 1:
                        y[k] = helpers.split_dims(y[k], axis=-1, shape=output_shapes[k])
            return y
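

# Usage sketch (hypothetical shapes, not from the original source): the spatial
# input size must satisfy the arithmetic in the comments above; 195 x 195 works
# for the stride-4 conv1 (195 -> 47 -> 23* -> 11 -> 5 -> 1).
def _example_motion_net():
    x = tf.zeros([8, 2, 195, 195, 3], tf.float32)  # [b, t, h, w, c]
    outputs = _motion_net(x, output_shapes={'translation': [2], 'scale': [1]},
                          is_training=True)
    # outputs['translation']: [8, 2], outputs['scale']: [8, 1]
    return outputs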
def merge_batch_dims(x):
    '''Merges all dimensions except the last three.

    Returns:
        (merged, restore_fn)
    '''
    x = as_tensor(x)
    ndim = len(x.value.shape)
    assert ndim >= 4
    # Merge all dimensions except the last three.
    value, restore_fn = helpers.merge_dims(x.value, None, -3)
    y = Tensor(value, x.fields)
    return y, partial_pixelwise(restore_fn, axis=0)
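

# Usage sketch (assumed shapes; assumes `as_tensor` accepts a plain tf.Tensor
# and that the returned restore function is applied directly to the result):
# flattens [b, s, h, w, c] to [b*s, h, w, c] so that an ordinary conv can be
# applied, then restores the leading dims.
def _example_merge_batch_dims():
    x = as_tensor(tf.zeros([4, 3, 17, 17, 32]))  # [b, s, h, w, c]
    merged, restore = merge_batch_dims(x)        # merged.value: [12, 17, 17, 32]
    y = restore(merged)                          # y.value: [4, 3, 17, 17, 32]
    return y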
def _embed_net(
        x, is_training, trainable, variables_collections,
        weight_decay=0,
        name='embed',
        # Additional arguments:
        arch='alexnet',
        arch_params=None,
        extra_conv_enable=False,
        extra_conv_params=None):
    '''
    Args:
        x: Image of which to compute features. Shape [..., h, w, c]

    Returns:
        Output of network, intermediate layers, variable scope of feature net.
        The variables in the feature scope can be loaded from a pre-trained model.
    '''
    with tf.name_scope(name) as scope:
        arch_params = arch_params or {}
        extra_conv_params = extra_conv_params or {}
        weight_decay = float(weight_decay)
        try:
            func = feature_nets.BY_NAME[arch]
        except KeyError:
            raise ValueError('unknown architecture: {}'.format(arch))

        x = cnn.as_tensor(x)
        num_dims = len(x.value.shape)
        if num_dims > 4:
            merged, unmerge = helpers.merge_dims(x.value, 0, num_dims - 3)
            x = cnn.Tensor(merged, x.fields)

        with tf.variable_scope('feature') as feature_vs:
            x, end_points = func(x, is_training, trainable, variables_collections,
                                 weight_decay=weight_decay,
                                 **arch_params)
            if extra_conv_enable:
                with tf.variable_scope('extra'):
                    x = _extra_conv(x, is_training, trainable, variables_collections,
                                    **extra_conv_params)
        if num_dims > 4:
            x = cnn.Tensor(unmerge(x.value, 0), x.fields)
        return x, end_points, feature_vs
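

# Usage sketch (hypothetical shapes): computes features for a batch of image
# pairs; the leading [b, t] dims beyond rank 4 are merged into the batch and
# restored internally.
def _example_embed_net():
    x = tf.zeros([8, 2, 127, 127, 3], tf.float32)  # [..., h, w, c]
    feat, end_points, feature_vs = _embed_net(
        x, is_training=True, trainable=True, variables_collections=None)
    return feat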
def _draw_rectangles(im, gt, gt_is_valid=None, pred=None, name='draw_rectangles'):
    '''
    Args:
        im: Tensor [..., h, w, c]
        gt: Tensor [..., 4]
        gt_is_valid: Tensor [...] or None
        pred: Tensor [..., 4] or None
    '''
    with tf.name_scope(name) as scope:
        im = tf.convert_to_tensor(im)
        num_batch_dims = len(im.shape) - 3
        assert len(gt.shape) == num_batch_dims + 1
        if gt_is_valid is not None:
            assert len(gt_is_valid.shape) == num_batch_dims
        if pred is not None:
            assert len(pred.shape) == num_batch_dims + 1

        if im.dtype != tf.float32:
            im = tf.image.convert_image_dtype(im, tf.float32)
        if gt_is_valid is not None:
            # Replace invalid ground-truth rects with the unit rect.
            gt = tf.where(tf.broadcast_to(tf.expand_dims(gt_is_valid, -1), gt.shape),
                          gt, tf.broadcast_to(geom.unit_rect(), gt.shape))
        rects = [gt]
        if pred is not None:
            rects.append(pred)
        rects = tf.stack(rects, axis=-2)  # [..., num_rects, 4]
        # Flatten batch to draw boxes.
        if num_batch_dims > 1:
            im, restore_im = helpers.merge_dims(im, 0, num_batch_dims)
            rects, _ = helpers.merge_dims(rects, 0, num_batch_dims)
        im = tf.image.draw_bounding_boxes(im, geom.rect_to_tf_box(rects))
        if num_batch_dims > 1:
            im = restore_im(im, axis=0)
        im = tf.image.convert_image_dtype(im, tf.uint8, saturate=True)
        return im
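

# Usage sketch (hypothetical values): overlays ground-truth and predicted boxes
# on a batch of video frames, e.g. for a tf.summary.image visualization.
def _example_draw_rectangles():
    im = tf.zeros([4, 8, 120, 160, 3], tf.uint8)  # [b, t, h, w, c]
    gt = tf.zeros([4, 8, 4], tf.float32)          # [b, t, 4]
    gt_is_valid = tf.ones([4, 8], tf.bool)        # [b, t]
    pred = tf.zeros([4, 8, 4], tf.float32)
    return _draw_rectangles(im, gt, gt_is_valid=gt_is_valid, pred=pred)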
def load_and_resize_images(image_files, resize=False, size=None, method='bilinear',
                           name='load_and_resize_images'):
    '''
    Args:
        image_files: Tensor of type string with shape `[k[0], ..., k[n-1]]`.
        size: Tuple (h, w).
            If `resize` is true, images will be resized to this `size`.
            If `resize` is false and `size` is specified, the shape will be set
            with `set_shape`. Otherwise the shape of the image tensor will be
            left unspecified.

    Returns:
        Tensor of type float32 with shape `[k[0], ..., k[n-1], h, w, 3]`.
        If `resize` is false, then the image files must all be the same size.
    '''
    with tf.name_scope(name) as scope:
        # Flatten the batch dims to apply map_fn, then restore them.
        image_files, restore_fn = helpers.merge_dims(image_files, None, None)
        images = tf.map_fn(
            functools.partial(_load_and_resize_image, resize=resize, size=size, method=method),
            image_files, dtype=tf.float32)
        images = restore_fn(images, 0)
        return images
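

# Usage sketch (hypothetical file names): loads a [b, t] grid of image files
# into a single [b, t, h, w, 3] tensor. Running this requires the files to
# exist; only graph construction is shown here.
def _example_load_and_resize_images():
    files = tf.constant([['a/0.jpg', 'a/1.jpg'],
                         ['b/0.jpg', 'b/1.jpg']])  # [b, t]
    images = load_and_resize_images(files, resize=True, size=(240, 320))
    # images: [2, 2, 240, 320, 3]
    return images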
def _output_net(r, v, output_shapes, is_training, weight_decay=0,
                use_response=True, use_images=True):
    '''
    Args:
        r: Response. [b, s, hr, wr, c]
        v: Motion. [b, t, hv, wv, c]
        output_shapes: Dict that maps string to iterable of ints.
            e.g. {'response': [5, 17, 17, 1]} for a score-map
            e.g. {'translation': [2], 'scale': [1]} for translation regression

    Returns:
        Dictionary of outputs with shape [b] + output_shape.
    '''
    assert len(r.shape) == 5
    # with slim.arg_scope([slim.conv2d, slim.fully_connected, _res_conv2d, _res_fc],
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=dict(is_training=is_training),
                        weights_regularizer=slim.l2_regularizer(weight_decay)):
        x = []
        if use_response:
            with tf.variable_scope('preproc_response'):
                # Perform same operation on each scale.
                r, unmerge = helpers.merge_dims(r, 0, 2)  # Merge scale into batch.
                # Spatial dim 17
                r = slim.conv2d(r, 32, 3, padding='SAME', scope='conv1')
                r = slim.max_pool2d(r, 3, stride=2, padding='SAME', scope='pool1')
                # Spatial dim 9
                r = slim.conv2d(r, 64, 3, padding='SAME', scope='conv2')
                r = slim.max_pool2d(r, 3, stride=2, padding='SAME', scope='pool2')
                # Spatial dim 5
                r = unmerge(r, axis=0)  # Un-merge scale from batch.
                # Concatenate channels of all scales.
                r = tf.concat(tf.unstack(r, axis=1), axis=-1)
                x.append(r)
        if use_images:
            with tf.variable_scope('preproc_motion'):
                # https://github.com/tensorflow/models/blob/master/research/slim/nets/alexnet.py
                v, unmerge = helpers.merge_dims(v, 0, 2)  # Merge time into batch.
                # To determine size, work backwards from output.
                # Input size must be
                # 103 = 11 + (47 - 1) * 2 (for stride 2) or
                # 195 = 11 + (47 - 1) * 4 (for stride 4)
                v = slim.conv2d(v, 32, [11, 11], 4, padding='VALID', scope='conv1')  # was 64
                # Must be 47 = 3 + (23 - 1) * 2
                v = slim.max_pool2d(v, [3, 3], 2, padding='VALID', scope='pool1')
                v = unmerge(v, axis=0)  # Un-merge time from batch.
                # Concatenate the images over time.
                v = tf.concat(tf.unstack(v, axis=1), axis=-1)
                v = slim.conv2d(v, 48, [5, 5], padding='SAME', scope='conv2')  # was 192
                # Must be 23 = 3 + (11 - 1) * 2
                v = slim.max_pool2d(v, [3, 3], 2, padding='VALID', scope='pool2')
                # TODO: Use residual connections here? Be careful with relu and bnorm.
                v = slim.conv2d(v, 64, [3, 3], padding='SAME', scope='conv3')  # was 384
                # v = _res_conv2d(v, [3, 3], scope='conv4')  # was 384
                # v = _res_conv2d(v, [3, 3], scope='conv5')  # was 256
                v = slim.conv2d(v, 64, [3, 3], scope='conv4')  # was 384
                v = slim.conv2d(v, 64, [3, 3], scope='conv5')  # was 256
                # Must be 11 = 3 + (5 - 1) * 2
                v = slim.max_pool2d(v, [3, 3], 2, padding='VALID', scope='pool5')
                # Spatial dim 5
                x.append(v)

        # Concatenate appearance response and/or motion description.
        x = tf.concat(x, axis=-1)
        with tf.variable_scope('output'):
            x = slim.conv2d(x, 512, [5, 5], padding='VALID', scope='fc1')
            # Spatial dim 1
            x = tf.squeeze(x, axis=(-2, -3))
            # x = _res_fc(x, scope='fc2')
            # x = _res_fc(x, scope='fc3')
            x = slim.fully_connected(x, 512, scope='fc2')

        # Regress to each output.
        y = {}
        for k in output_shapes.keys():
            with tf.variable_scope('head_{}'.format(k)):
                y[k] = x
                output_dim = np.asscalar(np.prod(output_shapes[k]))
                y[k] = slim.fully_connected(y[k], output_dim, scope='fc1',
                                            activation_fn=None, normalizer_fn=None)
                if len(output_shapes[k]) > 1:
                    y[k] = helpers.split_dims(y[k], axis=-1, shape=output_shapes[k])
        return y
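

# Usage sketch (hypothetical shapes, not from the original source): combines a
# multi-scale response map with a motion stream. The spatial sizes follow the
# arithmetic in the comments above: 17 -> 9 -> 5 for the response and
# 195 -> 47 -> 23 -> 11 -> 5 for the stride-4 motion input.
def _example_output_net():
    r = tf.zeros([8, 3, 17, 17, 32], tf.float32)   # [b, s, hr, wr, c]
    v = tf.zeros([8, 2, 195, 195, 3], tf.float32)  # [b, t, hv, wv, c]
    outputs = _output_net(r, v, output_shapes={'translation': [2], 'scale': [1]},
                          is_training=True)
    # outputs['translation']: [8, 2], outputs['scale']: [8, 1]
    return outputs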
def all_pixel_pairs(template, search, is_training, trainable=True,
                    operation='mul', reduce_channels=True, use_mean=True,
                    use_batch_norm=False, learn_gain=False, gain_init=1,
                    scope='all_pixel_pairs'):
    '''
    Args:
        template: cnn.Tensor with shape [n, h_t, w_t, c]
        search: cnn.Tensor with shape [n, s, h_s, w_s, c]

    Returns:
        cnn.Tensor with shape [n, s, h_s - h_t + 1, w_s - w_t + 1, 1].
    '''
    with tf.variable_scope(scope, 'all_pixel_pairs'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)
        template_size = template.value.shape[-3:-1].as_list()
        num_channels = template.value.shape[-1].value

        # Break template into 1x1 patches.
        # Then "convolve" (multiply) each with the search image.
        t = template.value
        s = search.value
        # template becomes: [n, 1, ...,   1,   1, h_t, w_t, c]
        # search becomes:   [n, s, ..., h_s, w_s,   1,   1, c]
        t = tf.expand_dims(t, 1)
        t = helpers.expand_dims_n(t, -4, 2)
        s = helpers.expand_dims_n(s, -2, 2)
        if operation == 'mul':
            p = t * s
        elif operation == 'abs_diff':
            p = tf.abs(t - s)
        else:
            raise ValueError('unknown operation: "{}"'.format(operation))

        # if reduce_channels:
        #     if use_mean:
        #         p = tf.reduce_mean(p, axis=-1, keepdims=True)
        #     else:
        #         p = tf.reduce_sum(p, axis=-1, keepdims=True)

        # Merge the spatial dimensions of the template into features.
        # response becomes: [n, ..., h_s, w_s, h_t * w_t * c]
        p, _ = helpers.merge_dims(p, -3, None)
        pairs = cnn.Tensor(p, search.fields)

        # TODO: This initialization could be too small?
        normalizer = 1 / (np.prod(template_size) ** 2 * num_channels) if use_mean else 1
        weights_shape = template_size + [np.prod(template_size) * num_channels, 1]
        weights = tf.get_variable('weights', weights_shape, tf.float32,
                                  initializer=tf.constant_initializer(normalizer),
                                  trainable=trainable)
        # TODO: Support depthwise_conv2d (keep channels).
        pairs, restore = cnn.merge_batch_dims(pairs)
        response = cnn.nn_conv2d(pairs, weights, strides=[1, 1, 1, 1], padding='VALID')
        response = restore(response)

        return _calibrate(response, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
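

# Usage sketch (hypothetical shapes): correlates every template pixel with
# every search pixel, then learns a linear combination over template positions
# via the VALID conv above. With a 5x5 template and 17x17 search image, the
# response is 13x13 (17 - 5 + 1).
def _example_all_pixel_pairs():
    template = cnn.as_tensor(tf.zeros([4, 5, 5, 16]))     # [n, h_t, w_t, c]
    search = cnn.as_tensor(tf.zeros([4, 3, 17, 17, 16]))  # [n, s, h_s, w_s, c]
    response = all_pixel_pairs(template, search, is_training=True)
    # response value: [4, 3, 13, 13, 1]
    return response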