def _get_pyramid_pooling_arguments(
    self, crop_size, output_stride, image_grid, image_pooling_crop_size=None):
  """Gets arguments for pyramid pooling.

  Args:
    crop_size: A list of two integers, [crop_height, crop_width], specifying
      the whole patch crop size.
    output_stride: Integer, output stride value for extracted features.
    image_grid: A list of two integers, [image_grid_height, image_grid_width],
      specifying the grid size over which the pyramid pooling will be
      performed.
    image_pooling_crop_size: A list of two integers, [crop_height, crop_width],
      specifying the crop size for image pooling operations. Note that we
      decouple the whole patch crop_size from image_pooling_crop_size, as one
      could perform image pooling with a different crop size.

  Returns:
    A tuple of two lists: [resize_height, resize_width] and
      [pooled_height, pooled_width].
  """
  resize_height = utils.scale_dimension(crop_size[0], 1. / output_stride)
  resize_width = utils.scale_dimension(crop_size[1], 1. / output_stride)
  # If image_pooling_crop_size is not specified, use crop_size.
  if image_pooling_crop_size is None:
    image_pooling_crop_size = crop_size
  pooled_height = utils.scale_dimension(
      image_pooling_crop_size[0], 1. / (output_stride * image_grid[0]))
  pooled_width = utils.scale_dimension(
      image_pooling_crop_size[1], 1. / (output_stride * image_grid[1]))
  return ([resize_height, resize_width], [pooled_height, pooled_width])
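# A minimal worked example of the values this method produces, assuming
# utils.scale_dimension follows the usual DeepLab convention
# int((dim - 1) * scale + 1); the standalone scale_dimension below is a
# local stand-in for illustration only.

def scale_dimension(dim, scale):
  # Scale a spatial dimension while preserving the "+1" alignment used for
  # odd-sized crops such as 513x513.
  return int((float(dim) - 1.0) * scale + 1.0)

crop_size = [513, 513]
output_stride = 16
image_grid = [4, 4]

resize = [scale_dimension(d, 1. / output_stride) for d in crop_size]
pooled = [scale_dimension(d, 1. / (output_stride * g))
          for d, g in zip(crop_size, image_grid)]
print(resize, pooled)  # [33, 33] [9, 9]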
def _apply_conv_operation(self, net, operation, stride,
                          is_from_original_input):
  """Applies the predicted conv operation to net."""
  # Strided operations only apply to tensors coming straight from the
  # original cell inputs; anything else has already been downsampled.
  if stride > 1 and not is_from_original_input:
    stride = 1
  input_filters = net.shape[3]
  filter_size = self._filter_size
  if 'separable' in operation:
    # e.g. 'separable_5x5_2' -> kernel_size=5, num_layers=2.
    num_layers = int(operation.split('_')[-1])
    kernel_size = int(operation.split('x')[0][-1])
    for layer_num in range(num_layers):
      net = tf.nn.relu(net)
      net = slim.separable_conv2d(
          net,
          filter_size,
          kernel_size,
          depth_multiplier=1,
          scope='separable_{0}x{0}_{1}'.format(kernel_size, layer_num + 1),
          stride=stride)
      net = slim.batch_norm(
          net, scope='bn_sep_{0}x{0}_{1}'.format(kernel_size, layer_num + 1))
      stride = 1
  elif 'atrous' in operation:
    kernel_size = int(operation.split('x')[0][-1])
    net = tf.nn.relu(net)
    if stride == 2:
      # Emulate a strided atrous convolution by bilinearly downsampling the
      # input and applying a dense (rate=1) convolution.
      scaled_height = scale_dimension(tf.shape(net)[1], 0.5)
      scaled_width = scale_dimension(tf.shape(net)[2], 0.5)
      net = resize_bilinear(net, [scaled_height, scaled_width], net.dtype)
      net = slim.conv2d(net, filter_size, kernel_size, rate=1,
                        scope='atrous_{0}x{0}'.format(kernel_size))
    else:
      net = slim.conv2d(net, filter_size, kernel_size, rate=2,
                        scope='atrous_{0}x{0}'.format(kernel_size))
    net = slim.batch_norm(net, scope='bn_atr_{0}x{0}'.format(kernel_size))
  elif operation in ['none']:
    # Identity, unless the stride or filter count has to change.
    if stride > 1 or (input_filters != filter_size):
      net = tf.nn.relu(net)
      net = slim.conv2d(net, filter_size, 1, stride=stride, scope='1x1')
      net = slim.batch_norm(net, scope='bn_1')
  elif 'pool' in operation:
    # e.g. 'avg_pool_3x3' -> pooling_type='avg', pooling_shape=3.
    pooling_type = operation.split('_')[0]
    pooling_shape = int(operation.split('_')[-1].split('x')[0])
    if pooling_type == 'avg':
      net = slim.avg_pool2d(net, pooling_shape, stride=stride,
                            padding='SAME')
    elif pooling_type == 'max':
      net = slim.max_pool2d(net, pooling_shape, stride=stride,
                            padding='SAME')
    else:
      raise ValueError('Unimplemented pooling type: {}'.format(pooling_type))
    if input_filters != filter_size:
      net = slim.conv2d(net, filter_size, 1, stride=1, scope='1x1')
      net = slim.batch_norm(net, scope='bn_1')
  else:
    raise ValueError('Unimplemented operation: {}'.format(operation))
  if operation != 'none':
    net = self._apply_drop_path(net)
  return net
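# A small sketch of the operation-string parsing the method above relies on.
# Operation names follow the NASNet search-space convention (e.g.
# 'separable_5x5_2', 'atrous_3x3', 'avg_pool_3x3', 'none'); parse_separable
# and parse_pool are hypothetical helpers that mirror the inline parsing.

def parse_separable(operation):
  # 'separable_5x5_2' -> (kernel_size=5, num_layers=2)
  num_layers = int(operation.split('_')[-1])
  kernel_size = int(operation.split('x')[0][-1])
  return kernel_size, num_layers

def parse_pool(operation):
  # 'avg_pool_3x3' -> (pooling_type='avg', pooling_shape=3)
  pooling_type = operation.split('_')[0]
  pooling_shape = int(operation.split('_')[-1].split('x')[0])
  return pooling_type, pooling_shape

assert parse_separable('separable_5x5_2') == (5, 2)
assert parse_pool('avg_pool_3x3') == ('avg', 3)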
def _build_nas_base(images,
                    cell,
                    backbone,
                    num_classes,
                    hparams,
                    global_pool=False,
                    reuse=None,
                    scope=None,
                    final_endpoint=None):
  """Constructs a NAS model.

  Args:
    images: A tensor of size [batch, height, width, channels].
    cell: Cell structure used in the network.
    backbone: Backbone structure used in the network. A list of integers in
      which value 0 means "output_stride=4", value 1 means "output_stride=8",
      value 2 means "output_stride=16", and value 3 means "output_stride=32".
    num_classes: Number of classes to predict.
    hparams: Hyperparameters needed to construct the network.
    global_pool: If True, we perform global average pooling before computing
      the logits. Set to True for image classification, False for dense
      prediction.
    reuse: Whether or not the network and its variables should be reused. To
      be able to reuse, 'scope' must be given.
    scope: Optional variable_scope.
    final_endpoint: The endpoint to construct the network up to.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
    end_points: A dictionary from components of the network to the
      corresponding activation.
  """
  with tf.variable_scope(scope, 'nas', [images], reuse=reuse):
    end_points = {}

    def add_and_check_endpoint(endpoint_name, net):
      end_points[endpoint_name] = net
      return final_endpoint and (endpoint_name == final_endpoint)

    net, cell_outputs = _nas_stem(images)
    if add_and_check_endpoint('Stem', net):
      return net, end_points

    # Run the cells.
    filter_scaling = 1.0
    for cell_num in range(len(backbone)):
      stride = 1
      if cell_num == 0:
        if backbone[0] == 1:
          stride = 2
          filter_scaling *= hparams.filter_scaling_rate
      else:
        if backbone[cell_num] == backbone[cell_num - 1] + 1:
          stride = 2
          filter_scaling *= hparams.filter_scaling_rate
        elif backbone[cell_num] == backbone[cell_num - 1] - 1:
          scaled_height = scale_dimension(net.shape[1].value, 2)
          scaled_width = scale_dimension(net.shape[2].value, 2)
          net = resize_bilinear(net, [scaled_height, scaled_width], net.dtype)
          filter_scaling /= hparams.filter_scaling_rate
      net = cell(net,
                 scope='cell_{}'.format(cell_num),
                 filter_scaling=filter_scaling,
                 stride=stride,
                 prev_layer=cell_outputs[-2],
                 cell_num=cell_num)
      if add_and_check_endpoint('Cell_{}'.format(cell_num), net):
        return net, end_points
      cell_outputs.append(net)
    net = tf.nn.relu(net)

    if global_pool:
      # Global average pooling.
      net = tf.reduce_mean(net, [1, 2], name='global_pool', keepdims=True)
    if num_classes is not None:
      net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                        normalizer_fn=None, scope='logits')
      end_points['predictions'] = slim.softmax(net, scope='predictions')
    return net, end_points
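# A pure-Python trace of the stride/filter-scaling schedule driven by the
# backbone list in the loop above. trace_backbone is a hypothetical helper
# written only for illustration, and filter_scaling_rate=2.0 is an assumed
# hparams value; cell application itself is stubbed out.

def trace_backbone(backbone, filter_scaling_rate=2.0):
  filter_scaling = 1.0
  trace = []
  for cell_num in range(len(backbone)):
    stride = 1
    if cell_num == 0:
      if backbone[0] == 1:
        stride = 2
        filter_scaling *= filter_scaling_rate
    else:
      if backbone[cell_num] == backbone[cell_num - 1] + 1:
        stride = 2  # downsample: output_stride doubles
        filter_scaling *= filter_scaling_rate
      elif backbone[cell_num] == backbone[cell_num - 1] - 1:
        # Upsample via bilinear resize instead of striding.
        filter_scaling /= filter_scaling_rate
    trace.append((cell_num, stride, filter_scaling))
  return trace

# backbone [0, 0, 1, 2, 2, 1]: stay at output_stride=4 for two cells, go
# down to 8 and then 16, hold, and come back up to 8.
print(trace_backbone([0, 0, 1, 2, 2, 1]))
# [(0, 1, 1.0), (1, 1, 1.0), (2, 2, 2.0), (3, 2, 4.0), (4, 1, 4.0),
#  (5, 1, 2.0)]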