def resize_shortest_edge(image, image_shape, size):
    """Resize ``image`` so its shorter side becomes ``size``, keeping aspect ratio.

    Args:
        image: a single image, forwarded to ``uint8_resize_bicubic``.
        image_shape: shape tensor of ``image``; only entries 0 and 1 are used
            (the caller in this file passes the ``hwc`` JPEG shape).
        size: target length of the shorter of the first two dimensions.

    Returns:
        The result of ``uint8_resize_bicubic`` for the computed target shape.
    """
    dims = tf.cast(image_shape, tf.float32)

    def _first_dim_longer():
        # Dimension 1 is the short one: pin it to `size`, scale dimension 0.
        return tf.cast([dims[0] / dims[1] * size, size], tf.int32)

    def _second_dim_longer_or_equal():
        # Dimension 0 is the short one (or both are equal): pin it to `size`.
        return tf.cast([size, dims[1] / dims[0] * size], tf.int32)

    first_is_longer = tf.greater(image_shape[0], image_shape[1])
    target_shape = tf.cond(first_is_longer,
                           _first_dim_longer,
                           _second_dim_longer_or_equal)
    return uint8_resize_bicubic(image, target_shape)
def build_graph(self, image, label):
    """Build a two-branch spatial-transformer classifier and return its cost.

    Two localisation networks (variable scopes ``STN1`` and ``STN2``) each
    regress six affine parameters. These are applied to a fixed
    ``WARP_TARGET_SIZE`` x ``WARP_TARGET_SIZE`` grid of ``(y, x, 1)`` points to
    get sampling coordinates for ``GridSample``. The two warped results are
    concatenated on axis 3 and classified into 19 logits.

    Args:
        image: input batch; the code indexes it as a 4-D tensor and reads
            channels 0 and 1 (inline notes call it ``bhw2``).
        label: class ids, passed as ``labels`` to sparse softmax cross-entropy.

    Returns:
        tf.Tensor: tensor named ``cost`` (weight decay + cross-entropy).
    """
    grid_points = np.array(
        [(row, col, 1)
         for row in range(WARP_TARGET_SIZE)
         for col in range(WARP_TARGET_SIZE)],
        dtype='float32')
    xys = tf.constant(grid_points, dtype=tf.float32, name='xys')  # p x 3

    image = image / 255.0 - 0.5  # bhw2

    def get_stn(image):
        """Predict an affine transform for ``image`` and return the warped result."""
        localisation = (
            LinearWrap(image)
            .AvgPooling('downsample', 2)
            .Conv2D('conv0', 20, 5, padding='VALID')
            .MaxPooling('pool0', 2)
            .Conv2D('conv1', 20, 5, padding='VALID')
            .FullyConnected('fc1', 32)
            .FullyConnected(
                'fct', 6, activation=tf.identity,
                kernel_initializer=tf.constant_initializer(),
                bias_initializer=tf.constant_initializer(
                    [1, 0, HALF_DIFF, 0, 1, HALF_DIFF])))
        # six outputs per sample = one 2x3 affine matrix
        affine = localisation()
        affine = tf.reshape(affine, [-1, 2, 3], name='affine')  # bx2x3
        affine = tf.reshape(tf.transpose(affine, [2, 0, 1]), [3, -1])  # 3 x (bx2)
        coords = tf.matmul(xys, affine)
        coords = tf.reshape(
            coords, [WARP_TARGET_SIZE, WARP_TARGET_SIZE, -1, 2])
        coords = tf.transpose(coords, [2, 0, 1, 3], 'sampled_coords')  # b h w 2
        return GridSample('warp', [image, coords], borderMode='constant')

    with argscope([Conv2D, FullyConnected], activation=tf.nn.relu):
        with tf.variable_scope('STN1'):
            sampled1 = get_stn(image)
        with tf.variable_scope('STN2'):
            sampled2 = get_stn(image)

    def _channels_stacked(batch):
        # Put channel 0 above channel 1 along axis 1: b x 2h x w
        return tf.concat([batch[:, :, :, 0], batch[:, :, :, 1]], 1)

    # Tensorboard visualisation: original next to both warped versions.
    with tf.name_scope('visualization'):
        paddings = [[0, 0],
                    [HALF_DIFF, HALF_DIFF],
                    [HALF_DIFF, HALF_DIFF],
                    [0, 0]]
        padded1 = tf.pad(sampled1, paddings)
        padded2 = tf.pad(sampled2, paddings)
        img_orig = _channels_stacked(image)
        transform1 = _channels_stacked(padded1)
        transform2 = _channels_stacked(padded2)
        stacked = tf.concat([img_orig, transform1, transform2], 2, 'viz')
        tf.summary.image('visualize',
                         tf.expand_dims(stacked, -1), max_outputs=30)

    sampled = tf.concat([sampled1, sampled2], 3, 'sampled_concat')
    classifier = (
        LinearWrap(sampled)
        .FullyConnected('fc1', 256, activation=tf.nn.relu)
        .FullyConnected('fc2', 128, activation=tf.nn.relu)
        .FullyConnected('fct', 19, activation=tf.identity))
    logits = classifier()
    # Created only for its name; the returned tensor is not used here.
    tf.nn.softmax(logits, name='prob')

    per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(per_sample_loss, name='cross_entropy_loss')

    top1_hit = tf.nn.in_top_k(logits, label, 1)
    wrong = tf.cast(tf.logical_not(top1_hit), tf.float32,
                    name='incorrect_vector')
    summary.add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    summary.add_moving_summary(cost, wd_cost)
    return tf.add_n([wd_cost, cost], name='cost')
def GridSample(inputs, borderMode='repeat'):
    """
    Bilinearly sample images at the given real-valued coordinates.

    This was described in the paper:
    `Spatial Transformer Networks <http://arxiv.org/abs/1506.02025>`_.

    This is equivalent to `torch.nn.functional.grid_sample`,
    up to some non-trivial coordinate transformation.

    This implementation returns pixel value at pixel (1, 1) for a floating
    point coordinate (1.0, 1.0). Note that this may not be what you need.

    Args:
        inputs (list): [images, coords]. images has shape NHWC.
            coords has shape (N, H', W', 2), where each pair of the last
            dimension is a (y, x) real-value coordinate.
        borderMode: either "repeat" or "constant" (zero-filled)

    Returns:
        tf.Tensor: a tensor named ``output`` of shape (N, H', W', C).
    """
    image, mapping = inputs
    assert image.get_shape().ndims == 4 and mapping.get_shape().ndims == 4
    hwc = image.get_shape().as_list()[1:]
    assert None not in hwc, \
        "Images in GridSample layer must have fully-defined shape"
    assert borderMode in ('repeat', 'constant')

    raw_mapping = mapping
    # Negative coordinates are clamped to 0 before locating the neighbours.
    mapping = tf.maximum(mapping, 0.0)
    floor_yx = tf.floor(mapping)
    ceil_yx = floor_yx + 1

    frac = mapping - floor_yx
    one_minus_frac = 1.0 - frac  # bxh2xw2x2

    floor_y, floor_x = tf.split(floor_yx, 2, 3)
    ceil_y, ceil_x = tf.split(ceil_yx, 2, 3)

    # The two "mixed" corners of the interpolation cell.
    floor_y_ceil_x = tf.concat([floor_y, ceil_x], 3)
    ceil_y_floor_x = tf.concat([ceil_y, floor_x], 3)

    frac_y, frac_x = tf.split(frac, 2, 3)
    inv_frac_y, inv_frac_x = tf.split(one_minus_frac, 2, 3)

    # Each corner is weighted by the opposite fractional area.
    corner_terms = [
        sample(image, floor_yx) * inv_frac_x * inv_frac_y,
        sample(image, ceil_yx) * frac_x * frac_y,
        sample(image, floor_y_ceil_x) * inv_frac_y * frac_x,
        sample(image, ceil_y_floor_x) * frac_y * inv_frac_x,
    ]
    result = tf.add_n(corner_terms, name='sampled')

    if borderMode == 'constant':
        # Zero out every position whose original coordinate is outside the image.
        upper = tf.constant([hwc[0] - 1, hwc[1] - 1], dtype=tf.float32)
        not_below = tf.greater_equal(raw_mapping, 0.0)
        not_above = tf.less_equal(raw_mapping, upper)
        inside = tf.logical_and(not_below, not_above)  # bxh2xw2x2
        inside = tf.reduce_all(inside, [3])  # bxh2xw2 boolean
        inside = tf.expand_dims(inside, 3)
        result = result * tf.cast(inside, tf.float32)
    return tf.identity(result, name='output')
def build_graph(self, image, label):
    """This function should build the model which takes the input variables (defined above)
    and return cost at the end."""
    # Convolution inputs are assumed to be NHWC, so append a channel axis,
    # then shift the pixel values so they are centred at zero.
    image = tf.expand_dims(image, 3) * 2 - 1

    # `argscope` sets default arguments for every Conv2D created inside it:
    # 3x3 kernels, 32 filters, ReLU activation.
    # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html
    with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu, filters=32):
        # LinearWrap is syntactic sugar for chaining layers.
        # See tutorial at https://tensorpack.readthedocs.io/tutorial/symbolic.html
        network = (
            LinearWrap(image)
            .Conv2D('conv0')
            .MaxPooling('pool0', 2)
            .Conv2D('conv1')
            .Conv2D('conv2')
            .MaxPooling('pool1', 2)
            .Conv2D('conv3')
            .FullyConnected('fc0', 512, activation=tf.nn.relu)
            .Dropout('dropout', rate=0.5)
            .FullyConnected('fc1', 10, activation=tf.identity))
        logits = network()

    # Per-sample loss (a vector of length B), then its mean.
    per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(per_sample_loss, name='cross_entropy_loss')

    top1_hit = tf.nn.in_top_k(predictions=logits, targets=label, k=1)
    correct = tf.cast(top1_hit, tf.float32, name='correct')
    accuracy = tf.reduce_mean(correct, name='accuracy')

    # Moving summaries monitor training error & accuracy as moving averages.
    # The values are automatically
    # 1. written to tensorboard
    # 2. written to stat.json
    # 3. printed after each epoch
    # `tf.summary.scalar` also works, but moving summary has other benefits.
    # See tutorial at https://tensorpack.readthedocs.io/tutorial/summary.html
    train_error = tf.reduce_mean(1 - correct, name='train_error')
    summary.add_moving_summary(train_error, accuracy)

    # Weight decay on every W (weight matrix) of the fc layers, selected by
    # regex. The cost can be defined by any other method if you prefer.
    l2_on_fc = regularize_cost('fc.*/W', tf.nn.l2_loss)
    wd_cost = tf.multiply(1e-5, l2_on_fc, name='regularize_loss')
    total_cost = tf.add_n([wd_cost, cost], name='total_cost')
    summary.add_moving_summary(cost, wd_cost, total_cost)

    # Monitor histogram of all weights (conv and fc layers) in tensorboard.
    summary.add_param_summary(('.*/W', ['histogram', 'rms']))
    # The function should return the total cost to be optimized.
    return total_cost
def build_graph(self, input_img, target_img):
    """Build the regression graph and return the mean-squared-error cost.

    The target is cast to float32 and converted to grayscale. The prediction
    comes from ``self.make_prediction(input_img)`` and is stored on
    ``self.prediction_img`` under the tensor name ``prediction_img``.

    Returns:
        tf.Tensor: the MSE between target and prediction, named ``total_costs``.
    """
    gray_target = tf.image.rgb_to_grayscale(tf.cast(target_img, tf.float32))

    predicted = self.make_prediction(input_img)
    self.prediction_img = tf.identity(predicted, name='prediction_img')

    mse = tf.losses.mean_squared_error(
        gray_target,
        self.prediction_img,
        reduction=tf.losses.Reduction.MEAN)
    return tf.identity(mse, name='total_costs')
def image_preprocess(self, image):
    """Normalise ``image`` with fixed per-channel mean and std (scaled by 255).

    The image is cast to float32 if it is not already. The constants are
    listed in RGB order and are reversed when ``self.image_bgr`` is truthy.

    Returns:
        tf.Tensor: ``(image - mean * 255) / (std * 255)``.
    """
    with tf.name_scope('image_preprocess'):
        if image.dtype.base_dtype != tf.float32:
            image = tf.cast(image, tf.float32)

        rgb_mean = [0.485, 0.456, 0.406]
        rgb_std = [0.229, 0.224, 0.225]
        # Reverse the channel order of the statistics for BGR input.
        channel_mean = rgb_mean[::-1] if self.image_bgr else rgb_mean
        channel_std = rgb_std[::-1] if self.image_bgr else rgb_std

        image_mean = tf.constant(channel_mean, dtype=tf.float32) * 255.
        image_std = tf.constant(channel_std, dtype=tf.float32) * 255.
        return (image - image_mean) / image_std
def build_graph(self, image, label):
    """This function should build the model which takes the input variables
    and return cost at the end"""
    # Convolution inputs are assumed to be NHWC, so append a channel axis,
    # then shift the pixel values so they are centred at zero.
    image = tf.expand_dims(image, 3) * 2 - 1

    # `argscope` sets default arguments for every tf.layers.conv2d call made
    # inside it: 'same' padding and ReLU activation.
    with argscope([tf.layers.conv2d], padding='same', activation=tf.nn.relu):
        net = tf.layers.conv2d(image, 32, 3, name='conv0')
        net = tf.layers.max_pooling2d(net, 2, 2, padding='valid')
        net = tf.layers.conv2d(net, 32, 3, name='conv1')
        net = tf.layers.conv2d(net, 32, 3, name='conv2')
        net = tf.layers.max_pooling2d(net, 2, 2, padding='valid')
        net = tf.layers.conv2d(net, 32, 3, name='conv3')
        net = tf.layers.flatten(net)
        net = tf.layers.dense(net, 512, activation=tf.nn.relu, name='fc0')
        net = tf.layers.dropout(net, rate=0.5, training=self.training)
    logits = tf.layers.dense(net, 10, activation=tf.identity, name='fc1')

    # Per-sample loss (a vector of length B), then its mean.
    per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(per_sample_loss, name='cross_entropy_loss')

    top1_hit = tf.nn.in_top_k(logits, label, 1)
    correct = tf.cast(top1_hit, tf.float32, name='correct')
    accuracy = tf.reduce_mean(correct, name='accuracy')

    # Moving summaries monitor training error & accuracy as moving averages.
    # The values are automatically
    # 1. written to tensorboard
    # 2. written to stat.json
    # 3. printed after each epoch
    train_error = tf.reduce_mean(1 - correct, name='train_error')
    summary.add_moving_summary(train_error, accuracy)

    # Weight decay on every kernel (weight matrix) of the fc layers, selected
    # by regex. The cost can be defined by any other method if you prefer.
    l2_on_fc = regularize_cost('fc.*/kernel', tf.nn.l2_loss)
    wd_cost = tf.multiply(1e-5, l2_on_fc, name='regularize_loss')
    total_cost = tf.add_n([wd_cost, cost], name='total_cost')
    summary.add_moving_summary(cost, wd_cost, total_cost)

    # Monitor histogram of all weights (conv and fc layers) in tensorboard.
    summary.add_param_summary(('.*/kernel', ['histogram', 'rms']))
    # The function should return the total cost to be optimized.
    return total_cost
def make_prediction(self, img):
    """Convolve the grayscale version of ``img`` with a 3x3 variable ``filter``.

    The image is cast to float32 and converted to grayscale first. The
    variable is initialised with the stencil below (centre -4, four direct
    neighbours 1, corners 0) and applied with stride 1 and 'SAME' padding.

    Returns:
        tf.Tensor: the result of ``tf.nn.conv2d``.
    """
    gray = tf.image.rgb_to_grayscale(tf.cast(img, tf.float32))

    stencil = [[0., 1., 0.],
               [1., -4., 1.],
               [0., 1., 0.]]
    # Expand to the 3 x 3 x 1 x 1 nesting used as the variable initializer.
    initial_kernel = [[[[weight]] for weight in row] for row in stencil]
    kernel = tf.get_variable(
        'filter', dtype=tf.float32, initializer=initial_kernel)

    return tf.nn.conv2d(gray, kernel, strides=[1, 1, 1, 1], padding='SAME')
def build_graph(self, image, label):
    """Build the ConvNet classifier and return its cost.

    Dropout rate is 0.5 when ``self.training`` is truthy and 0.0 otherwise.
    When ``tf.test.is_gpu_available()`` is true the input is transposed with
    ``[0, 3, 1, 2]`` and layers run as ``channels_first``; otherwise
    ``channels_last`` is used.

    Args:
        image: input batch.
        label: class ids, passed as ``labels`` to sparse softmax cross-entropy.

    Returns:
        tf.Tensor: tensor named ``cost`` (cross-entropy + fc weight decay).
    """
    drop_rate = tf.constant(0.5 if self.training else 0.0)

    if self.training:
        tf.summary.image("train_image", image, 10)

    on_gpu = tf.test.is_gpu_available()
    data_format = 'channels_first' if on_gpu else 'channels_last'
    if on_gpu:
        image = tf.transpose(image, [0, 3, 1, 2])

    image = image / 4.0  # just to make range smaller
    with argscope(Conv2D, activation=BNReLU, use_bias=False, kernel_size=3):
        with argscope([Conv2D, MaxPooling, BatchNorm], data_format=data_format):
            network = (
                LinearWrap(image)
                .Conv2D('conv1.1', filters=64)
                .Conv2D('conv1.2', filters=64)
                .MaxPooling('pool1', 3, stride=2, padding='SAME')
                .Conv2D('conv2.1', filters=128)
                .Conv2D('conv2.2', filters=128)
                .MaxPooling('pool2', 3, stride=2, padding='SAME')
                .Conv2D('conv3.1', filters=128, padding='VALID')
                .Conv2D('conv3.2', filters=128, padding='VALID')
                .FullyConnected('fc0', 1024 + 512, activation=tf.nn.relu)
                .Dropout(rate=drop_rate)
                .FullyConnected('fc1', 512, activation=tf.nn.relu)
                .FullyConnected('linear', out_dim=self.cifar_classnum))
            logits = network()

    per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(per_sample_loss, name='cross_entropy_loss')

    top1_hit = tf.nn.in_top_k(predictions=logits, targets=label, k=1)
    correct = tf.cast(top1_hit, tf.float32, name='correct')
    # monitor training accuracy
    add_moving_summary(tf.reduce_mean(correct, name='accuracy'))

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(4e-4),
                              name='regularize_loss')
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W
    return tf.add_n([cost, wd_cost], name='cost')
def build_graph(self, *inputs):
    """Build the detection graph: backbone, RPN and RoI heads.

    Positional ``inputs`` are paired with ``self.input_names``. If a
    ``gt_masks_packed`` entry is present it is unpacked with
    ``unpackbits_masks``, cast to uint8 and stored as ``gt_masks``.

    Returns:
        In training, the tensor ``total_cost`` (RPN losses + head losses +
        weight decay). In inference nothing is returned; the method only
        checks that every tensor listed in
        ``self.get_inference_tensor_names()[1]`` exists in the graph.

    Raises:
        KeyError: in inference, if a declared output tensor is not defined.
    """
    named = dict(zip(self.input_names, inputs))
    if "gt_masks_packed" in named:
        packed = named.pop("gt_masks_packed")
        named["gt_masks"] = tf.cast(
            unpackbits_masks(packed), tf.uint8, name="gt_masks")

    image = self.preprocess(named['image'])  # 1CHW

    features = self.backbone(image)
    anchor_inputs = {
        key: value for key, value in named.items()
        if key.startswith('anchor_')
    }
    proposals, rpn_losses = self.rpn(image, features, anchor_inputs)  # inputs?

    # Ground-truth entries that are present, in a fixed order.
    targets = [
        named[key]
        for key in ('gt_boxes', 'gt_labels', 'gt_masks')
        if key in named
    ]
    gt_boxes_area = tf.reduce_mean(
        tf_area(named["gt_boxes"]), name='mean_gt_box_area')
    add_moving_summary(gt_boxes_area)
    head_losses = self.roi_heads(image, features, proposals, targets)

    if self.training:
        wd_cost = regularize_cost(
            '.*/W', l2_regularizer(cfg.TRAIN.WEIGHT_DECAY), name='wd_cost')
        total_cost = tf.add_n(
            rpn_losses + head_losses + [wd_cost], 'total_cost')
        add_moving_summary(total_cost, wd_cost)
        return total_cost

    # Inference: check that the model defines the tensors it declares.
    # For existing models, they are defined in
    # "fastrcnn_predictions(name_scope='output')"
    graph = tf.get_default_graph()
    scope = graph.get_name_scope()
    for name in self.get_inference_tensor_names()[1]:
        full_name = '/'.join([scope, name]) if scope else name
        try:
            graph.get_tensor_by_name(full_name + ':0')
        except KeyError:
            raise KeyError(
                "Your model does not define the tensor '{}' in inference context."
                .format(full_name))
def build_graph(self, input_img_bytes):
    """Build a graph mapping PNG-encoded inputs to PNG-encoded predictions.

    Each input string is decoded to a 3-channel uint8 image and passed through
    ``self.make_prediction``. The result is clipped to [0, 255], cast to uint8
    and re-encoded as PNG. The output tensor is named
    ``prediction_img_bytes``; nothing is returned.
    """
    def _decode(png_bytes):
        return tf.image.decode_png(png_bytes, channels=3)

    # Prepare input: PNG-encoded strings -> images.
    decoded = tf.map_fn(_decode, input_img_bytes, dtype=tf.uint8)

    predicted = self.make_prediction(decoded)

    # Outputs should be PNG-encoded strings again.
    as_uint8 = tf.cast(tf.clip_by_value(predicted, 0, 255), tf.uint8)
    encoded = tf.map_fn(tf.image.encode_png, as_uint8, dtype=tf.string)
    tf.identity(encoded, name='prediction_img_bytes')
def training_mapper(byte):
    """Decode one JPEG string into an augmented 224x224 training image.

    A random crop window is sampled (aspect ratio 0.75-1.33, area 8%-100%,
    up to 10 attempts). If at least two entries of the sampled size equal the
    JPEG shape, the image is instead resized with ``resize_shortest_edge`` and
    centre-cropped. Lighting noise, a random horizontal flip and a reversal
    of axis 2 (to BGR) follow.

    Args:
        byte: encoded JPEG string tensor.

    Returns:
        The augmented image tensor.
    """
    jpeg_shape = tf.image.extract_jpeg_shape(byte)  # hwc
    crop_begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
        jpeg_shape,
        bounding_boxes=tf.zeros(shape=[0, 0, 4]),
        min_object_covered=0,
        aspect_ratio_range=[0.75, 1.33],
        area_range=[0.08, 1.0],
        max_attempts=10,
        use_image_if_no_bounding_boxes=True)
    num_equal_dims = tf.reduce_sum(
        tf.cast(tf.equal(crop_size, jpeg_shape), tf.int32))
    is_bad = num_equal_dims >= 2

    def _random_crop():
        # Decode only the sampled window, then resize it to the target size.
        top, left, _ = tf.unstack(crop_begin)
        height, width, _ = tf.unstack(crop_size)
        window = tf.stack([top, left, height, width])
        cropped = tf.image.decode_and_crop_jpeg(
            byte, window, channels=3, **JPEG_OPT)
        return uint8_resize_bicubic(cropped, [224, 224])

    def _resize_and_center_crop():
        # Fallback: decode everything, resize the short edge, centre-crop.
        full = tf.image.decode_jpeg(
            tf.reshape(byte, shape=[]), 3, **JPEG_OPT)
        full = resize_shortest_edge(full, jpeg_shape, 224)
        return center_crop(full, 224)

    image = tf.cond(is_bad, _resize_and_center_crop, _random_crop)
    # TODO other imgproc
    eigenvalues = np.array([0.2175, 0.0188, 0.0045], dtype='float32') * 255.0
    eigenvectors = np.array(
        [[-0.5675, 0.7192, 0.4009],
         [-0.5808, -0.0045, -0.8140],
         [-0.5836, -0.6948, 0.4203]],
        dtype='float32')
    image = lighting(image, 0.1, eigval=eigenvalues, eigvec=eigenvectors)
    image = tf.image.random_flip_left_right(image)
    return tf.reverse(image, axis=[2])  # to BGR
def build_graph(self, image, label):
    """Build the ConvNet, attach weight/activation visualisations, return cost.

    Args:
        image: input batch; it is rescaled with ``* 2 - 1`` and given a
            trailing channel axis.
        label: class ids, passed as ``labels`` to sparse softmax cross-entropy.

    Returns:
        tf.Tensor: tensor named ``total_cost`` (weight decay + cross-entropy).
    """
    image = tf.expand_dims(image * 2 - 1, 3)

    with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
        c0 = Conv2D('conv0', image)
        p0 = MaxPooling('pool0', c0, 2)
        c1 = Conv2D('conv1', p0)
        c2 = Conv2D('conv2', c1)
        p1 = MaxPooling('pool1', c2, 2)
        c3 = Conv2D('conv3', p1)
        hidden = FullyConnected('fc0', c3, 512, nl=tf.nn.relu)
        hidden = Dropout('dropout', hidden, 0.5)
        logits = FullyConnected('fc1', hidden, out_dim=10, nl=tf.identity)

    with tf.name_scope('visualizations'):
        # Kernels first, then activations, for each conv layer in order.
        for tag, conv in (('conv0', c0), ('conv1', c1),
                          ('conv2', c2), ('conv3', c3)):
            visualize_conv_weights(conv.variables.W, tag)
            visualize_conv_activations(conv, tag)

        # NOTE(review): indentation was lost in this file; the input summary
        # is assumed to belong inside the 'visualizations' scope - confirm.
        tf.summary.image('input', (image + 1.0) * 128., 3)

    per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(per_sample_loss, name='cross_entropy_loss')

    # Created only for its name; the returned tensor is not used here.
    top1_hit = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32)
    tf.reduce_mean(top1_hit, name='accuracy')

    wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
    return tf.add_n([wd_cost, cost], name='total_cost')
def build_graph(self, image, label):
    """Build the ConvNet classifier and return its cost.

    Args:
        image: input batch; rescaled with ``/ 128.0 - 1`` before the network.
        label: class ids, passed as ``labels`` to sparse softmax cross-entropy.

    Returns:
        tf.Tensor: tensor named ``cost`` (cross-entropy + fc weight decay).
    """
    image = image / 128.0 - 1

    with argscope(Conv2D, activation=BNReLU, use_bias=False):
        network = (
            LinearWrap(image)
            .Conv2D('conv1', 24, 5, padding='VALID')
            .MaxPooling('pool1', 2, padding='SAME')
            .Conv2D('conv2', 32, 3, padding='VALID')
            .Conv2D('conv3', 32, 3, padding='VALID')
            .MaxPooling('pool2', 2, padding='SAME')
            .Conv2D('conv4', 64, 3, padding='VALID')
            .Dropout('drop', rate=0.5)
            .FullyConnected('fc0', 512,
                            bias_initializer=tf.constant_initializer(0.1),
                            activation=tf.nn.relu)
            .FullyConnected('linear', units=10))
        logits = network()
    # Created only for its name; the returned tensor is not used here.
    tf.nn.softmax(logits, name='output')

    top1_hit = tf.nn.in_top_k(logits, label, 1)
    accuracy = tf.cast(top1_hit, tf.float32)
    add_moving_summary(tf.reduce_mean(accuracy, name='accuracy'))

    per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    cost = tf.reduce_mean(per_sample_loss, name='cross_entropy_loss')

    # weight decay on all W of fc layers
    wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-5))
    add_moving_summary(cost, wd_cost)

    add_param_summary(('.*/W', ['histogram', 'rms']))  # monitor W
    return tf.add_n([cost, wd_cost], name='cost')
def sample(img, coords):
    """
    Gather pixels of ``img`` at integer-valued coordinates.

    Args:
        img: bxhxwxc
        coords: bxh2xw2x2. each coordinate is (y, x) integer.
            Out of boundary coordinates will be clipped.
    Return:
        bxh2xw2xc image
    """
    in_dims = img.get_shape().as_list()[1:]  # h, w, c
    batch_size = tf.shape(img)[0]
    out_dims = coords.get_shape().as_list()[1:3]  # h2, w2
    assert None not in out_dims, coords.get_shape()

    # Clamp into the valid index range (this is the borderMode==repeat rule).
    upper = tf.constant([in_dims[0] - 1, in_dims[1] - 1], dtype=tf.float32)
    yx = tf.cast(tf.clip_by_value(coords, 0., upper), tf.int32)

    # Prefix every (y, x) pair with its batch index so gather_nd can address
    # the (b, y, x) location directly.
    batch_ids = tf.reshape(tf.range(batch_size, dtype=tf.int32), [-1, 1, 1, 1])
    batch_ids = tf.tile(batch_ids, [1, out_dims[0], out_dims[1], 1])  # bxh2xw2x1
    byx = tf.concat([batch_ids, yx], axis=3)  # bxh2xw2x3
    return tf.gather_nd(img, byx)
def uint8_resize_bicubic(image, shape):
    """Bicubically resize a single image and return it as uint8.

    The image is wrapped in a one-element batch for
    ``tf.image.resize_bicubic``. The result is clipped to [0, 255] before the
    uint8 cast, and the batch dimension is removed again.
    """
    resized = tf.image.resize_bicubic([image], shape)
    clipped = tf.clip_by_value(resized, 0, 255)
    as_uint8 = tf.cast(clipped, tf.uint8)
    return as_uint8[0]
def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'):
    """Return a float32 vector marking samples whose label is not in the top-k.

    ``topk`` and ``name`` default to the same values as the other
    ``prediction_incorrect`` definition in this file, so both can be called
    the same way. Existing calls that pass all four arguments are unaffected.

    Args:
        logits: prediction scores, passed to ``tf.nn.in_top_k``.
        label: target class ids, passed to ``tf.nn.in_top_k``.
        topk (int): the ``k`` of the top-k test.
        name (str): name of the returned tensor.

    Returns:
        tf.Tensor: float32 tensor, 1.0 where the label is NOT among the top-k
        predictions and 0.0 where it is.
    """
    in_top_k = tf.nn.in_top_k(logits, label, topk)
    return tf.cast(tf.logical_not(in_top_k), tf.float32, name=name)
def roi_heads(self, image, features, proposals, targets):
    """Build the Fast R-CNN (and optional Mask R-CNN) heads on ``features[0]``.

    Args:
        image: the preprocessed image; ``tf.shape(image)[2:]`` is used as h,w.
        features: backbone outputs; only ``features[0]`` is used.
        proposals: object with a ``boxes`` attribute. In training it is
            replaced by the result of ``sample_fast_rcnn_targets``.
        targets: sequence starting with ``gt_boxes, gt_labels``; index 2 is
            read as ``gt_masks`` when ``cfg.MODE_MASK`` is set in training.

    Returns:
        list: the head losses in training, an empty list in inference (where
        the outputs are created as named tensors instead).
    """
    image_shape2d = tf.shape(image)[2:]  # h,w
    featuremap = features[0]
    # Converts image-space box coordinates to feature-map coordinates.
    inv_stride = 1.0 / cfg.RPN.ANCHOR_STRIDE

    gt_boxes, gt_labels, *_ = targets

    if self.training:
        # Sample proposal boxes in training; inference uses all proposals.
        proposals = sample_fast_rcnn_targets(
            proposals.boxes, gt_boxes, gt_labels)

    # The boxes used to crop RoIs.
    rois = roi_align(featuremap, proposals.boxes * inv_stride, 14)
    feature_fastrcnn = resnet_conv5(
        rois, cfg.BACKBONE.RESNET_NUM_BLOCKS[-1])  # nxcx7x7
    # feature_fastrcnn (C5) is kept so the mask branch can share it.
    pooled = GlobalAvgPooling(
        'gap', feature_fastrcnn, data_format='channels_first')
    label_logits, box_logits = fastrcnn_outputs(
        'fastrcnn', pooled, cfg.DATA.NUM_CATEGORY)

    reg_weights = tf.constant(cfg.FRCNN.BBOX_REG_WEIGHTS, dtype=tf.float32)
    fastrcnn_head = FastRCNNHead(
        proposals, box_logits, label_logits, gt_boxes, reg_weights)

    if not self.training:
        boxes = fastrcnn_head.decoded_output_boxes()
        boxes = clip_boxes(boxes, image_shape2d, name='fastrcnn_all_boxes')
        scores = fastrcnn_head.output_scores(name='fastrcnn_all_scores')
        final_boxes, final_scores, final_labels = fastrcnn_predictions(
            boxes, scores, name_scope='output')

        if cfg.MODE_MASK:
            mask_rois = roi_align(featuremap, final_boxes * inv_stride, 14)
            feature_maskrcnn = resnet_conv5(
                mask_rois, cfg.BACKBONE.RESNET_NUM_BLOCKS[-1])
            mask_logits = maskrcnn_upXconv_head(
                'maskrcnn', feature_maskrcnn,
                cfg.DATA.NUM_CATEGORY, 0)  # #result x #cat x 14x14
            # Pick, for each result, the mask of its predicted label
            # (labels are shifted by one to index the mask channels).
            result_ids = tf.range(tf.size(final_labels))
            class_ids = tf.cast(final_labels, tf.int32) - 1
            indices = tf.stack([result_ids, class_ids], axis=1)
            final_mask_logits = tf.gather_nd(
                mask_logits, indices)  # #resultx14x14
            tf.sigmoid(final_mask_logits, name='output/masks')
        return []

    all_losses = fastrcnn_head.losses()

    if cfg.MODE_MASK:
        gt_masks = targets[2]
        # maskrcnn loss
        # In training, mask branch shares the same C5 feature.
        fg_feature = tf.gather(feature_fastrcnn, proposals.fg_inds())
        mask_logits = maskrcnn_upXconv_head(
            'maskrcnn', fg_feature,
            cfg.DATA.NUM_CATEGORY, num_convs=0)  # #fg x #cat x 14x14

        fg_targets = crop_and_resize(
            tf.expand_dims(gt_masks, 1),
            proposals.fg_boxes(),
            proposals.fg_inds_wrt_gt, 14,
            pad_border=False)  # nfg x 1x14x14
        fg_targets = tf.squeeze(fg_targets, 1, 'sampled_fg_mask_targets')
        all_losses.append(
            maskrcnn_loss(mask_logits, proposals.fg_labels(), fg_targets))
    return all_losses
def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'):
    """Return a float32 vector marking samples whose label is not in the top-k.

    Args:
        logits: prediction scores, passed to ``tf.nn.in_top_k``.
        label: target class ids, passed to ``tf.nn.in_top_k``.
        topk (int): the ``k`` of the top-k test.
        name (str): name given to the final cast op.

    Returns:
        tf.Tensor: float32 tensor, 1.0 where the label is NOT among the top-k
        predictions and 0.0 where it is.
    """
    with tf.name_scope('prediction_incorrect'):
        hit = tf.nn.in_top_k(logits, label, topk)
        miss = tf.logical_not(hit)
    # NOTE(review): indentation was lost in this file; the cast is assumed to
    # sit outside the name scope so the result is named exactly `name` - confirm.
    return tf.cast(miss, tf.float32, name=name)
def roi_heads(self, image, features, proposals, targets):
    """Build the FPN Fast R-CNN / Cascade R-CNN (and optional mask) heads.

    Args:
        image: the preprocessed image; ``tf.shape(image)[2:]`` is used as h,w.
        features: exactly five feature maps (asserted, "P23456"); the first
            four are used for RoI align.
        proposals: object with a ``boxes`` attribute. In training it is
            replaced by the result of ``sample_fast_rcnn_targets``.
        targets: sequence starting with ``gt_boxes, gt_labels``; index 2 is
            read as ``gt_masks`` when ``cfg.MODE_MASK`` is set in training.

    Returns:
        list: the head losses in training, an empty list in inference (where
        the outputs are created as named tensors instead).
    """
    image_shape2d = tf.shape(image)[2:]  # h,w
    assert len(features) == 5, "Features have to be P23456!"
    roi_levels = features[:4]
    gt_boxes, gt_labels, *_ = targets

    if self.training:
        proposals = sample_fast_rcnn_targets(
            proposals.boxes, gt_boxes, gt_labels)

    fastrcnn_head_func = getattr(model_frcnn, cfg.FPN.FRCNN_HEAD_FUNC)
    if cfg.FPN.CASCADE:
        def roi_func(boxes):
            return multilevel_roi_align(roi_levels, boxes, 7)

        fastrcnn_head = CascadeRCNNHead(
            proposals, roi_func, fastrcnn_head_func,
            (gt_boxes, gt_labels), image_shape2d, cfg.DATA.NUM_CATEGORY)
    else:
        roi_feature_fastrcnn = multilevel_roi_align(
            roi_levels, proposals.boxes, 7)
        head_feature = fastrcnn_head_func('fastrcnn', roi_feature_fastrcnn)
        label_logits, box_logits = fastrcnn_outputs(
            'fastrcnn/outputs', head_feature, cfg.DATA.NUM_CATEGORY)
        reg_weights = tf.constant(
            cfg.FRCNN.BBOX_REG_WEIGHTS, dtype=tf.float32)
        fastrcnn_head = FastRCNNHead(
            proposals, box_logits, label_logits, gt_boxes, reg_weights)

    if not self.training:
        boxes = fastrcnn_head.decoded_output_boxes()
        boxes = clip_boxes(boxes, image_shape2d, name='fastrcnn_all_boxes')
        scores = fastrcnn_head.output_scores(name='fastrcnn_all_scores')
        final_boxes, final_scores, final_labels = fastrcnn_predictions(
            boxes, scores, name_scope='output')

        if cfg.MODE_MASK:
            # Cascade inference needs roi transform with refined boxes.
            roi_feature_maskrcnn = multilevel_roi_align(
                roi_levels, final_boxes, 14)
            maskrcnn_head_func = getattr(model_mrcnn, cfg.FPN.MRCNN_HEAD_FUNC)
            mask_logits = maskrcnn_head_func(
                'maskrcnn', roi_feature_maskrcnn,
                cfg.DATA.NUM_CATEGORY)  # #fg x #cat x 28 x 28
            # Pick, for each result, the mask of its predicted label
            # (labels are shifted by one to index the mask channels).
            result_ids = tf.range(tf.size(final_labels))
            class_ids = tf.cast(final_labels, tf.int32) - 1
            indices = tf.stack([result_ids, class_ids], axis=1)
            final_mask_logits = tf.gather_nd(
                mask_logits, indices)  # #resultx28x28
            tf.sigmoid(final_mask_logits, name='output/masks')
        return []

    all_losses = fastrcnn_head.losses()

    if cfg.MODE_MASK:
        gt_masks = targets[2]
        # maskrcnn loss
        roi_feature_maskrcnn = multilevel_roi_align(
            roi_levels, proposals.fg_boxes(), 14,
            name_scope='multilevel_roi_align_mask')
        maskrcnn_head_func = getattr(model_mrcnn, cfg.FPN.MRCNN_HEAD_FUNC)
        mask_logits = maskrcnn_head_func(
            'maskrcnn', roi_feature_maskrcnn,
            cfg.DATA.NUM_CATEGORY)  # #fg x #cat x 28 x 28

        fg_targets = crop_and_resize(
            tf.expand_dims(gt_masks, 1),
            proposals.fg_boxes(),
            proposals.fg_inds_wrt_gt, 28,
            pad_border=False)  # fg x 1x28x28
        fg_targets = tf.squeeze(fg_targets, 1, 'sampled_fg_mask_targets')
        all_losses.append(
            maskrcnn_loss(mask_logits, proposals.fg_labels(), fg_targets))
    return all_losses
def lighting(image, std, eigval, eigvec):
    """Add random lighting noise to ``image``.

    A random 3-vector is drawn from a normal distribution with standard
    deviation ``std``, scaled element-wise by ``eigval`` and projected through
    ``eigvec``. The resulting three increments are broadcast-added to the
    image in float32.

    For integer images the shifted values are saturated to the valid range of
    the dtype before casting back. Without this, a value pushed below the
    minimum or above the maximum (e.g. outside [0, 255] for uint8) would wrap
    around or give a platform-dependent result in the cast.
    ``uint8_resize_bicubic`` in this file clips before its cast for the same
    reason.

    Args:
        image: image tensor whose last dimension has size 3.
        std (float): standard deviation of the random coefficients.
        eigval: 3 scale factors for the random coefficients.
        eigvec: 3x3 matrix applied to the scaled coefficients.

    Returns:
        tf.Tensor: the perturbed image, with the same dtype as ``image``.
    """
    out_dtype = image.dtype
    coeffs = tf.random_normal(shape=[3], stddev=std) * eigval
    inc = tf.matmul(eigvec, tf.reshape(coeffs, [3, 1]))
    shifted = tf.cast(image, tf.float32) + tf.reshape(inc, [3])
    if out_dtype.is_integer:
        # Saturate instead of overflowing when converting back to the
        # integer pixel type.
        shifted = tf.clip_by_value(
            shifted, float(out_dtype.min), float(out_dtype.max))
    return tf.cast(shifted, out_dtype)