def predict(self, preprocessed_inputs, true_image_shapes):
  """Runs the forward pass and returns the raw (unpostprocessed) predictions.

  Features are extracted from the input batch, anchors are generated for the
  resulting feature maps, and the box predictor is applied to those maps.

  As a side effect, `self._anchors` is set to the concatenated anchor
  box list. It has to be populated here before the postprocess or loss
  functions are called.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] image tensor.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
      the form [height, width, channels] indicating the shapes of true images
      in the resized images, as resized images can be padded with zeros. It is
      not read by this method.

  Returns:
    prediction_dict: a dictionary holding "raw" prediction tensors:
      1) preprocessed_inputs: the [batch, height, width, channels] image
        tensor.
      2) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
        box_code_dimension] containing predicted boxes (axis 2 of the
        concatenated predictor output is squeezed away).
      3) class_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_anchors, num_classes+1] containing class predictions
        (logits) for each of the anchors. Note that this tensor *includes*
        background class predictions (at class index 0).
      4) feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i].
      5) anchors: 2-D float tensor of shape [num_anchors, 4] containing the
        generated anchors in normalized coordinates.
  """
  with tf.variable_scope(None, self._extract_features_scope,
                         [preprocessed_inputs]):
    feature_maps = self._feature_extractor.extract_features(
        preprocessed_inputs)

  spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
  input_shape = shape_utils.combined_static_and_dynamic_shape(
      preprocessed_inputs)
  anchor_grids = self._anchor_generator.generate(
      spatial_dims, im_height=input_shape[1], im_width=input_shape[2])
  self._anchors = box_list_ops.concatenate(anchor_grids)

  raw_predictions = self._box_predictor.predict(
      feature_maps, self._anchor_generator.num_anchors_per_location())

  # Merge the per-feature-map outputs along the anchor axis.
  stacked_box_encodings = tf.concat(raw_predictions['box_encodings'], axis=1)
  box_encodings = tf.squeeze(stacked_box_encodings, axis=2)
  class_predictions_with_background = tf.concat(
      raw_predictions['class_predictions_with_background'], axis=1)

  return {
      'preprocessed_inputs': preprocessed_inputs,
      'box_encodings': box_encodings,
      'class_predictions_with_background': class_predictions_with_background,
      'feature_maps': feature_maps,
      'anchors': self._anchors.get()
  }
def _batch_decode_oriented(self, oriented_box_encodings):
  """Decodes oriented box encodings for a whole batch against the anchors.

  Args:
    oriented_box_encodings: A float32 tensor of shape [batch_size,
      num_anchors, box_code_size] containing box encodings.

  Returns:
    decoded_boxes: A float32 tensor of shape [batch_size, num_anchors, 4, 2]
      containing the decoded boxes.
  """
  encodings_shape = shape_utils.combined_static_and_dynamic_shape(
      oriented_box_encodings)
  batch_size = encodings_shape[0]
  num_anchors = encodings_shape[1]

  # Replicate the anchors once per image, then flatten the batch and anchor
  # axes so that one box-coder call decodes every box.
  anchors_with_batch_axis = tf.expand_dims(self.anchors.get(), 0)
  batched_anchors = tf.tile(anchors_with_batch_axis, [batch_size, 1, 1])
  flat_anchors = box_list.BoxList(
      tf.reshape(batched_anchors, [-1, self._box_coder.code_size]))
  flat_encodings = tf.reshape(
      oriented_box_encodings, [-1, self._box_coder.code_size_oriented])

  decoded = self._box_coder.decode_oriented(flat_encodings, flat_anchors)
  oriented_corners = decoded.get_oriented()
  return tf.reshape(oriented_corners,
                    tf.stack([batch_size, num_anchors, 4, 2]))
def calibration_fn(class_predictions_with_background):
  """Calibrate predictions via 1-d linear interpolation.

  Prediction scores are linearly interpolated using the class-agnostic
  function approximation taken from the enclosing `calibration_config`. Note
  that the 0-indexed background class may also be transformed.

  Args:
    class_predictions_with_background: tf.float32 tensor of shape
      [batch_size, num_anchors, num_classes + 1] containing scores on the
      interval [0,1]. This is usually produced by a sigmoid or softmax layer
      and the result of calling the `predict` method of a detection model.

  Returns:
    tf.float32 tensor of shape [batch_size, num_anchors, num_classes] if
    background class is not present (else shape is
    [batch_size, num_anchors, num_classes + 1]) on the interval [0, 1].
  """
  # The interpolation runs on a flat vector; the input layout is restored
  # afterwards.
  scores_1d = tf.reshape(class_predictions_with_background, shape=[-1])
  x_knots, y_knots = _function_approximation_proto_to_tf_tensors(
      calibration_config.function_approximation.x_y_pairs)
  calibrated_1d = _tf_linear_interp1d(scores_1d, x_knots, y_knots)

  input_shape = shape_utils.combined_static_and_dynamic_shape(
      class_predictions_with_background)
  return tf.reshape(calibrated_1d, shape=input_shape, name='calibrate_scores')
def select_random_box(boxlist, default_box=None, seed=None, scope=None):
  """Selects a random bounding box from a `BoxList`.

  Args:
    boxlist: A BoxList.
    default_box: A [1, 4] float32 tensor. If no boxes are present in
      `boxlist`, this default box will be returned. If None, will use a
      default box of [[-1., -1., -1., -1.]].
    seed: Random seed.
    scope: Name scope.

  Returns:
    bbox: A [1, 4] tensor with a random bounding box.
    valid: A bool tensor indicating whether a valid bounding box is returned
      (True) or whether the default box is returned (False).
  """
  with tf.name_scope(scope, 'SelectRandomBox'):
    bboxes = boxlist.get()
    combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes)
    number_of_boxes = combined_shape[0]

    # Compare against None explicitly. `default_box or ...` would take the
    # truth value of a caller-supplied tensor, which raises for tf.Tensor
    # inputs in graph mode and is ambiguous for multi-element arrays.
    if default_box is None:
      default_box = tf.constant([[-1., -1., -1., -1.]])

    def select_box():
      # Uniformly sample an index in [0, number_of_boxes).
      random_index = tf.random_uniform([],
                                       maxval=number_of_boxes,
                                       dtype=tf.int32,
                                       seed=seed)
      return tf.expand_dims(bboxes[random_index], axis=0), tf.constant(True)

    return tf.cond(
        tf.greater_equal(number_of_boxes, 1),
        true_fn=select_box,
        false_fn=lambda: (default_box, tf.constant(False)))
def nearest_neighbor_upsampling(input_tensor, scale):
  """Nearest neighbor upsampling implementation.

  Maps an input tensor of shape [batch_size, height, width, channels] to
  [batch_size, height * scale, width * scale, channels]. Only reshape and
  broadcasting are used, to keep the implementation TPU compatible.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data.

  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].
  """
  with tf.name_scope('nearest_neighbor_upsampling'):
    in_shape = shape_utils.combined_static_and_dynamic_shape(input_tensor)
    batch_size, height, width, channels = in_shape

    # Add singleton axes after height and width, then broadcast against a
    # tensor of ones to replicate each pixel `scale` times along both axes.
    with_unit_axes = tf.reshape(
        input_tensor, [batch_size, height, 1, width, 1, channels])
    replicator = tf.ones([1, 1, scale, 1, scale, 1],
                         dtype=input_tensor.dtype)
    replicated = with_unit_axes * replicator

    return tf.reshape(
        replicated, [batch_size, height * scale, width * scale, channels])
def _compute_loss(logits, labels, num_classes):
  """Computes mean softmax cross-entropy and accuracy over valid positions.

  `labels` are resized (nearest neighbor, align_corners=True) to the spatial
  size of `logits`. Positions whose label lies outside [0, num_classes) are
  dropped before the loss and the accuracy are computed.

  Args:
    logits: classification logits; dimensions 1 and 2 give the target spatial
      size and the last dimension is the class axis.
    labels: label tensor to be resized to the logits' spatial size.
    num_classes: exclusive upper bound of the valid label ids.

  Returns:
    A pair (mean cross-entropy, mean accuracy) over the valid positions.
  """
  logits_shape = combined_static_and_dynamic_shape(logits)
  resized_labels = tf.image.resize_nearest_neighbor(
      labels, logits_shape[1:3], align_corners=True)

  flat_logits = tf.reshape(logits, [-1, logits_shape[-1]])
  flat_labels = tf.reshape(resized_labels, [-1])

  # Keep only positions carrying a label in [0, num_classes).
  in_range = tf.logical_and(tf.greater_equal(flat_labels, 0),
                            tf.less(flat_labels, num_classes))
  kept_positions = tf.where(in_range)[:, 0]
  kept_logits = tf.gather(flat_logits, kept_positions)
  kept_labels = tf.gather(flat_labels, kept_positions)

  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=kept_labels, logits=kept_logits)

  predicted = tf.argmax(kept_logits, axis=-1, output_type=kept_labels.dtype)
  hits = tf.cast(tf.equal(kept_labels, predicted), tf.float32)

  return tf.reduce_mean(cross_entropy), tf.reduce_mean(hits)
def call(self, input):
  """Applies spatial attention over the last CNN feature map per input step.

  For every (batch, step) entry of `input`, a score map over the spatial
  positions of `self.cnn_fmap[-1]` is computed, normalised with a softmax and
  used to pool the feature map into a glimpse vector. The glimpse is
  concatenated with the input and passed through a dense layer.

  NOTE(review): the reshapes below only line up if the channel count of the
  last feature map, `self.embedding_dim` and the input depth all agree -
  confirm against the layer's construction.

  Args:
    input: tensor unpacked as [batch_size, batch_len, depth].

  Returns:
    Tensor reshaped to [1, -1, self.output_dim].
  """
  cnn_fmaps_lastscale = self.cnn_fmap[-1]
  batch_size, batch_len, depth = shape_utils.combined_static_and_dynamic_shape(
      input)
  # Static shape is required here: feature_h/feature_w/feature_c are used as
  # Python values in the tile/reshape arguments below.
  _, feature_h, feature_w, feature_c = cnn_fmaps_lastscale.get_shape(
  ).as_list()
  # Fold batch and step axes together so each step is handled as one sample.
  ld_output = tf.reshape(input, (batch_size * batch_len, depth))
  ld_output_reshape = tf.reshape(ld_output,
                                 [batch_size * batch_len, 1, 1, depth])
  # 1x1 projection of the input, broadcast over every spatial position.
  ld_output_conv = conv2d(ld_output_reshape,
                          self.embedding_dim,
                          1,
                          activation_fn=None,
                          normalizer_fn=None,
                          scope='compute2d_attention_layer/hs_conv',
                          reuse=tf.AUTO_REUSE)
  ld_output_conv_tile = tf.tile(ld_output_conv, [1, feature_h, feature_w, 1])
  # 3x3 projection of the feature map, repeated once per step.
  cnn_fmap_conv = conv2d(cnn_fmaps_lastscale,
                         self.embedding_dim,
                         3,
                         activation_fn=None,
                         normalizer_fn=None,
                         scope='compute2d_attention_layer/fmap_conv',
                         reuse=tf.AUTO_REUSE)
  cnn_fmap_tile = tf.expand_dims(cnn_fmap_conv, 1)
  cnn_fmap_tile = tf.tile(cnn_fmap_tile, [1, batch_len, 1, 1, 1])
  cnn_fmap_tile = tf.reshape(
      cnn_fmap_tile,
      [batch_size * batch_len, feature_h, feature_w, feature_c])
  g = tf.nn.tanh(tf.add(cnn_fmap_tile, ld_output_conv_tile))
  # One score per spatial position.
  g_conv = conv2d(g,
                  1,
                  1,
                  scope='compute2d_attention_layer/g_conv',
                  activation_fn=None,
                  normalizer_fn=None,
                  reuse=tf.AUTO_REUSE)
  g_conv_reshape = tf.reshape(
      g_conv, [batch_size * batch_len, feature_w * feature_h])
  # Softmax over the flattened spatial positions.
  g_conv_reshape_softmax = tf.nn.softmax(g_conv_reshape)
  # NOTE(review): `mask` is never read in this method.
  mask = tf.reshape(g_conv_reshape_softmax,
                    [batch_size * batch_len, feature_h, feature_w, 1])
  g_tmp = tf.tile(
      tf.reshape(g_conv_reshape_softmax,
                 [batch_size * batch_len, feature_h, feature_w, 1]),
      [1, 1, 1, feature_c])
  # Attention-weighted sum over the height and width axes.
  glimpse = tf.reduce_sum(tf.multiply(cnn_fmap_tile, g_tmp), [1, 2])
  glimpse = tf.reshape(glimpse, [batch_size, batch_len, depth])
  c_h_concat = tf.concat(
      [glimpse, tf.reshape(ld_output, [batch_size, batch_len, depth])],
      axis=-1)
  rnn_output = tf.layers.dense(c_h_concat,
                               self.output_dim,
                               name='compute2d_attention_layer/output_w',
                               reuse=tf.AUTO_REUSE)
  output = tf.reshape(rnn_output, [1, -1, self.output_dim])
  return output
def GenerationLoss(self, predictions_dict, scope=None):
  """Computes the length-masked L1 loss between predicted and reference glyphs.

  Reference glyphs are loaded from 'data/glyphs-325-fonts.npy' and looked up
  with `decoder_targets + 96 * embedding_ids`. The per-step mean absolute
  error is masked with `decoder_lengths`, summed over the batch, divided by
  the batch size and scaled by 0.5.

  Side effects: emits the image summary 'target_glyph1' and the scalar
  summary 'averged_L1_loss'.

  Args:
    predictions_dict: dict that must contain 'glyphs' and 'embedding_ids'.
      NOTE(review): 'glyphs' is presumably [batch_size, batch_len, 32 * 32]
      and 'embedding_ids' must broadcast against the flattened labels -
      confirm against the predictor.
    scope: optional scope name.

  Returns:
    Scalar loss tensor.

  Raises:
    KeyError: if `predictions_dict` lacks 'glyphs' or 'embedding_ids'.
  """
  # The former `assert 'logits' or 'glyphs' in predictions_dict` was always
  # true because it parsed as `'logits' or (...)`. Check the keys that are
  # actually read below.
  for required_key in ('glyphs', 'embedding_ids'):
    if required_key not in predictions_dict:
      raise KeyError(
          '`predictions_dict` is missing required key %r' % required_key)

  with tf.variable_scope(scope, 'Loss', list(predictions_dict.values())):
    glyphs = predictions_dict['glyphs']
    # NOTE(review): the gather/reshape below assume the array is laid out as
    # [num_glyphs, 32 * 32] - confirm against the data file.
    ref_glyphs = tf.constant(np.load('data/glyphs-325-fonts.npy'),
                             dtype=tf.float32)
    labels = self._groundtruth_dict['decoder_targets']
    lengths = self._groundtruth_dict['decoder_lengths']
    batch_size, batch_len = shape_utils.combined_static_and_dynamic_shape(
        labels)

    # Each embedding id selects a block of 96 consecutive reference glyphs.
    labels_indexes = tf.reshape(
        labels,
        [batch_size * batch_len]) + 96 * predictions_dict['embedding_ids']
    targets = tf.gather(ref_glyphs, labels_indexes)

    # Map target values from [-1, 1] to [0, 255] for the image summary.
    targets_for_visual = tf.reshape(
        (targets + 1.0) * 127.5, [batch_size * batch_len, 32, 32, 1])
    tf.summary.image('target_glyph1', targets_for_visual[:20],
                     max_outputs=20)
    targets = tf.reshape(targets, [batch_size, batch_len, 32 * 32])

    with tf.name_scope(scope, 'WeightedL1Loss'):
      raw_losses = tf.reduce_mean(tf.abs(glyphs - targets), axis=[2])
      # True for the time steps that lie inside each sequence's length.
      mask = tf.less(tf.tile([tf.range(batch_len)], [batch_size, 1]),
                     tf.expand_dims(lengths, 1),
                     name='mask')
      masked_losses = tf.multiply(raw_losses,
                                  tf.cast(mask, tf.float32),
                                  name='masked_losses')
      row_losses = tf.reduce_sum(masked_losses, 1, name='row_losses')

      # Per-sequence average, used for monitoring only. The length is clamped
      # to at least 1 so that empty sequences do not produce NaN.
      losses_tmp = tf.truediv(
          row_losses, tf.cast(tf.maximum(lengths, 1), tf.float32))
      loss_for_compare = tf.reduce_mean(losses_tmp)
      tf.summary.scalar('averged_L1_loss', loss_for_compare)

      loss = tf.reduce_sum(row_losses)
      loss = tf.truediv(
          loss, tf.cast(tf.maximum(batch_size, 1), tf.float32))
      l1_loss_tensor = loss * 0.5
  return l1_loss_tensor
def aspp_inference(logits, image):
  '''Upsamples the logits to the image size and returns per-pixel class ids.

  Args:
    logits: (N, H', W', nc), classification logits
    image: (N, H, W, _), image or label, used only for shape inference

  Returns:
    int32 tensor named 'segmentation_output' holding the argmax over the
    last axis of the bilinearly resized logits.
  '''
  target_size = combined_static_and_dynamic_shape(image)[1:3]
  upsampled_logits = tf.image.resize_bilinear(
      logits, target_size, align_corners=True)
  return tf.argmax(upsampled_logits,
                   axis=-1,
                   name='segmentation_output',
                   output_type=tf.int32)
def _match_when_rows_are_empty():
  """Performs matching when the rows of similarity matrix are empty.

  With no rows, every detection is a false positive, so each column is
  assigned -1 to signal that it matches no row.

  Returns:
    matches: int32 tensor indicating the row each column matches to.
  """
  matrix_shape = shape_utils.combined_static_and_dynamic_shape(
      similarity_matrix)
  num_columns = matrix_shape[1]
  return -1 * tf.ones([num_columns], dtype=tf.int32)
def _predict(self, image_features, num_predictions_per_location):
  """Returns all-zero box and class predictions sized from `image_features`.

  Args:
    image_features: feature tensor; dimension 0 is used as the batch size and
      the product of dimensions 1 and 2 as the number of anchors.
    num_predictions_per_location: not read by this method.

  Returns:
    A dict with box encodings of shape [batch_size, num_anchors, 1, 5] and
    class predictions of shape [batch_size, num_anchors, num_classes + 1],
    both filled with zeros.
  """
  feature_shape = shape_utils.combined_static_and_dynamic_shape(
      image_features)
  batch_size = feature_shape[0]
  num_anchors = feature_shape[1] * feature_shape[2]
  code_size = 5

  # A scalar zero computed from the features, added to both outputs so that
  # they are connected to the input tensor in the graph.
  zero = tf.reduce_sum(0 * image_features)

  box_shape = (batch_size, num_anchors, 1, code_size)
  class_shape = (batch_size, num_anchors, self.num_classes + 1)
  box_encodings = zero + tf.zeros(box_shape, dtype=tf.float32)
  class_predictions_with_background = zero + tf.zeros(
      class_shape, dtype=tf.float32)

  return {
      box_predictor.BOX_ENCODINGS: box_encodings,
      box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND:
          class_predictions_with_background
  }
def aspp_features(hlist, num_classes=19, alpha=1.0):
  '''Builds class logits from three backbone features.

  The two coarser features are fused, gated by a pooled sigmoid branch,
  upsampled to the size of the finest feature and added to a logit branch
  computed from the finest feature, weighted by `alpha`.

  NOTE(review): this block was recovered from a whitespace-collapsed source.
  The nesting of the 'top' and 'se' variable scopes and of the final
  argscope determines variable names, so the layout below must be confirmed
  against existing checkpoints before relying on it.

  Args:
    hlist: list of three features, [1/8, 1/16, 1/32].
    num_classes: number of output channels of the logit layers.
    alpha: weight applied to the logits computed from the 1/8 feature.

  Returns:
    Tensor named 'cls_logit' with `num_classes` channels at the spatial size
    of the first feature.
  '''
  h0, h1, h2 = hlist
  shape_h0 = combined_static_and_dynamic_shape(h0)
  shape_h1 = combined_static_and_dynamic_shape(h1)
  with ssdnet_argscope():
    # merge h1 and h2, create 1/16 feature
    h2 = tf.depth_to_space(h2, 2)
    h12 = tf.concat([h1, h2], axis=-1)  # 128
    h12 = Conv2D('h12', h12, 256, 1, activation=BNReLU)
    with tf.variable_scope('top'):
      feat = Conv2D('conv1', h12, 256, 1, activation=BNReLU)
    with tf.variable_scope('se'):
      # Pooled gating branch: pool, 1x1 conv, sigmoid, resize back to the
      # 1/16 feature size.
      s = AvgPooling('avgpool', h12, 49, strides=(16, 20), padding='same')
      s = Conv2D('conv1', s, 256, 1, activation=None, use_bias=True)
      s = tf.sigmoid(s, name='sigmoid')
      s = tf.image.resize_bilinear(s, shape_h1[1:3], align_corners=True)
    feat = tf.multiply(feat, s)
    # Upsample the gated feature to the size of h0.
    feat = tf.image.resize_bilinear(feat, shape_h0[1:3], align_corners=True)
    feat = DWConv('convd', feat, 5)
    feat_l = Conv2D('conv_h0', h0, 128, 1, activation=BNReLU)
    with argscope([Conv2D], use_bias=True):
      feat = Conv2D('logit_up', feat, num_classes, 1)
      feat_l = Conv2D('logit_h0', feat_l, num_classes, 1)
    out = tf.add(feat, alpha * feat_l, name='cls_logit')
  return out
def _match_when_rows_are_non_empty():
  """Performs matching when the rows of similarity matrix are non empty.

  Every column is first assigned its highest-scoring row. If thresholds are
  configured, columns below or between the thresholds are overwritten with
  -1 / -2. If force matching is enabled, each valid row additionally claims
  its best column.

  Returns:
    matches: int32 tensor indicating the row each column matches to.
  """
  # Best row for every column.
  matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

  if self._matched_threshold is not None:
    # Boolean per-column indicators derived from the best score of a column.
    matched_vals = tf.reduce_max(similarity_matrix, 0)
    below_unmatched_threshold = tf.greater(self._unmatched_threshold,
                                           matched_vals)
    between_thresholds = tf.logical_and(
        tf.greater_equal(matched_vals, self._unmatched_threshold),
        tf.greater(self._matched_threshold, matched_vals))

    # The two groups swap their marker values depending on the configuration.
    if self._negatives_lower_than_unmatched:
      below_marker, between_marker = -1, -2
    else:
      below_marker, between_marker = -2, -1
    matches = self._set_values_using_indicator(
        matches, below_unmatched_threshold, below_marker)
    matches = self._set_values_using_indicator(
        matches, between_thresholds, between_marker)

  if not self._force_match_for_each_row:
    return matches

  matrix_shape = shape_utils.combined_static_and_dynamic_shape(
      similarity_matrix)
  # Best column of every row, one-hot encoded; rows that are not valid are
  # zeroed out.
  force_match_column_ids = tf.argmax(similarity_matrix, 1,
                                     output_type=tf.int32)
  force_match_column_indicators = (
      tf.one_hot(force_match_column_ids, depth=matrix_shape[1]) *
      tf.cast(tf.expand_dims(valid_rows, axis=-1), dtype=tf.float32))
  force_match_row_ids = tf.argmax(force_match_column_indicators, 0,
                                  output_type=tf.int32)
  force_match_column_mask = tf.cast(
      tf.reduce_max(force_match_column_indicators, 0), tf.bool)

  # Columns claimed by a row take that row; all others keep their match.
  return tf.where(force_match_column_mask, force_match_row_ids, matches)
def predict(self, cnn_fmaps_mulscale, lstm_holistic_features, scope=None):
  """Runs a two-layer LSTM decoder followed by `self.compute_att_2d`.

  The decoder is fed one-hot embeddings (identity-matrix lookup) of
  `self._groundtruth_dict['decoder_inputs']` and is initialised with
  `lstm_holistic_features`.

  NOTE(review): the non-training branch reads `res_score`, which is never
  assigned in this method. Unless it is a module-level name, building the
  graph with `self._is_training` false raises NameError.
  NOTE(review): this block was recovered from a whitespace-collapsed source.
  The layout below assumes only the `tf.nn.dynamic_rnn` call sits inside the
  'decoder' scope - confirm against existing checkpoints.

  Args:
    cnn_fmaps_mulscale: CNN feature maps forwarded to `self.compute_att_2d`.
    lstm_holistic_features: initial state for the decoder LSTM.
    scope: optional variable scope name.

  Returns:
    In training mode, a dict with 'labels', 'logits', 'glyphs' and
    'embedding_ids'; otherwise a dict with 'labels' and 'scores'.
  """
  with tf.variable_scope(scope, 'Predict'):
    # NOTE(review): `predict` is never read; it shadows the method name
    # locally.
    predict = []
    ### a two layer LSTM
    cell0 = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=True)
    if self._is_training:
      cell0 = tf.nn.rnn_cell.DropoutWrapper(cell=cell0,
                                            output_keep_prob=0.5)
    cell1 = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=True)
    if self._is_training:
      cell1 = tf.nn.rnn_cell.DropoutWrapper(cell=cell1,
                                            output_keep_prob=0.5)
    lstm_cell = tf.nn.rnn_cell.MultiRNNCell([cell0, cell1],
                                            state_is_tuple=True)
    # Identity matrix: looking up row i yields the one-hot vector of class i.
    char_embedding_array = tf.constant(
        np.identity(self.num_classes, dtype=np.float32))
    # The `as scope` binding rebinds the `scope` parameter from here on.
    with tf.variable_scope('decoder') as scope:
      ld_output, ld_output_states = tf.nn.dynamic_rnn(
          cell=lstm_cell,
          inputs=tf.nn.embedding_lookup(
              char_embedding_array, self._groundtruth_dict['decoder_inputs']),
          #sequence_length=tf.fill([batch_size], feature_w),
          initial_state=lstm_holistic_features,
          dtype=tf.float32,
          time_major=False,
          scope=scope)
    # NOTE(review): batch_size, batch_len and depth are not read afterwards.
    batch_size, batch_len, depth = shape_utils.combined_static_and_dynamic_shape(
        ld_output)
    rnn_output, glyphs, embeddings_ids = self.compute_att_2d(
        ld_output, cnn_fmaps_mulscale, 512)
    # Most likely class per time step.
    sample_id = tf.argmax(rnn_output, 2)
    if self._is_training:
      #assert isinstance(outputs, seq2seq.BasicDecoderOutput)
      outputs_dict = {
          'labels': sample_id,
          'logits': rnn_output,
          'glyphs': glyphs,
          'embedding_ids': embeddings_ids
      }
    else:
      outputs_dict = {
          'labels': sample_id,
          'scores': res_score,
          #'lengths': prediction_lengths
      }
  return outputs_dict
def tile_context_tensors(tensor_dict):
  """Tiles context fields to have num_frames along 0-th dimension.

  Every entry whose key is not in `fields.SEQUENCE_FIELDS` gets a new leading
  axis and is repeated `num_frames` times along it, where `num_frames` is the
  size of dimension 0 of the image entry. The dict is modified in place.

  Args:
    tensor_dict: dict of tensors containing the image field.

  Returns:
    The same `tensor_dict` object with the context entries tiled.
  """
  num_frames = tf.shape(tensor_dict[fields.InputDataFields.image])[0]
  for key in tensor_dict:
    if key in fields.SEQUENCE_FIELDS:
      continue
    context_tensor = tensor_dict[key]
    rank = len(
        shape_utils.combined_static_and_dynamic_shape(context_tensor))
    with_frame_axis = tf.expand_dims(context_tensor, 0)
    # Repeat along the new leading axis only; leave all other axes as is.
    multiples = tf.stack([num_frames] + [1] * rank, axis=0)
    tensor_dict[key] = tf.tile(with_frame_axis, multiples)
  return tensor_dict
def _get_feature_map_spatial_dims(self, feature_maps):
  """Return list of spatial dimensions for each feature map in a list.

  Args:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i].

  Returns:
    a list of pairs (height, width) for each feature map in feature_maps
  """
  spatial_dims = []
  for feature_map in feature_maps:
    map_shape = shape_utils.combined_static_and_dynamic_shape(feature_map)
    spatial_dims.append((map_shape[1], map_shape[2]))
  return spatial_dims
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
  """Matrix multiplication based implementation of tf.gather on zeroth axis.

  The indices are one-hot encoded and multiplied with `params` flattened to
  2-D, which selects the requested rows.

  TODO(rathodv, jonathanhuang): enable sparse matmul option.

  Args:
    params: A float32 Tensor. The tensor from which to gather values.
      Must be at least rank 1.
    indices: A Tensor. Must be one of the following types: int32, int64.
      Must be in range [0, params.shape[0])
    scope: A name for the operation (optional).

  Returns:
    A Tensor. Has the same type as params. Values from params gathered
    from indices given by indices, with shape indices.shape + params.shape[1:].
  """
  with tf.name_scope(scope, 'MatMulGather'):
    params_shape = shape_utils.combined_static_and_dynamic_shape(params)
    indices_shape = shape_utils.combined_static_and_dynamic_shape(indices)
    num_rows = params_shape[0]
    trailing_dims = params_shape[1:]

    flat_params = tf.reshape(params, [num_rows, -1])
    row_selector = tf.one_hot(indices, num_rows)
    gathered_flat = tf.matmul(row_selector, flat_params)

    output_shape = tf.stack(indices_shape + trailing_dims)
    return tf.reshape(gathered_flat, output_shape)
def _aggregate_recognition_results(self, text_list, scores_list, scope=None):
  """Aggregate recognition results by picking up ones with highest scores.

  Args:
    text_list: a list of tensors with shape [batch_size]
    scores_list: a list of tensors with shape [batch_size]
    scope: optional variable scope name.

  Returns:
    A dict with 'text' and 'scores', each holding, per batch entry, the value
    taken from the list element with the highest score.
  """
  with tf.variable_scope(scope, 'AggregateRecognitionResults',
                         (text_list + scores_list)):
    # Candidates are laid out along axis 1: [batch_size, num_candidates].
    text_matrix = tf.stack(text_list, axis=1)
    score_matrix = tf.stack(scores_list, axis=1)
    best_candidate = tf.argmax(score_matrix, axis=1)

    batch_size = shape_utils.combined_static_and_dynamic_shape(
        text_matrix)[0]
    row_ids = tf.range(batch_size, dtype=tf.int64)
    best_indices = tf.stack([row_ids, best_candidate], axis=1)

    best_text = tf.gather_nd(text_matrix, best_indices)
    best_scores = tf.gather_nd(score_matrix, best_indices)
    return {'text': best_text, 'scores': best_scores}
def predict(self, cnn_fmap, lstm_holistic_features, scope=None):
  '''Decodes a label sequence from the CNN features.

  A decoder is built from `self._build_decoder_cell` / `self._build_decoder`
  and run with `seq2seq.dynamic_decode` for at most `self._max_num_steps`
  steps. `self._rnn_regularizer` is applied to the decoder cell weights whose
  op name ends with 'kernel'.

  Args:
    cnn_fmap: sequence of CNN feature maps; the last one provides the batch
      size and the whole sequence is passed to the decoder builder.
    lstm_holistic_features: passed to the decoder builder.
    scope: optional variable scope name.

  Returns:
    In training mode a dict with 'labels' and 'logits'; otherwise a dict with
    'labels', 'scores' and 'lengths' taken from beam 0 of the beam search
    output.
  '''
  with tf.variable_scope(scope, 'Predict'):
    last_fmap = cnn_fmap[-1]
    batch_size = shape_utils.combined_static_and_dynamic_shape(last_fmap)[0]

    decoder_cell = self._build_decoder_cell()
    decoder = self._build_decoder(decoder_cell, batch_size,
                                  lstm_holistic_features, cnn_fmap)
    outputs, _, output_lengths = seq2seq.dynamic_decode(
        decoder=decoder,
        output_time_major=False,
        impute_finished=False,
        maximum_iterations=self._max_num_steps)

    # Only the kernel weights of the decoder cell are regularized.
    kernel_weights = [
        weight for weight in decoder_cell.trainable_weights
        if weight.op.name.endswith('kernel')
    ]
    tf.contrib.layers.apply_regularization(self._rnn_regularizer,
                                           kernel_weights)

    if self._is_training:
      assert isinstance(outputs, seq2seq.BasicDecoderOutput)
      return {
          'labels': outputs.sample_id,
          'logits': outputs.rnn_output,
      }

    assert isinstance(outputs, seq2seq.FinalBeamSearchDecoderOutput)
    prediction_labels = outputs.predicted_ids[:, :, 0]
    prediction_lengths = output_lengths[:, 0]
    # The score of a sequence is the beam-0 score at its last time step.
    beam0_scores = outputs.beam_search_decoder_output.scores[:, :, 0]
    last_step_indices = tf.stack(
        [tf.range(batch_size), prediction_lengths - 1], axis=1)
    prediction_scores = tf.gather_nd(beam0_scores, last_step_indices)
    return {
        'labels': prediction_labels,
        'scores': prediction_scores,
        'lengths': prediction_lengths
    }
def resize_image(image,
                 masks=None,
                 new_height=600,
                 new_width=1024,
                 method=tf.image.ResizeMethod.BILINEAR,
                 align_corners=False):
  """Resizes an image to a fixed size and reports the resulting shape.

  Args:
    image: image tensor; dimension 2 is reported as the channel count.
    masks: not read by this function.
    new_height: target height.
    new_width: target width.
    method: resize method forwarded to `tf.image.resize_images`.
    align_corners: forwarded to `tf.image.resize_images`.

  Returns:
    A two-element list: the resized image and a tensor
    [new_height, new_width, channels].
  """
  with tf.name_scope(
      'ResizeImage',
      values=[image, new_height, new_width, method, align_corners]):
    target_size = tf.stack([new_height, new_width])
    resized_image = tf.image.resize_images(image,
                                           target_size,
                                           method=method,
                                           align_corners=align_corners)
    input_shape = shape_utils.combined_static_and_dynamic_shape(image)
    resized_shape = tf.stack([new_height, new_width, input_shape[2]])
    return [resized_image, resized_shape]
def __call__(self, logits, labels, lengths, scope=None):
  """Computes the length-masked sequence cross-entropy loss.

  Per-step cross-entropy is masked so that only the first `lengths[i]` steps
  of sequence i contribute, then summed per sequence. Depending on the
  configuration the per-sequence sums are divided by the sequence length
  (`_sequence_normalize`), the total is divided by the batch size
  (`_sample_normalize`) and multiplied by `_weight`.

  Args:
    logits: float32 tensor with shape [batch_size, max_time, num_classes]
    labels: int32 tensor with shape [batch_size, max_time]
    lengths: int32 tensor with shape [batch_size]
    scope: optional name scope.

  Returns:
    Scalar float32 loss tensor.
  """
  with tf.name_scope(scope, 'SequenceCrossEntropyLoss',
                     [logits, labels, lengths]):
    raw_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    batch_size, max_time = shape_utils.combined_static_and_dynamic_shape(
        labels)
    # True for the time steps that lie inside each sequence's length.
    mask = tf.less(
        tf.tile([tf.range(max_time)], [batch_size, 1]),
        tf.expand_dims(lengths, 1),
        name='mask')
    masked_losses = tf.multiply(
        raw_losses, tf.cast(mask, tf.float32),
        name='masked_losses')  # => [batch_size, max_time]
    row_losses = tf.reduce_sum(masked_losses, 1, name='row_losses')
    if self._sequence_normalize:
      # Keep the normalized per-sequence losses. They used to be assigned to
      # `loss` and then overwritten by the reduce_sum below, which made this
      # option a no-op. The length is clamped to avoid division by zero.
      row_losses = tf.truediv(
          row_losses,
          tf.cast(tf.maximum(lengths, 1), tf.float32),
          name='seq_normed_losses')
    loss = tf.reduce_sum(row_losses)
    if self._sample_normalize:
      loss = tf.truediv(
          loss,
          tf.cast(tf.maximum(batch_size, 1), tf.float32))
    # A falsy weight (None or 0) leaves the loss unscaled.
    if self._weight:
      loss = loss * self._weight
  return loss
def nearest_neighbor_upsampling(input_tensor, scale=None, height_scale=None,
                                width_scale=None):
  """Nearest neighbor upsampling implementation.

  Maps an input tensor of shape [batch_size, height, width, channels] to
  [batch_size, height * h_scale, width * w_scale, channels], where each
  factor is the axis-specific scale when given and `scale` otherwise. Only
  reshape and broadcasting are used, to keep the implementation TPU
  compatible.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data in both
      height and width dimensions.
    height_scale: An integer multiple to scale the height of input image.
      This option when provided overrides `scale` option.
    width_scale: An integer multiple to scale the width of input image. This
      option when provided overrides `scale` option.

  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].

  Raises:
    ValueError: If `scale` is not given (or is zero) and `height_scale` or
      `width_scale` is None.
  """
  if not scale and (height_scale is None or width_scale is None):
    raise ValueError('Provide either `scale` or `height_scale` and'
                     ' `width_scale`.')

  with tf.name_scope('nearest_neighbor_upsampling'):
    # Axis-specific factors take precedence over the shared `scale`.
    rows_factor = height_scale if height_scale is not None else scale
    cols_factor = width_scale if width_scale is not None else scale

    in_shape = shape_utils.combined_static_and_dynamic_shape(input_tensor)
    batch_size, height, width, channels = in_shape

    # Add singleton axes after height and width, then broadcast against a
    # tensor of ones to replicate each pixel along both axes.
    with_unit_axes = tf.reshape(
        input_tensor, [batch_size, height, 1, width, 1, channels])
    replicator = tf.ones([1, 1, rows_factor, 1, cols_factor, 1],
                         dtype=input_tensor.dtype)
    replicated = with_unit_axes * replicator

    return tf.reshape(
        replicated,
        [batch_size, height * rows_factor, width * cols_factor, channels])
def _predict(self, image_features, **kwargs):
  """Returns all-zero box and class predictions sized from the first feature.

  Args:
    image_features: sequence of feature tensors; only the first one is used.
      Its dimension 0 is the batch size and the product of dimensions 1 and 2
      is the number of anchors.
    **kwargs: accepted and ignored.

  Returns:
    A dict with box encodings of shape [batch_size, num_anchors, 1, 4] and
    class predictions of shape [batch_size, num_anchors, num_class_slots],
    both filled with zeros. `num_class_slots` is `self.num_classes`, plus one
    when `self._add_background_class` is set.
  """
  first_feature = image_features[0]
  feature_shape = shape_utils.combined_static_and_dynamic_shape(
      first_feature)
  batch_size = feature_shape[0]
  num_anchors = feature_shape[1] * feature_shape[2]
  code_size = 4

  # A scalar zero computed from the feature, added to both outputs so that
  # they are connected to the input tensor in the graph.
  zero = tf.reduce_sum(0 * first_feature)

  if self._add_background_class:
    num_class_slots = self.num_classes + 1
  else:
    num_class_slots = self.num_classes

  box_shape = (batch_size, num_anchors, 1, code_size)
  class_shape = (batch_size, num_anchors, num_class_slots)
  box_encodings = zero + tf.zeros(box_shape, dtype=tf.float32)
  class_predictions_with_background = zero + tf.zeros(
      class_shape, dtype=tf.float32)

  return {
      box_predictor.BOX_ENCODINGS: box_encodings,
      box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND:
          class_predictions_with_background
  }
def tile_activation_maps_rows_cols(maps, num_rows, num_cols):
  """Arranges the channels of `maps` as a num_rows x num_cols image grid.

  The channel axis is first zero-padded or truncated to exactly
  num_rows * num_cols channels.

  Args:
    maps: [batch_size, map_height, map_width, map_depth]
    num_rows: number of grid rows.
    num_cols: number of grid columns.

  Returns:
    tiled_map: [batch_size, map_height * num_rows, map_width * num_cols, 1]
  """
  batch_size, map_height, map_width, map_depth = \
      shape_utils.combined_static_and_dynamic_shape(maps)
  num_maps = num_rows * num_cols

  def _pad_channels():
    # Too few channels: append zero channels up to num_maps.
    missing = tf.maximum(num_maps - map_depth, 0)
    return tf.pad(maps, [[0, 0], [0, 0], [0, 0], [0, missing]])

  def _truncate_channels():
    # Enough channels: keep the first num_maps.
    return maps[:, :, :, :num_maps]

  padded_map = tf.cond(tf.greater(num_maps, map_depth),
                       true_fn=_pad_channels,
                       false_fn=_truncate_channels)

  # => [batch_size, map_height, map_width, num_rows, num_cols]
  grid = tf.reshape(
      padded_map, [batch_size, map_height, map_width, num_rows, num_cols])

  # Lay the columns side by side along the width axis.
  # => [batch_size, map_height, map_width * num_cols, num_rows]
  per_column = tf.unstack(grid, axis=4)
  width_concated_maps = tf.concat(per_column, axis=2)

  # Stack the rows on top of each other along the height axis.
  # => [batch_size, map_height * num_rows, map_width * num_cols]
  per_row = tf.unstack(width_concated_maps, axis=3)
  tiled_map = tf.concat(per_row, axis=1)

  return tf.expand_dims(tiled_map, axis=3)
def nearest_neighbor_upsampling(input_tensor, scale):
  """Nearest neighbor upsampling implementation.

  Maps an input tensor of shape [batch_size, height, width, channels] to
  [batch_size, height * scale, width * scale, channels]. Only reshape and
  tile are used, to keep the implementation compatible with certain hardware.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data.

  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].
  """
  in_shape = shape_utils.combined_static_and_dynamic_shape(input_tensor)
  batch, height, width, channels = (in_shape[0], in_shape[1], in_shape[2],
                                    in_shape[3])

  # Singleton axes are added after height and width, and those are the axes
  # that get tiled.
  expanded_shape = [batch, height, 1, width, 1, channels]
  upsampled_shape = [batch, height * scale, width * scale, channels]
  tile_multiples = [1, 1, scale, 1, scale, 1]

  expanded = tf.reshape(input_tensor, expanded_shape)
  tiled = tf.tile(expanded, tile_multiples)
  return tf.reshape(tiled, upsampled_shape)
def _create_regression_targets(self, anchors, groundtruth_boxes, match):
  """Returns a regression target for each anchor.

  Matched anchors get the box-coder encoding of their groundtruth box (and
  keypoints, when present). Unmatched and ignored anchors get the default
  regression target.

  Args:
    anchors: a BoxList representing N anchors
    groundtruth_boxes: a BoxList representing M groundtruth_boxes
    match: a matcher.Match object

  Returns:
    reg_targets: a float32 tensor with shape [N, box_code_dimension]
  """
  # Groundtruth box per anchor; zeros stand in where there is no match.
  gt_boxes_per_anchor = match.gather_based_on_match(
      groundtruth_boxes.get(),
      unmatched_value=tf.zeros(4),
      ignored_value=tf.zeros(4))
  gt_boxlist_per_anchor = box_list.BoxList(gt_boxes_per_anchor)

  keypoints_field = fields.BoxListFields.keypoints
  if groundtruth_boxes.has_field(keypoints_field):
    groundtruth_keypoints = groundtruth_boxes.get_field(keypoints_field)
    per_box_keypoint_shape = groundtruth_keypoints.get_shape()[1:]
    keypoints_per_anchor = match.gather_based_on_match(
        groundtruth_keypoints,
        unmatched_value=tf.zeros(per_box_keypoint_shape),
        ignored_value=tf.zeros(per_box_keypoint_shape))
    gt_boxlist_per_anchor.add_field(keypoints_field, keypoints_per_anchor)

  encoded_targets = self._box_coder.encode(gt_boxlist_per_anchor, anchors)

  # Unmatched and ignored anchors receive the default regression target.
  num_anchors = shape_utils.combined_static_and_dynamic_shape(
      match.match_results)[0]
  default_targets = tf.tile(self._default_regression_target(),
                            [num_anchors, 1])
  is_matched = match.matched_column_indicator()
  return tf.where(is_matched, encoded_targets, default_targets)
def _compute_clip_window(self, preprocessed_images, true_image_shapes):
  """Computes clip window to use during post_processing.

  The window of each image covers its true (unpadded) extent, expressed
  relative to the padded size of `preprocessed_images`. When
  `true_image_shapes` is None a single default window [0, 0, 1, 1] is
  returned instead.

  Args:
    preprocessed_images: the [batch, height, width, channels] image
      tensor.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is
      of the form [height, width, channels] indicating the shapes
      of true images in the resized images, as resized images can be padded
      with zeros. Or None if the clip window should cover the full image.

  Returns:
    a 2-D float32 tensor of the form [batch_size, 4] containing the clip
    window for each image in the batch in normalized coordinates (relative to
    the resized dimensions) where each clip window is of the form [ymin, xmin,
    ymax, xmax], or a 1-D default clip window of [0, 0, 1, 1].
  """
  if true_image_shapes is None:
    return tf.constant([0, 0, 1, 1], dtype=tf.float32)

  padded_shape = shape_utils.combined_static_and_dynamic_shape(
      preprocessed_images)
  true_shapes = tf.to_float(true_image_shapes)
  true_heights, true_widths, _ = tf.unstack(true_shapes, axis=1)
  padded_height = tf.to_float(padded_shape[1])
  padded_width = tf.to_float(padded_shape[2])

  ymin = tf.zeros_like(true_heights)
  xmin = tf.zeros_like(true_widths)
  ymax = true_heights / padded_height
  xmax = true_widths / padded_width
  return tf.stack([ymin, xmin, ymax, xmax], axis=1)
def _batch_decode(self, box_encodings):
  """Decodes a batch of box encodings with respect to the anchors.

  Args:
    box_encodings: A float32 tensor of shape
      [batch_size, num_anchors, box_code_size] containing box encodings.

  Returns:
    decoded_boxes: A float32 tensor of shape
      [batch_size, num_anchors, 4] containing the decoded boxes.
    decoded_keypoints: A float32 tensor of shape
      [batch_size, num_anchors, num_keypoints, 2] containing the decoded
      keypoints if present in the input `box_encodings`, None otherwise.
  """
  encodings_shape = shape_utils.combined_static_and_dynamic_shape(
      box_encodings)
  batch_size = encodings_shape[0]
  num_anchors = encodings_shape[1]

  # Replicate the anchors once per image, then flatten the batch and anchor
  # axes so that one box-coder call decodes every box.
  anchors_with_batch_axis = tf.expand_dims(self.anchors.get(), 0)
  batched_anchors = tf.tile(anchors_with_batch_axis, [batch_size, 1, 1])
  flat_anchors = box_list.BoxList(tf.reshape(batched_anchors, [-1, 4]))
  flat_encodings = tf.reshape(box_encodings,
                              [-1, self._box_coder.code_size])
  decoded = self._box_coder.decode(flat_encodings, flat_anchors)

  keypoints_field = fields.BoxListFields.keypoints
  decoded_keypoints = None
  if decoded.has_field(keypoints_field):
    flat_keypoints = decoded.get_field(keypoints_field)
    num_keypoints = flat_keypoints.get_shape()[1]
    decoded_keypoints = tf.reshape(
        flat_keypoints,
        tf.stack([batch_size, num_anchors, num_keypoints, 2]))

  # Restore the separate batch and anchor axes.
  decoded_boxes = tf.reshape(decoded.get(),
                             tf.stack([batch_size, num_anchors, 4]))
  return decoded_boxes, decoded_keypoints
def aspp_losses(cls_logits, labels, num_classes):
    """Computes the per-pixel classification loss and accuracy summaries.

    Labels are downsampled with nearest-neighbour interpolation to the
    spatial size of the logits; the logits are not upsampled. Pixels whose
    label lies outside [0, num_classes) are excluded from both the loss and
    the accuracy.

    Args:
      cls_logits: float tensor of shape [batch, H', W', C] holding class
        logits. Dimensions 1 and 2 are used as the resize target and the
        last dimension as the number of logits per pixel. (The original
        author's note says H' and W' are currently H/8 and W/8.)
      labels: integer label tensor of shape [batch, H, W, 1]. It must be
        4-D because it is passed to `tf.image.resize_nearest_neighbor`.
      num_classes: exclusive upper bound on valid label ids.

    Returns:
      A scalar tensor named 'cls_loss': the mean sparse softmax
      cross-entropy over the valid pixels, or 0 when there are none.

    Side effects:
      Registers the loss and the 'accuracy' tensor with
      `add_moving_summary`.
    """
    logits_shape = combined_static_and_dynamic_shape(cls_logits)
    labels = tf.image.resize_nearest_neighbor(
        labels, logits_shape[1:3], align_corners=True)

    flat_logits = tf.reshape(cls_logits, [-1, logits_shape[-1]])
    flat_labels = tf.reshape(labels, [-1])

    # Keep only pixels whose label is a real class id.
    in_range = tf.logical_and(
        tf.greater_equal(flat_labels, 0),
        tf.less(flat_labels, num_classes))
    valid_idx = tf.where(in_range)[:, 0]
    valid_logits = tf.gather(flat_logits, valid_idx)
    valid_labels = tf.gather(flat_labels, valid_idx)

    per_pixel_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=valid_labels, logits=valid_logits)
    predictions = tf.argmax(
        valid_logits, axis=-1, output_type=valid_labels.dtype)
    correct = tf.cast(tf.equal(valid_labels, predictions), tf.float32)

    # A plain reduce_mean over an empty tensor yields NaN, which would
    # poison training whenever a batch has no valid pixel. Dividing the sum
    # by max(count, 1) gives the same mean when count > 0 and 0 otherwise.
    num_valid = tf.maximum(tf.size(valid_idx), 1)
    cls_loss = tf.identity(
        tf.reduce_sum(per_pixel_loss) /
        tf.cast(num_valid, per_pixel_loss.dtype),
        name='cls_loss')
    acc = tf.identity(
        tf.reduce_sum(correct) / tf.cast(num_valid, tf.float32),
        name='accuracy')

    add_moving_summary(cls_loss, acc)
    return cls_loss
def compute_att_2d(self, ld_output, cnn_fmaps_mulscale, d):
    """Computes 2-D attention over the last CNN feature map and a glyph image.

    For every decoder step the function:
      1. projects the step output and the last-scale feature map with
         `conv2d`, adds them, applies tanh and dropout, and scores each
         spatial position with a one-output `conv2d`;
      2. softmax-normalizes the scores over all positions into an
         attention mask and sums the mask-weighted features into a
         `glimpse` vector;
      3. multiplies the (resized) mask with the feature maps at indices
         0, 1 and 2, and maps each result to a 16x16, 8x8 and 4x4 grid
         through `fully_connected`;
      4. feeds the glimpse, concatenated with a randomly selected entry of
         `self._embeddings`, through five `conv2d_transpose` layers that
         also take the grids from step 3 as extra inputs, giving a glyph
         that is reshaped to 32 * 32 values;
      5. predicts class logits with a dense layer over the concatenation
         of the glimpse and the step output.

    Two image summaries ('Mask1' and 'glyph1') are emitted as side effects.

    Args:
      ld_output: rank-3 tensor unpacked as (batch_size, batch_len, depth);
        presumably the LSTM decoder outputs (see inline note) -- TODO
        confirm.
      cnn_fmaps_mulscale: indexable collection of 4-D feature maps. Entries
        0, 1, 2 and -1 are used; height, width and channels are read from
        their static shapes.
      d: second positional argument of the two projection `conv2d` calls
        (presumably the number of output channels -- TODO confirm).
        NOTE(review): the reshapes below appear to require
        d == feature_c == depth; verify against callers.

    Returns:
      rnn_output: output of a dense layer with `self.num_classes` units
        applied to a [batch_size, batch_len, 2 * depth] tensor.
      glyph_output: tensor of shape [batch_size, batch_len, 32 * 32]; the
        last deconvolution uses a tanh activation.
      embeddings_ids: int64 tensor of shape [batch_size * batch_len] with
        the randomly drawn indices into `self._embeddings`.
    """
    with tf.variable_scope('decoder/compute2d_attention_layer'):
        # Spatial sizes below are the original author's notes.
        cnn_fmaps_s1 = cnn_fmaps_mulscale[0]  # 24 * 80
        cnn_fmaps_s2 = cnn_fmaps_mulscale[1]  # 12 * 40
        cnn_fmaps_s3 = cnn_fmaps_mulscale[2]  # 6 * 40
        cnn_fmaps_lastscale = cnn_fmaps_mulscale[-1]  # 6 * 40
        batch_size, batch_len, depth = shape_utils.combined_static_and_dynamic_shape(
            ld_output)
        # Static shape is required: the values are used in Python arithmetic
        # and as reshape sizes below.
        _, feature_h, feature_w, feature_c = cnn_fmaps_lastscale.get_shape(
        ).as_list()

        ## for lstm outputs
        # Flatten steps into the batch axis, then view each step output as a
        # 1x1 "image" so it can be projected by conv2d and tiled spatially.
        ld_output = tf.reshape(ld_output, (batch_size * batch_len, depth))
        ld_output_reshape = tf.reshape(
            ld_output, [batch_size * batch_len, 1, 1, depth])
        ld_output_conv = conv2d(ld_output_reshape,
                                d,
                                1,
                                activation_fn=None,
                                normalizer_fn=None,
                                scope='hs_conv')
        ld_output_conv_tile = tf.tile(ld_output_conv,
                                      [1, feature_h, feature_w, 1])

        # Project the feature map and replicate it once per decoder step.
        cnn_fmap_conv = conv2d(cnn_fmaps_lastscale,
                               d,
                               3,
                               activation_fn=None,
                               normalizer_fn=None,
                               scope='fmap_conv')
        cnn_fmap_tile = tf.expand_dims(cnn_fmap_conv, 1)
        cnn_fmap_tile = tf.tile(cnn_fmap_tile, [1, batch_len, 1, 1, 1])
        # NOTE(review): the channel size used here is feature_c although the
        # tensor comes from a conv2d called with `d`; this only works if the
        # two agree -- confirm.
        cnn_fmap_tile = tf.reshape(
            cnn_fmap_tile,
            [batch_size * batch_len, feature_h, feature_w, feature_c])

        g = tf.nn.tanh(tf.add(cnn_fmap_tile, ld_output_conv_tile))
        # NOTE(review): dropout is applied unconditionally; there is no
        # training/inference switch in this function -- confirm intended.
        g = tf.nn.dropout(g, 0.5)
        g_conv = conv2d(g,
                        1,
                        1,
                        scope='g_conv',
                        activation_fn=None,
                        normalizer_fn=None)
        # Softmax over all spatial positions of each (image, step) pair.
        g_conv_reshape = tf.reshape(
            g_conv, [batch_size * batch_len, feature_w * feature_h])
        g_conv_reshape_softmax = tf.nn.softmax(g_conv_reshape)
        mask = tf.reshape(
            g_conv_reshape_softmax,
            [batch_size * batch_len, feature_h, feature_w, 1])
        tf.summary.image('Mask1', (mask[:20]), max_outputs=20)

        # Attention-weighted sum over height and width -> one vector per step.
        g_tmp = tf.tile(
            tf.reshape(g_conv_reshape_softmax,
                       [batch_size * batch_len, feature_h, feature_w, 1]),
            [1, 1, 1, feature_c])
        glimpse = tf.reduce_sum(tf.multiply(cnn_fmap_tile, g_tmp), [1, 2])

        _, cnn_fmap_s1_h, cnn_fmap_s1_w, cnn_fmap_s1_c = cnn_fmaps_s1.get_shape(
        ).as_list()
        _, cnn_fmap_s2_h, cnn_fmap_s2_w, cnn_fmap_s2_c = cnn_fmaps_s2.get_shape(
        ).as_list()
        _, cnn_fmap_s3_h, cnn_fmap_s3_w, cnn_fmap_s3_c = cnn_fmaps_s3.get_shape(
        ).as_list()

        # The mask is used as-is for scale 3 (assumes scale 3 has the same
        # spatial size as the last scale -- TODO confirm) and bilinearly
        # resized for scales 2 and 1.
        mask_s3 = tf.tile(mask, [1, 1, 1, cnn_fmap_s3_c])  # bs_bl, 6, 40, 1
        mask_s2 = tf.tile(
            tf.image.resize_bilinear(mask, [cnn_fmap_s2_h, cnn_fmap_s2_w]),
            [1, 1, 1, cnn_fmap_s2_c])
        mask_s1 = tf.tile(
            tf.image.resize_bilinear(mask, [cnn_fmap_s1_h, cnn_fmap_s1_w]),
            [1, 1, 1, cnn_fmap_s1_c])

        # Replicate each feature map once per decoder step and weight it by
        # the matching mask.
        # cnn_fmaps_s1 ( bs * 24 * 80 * c )
        cnn_fmap_s1_tile = tf.expand_dims(cnn_fmaps_s1, 1)
        cnn_fmap_s1_tile = tf.tile(cnn_fmap_s1_tile, [1, batch_len, 1, 1, 1])
        cnn_fmap_s1_tile = tf.reshape(cnn_fmap_s1_tile, [
            batch_size * batch_len, cnn_fmap_s1_h, cnn_fmap_s1_w,
            cnn_fmap_s1_c
        ])
        glimpse_s1 = tf.multiply(cnn_fmap_s1_tile, mask_s1)

        cnn_fmap_s2_tile = tf.expand_dims(cnn_fmaps_s2, 1)
        cnn_fmap_s2_tile = tf.tile(cnn_fmap_s2_tile, [1, batch_len, 1, 1, 1])
        cnn_fmap_s2_tile = tf.reshape(cnn_fmap_s2_tile, [
            batch_size * batch_len, cnn_fmap_s2_h, cnn_fmap_s2_w,
            cnn_fmap_s2_c
        ])
        glimpse_s2 = tf.multiply(cnn_fmap_s2_tile, mask_s2)

        cnn_fmap_s3_tile = tf.expand_dims(cnn_fmaps_s3, 1)
        cnn_fmap_s3_tile = tf.tile(cnn_fmap_s3_tile, [1, batch_len, 1, 1, 1])
        cnn_fmap_s3_tile = tf.reshape(cnn_fmap_s3_tile, [
            batch_size * batch_len, cnn_fmap_s3_h, cnn_fmap_s3_w,
            cnn_fmap_s3_c
        ])
        glimpse_s3 = tf.multiply(cnn_fmap_s3_tile, mask_s3)

        # NOTE(review): the second reshape of each pair changes the declared
        # shape from [N, H*W, C] to [N, C, H*W] without transposing, so
        # elements are reinterpreted in memory order rather than the axes
        # being swapped -- confirm this is intended.
        glimpse_s1_reshape = tf.reshape(glimpse_s1, [
            batch_size * batch_len, cnn_fmap_s1_h * cnn_fmap_s1_w,
            cnn_fmap_s1_c
        ])
        glimpse_s1_reshape = tf.reshape(glimpse_s1_reshape, [
            batch_size * batch_len, cnn_fmap_s1_c,
            cnn_fmap_s1_h * cnn_fmap_s1_w
        ])
        glimpse_s2_reshape = tf.reshape(glimpse_s2, [
            batch_size * batch_len, cnn_fmap_s2_h * cnn_fmap_s2_w,
            cnn_fmap_s2_c
        ])
        glimpse_s2_reshape = tf.reshape(glimpse_s2_reshape, [
            batch_size * batch_len, cnn_fmap_s2_c,
            cnn_fmap_s2_h * cnn_fmap_s2_w
        ])
        glimpse_s3_reshape = tf.reshape(glimpse_s3, [
            batch_size * batch_len, cnn_fmap_s3_h * cnn_fmap_s3_w,
            cnn_fmap_s3_c
        ])
        glimpse_s3_reshape = tf.reshape(glimpse_s3_reshape, [
            batch_size * batch_len, cnn_fmap_s3_c,
            cnn_fmap_s3_h * cnn_fmap_s3_w
        ])

        # Map the flattened spatial axis to fixed sizes (16*16, 8*8, 4*4).
        # No scope is passed, so variable names presumably depend on the
        # order of these calls -- keep the order stable.
        glimpse_s1_resize_ = fully_connected(glimpse_s1_reshape, 16 * 16)
        glimpse_s2_resize_ = fully_connected(glimpse_s2_reshape, 8 * 8)
        glimpse_s3_resize_ = fully_connected(glimpse_s3_reshape, 4 * 4)

        # NOTE(review): as above, these are reshapes (not transposes) from
        # [N, C, size*size] to [N, size, size, C] -- confirm.
        glimpse_s1_resize = tf.reshape(
            glimpse_s1_resize_,
            [batch_size * batch_len, 16 * 16, cnn_fmap_s1_c])
        glimpse_s1_resize = tf.reshape(
            glimpse_s1_resize, [batch_size * batch_len, 16, 16, cnn_fmap_s1_c])
        glimpse_s2_resize = tf.reshape(
            glimpse_s2_resize_, [batch_size * batch_len, 8 * 8, cnn_fmap_s2_c])
        glimpse_s2_resize = tf.reshape(
            glimpse_s2_resize, [batch_size * batch_len, 8, 8, cnn_fmap_s2_c])
        glimpse_s3_resize = tf.reshape(
            glimpse_s3_resize_, [batch_size * batch_len, 4 * 4, cnn_fmap_s3_c])
        glimpse_s3_resize = tf.reshape(
            glimpse_s3_resize, [batch_size * batch_len, 4, 4, cnn_fmap_s3_c])

        # One random embedding index per (image, step), drawn from [0, 325).
        # NOTE(review): 325 is hard-coded; presumably the number of rows in
        # self._embeddings -- TODO confirm.
        embeddings_ids = tf.random_uniform([batch_size * batch_len],
                                           minval=0,
                                           maxval=325,
                                           dtype=tf.int64)
        embeddings_fordeconv = tf.gather(self._embeddings, embeddings_ids)
        # NOTE(review): glimpse has feature_c values per row but is reshaped
        # with `depth`; this requires feature_c == depth -- confirm.
        glimpse_fordeconv = tf.reshape(
            glimpse, [batch_size * batch_len, 1, 1, depth])
        # Concatenation on the last axis assumes the gathered embeddings are
        # 4-D with leading dims [N, 1, 1] -- TODO confirm.
        concat_feat = tf.concat([glimpse_fordeconv, embeddings_fordeconv],
                                axis=-1)

        # Glyph decoder. Presumably each stride-2 layer doubles the spatial
        # size (1 -> 2 -> 4 -> 8 -> 16 -> 32), which is what the 4x4, 8x8
        # and 16x16 concatenations and the final 32 * 32 reshape require.
        d1 = conv2d_transpose(concat_feat,
                              128, [2, 2], [2, 2],
                              normalizer_fn=batch_norm,
                              scope='gly_deconv_1')
        d2 = conv2d_transpose(d1,
                              64, [3, 3], [2, 2],
                              normalizer_fn=batch_norm,
                              scope='gly_deconv_2')
        d3 = conv2d_transpose(tf.concat([d2, glimpse_s3_resize], axis=-1),
                              32, [3, 3], [2, 2],
                              normalizer_fn=batch_norm,
                              scope='gly_deconv_3')
        d4 = conv2d_transpose(tf.concat([d3, glimpse_s2_resize], axis=-1),
                              16, [3, 3], [2, 2],
                              normalizer_fn=batch_norm,
                              scope='gly_deconv_4')
        d5 = conv2d_transpose(tf.concat([d4, glimpse_s1_resize], axis=-1),
                              1, [3, 3], [2, 2],
                              activation_fn=tf.nn.tanh,
                              scope='gly_deconv_5')

        glyph = d5  # batch_size * batchlen , 32, 32, 1
        glyph = tf.reshape(glyph,
                           [batch_size * batch_len, 32 * 32
                            ])  # batch_size * batchlen , 32 * 32
        # Shift the tanh output from [-1, 1] to [0, 255] for the summary.
        glyph_for_visual = tf.reshape((glyph + 1.0) * 127.5,
                                      [batch_size * batch_len, 32, 32, 1])
        tf.summary.image('glyph1', (glyph_for_visual[:20]), max_outputs=20)
        glyph_output = tf.reshape(glyph,
                                  [batch_size, batch_len, 32 * 32
                                   ])  # batch_size , batchlen , 32 * 32

        # Classify each step from its glimpse concatenated with its
        # decoder output.
        glimpse = tf.reshape(glimpse, [batch_size, batch_len, depth])
        c_h_concat = tf.concat([
            glimpse,
            tf.reshape(ld_output, [batch_size, batch_len, depth])
        ],
                               axis=-1)
        rnn_output = tf.layers.dense(c_h_concat,
                                     self.num_classes,
                                     name='output_w')
        return rnn_output, glyph_output, embeddings_ids