def _cumsum_gradient(score_map, box):
  """Scores boxes by the average value inside each box minus the average
  value in the surrounding border.

  Note: `border_ratio` and `_SMALL_NUMBER` are resolved from the enclosing
  scope.
  """
  b, n, m, c = utils.get_tensor_shape(score_map)
  _, p, _ = utils.get_tensor_shape(box)

  expanded_box = _get_expanded_box(
      box, img_h=n, img_w=m, border_ratio=border_ratio)

  box_h, box_w = _get_box_shape(box)
  expanded_box_h, expanded_box_w = _get_box_shape(expanded_box)

  cumsum = imgproc.calc_cumsum_2d(score_map,
                                  tf.concat([box, expanded_box], axis=1))

  area = tf.expand_dims(tf.cast(box_h * box_w, tf.float32), axis=-1)
  area_border = tf.expand_dims(
      tf.cast(expanded_box_h * expanded_box_w - box_h * box_w, tf.float32),
      axis=-1)

  avg_val = tf.div(cumsum[:, :p, :], tf.maximum(_SMALL_NUMBER, area))
  avg_val_in_border = tf.div(cumsum[:, p:, :] - cumsum[:, :p, :],
                             tf.maximum(_SMALL_NUMBER, area_border))

  return avg_val - avg_val_in_border
def calc_cumsum_2d(image, box):
  """Computes the cumulative sum given pre-defined boxes.

  Corner lookups into the integral image:
    i_a (ymin, xmin), ..., i_b (ymin, xmax)
    ...,              ..., ...
    i_c (ymax, xmin), ..., i_d (ymax, xmax)

  Args:
    image: 4-D float `Tensor` of size [b, n, m, c], representing `b` images
      with height `n`, width `m`, and channels `c`.
    box: 3-D int64 `Tensor` of size [b, p, 4], representing `b` examples each
      with `p` proposals in the format of [ymin, xmin, ymax, xmax].

  Returns:
    cumsum: 3-D float `Tensor` of size [b, p, c], channel-wise cumulative sum.
  """
  b, n, m, c = utils.get_tensor_shape(image)
  _, p, _ = utils.get_tensor_shape(box)

  cumsum = calc_integral_image(image)
  ymin, xmin, ymax, xmax = tf.unstack(box, axis=-1)

  # Batch indices, tiled to [b, p] so each proposal indexes its own image.
  i = tf.range(tf.cast(b, tf.int64), dtype=tf.int64)
  i = tf.tile(tf.expand_dims(i, axis=-1), [1, p])

  i_a = tf.gather_nd(cumsum, tf.stack([i, ymin, xmin], axis=-1))
  i_b = tf.gather_nd(cumsum, tf.stack([i, ymin, xmax], axis=-1))
  i_c = tf.gather_nd(cumsum, tf.stack([i, ymax, xmin], axis=-1))
  i_d = tf.gather_nd(cumsum, tf.stack([i, ymax, xmax], axis=-1))

  return i_d + i_a - i_b - i_c
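# --- Sanity check (illustrative, not part of the library) ---
# A numpy mock of the four-corner lookup above, assuming `calc_integral_image`
# returns a zero-padded integral image of size [b, n + 1, m + 1, c]: the sum
# over rows [ymin, ymax) and columns [xmin, xmax) equals i_d + i_a - i_b - i_c.
import numpy as np

def _np_box_sum(image, ymin, xmin, ymax, xmax):
  ii = np.zeros((image.shape[0] + 1, image.shape[1] + 1))
  ii[1:, 1:] = image.cumsum(axis=0).cumsum(axis=1)
  return ii[ymax, xmax] + ii[ymin, xmin] - ii[ymin, xmax] - ii[ymax, xmin]

_img = np.arange(16, dtype=np.float64).reshape(4, 4)
assert _np_box_sum(_img, 1, 1, 3, 3) == _img[1:3, 1:3].sum()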
def extract_labels(self, examples):
  """Extracts the pseudo labels.

  Args:
    examples: A dictionary involving image-level annotations.

  Returns:
    labels: A [batch, num_classes] tensor denoting the presence of classes.
  """
  with tf.name_scope('extend_match_extractor'):
    items = self._name2id.items()
    keys = [k for k, v in items]
    values = [v for k, v in items]

    table = tf.contrib.lookup.HashTable(
        initializer=tf.contrib.lookup.KeyValueTensorInitializer(keys, values),
        default_value=self.num_classes)  # Class ID for out-of-vocabulary words.
    ids = table.lookup(examples[InputDataFields.concat_caption_string])
    labels = tf.one_hot(
        indices=ids, depth=1 + self.num_classes, dtype=tf.float32)

    batch, num_tokens = utils.get_tensor_shape(
        examples[InputDataFields.concat_caption_string])
    labels = tf.cond(
        num_tokens > 0,
        true_fn=lambda: tf.reduce_max(labels, axis=1)[:, :-1],
        false_fn=lambda: tf.zeros(shape=[batch, self.num_classes]))
  return labels
def _match_labels(class_texts, vocabulary_list):
  """Matches labels from texts.

  Args:
    class_texts: A [batch, num_tokens] string tensor.
    vocabulary_list: A list of class names of length `num_classes`.

  Returns:
    A [batch, num_classes] float tensor.
  """
  keys = [class_name for class_id, class_name in enumerate(vocabulary_list)]
  values = [class_id for class_id, class_name in enumerate(vocabulary_list)]

  table = tf.contrib.lookup.HashTable(
      initializer=tf.contrib.lookup.KeyValueTensorInitializer(keys, values),
      default_value=len(vocabulary_list))  # Class ID for out-of-vocabulary words.
  ids = table.lookup(class_texts)
  labels = tf.one_hot(
      indices=ids, depth=1 + len(vocabulary_list), dtype=tf.float32)

  batch, num_tokens = utils.get_tensor_shape(class_texts)
  labels = tf.cond(
      num_tokens > 0,
      true_fn=lambda: tf.reduce_max(labels, axis=1)[:, :-1],
      false_fn=lambda: tf.zeros(shape=[batch, len(vocabulary_list)]))
  return labels
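# --- Usage sketch (illustrative; assumes TF 1.x graph mode and that
# `utils.get_tensor_shape` falls back to dynamic dims for unknown shapes) ---
class_texts_ph = tf.placeholder(tf.string, shape=[None, None])
labels_op = _match_labels(class_texts_ph, vocabulary_list=['cat', 'dog', 'bird'])
with tf.Session() as sess:
  sess.run(tf.tables_initializer())  # The HashTable must be initialized.
  print(sess.run(labels_op,
                 feed_dict={class_texts_ph: [['cat', 'dog'], ['bird', 'oov']]}))
  # -> [[1. 1. 0.], [0. 0. 1.]]; the OOV id lands in the extra last column,
  #    which the [:, :-1] slice drops.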
def resize_image_to_size(image,
                         new_height=600,
                         new_width=1024,
                         method=tf.image.ResizeMethod.BILINEAR,
                         align_corners=False):
  """Resizes images to the given height and width.

  Args:
    image: A 3D tensor of shape [height, width, channels].
    new_height: (optional) (scalar) desired height of the image.
    new_width: (optional) (scalar) desired width of the image.
    method: (optional) interpolation method used in resizing. Defaults to
      BILINEAR.
    align_corners: bool. If true, exactly align all 4 corners of the input
      and output. Defaults to False.

  Returns:
    resized_image: A tensor of size [new_height, new_width, channels].
    resized_image_shape: A 1D tensor of shape [3] containing the shape of the
      resized image.
  """
  with tf.name_scope("resize_image_to_size"):
    new_image = tf.image.resize_images(
        image,
        tf.stack([new_height, new_width]),
        method=method,
        align_corners=align_corners)
    image_shape = utils.get_tensor_shape(image)
    return new_image, tf.stack([new_height, new_width, image_shape[2]])
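# --- Usage sketch (illustrative; assumes TF 1.x graph mode) ---
import numpy as np
import tensorflow as tf

image_ph = tf.placeholder(tf.float32, shape=[None, None, 3])
resized, resized_shape = resize_image_to_size(
    image_ph, new_height=300, new_width=400)
with tf.Session() as sess:
  out, out_shape = sess.run(
      [resized, resized_shape],
      feed_dict={image_ph: np.zeros((600, 800, 3), np.float32)})
  assert out.shape == (300, 400, 3) and list(out_shape) == [300, 400, 3]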
def _extract_class_label(self, class_texts, vocabulary_list):
  """Extracts class labels.

  Args:
    class_texts: a [batch, max_num_objects] string tensor.
    vocabulary_list: a list of words of length `num_classes`.

  Returns:
    labels: a [batch, num_classes] float tensor.
  """
  with tf.name_scope('extract_class_label'):
    batch, _ = utils.get_tensor_shape(class_texts)

    categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key='name_to_id', vocabulary_list=vocabulary_list, num_oov_buckets=1)
    indicator_col = tf.feature_column.indicator_column(categorical_col)
    indicator = tf.feature_column.input_layer(
        {'name_to_id': class_texts}, feature_columns=[indicator_col])
    labels = tf.cast(indicator[:, :-1] > 0, tf.float32)

    # `batch` may be a dynamic tensor; only set a static batch size if known.
    if isinstance(batch, int):
      labels.set_shape([batch, len(vocabulary_list)])
    else:
      labels.set_shape([None, len(vocabulary_list)])
  return labels
def _batch_scale_box_fn(examples):
  (image, image_shape, object_boxes,
   proposal_boxes) = (examples[InputDataFields.image],
                      examples[InputDataFields.image_shape],
                      examples[InputDataFields.object_boxes],
                      examples[InputDataFields.proposals])
  _, pad_h, pad_w, _ = utils.get_tensor_shape(image)
  img_h, img_w, _ = tf.unstack(image_shape, axis=-1)

  def _scale_box(box):
    ymin, xmin, ymax, xmax = tf.unstack(box, axis=-1)
    img_h_expanded = tf.to_float(tf.expand_dims(img_h, axis=-1))
    img_w_expanded = tf.to_float(tf.expand_dims(img_w, axis=-1))
    ymin = ymin * img_h_expanded / tf.to_float(pad_h)
    xmin = xmin * img_w_expanded / tf.to_float(pad_w)
    ymax = ymax * img_h_expanded / tf.to_float(pad_h)
    xmax = xmax * img_w_expanded / tf.to_float(pad_w)
    return tf.stack([ymin, xmin, ymax, xmax], axis=-1)

  examples[InputDataFields.object_boxes] = _scale_box(object_boxes)
  examples[InputDataFields.proposals] = _scale_box(proposal_boxes)
  return examples
def _extract_class_label(self, num_captions, caption_strings, caption_lengths,
                         vocabulary_list):
  """Encodes labels.

  Args:
    num_captions: a [batch] int tensor, should always be ONE.
    caption_strings: a [batch, num_captions, max_caption_len] string tensor.
    caption_lengths: a [batch, num_captions] int tensor.
    vocabulary_list: a list of words of length `num_classes`.

  Returns:
    class_label: a [batch, num_classes] float tensor.
  """
  with tf.name_scope('extract_class_label'):
    batch, num_captions, max_caption_len = utils.get_tensor_shape(
        caption_strings)

    # Since `num_captions` should always be ONE, matching against the full
    # `caption_strings` tensor is equivalent to matching the first caption.
    caption_string = caption_strings[:, 0, :]
    caption_length = caption_lengths[:, 0]

    categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key='name_to_class_id',
        vocabulary_list=vocabulary_list,
        num_oov_buckets=1)
    indicator_col = tf.feature_column.indicator_column(categorical_col)
    indicator = tf.feature_column.input_layer(
        {'name_to_class_id': caption_strings}, feature_columns=[indicator_col])
    class_label = tf.cast(indicator[:, :-1] > 0, tf.float32)
    class_label.set_shape([batch, len(vocabulary_list)])
  return class_label
def _calc_anchor_scores(self,
                        class_activation_map,
                        anchors,
                        resize_height=224,
                        resize_width=224,
                        num_boxes_per_class=100):
  """Calculates anchor scores based on the class activation map.

  Args:
    class_activation_map: A [batch, height, width, num_classes] float tensor.
    anchors: A [batch, number_of_anchors, 4] float tensor.

  Returns:
    anchor_scores: A [batch, number_of_anchors, num_classes] tensor.
  """
  with tf.name_scope('calc_anchor_scores'):
    class_activation_map = tf.image.resize_images(
        class_activation_map, [resize_height, resize_width])
    batch, height, width, num_classes = utils.get_tensor_shape(
        class_activation_map)
    ymin, xmin, ymax, xmax = tf.unstack(anchors, axis=-1)

    # Convert normalized anchors to absolute pixel coordinates.
    anchors_absolute = tf.stack([
        tf.to_int64(tf.round(ymin * tf.to_float(height))),
        tf.to_int64(tf.round(xmin * tf.to_float(width))),
        tf.to_int64(tf.round(ymax * tf.to_float(height))),
        tf.to_int64(tf.round(xmax * tf.to_float(width)))
    ], axis=-1)
    fn = model_utils.build_proposal_saliency_fn(
        func_name='wei', border_ratio=0.2, purity_weight=1.0)
    anchor_scores = fn(class_activation_map, anchors_absolute)
  return anchor_scores
def _encode_captions(self,
                     caption_strings,
                     vocabulary_list,
                     common_dimensions=300,
                     scope="coco_word_embedding",
                     is_training=False):
  """Builds caption model.

  Args:
    caption_strings: captions in the batch, a [num_captions_in_batch,
      max_caption_length] string tensor.
    vocabulary_list: words in the vocabulary, a list of python strings.
    common_dimensions: dimensions of the word embedding.
    scope: variable scope of the word embedding.
    is_training: if True, training graph is built.

  Returns:
    text_feature: embedding of each word, a [num_captions_in_batch,
      max_caption_length, common_dimensions] tensor.
  """
  (num_captions_in_batch,
   max_caption_length) = utils.get_tensor_shape(caption_strings)

  caption_strings_flattened = tf.reshape(caption_strings, [-1])
  text_feature_flattened = self._encode_words(caption_strings_flattened,
                                              common_dimensions,
                                              vocabulary_list)
  text_feature = tf.reshape(
      text_feature_flattened,
      [num_captions_in_batch, max_caption_length, common_dimensions])
  return text_feature
def build_loss(self, predictions, examples, **kwargs):
  """Builds tf graph to compute loss.

  Args:
    predictions: dict of prediction results keyed by name.
    examples: dict of inputs keyed by name.

  Returns:
    loss_dict: dict of loss tensors keyed by name.
  """
  loss_dict = {}
  with tf.name_scope('losses'):
    # Extract image-level labels.
    labels = self._extract_class_label(
        class_texts=examples[InputDataFields.caption_strings],
        vocabulary_list=self._vocabulary_list)

    # Loss of the multi-instance detection network.
    midn_logits = predictions[OICRPredictions.midn_logits]
    losses = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels, logits=midn_logits)
    loss_dict['midn_cross_entropy_loss'] = tf.reduce_mean(losses)

    # Losses of the online instance classifier refinement network.
    options = self._model_proto
    (num_proposals, proposals, proposal_scores_0) = (
        predictions[DetectionResultFields.num_proposals],
        predictions[DetectionResultFields.proposal_boxes],
        predictions[OICRPredictions.midn_proba_r_given_c])
    batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_scores_0)
    # Prepend a background column of zeros.
    proposal_scores_0 = tf.concat(
        [tf.fill([batch, max_num_proposals, 1], 0.0), proposal_scores_0],
        axis=-1)

    for i in range(options.oicr_iterations):
      proposal_scores_1 = predictions[OICRPredictions.oicr_proposal_scores +
                                      '_at_{}'.format(i + 1)]
      loss_dict['oicr_cross_entropy_loss_at_{}'.format(
          i + 1)] = self._calc_oicr_loss(
              labels,
              num_proposals,
              proposals,
              proposal_scores_0,
              proposal_scores_1,
              scope='oicr_{}'.format(i + 1),
              iou_threshold=options.oicr_iou_threshold)
      proposal_scores_0 = proposal_scores_1

  return loss_dict
def gaussian_filter(inputs, ksize=3):
  """Applies Gaussian filter to the inputs.

  Args:
    inputs: input images, a [batch, height, width, channels] float tensor.
    ksize: aperture size of the Gaussian kernel.

  Returns:
    outputs: output images, a [batch, height, width, channels] float tensor.
  """
  batch, height, width, channels = utils.get_tensor_shape(inputs)

  kernel = gaussian_kernel(ksize)
  kernel = tf.reshape(tf.constant(kernel), [ksize, ksize, 1, 1])

  # Filter each channel independently with the same 2-D kernel.
  outputs = []
  channel_images = tf.split(inputs, num_or_size_splits=channels, axis=-1)
  for channel_image in channel_images:
    outputs.append(
        tf.nn.conv2d(
            channel_image,
            kernel, [1, 1, 1, 1],
            padding='SAME',
            data_format="NHWC",
            name="gaussian_filter"))
  return tf.concat(outputs, axis=-1)
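# --- Design note (illustrative sketch, not the library implementation) ---
# The per-channel loop above is equivalent to a depthwise convolution with
# the same 2-D kernel tiled across channels; `gaussian_kernel(ksize)` is
# assumed to return a normalized [ksize, ksize] numpy array, and the channel
# count must be static.
def gaussian_filter_depthwise(inputs, ksize=3):
  channels = inputs.get_shape().as_list()[-1]
  kernel = tf.reshape(tf.constant(gaussian_kernel(ksize)), [ksize, ksize, 1, 1])
  kernel = tf.tile(kernel, [1, 1, channels, 1])  # One filter per channel.
  return tf.nn.depthwise_conv2d(inputs, kernel, [1, 1, 1, 1], padding='SAME')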
def _extract_text_feature(self,
                          text_strings,
                          text_lengths,
                          vocabulary_list,
                          initial_embedding=None,
                          embedding_dims=50,
                          trainable=True,
                          max_norm=None):
  """Extracts text feature.

  Args:
    text_strings: A [batch, max_text_length] string tensor.
    text_lengths: A [batch] int tensor.
    vocabulary_list: A list of words.

  Returns:
    token_ids: a [batch, max_text_length] int tensor.
    text_features: a [batch, max_text_length, embedding_dims] float tensor.
  """
  batch, max_text_length = utils.get_tensor_shape(text_strings)

  text_strings_flattened = tf.reshape(text_strings, [-1])
  token_ids_flattened, text_features_flattened = self._encode_tokens(
      text_strings_flattened, embedding_dims, vocabulary_list,
      initial_embedding, trainable)

  token_ids = tf.reshape(token_ids_flattened, [batch, max_text_length])
  text_features = tf.reshape(text_features_flattened,
                             [batch, max_text_length, embedding_dims])
  return token_ids, text_features
def _batch_resize_image_fn(examples):
  # Resize image; height and width denote the padding size.
  image = examples[InputDataFields.image]
  _, height, width, channels = utils.get_tensor_shape(image)

  index = tf.random_uniform([],
                            minval=0,
                            maxval=len(options.batch_resize_scale_value),
                            dtype=tf.int32)
  scale_h = scale_w = tf.gather(
      [x for x in options.batch_resize_scale_value], index)

  new_height = tf.to_int32(tf.round(scale_h * tf.to_float(height)))
  new_width = tf.to_int32(tf.round(scale_w * tf.to_float(width)))
  new_image = tf.image.resize_images(image, tf.stack([new_height, new_width]))
  examples[InputDataFields.image] = new_image

  # Modify the image_shape; height and width denote the image size.
  image_shape = examples[InputDataFields.image_shape]
  height, width, channels = tf.unstack(image_shape, axis=-1)
  new_height = tf.to_int32(tf.round(scale_h * tf.to_float(height)))
  new_width = tf.to_int32(tf.round(scale_w * tf.to_float(width)))
  new_image_shape = tf.stack([new_height, new_width, channels], axis=-1)
  examples[InputDataFields.image_shape] = new_image_shape

  return examples
def sample_negatives_randomly(num_captions, caption_strings, caption_lengths):
  """Samples negative examples randomly.

  Args:
    num_captions: number of captions of each example, a [batch] int tensor.
    caption_strings: caption data, a [batch, max_num_captions,
      max_caption_length] string tensor.
    caption_lengths: length of each caption, a [batch, max_num_captions] int
      tensor.

  Returns:
    num_neg_captions: number of captions of each example, a [batch] int
      tensor.
    neg_caption_strings: caption data, a [batch, max_num_captions,
      max_caption_length] string tensor.
    neg_caption_lengths: length of each caption, a [batch, max_num_captions]
      int tensor.
  """
  batch = utils.get_tensor_shape(num_captions)[0]

  # Draw a non-zero offset so an example is never paired with itself
  # (`random_uniform` excludes `maxval`).
  offset = tf.random_uniform([batch], minval=1, maxval=batch, dtype=tf.int32)
  index = tf.range(batch, dtype=tf.int32)
  sampled_index = tf.mod(index + offset, batch)

  return (tf.gather(num_captions, sampled_index),
          tf.gather(caption_strings, sampled_index),
          tf.gather(caption_lengths, sampled_index))
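# --- Sanity check (illustrative) ---
# With each offset drawn from [1, batch), (index + offset) % batch never maps
# an example back to itself, so every sampled "negative" caption comes from a
# different example.
import numpy as np
for offset in range(1, 4):
  sampled = (np.arange(4) + offset) % 4
  assert not np.any(sampled == np.arange(4))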
def _predict_image_score_map(self, examples):
  """Builds tf graph for predicting the image score map.

  Args:
    examples: dict of input tensors keyed by name.

  Returns:
    predictions: dict of prediction results keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  image = examples[InputDataFields.image]

  # Keep image size for resizing saliency and activation map later.
  batch, height, width, channels = utils.get_tensor_shape(image)

  class_act_map_predictions = self._calc_class_act_map(examples)
  class_act_map = class_act_map_predictions[VOCPredictions.class_act_map]

  def _resize_fn(image, ksize=32):
    resized_image = tf.image.resize_images(image, [height, width])
    if ksize:
      smoothed_image = imgproc.gaussian_filter(resized_image, ksize=ksize)
    else:
      smoothed_image = resized_image
    return smoothed_image

  predictions = {
      VOCPredictionTasks.image_saliency: tf.zeros([batch, height, width, 1]),
      VOCPredictionTasks.image_score_map: _resize_fn(class_act_map),
  }
  return predictions
def _build_midn_network(self, num_proposals, proposal_features,
                        num_classes=20):
  """Builds the Multiple Instance Detection Network.

  MIDN: An attention network.

  Args:
    num_proposals: A [batch] int tensor.
    proposal_features: A [batch, max_num_proposals, features_dims] float
      tensor.
    num_classes: Number of classes.

  Returns:
    logits: A [batch, num_classes] float tensor.
    proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
  """
  with tf.name_scope('multi_instance_detection'):
    batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
    mask = tf.sequence_mask(
        num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
    mask = tf.expand_dims(mask, axis=-1)

    # Calculates the attention score: proposal `r` given class `c`.
    #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].
    logits_r_given_c = slim.fully_connected(
        proposal_features,
        num_outputs=num_classes,
        activation_fn=None,
        scope='midn/proba_r_given_c')
    logits_r_given_c = tf.multiply(mask, logits_r_given_c)
    proba_r_given_c = utils.masked_softmax(
        data=logits_r_given_c, mask=mask, dim=1)
    proba_r_given_c = tf.multiply(mask, proba_r_given_c)
    tf.summary.histogram('midn/logits_r_given_c', logits_r_given_c)

    # Calculates the weighted logits:
    #   logits_c_given_r shape = [batch, max_num_proposals, num_classes].
    #   logits shape = [batch, num_classes].
    logits_c_given_r = slim.fully_connected(
        proposal_features,
        num_outputs=num_classes,
        activation_fn=None,
        scope='midn/proba_c_given_r')
    proba_c_given_r = tf.nn.softmax(logits_c_given_r)
    proba_c_given_r = tf.multiply(mask, proba_c_given_r)
    tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)

    # Aggregates the logits.
    logits = tf.multiply(logits_c_given_r, proba_r_given_c)
    logits = tf.reduce_sum(logits, axis=1)
    tf.summary.histogram('midn/logits', logits)

  return logits, proba_r_given_c
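# --- Illustrative sketch of the masked softmax used above; the real
# implementation lives in `utils.masked_softmax` and may differ. A common
# formulation pushes padded proposals to a large negative logit so they get
# (numerically) zero attention weight:
def _masked_softmax_sketch(data, mask, dim):
  # data: [batch, max_num_proposals, num_classes] logits.
  # mask: [batch, max_num_proposals, 1] float tensor, 1 for valid proposals.
  masked_logits = data * mask - 1e9 * (1.0 - mask)
  return tf.nn.softmax(masked_logits, axis=dim)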
def _cumsum_gradient(score_map, box):
  # `border_ratio`, `purity_weight`, and `_SMALL_NUMBER` are resolved from
  # the enclosing scope.
  b, n, m, c = utils.get_tensor_shape(score_map)
  _, p, _ = utils.get_tensor_shape(box)

  # Leave a 2-pixel margin to the image border.
  ymin, xmin, ymax, xmax = tf.unstack(box, axis=-1)
  ymin, xmin = tf.maximum(ymin, 2), tf.maximum(xmin, 2)
  ymax, xmax = tf.minimum(ymax, tf.to_int64(n - 2)), tf.minimum(
      xmax, tf.to_int64(m - 2))
  box = tf.stack([ymin, xmin, ymax, xmax], axis=-1)

  box_exp = _get_expanded_box(
      box, img_h=n, img_w=m, border_ratio=border_ratio)

  box_list = [box, box_exp]
  area_list = [tf.cast(_get_box_area(box_i), tf.float32) for box_i in box_list]
  cumsum = imgproc.calc_cumsum_2d(score_map, tf.concat(box_list, axis=1))
  cumsum_list = [cumsum[:, i * p:(i + 1) * p, :] for i in range(len(box_list))]

  # The main box has to have a positive area.
  assert_op = tf.Assert(
      tf.reduce_all(tf.greater(area_list[0], 0)),
      ["Check area of the main box failed:", area_list[0]])

  with tf.control_dependencies([assert_op]):
    border_area = area_list[1] - area_list[0]
    border_cumsum = cumsum_list[1] - cumsum_list[0]

    border_avg = tf.div(
        border_cumsum,
        tf.maximum(_SMALL_NUMBER, tf.expand_dims(border_area, axis=-1)))
    box_avg = tf.div(
        cumsum_list[0],
        tf.maximum(_SMALL_NUMBER, tf.expand_dims(area_list[0], axis=-1)))

  return purity_weight * box_avg - border_avg
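# --- Sanity check (illustrative) ---
# The score above rewards boxes whose interior is brighter than their
# surrounding border; a numpy mock with purity_weight = 1:
import numpy as np
score_map = np.zeros((8, 8))
score_map[2:6, 2:6] = 1.0  # A bright 4x4 blob.
inside_avg = score_map[2:6, 2:6].mean()                                  # 1.0
border_avg = (score_map.sum() - score_map[2:6, 2:6].sum()) / (64 - 16)   # 0.0
assert inside_avg - border_avg == 1.0  # A tight box scores the maximum.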
def _predict_image_saliency(self, examples):
  """Builds tf graph for prediction.

  Args:
    examples: dict of input tensors keyed by name.

  Returns:
    predictions: dict of prediction results keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  if not options.use_saliency_score:
    raise ValueError("The flag of `use_saliency_score` should be set.")

  image = examples[InputDataFields.image]

  # Extract image feature, shape =
  #   [batch, feature_height * feature_width, common_dimensions].
  image_feature = self._encode_images(
      image,
      cnn_name=options.cnn_name,
      cnn_trainable=options.cnn_trainable,
      cnn_weight_decay=options.cnn_weight_decay,
      cnn_feature_map=options.cnn_feature_map,
      cnn_dropout_keep_prob=options.cnn_dropout_keep_prob,
      cnn_checkpoint=options.cnn_checkpoint,
      cnn_scope=GAPVariableScopes.cnn,
      is_training=is_training)
  image_feature = self._project_images(
      image_feature,
      common_dimensions=options.common_dimensions,
      scope=GAPVariableScopes.image_proj,
      hyperparams=options.image_proj_hyperparams,
      is_training=is_training)
  (batch, feature_height, feature_width,
   common_dimensions) = utils.get_tensor_shape(image_feature)
  image_feature = tf.reshape(image_feature, [batch, -1, common_dimensions])

  # Predict saliency score.
  #   image_saliency shape = [batch, num_regions].
  image_saliency = self._calc_saliency_score(
      image_feature,
      scope=GAPVariableScopes.image_saliency,
      hyperparams=options.image_saliency_hyperparams,
      is_training=is_training)

  return {
      GAPPredictions.image_saliency:
          tf.reshape(image_saliency, [-1, feature_height, feature_width]),
  }
def encode(self, feature, length, scope=None):
  """Encodes sequence features into representation.

  Args:
    feature: A [batch, max_sequence_len, dims] float tensor.
    length: A [batch] int tensor.

  Returns:
    A [batch, dims] float tensor.
  """
  options = self._model_proto
  is_training = self._is_training

  def lstm_cell():
    cell = tf.nn.rnn_cell.BasicLSTMCell(
        num_units=options.hidden_units, forget_bias=1.0)
    if is_training:
      cell = tf.nn.rnn_cell.DropoutWrapper(
          cell,
          input_keep_prob=options.input_keep_prob,
          output_keep_prob=options.output_keep_prob,
          state_keep_prob=options.state_keep_prob)
    return cell

  rnn_cell = tf.contrib.rnn.MultiRNNCell(
      [lstm_cell() for _ in range(options.number_of_layers)])

  with tf.variable_scope(scope):
    outputs, state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=rnn_cell,
        cell_bw=rnn_cell,
        inputs=feature,
        sequence_length=length,
        parallel_iterations=options.parallel_iterations,
        dtype=tf.float32)

    # Alternative: masked average pooling over the step-wise outputs.
    # mask = tf.sequence_mask(
    #     length, maxlen=utils.get_tensor_shape(feature)[1], dtype=tf.float32)
    # outputs = tf.multiply(0.5, outputs[0] + outputs[1])
    # feature = utils.masked_avg_nd(data=outputs, mask=mask, dim=1)
    # return tf.squeeze(feature, axis=1)

    # Concatenate the c/h states of every layer and direction, then project.
    state_list = []
    for state_per_direction in state:
      for state_per_layer in state_per_direction:
        state_list.extend([state_per_layer.c, state_per_layer.h])

    state_final = tf.contrib.layers.fully_connected(
        inputs=tf.concat(state_list, axis=-1),
        num_outputs=options.output_units,
        activation_fn=None,
        scope='bilstm_output')
  return state_final
def _calc_spp_feature(self, inputs, spp_bins=[1, 2, 3, 6], max_pool=True):
  """Applies the SPP layer to get a multi-resolution feature.

  LIMITATION: the inputs have to have a static shape.

  Args:
    inputs: A [batch, feature_height, feature_width, feature_dims] float
      tensor.
    spp_bins: A python list representing the number of bins at each SPP
      level.

  Returns:
    spp_pool: A [batch, spp_feature_dims] fixed-length feature tensor.

  Raises:
    ValueError: If any of the parameters are invalid.
  """
  batch, height, width, _ = utils.get_tensor_shape(inputs)
  if not type(height) == type(width) == int:
    raise ValueError('The inputs should have static shape.')

  pool_fn = tf.nn.max_pool if max_pool else tf.nn.avg_pool

  with tf.name_scope('calc_spp_feature'):
    pool_outputs = []
    for bins in spp_bins:
      if height % bins or width % bins:
        raise ValueError('Remainder should be ZERO.')
      pool_h, pool_w = height // bins, width // bins
      stride_h, stride_w = height // bins, width // bins

      pool = pool_fn(
          inputs,
          ksize=[1, pool_h, pool_w, 1],
          strides=[1, stride_h, stride_w, 1],
          padding='SAME')
      tf.summary.histogram('oicr/spp_bins_{}'.format(bins), pool)
      tf.summary.scalar('oicr/spp_bins_min_{}'.format(bins),
                        tf.reduce_min(pool))
      tf.summary.scalar('oicr/spp_bins_max_{}'.format(bins),
                        tf.reduce_max(pool))
      tf.summary.scalar('oicr/spp_bins_avg_{}'.format(bins),
                        tf.reduce_mean(pool))
      pool_outputs.append(tf.reshape(pool, [batch, -1]))
      tf.logging.info(
          'SPP bins=%i, bin_size=(%i,%i), strides=(%i, %i), output=%s', bins,
          pool_h, pool_w, stride_h, stride_w, pool.get_shape().as_list())

    spp_pool = tf.concat(pool_outputs, axis=-1)
    tf.logging.info('Final SPP shape=%s', spp_pool.get_shape().as_list())

  return spp_pool
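# --- Sanity check (illustrative) ---
# With padding='SAME' and ksize == stride == height // bins, each level tiles
# the map into a bins x bins grid, so for a [batch, 12, 12, C] input the
# flattened SPP feature has sum(bins^2) * C = 50 * C dimensions with the
# default bins [1, 2, 3, 6].
assert sum(b * b for b in [1, 2, 3, 6]) == 50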
def gather_in_batch_captions(image_id, num_captions, caption_strings,
                             caption_lengths):
  """Gathers all of the in-batch captions into a caption batch.

  Args:
    image_id: image_id, a [batch] int64 tensor.
    num_captions: number of captions of each example, a [batch] int tensor.
    caption_strings: caption data, a [batch, max_num_captions,
      max_caption_length] string tensor.
    caption_lengths: length of each caption, a [batch, max_num_captions] int
      tensor.

  Returns:
    image_ids_gathered: associated image_id of each caption in the new batch,
      a [num_captions_in_batch] int tensor.
    caption_strings_gathered: caption data, a [num_captions_in_batch,
      max_caption_length] string tensor.
    caption_lengths_gathered: length of each caption, a
      [num_captions_in_batch] int tensor.
  """
  if not image_id.dtype in [tf.int32, tf.int64]:
    raise ValueError('The image_id has to be int32 or int64')

  (batch, max_num_captions,
   max_caption_length) = utils.get_tensor_shape(caption_strings)

  # caption_mask denotes the validity of each caption in the flattened batch.
  #   caption_mask shape = [batch * max_num_captions].
  caption_mask = tf.sequence_mask(
      num_captions, maxlen=max_num_captions, dtype=tf.bool)
  caption_mask = tf.reshape(caption_mask, [-1])

  # image_id shape = [batch, max_num_captions].
  image_id = tf.tile(tf.expand_dims(image_id, axis=1), [1, max_num_captions])

  # Reshape the tensors so their first dimension is batch * max_num_captions.
  image_id_reshaped = tf.reshape(image_id, [-1])
  caption_strings_reshaped = tf.reshape(caption_strings,
                                        [-1, max_caption_length])
  caption_lengths_reshaped = tf.reshape(caption_lengths, [-1])

  # Apply the caption_mask.
  image_ids_gathered = tf.boolean_mask(image_id_reshaped, caption_mask)
  caption_strings_gathered = tf.boolean_mask(caption_strings_reshaped,
                                             caption_mask)
  caption_lengths_gathered = tf.boolean_mask(caption_lengths_reshaped,
                                             caption_mask)

  return image_ids_gathered, caption_strings_gathered, caption_lengths_gathered
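# --- Usage sketch (illustrative; assumes TF 1.x graph mode) ---
import tensorflow as tf

image_id = tf.constant([100, 200], dtype=tf.int64)
num_captions = tf.constant([1, 2], dtype=tf.int32)
caption_strings = tf.constant([[['a', 'cat'], ['', '']],
                               [['a', 'dog'], ['big', 'dog']]])
caption_lengths = tf.constant([[2, 0], [2, 2]], dtype=tf.int32)

ids, strings, lengths = gather_in_batch_captions(
    image_id, num_captions, caption_strings, caption_lengths)
with tf.Session() as sess:
  # ids -> [100, 200, 200]: the padded second caption of the first example is
  # dropped, while both valid captions of the second example are kept.
  print(sess.run([ids, strings, lengths]))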
def draw_rectangles(image,
                    boxes,
                    scores=None,
                    labels=None,
                    color=GREEN,
                    thickness=1,
                    fontscale=1.0):
  """Draws rectangles on the image.

  Args:
    image: a [batch, height, width, 3] uint8 tensor.
    boxes: a [batch, num_boxes, 4] float tensor representing normalized
      boxes, i.e.: [ymin, xmin, ymax, xmax], values are ranging from 0.0 to
      1.0.
    scores: a [batch, num_boxes] float tensor representing the scores to be
      drawn on the image.
    labels: a [batch, num_boxes] string or float tensor representing the
      labels to be drawn on the image.
    color: color to be used.
    thickness: the line thickness.
    fontscale: size of the font.

  Returns:
    canvas: a [batch, height, width, 3] uint8 tensor with information drawn.
  """

  def _draw_fn(inputs):
    """Draws the boxes on a single image.

    Args:
      inputs: a tuple of (image, boxes, scores, labels) for one example:
        image: a [height, width, 3] uint8 tensor.
        boxes: a [num_boxes, 4] float tensor representing
          [ymin, xmin, ymax, xmax].
        scores: a [num_boxes] float tensor representing box scores.
        labels: a [num_boxes] string tensor denoting the text to be drawn.

    Returns:
      canvas: a [height, width, 3] uint8 tensor with boxes drawn.
    """
    image, boxes, scores, labels = inputs
    canvas = tf.py_func(
        func=lambda x, y, z, w: _py_draw_rectangles(
            x, y, z, w, color=color, thickness=thickness, fontscale=fontscale),
        inp=[image, boxes, scores, labels],
        Tout=tf.uint8)
    canvas.set_shape(tf.TensorShape([None, None, 3]))
    return canvas

  batch, num_boxes, _ = utils.get_tensor_shape(boxes)
  if scores is None:
    scores = tf.constant(-9999.0, shape=[batch, num_boxes], dtype=tf.float32)
  if labels is None:
    labels = tf.constant("", shape=[batch, num_boxes], dtype=tf.string)

  return tf.map_fn(
      _draw_fn, elems=[image, boxes, scores, labels], dtype=tf.uint8)
def encode(self, feature, length, scope=None):
  """Encodes sequence features into representation.

  Args:
    feature: A [batch, max_sequence_len, dims] float tensor.
    length: A [batch] int tensor.

  Returns:
    A [batch, dims] float tensor.
  """
  with tf.name_scope('avg_pooling_encoder'):
    mask = tf.sequence_mask(
        length, maxlen=utils.get_tensor_shape(feature)[-2], dtype=tf.float32)
    feature = utils.masked_avg_nd(data=feature, mask=mask, dim=1)
    return tf.squeeze(feature, axis=1)
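# --- Sanity check (illustrative; mirrors what `utils.masked_avg_nd` is
# assumed to compute) ---
import tensorflow as tf

feature = tf.constant([[[2.0], [4.0], [999.0]]])  # Last step is padding.
mask = tf.sequence_mask(tf.constant([2]), maxlen=3, dtype=tf.float32)
masked_avg = (tf.reduce_sum(feature * tf.expand_dims(mask, -1), axis=1) /
              tf.reduce_sum(mask, axis=1, keepdims=True))
with tf.Session() as sess:
  print(sess.run(masked_avg))  # [[3.0]] -- the padding value 999 is ignored.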
def _calc_vgg_proposal_feature(self, image_feature_cropped):
  """Calculates proposal features using the VGG fc layers.

  Args:
    image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
      float tensor.

  Returns:
    proposal_feature: A [batch, proposal_feature_dims] float tensor.
  """
  options = self._model_proto
  is_training = self._is_training

  # SPP: max-pool the cropped feature down to a fixed 7x7 grid.
  bins = 7
  batch, height, width, _ = utils.get_tensor_shape(image_feature_cropped)
  if height % bins or width % bins:
    raise ValueError('Remainder should be ZERO.')
  pool_h, pool_w = height // bins, width // bins
  stride_h, stride_w = height // bins, width // bins
  net = tf.nn.max_pool(
      image_feature_cropped,
      ksize=[1, pool_h, pool_w, 1],
      strides=[1, stride_h, stride_w, 1],
      padding='SAME')

  with tf.variable_scope(options.cnn.scope, reuse=True):
    with tf.variable_scope(options.cnn.name, reuse=True):
      net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
      net = slim.dropout(
          net,
          options.cnn.dropout_keep_prob,
          is_training=is_training and options.cnn.trainable,
          scope='dropout6')
      net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
      net = slim.dropout(
          net,
          options.hidden_dropout_keep_prob,
          is_training=is_training,
          scope='dropout7')
  net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
  return net
def _encode_labels(self,
                   num_captions,
                   caption_strings,
                   caption_lengths,
                   vocabulary_list,
                   is_training=False):
  """Encodes labels.

  Args:
    num_captions: a [batch] int tensor.
    caption_strings: a [batch, num_captions, max_caption_len] string tensor.
    caption_lengths: a [batch, num_captions] int tensor.
    vocabulary_list: a list of words of length `num_classes`.
    is_training: if True, training graph is built.

  Returns:
    classes: a [batch, num_classes] int tensor.
  """
  with tf.name_scope('encode_labels'):
    batch, num_captions, max_caption_len = utils.get_tensor_shape(
        caption_strings)
    caption_string = caption_strings[:, 0, :]
    caption_length = caption_lengths[:, 0]

    categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key='name_to_class_id',
        vocabulary_list=vocabulary_list,
        num_oov_buckets=1)
    indicator_col = tf.feature_column.indicator_column(categorical_col)
    indicator = tf.feature_column.input_layer(
        {'name_to_class_id': caption_strings},
        feature_columns=[indicator_col])
    classes = tf.cast(indicator[:, :-1] > 0, tf.int64)

    tf.summary.histogram('num_gt_boxes_per_image', caption_length)
    tf.summary.histogram('num_gt_labels_per_image',
                         tf.reduce_sum(classes, axis=-1))

    classes.set_shape([batch, len(vocabulary_list)])
  return classes
def _average_encoding(sequence_feature, sequence_length):
  """Encodes a sequence using average pooling.

  Args:
    sequence_feature: a [batch_sequence, max_sequence_length,
      feature_dimensions] float tensor.
    sequence_length: a [batch_sequence] int tensor.

  Returns:
    sequence_emb: a [batch_sequence, feature_dimensions] float tensor,
      representing the embedding vectors.
  """
  (_, max_sequence_length, _) = utils.get_tensor_shape(sequence_feature)

  mask = tf.sequence_mask(
      sequence_length, maxlen=max_sequence_length, dtype=tf.float32)
  sequence_emb = utils.masked_avg_nd(sequence_feature, mask, dim=1)
  sequence_emb = tf.squeeze(sequence_emb, axis=1)
  return sequence_emb
def _midn_loss_mine_hardest_negative(self, labels, losses):
  """Hardest negative mining of the MIDN loss.

  Args:
    labels: A [batch, num_classes] float tensor, where `1` denotes the
      presence of a class.
    losses: A [batch, num_classes] float tensor, the losses predicted by the
      model.

  Returns:
    mask: A [batch, num_classes] float tensor where `1` denotes the selected
      entry.
  """
  batch, num_classes = utils.get_tensor_shape(labels)

  # For each example, pick the negative class with the largest loss.
  indices_0 = tf.range(batch, dtype=tf.int64)
  indices_1 = utils.masked_argmax(data=losses, mask=1.0 - labels, dim=1)
  indices = tf.stack([indices_0, indices_1], axis=-1)
  negative_masks = tf.sparse_to_dense(
      indices, [batch, num_classes], sparse_values=1.0)
  return tf.add(labels, negative_masks)
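# --- Sanity check (illustrative) ---
# Per row, the hardest negative is the highest-loss class whose label is 0;
# the returned mask keeps all positives plus that single negative.
import numpy as np
labels = np.array([[1.0, 0.0, 0.0]])
losses = np.array([[0.9, 0.2, 0.7]])
hardest = np.argmax(np.where(labels == 0, losses, -np.inf), axis=1)
assert hardest[0] == 2  # Class 2 (loss 0.7) beats class 1 (loss 0.2).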
def _calc_graph_node_scores(node,
                            hidden_layers=None,
                            hidden_units=None,
                            dropout_keep_prob=1.0,
                            is_training=False,
                            scope='calc_graph_node_scores'):
  """Calculates the per-node scores.

  Args:
    node: A [batch, max_num_node, dims] float tensor.
    hidden_layers: An integer denoting the number of MLP layers; must be set.
    hidden_units: An integer denoting the MLP hidden units; must be set.
    dropout_keep_prob: Keep probability of the dropout layers.
    is_training: If true, build the training graph.
    scope: Variable scope name.

  Returns:
    A [batch, max_num_node] float tensor.
  """
  with tf.variable_scope(scope):
    batch = utils.get_tensor_shape(node)[0]

    # An MLP on top of the node features, inputs = [node].
    hiddens = node
    for layer_i in range(hidden_layers):
      hiddens = tf.contrib.layers.fully_connected(
          inputs=hiddens,
          num_outputs=hidden_units,
          activation_fn=tf.nn.relu,
          scope='hidden_{}'.format(layer_i))
      hiddens = slim.dropout(
          hiddens, dropout_keep_prob, is_training=is_training)

    outputs = tf.contrib.layers.fully_connected(
        inputs=hiddens, num_outputs=1, activation_fn=None, scope='output')
    outputs = tf.squeeze(outputs, axis=-1)
  return outputs
def visualize(self,
              image,
              saliency,
              interpolation=tf.image.ResizeMethod.NEAREST_NEIGHBOR):
  """Visualizes images to tensorboard.

  Args:
    image: a [batch, height, width, channels] float tensor, in [0, 255].
    saliency: a [batch, feature_height, feature_width] float tensor.
    interpolation: interpolation method used to resize the heatmap.
  """
  (batch, height, width, channels) = utils.get_tensor_shape(image)

  image = image / 255.0
  heatmap = plotlib.convert_to_heatmap(saliency, normalize=True)
  heatmap = tf.image.resize_images(heatmap, [height, width], interpolation)
  heatmap = plotlib.gaussian_filter(heatmap, ksize=32)

  # Concatenate the image and its heatmap side by side (along width).
  image = tf.maximum(0.0, tf.concat([image, heatmap], axis=2))
  tf.summary.image("images", image, max_outputs=10)