def process_boundary(boundaries, input_length, t1_id, t2_id, all_dialogue):
  """Processes the boundaries of the dialogue."""
  points = tf.string_split([boundaries]).values
  points_val = tf.string_to_number(points, out_type=tf.int32)
  siz = tf.size(points_val) // 2
  start_points, end_points = points_val[0:siz], points_val[siz:]
  return do_process_boundary(start_points, end_points, input_length, t1_id,
                             t2_id, all_dialogue)
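# Usage sketch for process_boundary (not from the original module): the
# boundary string concatenates all start points followed by all end points,
# e.g. "0 5 12 20" -> starts [0, 5], ends [12, 20]. do_process_boundary is
# defined elsewhere, so only the parsing half is illustrated here, assuming
# TF 1.x graph mode and the module's usual tensorflow import.
import tensorflow as tf

boundaries = tf.constant("0 5 12 20")
points_val = tf.string_to_number(
    tf.string_split([boundaries]).values, out_type=tf.int32)
siz = tf.size(points_val) // 2
with tf.Session() as sess:
  print(sess.run([points_val[:siz], points_val[siz:]]))
  # [array([0, 5], dtype=int32), array([12, 20], dtype=int32)]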
def process_source_id(source_id):
  """Processes source_id to the right format."""
  if source_id.dtype == tf.string:
    source_id = tf.cast(tf.string_to_number(source_id), tf.int64)
  with tf.control_dependencies([source_id]):
    source_id = tf.cond(
        tf.equal(tf.size(source_id), 0),
        lambda: tf.cast(tf.constant(-1), tf.int64),
        lambda: tf.identity(source_id))
  return source_id
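# Hedged sketch of process_source_id (TF 1.x assumed): a string id is parsed
# and cast to int64, while an empty id tensor falls back to -1.
sid = process_source_id(tf.constant("123"))
with tf.Session() as sess:
  print(sess.run(sid))  # 123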
def label_string_to_tensor(x, batch_size, num_outputs=None):
  """Converts a batch of space-separated number strings to a dense tensor."""
  sparse = tf.string_split(x, delimiter=' ')
  values = tf.string_to_number(sparse.values)
  if num_outputs is None:
    dense = tf.reshape(values, [batch_size, -1])
  else:
    dense = tf.reshape(values, (batch_size, num_outputs))
  return dense
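# Minimal usage sketch for label_string_to_tensor (TF 1.x assumed): each batch
# element is one space-separated string of numbers, parsed into one row of a
# dense float32 tensor.
labels = tf.constant(["0 1 0", "1 1 0"])
dense = label_string_to_tensor(labels, batch_size=2, num_outputs=3)
with tf.Session() as sess:
  print(sess.run(dense))  # [[0. 1. 0.]
                          #  [1. 1. 0.]]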
def process_entry_self_play(intent, action, truth_action, kb, utterance,
                            boundary, reward_diag, reward_action, vocab_table):
  """Pre-processing procedure for the self-play iterator."""
  t1_id = tf.cast(vocab_table.lookup(tf.constant("<t1>")), tf.int32)
  t2_id = tf.cast(vocab_table.lookup(tf.constant("<t2>")), tf.int32)
  res = process_entry_common(intent, action, utterance, boundary, kb,
                             vocab_table, t1_id, t2_id)
  (tensor_intent, size_intent, source_diag, target_diag, size_dialogue,
   tensor_action, size_action, tensor_kb, has_reservation, mask1, mask2,
   turn_point) = res
  truth_action, _ = process_data(truth_action, vocab_table)
  splitted_reward_d = tf.string_split([reward_diag]).values
  splitted_reward_a = tf.string_split([reward_action]).values
  # Drop the last reward entry (the final dialogue turn).
  tensor_reward_diag = tf.string_to_number(
      splitted_reward_d, out_type=tf.float32)[:-1]
  tensor_reward_action = tf.string_to_number(
      splitted_reward_a, out_type=tf.float32)
  return (tensor_intent, size_intent, source_diag, target_diag, size_dialogue,
          tensor_action, size_action, truth_action, tensor_reward_diag,
          tensor_reward_action, tensor_kb, has_reservation, mask1, mask2,
          turn_point)
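# Standalone sketch of the reward parsing above (TF 1.x assumed): rewards
# arrive as one space-separated string, and the trailing entry is dropped by
# the [:-1] slice exactly as in process_entry_self_play.
reward_diag = tf.constant("0.1 0.2 0.3")
tensor_reward_diag = tf.string_to_number(
    tf.string_split([reward_diag]).values, out_type=tf.float32)[:-1]
with tf.Session() as sess:
  print(sess.run(tensor_reward_diag))  # [0.1 0.2]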
def _dedup_tensor(sp_tensor: tf.SparseTensor) -> tf.SparseTensor:
  """Dedup values of a SparseTensor along each row.

  Args:
    sp_tensor: A 2D SparseTensor to be deduped.

  Returns:
    A deduped SparseTensor of shape [batch_size, max_len], where max_len is
    the maximum number of unique values for a row in the Tensor.
  """
  string_batch_index = tf.as_string(sp_tensor.indices[:, 0])

  # tf.unique only works on 1D tensors. To avoid deduping across examples,
  # prepend each feature value with the example index. This requires casting
  # to and from strings for non-string features.
  string_values = sp_tensor.values
  original_dtype = sp_tensor.values.dtype
  if original_dtype != tf.string:
    string_values = tf.as_string(sp_tensor.values)
  index_and_value = tf.strings.join([string_batch_index, string_values],
                                    separator='|')
  unique_index_and_value, _ = tf.unique(index_and_value)

  # split is a shape [tf.size(values), 2] tensor. The first column contains
  # indices and the second column contains the feature value (we assume no
  # feature contains | so we get exactly 2 values from the string split).
  split = tf.string_split(unique_index_and_value, delimiter='|')
  split = tf.reshape(split.values, [-1, 2])
  string_indices = split[:, 0]
  values = split[:, 1]

  indices = tf.reshape(
      tf.string_to_number(string_indices, out_type=tf.int32), [-1])
  if original_dtype != tf.string:
    values = tf.string_to_number(values, out_type=original_dtype)
  values = tf.reshape(values, [-1])
  # Convert example indices into SparseTensor indices, e.g.
  # [0, 0, 0, 1, 3, 3] -> [[0,0], [0,1], [0,2], [1,0], [3,0], [3,1]]
  batch_size = tf.to_int32(sp_tensor.dense_shape[0])
  new_indices, max_len = _example_index_to_sparse_index(indices, batch_size)
  return tf.SparseTensor(
      indices=tf.to_int64(new_indices),
      values=values,
      dense_shape=[tf.to_int64(batch_size), max_len])
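# Standalone sketch of the dedup trick used by _dedup_tensor (TF 1.x assumed):
# each value is prefixed with its row index so that tf.unique, which only
# handles 1-D tensors, never merges equal values from different rows. Row 0
# holds [3, 3, 5] and row 1 holds [5]; only row 0's duplicate is removed.
row_index = tf.constant(["0", "0", "0", "1"])
values = tf.constant(["3", "3", "5", "5"])
keyed = tf.strings.join([row_index, values], separator="|")
unique_keyed, _ = tf.unique(keyed)
with tf.Session() as sess:
  print(sess.run(unique_keyed))  # [b'0|3' b'0|5' b'1|5']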
def _parse_single_example(self, example):
  """Parses a single serialized tf.Example proto.

  Args:
    example: a serialized tf.Example proto string.

  Returns:
    A dictionary of groundtruth with the following fields:
      source_id: a scalar tensor of int64 representing the image source_id.
      height: a scalar tensor of int64 representing the image height.
      width: a scalar tensor of int64 representing the image width.
      boxes: a float tensor of shape [K, 4], representing the groundtruth
        boxes in absolute coordinates with respect to the original image
        size.
      classes: an int64 tensor of shape [K], representing the class labels
        of each instance.
      is_crowds: a bool tensor of shape [K], indicating whether the instance
        is crowd.
      areas: a float tensor of shape [K], indicating the area of each
        instance.
      masks: a string tensor of shape [K], containing the bytes of the png
        mask of each instance.
  """
  decoder = tf_example_decoder.TfExampleDecoder(
      include_mask=self._include_mask)
  decoded_tensors = decoder.decode(example)

  image = decoded_tensors['image']
  image_size = tf.shape(image)[0:2]
  boxes = box_utils.denormalize_boxes(
      decoded_tensors['groundtruth_boxes'], image_size)
  groundtruths = {
      'source_id': tf.string_to_number(
          decoded_tensors['source_id'], out_type=tf.int64),
      'height': decoded_tensors['height'],
      'width': decoded_tensors['width'],
      'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
      'boxes': boxes,
      'classes': decoded_tensors['groundtruth_classes'],
      'is_crowds': decoded_tensors['groundtruth_is_crowd'],
      'areas': decoded_tensors['groundtruth_area'],
  }
  if self._include_mask:
    groundtruths.update({
        'masks': decoded_tensors['groundtruth_instance_masks_png'],
    })
  return groundtruths
def parse_single_tfexample(_, serialized_example):
  """Parses a serialized pb2 example."""
  # Read data from serialized examples.
  features = tf.parse_single_example(
      serialized_example,
      features={
          'x': tf.FixedLenFeature([], tf.string),
          'y': tf.FixedLenFeature([], tf.int64),
          # z is for sequence origins,
          # i.e. which genome and which position the seq is from
          # 'z': tf.VarLenFeature(tf.string)
      })
  seq_str = features['x']
  x_str = tf.string_split([seq_str], delimiter=' ').values
  features['x'] = tf.string_to_number(x_str, out_type=tf.int32)
  features['y'] = tf.cast(features['y'], dtype=tf.int32)
  return features
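# Hedged round-trip sketch (TF 1.x assumed): build one serialized tf.Example
# matching the schema parse_single_tfexample expects, then decode it.
example = tf.train.Example(features=tf.train.Features(feature={
    'x': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'1 2 3'])),
    'y': tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
}))
parsed = parse_single_tfexample(None, example.SerializeToString())
with tf.Session() as sess:
  print(sess.run(parsed))  # {'x': array([1, 2, 3], dtype=int32), 'y': 0}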
def _dataset_parser(value):
  """Parses data to a fixed-dimension input image and learning targets.

  Args:
    value: A dictionary containing an image and groundtruth annotations.

  Returns:
    image: Image tensor that is preprocessed to have normalized value and
      fixed dimension [image_height, image_width, 3]
    cls_targets_dict: ordered dictionary with keys
      [min_level, min_level+1, ..., max_level]. The values are tensors with
      shape [height_l, width_l, num_anchors]. The height_l and width_l
      represent the dimension of class logits at l-th level.
    box_targets_dict: ordered dictionary with keys
      [min_level, min_level+1, ..., max_level]. The values are tensors with
      shape [height_l, width_l, num_anchors * 4]. The height_l and width_l
      represent the dimension of bounding box regression output at l-th
      level.
    num_positives: Number of positive anchors in the image.
    source_id: Source image id. Default value -1 if the source id is empty
      in the groundtruth annotation.
    image_scale: Scale of the processed image to the original image.
    boxes: Groundtruth bounding box annotations. The box is represented in
      [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
      dimension [self._max_instances_per_image, 4].
    is_crowds: Groundtruth annotations to indicate if an annotation
      represents a group of instances by value {0, 1}. The tensor is padded
      with 0 to the fixed dimension [self._max_instances_per_image].
    areas: Groundtruth areas annotations. The tensor is padded with -1 to
      the fixed dimension [self._max_instances_per_image].
    classes: Groundtruth classes annotations. The tensor is padded with -1
      to the fixed dimension [self._max_instances_per_image].
  """
  with tf.name_scope('parser'):
    data = example_decoder.decode(value)
    source_id = data['source_id']
    image = data['image']
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']
    classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
    areas = data['groundtruth_area']
    is_crowds = data['groundtruth_is_crowd']

    if params['skip_crowd_during_training'] and self._is_training:
      indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
      classes = tf.gather_nd(classes, indices)
      boxes = tf.gather_nd(boxes, indices)

    # NOTE: The autoaugment method works best when used alongside the
    # standard horizontal flipping of images along with size jittering
    # and normalization.
    if params.get('autoaugment_policy', None) and self._is_training:
      from aug import autoaugment  # pylint: disable=g-import-not-at-top
      image, boxes = autoaugment.distort_image_with_autoaugment(
          image, boxes, params['autoaugment_policy'], params['use_augmix'],
          *params['augmix_params'])

    input_processor = DetectionInputProcessor(
        image, params['image_size'], boxes, classes)
    input_processor.normalize_image()
    if self._is_training and params['input_rand_hflip']:
      input_processor.random_horizontal_flip()
    if self._is_training:
      input_processor.set_training_random_scale_factors(
          params['train_scale_min'], params['train_scale_max'],
          params.get('target_size', None))
    else:
      input_processor.set_scale_factors_to_output_size()
    image = input_processor.resize_and_crop_image()
    boxes, classes = input_processor.resize_and_crop_boxes()

    # Assign anchors.
    (cls_targets, box_targets,
     num_positives) = anchor_labeler.label_anchors(boxes, classes)

    source_id = tf.where(
        tf.equal(source_id, tf.constant('')), '-1', source_id)
    source_id = tf.string_to_number(source_id)

    # Pad groundtruth data for evaluation.
    image_scale = input_processor.image_scale_to_original
    boxes *= image_scale
    is_crowds = tf.cast(is_crowds, dtype=tf.float32)
    boxes = pad_to_fixed_size(boxes, -1, [self._max_instances_per_image, 4])
    is_crowds = pad_to_fixed_size(
        is_crowds, 0, [self._max_instances_per_image, 1])
    areas = pad_to_fixed_size(areas, -1, [self._max_instances_per_image, 1])
    classes = pad_to_fixed_size(
        classes, -1, [self._max_instances_per_image, 1])
    return (image, cls_targets, box_targets, num_positives, source_id,
            image_scale, boxes, is_crowds, areas, classes)
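# Standalone sketch of the source_id fallback used above (TF 1.x assumed):
# empty ids are replaced with '-1' before numeric conversion so downstream
# evaluation always sees a valid number.
source_id = tf.constant('')
source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id)
with tf.Session() as sess:
  print(sess.run(tf.string_to_number(source_id)))  # -1.0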
def _dataset_parser(value):
  """Parses data to a fixed-dimension input image and learning targets.

  Args:
    value: A dictionary containing an image and groundtruth annotations.

  Returns:
    image: Image tensor that is preprocessed to have normalized value and
      fixed dimension [image_size, image_size, 3]
    cls_targets_dict: ordered dictionary with keys
      [min_level, min_level+1, ..., max_level]. The values are tensors with
      shape [height_l, width_l, num_anchors]. The height_l and width_l
      represent the dimension of class logits at l-th level.
    box_targets_dict: ordered dictionary with keys
      [min_level, min_level+1, ..., max_level]. The values are tensors with
      shape [height_l, width_l, num_anchors * 4]. The height_l and width_l
      represent the dimension of bounding box regression output at l-th
      level.
    num_positives: Number of positive anchors in the image.
    source_id: Source image id. Default value -1 if the source id is empty
      in the groundtruth annotation.
    image_scale: Scale of the processed image to the original image.
    image_info: image information that includes the original height and
      width, the scale of the processed image to the original image, and
      the scaled height and width.
    boxes: Groundtruth bounding box annotations. The box is represented in
      [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
      dimension [self._max_num_instances, 4].
    is_crowds: Groundtruth annotations to indicate if an annotation
      represents a group of instances by value {0, 1}. The tensor is padded
      with 0 to the fixed dimension [self._max_num_instances].
    areas: Groundtruth areas annotations. The tensor is padded with -1 to
      the fixed dimension [self._max_num_instances].
    classes: Groundtruth classes annotations. The tensor is padded with -1
      to the fixed dimension [self._max_num_instances].
  """
  with tf.name_scope('parser'):
    data = example_decoder.decode(value)
    data['groundtruth_is_crowd'] = tf.cond(
        tf.greater(tf.size(data['groundtruth_is_crowd']), 0),
        lambda: data['groundtruth_is_crowd'],
        lambda: tf.zeros_like(data['groundtruth_classes'], dtype=tf.bool))
    source_id = data['source_id']
    image = data['image']
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']
    classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
    areas = data['groundtruth_area']
    is_crowds = data['groundtruth_is_crowd']
    input_height = tf.shape(image)[0]
    input_width = tf.shape(image)[1]

    if params['skip_crowd_during_training'] and self._is_training:
      indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
      classes = tf.gather_nd(classes, indices)
      boxes = tf.gather_nd(boxes, indices)

    input_processor = DetectionInputProcessor(
        image, params['image_size'], boxes, classes)
    input_processor.normalize_image()
    if self._is_training and params['input_rand_hflip']:
      input_processor.random_horizontal_flip()
    if self._is_training:
      input_processor.set_training_random_scale_factors(
          params['train_scale_min'], params['train_scale_max'])
    else:
      input_processor.set_scale_factors_to_output_size()
    image = input_processor.resize_and_crop_image()
    boxes, classes = input_processor.resize_and_crop_boxes()

    # Assign anchors.
    (cls_targets, box_targets,
     num_positives) = anchor_labeler.label_anchors(boxes, classes)

    source_id = tf.where(
        tf.equal(source_id, tf.constant('')), '-1', source_id)
    source_id = tf.string_to_number(source_id)

    # Pad groundtruth data for evaluation.
    image_scale = input_processor.image_scale_to_original
    scaled_height = tf.to_float(input_height) * input_processor.image_scale
    scaled_width = tf.to_float(input_width) * input_processor.image_scale
    image_info = tf.stack([
        tf.cast(scaled_height, dtype=tf.float32),
        tf.cast(scaled_width, dtype=tf.float32),
        image_scale,
        tf.cast(input_height, dtype=tf.float32),
        tf.cast(input_width, dtype=tf.float32),
    ])
    boxes *= image_scale
    is_crowds = tf.cast(is_crowds, dtype=tf.float32)
    boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
    is_crowds = pad_to_fixed_size(
        is_crowds, 0, [self._max_num_instances, 1])
    areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
    classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
    if params['use_bfloat16']:
      image = tf.cast(image, dtype=tf.bfloat16)
    return (image, cls_targets, box_targets, num_positives, source_id,
            image_scale, image_info, boxes, is_crowds, areas, classes)
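# Standalone sketch of the empty-field guard used above (TF 1.x assumed):
# when groundtruth_is_crowd is missing, all-False flags matching the class
# count are substituted.
classes = tf.constant([1, 7, 3], dtype=tf.int64)
is_crowd = tf.constant([], dtype=tf.bool)
is_crowd = tf.cond(
    tf.greater(tf.size(is_crowd), 0),
    lambda: is_crowd,
    lambda: tf.zeros_like(classes, dtype=tf.bool))
with tf.Session() as sess:
  print(sess.run(is_crowd))  # [False False False]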
def convert_string_neighbors(string_neighbors):
  split = tf.string_split(string_neighbors, "")
  string_dense = tf.sparse_tensor_to_dense(split, default_value="0")
  num = tf.string_to_number(string_dense, out_type=tf.int32)
  bool_neigh = tf.cast(num, tf.bool)
  return bool_neigh
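# Usage sketch for convert_string_neighbors (TF 1.x assumed): each input
# string is a run of '0'/'1' characters, split character-by-character (empty
# delimiter) into a boolean adjacency row.
neighbors = convert_string_neighbors(tf.constant(["0101", "1100"]))
with tf.Session() as sess:
  print(sess.run(neighbors))
  # [[False  True False  True]
  #  [ True  True False False]]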
def _dataset_parser(value):
  """Parses data to a fixed-dimension input image and learning targets.

  Args:
    value: A dictionary containing an image and groundtruth annotations.

  Returns:
    features: a dictionary that contains the image and auxiliary
      information. The following describes {key: value} pairs in the
      dictionary.
      image: Image tensor that is preprocessed to have normalized value and
        fixed dimension [image_size, image_size, 3]
      image_info: image information that includes the original height and
        width, the scale of the processed image to the original image, and
        the scaled height and width.
      source_ids: Source image id. Default value -1 if the source id is
        empty in the groundtruth annotation.
    labels: a dictionary that contains auxiliary information plus (optional)
      labels. The following describes {key: value} pairs in the dictionary.
      `labels` is only for training.
      score_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors
        with shape [height_l, width_l, num_anchors]. The height_l and
        width_l represent the dimension of objectness score at l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors
        with shape [height_l, width_l, num_anchors * 4]. The height_l and
        width_l represent the dimension of bounding box regression output
        at l-th level.
      gt_boxes: Groundtruth bounding box annotations. The box is represented
        in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
        fixed dimension [self._max_num_instances, 4].
      gt_classes: Groundtruth classes annotations. The tensor is padded with
        -1 to the fixed dimension [self._max_num_instances].
      cropped_gt_masks: groundtruth masks cropped by the bounding box and
        resized to a fixed size determined by params['gt_mask_size']
  """
  with tf.name_scope('parser'):
    data = example_decoder.decode(value)
    data['groundtruth_is_crowd'] = tf.cond(
        tf.greater(tf.size(data['groundtruth_is_crowd']), 0),
        lambda: data['groundtruth_is_crowd'],
        lambda: tf.zeros_like(data['groundtruth_classes'], dtype=tf.bool))
    image = data['image']
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    orig_image = image
    source_id = data['source_id']
    source_id = tf.where(
        tf.equal(source_id, tf.constant('')), '-1', source_id)
    source_id = tf.string_to_number(source_id)

    if (self._mode == tf.estimator.ModeKeys.PREDICT or
        self._mode == tf.estimator.ModeKeys.EVAL):
      image = preprocess_ops.normalize_image(image)
      if params['resize_method'] == 'retinanet':
        image, image_info, _, _, _ = preprocess_ops.resize_crop_pad(
            image, params['image_size'], 2**params['max_level'])
      else:
        image, image_info, _, _, _ = preprocess_ops.resize_crop_pad_v2(
            image, params['short_side'], params['long_side'],
            2**params['max_level'])
      if params['precision'] == 'bfloat16':
        image = tf.cast(image, dtype=tf.bfloat16)

      features = {
          'images': image,
          'image_info': image_info,
          'source_ids': source_id,
      }
      if params['visualize_images_summary']:
        resized_image = tf.image.resize_images(
            orig_image, params['image_size'])
        features['orig_images'] = resized_image
      if (params['include_groundtruth_in_features'] or
          self._mode == tf.estimator.ModeKeys.EVAL):
        labels = _prepare_labels_for_eval(
            data,
            target_num_instances=self._max_num_instances,
            target_polygon_list_len=self._max_num_polygon_list_len,
            use_instance_mask=params['include_mask'])
        return {'features': features, 'labels': labels}
      else:
        return {'features': features}

    elif self._mode == tf.estimator.ModeKeys.TRAIN:
      instance_masks = None
      if self._use_instance_mask:
        instance_masks = data['groundtruth_instance_masks']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
      if not params['use_category']:
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

      if (params['skip_crowd_during_training'] and
          self._mode == tf.estimator.ModeKeys.TRAIN):
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)
        if self._use_instance_mask:
          instance_masks = tf.gather_nd(instance_masks, indices)

      image = preprocess_ops.normalize_image(image)
      if params['input_rand_hflip']:
        flipped_results = preprocess_ops.random_horizontal_flip(
            image, boxes=boxes, masks=instance_masks)
        if self._use_instance_mask:
          image, boxes, instance_masks = flipped_results
        else:
          image, boxes = flipped_results

      # Scaling, jittering and padding.
      if params['resize_method'] == 'retinanet':
        image, image_info, boxes, classes, cropped_gt_masks = (
            preprocess_ops.resize_crop_pad(
                image,
                params['image_size'],
                2**params['max_level'],
                aug_scale_min=params['aug_scale_min'],
                aug_scale_max=params['aug_scale_max'],
                boxes=boxes,
                classes=classes,
                masks=instance_masks,
                crop_mask_size=params['gt_mask_size']))
      else:
        image, image_info, boxes, classes, cropped_gt_masks = (
            preprocess_ops.resize_crop_pad_v2(
                image,
                params['short_side'],
                params['long_side'],
                2**params['max_level'],
                aug_scale_min=params['aug_scale_min'],
                aug_scale_max=params['aug_scale_max'],
                boxes=boxes,
                classes=classes,
                masks=instance_masks,
                crop_mask_size=params['gt_mask_size']))
      if cropped_gt_masks is not None:
        cropped_gt_masks = tf.pad(
            cropped_gt_masks,
            paddings=tf.constant([[0, 0], [2, 2], [2, 2]]),
            mode='CONSTANT',
            constant_values=0.)

      padded_height, padded_width, _ = image.get_shape().as_list()
      padded_image_size = (padded_height, padded_width)
      input_anchors = anchors.Anchors(
          params['min_level'], params['max_level'], params['num_scales'],
          params['aspect_ratios'], params['anchor_scale'],
          padded_image_size)
      anchor_labeler = anchors.AnchorLabeler(
          input_anchors, params['num_classes'],
          params['rpn_positive_overlap'], params['rpn_negative_overlap'],
          params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

      # Assign anchors.
      score_targets, box_targets = anchor_labeler.label_anchors(
          boxes, classes)

      # Pad groundtruth data.
      boxes = preprocess_ops.pad_to_fixed_size(
          boxes, -1, [self._max_num_instances, 4])
      classes = preprocess_ops.pad_to_fixed_size(
          classes, -1, [self._max_num_instances, 1])

      # Pads cropped_gt_masks.
      if self._use_instance_mask:
        cropped_gt_masks = tf.reshape(
            cropped_gt_masks,
            tf.stack([tf.shape(cropped_gt_masks)[0], -1]))
        cropped_gt_masks = preprocess_ops.pad_to_fixed_size(
            cropped_gt_masks, -1,
            [self._max_num_instances, (params['gt_mask_size'] + 4)**2])
        cropped_gt_masks = tf.reshape(
            cropped_gt_masks,
            [self._max_num_instances, params['gt_mask_size'] + 4,
             params['gt_mask_size'] + 4])

      if params['precision'] == 'bfloat16':
        image = tf.cast(image, dtype=tf.bfloat16)

      features = {
          'images': image,
          'image_info': image_info,
          'source_ids': source_id,
      }
      labels = {}
      for level in range(params['min_level'], params['max_level'] + 1):
        labels['score_targets_%d' % level] = score_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]
      labels['gt_boxes'] = boxes
      labels['gt_classes'] = classes
      if self._use_instance_mask:
        labels['cropped_gt_masks'] = cropped_gt_masks
      return features, labels
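# Standalone sketch of the 2-pixel mask padding used above (TF 1.x assumed):
# a batch of [N, S, S] cropped masks becomes [N, S+4, S+4], matching the
# gt_mask_size + 4 dimensions in the training branch.
masks = tf.ones([3, 28, 28], dtype=tf.float32)
padded = tf.pad(masks, paddings=tf.constant([[0, 0], [2, 2], [2, 2]]),
                mode='CONSTANT', constant_values=0.)
with tf.Session() as sess:
  print(sess.run(tf.shape(padded)))  # [ 3 32 32]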
def decode_func(value):
  return [tf.string_to_number(value, out_type=tf.int32)]
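# Minimal sketch (TF 1.x assumed): decode_func wraps a numeric string scalar
# into a one-element list of int32, e.g. for use as a tf.data map function.
decoded = decode_func(tf.constant("42"))
with tf.Session() as sess:
  print(sess.run(decoded))  # [42]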
def _parse_example(self, data):
  """Example parser."""
  with tf.name_scope('augmentation'):
    source_id = data['source_id']
    image = data['image']  # dtype uint8
    raw_shape = tf.shape(image)
    boxes = data['groundtruth_boxes']
    classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

    # Only 80 of the 90 COCO classes are used.
    class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
    classes = tf.gather(class_map, classes)
    classes = tf.cast(classes, dtype=tf.float32)

    if self._is_training:
      image, boxes, classes = ssd_crop(image, boxes, classes)
      # ssd_crop resizes and returns an image of dtype float32 without
      # changing its range (i.e., values between 0 and 255). Dividing by
      # 255 converts it to the [0, 1] range. This is not done before
      # cropping to avoid a dtype cast (which incurs an additional memory
      # copy).
      image /= 255.0

      # random_horizontal_flip() is hard coded to flip with 50% chance.
      image, boxes = preprocessor.random_horizontal_flip(
          image=image, boxes=boxes)

      # TODO(shibow): Investigate the parameters for color jitter.
      image = color_jitter(
          image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)

      if self._params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)

      encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
          boxes, classes, self._params['use_spatial_partitioning'])

      labels = {
          ssd_constants.NUM_MATCHED_BOXES:
              tf.reshape(num_matched_boxes, [1, -1, 1, 1]),
          ssd_constants.BOXES: encoded_boxes,
          ssd_constants.CLASSES: encoded_classes,
      }
      return image, labels

    else:
      image = tf.image.resize_images(
          image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
      # resize_images returns an image of dtype float32 without changing
      # its range. Divide by 255 to convert the image to the [0, 1] range.
      image /= 255.

      if self._params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)

      def trim_and_pad(inp_tensor, dim_1):
        """Limit the number of boxes, and pad if necessary."""
        inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
        num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
        inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
        return tf.reshape(
            inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

      boxes, classes = trim_and_pad(boxes, 4), trim_and_pad(classes, 1)

      labels = {
          ssd_constants.BOXES: boxes,
          ssd_constants.CLASSES: classes,
          ssd_constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
          ssd_constants.RAW_SHAPE: raw_shape,
      }

      if (not self._is_training and
          self._count > self._params['eval_samples']):
        labels[ssd_constants.IS_PADDED] = data[ssd_constants.IS_PADDED]
      return image, labels
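# Standalone sketch of the trim_and_pad helper above (TF 1.x assumed), with a
# small illustrative cap in place of ssd_constants.MAX_NUM_EVAL_BOXES: two
# boxes are padded up to a fixed four rows.
MAX_BOXES = 4
boxes = tf.ones([2, 4])
num_pad = MAX_BOXES - tf.shape(boxes)[0]
padded = tf.reshape(
    tf.pad(boxes[:MAX_BOXES], [[0, num_pad], [0, 0]]), [MAX_BOXES, 4])
with tf.Session() as sess:
  print(sess.run(tf.shape(padded)))  # [4 4]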
def label_string_to_tensor(x, batch_size, num_outputs=-1):
  """Converts a batch of space-separated number strings to a dense tensor."""
  sparse = tf.string_split(x, sep=' ')
  values = tf.string_to_number(sparse.values)
  dense = tf.reshape(values, [batch_size, num_outputs])
  return dense
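# Usage sketch for this variant (TF 1.x assumed): the default num_outputs=-1
# makes tf.reshape infer the per-example label count, collapsing the two
# branches of the earlier variant above into a single reshape.
labels = tf.constant(["0 1", "1 1"])
dense = label_string_to_tensor(labels, batch_size=2)
with tf.Session() as sess:
  print(sess.run(dense))  # [[0. 1.]
                          #  [1. 1.]]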
def _parse_example(data):
  with tf.name_scope('augmentation'):
    source_id = data['source_id']
    image = data['image']  # dtype uint8
    raw_shape = tf.shape(image)
    boxes = data['groundtruth_boxes']
    classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

    # Only 80 of the 90 COCO classes are used.
    class_map = tf.convert_to_tensor(constants.CLASS_MAP)
    classes = tf.gather(class_map, classes)
    classes = tf.cast(classes, dtype=tf.float32)

    if self._is_training:
      image, boxes, classes = ssd_crop(image, boxes, classes)
      # ssd_crop resizes and returns an image of dtype float32 without
      # changing its range (i.e., values between 0 and 255). Dividing by
      # 255 converts it to the [0, 1] range. This is not done before
      # cropping to avoid a dtype cast (which incurs an additional memory
      # copy).
      image /= 255.0

      # random_horizontal_flip() is hard coded to flip with 50% chance.
      image, boxes = preprocessor.random_horizontal_flip(
          image=image, boxes=boxes)

      # TODO(shibow): Investigate the parameters for color jitter.
      image = color_jitter(
          image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)

      if params['dtype'] == 'bf16':
        image = tf.cast(image, dtype=tf.bfloat16)

      encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
          boxes, classes)

      # We transpose in the dataloader instead of in the topology to save
      # time.
      encoded_classes, encoded_boxes = transpose_labels(
          encoded_classes, encoded_boxes)
      encoded_classes = tf.cast(encoded_classes, tf.int32)

      labels = {
          constants.NUM_MATCHED_BOXES: num_matched_boxes,
          constants.BOXES: encoded_boxes,
          constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
      }

      # This is for dataloader visualization; the actual model doesn't use
      # this.
      if params['visualize_dataloader']:
        box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=constants.BOX_CODER_SCALES)
        decoded_boxes = tf.expand_dims(
            box_coder.decode(
                rel_codes=tf.squeeze(encoded_boxes),
                anchors=box_list.BoxList(
                    tf.convert_to_tensor(DefaultBoxes()('ltrb')))).get(),
            axis=0)
        labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

      return image, labels

    else:
      image = tf.image.resize_images(
          image, size=(constants.IMAGE_SIZE, constants.IMAGE_SIZE))
      # resize_images returns an image of dtype float32 without changing
      # its range. Divide by 255 to convert the image to the [0, 1] range.
      image /= 255.

      if params['dtype'] == 'bf16':
        image = tf.cast(image, dtype=tf.bfloat16)

      def trim_and_pad(inp_tensor, dim_1):
        """Limit the number of boxes, and pad if necessary."""
        inp_tensor = inp_tensor[:constants.MAX_NUM_EVAL_BOXES]
        num_pad = constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
        inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
        return tf.reshape(
            inp_tensor, [constants.MAX_NUM_EVAL_BOXES, dim_1])

      boxes, classes = trim_and_pad(boxes, 4), trim_and_pad(classes, 1)

      sample = {
          constants.IMAGE: image,
          constants.BOXES: boxes,
          constants.CLASSES: classes,
          constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
          constants.RAW_SHAPE: raw_shape,
      }

      if not self._is_training and self._count > params['eval_samples']:
        sample[constants.IS_PADDED] = data[constants.IS_PADDED]
      return sample