def focal_loss(logits, targets, alpha, gamma, normalizer):
  """Compute the focal loss between `logits` and the golden `target` values.

  Focal loss = -(1 - pt)^gamma * log(pt), where pt is the probability of being
  classified to the true class. The per-element loss is additionally weighted
  by `alpha` for positive examples and by `(1 - alpha)` for negative examples.

  Args:
    logits: A float32 tensor of size
      [batch, height_in, width_in, num_predictions].
    targets: A float32 tensor of size
      [batch, height_in, width_in, num_predictions]. Elements equal to 1.0 are
      treated as positive examples, everything else as negative.
    alpha: A float32 scalar multiplying alpha to the loss from positive
      examples and (1-alpha) to the loss from negative examples.
    gamma: A float32 scalar modulating loss from hard and easy examples.
    normalizer: A float32 scalar normalizes the total loss from all examples.

  Returns:
    loss: A float32 scalar representing normalized total loss.
  """
  with tf.name_scope('focal_loss'):
    positive_label_mask = tf.equal(targets, 1.0)
    cross_entropy = (
        tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits))
    # The modulator (1 - pt)^gamma is evaluated in log space. Computing it as
    # tf.pow(1 - pt, gamma) has an unbounded derivative at 1 - pt == 0 when
    # gamma < 1, which turns into NaN during back-propagation.
    #
    # With x = logits:
    #   positive example: 1 - pt = 1 - sigmoid(x) = sigmoid(-x)
    #                     log(1 - pt) = -softplus(x)
    #   negative example: 1 - pt = sigmoid(x)
    #                     log(1 - pt) = -softplus(-x)
    # so (1 - pt)^gamma = exp(-gamma * softplus(+x or -x)), which has bounded
    # values and bounded gradients for every finite logit.
    signed_logits = tf.where(positive_label_mask, logits, -logits)
    modulator = tf.exp(-gamma * tf.nn.softplus(signed_logits))
    loss = modulator * cross_entropy
    weighted_loss = tf.where(positive_label_mask, alpha * loss,
                             (1.0 - alpha) * loss)
    total_loss = tf.reduce_sum(weighted_loss)
    total_loss /= normalizer
  return total_loss
def clip_boxes(self, boxes):
  """Clamp box coordinates to the range [0, bound].

  The per-coordinate upper bound is built from `self._output_size` minus one.
  When `self._scaled_height` is smaller than `self._scaled_width` the bound is
  ordered (size[0], size[1], size[0], size[1]); otherwise the two entries are
  swapped.

  Args:
    boxes: Box coordinates, four values per box.
      # assumes shape [N, 4] in [y1, x1, y2, x2] order - TODO confirm

  Returns:
    A tensor shaped like `boxes` with every coordinate clamped.
  """
  # Lower clamp: negative coordinates become zero.
  below_zero = tf.less(boxes, 0)
  boxes = tf.where(below_zero, tf.zeros_like(boxes), boxes)

  # Upper clamp: pick the bound ordering that matches the image orientation.
  height_is_short = tf.less(self._scaled_height, self._scaled_width)
  bound_when_height_short = tf.convert_to_tensor(
      [self._output_size[0] - 1, self._output_size[1] - 1] * 2,
      dtype=tf.float32)
  bound_otherwise = tf.convert_to_tensor(
      [self._output_size[1] - 1, self._output_size[0] - 1] * 2,
      dtype=tf.float32)
  bound = tf.where(height_is_short, bound_when_height_short, bound_otherwise)

  above_bound = tf.greater(boxes, bound)
  return tf.where(above_bound, bound * tf.ones_like(boxes), boxes)
def _learning_rate_schedule(base_learning_rate, lr_warmup_step, lr_drop_step,
                            global_step):
  """Build the learning rate with gradual warmup followed by one LR drop.

  The rate is `base_learning_rate` times a step-dependent multiplier:
    * for each step s in [0, lr_warmup_step) the multiplier is
      1/3 + (s / lr_warmup_step) * 2/3,
    * from `lr_warmup_step` on it is 1.0,
    * from `lr_drop_step` on it is 0.1.

  Args:
    base_learning_rate: Learning rate the multipliers are applied to.
    lr_warmup_step: Integer number of warmup steps.
    lr_drop_step: Step from which the 0.1 multiplier applies.
    global_step: Current training step, compared against each boundary.

  Returns:
    The learning rate selected for `global_step`.
  """
  # Each entry is (multiplier, first step at which the multiplier applies).
  boundaries = []
  for warmup_step in range(lr_warmup_step):
    progress = float(warmup_step) / lr_warmup_step
    boundaries.append((1.0 / 3.0 + progress * 2.0 / 3.0, warmup_step))
  boundaries.append((1.0, lr_warmup_step))
  boundaries.append((0.1, lr_drop_step))

  # Walk the boundaries in order; later boundaries override earlier ones as
  # soon as `global_step` has reached them.
  current_rate = base_learning_rate
  for multiplier, first_step in boundaries:
    not_reached_yet = global_step < first_step
    current_rate = tf.where(not_reached_yet, current_rate,
                            base_learning_rate * multiplier)
  return current_rate
def resize_and_crop_boxes(self):
  """Scale the boxes, clip them, and drop rows that collapsed to zero.

  `self._boxes` is scaled by `self._scaled_height` / `self._scaled_width`,
  clipped with `self.clip_boxes`, and every box whose coordinates sum to zero
  is removed. The same row selection is applied to `self._classes` and, as a
  side effect, written back to `self._masks`.

  Returns:
    A `(boxes, classes)` tuple containing only the retained rows.
  """
  scaled_boxlist = preprocessor.box_list_scale(
      preprocessor.box_list.BoxList(self._boxes),
      self._scaled_height, self._scaled_width)
  clipped_boxes = self.clip_boxes(scaled_boxlist.get())

  # Rows whose coordinate sum is non-zero are kept.
  coordinate_sums = tf.reduce_sum(clipped_boxes, axis=1)
  keep_indices = tf.where(tf.not_equal(coordinate_sums, 0))

  kept_boxes = tf.gather_nd(clipped_boxes, keep_indices)
  kept_classes = tf.gather_nd(self._classes, keep_indices)
  # Masks are filtered in place so that they stay aligned with the boxes.
  self._masks = tf.gather_nd(self._masks, keep_indices)
  return kept_boxes, kept_classes
def resize_and_crop_boxes(self):
  """Resize boxes and crop them to the output dimension.

  `self._boxes` is scaled by `self._scaled_height` / `self._scaled_width`,
  shifted by the crop offset (`self._crop_offset_y`, `self._crop_offset_x`),
  clipped with `self.clip_boxes`, and every box whose coordinates sum to zero
  is removed together with its entry in `self._classes`.

  Returns:
    A `(boxes, classes)` tuple containing only the retained rows.
  """
  boxlist = preprocessor.box_list.BoxList(self._boxes)
  boxes = preprocessor.box_list_scale(
      boxlist, self._scaled_height, self._scaled_width).get()
  # Adjust box coordinates based on the offset.
  box_offset = tf.stack([self._crop_offset_y, self._crop_offset_x,
                         self._crop_offset_y, self._crop_offset_x])
  # tf.cast replaces the deprecated tf.to_float; both produce a float32 cast.
  boxes -= tf.cast(tf.reshape(box_offset, [1, 4]), dtype=tf.float32)
  # Clip the boxes.
  boxes = self.clip_boxes(boxes)
  # Filter out ground truth boxes that are all zeros.
  indices = tf.where(tf.not_equal(tf.reduce_sum(boxes, axis=1), 0))
  boxes = tf.gather_nd(boxes, indices)
  classes = tf.gather_nd(self._classes, indices)
  return boxes, classes
def _dataset_parser(value):
  """Parse data to a fixed dimension input image and learning targets.

  In PREDICT mode only the `features` dictionary is returned. In every other
  mode the function returns the tuple `(features, labels)`.

  Args:
    value: A dictionary contains an image and groundtruth annotations.

  Returns:
    features: A dictionary that contains the image and auxiliary
      information. The following describes {key: value} pairs in the
      dictionary.
      images: An image tensor that is preprocessed to have normalized value
        and fixed dimension [image_size, image_size, 3]
      image_info: Image information that includes the original height and
        width, the scale of the processed image to the original image, and
        the scaled height and width.
      source_ids: Source image id. Default value -1 if the source id is
        empty in the groundtruth annotation.
    labels: (only for training) A dictionary that contains groundtruth
      labels. The following describes {key: value} pairs in the dictionary.
      score_targets_<level>: One entry per level in
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors]. The height_l and width_l
        represent the dimension of objectiveness score at l-th level.
      box_targets_<level>: One entry per level in
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors * 4]. The height_l and
        width_l represent the dimension of bounding box regression output at
        l-th level.
      gt_boxes: Groundtruth bounding box annotations. The box is represented
        in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
        fixed dimension [self._max_num_instances, 4].
      gt_classes: Groundtruth classes annotations. The tensor is padded
        with -1 to the fixed dimension [self._max_num_instances].
      cropped_gt_masks: Groundtruth masks cropped by the bounding box and
        resized to a fixed size determined by params['gt_mask_size']
  """
  with tf.name_scope('parser'):
    data = example_decoder.decode(value)
    image = data['image']
    source_id = data['source_id']
    # An empty source id string is replaced by '-1' before the numeric parse.
    source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                         source_id)
    source_id = tf.string_to_number(source_id)

    if self._mode == tf.estimator.ModeKeys.PREDICT:
      # Prediction path: no boxes, classes, or masks are given to the
      # processor, and only the features dictionary is returned.
      input_processor = InstanceSegmentationInputProcessor(
          image, image_size, params['short_side_image_size'],
          params['long_side_max_image_size'])
      input_processor.normalize_image()
      input_processor.set_scale_factors_to_mlperf_reference_size()
      image = input_processor.resize_and_crop_image()
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)

      image_info = input_processor.get_image_info()
      return {
          'images': image,
          'image_info': image_info,
          'source_ids': source_id
      }

    # The following part is for training.
    instance_masks = data['groundtruth_instance_masks']
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']
    classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
    if not params['use_category']:
      # Category-agnostic mode: every class id greater than 0 becomes 1.0.
      classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

    if (params['skip_crowd_during_training'] and
        self._mode == tf.estimator.ModeKeys.TRAIN):
      # Keep only the annotations that are not flagged as crowd.
      indices = tf.where(
          tf.logical_not(data['groundtruth_is_crowd']))
      classes = tf.gather_nd(classes, indices)
      boxes = tf.gather_nd(boxes, indices)
      instance_masks = tf.gather_nd(instance_masks, indices)

    # NOTE(review): the processor is stateful, so the order of the calls
    # below (normalize, optional flip, scale factors, resize image, resize
    # boxes, crop masks) presumably matters - confirm before reordering.
    input_processor = InstanceSegmentationInputProcessor(
        image, image_size, params['short_side_image_size'],
        params['long_side_max_image_size'], boxes, classes,
        instance_masks)
    input_processor.normalize_image()
    if params['input_rand_hflip']:
      input_processor.random_horizontal_flip()

    input_processor.set_scale_factors_to_mlperf_reference_size()
    image = input_processor.resize_and_crop_image()

    boxes, classes = input_processor.resize_and_crop_boxes()
    cropped_gt_masks = input_processor.crop_gt_masks(
        params['gt_mask_size'])

    image_info = input_processor.get_image_info()
    # Assign anchors.
    # Entries 3 and 4 of image_info are compared to choose the anchor labeler.
    # NOTE(review): presumably the scaled height and width - confirm.
    is_height_short_side = tf.less(image_info[3], image_info[4])
    score_targets, box_targets = tf.cond(
        is_height_short_side,
        lambda: anchor_labeler.label_anchors(boxes, classes),
        lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes))  # pylint: disable=line-too-long

    # Pad groundtruth data.
    # Boxes are multiplied by image_info[2] before padding.
    # NOTE(review): presumably the scale back to the original image - confirm.
    boxes *= image_info[2]
    boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
    classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
    # Pads cropped_gt_masks.
    # Masks are flattened to rows of (gt_mask_size + 4)**2 values, padded, and
    # then reshaped back to square masks with side gt_mask_size + 4.
    cropped_gt_masks = tf.reshape(
        cropped_gt_masks, [-1, (params['gt_mask_size'] + 4)**2])
    cropped_gt_masks = pad_to_fixed_size(
        cropped_gt_masks, -1,
        [self._max_num_instances, (params['gt_mask_size'] + 4)**2])
    cropped_gt_masks = tf.reshape(cropped_gt_masks, [
        self._max_num_instances, params['gt_mask_size'] + 4,
        params['gt_mask_size'] + 4
    ])
    if params['use_bfloat16']:
      image = tf.cast(image, dtype=tf.bfloat16)

    features = {}
    features['images'] = image
    features['image_info'] = image_info
    features['source_ids'] = source_id
    labels = {}
    # Anchor targets are stored under one key per pyramid level.
    for level in range(params['min_level'], params['max_level'] + 1):
      labels['score_targets_%d' % level] = score_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    labels['gt_boxes'] = boxes
    labels['gt_classes'] = classes
    labels['cropped_gt_masks'] = cropped_gt_masks
    return features, labels
def _dataset_parser(value):
  """Parse data to a fixed dimension input image and learning targets.

  Args:
    value: A dictionary contains an image and groundtruth annotations.

  Returns:
    image: Image tensor that is preprocessed to have normalized value and
      fixed dimension [image_size, image_size, 3]
    cls_targets_dict: ordered dictionary with keys
      [min_level, min_level+1, ..., max_level]. The values are tensor with
      shape [height_l, width_l, num_anchors]. The height_l and width_l
      represent the dimension of class logits at l-th level.
    box_targets_dict: ordered dictionary with keys
      [min_level, min_level+1, ..., max_level]. The values are tensor with
      shape [height_l, width_l, num_anchors * 4]. The height_l and
      width_l represent the dimension of bounding box regression output at
      l-th level.
    num_positives: Number of positive anchors in the image.
    source_id: Source image id. Default value -1 if the source id is empty
      in the groundtruth annotation.
    image_scale: Scale of the processed image to the original image.
    boxes: Groundtruth bounding box annotations. The box is represented in
      [y1, x1, y2, x2] format. The tensor is padded with -1 using the target
      shape [self._max_num_instances, 4].
    is_crowds: Groundtruth annotations to indicate if an annotation
      represents a group of instances by value {0, 1}. The tensor is
      padded with 0 using the target shape [self._max_num_instances, 1].
    areas: Groundtruth areas annotations. The tensor is padded with -1
      using the target shape [self._max_num_instances, 1].
    classes: Groundtruth classes annotations. The tensor is padded with -1
      using the target shape [self._max_num_instances, 1].
  """
  with tf.name_scope('parser'):
    data = example_decoder.decode(value)
    source_id = data['source_id']
    image = data['image']
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']
    # Classes become a float32 column vector. This conversion is done once;
    # a second identical reshape/cast was redundant and has been removed.
    classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
    areas = data['groundtruth_area']
    is_crowds = data['groundtruth_is_crowd']

    if params['skip_crowd_during_training'] and self._is_training:
      # Keep only the annotations that are not flagged as crowd.
      indices = tf.where(
          tf.logical_not(data['groundtruth_is_crowd']))
      classes = tf.gather_nd(classes, indices)
      boxes = tf.gather_nd(boxes, indices)

    # NOTE: The autoaugment method works best when used alongside the
    # standard horizontal flipping of images along with size jittering
    # and normalization.
    if params.get('autoaugment_policy', None) and self._is_training:
      image, boxes = autoaugment.distort_image_with_autoaugment(
          image, boxes, params['autoaugment_policy'])

    input_processor = DetectionInputProcessor(
        image, params['image_size'], boxes, classes)
    input_processor.normalize_image()
    if self._is_training and params['input_rand_hflip']:
      input_processor.random_horizontal_flip()
    # Training uses random scale jittering; evaluation uses the fixed scale.
    if self._is_training:
      input_processor.set_training_random_scale_factors(
          params['train_scale_min'], params['train_scale_max'])
    else:
      input_processor.set_scale_factors_to_output_size()
    image = input_processor.resize_and_crop_image()
    boxes, classes = input_processor.resize_and_crop_boxes()

    # Assign anchors.
    (cls_targets, box_targets,
     num_positives) = anchor_labeler.label_anchors(boxes, classes)

    # An empty source id string is replaced by '-1' before the numeric parse.
    source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                         source_id)
    source_id = tf.string_to_number(source_id)

    # Pad groundtruth data for evaluation.
    image_scale = input_processor.image_scale_to_original
    boxes *= image_scale
    is_crowds = tf.cast(is_crowds, dtype=tf.float32)
    boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
    is_crowds = pad_to_fixed_size(is_crowds, 0,
                                  [self._max_num_instances, 1])
    areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
    classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
    if params['use_bfloat16']:
      image = tf.cast(image, dtype=tf.bfloat16)
    return (image, cls_targets, box_targets, num_positives, source_id,
            image_scale, boxes, is_crowds, areas, classes)
def clip_boxes(self, boxes):
  """Clamp box coordinates to the range [0, self._output_size - 1].

  Args:
    boxes: Tensor of box coordinates.

  Returns:
    A tensor shaped like `boxes` in which negative values are replaced by 0
    and values above `self._output_size - 1` are replaced by that limit.
  """
  # Lower clamp.
  non_negative = tf.where(tf.less(boxes, 0), tf.zeros_like(boxes), boxes)
  # Upper clamp.
  exceeds_limit = tf.greater(non_negative, self._output_size - 1)
  limit_values = (self._output_size - 1) * tf.ones_like(non_negative)
  return tf.where(exceeds_limit, limit_values, non_negative)
def _local_perm(inputs, targets, is_masked, perm_size, seq_len): """ Sample a permutation of the factorization order, and create an attention mask accordingly. Args: inputs: int64 Tensor in shape [seq_len], input ids. targets: int64 Tensor in shape [seq_len], target ids. is_masked: bool Tensor in shape [seq_len]. True means being selected for partial prediction. perm_size: the length of longest permutation. Could be set to be reuse_len. Should not be larger than reuse_len or there will be data leaks. seq_len: int, sequence length. """ # Generate permutation indices index = tf.range(seq_len, dtype=tf.int64) index = tf.transpose(tf.reshape(index, [-1, perm_size])) index = tf.random_shuffle(index) index = tf.reshape(tf.transpose(index), [-1]) # `perm_mask` and `target_mask` # non-functional tokens non_func_tokens = tf.logical_not(tf.logical_or( tf.equal(inputs, SEP_ID), tf.equal(inputs, CLS_ID))) non_mask_tokens = tf.logical_and(tf.logical_not(is_masked), non_func_tokens) masked_or_func_tokens = tf.logical_not(non_mask_tokens) # Set the permutation indices of non-masked (& non-funcional) tokens to the # smallest index (-1): # (1) they can be seen by all other positions # (2) they cannot see masked positions, so there won"t be information leak smallest_index = -tf.ones([seq_len], dtype=tf.int64) rev_index = tf.where(non_mask_tokens, smallest_index, index) # Create `target_mask`: non-funcional and maksed tokens # 1: use mask as input and have loss # 0: use token (or [SEP], [CLS]) as input and do not have loss target_tokens = tf.logical_and(masked_or_func_tokens, non_func_tokens) target_mask = tf.cast(target_tokens, tf.float32) # Create `perm_mask` # `target_tokens` cannot see themselves self_rev_index = tf.where(target_tokens, rev_index, rev_index + 1) # 1: cannot attend if i <= j and j is not non-masked (masked_or_func_tokens) # 0: can attend if i > j or j is non-masked perm_mask = tf.logical_and( self_rev_index[:, None] <= rev_index[None, :], masked_or_func_tokens) 
perm_mask = tf.cast(perm_mask, tf.float32) # new target: [next token] for LM and [curr token] (self) for PLM new_targets = tf.concat([inputs[0: 1], targets[: -1]], axis=0) # construct inputs_k inputs_k = inputs # construct inputs_q inputs_q = target_mask return perm_mask, new_targets, target_mask, inputs_k, inputs_q