def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor], box_outputs: Dict[int, tf.Tensor], scales: List[float], min_score_thresh, max_boxes_to_draw): """Post-processes the box/class predictions. Args: params: a parameter dictionary that includes `min_level`, `max_level`, `batch_size`, and `num_classes`. cls_outputs: an OrderedDict with keys representing levels and values representing logits in [batch_size, height, width, num_anchors]. box_outputs: an OrderedDict with keys representing levels and values representing box regression targets in [batch_size, height, width, num_anchors * 4]. scales: a list of float values indicating image scale. min_score_thresh: A float representing the threshold for deciding when to remove boxes based on score. max_boxes_to_draw: Max number of boxes to draw. Returns: detections_batch: a batch of detection results. Each detection is a tensor with each row representing [image_id, x, y, width, height, score, class]. """ # TODO(tanmingxing): refactor the code to make it more explicit. outputs = { 'cls_outputs_all': [None], 'box_outputs_all': [None], 'indices_all': [None], 'classes_all': [None] } det_model_fn.add_metric_fn_inputs( params, cls_outputs, box_outputs, outputs, -1) # Create anchor_label for picking top-k predictions. eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) # Add all detections for each input image. detections_batch = [] for index in range(params['batch_size']): cls_outputs_per_sample = outputs['cls_outputs_all'][index] box_outputs_per_sample = outputs['box_outputs_all'][index] indices_per_sample = outputs['indices_all'][index] classes_per_sample = outputs['classes_all'][index] detections = anchor_labeler.generate_detections( cls_outputs_per_sample, box_outputs_per_sample, indices_per_sample, classes_per_sample, image_id=[index], image_scale=[scales[index]], min_score_thresh=min_score_thresh, max_boxes_to_draw=max_boxes_to_draw, disable_pyfun=params.get('disable_pyfun')) detections_batch.append(detections) return tf.stack(detections_batch, name='detections')
def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" batch_size = params['batch_size'] eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) if params.get('testdev_dir', None): logging.info('Eval testdev_dir %s', params['testdev_dir']) coco_metrics = coco_metric_fn( batch_size, anchor_labeler, params['val_json_file'], testdev_dir=params['testdev_dir'], disable_pyfun=params.get('disable_pyfun', None), **kwargs) else: logging.info('Eval val with groundtruths %s.', params['val_json_file']) coco_metrics = coco_metric_fn(batch_size, anchor_labeler, params['val_json_file'], **kwargs) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics
def pre_nms(params, cls_outputs, box_outputs) -> Tuple[T, T, T]: """Detection post-processing before NMS. It takes the multi-level class and box predictions from the network, merges them into unified tensors, and computes boxes, scores, and classes. Args: params: a dict of parameters. cls_outputs: a list of tensors for classes, each tensor denotes a level of logits with shape [N, H, W, num_class * num_anchors]. box_outputs: a list of tensors for boxes, each tensor denotes a level of boxes with shape [N, H, W, 4 * num_anchors]. Returns: A tuple of (boxes, scores, classes). """ cls_outputs, box_outputs = merge_class_box_level_outputs( params, cls_outputs, box_outputs) cls_outputs, box_outputs, classes, indices = topk_class_boxes( params, cls_outputs, box_outputs) # Get boxes by applying bounding box regression to anchors. eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_boxes = tf.gather(eval_anchors.boxes, indices) boxes = anchors.decode_box_outputs_tf(box_outputs, anchor_boxes) # Convert logits to scores. scores = tf.math.sigmoid(cls_outputs) return boxes, scores, classes
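# --- Illustrative sketch (not from the repo): what topk_class_boxes above is
# expected to do. Written TF2-style for brevity; `k` and the toy shapes are
# assumptions. Scores of all (box, class) pairs are flattened, the global
# top-k are kept, and box indices / class ids are recovered with div/mod.
import tensorflow as tf

def topk_class_boxes_sketch(cls_outputs, box_outputs, k=100):
    """cls_outputs: [N, num_boxes, num_classes]; box_outputs: [N, num_boxes, 4]."""
    num_classes = cls_outputs.shape[-1]
    scores_flat = tf.reshape(cls_outputs, [tf.shape(cls_outputs)[0], -1])
    top_scores, topk_idx = tf.math.top_k(scores_flat, k=k)
    indices = topk_idx // num_classes  # anchor index of each kept score
    classes = topk_idx % num_classes   # class id of each kept score
    boxes = tf.gather(box_outputs, indices, batch_dims=1)
    return top_scores, boxes, classes

top_scores, top_boxes, top_classes = topk_class_boxes_sketch(
    tf.random.uniform([2, 1000, 90]), tf.random.uniform([2, 1000, 4]))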
def metric_fn(**kwargs): """Evaluation metric fn. Performed on CPU, do not reference TPU ops.""" eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) # add metrics to output cls_outputs = {} box_outputs = {} for level in range(params['min_level'], params['max_level'] + 1): cls_outputs[level] = kwargs['cls_outputs_%d' % level] box_outputs[level] = kwargs['box_outputs_%d' % level] detections = anchor_labeler.generate_detections( cls_outputs, box_outputs, kwargs['source_ids']) eval_metric = coco_metric.EvaluationMetric(params['val_json_file']) coco_metrics = eval_metric.estimator_metric_fn(detections, kwargs['image_scales']) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics
def _predict_postprocess(cls_outputs, box_outputs, labels, params): """Post-processes prediction outputs.""" predict_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) cls_outputs, box_outputs, anchor_boxes = postprocess.reshape_outputs( cls_outputs, box_outputs, predict_anchors.boxes, params['min_level'], params['max_level'], params['num_classes']) boxes, scores, classes, num_detections = postprocess.generate_detections( cls_outputs, box_outputs, anchor_boxes) predictions = { 'detection_boxes': boxes, 'detection_classes': classes, 'detection_scores': scores, 'num_detections': num_detections, } if labels is not None: predictions.update({ 'image_info': labels['image_info'], 'source_id': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], }) return predictions
def det_post_process_combined(params, cls_outputs, box_outputs, scales, min_score_thresh, max_boxes_to_draw): """A combined version of det_post_process with dynamic batch size support.""" batch_size = tf.shape(list(cls_outputs.values())[0])[0] cls_outputs_all = [] box_outputs_all = [] # Concatenates class and box of all levels into one tensor. for level in range(params['min_level'], params['max_level'] + 1): if params['data_format'] == 'channels_first': cls_outputs[level] = tf.transpose(cls_outputs[level], [0, 2, 3, 1]) box_outputs[level] = tf.transpose(box_outputs[level], [0, 2, 3, 1]) cls_outputs_all.append( tf.reshape(cls_outputs[level], [batch_size, -1, params['num_classes']])) box_outputs_all.append( tf.reshape(box_outputs[level], [batch_size, -1, 4])) cls_outputs_all = tf.concat(cls_outputs_all, 1) box_outputs_all = tf.concat(box_outputs_all, 1) # Create anchor_label for picking top-k predictions. eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_boxes = eval_anchors.boxes scores = tf.math.sigmoid(cls_outputs_all) # Apply bounding box regression to anchors. boxes = anchors.decode_box_outputs_tf(box_outputs_all, anchor_boxes) boxes = tf.expand_dims(boxes, axis=2) scales = tf.expand_dims(scales, axis=-1) nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = ( tf.image.combined_non_max_suppression(boxes, scores, max_boxes_to_draw, max_boxes_to_draw, score_threshold=min_score_thresh, clip_boxes=False)) del valid_detections # to be used in the future. image_ids = tf.cast(tf.tile(tf.expand_dims(tf.range(batch_size), axis=1), [1, max_boxes_to_draw]), dtype=tf.float32) y = nmsed_boxes[..., 0] * scales x = nmsed_boxes[..., 1] * scales height = nmsed_boxes[..., 2] * scales - y width = nmsed_boxes[..., 3] * scales - x detection_list = [ # Format: (image_ids, y, x, height, width, score, class) image_ids, y, x, height, width, nmsed_scores, tf.cast(nmsed_classes + 1, tf.float32) ] detections = tf.stack(detection_list, axis=2, name='detections') return detections
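# --- Hedged usage sketch for the combined-NMS path above, with dummy inputs.
# Shapes follow the function's expectations: class-agnostic boxes expanded to
# [batch, num_boxes, 1, 4] so all classes share the same box. The thresholds
# here are illustrative, not the repo's defaults.
import tensorflow as tf

batch, num_boxes, num_classes = 2, 1000, 90
boxes = tf.random.uniform([batch, num_boxes, 1, 4])   # not real [y1, x1, y2, x2]
scores = tf.random.uniform([batch, num_boxes, num_classes])
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
    tf.image.combined_non_max_suppression(
        boxes, scores,
        max_output_size_per_class=100, max_total_size=100,
        score_threshold=0.05, clip_boxes=False))
# Outputs are padded to 100 boxes per image; valid_detections counts the
# real (unpadded) detections for each image.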
def __init__(self, iou_loss_type, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size, **kwargs): super().__init__(**kwargs) self.iou_loss_type = iou_loss_type self.input_anchors = anchors.Anchors(min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size) self.box_coder = FasterRcnnBoxCoder()
def load_anchors(self, fn): """Load anchors from file.""" F = self.genoreader.get_nrows() T = self.phenoreader.get_nrows() self.anchors = anchors.Anchors(F, T) self.anchors.load(fn)
def get_pred_results(cls_outputs_dict, box_outputs_dict, params): input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], (params['image_size'] - 5)) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) return tf.map_fn(anchor_labeler.generate_detections, (cls_outputs_dict, box_outputs_dict), dtype=tf.float32)
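# --- Minimal sketch of the tf.map_fn pattern used above: a per-sample
# function is mapped over the leading (batch) dimension of a tuple of
# tensors. The toy `per_sample` stands in for generate_detections.
import tensorflow as tf

def per_sample(args):
    cls, box = args
    return tf.reduce_max(cls) + tf.reduce_sum(box)  # placeholder computation

cls_batch = tf.random.uniform([4, 10, 5])
box_batch = tf.random.uniform([4, 10, 4])
out = tf.map_fn(per_sample, (cls_batch, box_batch), dtype=tf.float32)
# out has shape [4]: one scalar result per batch element.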
def __init__(self, params): self._max_num_instances = MAX_NUM_INSTANCES self._image_size = params['image_size'] self._num_classes = params['num_classes'] input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], (params['image_size'] - 5)) self.anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
def det_post_process(params, class_outputs, box_outputs, scales): from object_detection.core.post_processing import \ batch_multiclass_non_max_suppression cls_outputs_all, box_outputs_all = [], [] for level in range(params['min_level'], params['max_level'] + 1): cls_outputs_all.append( tf.reshape(class_outputs[level], [params['batch_size'], -1, params['num_classes']])) box_outputs_all.append( tf.reshape(box_outputs[level], [params['batch_size'], -1, 4])) cls_outputs_all = tf.concat(cls_outputs_all, 1) box_outputs_all = tf.concat(box_outputs_all, 1) probs = tf.math.sigmoid(cls_outputs_all) # Generate location of anchors. eval_anchors = tf.transpose( anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']).boxes) ycenter_a = (eval_anchors[0] + eval_anchors[2]) / 2 xcenter_a = (eval_anchors[1] + eval_anchors[3]) / 2 ha = eval_anchors[2] - eval_anchors[0] wa = eval_anchors[3] - eval_anchors[1] # Generate absolute bboxes in the units of pixels of the image. box_outputs_per_sample = tf.transpose(box_outputs_all[0]) ty, tx, th, tw = (box_outputs_per_sample[0], box_outputs_per_sample[1], box_outputs_per_sample[2], box_outputs_per_sample[3]) w, h = tf.math.exp(tw) * wa, tf.math.exp(th) * ha ycenter, xcenter = ty * ha + ycenter_a, tx * wa + xcenter_a ymin, ymax = ycenter - h / 2.0, ycenter + h / 2.0 xmin, xmax = xcenter - w / 2.0, xcenter + w / 2.0 boxes = tf.transpose(tf.stack([ymin, xmin, ymax, xmax])) # Generate the outputs boxes_all = tf.reshape(boxes, [params['batch_size'], -1, 1, 4]) probs_all = tf.reshape( probs, [params['batch_size'], -1, params['num_classes']]) (boxes_tf, scores_tf, classes_tf, _, _, num_detections_tf) = \ batch_multiclass_non_max_suppression( boxes=boxes_all, scores=probs_all, score_thresh=0.5, iou_thresh=0.5, max_size_per_class=anchors.MAX_DETECTIONS_PER_IMAGE, max_total_size=anchors.MAX_DETECTIONS_PER_IMAGE, use_combined_nms=False, use_class_agnostic_nms=True) boxes_tf *= scales return [boxes_tf, scores_tf, classes_tf, num_detections_tf]
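# --- The decode step above is the standard Faster R-CNN box transform. A
# small NumPy sketch of the same equations for a single anchor/prediction
# pair (values are made up):
import numpy as np

ya1, xa1, ya2, xa2 = 10.0, 20.0, 50.0, 60.0  # anchor [y1, x1, y2, x2]
ty, tx, th, tw = 0.1, -0.2, 0.05, 0.3        # predicted regression targets
ha, wa = ya2 - ya1, xa2 - xa1
ycenter_a, xcenter_a = (ya1 + ya2) / 2, (xa1 + xa2) / 2
h, w = np.exp(th) * ha, np.exp(tw) * wa
ycenter, xcenter = ty * ha + ycenter_a, tx * wa + xcenter_a
box = [ycenter - h / 2, xcenter - w / 2, ycenter + h / 2, xcenter + w / 2]
# box is the decoded [ymin, xmin, ymax, xmax] in input-image coordinates.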
def metric_fn(**kwargs): """Evaluation metric fn. Performed on CPU, do not reference TPU ops.""" eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) # add metrics to output cls_outputs = {} box_outputs = {} detections_bs = [] for index in range(batch_size): for level in range(params['min_level'], params['max_level'] + 1): _, w, h, c = kwargs['cls_outputs_%d' % level].get_shape().as_list() cls_outputs[level] = tf.slice( kwargs['cls_outputs_%d' % level], [index, 0, 0, 0], [1, w, h, c]) _, w, h, c = kwargs['box_outputs_%d' % level].get_shape().as_list() box_outputs[level] = tf.slice( kwargs['box_outputs_%d' % level], [index, 0, 0, 0], [1, w, h, c]) detections = anchor_labeler.generate_detections( cls_outputs, box_outputs, tf.slice(kwargs['source_ids'], [index], [1]), tf.slice(kwargs['image_scales'], [index], [1])) detections_bs.append(detections) eval_metric = coco_metric.EvaluationMetric(params['val_json_file']) coco_metrics = eval_metric.estimator_metric_fn( detections_bs, kwargs['groundtruth_data']) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics
def _predict_postprocess(cls_outputs, box_outputs, params): """Post-processes prediction outputs.""" predict_anchors = anchors.Anchors( params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) cls_outputs, box_outputs, anchor_boxes = postprocess.reshape_outputs( cls_outputs, box_outputs, predict_anchors.boxes, params['min_level'], params['max_level'], params['num_classes']) boxes, scores, classes, num_detections = postprocess.generate_detections( cls_outputs, box_outputs, anchor_boxes) predictions = { 'detection_boxes': boxes, 'detection_classes': classes, 'detection_scores': scores, 'num_detections': num_detections, } return predictions
def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" batch_size = params['batch_size'] eval_anchors = anchors.Anchors( params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) coco_metrics = coco_metric_fn(batch_size, anchor_labeler, params['val_json_file'], **kwargs) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics
def gene_has_anchor(self, thresh, cis=True): """Computes whether a gene has a cis anchor. Args: thresh: p-value threshold for the anchor association. cis: if True, restrict anchors to cis genes (max distance between SNP and gene). """ F = self.genoreader.get_nrows() T = self.phenoreader.get_nrows() snp_ids = self.genoreader.getSnpIds() gene_ids = self.phenoreader.getGeneIds() RV = {'pv': [], 'snp_ids': [], 'gene_ids': [], 'isnp': [], 'igene': []} for f, pv0_f in self.assoc0_reader.getRowIterator(): pv_min = np.min(pv0_f) if pv_min > thresh: continue idx_anchor = pv0_f == pv_min if not idx_anchor.any(): continue if cis: idx_anchor[idx_anchor] = self.find_cis_genes(f, idx_anchor) if idx_anchor.any(): igenes = np.nonzero(idx_anchor)[0] for t in igenes: RV['pv'].append(pv_min) RV['snp_ids'].append(snp_ids[f]) RV['gene_ids'].append(gene_ids[t]) RV['isnp'].append(f) RV['igene'].append(t) for key in RV.keys(): RV[key] = np.array(RV[key]) self.anchors = anchors.Anchors(F, T, pv=RV['pv'], snp_ids=RV['snp_ids'], gene_ids=RV['gene_ids'], igene=RV['igene'], isnp=RV['isnp'])
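# --- Toy NumPy sketch of the anchor test above (illustrative values only):
# for one SNP's row of association p-values, keep the minimum-p gene(s)
# when they pass the threshold, as gene_has_anchor does per row.
import numpy as np

pv0_f = np.array([0.2, 1e-6, 0.05, 1e-6])  # p-values of one SNP vs. all genes
thresh = 1e-3
pv_min = np.min(pv0_f)
if pv_min <= thresh:
    idx_anchor = pv0_f == pv_min           # boolean mask of anchor gene(s)
    igenes = np.nonzero(idx_anchor)[0]     # -> array([1, 3])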
def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor], box_outputs: Dict[int, tf.Tensor], scales: List[float]): outputs = { 'cls_outputs_all': [None], 'box_outputs_all': [None], 'indices_all': [None], 'classes_all': [None] } add_metric_fn_inputs(params, cls_outputs, box_outputs, outputs) # Create anchor_label for picking top-k predictions. eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) # Add all detections for each input image. detections_batch = [] for index in range(params['batch_size']): # shape is [MAX_DETECTION_POINTS]: per-anchor scores cls_outputs_per_sample = outputs['cls_outputs_all'][index] # shape is [MAX_DETECTION_POINTS, 4]: boxes as (ty, tx, th, tw) box_outputs_per_sample = outputs['box_outputs_all'][index] # shape is [MAX_DETECTION_POINTS] indices_per_sample = outputs['indices_all'][index] # shape is [MAX_DETECTION_POINTS] classes_per_sample = outputs['classes_all'][index] detections = anchor_labeler.generate_detections( cls_outputs_per_sample, box_outputs_per_sample, indices_per_sample, classes_per_sample, image_id=[index], image_scale=[scales[index]], disable_pyfun=False) detections_batch.append(detections) # shape is [batch, M, 7]: [image_id, x, y, width, height, score, class] return tf.stack(detections_batch, name='detections')
def __init__(self, num_classes, block, layers): super(RetinaNet, self).__init__() self.inplanes = 64 self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if block == BasicBlock: fpn_sizes = [128, 256, 512] elif block == Bottleneck: fpn_sizes = [512, 1024, 2048] self.fpn = PFN(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) self.regression = BoxDetect(256) self.classification = Classification(256, num_classes=num_classes) self.anchors = anchors.Anchors() self.boxs_regression = BBoxTransform() self.clipBoxes = ClipBoxes() self.prior = 0.01 self.classification.out.weight.data.fill_(0) self.classification.out.bias.data.fill_(-math.log((1.0 - self.prior) / self.prior)) self.regression.out.weight.data.fill_(0) self.regression.out.bias.data.fill_(0) self.freeze_bn()
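# --- The bias initialization above follows the RetinaNet focal-loss recipe:
# the classification bias is set so that every anchor starts with foreground
# probability `prior`. A minimal check of that identity:
import math

prior = 0.01
bias = -math.log((1.0 - prior) / prior)
p = 1.0 / (1.0 + math.exp(-bias))  # sigmoid(bias)
assert abs(p - prior) < 1e-9       # initial predicted probability equals prior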
def __call__(self, params): input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) example_decoder = tf_example_decoder.TfExampleDecoder() def get_dataset_for_mode(data_dir, is_training): """Return the location of input samples for a given mode.""" if is_training: return '%s/coco_train2017_nocrowd-*' % data_dir return '%s/coco_val2017-*' % data_dir def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets.""" with tf.name_scope('parser'): data = example_decoder.decode(value) source_id = data['source_id'] image = data['image'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) # the image normalization is identical to Cloud TPU ResNet-50 image = tf.image.convert_image_dtype(image, dtype=tf.float32) image = _normalize_image(image) if params['input_rand_hflip']: image, boxes = preprocessor.random_horizontal_flip( image, boxes=boxes) image_original_shape = tf.shape(image) image, _ = preprocessor.resize_to_range( image, min_dimension=params['image_size'], max_dimension=params['image_size']) image_scale = tf.to_float( image_original_shape[0]) / tf.to_float(tf.shape(image)[0]) image, boxes = preprocessor.scale_boxes_to_pixel_coordinates( image, boxes, keypoints=None) image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'], params['image_size']) (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(boxes, classes) source_id = tf.string_to_number(source_id, out_type=tf.float32) row = (image, cls_targets, box_targets, num_positives, source_id, image_scale) return row batch_size = params['batch_size'] data_file_pattern = get_dataset_for_mode(self._data_dir, self._is_training) dataset = tf.data.Dataset.list_files(data_file_pattern) dataset = dataset.shuffle(buffer_size=1024) if self._is_training: dataset = dataset.repeat() def prefetch_dataset(filename): dataset = tf.data.TFRecordDataset(filename).prefetch(1) return dataset dataset = dataset.apply( tf.contrib.data.parallel_interleave(prefetch_dataset, cycle_length=32, sloppy=True)) dataset = dataset.shuffle(20) dataset = dataset.map(_dataset_parser, num_parallel_calls=64) dataset = dataset.prefetch(batch_size) dataset = dataset.apply( tf.contrib.data.batch_and_drop_remainder(batch_size)) dataset = dataset.prefetch(1) (images, cls_targets, box_targets, num_positives, source_ids, image_scales) = dataset.make_one_shot_iterator().get_next() labels = {} # count num_positives in a batch num_positives_batch = tf.reduce_mean(num_positives) labels['mean_num_positives'] = tf.reshape( tf.tile(tf.expand_dims(num_positives_batch, 0), [ batch_size, ]), [batch_size, 1]) for level in range(params['min_level'], params['max_level'] + 1): labels['cls_targets_%d' % level] = cls_targets[level] labels['box_targets_%d' % level] = box_targets[level] labels['source_ids'] = source_ids labels['image_scales'] = image_scales return images, labels
def __call__(self, params): input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) example_decoder = tf_example_decoder.TfExampleDecoder() def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets.""" with tf.name_scope('parser'): data = example_decoder.decode(value) source_id = data['source_id'] image = data['image'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) # Handle crowd annotations. As crowd annotations are not large # instances, the model ignores them in training. if params['skip_crowd']: indices = tf.where( tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) # the image normalization is identical to Cloud TPU ResNet-50 image = tf.image.convert_image_dtype(image, dtype=tf.float32) image = _normalize_image(image) if params['input_rand_hflip']: image, boxes = preprocessor.random_horizontal_flip( image, boxes=boxes) image_original_shape = tf.shape(image) image, _ = preprocessor.resize_to_range( image, min_dimension=params['image_size'], max_dimension=params['image_size']) image_scale = tf.to_float( image_original_shape[0]) / tf.to_float(tf.shape(image)[0]) image, boxes = preprocessor.scale_boxes_to_pixel_coordinates( image, boxes, keypoints=None) image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'], params['image_size']) (cls_targets, cls_weights, box_targets, box_weights, num_positives, num_negatives, num_ignored) = anchor_labeler.label_anchors(boxes, classes) source_id = tf.string_to_number(source_id, out_type=tf.float32) if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) row = (image, cls_targets, cls_weights, box_targets, box_weights, num_positives, num_negatives, num_ignored, source_id, image_scale) return row # batch_size = params['batch_size'] batch_size = self._batch_size dataset = tf.data.Dataset.list_files(self._file_pattern) dataset = dataset.shuffle(buffer_size=1024) if self._is_training: dataset = dataset.repeat() def prefetch_dataset(filename): dataset = tf.data.TFRecordDataset(filename, buffer_size=8 * 1000 * 1000) return dataset dataset = dataset.apply( tf.contrib.data.parallel_interleave(prefetch_dataset, cycle_length=1, sloppy=True)) dataset = dataset.shuffle(buffer_size=3072) dataset = dataset.map(_dataset_parser, num_parallel_calls=12) dataset = dataset.prefetch(32) dataset = dataset.apply( tf.contrib.data.batch_and_drop_remainder(batch_size)) dataset = dataset.prefetch(2) (images, cls_targets, cls_weights, box_targets, box_weights, num_positives, num_negatives, num_ignored, source_ids, image_scales) = dataset.make_one_shot_iterator().get_next() labels = {} # count num_positives in a batch num_positives_batch = tf.reduce_mean(num_positives) labels['mean_num_positives'] = tf.reshape( tf.tile(tf.expand_dims(num_positives_batch, 0), [ batch_size, ]), [batch_size, 1]) num_negatives_batch = tf.reduce_mean(num_negatives) labels['mean_num_negatives'] = tf.reshape( tf.tile(tf.expand_dims(num_negatives_batch, 0), [ batch_size, ]), [batch_size, 1]) num_ignored_batch = tf.reduce_mean(num_ignored) labels['mean_num_ignored'] = tf.reshape( tf.tile(tf.expand_dims(num_ignored_batch, 0), [batch_size]), [batch_size, 1]) for level in range(params['min_level'], 
params['max_level'] + 1): labels['cls_targets_%d' % level] = cls_targets[level] labels['cls_weights_%d' % level] = cls_weights[level] labels['box_targets_%d' % level] = box_targets[level] labels['box_weights_%d' % level] = box_weights[level] labels['source_ids'] = source_ids labels['image_scales'] = image_scales return images, labels
def build_model_graph(features, labels, is_training, params): """Builds the forward model graph.""" model_outputs = {} if params['transpose_input'] and is_training: features['images'] = tf.transpose(features['images'], [3, 0, 1, 2]) batch_size, image_height, image_width, _ = ( features['images'].get_shape().as_list()) if 'source_ids' not in features: features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32) all_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], (image_height, image_width)) with tf.variable_scope('resnet%s' % params['resnet_depth']): resnet_fn = resnet.resnet_v1( params['resnet_depth'], num_batch_norm_group=params['num_batch_norm_group']) backbone_feats = resnet_fn(features['images'], (params['is_training_bn'] and is_training)) fpn_feats = fpn.fpn(backbone_feats, params['min_level'], params['max_level']) rpn_score_outputs, rpn_box_outputs = heads.rpn_head( fpn_feats, params['min_level'], params['max_level'], len(params['aspect_ratios'] * params['num_scales'])) if is_training: rpn_pre_nms_topn = params['rpn_pre_nms_topn'] rpn_post_nms_topn = params['rpn_post_nms_topn'] else: rpn_pre_nms_topn = params['test_rpn_pre_nms_topn'] rpn_post_nms_topn = params['test_rpn_post_nms_topn'] rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois( rpn_score_outputs, rpn_box_outputs, all_anchors, features['image_info'], rpn_pre_nms_topn, rpn_post_nms_topn, params['rpn_nms_threshold'], params['rpn_min_size'], bbox_reg_weights=None, use_tpu=params['use_tpu']) rpn_box_rois = tf.to_float(rpn_box_rois) if is_training: rpn_box_rois = tf.stop_gradient(rpn_box_rois) rpn_box_scores = tf.stop_gradient(rpn_box_scores) if is_training: # Sampling box_targets, class_targets, rpn_box_rois, proposal_to_label_map = ( training_ops.proposal_label_op( rpn_box_rois, labels['gt_boxes'], labels['gt_classes'], features['image_info'], batch_size_per_im=params['batch_size_per_im'], fg_fraction=params['fg_fraction'], fg_thresh=params['fg_thresh'], bg_thresh_hi=params['bg_thresh_hi'], bg_thresh_lo=params['bg_thresh_lo'])) # Performs multi-level RoIAlign. 
box_roi_features = spatial_transform_ops.multilevel_crop_and_resize( fpn_feats, rpn_box_rois, output_size=7) class_outputs, box_outputs, _ = heads.box_head( box_roi_features, num_classes=params['num_classes'], mlp_head_dim=params['fast_rcnn_mlp_head_dim']) if not is_training: if params['use_tpu']: detections = postprocess_ops.generate_detections_tpu( class_outputs, box_outputs, rpn_box_rois, features['source_ids'], features['image_info'], params['test_rpn_post_nms_topn'], params['test_detections_per_image'], params['test_nms'], params['bbox_reg_weights']) else: detections = postprocess_ops.generate_detections_gpu( class_outputs, box_outputs, rpn_box_rois, features['source_ids'], features['image_info'], params['test_rpn_post_nms_topn'], params['test_detections_per_image'], params['test_nms'], params['bbox_reg_weights']) model_outputs.update({ 'detections': tf.identity(detections, 'Detections'), }) if params['output_box_features']: final_box_rois = detections[:, :, 1:5] final_roi_features = spatial_transform_ops.multilevel_crop_and_resize( fpn_feats, final_box_rois, output_size=7) _, _, final_box_features = heads.box_head( final_roi_features, num_classes=params['num_classes'], mlp_head_dim=params['fast_rcnn_mlp_head_dim']) model_outputs.update({ 'box_features': tf.identity(final_box_features, 'BoxFeatures'), }) else: encoded_box_targets = training_ops.encode_box_targets( rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights']) model_outputs.update({ 'rpn_score_outputs': rpn_score_outputs, 'rpn_box_outputs': rpn_box_outputs, 'class_outputs': class_outputs, 'box_outputs': box_outputs, 'class_targets': class_targets, 'box_targets': encoded_box_targets, 'box_rois': rpn_box_rois, }) # Faster-RCNN mode. if not params['include_mask']: return model_outputs # Mask sampling if not is_training: selected_box_rois = detections[:, :, 1:5] class_indices = tf.to_int32(detections[:, :, 6]) else: (selected_class_targets, selected_box_targets, selected_box_rois, proposal_to_label_map) = (training_ops.select_fg_for_masks( class_targets, box_targets, rpn_box_rois, proposal_to_label_map, max_num_fg=int(params['batch_size_per_im'] * params['fg_fraction']))) class_indices = tf.to_int32(selected_class_targets) mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize( fpn_feats, selected_box_rois, output_size=14) mask_outputs = heads.mask_head(mask_roi_features, class_indices, num_classes=params['num_classes'], mrcnn_resolution=params['mrcnn_resolution']) model_outputs.update({ 'mask_outputs': mask_outputs, }) if is_training: mask_targets = training_ops.get_mask_targets( selected_box_rois, proposal_to_label_map, selected_box_targets, labels['cropped_gt_masks'], params['mrcnn_resolution']) model_outputs.update({ 'mask_targets': mask_targets, 'selected_class_targets': selected_class_targets, }) else: model_outputs['mask_outputs'] = tf.identity( tf.nn.sigmoid(model_outputs['mask_outputs']), 'Masks') return model_outputs
def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets. Args: value: A dictionary containing an image and groundtruth annotations. Returns: features: a dictionary that contains the image and auxiliary information. The following describes {key: value} pairs in the dictionary. image: Image tensor that is preprocessed to have normalized value and fixed dimension [image_size, image_size, 3] image_info: image information that includes the original height and width, the scale of the processed image to the original image, and the scaled height and width. source_ids: Source image id. Default value -1 if the source id is empty in the groundtruth annotation. labels: a dictionary that contains auxiliary information plus (optional) labels. The following describes {key: value} pairs in the dictionary. `labels` is only for training. score_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensors with shape [height_l, width_l, num_anchors]. The height_l and width_l represent the dimension of the objectness score at l-th level. box_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensors with shape [height_l, width_l, num_anchors * 4]. The height_l and width_l represent the dimension of bounding box regression output at l-th level. gt_boxes: Groundtruth bounding box annotations. The box is represented in [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed dimension [self._max_num_instances, 4]. gt_classes: Groundtruth classes annotations. The tensor is padded with -1 to the fixed dimension [self._max_num_instances]. cropped_gt_masks: groundtruth masks cropped by the bounding box and resized to a fixed size determined by params['gt_mask_size']. """ with tf.name_scope('parser'): data = example_decoder.decode(value) data['groundtruth_is_crowd'] = tf.cond( tf.greater(tf.size(data['groundtruth_is_crowd']), 0), lambda: data['groundtruth_is_crowd'], lambda: tf.zeros_like(data['groundtruth_classes'], dtype=tf.bool)) image = data['image'] image = tf.image.convert_image_dtype(image, dtype=tf.float32) orig_image = image source_id = data['source_id'] source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id) source_id = tf.string_to_number(source_id) if self._mode == tf.estimator.ModeKeys.PREDICT: image = preprocess_ops.normalize_image(image) image, image_info, _, _, _ = preprocess_ops.resize_crop_pad( image, params['image_size'], 2**params['max_level']) if params['precision'] == 'bfloat16': image = tf.cast(image, dtype=tf.bfloat16) features = { 'images': image, 'image_info': image_info, 'source_ids': source_id, } if params['visualize_images_summary']: resized_image = tf.image.resize_images( orig_image, params['image_size']) features['orig_images'] = resized_image if params['include_groundtruth_in_features']: labels = _prepare_labels_for_eval( data, target_num_instances=self._max_num_instances, target_polygon_list_len=self.
_max_num_polygon_list_len, use_instance_mask=params['include_mask']) return {'features': features, 'labels': labels} else: return {'features': features} elif (self._mode == tf.estimator.ModeKeys.TRAIN or self._mode == tf.estimator.ModeKeys.EVAL): instance_masks = None if self._use_instance_mask: instance_masks = data['groundtruth_instance_masks'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) if not params['use_category']: classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32) if (params['skip_crowd_during_training'] and self._mode == tf.estimator.ModeKeys.TRAIN): indices = tf.where( tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) if self._use_instance_mask: instance_masks = tf.gather_nd( instance_masks, indices) image = preprocess_ops.normalize_image(image) # Random flipping for training only. if (self._mode == tf.estimator.ModeKeys.TRAIN and params['input_rand_hflip']): flipped_results = ( preprocess_ops.random_horizontal_flip( image, boxes=boxes, masks=instance_masks)) if self._use_instance_mask: image, boxes, instance_masks = flipped_results else: image, boxes = flipped_results # Scaling, jittering and padding. image, image_info, boxes, classes, cropped_gt_masks = ( preprocess_ops.resize_crop_pad( image, params['image_size'], 2**params['max_level'], aug_scale_min=params['aug_scale_min'], aug_scale_max=params['aug_scale_max'], boxes=boxes, classes=classes, masks=instance_masks, crop_mask_size=params['gt_mask_size'])) if cropped_gt_masks is not None: cropped_gt_masks = tf.pad(cropped_gt_masks, paddings=tf.constant([[ 0, 0, ], [ 2, 2, ], [2, 2]]), mode='CONSTANT', constant_values=0.) padded_height, padded_width, _ = image.get_shape().as_list( ) padded_image_size = (padded_height, padded_width) input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], padded_image_size) anchor_labeler = anchors.AnchorLabeler( input_anchors, params['num_classes'], params['rpn_positive_overlap'], params['rpn_negative_overlap'], params['rpn_batch_size_per_im'], params['rpn_fg_fraction']) # Assign anchors. score_targets, box_targets = anchor_labeler.label_anchors( boxes, classes) # Pad groundtruth data. boxes = preprocess_ops.pad_to_fixed_size( boxes, -1, [self._max_num_instances, 4]) classes = preprocess_ops.pad_to_fixed_size( classes, -1, [self._max_num_instances, 1]) # Pads cropped_gt_masks. if self._use_instance_mask: cropped_gt_masks = tf.reshape( cropped_gt_masks, tf.stack([tf.shape(cropped_gt_masks)[0], -1])) cropped_gt_masks = preprocess_ops.pad_to_fixed_size( cropped_gt_masks, -1, [ self._max_num_instances, (params['gt_mask_size'] + 4)**2 ]) cropped_gt_masks = tf.reshape(cropped_gt_masks, [ self._max_num_instances, params['gt_mask_size'] + 4, params['gt_mask_size'] + 4 ]) if params['precision'] == 'bfloat16': image = tf.cast(image, dtype=tf.bfloat16) features = { 'images': image, 'image_info': image_info, 'source_ids': source_id, } labels = {} for level in range(params['min_level'], params['max_level'] + 1): labels['score_targets_%d' % level] = score_targets[level] labels['box_targets_%d' % level] = box_targets[level] labels['gt_boxes'] = boxes labels['gt_classes'] = classes if self._use_instance_mask: labels['cropped_gt_masks'] = cropped_gt_masks return features, labels
def __call__(self, params): image_size = (params['image_size'], params['image_size']) input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], image_size) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'], params['rpn_positive_overlap'], params['rpn_negative_overlap'], params['rpn_batch_size_per_im'], params['rpn_fg_fraction']) example_decoder = tf_example_decoder.TfExampleDecoder( use_instance_mask=self._use_instance_mask) def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets. Args: value: A dictionary containing an image and groundtruth annotations. Returns: features: a dictionary that contains the image and auxiliary information. The following describes {key: value} pairs in the dictionary. image: Image tensor that is preprocessed to have normalized value and fixed dimension [image_size, image_size, 3] image_info: image information that includes the original height and width, the scale of the processed image to the original image, and the scaled height and width. source_ids: Source image id. Default value -1 if the source id is empty in the groundtruth annotation. labels: a dictionary that contains auxiliary information plus (optional) labels. The following describes {key: value} pairs in the dictionary. `labels` is only for training. score_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensors with shape [height_l, width_l, num_anchors]. The height_l and width_l represent the dimension of the objectness score at l-th level. box_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensors with shape [height_l, width_l, num_anchors * 4]. The height_l and width_l represent the dimension of bounding box regression output at l-th level. gt_boxes: Groundtruth bounding box annotations. The box is represented in [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed dimension [self._max_num_instances, 4]. gt_classes: Groundtruth classes annotations. The tensor is padded with -1 to the fixed dimension [self._max_num_instances].
cropped_gt_masks: groundtruth masks cropped by the bounding box and resized to a fixed size determined by params['gt_mask_size']. """ with tf.name_scope('parser'): data = example_decoder.decode(value) image = data['image'] source_id = data['source_id'] source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id) source_id = tf.string_to_number(source_id) if self._mode == tf.estimator.ModeKeys.PREDICT: input_processor = InstanceSegmentationInputProcessor( image, image_size) input_processor.normalize_image() input_processor.set_scale_factors_to_output_size() image = input_processor.resize_and_crop_image() if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) image_info = input_processor.get_image_info() return { 'images': image, 'image_info': image_info, 'source_ids': source_id } elif self._mode == tf.estimator.ModeKeys.TRAIN: instance_masks = None if self._use_instance_mask: instance_masks = data['groundtruth_instance_masks'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) if not params['use_category']: classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32) if (params['skip_crowd_during_training'] and self._mode == tf.estimator.ModeKeys.TRAIN): indices = tf.where( tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) if self._use_instance_mask: instance_masks = tf.gather_nd( instance_masks, indices) input_processor = InstanceSegmentationInputProcessor( image, image_size, boxes, classes, instance_masks) input_processor.normalize_image() if params['input_rand_hflip']: input_processor.random_horizontal_flip() input_processor.set_training_random_scale_factors( params['train_scale_min'], params['train_scale_max']) image = input_processor.resize_and_crop_image() boxes, classes = input_processor.resize_and_crop_boxes() if self._use_instance_mask: instance_masks = input_processor.resize_and_crop_masks() cropped_gt_masks = input_processor.crop_gt_masks( instance_masks, boxes, params['gt_mask_size'], image_size) # Assign anchors. score_targets, box_targets = anchor_labeler.label_anchors( boxes, classes) # Pad groundtruth data. image_info = input_processor.get_image_info() boxes *= image_info[2] boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4]) classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1]) # Pads cropped_gt_masks.
if self._use_instance_mask: cropped_gt_masks = tf.reshape( cropped_gt_masks, [self._max_num_instances, -1]) cropped_gt_masks = pad_to_fixed_size( cropped_gt_masks, -1, [ self._max_num_instances, (params['gt_mask_size'] + 4)**2 ]) cropped_gt_masks = tf.reshape(cropped_gt_masks, [ self._max_num_instances, params['gt_mask_size'] + 4, params['gt_mask_size'] + 4 ]) if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) features = {} features['images'] = image features['image_info'] = image_info features['source_ids'] = source_id labels = {} for level in range(params['min_level'], params['max_level'] + 1): labels['score_targets_%d' % level] = score_targets[level] labels['box_targets_%d' % level] = box_targets[level] labels['gt_boxes'] = boxes labels['gt_classes'] = classes if self._use_instance_mask: labels['cropped_gt_masks'] = cropped_gt_masks return (features, labels) batch_size = params['batch_size'] if 'batch_size' in params else 1 dataset = tf.data.Dataset.list_files( self._file_pattern, shuffle=(self._mode == tf.estimator.ModeKeys.TRAIN)) if self._mode == tf.estimator.ModeKeys.TRAIN: dataset = dataset.repeat() # Prefetch data from files. def _prefetch_dataset(filename): dataset = tf.data.TFRecordDataset(filename).prefetch(1) return dataset dataset = dataset.apply( tf.contrib.data.parallel_interleave( _prefetch_dataset, cycle_length=32, sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN))) if self._mode == tf.estimator.ModeKeys.TRAIN: dataset = dataset.shuffle(64) # Parse the fetched records to input tensors for model function. dataset = dataset.apply( tf.contrib.data.map_and_batch(_dataset_parser, batch_size=batch_size, num_parallel_batches=64, drop_remainder=True)) # Transposes images for TPU performance. # Given the batch size, the batch dimension (N) goes to either the minor # ((H, W, C, N) when N > C) or the second-minor ((H, W, N, C) when N < C) # dimension. Here, we assume N is 4 or 8 and C is 3, so we use # (H, W, C, N). if (params['transpose_input'] and self._mode == tf.estimator.ModeKeys.TRAIN): def _transpose_images(features, labels): features['images'] = tf.transpose(features['images'], [1, 2, 3, 0]) return features, labels dataset = dataset.map(_transpose_images, num_parallel_calls=64) dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) if self._num_examples > 0: dataset = dataset.take(self._num_examples) if self._use_fake_data: # Turn this dataset into a semi-fake dataset which always loops over the # first batch. This reduces variance in performance and is useful in # testing. dataset = dataset.take(1).cache().repeat() return dataset
def __call__(self, params=None): if params is None: params = self._params input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) example_decoder = tf_example_decoder.TfExampleDecoder() def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets. Args: value: A dictionary containing an image and groundtruth annotations. Returns: image: Image tensor that is preprocessed to have normalized value and fixed dimension [image_size, image_size, 3] cls_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensors with shape [height_l, width_l, num_anchors]. The height_l and width_l represent the dimension of class logits at l-th level. box_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensors with shape [height_l, width_l, num_anchors * 4]. The height_l and width_l represent the dimension of bounding box regression output at l-th level. num_positives: Number of positive anchors in the image. source_id: Source image id. Default value -1 if the source id is empty in the groundtruth annotation. image_scale: Scale of the processed image to the original image. boxes: Groundtruth bounding box annotations. The box is represented in [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed dimension [self._max_num_instances, 4]. is_crowds: Groundtruth annotations to indicate if an annotation represents a group of instances by value {0, 1}. The tensor is padded with 0 to the fixed dimension [self._max_num_instances]. areas: Groundtruth areas annotations. The tensor is padded with -1 to the fixed dimension [self._max_num_instances]. classes: Groundtruth classes annotations. The tensor is padded with -1 to the fixed dimension [self._max_num_instances]. """ with tf.name_scope('parser'): data = example_decoder.decode(value) source_id = data['source_id'] image = data['image'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) areas = data['groundtruth_area'] is_crowds = data['groundtruth_is_crowd'] if params['skip_crowd_during_training'] and self._is_training: indices = tf.where(tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) # NOTE: The autoaugment method works best when used alongside the # standard horizontal flipping of images along with size jittering # and normalization. if params.get('autoaugment_policy', None) and self._is_training: from aug import autoaugment # pylint: disable=g-import-not-at-top image, boxes = autoaugment.distort_image_with_autoaugment( image, boxes, params['autoaugment_policy']) input_processor = DetectionInputProcessor( image, params['image_size'], boxes, classes) input_processor.normalize_image() if self._is_training and params['input_rand_hflip']: input_processor.random_horizontal_flip() if self._is_training: input_processor.set_training_random_scale_factors( params['train_scale_min'], params['train_scale_max']) else: input_processor.set_scale_factors_to_output_size() image = input_processor.resize_and_crop_image() boxes, classes = input_processor.resize_and_crop_boxes() # Assign anchors.
(cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(boxes, classes) source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id) source_id = tf.string_to_number(source_id) # Pad groundtruth data for evaluation. image_scale = input_processor.image_scale_to_original boxes *= image_scale is_crowds = tf.cast(is_crowds, dtype=tf.float32) boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4]) is_crowds = pad_to_fixed_size(is_crowds, 0, [self._max_num_instances, 1]) areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1]) classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1]) if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) return (image, cls_targets, box_targets, num_positives, source_id, image_scale, boxes, is_crowds, areas, classes) dataset = tf.data.Dataset.list_files( self._file_pattern, shuffle=self._is_training) if horovod_enabled() and self._is_training: # multi-card eval is not supported yet # Shard the dataset evenly across the workers. dataset = dataset.shard(hvd.size(), hvd.rank()) if self._is_training: dataset = dataset.repeat() # Prefetch data from files. def _prefetch_dataset(filename): dataset = tf.data.TFRecordDataset(filename).prefetch(1) return dataset cycle_length = 1 if self._is_deterministic else 32 dataset = dataset.apply( tf.data.experimental.parallel_interleave( _prefetch_dataset, cycle_length=cycle_length, sloppy=self._is_training)) if self._is_training: dataset = dataset.shuffle(64) # Parse the fetched records to input tensors for model function. num_parallel_calls = 1 if self._is_deterministic else 64 dataset = dataset.map(_dataset_parser, num_parallel_calls=num_parallel_calls) batch_size = params['batch_size'] dataset = dataset.prefetch(batch_size) dataset = dataset.batch(batch_size, drop_remainder=True) def _process_example(images, cls_targets, box_targets, num_positives, source_ids, image_scales, boxes, is_crowds, areas, classes): """Processes one batch of data.""" labels = {} # Count num_positives in a batch. num_positives_batch = tf.reduce_mean(num_positives) labels['mean_num_positives'] = tf.reshape( tf.tile(tf.expand_dims(num_positives_batch, 0), [ batch_size, ]), [batch_size, 1]) for level in range(params['min_level'], params['max_level'] + 1): labels['cls_targets_%d' % level] = cls_targets[level] labels['box_targets_%d' % level] = box_targets[level] # Concatenate groundtruth annotations to a tensor. groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2) labels['source_ids'] = source_ids labels['groundtruth_data'] = groundtruth_data labels['image_scales'] = image_scales return images, labels dataset = dataset.map(_process_example) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) if self._use_fake_data: # Turn this dataset into a semi-fake dataset which always loops over the # first batch. This reduces variance in performance and is useful in # testing. dataset = dataset.take(1).cache().repeat() return dataset
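# --- Minimal sketch of the sharding used above: tf.data.Dataset.shard gives
# each worker a disjoint 1/num_workers slice of the file list. num_workers
# and worker_index stand in for hvd.size() and hvd.rank().
import tensorflow as tf

num_workers, worker_index = 4, 1
files = tf.data.Dataset.from_tensor_slices(
    ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7'])
shard = files.shard(num_workers, worker_index)
# Worker 1 sees every 4th element starting at index 1: f1, f5.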
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model definition for the RetinaNet model based on ResNet. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in data/dataloader.py mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT. params: the dictionary that defines the hyperparameters of the model. The default settings are in default_hparams function in this file. model: the RetinaNet model outputs class logits and box regression outputs. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. """ def _model_outputs(): return model( features, min_level=params['min_level'], max_level=params['max_level'], num_classes=params['num_classes'], num_anchors=len(params['aspect_ratios'] * params['num_scales']), resnet_depth=params['resnet_depth'], is_training_bn=params['is_training_bn']) if params['use_bfloat16']: with bfloat16.bfloat16_scope(): cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) else: cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'image': features, } for level in levels: predictions['cls_outputs_%d' % level] = cls_outputs[level] predictions['box_outputs_%d' % level] = box_outputs[level] eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) detections = anchor_labeler.generate_detections( cls_outputs, box_outputs, image_id=100) print("detection for image is", detections) predictions['detections'] = detections return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Load pretrained model from checkpoint. if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN: def scaffold_fn(): """Loads pretrained model through scaffold function.""" tf.train.init_from_checkpoint(params['resnet_checkpoint'], { '/': 'resnet%s/' % params['resnet_depth'], }) return tf.train.Scaffold() else: scaffold_fn = None # Set up training loss and learning rate. global_step = tf.train.get_global_step() learning_rate = _learning_rate_schedule( params['learning_rate'], params['lr_warmup_init'], params['lr_warmup_step'], params['lr_drop_step'], global_step) # cls_loss and box_loss are for logging. Only total_loss is optimized. total_loss, cls_loss, box_loss = _detection_loss(cls_outputs, box_outputs, labels, params) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer( learning_rate, momentum=params['momentum']) if params['use_tpu']: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = variable_filter_fn( tf.trainable_variables(), params['resnet_depth']) if variable_filter_fn else None with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) else: train_op = None # Evaluation only works on GPU/CPU host and batch_size=1 eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Evaluation metric fn. Performed on CPU, do not reference TPU ops.""" eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) # add metrics to output cls_outputs = {} box_outputs = {} for level in range(params['min_level'], params['max_level'] + 1): cls_outputs[level] = kwargs['cls_outputs_%d' % level] box_outputs[level] = kwargs['box_outputs_%d' % level] detections = anchor_labeler.generate_detections( cls_outputs, box_outputs, kwargs['source_ids']) eval_metric = coco_metric.EvaluationMetric(params['val_json_file']) coco_metrics = eval_metric.estimator_metric_fn(detections, kwargs['image_scales']) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics batch_size = params['batch_size'] cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ batch_size, ]), [batch_size, 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ batch_size, ]), [batch_size, 1]) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'source_ids': labels['source_ids'], 'image_scales': labels['image_scales'], } for level in range(params['min_level'], params['max_level'] + 1): metric_fn_inputs['cls_outputs_%d' % level] = cls_outputs[level] metric_fn_inputs['box_outputs_%d' % level] = box_outputs[level] eval_metrics = (metric_fn, metric_fn_inputs) return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn)
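# --- Hedged sketch of the TPUEstimatorSpec eval_metrics contract used above:
# a (metric_fn, tensors) pair in which every tensor carries a leading batch
# dimension, and metric_fn runs on the host and returns tf.metrics ops.
# TF1-style API, toy values only.
import tensorflow as tf

def toy_metric_fn(**kwargs):
    return {'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat'])}

batch_size = 8
cls_loss = tf.constant(0.5)  # stands in for the scalar training loss
metric_inputs = {
    'cls_loss_repeat': tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [batch_size]), [batch_size, 1]),
}
eval_metrics = (toy_metric_fn, metric_inputs)  # passed to TPUEstimatorSpec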
def main(argv): # pbModel_path = './models/pb/blazeFace_model_test.pb' pbModel_path = r'C:\Users\17ZY-HPYKFD2\Downloads\dFServer\blazeFace_model_test.pb' WIDTH_DES = 256 HEIGHT_DES = 256 USE_NORM = True UPSCALE = False anchorsC = anchors.Anchors() boxes_vec = anchorsC.get_anchors(fmSizes=[(16, 16), (8, 8)], fmBased=True) # Setup tensorflow and model os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # Force on CPU os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Force on CPU config = tf.ConfigProto() tf.reset_default_graph() with tf.Session(config=config) as sess: ret = True # Loop through video data while ret == True: # ret, frame = vid_in.read() frame = cv2.imread('./img_381.jpg') frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if UPSCALE: r = WIDTH_DES * 2 / frame.shape[1] dim_des = (int(WIDTH_DES * 2), int(frame.shape[0] * r)) frame = cv2.resize(frame, dim_des, interpolation=cv2.INTER_LANCZOS4) c_shp = frame.shape frame = frame[int(c_shp[0] / 4):-int(c_shp[0] / 4), int((c_shp[1] - WIDTH_DES) / 2):-int((c_shp[1] - WIDTH_DES) / 2)] else: r = WIDTH_DES / max(frame.shape[1], frame.shape[0]) dim_des = (int(WIDTH_DES), int(frame.shape[1] * r)) # frame = cv2.resize(frame, (WIDTH_DES, HEIGHT_DES)) frame = cv2.resize(frame, (0, 0), fx=r, fy=r) # (WIDTH_DES, HEIGHT_DES)) frame = np.pad(frame, ((0, HEIGHT_DES - frame.shape[0]), (0, WIDTH_DES - frame.shape[1]), (0, 0)), mode='constant') # frame_padded = lighting_balance(frame) # frame_padded = cv2.copyMakeBorder(frame, 0, max(0, HEIGHT_DES - frame.shape[0]), 0, 0, cv2.BORDER_CONSTANT, value=(0,0,0)) # pred_confs, pred_locs = model.test_iter(np.expand_dims(frame, axis = 0)) tmp_frame = frame / 255. pred_locs, pred_confs = freeze_graph_test(pbModel_path, np.expand_dims(tmp_frame, axis=0)) # f = open('paramR.txt', 'w', encoding='utf-8') # confT = pred_confs[0][:, 1] # for conf in confT: # print(str(conf), file=f) # f.close() # exit(1) f = open('paramR.txt', 'w', encoding='utf-8') for i in range(boxes_vec.shape[0]): l = pred_locs[0][i][0] t = pred_locs[0][i][1] r = pred_locs[0][i][2] b = pred_locs[0][i][3] p = pred_confs[0][i][1] print('index:', i, ', L:', l, ', T:', t, ', R:', r, ', B:', b, ', P:', p, file=f) f.close() pred_boxes = decode_batch(boxes_vec, pred_locs, pred_confs, min_conf=0.3)[0] pred_boxes[pred_boxes < 0] = 0 pred_boxes[:, [0, 2]][pred_boxes[:, [0, 2]] > WIDTH_DES] = WIDTH_DES pred_boxes[:, [1, 3]][pred_boxes[:, [1, 3]] > HEIGHT_DES] = HEIGHT_DES h, w = HEIGHT_DES, WIDTH_DES for box in pred_boxes.tolist(): if USE_NORM: print(int(box[0] * w), int(box[1] * h), int(box[2] * w), int(box[3] * h)) cv2.rectangle(frame, (int(box[0] * w), int(box[1] * h)), (int(box[2] * w), int(box[3] * h)), (0, 255, 0), 3) # cv2.rectangle(frame, (480, 72), (654, 294), (0, 255, 0), 3) else: cv2.rectangle(frame, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), (255, 0, 0), 3) cv2.imshow('Webcam', frame) frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) cv2.imwrite('./tmp.jpg', frame) cv2.waitKey(1) ret = False # vid_in.release() cv2.destroyAllWindows()
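# --- Small NumPy/OpenCV sketch of the aspect-preserving resize-and-pad used
# above when UPSCALE is False: scale the long side to WIDTH_DES, then
# zero-pad bottom/right to a square network input. The input image is dummy.
import cv2
import numpy as np

WIDTH_DES = HEIGHT_DES = 256
frame = np.zeros((480, 640, 3), dtype=np.uint8)      # dummy input image
r = WIDTH_DES / max(frame.shape[1], frame.shape[0])  # uniform scale factor
frame = cv2.resize(frame, (0, 0), fx=r, fy=r)        # -> shape (192, 256, 3)
frame = np.pad(frame,
               ((0, HEIGHT_DES - frame.shape[0]),
                (0, WIDTH_DES - frame.shape[1]), (0, 0)),
               mode='constant')                      # -> shape (256, 256, 3)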
def _model_fn(features, labels, mode, params, variable_filter_fn=None):
  """Model definition for the Mask-RCNN model based on ResNet.

  Args:
    features: the input image tensor and auxiliary information, such as
      `image_info` and `source_ids`. The image tensor has a shape of
      [batch_size, height, width, 3]. The height and width are fixed and
      equal.
    labels: the input labels in a dictionary. The labels include score targets
      and box targets which are dense label maps. The labels are generated
      from the get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the hyperparameters of the model. The
      default settings are in the default_hparams function in this file.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
  """
  if params['transpose_input'] and mode == tf.estimator.ModeKeys.TRAIN:
    features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])

  image_size = (params['image_size'], params['image_size'])
  all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                params['num_scales'], params['aspect_ratios'],
                                params['anchor_scale'], image_size)

  def _model_outputs():
    """Generates outputs from the model."""
    model_outputs = {}

    with tf.variable_scope('resnet%s' % params['resnet_depth']):
      resnet_fn = resnet.resnet_v1(
          params['resnet_depth'],
          num_batch_norm_group=params['num_batch_norm_group'])
      backbone_feats = resnet_fn(features['images'],
                                 params['is_training_bn'])

    fpn_feats = fpn.fpn(backbone_feats, params['min_level'],
                        params['max_level'])

    rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
        fpn_feats, params['min_level'], params['max_level'],
        len(params['aspect_ratios']) * params['num_scales'])

    if mode == tf.estimator.ModeKeys.TRAIN:
      rpn_pre_nms_topn = params['rpn_pre_nms_topn']
      rpn_post_nms_topn = params['rpn_post_nms_topn']
    else:
      rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
      rpn_post_nms_topn = params['test_rpn_post_nms_topn']

    _, rpn_box_rois = mask_rcnn_architecture.proposal_op(
        rpn_score_outputs, rpn_box_outputs, all_anchors,
        features['image_info'], rpn_pre_nms_topn, rpn_post_nms_topn,
        params['rpn_nms_threshold'], params['rpn_min_size'])
    rpn_box_rois = tf.to_float(rpn_box_rois)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # Sampling.
      box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
          mask_rcnn_architecture.proposal_label_op(
              rpn_box_rois,
              labels['gt_boxes'],
              labels['gt_classes'],
              features['image_info'],
              batch_size_per_im=params['batch_size_per_im'],
              fg_fraction=params['fg_fraction'],
              fg_thresh=params['fg_thresh'],
              bg_thresh_hi=params['bg_thresh_hi'],
              bg_thresh_lo=params['bg_thresh_lo']))

    # Performs multi-level RoIAlign.
    box_roi_features = ops.multilevel_crop_and_resize(
        fpn_feats, rpn_box_rois, output_size=7)

    class_outputs, box_outputs = heads.box_head(
        box_roi_features, num_classes=params['num_classes'],
        mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

    if mode != tf.estimator.ModeKeys.TRAIN:
      batch_size, _, _ = class_outputs.get_shape().as_list()
      detections = []
      softmax_class_outputs = tf.nn.softmax(class_outputs)
      for i in range(batch_size):
        detections.append(
            anchors.generate_detections_per_image_op(
                softmax_class_outputs[i], box_outputs[i], rpn_box_rois[i],
                features['source_ids'][i], features['image_info'][i],
                params['test_detections_per_image'],
                params['test_rpn_post_nms_topn'], params['test_nms'],
                params['bbox_reg_weights']))
      detections = tf.stack(detections, axis=0)
      model_outputs.update({
          'detections': detections,
      })
    else:
      encoded_box_targets = mask_rcnn_architecture.encode_box_targets(
          rpn_box_rois, box_targets, class_targets,
          params['bbox_reg_weights'])
      model_outputs.update({
          'rpn_score_outputs': rpn_score_outputs,
          'rpn_box_outputs': rpn_box_outputs,
          'class_outputs': class_outputs,
          'box_outputs': box_outputs,
          'class_targets': class_targets,
          'box_targets': encoded_box_targets,
          'box_rois': rpn_box_rois,
      })

    # Faster-RCNN mode.
    if not params['include_mask']:
      return model_outputs

    # Mask sampling.
    if mode != tf.estimator.ModeKeys.TRAIN:
      selected_box_rois = detections[:, :, 1:5]
      class_indices = tf.to_int32(detections[:, :, 6])
    else:
      (selected_class_targets, selected_box_targets, selected_box_rois,
       proposal_to_label_map) = (
           mask_rcnn_architecture.select_fg_for_masks(
               class_targets, box_targets, rpn_box_rois,
               proposal_to_label_map,
               max_num_fg=int(params['batch_size_per_im'] *
                              params['fg_fraction'])))
      class_indices = tf.to_int32(selected_class_targets)

    mask_roi_features = ops.multilevel_crop_and_resize(
        fpn_feats, selected_box_rois, output_size=14)
    mask_outputs = heads.mask_head(
        mask_roi_features, class_indices,
        num_classes=params['num_classes'],
        mrcnn_resolution=params['mrcnn_resolution'])
    model_outputs.update({
        'mask_outputs': mask_outputs,
    })

    if mode == tf.estimator.ModeKeys.TRAIN:
      mask_targets = mask_rcnn_architecture.get_mask_targets(
          selected_box_rois, proposal_to_label_map, selected_box_targets,
          labels['cropped_gt_masks'], params['mrcnn_resolution'])
      model_outputs.update({
          'mask_targets': mask_targets,
          'selected_class_targets': selected_class_targets,
      })
    return model_outputs

  if params['use_bfloat16']:
    with tf.contrib.tpu.bfloat16_scope():
      model_outputs = _model_outputs()

      def cast_outputs_to_float(d):
        for k, v in sorted(six.iteritems(d)):
          if isinstance(v, dict):
            cast_outputs_to_float(v)
          else:
            d[k] = tf.cast(v, tf.float32)

      cast_outputs_to_float(model_outputs)
  else:
    model_outputs = _model_outputs()

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {}
    predictions['detections'] = model_outputs['detections']
    predictions['image_info'] = features['image_info']
    if params['include_mask']:
      predictions['mask_outputs'] = tf.nn.sigmoid(
          model_outputs['mask_outputs'])
    if params['use_tpu']:
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                             predictions=predictions)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Set up training loss and learning rate.
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
      global_step, params['init_learning_rate'],
      params['warmup_learning_rate'], params['warmup_steps'],
      params['learning_rate_levels'], params['learning_rate_steps'])
  # score_loss and box_loss are for logging; only total_loss is optimized.
  total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
      model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
      labels, params)

  (total_fast_rcnn_loss, fast_rcnn_class_loss,
   fast_rcnn_box_loss) = losses.fast_rcnn_loss(
       model_outputs['class_outputs'], model_outputs['box_outputs'],
       model_outputs['class_targets'], model_outputs['box_targets'], params)
  # Only training has the mask loss.
  # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py  # pylint: disable=line-too-long
  if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
    mask_loss = losses.mask_rcnn_loss(
        model_outputs['mask_outputs'], model_outputs['mask_targets'],
        model_outputs['selected_class_targets'], params)
  else:
    mask_loss = 0.

  if variable_filter_fn:
    var_list = variable_filter_fn(tf.trainable_variables(),
                                  params['resnet_depth'])
  else:
    var_list = None
  l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
      tf.nn.l2_loss(v)
      for v in var_list
      if 'batch_normalization' not in v.name and 'bias' not in v.name
  ])
  total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
                l2_regularization_loss)

  host_call = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = create_optimizer(learning_rate, params)
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    if not params['resnet_checkpoint']:
      scaffold_fn = None
    else:

      def scaffold_fn():
        """Loads the pretrained model through a scaffold function."""
        # Exclude all variables of the optimizer.
        optimizer_vars = set([var.name for var in optimizer.variables()])
        prefix = 'resnet%s/' % params['resnet_depth']
        resnet_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        prefix)
        vars_to_load = {}
        for var in resnet_vars:
          if var.name not in optimizer_vars:
            var_name = var.name
            # Trim the output index (e.g. ':0') of the variable name.
            if ':' in var_name:
              var_name = var_name[:var_name.rindex(':')]
            if params['skip_checkpoint_variables'] and re.match(
                params['skip_checkpoint_variables'],
                var_name[len(prefix):]):
              continue
            vars_to_load[var_name[len(prefix):]] = var_name
        for var in optimizer_vars:
          tf.logging.info('Optimizer vars: %s.' % var)
        var_names = sorted(vars_to_load.keys())
        for k in var_names:
          tf.logging.info('Will train: "%s": "%s",' % (k, vars_to_load[k]))
        tf.train.init_from_checkpoint(params['resnet_checkpoint'],
                                      vars_to_load)
        if not vars_to_load:
          raise ValueError('Variables to load is empty.')
        return tf.train.Scaffold()

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
    if params['global_gradient_clip_ratio'] > 0:
      # Clips the gradients for training stability.
      # Reference: https://arxiv.org/abs/1211.5063
      with tf.name_scope('clipping'):
        old_grads, variables = zip(*grads_and_vars)
        num_weights = sum(
            g.shape.num_elements() for g in old_grads if g is not None)
        clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
            num_weights)
        tf.logging.info(
            'Global clip norm set to %g for %d variables with %d elements.'
            % (clip_norm,
               sum(1 for g in old_grads if g is not None), num_weights))
        gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
    else:
      gradients, variables = zip(*grads_and_vars)
    grads_and_vars = []
    # Special treatment for biases (beta is named as bias in the reference
    # model).
    # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113  # pylint: disable=line-too-long
    for grad, var in zip(gradients, variables):
      # Guard against None gradients for variables outside var_list.
      if grad is not None and ('beta' in var.name or 'bias' in var.name):
        grad = 2.0 * grad
      grads_and_vars.append((grad, var))
    minimize_op = optimizer.apply_gradients(grads_and_vars,
                                            global_step=global_step)

    with tf.control_dependencies(update_ops):
      train_op = minimize_op

    if params['use_host_call']:

      def host_call_fn(global_step, total_loss, total_rpn_loss,
                       rpn_score_loss, rpn_box_loss, total_fast_rcnn_loss,
                       fast_rcnn_class_loss, fast_rcnn_box_loss, mask_loss,
                       learning_rate):
        """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly
        reference any Tensors in the rest of the `model_fn`. To pass Tensors
        from the model to the `metric_fn`, provide them as part of the
        `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the
        second element in the tuple passed to `host_call`.

        Args:
          global_step: `Tensor` with shape `[batch]` for the global_step.
          total_loss: `Tensor` with shape `[batch]` for the training loss.
          total_rpn_loss: `Tensor` with shape `[batch]` for the training RPN
            loss.
          rpn_score_loss: `Tensor` with shape `[batch]` for the training RPN
            score loss.
          rpn_box_loss: `Tensor` with shape `[batch]` for the training RPN
            box loss.
          total_fast_rcnn_loss: `Tensor` with shape `[batch]` for the
            training Mask-RCNN loss.
          fast_rcnn_class_loss: `Tensor` with shape `[batch]` for the
            training Mask-RCNN class loss.
          fast_rcnn_box_loss: `Tensor` with shape `[batch]` for the training
            Mask-RCNN box loss.
          mask_loss: `Tensor` with shape `[batch]` for the training Mask-RCNN
            mask loss.
          learning_rate: `Tensor` with shape `[batch]` for the learning rate.

        Returns:
          List of summary ops to run on the CPU host.
        """
        # Outfeed supports int32 but global_step is expected to be int64.
        global_step = tf.reduce_mean(global_step)
        # Host call fns are executed FLAGS.iterations_per_loop times after
        # one TPU loop is finished; setting the max_queue value to the number
        # of iterations makes the summary writer flush the data to storage
        # only once per loop.
        with (tf.contrib.summary.create_file_writer(
            params['model_dir'],
            max_queue=params['iterations_per_loop']).as_default()):
          with tf.contrib.summary.always_record_summaries():
            tf.contrib.summary.scalar(
                'total_loss', tf.reduce_mean(total_loss), step=global_step)
            tf.contrib.summary.scalar(
                'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
                step=global_step)
            tf.contrib.summary.scalar(
                'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
                step=global_step)
            tf.contrib.summary.scalar(
                'rpn_box_loss', tf.reduce_mean(rpn_box_loss),
                step=global_step)
            tf.contrib.summary.scalar(
                'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
                step=global_step)
            tf.contrib.summary.scalar(
                'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
                step=global_step)
            tf.contrib.summary.scalar(
                'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
                step=global_step)
            if params['include_mask']:
              tf.contrib.summary.scalar(
                  'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
            tf.contrib.summary.scalar(
                'learning_rate', tf.reduce_mean(learning_rate),
                step=global_step)

            return tf.contrib.summary.all_summary_ops()

      # To log the loss, current learning rate, and epoch for TensorBoard,
      # the summary op needs to be run on the host CPU via host_call.
      # host_call expects [batch_size, ...] Tensors, thus reshape to
      # introduce a batch dimension. These Tensors are implicitly
      # concatenated to [params['batch_size']].
      global_step_t = tf.reshape(global_step, [1])
      total_loss_t = tf.reshape(total_loss, [1])
      total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
      rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
      rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
      total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
      fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
      fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
      mask_loss_t = tf.reshape(mask_loss, [1])
      learning_rate_t = tf.reshape(learning_rate, [1])

      host_call = (host_call_fn, [
          global_step_t, total_loss_t, total_rpn_loss_t, rpn_score_loss_t,
          rpn_box_loss_t, total_fast_rcnn_loss_t, fast_rcnn_class_loss_t,
          fast_rcnn_box_loss_t, mask_loss_t, learning_rate_t
      ])
  else:
    train_op = None
    scaffold_fn = None

  return tf.contrib.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      host_call=host_call,
      scaffold_fn=scaffold_fn)
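# The gradient clipping in the model_fn above scales the threshold with model
# size: for a clip ratio c and n total weights, clip_norm = c * sqrt(n), so
# the allowed per-weight RMS gradient stays constant regardless of parameter
# count (a uniform gradient of magnitude g over n weights has global norm
# g * sqrt(n)). A minimal sketch of the arithmetic (helper name illustrative):
import math


def global_clip_norm(num_weights, clip_ratio):
  """Clip threshold that keeps the allowed per-weight RMS gradient fixed."""
  return clip_ratio * math.sqrt(num_weights)

# Example: clip_ratio=0.02 with 25M weights gives a global-norm cap of
# 0.02 * sqrt(25e6) = 100. The bias/beta gradient doubling right after the
# clipping follows the Detectron convention of training biases with twice the
# base learning rate.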
def __call__(self, params):
  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  params['image_size'])
  anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                         params['num_classes'])
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def _dataset_parser(value):
    """Parses data to a fixed-dimension input image and learning targets.

    Args:
      value: A dictionary containing an image and groundtruth annotations.

    Returns:
      image: Image tensor that is preprocessed to have normalized value and
        fixed dimension [image_size, image_size, 3].
      cls_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors]. The height_l and width_l
        represent the dimension of class logits at the l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4]. The height_l and width_l
        represent the dimension of bounding box regression output at the
        l-th level.
      num_positives: Number of positive anchors in the image.
      source_id: Source image id. Default value -1 if the source id is empty
        in the groundtruth annotation.
      image_scale: Scale of the processed image to the original image.
      boxes: Groundtruth bounding box annotations. The box is represented in
        [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
        dimension [self._max_num_instances, 4].
      is_crowds: Groundtruth annotations to indicate if an annotation
        represents a group of instances by value {0, 1}. The tensor is
        padded with 0 to the fixed dimension [self._max_num_instances].
      areas: Groundtruth area annotations. The tensor is padded with -1 to
        the fixed dimension [self._max_num_instances].
      classes: Groundtruth class annotations. The tensor is padded with -1
        to the fixed dimension [self._max_num_instances].
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
      areas = data['groundtruth_area']
      is_crowds = data['groundtruth_is_crowd']
      if params['skip_crowd_during_training'] and self._is_training:
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)

      input_processor = DetectionInputProcessor(image,
                                                params['image_size'],
                                                boxes, classes)
      input_processor.normalize_image()
      if self._is_training and params['input_rand_hflip']:
        input_processor.random_horizontal_flip()
      if self._is_training:
        input_processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
      else:
        input_processor.set_scale_factors_to_output_size()
      image = input_processor.resize_and_crop_image()
      boxes, classes = input_processor.resize_and_crop_boxes()

      # Assign anchors.
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(boxes, classes)

      source_id = tf.where(
          tf.equal(source_id, tf.constant('')), '-1', source_id)
      source_id = tf.string_to_number(source_id)

      # Pad groundtruth data for evaluation.
      image_scale = input_processor.image_scale_to_original
      boxes *= image_scale
      is_crowds = tf.cast(is_crowds, dtype=tf.float32)
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      is_crowds = pad_to_fixed_size(is_crowds, 0,
                                    [self._max_num_instances, 1])
      areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
      classes = pad_to_fixed_size(classes, -1,
                                  [self._max_num_instances, 1])
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)
      return (image, cls_targets, box_targets, num_positives, source_id,
              image_scale, boxes, is_crowds, areas, classes)

  batch_size = params['batch_size']
  # Note: tf.random.set_random_seed returns None, so it must not be passed
  # as the `seed` argument; seed the file shuffle with an integer instead.
  dataset = tf.data.Dataset.list_files(
      self._file_pattern,
      shuffle=self._is_training,
      seed=int(time.time() * 1e9))
  if self._is_training:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          _prefetch_dataset, cycle_length=32, sloppy=self._is_training))
  if self._is_training:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for the model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, cls_targets, box_targets, num_positives,
                       source_ids, image_scales, boxes, is_crowds, areas,
                       classes):
    """Processes one batch of data."""
    labels = {}
    # Count num_positives in a batch.
    num_positives_batch = tf.reduce_mean(num_positives)
    labels['mean_num_positives'] = tf.reshape(
        tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
        [batch_size, 1])

    for level in range(params['min_level'], params['max_level'] + 1):
      labels['cls_targets_%d' % level] = cls_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                 axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_scales'] = image_scales
    return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  return dataset
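# `pad_to_fixed_size` (used in the parser above but defined elsewhere in the
# codebase) pads a variable-length [n, d] tensor with a fill value up to a
# fixed row count so every example in a batch has the same static shape. A
# minimal sketch of one possible implementation, under the assumption that
# rows beyond `n` are pure fill (the helper name marks it as a sketch):
import tensorflow as tf


def pad_to_fixed_size_sketch(data, pad_value, output_shape):
  """Pads [n, d] `data` with `pad_value` rows up to output_shape=[m, d]."""
  max_instances, dim = output_shape
  data = tf.reshape(data, [-1, dim])
  num_instances = tf.shape(data)[0]
  pad_rows = max_instances - num_instances
  padding = pad_value * tf.ones([pad_rows, dim], dtype=data.dtype)
  return tf.reshape(tf.concat([data, padding], axis=0), output_shape)

# The fill values chosen above (-1 for boxes/areas/classes, 0 for is_crowds)
# let the evaluation code distinguish real instances from padding.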
def main(dataPath=None):
  pbModel_path = './models/pb/blazeFace_model_test.pb'
  # pbModel_path = r'C:\Users\17ZY-HPYKFD2\Downloads\dFServer\blazeFace_model_test.pb'
  if dataPath is not None:
    data_test_dir = dataPath
  else:
    data_test_dir = '/data1/image_data/data/faces/zhengmian_0815'
  if not os.path.exists(data_test_dir):
    print('dataDir not found:', data_test_dir)
    exit(-1)

  # tail = 'FDDB' if 'FDDB' in data_test_dir else 'Self'
  tail = 'Self'
  storePath = './tmpDetImgs_self'
  if not os.path.exists(storePath):
    os.makedirs(storePath)
  else:
    os.system('rm -rf ' + storePath)
    os.makedirs(storePath)

  WIDTH_DES = 256
  HEIGHT_DES = 256
  anchorsC = anchors.Anchors()
  boxes_vec = anchorsC.get_anchors(fmSizes=[(16, 16), (8, 8)], fmBased=True)

  # Set up TensorFlow; force execution on CPU.
  os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
  os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # GPU index; -1 hides all GPUs.
  with tf.Graph().as_default():
    output_graph_def = tf.GraphDef()
    with open(pbModel_path, 'rb') as f:
      output_graph_def.ParseFromString(f.read())
      tf.import_graph_def(output_graph_def, name='')
    with tf.Session() as sess:
      # Input tensor name matching the network's input placeholder.
      input_image_tensor = sess.graph.get_tensor_by_name('input:0')
      # Output tensor names: per-anchor class probabilities and box
      # regressions.
      output_tensor_probs = sess.graph.get_tensor_by_name(
          'BlazeNet/probs:0')
      output_tensor_locs = sess.graph.get_tensor_by_name('BlazeNet/reg:0')

      f = open(
          'result_mobileNetSelf_' +
          data_test_dir.split('/')[-1 if data_test_dir[-1] != '/' else -2] +
          '.txt', 'w', encoding='utf-8')
      for line in os.listdir(data_test_dir):
        if line.endswith('.jpg'):
          print('process line:', line)
          xmlPath = os.path.join(data_test_dir,
                                 line.split('.')[0] + '.json')
          filePath = os.path.join(data_test_dir, line)
          frame = cv2.imread(filePath)
          OSize = frame.shape
          frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
          # Resize so the longer side equals WIDTH_DES, then pad to a square.
          r = WIDTH_DES / max(frame.shape[1], frame.shape[0])
          bt = time.time()
          frame = cv2.resize(frame, (0, 0), fx=r, fy=r)
          frame = np.pad(frame,
                         ((0, HEIGHT_DES - frame.shape[0]),
                          (0, WIDTH_DES - frame.shape[1]), (0, 0)),
                         mode='constant')
          tmp_frame = frame / 255.
          pred_locs, pred_confs = sess.run(
              [output_tensor_locs, output_tensor_probs],
              feed_dict={
                  input_image_tensor: np.expand_dims(tmp_frame, axis=0)
              })
          pred_boxes = decode_batch(boxes_vec, pred_locs, pred_confs,
                                    min_conf=0.5)[0]
          pred_boxes[pred_boxes < 0] = 0
          totalT = time.time() - bt
          h, w = HEIGHT_DES, WIDTH_DES
          tmpS = line + '\t' + str(totalT) + '\t'
          # `drawOriBox`, `dstSize`, and `getGTBoxes` are module-level
          # helpers defined elsewhere in this script.
          if drawOriBox:
            GT_box = getGTBoxes(xmlPath)
            for i in range(len(GT_box)):
              GBox = GT_box[i]
              if dstSize:
                r = dstSize / max(OSize[0], OSize[1])
                GBox = (np.array(GBox) * r).astype(np.int32)
              cv2.rectangle(frame, (GBox[0], GBox[1]),
                            (GBox[2], GBox[3]), (0, 0, 0), 3)
          for box in pred_boxes.tolist():
            tmpS += (str(int(box[0] * w)) + ',' + str(int(box[1] * h)) +
                     ',' + str(int(box[2] * w)) + ',' +
                     str(int(box[3] * h)) + '\t')
            cv2.rectangle(frame, (int(box[0] * w), int(box[1] * h)),
                          (int(box[2] * w), int(box[3] * h)),
                          (0, 255, 0), 2)
          tmpS = tmpS[:-1] + '\n'
          f.write(tmpS)
          frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
          line = line.replace('/', '_')
          cv2.imwrite(os.path.join(storePath, line), frame)
      f.close()
      os.system('zip -r tmpDetImgs.zip ' + storePath)
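# The predicted boxes above are normalized to the 256x256 letterboxed frame.
# To report them in original-image pixels, scale by the frame size and then
# undo the resize ratio `r` computed during preprocessing. Because the
# padding is applied only on the bottom/right, no coordinate offset is
# required. An illustrative helper (not part of the original script):
import numpy as np


def boxes_to_original(pred_boxes, r, dst=256):
  """Maps normalized [x1, y1, x2, y2] boxes back to original-image pixels."""
  boxes = np.asarray(pred_boxes, dtype=np.float32) * dst  # letterbox pixels
  return boxes / r  # undo the resize; padding was bottom/right only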
def build_model_graph(features, labels, is_training, params):
  """Builds the forward model graph."""
  use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
  is_gpu_inference = (not is_training and use_batched_nms)
  model_outputs = {}

  if is_training:
    if params['transpose_input']:
      features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])

  batch_size, image_height, image_width, _ = (
      features['images'].get_shape().as_list())

  # Handles the space-to-depth transform.
  conv0_space_to_depth_block_size = 0
  if is_training:
    conv0_space_to_depth_block_size = params[
        'conv0_space_to_depth_block_size']
    image_height *= conv0_space_to_depth_block_size
    image_width *= conv0_space_to_depth_block_size

  if 'source_ids' not in features:
    features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)

  all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                params['num_scales'],
                                params['aspect_ratios'],
                                params['anchor_scale'],
                                (image_height, image_width))

  if 'resnet' in params['backbone']:
    with tf.variable_scope(params['backbone']):
      resnet_fn = resnet.resnet_v1(
          params['backbone'],
          conv0_kernel_size=params['conv0_kernel_size'],
          conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
          num_batch_norm_group=params['num_batch_norm_group'])
      backbone_feats = resnet_fn(features['images'],
                                 (params['is_training_bn'] and is_training))
  elif 'mnasnet' in params['backbone']:
    with tf.variable_scope(params['backbone']):
      _, endpoints = mnasnet_models.build_mnasnet_base(
          features['images'],
          params['backbone'],
          training=(params['is_training_bn'] and is_training),
          override_params={'use_keras': False})
      backbone_feats = {
          2: endpoints['reduction_2'],
          3: endpoints['reduction_3'],
          4: endpoints['reduction_4'],
          5: endpoints['reduction_5'],
      }
  else:
    raise ValueError('Not a valid backbone option: %s' % params['backbone'])

  fpn_feats = fpn.fpn(backbone_feats, params['min_level'],
                      params['max_level'])
  model_outputs.update({
      'fpn_features': fpn_feats,
  })

  rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
      fpn_feats, params['min_level'], params['max_level'],
      len(params['aspect_ratios']) * params['num_scales'])

  if is_training:
    rpn_pre_nms_topn = params['rpn_pre_nms_topn']
    rpn_post_nms_topn = params['rpn_post_nms_topn']
  else:
    rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
    rpn_post_nms_topn = params['test_rpn_post_nms_topn']

  rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
      rpn_score_outputs, rpn_box_outputs, all_anchors,
      features['image_info'], rpn_pre_nms_topn, rpn_post_nms_topn,
      params['rpn_nms_threshold'], params['rpn_min_size'],
      bbox_reg_weights=None, use_batched_nms=use_batched_nms)
  rpn_box_rois = tf.to_float(rpn_box_rois)
  if is_training:
    rpn_box_rois = tf.stop_gradient(rpn_box_rois)
    rpn_box_scores = tf.stop_gradient(rpn_box_scores)

  if is_training:
    # Sampling.
    box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
        training_ops.proposal_label_op(
            rpn_box_rois,
            labels['gt_boxes'],
            labels['gt_classes'],
            features['image_info'],
            batch_size_per_im=params['batch_size_per_im'],
            fg_fraction=params['fg_fraction'],
            fg_thresh=params['fg_thresh'],
            bg_thresh_hi=params['bg_thresh_hi'],
            bg_thresh_lo=params['bg_thresh_lo']))

  # Performs multi-level RoIAlign.
  box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_feats, rpn_box_rois, output_size=7,
      is_gpu_inference=is_gpu_inference)

  class_outputs, box_outputs, _ = heads.box_head(
      box_roi_features, num_classes=params['num_classes'],
      mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

  if not is_training:
    if is_gpu_inference:
      generate_detections_fn = postprocess_ops.generate_detections_gpu
    else:
      generate_detections_fn = postprocess_ops.generate_detections_tpu
    detections = generate_detections_fn(
        class_outputs, box_outputs, rpn_box_rois, features['image_info'],
        params['test_rpn_post_nms_topn'],
        params['test_detections_per_image'], params['test_nms'],
        params['bbox_reg_weights'])

    model_outputs.update({
        'num_detections': detections[0],
        'detection_boxes': detections[1],
        'detection_classes': detections[2],
        'detection_scores': detections[3],
    })
  else:
    encoded_box_targets = training_ops.encode_box_targets(
        rpn_box_rois, box_targets, class_targets,
        params['bbox_reg_weights'])
    model_outputs.update({
        'rpn_score_outputs': rpn_score_outputs,
        'rpn_box_outputs': rpn_box_outputs,
        'class_outputs': class_outputs,
        'box_outputs': box_outputs,
        'class_targets': class_targets,
        'box_targets': encoded_box_targets,
        'box_rois': rpn_box_rois,
    })

  # Faster-RCNN mode.
  if not params['include_mask']:
    return model_outputs

  # Mask sampling.
  if not is_training:
    selected_box_rois = model_outputs['detection_boxes']
    class_indices = model_outputs['detection_classes']
    # If using a GPU for inference, delay the cast until the Gather ops show
    # up, since GPU inference handles floating point better.
    # TODO(laigd): revisit this when newer versions of the GPU libraries are
    # released.
    if not is_gpu_inference:
      class_indices = tf.to_int32(class_indices)
  else:
    (selected_class_targets, selected_box_targets, selected_box_rois,
     proposal_to_label_map) = (
         training_ops.select_fg_for_masks(
             class_targets, box_targets, rpn_box_rois,
             proposal_to_label_map,
             max_num_fg=int(params['batch_size_per_im'] *
                            params['fg_fraction'])))
    class_indices = tf.to_int32(selected_class_targets)

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_feats, selected_box_rois, output_size=14,
      is_gpu_inference=is_gpu_inference)
  mask_outputs = heads.mask_head(
      mask_roi_features,
      class_indices,
      num_classes=params['num_classes'],
      mrcnn_resolution=params['mrcnn_resolution'],
      is_gpu_inference=is_gpu_inference)

  if is_training:
    mask_targets = training_ops.get_mask_targets(
        selected_box_rois, proposal_to_label_map, selected_box_targets,
        labels['cropped_gt_masks'], params['mrcnn_resolution'])
    model_outputs.update({
        'mask_outputs': mask_outputs,
        'mask_targets': mask_targets,
        'selected_class_targets': selected_class_targets,
    })
  else:
    model_outputs.update({
        'detection_masks': tf.nn.sigmoid(mask_outputs),
    })

  return model_outputs
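# The space-to-depth bookkeeping in build_model_graph above exists because
# the input pipeline may apply a space-to-depth transform to the images (an
# H x W x 3 image with block size 2 becomes H/2 x W/2 x 12), which speeds up
# the first convolution on TPU. Anchors must still be generated for the
# original resolution, hence the image_height/width multiplication. A minimal
# sketch of the transform itself (assumes TF 1.x; the helper name is
# illustrative):
import tensorflow as tf


def conv0_space_to_depth_sketch(images, block_size=2):
  """Rearranges [N, H, W, C] -> [N, H/b, W/b, C*b*b] spatial blocks."""
  return tf.nn.space_to_depth(images, block_size=block_size)

# Example: a [8, 1024, 1024, 3] batch becomes [8, 512, 512, 12]; the original
# 1024x1024 extent is recovered for anchor generation by multiplying the
# static shape back by block_size, as the code above does.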
def __call__(self, params):
  image_size = (params['dynamic_image_size']
                if params['dynamic_input_shapes'] else
                (params['image_size'], params['image_size']))
  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'], image_size)
  anchor_labeler = anchors.AnchorLabeler(
      input_anchors, params['num_classes'],
      params['rpn_positive_overlap'], params['rpn_negative_overlap'],
      params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

  if params['dynamic_input_shapes']:
    height_long_side_image_size = image_size[::-1]
    height_long_side_input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'],
        height_long_side_image_size)
    height_long_side_anchor_labeler = anchors.AnchorLabeler(
        height_long_side_input_anchors, params['num_classes'],
        params['rpn_positive_overlap'], params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

  example_decoder = tf_example_decoder.TfExampleDecoder(
      use_instance_mask=True)

  def _dataset_parser(value):
    """Parses data to a fixed-dimension input image and learning targets.

    Args:
      value: A dictionary containing an image and groundtruth annotations.

    Returns:
      image: Image tensor that is preprocessed to have normalized value and
        fixed dimension [image_size, image_size, 3].
      score_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors]. The height_l and width_l
        represent the dimension of RPN score targets at the l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4]. The height_l and width_l
        represent the dimension of bounding box regression targets at the
        l-th level.
      source_id: Source image id. Default value -1 if the source id is empty
        in the groundtruth annotation.
      image_info: a tensor of [scaled_height, scaled_width, image_scale,
        original_height, original_width].
      boxes: Groundtruth bounding box annotations. The box is represented in
        [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
        dimension [self._max_num_instances, 4].
      is_crowds: Groundtruth annotations to indicate if an annotation
        represents a group of instances by value {0, 1}. The tensor is
        padded with 0 to the fixed dimension [self._max_num_instances].
      areas: Groundtruth area annotations. The tensor is padded with -1 to
        the fixed dimension [self._max_num_instances].
      classes: Groundtruth class annotations. The tensor is padded with -1
        to the fixed dimension [self._max_num_instances].
      cropped_gt_masks: Groundtruth masks cropped to the bounding boxes and
        resized to a fixed size determined by params['gt_mask_size'].
""" with tf.name_scope('parser'): data = example_decoder.decode(value) source_id = data['source_id'] image = data['image'] instance_masks = data['groundtruth_instance_masks'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) areas = data['groundtruth_area'] is_crowds = data['groundtruth_is_crowd'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) if not params['use_category']: classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32) if (params['skip_crowd_during_training'] and self._mode == tf.estimator.ModeKeys.TRAIN): indices = tf.where( tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) instance_masks = tf.gather_nd(instance_masks, indices) input_processor = InstanceSegmentationInputProcessor( image, image_size, params['short_side_image_size'], params['long_side_max_image_size'], boxes, classes, instance_masks) input_processor.normalize_image() if (self._mode == tf.estimator.ModeKeys.TRAIN and params['input_rand_hflip']): input_processor.random_horizontal_flip() if self._mode == tf.estimator.ModeKeys.TRAIN: input_processor.set_training_random_scale_factors( params['train_scale_min'], params['train_scale_max']) else: input_processor.set_scale_factors_to_mlperf_reference_size( ) image = input_processor.resize_and_crop_image() boxes, classes = input_processor.resize_and_crop_boxes() instance_masks = input_processor.resize_and_crop_masks() cropped_gt_masks = input_processor.crop_gt_masks( instance_masks, boxes, params['gt_mask_size'], image_size) # Assign anchors. if params['dynamic_input_shapes']: is_height_short_side = tf.less( input_processor._scaled_height, # pylint: disable=protected-access input_processor._scaled_width) # pylint: disable=protected-access score_targets, box_targets = tf.cond( is_height_short_side, lambda: anchor_labeler.label_anchors(boxes, classes), lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes)) # pylint: disable=line-too-long else: score_targets, box_targets = anchor_labeler.label_anchors( boxes, classes) source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id) source_id = tf.string_to_number(source_id) image_scale = input_processor.image_scale_to_original scaled_height = input_processor.get_height_length() scaled_width = input_processor.get_width_length() image_info = tf.stack([ tf.to_float(scaled_height), tf.to_float(scaled_width), image_scale, tf.to_float(input_processor.get_original_height), tf.to_float(input_processor.get_original_width), ]) # Pad groundtruth data for evaluation. boxes *= image_scale is_crowds = tf.cast(is_crowds, dtype=tf.float32) boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4]) is_crowds = pad_to_fixed_size(is_crowds, 0, [self._max_num_instances, 1]) areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1]) classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1]) # Pads cropped_gt_masks. 
      cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                    [self._max_num_instances, -1])
      cropped_gt_masks = pad_to_fixed_size(
          cropped_gt_masks, -1,
          [self._max_num_instances, (params['gt_mask_size'] + 4)**2])
      cropped_gt_masks = tf.reshape(cropped_gt_masks, [
          self._max_num_instances, params['gt_mask_size'] + 4,
          params['gt_mask_size'] + 4
      ])
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)

      return (image, score_targets, box_targets, source_id, image_info,
              boxes, is_crowds, areas, classes, cropped_gt_masks)

  batch_size = params['batch_size'] if 'batch_size' in params else 1
  dataset = tf.data.Dataset.list_files(
      self._file_pattern,
      shuffle=(self._mode == tf.estimator.ModeKeys.TRAIN))
  if self._mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          _prefetch_dataset, cycle_length=32,
          sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))
  if self._mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for the model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)

  if params['dynamic_input_shapes']:

    def key_func(image, *args):
      del args
      return tf.cast(tf.shape(image)[0], dtype=tf.int64)

    def reduce_func(unused_key, dataset):
      return dataset.batch(batch_size, drop_remainder=True)

    dataset = dataset.apply(
        tf.contrib.data.group_by_window(
            key_func=key_func,
            reduce_func=reduce_func,
            window_size=params['global_batch_size']))
  else:
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, score_targets, box_targets, source_ids,
                       image_info, boxes, is_crowds, areas, classes,
                       cropped_gt_masks):
    """Processes one batch of data."""
    # Transposes images from (N, H, W, C) to (H, W, N, C). As the batch size
    # is less than 8, the batch goes to the second minor dimension.
    if (params['transpose_input'] and
        self._mode == tf.estimator.ModeKeys.TRAIN):
      images = tf.transpose(images, [1, 2, 0, 3])

    labels = {}
    for level in range(params['min_level'], params['max_level'] + 1):
      labels['score_targets_%d' % level] = score_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                 axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_info'] = image_info
    labels['cropped_gt_masks'] = cropped_gt_masks
    if self._mode == tf.estimator.ModeKeys.PREDICT:
      features = dict(
          images=images,
          image_info=image_info,
          groundtruth_data=groundtruth_data,
          source_ids=source_ids)
      return features
    elif params['dynamic_input_shapes']:
      # For dynamic input shapes there are two TPU programs; a tf.cond op
      # runs on the host side to decide which TPU program to launch. Since
      # data is prefetched on the device side, the data for evaluating the
      # shape needs to be sent back from device to host. We therefore return
      # the shape of `images` here explicitly to avoid copying the entire
      # `images` tensor back to the host.
      return tf.shape(images), images, labels
    else:
      return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  return dataset
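# The group_by_window trick above buckets examples by their static height so
# that each batch contains only one of the two possible orientations
# (height-long vs. width-long), letting two fixed-shape TPU programs serve
# "dynamic" input shapes. A standalone sketch of the same bucketing (assumes
# TF 1.x contrib; `bucket_by_height` is an illustrative helper, not part of
# the original pipeline):
import tensorflow as tf


def bucket_by_height(dataset, batch_size, window_size):
  """Batches dataset elements so each batch shares the same image height."""

  def key_func(image, *unused_args):
    return tf.cast(tf.shape(image)[0], tf.int64)  # bucket key = height

  def reduce_func(unused_key, windowed):
    return windowed.batch(batch_size, drop_remainder=True)

  return dataset.apply(
      tf.contrib.data.group_by_window(
          key_func=key_func,
          reduce_func=reduce_func,
          window_size=window_size))

# With only two possible heights in the input, every emitted batch is
# uniformly one orientation, which is what lets the host-side tf.cond pick
# the matching TPU program.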