def train(self, train_input_fn, run_eval_after_train=False, eval_input_fn=None):
    """Run distributed training on Mask RCNN model, optionally followed by eval.

    Args:
        train_input_fn: input function handed to the training estimator.
        run_eval_after_train: when true, evaluate the last checkpoint once
            training has finished.
        eval_input_fn: input function for the evaluation pass; required when
            `run_eval_after_train` is true.

    Returns:
        None when no evaluation was requested, otherwise the evaluation
        results returned by `evaluation.evaluate`.

    Raises:
        ValueError: if evaluation is requested without an `eval_input_fn`.
    """
    # Validate before doing any work: a missing eval input_fn is a caller
    # error that should surface immediately, not after the training run.
    if run_eval_after_train and eval_input_fn is None:
        raise ValueError(
            'Eval input_fn must be passed to conduct evaluation after training.'
        )

    self._save_config()

    train_run_config = self.build_strategy_configuration('train')
    train_params = self.build_model_parameters('train')
    train_estimator = self.build_mask_rcnn_estimator(
        train_params, train_run_config, 'train')

    with dump_callback():
        train_estimator.train(
            input_fn=train_input_fn,
            max_steps=self._runtime_config.total_steps,
            hooks=get_training_hooks(
                mode="train",
                model_dir=self._runtime_config.model_dir,
                checkpoint_path=self._runtime_config.checkpoint,
                skip_checkpoint_variables=self._runtime_config.
                skip_checkpoint_variables,
                batch_size=train_params['batch_size'],
                save_summary_steps=self._runtime_config.save_summary_steps,
            ))

    if not run_eval_after_train:
        return None

    eval_run_config = self.build_strategy_configuration('eval')
    eval_params = self.build_model_parameters('eval')
    eval_estimator = self.build_mask_rcnn_estimator(
        eval_params, eval_run_config, 'eval')

    last_ckpt = self.get_last_checkpoint_path()
    logging.info("Restoring parameters from %s\n", last_ckpt)

    eval_results, predictions = evaluation.evaluate(
        eval_estimator,
        eval_input_fn,
        self._runtime_config.eval_samples,
        self._runtime_config.eval_batch_size,
        self._runtime_config.include_mask,
        self._runtime_config.val_json_file,
        report_frequency=self._runtime_config.report_frequency,
        checkpoint_path=last_ckpt)

    output_dir = os.path.join(self._runtime_config.model_dir, 'eval')
    tf.io.gfile.makedirs(output_dir)

    # Summary writer writes out eval metrics.
    self._write_summary(output_dir, eval_results, predictions,
                        self._runtime_config.total_steps)
    return eval_results
def _save(self, session, step):
    """Write a checkpoint for `step` and record it with the summary writer.

    Args:
        session: session whose variables are saved through `self._saver`.
        step: global step used to tag the checkpoint and the session log.

    Nothing is returned.
    """
    save_path = self._save_path
    logging.info("Saving checkpoints for %d into %s.", step, save_path)
    self._saver.save(session, save_path, global_step=step)

    # Record a CHECKPOINT session-log event pointing at the saved path.
    session_log_cls = tf.compat.v1.SessionLog
    checkpoint_event = session_log_cls(status=session_log_cls.CHECKPOINT,
                                       checkpoint_path=save_path)
    self._summary_writer.add_session_log(checkpoint_event, step)
def log_parameters(self):
    """Log every hyperparameter together with the source it was set from.

    One line is emitted per key of `self._params`, in iteration order, with
    the matching entry of `self._params_source` in square brackets.
    """
    entries = [
        name + ': \t' + str(self._params[name]) +
        ' \t[' + self._params_source[name] + ']\n'
        for name in self._params
    ]
    logging.info('\nModel hyperparameters [source]:\n%s', ''.join(entries))
def loadRes(self, detection_results, include_mask, is_image_mask=False):
    """Load detection results and return them as a result API object.

    Args:
        detection_results: a dictionary containing predictions results.
        include_mask: a boolean, whether to include mask in detection results.
        is_image_mask: a boolean, where the predict mask is a whole image mask.

    Returns:
        res: result MaskCOCO api object. When there are no predictions the
            object carries an empty annotation list and no categories.

    Raises:
        AssertionError: if the predictions are not a list, or reference
            image ids that are not part of this dataset.
    """
    res = MaskCOCO()
    res.dataset['images'] = list(self.dataset['images'])
    logging.info('Loading and preparing results...')

    predictions = self.load_predictions(detection_results,
                                        include_mask=include_mask,
                                        is_image_mask=is_image_mask)
    assert isinstance(predictions, list), 'results in not an array of objects'

    if predictions:
        image_ids = {pred['image_id'] for pred in predictions}
        assert image_ids <= set(self.getImgIds()), \
            'Results do not correspond to current coco set'

    if predictions and 'bbox' in predictions[0] and predictions[0]['bbox']:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for idx, pred in enumerate(predictions):
            bb = pred['bbox']
            if 'segmentation' not in pred:
                # Fall back to the box outline as a four-point polygon.
                x1, x2 = bb[0], bb[0] + bb[2]
                y1, y2 = bb[1], bb[1] + bb[3]
                pred['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
            pred['area'] = bb[2] * bb[3]
            pred['id'] = idx + 1
            pred['iscrowd'] = 0
    # `predictions and` keeps an empty result list from raising IndexError
    # on predictions[0].
    elif predictions and 'segmentation' in predictions[0]:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for idx, pred in enumerate(predictions):
            # now only support compressed RLE format as segmentation results
            pred['area'] = maskUtils.area(pred['segmentation'])
            if 'bbox' not in pred:
                pred['bbox'] = maskUtils.toBbox(pred['segmentation'])
            pred['id'] = idx + 1
            pred['iscrowd'] = 0

    res.dataset['annotations'] = predictions
    res.createIndex()
    return res
def compute_model_statistics(batch_size, is_training=True):
    """Profile the current Keras graph and log its GFLOPS per image.

    Args:
        batch_size: divisor applied to the graph's total float operations.
        is_training: selects the "Training" or "Inference" label in the log.

    The figure is only logged; nothing is returned.
    """
    profiler = tf.compat.v1.profiler
    profile_options = profiler.ProfileOptionBuilder.float_operation()
    profile_options['output'] = 'none'

    from tensorflow.python.keras.backend import get_graph

    graph_profile = profiler.profile(get_graph(), options=profile_options)
    flops_per_image = graph_profile.total_float_ops / batch_size

    phase = "Training" if is_training else "Inference"
    logging.info('[%s Compute Statistics] %.1f GFLOPS/image' %
                 (phase, flops_per_image / 1e9))
def __init__(self, runtime_config, model_fn):
    """Initialise the executer and export the process-wide runtime settings.

    Sets the Horovod variables when running distributed, then the TensorFlow
    GPU threading variables (one GPU thread per worker when distributed,
    otherwise a single thread).
    """
    super(EstimatorExecuter, self).__init__(runtime_config, model_fn)

    if MPI_is_distributed():
        os.environ.update({
            'HOROVOD_GPU_ALLREDUCE': 'NCCL',
            'HOROVOD_NUM_NCCL_STREAMS': '1',
        })
        # HOROVOD_AUTOTUNE ('2') is intentionally left unset.
        logging.info("SageMaker Distributed Data Parallel successfully initialized ...")

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

    if MPI_is_distributed():
        gpu_thread_count = str(hvd.size())
    else:
        gpu_thread_count = '1'
    os.environ['TF_GPU_THREAD_COUNT'] = gpu_thread_count

    os.environ['TF_SYNC_ON_FINISH'] = '0'
def _save_config(self):
    """Save parameters to config files if model_dir is defined.

    Writes the runtime configuration to `<model_dir>/params.yaml`, creating
    the directory when needed, then makes a best-effort attempt to export
    the hyperparameters to a tfevent file. A failure of that second step is
    logged and otherwise ignored.
    """
    model_dir = self._runtime_config.model_dir
    if model_dir is None:
        return

    if not tf.io.gfile.exists(model_dir):
        tf.io.gfile.makedirs(model_dir)

    params_io.save_hparams_to_yaml(self._runtime_config,
                                   model_dir + '/params.yaml')

    try:
        from TensorFlow.common.tb_utils import write_hparams_v1
        write_hparams_v1(model_dir, self._runtime_config.values())
        # Prevent performance degradation by creating empty Session.
        with tf.compat.v1.Session():
            pass
    except Exception:
        # Best effort only. `Exception` (not a bare except) so that
        # KeyboardInterrupt and SystemExit still propagate.
        logging.info('Could not save hparams to tfevent file')
def __init__(self, checkpoint_dir, checkpoint_basename="model.ckpt"):
    """Initializes a `CheckpointSaverHook`.

    Args:
        checkpoint_dir: `str`, base directory for the checkpoint files.
        checkpoint_basename: `str`, base name for the checkpoint files.

    The saver, global-step tensor and summary writer start as None and the
    hook starts out uninitialised.
    """
    logging.info("Create CheckpointSaverHook.")

    # Where checkpoints go.
    self._checkpoint_dir = checkpoint_dir
    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)

    # Collaborators that are not available at construction time.
    self._saver = None
    self._global_step_tensor = None
    self._summary_writer = None

    # Bookkeeping.
    self._steps_per_run = 1
    self._is_initialized = False
def __init__(self, runtime_config, model_fn):
    """Initialise the executer, the recipe cache and Horovod.

    Args:
        runtime_config: runtime configuration; `recipe_cache` and `device`
            are read here.
        model_fn: forwarded unchanged to the parent constructor.
    """
    super(EstimatorExecuter, self).__init__(runtime_config, model_fn)

    # Handle recipe cache. Skip if externally set or empty.
    recipe_cache = runtime_config.recipe_cache
    if 'TF_RECIPE_CACHE_PATH' not in os.environ and recipe_cache:
        os.environ['TF_RECIPE_CACHE_PATH'] = recipe_cache

        # Clear previous recipe cache (only one process does the removal).
        if not MPI_is_distributed() or MPI_rank() == 0:
            if os.path.exists(recipe_cache) and os.path.isdir(recipe_cache):
                shutil.rmtree(recipe_cache)

    gpu_thread_count = '1'

    if MPI_is_distributed():
        os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
        os.environ['HOROVOD_NUM_NCCL_STREAMS'] = '1'
        # os.environ['HOROVOD_AUTOTUNE'] = '2'

        # A dedicated local name is used on purpose: assigning to `hvd` here
        # would make `hvd` local to this function, so the module-level
        # `hvd.init()` on the non-HPU path would raise UnboundLocalError.
        if runtime_config.device == "HPU":
            from TensorFlow.common.horovod_helpers import hvd_init, Framework
            horovod = hvd_init(framework=Framework.TENSORFLOW)
        else:
            horovod = hvd
            horovod.init()

        # Other ranks should wait for recipe cache to be removed.
        # This operation can't be done before hvd_init.
        from mpi4py import MPI
        MPI.COMM_WORLD.Barrier()

        logging.info("Horovod successfully initialized ...")
        gpu_thread_count = str(horovod.size())

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = gpu_thread_count
    os.environ['TF_SYNC_ON_FINISH'] = '0'
def eval(self, eval_input_fn):
    """Run distributed eval on Mask RCNN model.

    Evaluates the last checkpoint reported by `get_last_checkpoint_path`;
    when none exists the evaluation still runs, with the step recorded as 0.

    Args:
        eval_input_fn: input function passed to `evaluation.evaluate`.

    Returns:
        The evaluation results returned by `evaluation.evaluate`.
    """
    summary_dir = os.path.join(self._runtime_config.model_dir, 'eval')
    tf.io.gfile.makedirs(summary_dir)

    # Summary writer writes out eval metrics.
    strategy_config = self.build_strategy_configuration('eval')
    model_params = self.build_model_parameters('eval')
    estimator = self.build_mask_rcnn_estimator(model_params, strategy_config,
                                               'eval')

    logging.info('Starting to evaluate.')

    checkpoint = self.get_last_checkpoint_path()
    if checkpoint is None:
        logging.warning(
            "Could not find trained model in model_dir: `%s`, running initialization to predict\n"
            % self._runtime_config.model_dir)
        step = 0
    else:
        logging.info("Restoring parameters from %s\n" % checkpoint)
        # The step is the second dash-separated field of the file name.
        step = int(os.path.basename(checkpoint).split('-')[1])

    results, predictions = evaluation.evaluate(
        estimator,
        eval_input_fn,
        self._runtime_config.eval_samples,
        self._runtime_config.eval_batch_size,
        self._runtime_config.include_mask,
        self._runtime_config.val_json_file,
        checkpoint_path=checkpoint)

    self._write_summary(summary_dir, results, predictions, step)

    if step >= self._runtime_config.total_steps:
        logging.info('Evaluation finished after training step %d' % step)

    return results
FLAGS.data_dir, "train*.tfrecord"), mode=tf.estimator.ModeKeys.TRAIN, use_fake_data=FLAGS.use_synthetic_data, use_instance_mask=True, seed=FLAGS.seed) else: input_dataset = InputReader(file_pattern=os.path.join( FLAGS.data_dir, "val*.tfrecord"), mode=tf.estimator.ModeKeys.PREDICT, num_examples=5000, use_fake_data=FLAGS.use_synthetic_data, use_instance_mask=True, seed=FLAGS.seed) logging.info("[*] Executing Benchmark in %s mode" % ("training" if FLAGS.training else "inference")) logging.info("[*] Benchmark using %s data" % ("synthetic" if FLAGS.use_synthetic_data else "real")) time.sleep(1) # Build the data input dataset = input_dataset( params={ "anchor_scale": 8.0, "aspect_ratios": [[1.0, 1.0], [1.4, 0.7], [0.7, 1.4]], "batch_size": FLAGS.batch_size, "gt_mask_size": 112, "image_size": [1024, 1024], "include_groundtruth_in_features": False, "augment_input_data": True,
def train_and_eval(self, train_input_fn, eval_input_fn):
    """Run distributed train and eval on Mask RCNN model.

    Training is split into cycles: each cycle trains up to the next multiple
    of `num_steps_per_eval` (capped at `total_steps`) and is followed by an
    evaluation of the latest checkpoint in `model_dir`.

    Args:
        train_input_fn: input function passed to the training estimator.
        eval_input_fn: input function passed to `evaluation.evaluate`.

    Returns:
        The evaluation results of the last cycle. The value stays None in a
        process that never runs the evaluation branch (distributed ranks
        other than 0) or when there are no cycles.
    """
    self._save_config()

    output_dir = os.path.join(self._runtime_config.model_dir, 'eval')
    tf.io.gfile.makedirs(output_dir)

    train_run_config = self.build_strategy_configuration('train')
    train_params = self.build_model_parameters('train')
    train_estimator = self.build_mask_rcnn_estimator(train_params, train_run_config, 'train')

    # The eval estimator is built lazily, on the first evaluation cycle.
    eval_estimator = None
    eval_results = None

    num_cycles = math.ceil(self._runtime_config.total_steps / self._runtime_config.num_steps_per_eval)

    # The same hooks are reused for every training cycle.
    training_hooks = get_training_hooks(
        mode="train",
        model_dir=self._runtime_config.model_dir,
        checkpoint_path=self._runtime_config.checkpoint,
        skip_checkpoint_variables=self._runtime_config.skip_checkpoint_variables
    )

    for cycle in range(1, num_cycles + 1):

        if not MPI_is_distributed() or MPI_rank() == 0:
            print()  # Visual Spacing
            logging.info("=================================")
            logging.info(' Start training cycle %02d' % cycle)
            logging.info("=================================\n")

        # Absolute step at which this cycle stops; never beyond total_steps.
        max_cycle_step = min(int(cycle * self._runtime_config.num_steps_per_eval), self._runtime_config.total_steps)

        # Profiling is hard-disabled; flip this flag to trace with tfprof.
        PROFILER_ENABLED = False

        if (not MPI_is_distributed() or MPI_rank() == 0) and PROFILER_ENABLED:
            profiler_context_manager = tf.contrib.tfprof.ProfileContext
        else:
            from contextlib import suppress
            profiler_context_manager = lambda *args, **kwargs: suppress()  # No-Op context manager

        with profiler_context_manager(
                '/workspace/profiling/',
                trace_steps=range(100, 200, 3),
                dump_steps=[200]
        ) as pctx:

            if (not MPI_is_distributed() or MPI_rank() == 0) and PROFILER_ENABLED:
                opts = tf.compat.v1.profiler.ProfileOptionBuilder.time_and_memory()
                pctx.add_auto_profiling('op', opts, [150, 200])

            train_estimator.train(
                input_fn=train_input_fn,
                max_steps=max_cycle_step,
                hooks=training_hooks,
            )

        # Evaluation runs in a single process: the only one when not
        # distributed, rank 0 otherwise.
        if not MPI_is_distributed() or MPI_rank() == 0:
            print()  # Visual Spacing
            logging.info("=================================")
            logging.info(' Start evaluation cycle %02d' % cycle)
            logging.info("=================================\n")

            if eval_estimator is None:
                eval_run_config = self.build_strategy_configuration('eval')
                eval_params = self.build_model_parameters('eval')
                eval_estimator = self.build_mask_rcnn_estimator(eval_params, eval_run_config, 'eval')

            last_ckpt = tf.train.latest_checkpoint(self._runtime_config.model_dir, latest_filename=None)
            logging.info("Restoring parameters from %s\n" % last_ckpt)

            eval_results, predictions = evaluation.evaluate(
                eval_estimator,
                eval_input_fn,
                self._runtime_config.eval_samples,
                self._runtime_config.eval_batch_size,
                self._runtime_config.include_mask,
                self._runtime_config.val_json_file,
                report_frequency=self._runtime_config.report_frequency
            )

            self._write_summary(output_dir, eval_results, predictions, max_cycle_step)

        if MPI_is_distributed():
            from mpi4py import MPI
            comm = hvd.get_worker_comm()
            comm.Barrier()  # Waiting for all MPI processes to sync

    return eval_results
def _get_session_config(mode, use_xla, use_amp, use_tf_distributed=False, allow_xla_at_inference=False):
    """Build the session `ConfigProto` for a training or evaluation run.

    Args:
        mode: either 'train' or 'eval'.
        use_xla: enable XLA JIT (level ON_1); applied in 'train' mode, and in
            'eval' mode only when `allow_xla_at_inference` is set.
        use_amp: enable the auto mixed precision graph rewrite.
        use_tf_distributed: true when a TF distribution strategy is used;
            turns off `force_gpu_compatible` and the per-rank visible device.
        allow_xla_at_inference: permit XLA outside of training.

    Returns:
        The configured `tf.compat.v1.ConfigProto`.
    """
    assert mode in ('train', 'eval')

    # Only the meta-optimizer iteration count is overridden. Every other
    # Grappler setting is left at its default (arithmetic_optimization,
    # constant_folding, debug_stripper, dependency_optimization,
    # disable_model_pruning [incompatible with AMP], function_optimization,
    # implementation_selector, loop_optimization, memory_optimization,
    # pin_to_host_optimization, remapping, scoped_allocator_optimization,
    # shape_optimization).
    grappler_options = rewriter_config_pb2.RewriterConfig(
        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.TWO,
    )

    if use_amp:
        logging.info("[%s] AMP is activated - Experiment Feature" % mode)
        grappler_options.auto_mixed_precision = True

    # infer_shapes is deliberately not enabled in GraphOptions: it heavily
    # drops throughput (by 30%).
    graph_options = tf.compat.v1.GraphOptions(rewrite_options=grappler_options)
    session_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        graph_options=graph_options,
    )

    # Pinned memory is forced unless a TF distribution strategy is in charge.
    session_config.gpu_options.force_gpu_compatible = not use_tf_distributed
    if not use_tf_distributed and MPI_is_distributed():
        session_config.gpu_options.visible_device_list = str(MPI_local_rank())

    xla_allowed = mode == "train" or allow_xla_at_inference
    if use_xla and xla_allowed:
        logging.info("[%s] XLA is activated - Experiment Feature" % mode)
        jit_level = tf.compat.v1.OptimizerOptions.ON_1
        session_config.graph_options.optimizer_options.global_jit_level = jit_level

    if mode != 'train':
        return session_config

    session_config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    if MPI_is_distributed():
        threads_per_worker = multiprocessing.cpu_count() // hvd.local_size()
        session_config.inter_op_parallelism_threads = max(2, threads_per_worker)
    elif not use_tf_distributed:
        session_config.inter_op_parallelism_threads = 4

    return session_config
def after_create_session(self, session, coord=None):
    """Run the weight-initialisation op once, on the first created session.

    Args:
        session: session used to run `self._init_op`.
        coord: unused.
    """
    if self._is_initialized:
        return

    session.run(self._init_op, feed_dict=self._init_feed_dict)
    logging.info("Pretrained weights loaded with success...\n")
    self._is_initialized = True
def compute_coco_eval_metric(predictor,
                             num_batches=-1,
                             include_mask=True,
                             annotation_json_file="",
                             eval_batch_size=-1,
                             report_frequency=None):
    """Compute COCO eval metric given a prediction generator.

    Args:
        predictor: a generator that iteratively pops a dictionary of
            predictions with the format compatible with COCO eval tool.
        num_batches: the number of batches to be aggregated in eval. This is
            how many times that the predictor gets pulled. A negative value
            pulls until the predictor is exhausted.
        include_mask: a boolean that indicates whether we include the mask
            eval.
        annotation_json_file: the annotation json file of the eval dataset.
            An empty string means the groundtruth is extracted from the
            predictions instead.
        eval_batch_size: per-process batch size, used only to compute the
            reported throughput.
        report_frequency: when set, intermediate eval results are logged
            every `report_frequency` batches.

    Returns:
        A tuple `(eval_results, predictions)`: the aggregated COCO metric
        eval results and the dictionary of accumulated predictions.
    """
    if annotation_json_file == "":
        annotation_json_file = None

    use_groundtruth_from_json = annotation_json_file is not None

    predictions = dict()
    batch_idx = 0

    if use_groundtruth_from_json:
        eval_metric = coco_metric.EvaluationMetric(annotation_json_file,
                                                   include_mask=include_mask)
    else:
        eval_metric = coco_metric.EvaluationMetric(filename=None,
                                                   include_mask=include_mask)

    def evaluation_preds(preds):
        """Concatenate the per-batch predictions and run the COCO metric."""
        # Essential to avoid modifying the source dict
        _preds = copy.deepcopy(preds)

        for k in _preds:
            _preds[k] = np.concatenate(_preds[k], axis=0)

        if 'orig_images' in _preds and _preds['orig_images'].shape[0] > 10:
            # Only samples a few images for visualization.
            _preds['orig_images'] = _preds['orig_images'][:10]

        if use_groundtruth_from_json:
            return eval_metric.predict_metric_fn(_preds)

        images, annotations = coco_utils.extract_coco_groundtruth(
            _preds, include_mask)
        coco_dataset = coco_utils.create_coco_format_dataset(
            images, annotations)
        return eval_metric.predict_metric_fn(_preds,
                                             groundtruth_data=coco_dataset)

    # Take into account cuDNN & Tensorflow warmup
    # Drop N first steps for avg throughput calculation
    BURNIN_STEPS = 100
    model_throughput_list = list()
    inference_time_list = list()

    def latency_percentile(fraction):
        """Return the latency at `fraction` of the sorted step times.

        Equals the largest of the first `int(len * fraction)` sorted samples,
        but always uses at least one sample so short runs do not fail, and
        yields NaN when no step was timed at all.
        """
        if not inference_time_list:
            return float('nan')
        count = max(1, int(len(inference_time_list) * fraction))
        return inference_time_list[count - 1]

    if MPI_is_distributed():
        eval_batch_size *= MPI_size()
        num_batches /= MPI_size()

    while num_batches < 0 or batch_idx < num_batches:

        try:
            step_t0 = time.time()
            step_predictions = next(predictor)
            if MPI_is_distributed():
                from mpi4py import MPI
                # Need to get time for all predictors for given batch_idx
                MPI.COMM_WORLD.Barrier()
            batch_time = time.time() - step_t0

            throughput = eval_batch_size / batch_time
            model_throughput_list.append(throughput)
            inference_time_list.append(batch_time)

            logging.info(
                'Running inference on batch %03d/%03d... - Step Time: %.4fs - Throughput: %.1f imgs/s'
                % (batch_idx + 1, num_batches, batch_time, throughput))

        except StopIteration:
            logging.info('Get StopIteration at %d batch.' % (batch_idx + 1))
            break

        step_predictions = process_prediction_for_eval(step_predictions)

        for k, v in step_predictions.items():
            predictions.setdefault(k, []).append(v)

        batch_idx = batch_idx + 1

        # Log intermediate results every `report_frequency` batches, i.e.
        # every eval_batch_size * report_frequency samples.
        if report_frequency and batch_idx % report_frequency == 0:
            eval_results = evaluation_preds(preds=predictions)
            logging.info('Eval results: %s' %
                         pprint.pformat(eval_results, indent=4))

    if MPI_is_distributed():
        from mpi4py import MPI
        all_predictions = MPI.COMM_WORLD.gather(predictions, root=0)
        # FIXME: first gather is calling MPI_FINALIZE causing crash
        MPI.COMM_WORLD.Barrier()

        if MPI_rank() == 0:
            # Merge the per-rank predictions into a single dictionary.
            predictions.clear()
            for pred in all_predictions:
                for k in pred:
                    if k not in predictions:
                        predictions[k] = pred[k]
                    else:
                        predictions[k].extend(pred[k])

    inference_time_list.sort()
    eval_results = evaluation_preds(preds=predictions)

    if not MPI_is_distributed() or MPI_rank() == 0:
        if inference_time_list:
            average_time = np.mean(inference_time_list)
        else:
            average_time = float('nan')
        latency_50 = latency_percentile(0.5)
        latency_90 = latency_percentile(0.90)
        latency_95 = latency_percentile(0.95)
        latency_99 = latency_percentile(0.99)
        latency_100 = latency_percentile(1)

        print()  # Visual Spacing
        logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")
        logging.info(" Evaluation Performance Summary ")
        logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")

        # Total wall-clock time is the sum of the per-step latencies (the
        # throughput list holds imgs/s and must not be summed as seconds).
        total_seconds = float(np.sum(inference_time_list))
        total_processing_hours, rem = divmod(total_seconds, 3600)
        total_processing_minutes, total_processing_seconds = divmod(rem, 60)

        if len(model_throughput_list) > BURNIN_STEPS:
            # Take into account cuDNN & Tensorflow warmup
            # Drop N first steps for avg throughput calculation
            # Also drop last step which may have a different batch size
            avg_throughput = np.mean(model_throughput_list[BURNIN_STEPS:-1])
        else:
            avg_throughput = -1.

        print()  # Visual Spacing
        logging.info("Average throughput: {throughput:.1f} samples/sec".format(
            throughput=avg_throughput))
        logging.info("Inference Latency Average (s) = {avg:.4f}".format(
            avg=average_time))
        logging.info(
            "Inference Latency 50% (s) = {cf_50:.4f}".format(cf_50=latency_50))
        logging.info("Inference Latency 90% (s) = {cf_90:.4f}".format(
            cf_90=latency_90))
        logging.info("Inference Latency 95% (s) = {cf_95:.4f}".format(
            cf_95=latency_95))
        logging.info("Inference Latency 99% (s) = {cf_99:.4f}".format(
            cf_99=latency_99))
        logging.info("Inference Latency 100% (s) = {cf_100:.4f}".format(
            cf_100=latency_100))
        logging.info("Total processed steps: {total_steps}".format(
            total_steps=len(model_throughput_list)))
        logging.info(
            "Total processing time: {hours}h {minutes:02d}m {seconds:02d}s".
            format(hours=int(total_processing_hours),
                   minutes=int(total_processing_minutes),
                   seconds=int(total_processing_seconds)))

        dllogger.log(step=(),
                     data={"avg_inference_throughput": avg_throughput},
                     verbosity=Verbosity.DEFAULT)
        # NOTE(review): despite the key name this is the total processing
        # time in whole seconds, as before - confirm what consumers expect.
        avg_inference_time = float(total_processing_hours * 3600 +
                                   int(total_processing_minutes) * 60 +
                                   int(total_processing_seconds))
        dllogger.log(step=(),
                     data={"avg_inference_time": avg_inference_time},
                     verbosity=Verbosity.DEFAULT)

        logging.info("==================== Metrics ====================")
        for key, value in sorted(eval_results.items(),
                                 key=operator.itemgetter(0)):
            logging.info("%s: %.9f" % (key, value))
        print()  # Visual Spacing

    return eval_results, predictions
def multilevel_propose_rois(scores_outputs,
                            box_outputs,
                            all_anchors,
                            image_info,
                            rpn_pre_nms_topn,
                            rpn_post_nms_topn,
                            rpn_nms_threshold,
                            rpn_min_size,
                            bbox_reg_weights,
                            use_batched_nms=False):
    """Proposes RoIs given a group of candidates from different FPN levels.

    Proposals are generated per level, concatenated across levels, and the
    best `rpn_post_nms_topn` (or fewer, if fewer exist) are kept.

    Args:
        scores_outputs: an OrderDict with keys representing levels and values
            representing logits in [batch_size, height, width, num_anchors].
        box_outputs: an OrderDict with keys representing levels and values
            representing box regression targets in
            [batch_size, height, width, num_anchors * 4].
        all_anchors: an Anchors object that contains the all anchors.
        image_info: a tensor of shape [batch_size, 5] whose columns encode
            the input image's [height, width, scale, original_height,
            original_width]. Only the first three columns are read here.
        rpn_pre_nms_topn: number of top scoring RPN proposals to keep before
            applying NMS. This is *per FPN level* (not total).
        rpn_post_nms_topn: number of top scoring RPN proposals to keep after
            applying NMS. This is the total number of proposals produced.
        rpn_nms_threshold: a float between 0 and 1, the NMS threshold used on
            RPN proposals.
        rpn_min_size: minimum proposal height and width, at original image
            scale.
        bbox_reg_weights: None or a list of four weights used when decoding
            the box.
        use_batched_nms: selects `_propose_rois_gpu` instead of
            `_propose_rois_tpu` for the per-level proposals.

    Returns:
        top_k_scores: scores of the selected proposals.
        top_k_rois: boxes of the selected proposals (documented upstream as
            normalized [ymin, xmin, ymax, xmax] - TODO confirm).
    """
    with tf.name_scope('multilevel_propose_rois'):
        per_level_scores = []
        per_level_rois = []

        anchor_boxes = all_anchors.get_unpacked_boxes()

        # Each image_info column is lifted to rank 3 for the per-level ops.
        height = tf.expand_dims(image_info[:, 0:1], axis=-1)
        width = tf.expand_dims(image_info[:, 1:2], axis=-1)
        scale = tf.expand_dims(image_info[:, 2:3], axis=-1)

        for level in scores_outputs.keys():

            with tf.name_scope('level_%d' % level) as scope:
                level_shape = scores_outputs[level].get_shape().as_list()
                batch_size, feature_h, feature_w, anchors_per_location = level_shape
                num_boxes = feature_h * feature_w * anchors_per_location

                level_scores = tf.sigmoid(
                    tf.reshape(scores_outputs[level], [batch_size, num_boxes]))
                level_boxes = tf.reshape(box_outputs[level],
                                         [batch_size, num_boxes, 4])

                # Replicate the level's anchors for every image in the batch.
                tiled_anchors = (tf.expand_dims(anchor_boxes[level], axis=0) *
                                 tf.ones([batch_size, 1, 1, 1]))
                level_anchors = tf.cast(
                    tf.reshape(tiled_anchors, [batch_size, num_boxes, 4]),
                    dtype=level_scores.dtype)

                # TODO: Remove when Batched NMS stop leading to eval metrics being all 0
                if use_batched_nms:
                    logging.info("[ROI OPs] Using Batched NMS... Scope: %s" % scope)
                    propose_rois_fn = _propose_rois_gpu
                else:
                    logging.debug("[ROI OPs] Not Using Batched NMS... Scope: %s" % scope)
                    propose_rois_fn = _propose_rois_tpu

                level_scores, level_boxes = propose_rois_fn(
                    level_scores, level_boxes, level_anchors, height, width,
                    scale, rpn_pre_nms_topn, rpn_post_nms_topn,
                    rpn_nms_threshold, rpn_min_size, bbox_reg_weights)

                per_level_scores.append(level_scores)
                per_level_rois.append(level_boxes)

        all_scores = tf.concat(per_level_scores, axis=1)
        all_rois = tf.concat(per_level_rois, axis=1)

        with tf.name_scope('roi_post_nms_topk'):
            # Never ask for more proposals than were actually produced.
            available = all_scores.shape[1]
            keep = min(available, rpn_post_nms_topn)

            top_k_scores, top_k_boxes_list = box_utils.top_k(
                all_scores, k=keep, boxes_list=[all_rois])
            top_k_rois = top_k_boxes_list[0]

    return top_k_scores, top_k_rois
def call(self, inputs, **kwargs):
    """Run the mask head and keep, for every RoI, the mask of its class.

    Args:
        inputs: a rank-5 tensor unpacked as
            [batch_size, num_rois, height, width, filters].
        **kwargs: unused.

    Returns:
        mask_outputs: the per-RoI mask predictions selected with
            `self._class_indices`. Most paths end with an explicit reshape to
            [batch_size, num_rois, mrcnn_resolution, mrcnn_resolution]; the
            CPU-offload multi-image path returns the `gather_nd` result
            directly (presumably the same shape - TODO confirm).
    """
    batch_size, num_rois, height, width, filters = inputs.get_shape(
    ).as_list()

    # Fold the RoI dimension into the batch dimension for the conv stack.
    net = tf.reshape(inputs, [-1, height, width, filters])

    for conv_id in range(4):
        net = self._conv_stage1[conv_id](net)

    net = self._conv_stage2(net)

    mask_outputs = self._conv_stage3(net)

    # [SW-34925] Workaround to offload ScatterNd Op to CPU
    if self._offload_post_proc:
        logging.info("[Mask Head] Running post processing on CPU")
        with tf.device('CPU:0'):
            mask_outputs = tf.reshape(mask_outputs, [
                -1, num_rois, self._mrcnn_resolution,
                self._mrcnn_resolution, self._num_classes
            ])

            with tf.name_scope('masks_post_processing'):
                # Move the class axis in front of the two mask axes.
                mask_outputs = tf.transpose(a=mask_outputs,
                                            perm=[0, 1, 4, 2, 3])

                # Indices are built in float32 for GPU inference and cast to
                # int32 before they are used for gathering.
                indices_dtype = tf.float32 if self._is_gpu_inference else tf.int32

                if batch_size == 1:
                    # Flat index into the merged (roi, class) axis:
                    # roi * num_classes + class.
                    indices = tf.reshape(
                        tf.reshape(tf.range(num_rois, dtype=indices_dtype),
                                   [batch_size, num_rois, 1]) *
                        self._num_classes +
                        tf.expand_dims(self._class_indices, axis=-1),
                        [batch_size, -1])
                    indices = tf.cast(indices, tf.int32)

                    mask_outputs = tf.gather(tf.reshape(
                        mask_outputs, [
                            batch_size, -1, self._mrcnn_resolution,
                            self._mrcnn_resolution
                        ]),
                                             indices,
                                             axis=1)

                    mask_outputs = tf.squeeze(mask_outputs, axis=1)
                    mask_outputs = tf.reshape(mask_outputs, [
                        batch_size, num_rois, self._mrcnn_resolution,
                        self._mrcnn_resolution
                    ])

                else:
                    batch_indices = (
                        tf.expand_dims(tf.range(batch_size,
                                                dtype=indices_dtype),
                                       axis=1) *
                        tf.ones([1, num_rois], dtype=indices_dtype))

                    mask_indices = (tf.expand_dims(
                        tf.range(num_rois, dtype=indices_dtype), axis=0) *
                                    tf.ones([batch_size, 1],
                                            dtype=indices_dtype))

                    # (batch, roi, class) triplets addressing the transposed
                    # rank-5 tensor.
                    gather_indices = tf.stack(
                        [batch_indices, mask_indices, self._class_indices],
                        axis=2)

                    if self._is_gpu_inference:
                        gather_indices = tf.cast(gather_indices,
                                                 dtype=tf.int32)

                    mask_outputs = tf.gather_nd(mask_outputs, gather_indices)

        return mask_outputs
    else:
        mask_outputs = tf.reshape(mask_outputs, [
            -1, num_rois, self._mrcnn_resolution, self._mrcnn_resolution,
            self._num_classes
        ])

        with tf.name_scope('masks_post_processing'):
            # Move the class axis in front of the two mask axes.
            mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3])

            # Indices are built in float32 for GPU inference and cast to
            # int32 before they are used for gathering.
            indices_dtype = tf.float32 if self._is_gpu_inference else tf.int32

            if batch_size == 1:
                # Flat index into the merged (roi, class) axis:
                # roi * num_classes + class.
                indices = tf.reshape(
                    tf.reshape(tf.range(num_rois, dtype=indices_dtype),
                               [batch_size, num_rois, 1]) *
                    self._num_classes +
                    tf.expand_dims(self._class_indices, axis=-1),
                    [batch_size, -1])
                indices = tf.cast(indices, tf.int32)

                mask_outputs = tf.gather(tf.reshape(
                    mask_outputs, [
                        batch_size, -1, self._mrcnn_resolution,
                        self._mrcnn_resolution
                    ]),
                                         indices,
                                         axis=1)

                mask_outputs = tf.squeeze(mask_outputs, axis=1)
                mask_outputs = tf.reshape(mask_outputs, [
                    batch_size, num_rois, self._mrcnn_resolution,
                    self._mrcnn_resolution
                ])

            else:
                batch_indices = (tf.expand_dims(
                    tf.range(batch_size, dtype=indices_dtype), axis=1) *
                                 tf.ones([1, num_rois], dtype=indices_dtype))

                mask_indices = (
                    tf.expand_dims(tf.range(num_rois, dtype=indices_dtype),
                                   axis=0) *
                    tf.ones([batch_size, 1], dtype=indices_dtype))

                # Unlike the CPU-offload path, batch and RoI are merged into
                # one flat index so gather_nd runs on (flat_roi, class) pairs.
                mask_indices = batch_indices * num_rois + mask_indices

                gather_indices = tf.stack(
                    [mask_indices, self._class_indices], axis=2)
                gather_indices = tf.reshape(gather_indices,
                                            [batch_size * num_rois, 2])

                if self._is_gpu_inference:
                    gather_indices = tf.cast(gather_indices, dtype=tf.int32)

                mask_outputs = tf.reshape(mask_outputs, [
                    batch_size * num_rois, self._num_classes,
                    self._mrcnn_resolution, self._mrcnn_resolution
                ])
                mask_outputs = tf.gather_nd(mask_outputs, gather_indices)
                mask_outputs = tf.reshape(mask_outputs, [
                    batch_size, num_rois, self._mrcnn_resolution,
                    self._mrcnn_resolution
                ])

        return mask_outputs
def load_predictions(self, detection_results, include_mask, is_image_mask=False):
    """Create prediction dictionary list from detection and mask results.

    Args:
        detection_results: a dictionary containing numpy arrays which
            corresponds to prediction results.
        include_mask: a boolean, whether to include mask in detection results.
        is_image_mask: a boolean, where the predict mask is a whole image mask.

    Returns:
        a list of dictionaries, one per detection, with the keys `image_id`,
        `bbox`, `score`, `category_id` and, when `include_mask` is set,
        `segmentation`.
    """
    results = []
    total_slots = detection_results['detection_scores'].size
    processed = 0

    for image_index, image_id in enumerate(detection_results['source_id']):

        if include_mask:
            image_boxes = detection_results['detection_boxes'][image_index]
            segments = generate_segmentation_from_masks(
                detection_results['detection_masks'][image_index],
                image_boxes,
                int(detection_results['image_info'][image_index][3]),
                int(detection_results['image_info'][image_index][4]),
                is_image_mask=is_image_mask)

            # Convert the mask to uint8 and then to fortranarray for RLE encoder.
            encoded_masks = []
            for instance_mask in segments:
                fortran_mask = np.asfortranarray(instance_mask.astype(np.uint8))
                encoded_masks.append(maskUtils.encode(fortran_mask))

        detections_in_image = int(detection_results['num_detections'][image_index])
        for box_index in range(detections_in_image):
            # Progress report every 1000 detections.
            if processed % 1000 == 0:
                logging.info('{}/{}'.format(processed, total_slots))
            processed += 1

            entry = {
                'image_id': int(image_id),
                'bbox': detection_results['detection_boxes'][image_index][box_index].tolist(),
                'score': detection_results['detection_scores'][image_index][box_index],
                'category_id': int(detection_results['detection_classes'][image_index][box_index]),
            }
            if include_mask:
                entry['segmentation'] = encoded_masks[box_index]

            results.append(entry)

    return results
def __call__(self, params, input_context=None):
    """Build the `tf.data` input pipeline for the configured mode.

    Args:
      params: mapping of runtime parameters. `batch_size` (defaults to 1
        when absent) and `seed` (optional) are read here, and the whole
        mapping is passed on to `self._create_dataset_parser_fn`.
      input_context: optional object exposing `num_input_pipelines` and
        `input_pipeline_id` (presumably a `tf.distribute.InputContext` --
        TODO confirm). When given in TRAIN mode it decides how the input
        files are sharded.

    Returns:
      The batched dataset with the `tf.data.Options` configured below
      applied.
    """
    batch_size = params['batch_size'] if 'batch_size' in params else 1

    # Under MPI the configured seed is multiplied by the rank so that the
    # workers shuffle differently. A missing seed (KeyError) or a None
    # seed multiplied by the rank (TypeError) falls back to unseeded
    # shuffling.
    # NOTE(review): if MPI_rank() is 0 on the first worker, that worker
    # always ends up with seed 0 regardless of the configured value --
    # confirm this is intended.
    try:
        seed = params['seed'] if not MPI_is_distributed(
        ) else params['seed'] * MPI_rank()
    except (KeyError, TypeError):
        seed = None

    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)

    if self._mode == tf.estimator.ModeKeys.TRAIN:
        # (shard_index, num_shards) when running distributed, else None.
        # This is decided explicitly instead of catching NameError on
        # unbound locals, which would also hide genuine NameErrors and
        # silently disable sharding.
        shard_spec = None

        if input_context is not None:
            logging.info("Using Dataset Sharding with TF Distributed")
            shard_spec = (input_context.input_pipeline_id,
                          input_context.num_input_pipelines)

        elif MPI_is_distributed():
            logging.info("Using Dataset Sharding with Horovod")
            shard_spec = tuple(MPI_rank_and_size())

        if shard_spec is not None:
            shard_idx, num_shards = shard_spec
            dataset = dataset.shard(num_shards=num_shards, index=shard_idx)
            # Shuffle the file names of this shard; the buffer shrinks
            # with the number of shards.
            dataset = dataset.shuffle(math.ceil(256 / num_shards))

    def _prefetch_dataset(filename):
        return tf.data.TFRecordDataset(filename).prefetch(1)

    # Read several record files concurrently.
    dataset = dataset.interleave(
        map_func=_prefetch_dataset,
        cycle_length=32,
        block_length=64,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )

    if self._num_examples is not None and self._num_examples > 0:
        logging.info("[*] Limiting the amount of sample to: %d",
                     self._num_examples)
        dataset = dataset.take(self._num_examples)

    # Cache the raw (still serialized) records; parsing happens after.
    dataset = dataset.cache()

    if self._mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=4096,
                                  reshuffle_each_iteration=True,
                                  seed=seed)
        dataset = dataset.repeat()

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(
        map_func=self._create_dataset_parser_fn(params),
        num_parallel_calls=16,
    )

    dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

    if self._use_fake_data:
        # Turn this dataset into a semi-fake dataset which always loops on
        # the first batch. This reduces variance in performance and is
        # useful in testing.
        logging.info("Using Fake Dataset Loop...")
        dataset = dataset.take(1).cache().repeat()

    if self._mode != tf.estimator.ModeKeys.TRAIN:
        # Cap non-training runs at 5000 examples' worth of batches
        # (presumably the evaluation set size -- TODO confirm).
        dataset = dataset.take(int(5000 / batch_size))

    dataset = dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE, )

    if not tf.distribute.has_strategy():
        dataset = dataset.apply(
            tf.data.experimental.prefetch_to_device(
                '/gpu:0',  # With Horovod the local GPU is always 0
                buffer_size=1,
            ))

    data_options = tf.data.Options()

    # Deterministic element order is only requested when a seed is set.
    data_options.experimental_deterministic = seed is not None

    # Sharding is handled manually above, so automatic sharding is turned
    # off. The option name depends on the TensorFlow version.
    if LooseVersion(tf.__version__) <= LooseVersion("2.0.0"):
        data_options.experimental_distribute.auto_shard = False
    else:
        data_options.experimental_distribute.auto_shard_policy = (
            tf.data.experimental.AutoShardPolicy.OFF)

    data_options.experimental_slack = True

    data_options.experimental_threading.max_intra_op_parallelism = 1

    # ================= experimental_optimization ================= #

    # data_options.experimental_optimization.apply_default_optimizations = False
    # data_options.experimental_optimization.autotune = True
    data_options.experimental_optimization.filter_fusion = True
    data_options.experimental_optimization.map_and_batch_fusion = True
    data_options.experimental_optimization.map_and_filter_fusion = True
    data_options.experimental_optimization.map_fusion = True
    data_options.experimental_optimization.map_parallelization = True

    map_vectorization_options = (
        tf.data.experimental.MapVectorizationOptions())
    map_vectorization_options.enabled = True
    map_vectorization_options.use_choose_fastest = True

    data_options.experimental_optimization.map_vectorization = (
        map_vectorization_options)

    data_options.experimental_optimization.noop_elimination = True
    data_options.experimental_optimization.parallel_batch = True
    data_options.experimental_optimization.shuffle_and_repeat_fusion = True

    # ========== Stats on TF Data =============
    # aggregator = tf.data.experimental.StatsAggregator()
    # data_options.experimental_stats.aggregator = aggregator
    # data_options.experimental_stats.latency_all_edges = True

    dataset = dataset.with_options(data_options)

    return dataset
def get_image_summary(predictions, current_step, max_images=10):
    """Render up to `max_images` prediction previews as image summaries.

    Args:
      predictions: dict mapping output names to lists of per-batch numpy
        arrays; every list is concatenated along axis 0. Required keys:
        'orig_images', 'detection_boxes', 'detection_scores',
        'detection_classes', 'image_info' and 'num_detections'. Optional
        keys: 'detection_masks' and 'groundtruth_boxes'.
      current_step: step value handed to `tf.summary.image`; only used
        when `tf.compat.v1.Summary` is not available.
      max_images: upper bound on the number of previews to render.

    Returns:
      None when 'orig_images' is missing or empty. Otherwise a list of
      `tf.compat.v1.Summary.Value` objects (one per preview), or the
      result of `tf.summary.image` if building those raises
      AttributeError.
    """
    if 'orig_images' not in predictions:
        logging.info('Missing orig_images in predictions: %s',
                     predictions.keys())
        return None

    if len(predictions['orig_images']) == 0:
        # Nothing to draw; behave like the missing-key case instead of
        # failing while indexing or concatenating an empty list.
        logging.info('Empty orig_images in predictions, skipping image '
                     'summary.')
        return None

    _detection_boxes = np.concatenate(predictions['detection_boxes'], axis=0)
    _detection_scores = np.concatenate(predictions['detection_scores'],
                                       axis=0)
    _detection_classes = np.concatenate(predictions['detection_classes'],
                                        axis=0)
    _image_info = np.concatenate(predictions['image_info'], axis=0)
    _num_detections = np.concatenate(predictions['num_detections'], axis=0)
    _orig_images = np.concatenate(predictions['orig_images'], axis=0)

    if 'detection_masks' in predictions:
        _detection_masks = np.concatenate(predictions['detection_masks'],
                                          axis=0)
    else:
        _detection_masks = None

    if 'groundtruth_boxes' in predictions:
        _groundtruth_boxes = np.concatenate(predictions['groundtruth_boxes'],
                                            axis=0)
    else:
        _groundtruth_boxes = None

    # Count the images that actually exist after concatenation. Using
    # num_batches * first_batch_size would be wrong whenever batches
    # differ in size.
    max_images = min(len(_orig_images), max_images)

    # Images are presumably floats in [0, 1] -- TODO confirm; they are
    # scaled to the 0..255 range and stored as uint8 for drawing.
    _orig_images = _orig_images * 255
    _orig_images = _orig_images.astype(np.uint8)

    image_previews = []

    for i in range(max_images):
        num_detections = min(len(_detection_boxes[i]),
                             int(_num_detections[i]))

        detection_boxes = _detection_boxes[i][:num_detections]
        detection_scores = _detection_scores[i][:num_detections]
        detection_classes = _detection_classes[i][:num_detections]

        image = _orig_images[i]
        image_height = image.shape[0]
        image_width = image.shape[1]

        # Rescale the box to fit the visualization image: divide by the
        # size stored in image_info, multiply by the preview size.
        src_height, src_width = _image_info[i][3:5]
        detection_boxes = detection_boxes / np.array(
            [src_width, src_height, src_width, src_height])
        detection_boxes = detection_boxes * np.array(
            [image_width, image_height, image_width, image_height])

        if _groundtruth_boxes is not None:
            # Groundtruth boxes are presumably normalized [y, x, y, x]
            # -- TODO confirm against the producer.
            gt_boxes = _groundtruth_boxes[i]
            gt_boxes = gt_boxes * np.array(
                [image_height, image_width, image_height, image_width])
        else:
            gt_boxes = None

        if _detection_masks is not None:
            instance_masks = _detection_masks[i][0:num_detections]
            segmentations = coco_metric.generate_segmentation_from_masks(
                instance_masks, detection_boxes, image_height, image_width)
        else:
            segmentations = None

        # Convert [x, y, w, h] to corner form. The eval post-processing
        # emits boxes in [x, y] order, so they are reordered to the
        # [y, x] order (ymin, xmin, ymax, xmax) used for drawing.
        xmin, ymin, box_width, box_height = np.split(detection_boxes,
                                                     4,
                                                     axis=-1)
        xmax = xmin + box_width
        ymax = ymin + box_height

        boxes_to_visualize = np.concatenate([ymin, xmin, ymax, xmax],
                                            axis=-1)

        image_preview = generate_image_preview(
            image,
            boxes=boxes_to_visualize,
            scores=detection_scores,
            classes=detection_classes.astype(np.int32),
            gt_boxes=gt_boxes,
            segmentations=segmentations)
        image_previews.append(image_preview)

    try:
        summaries = []
        for i, image_preview in enumerate(image_previews):
            image_buffer = generate_image_buffer(image_preview)
            image_summary = tf.compat.v1.Summary.Image(
                encoded_image_string=image_buffer)
            image_value = tf.compat.v1.Summary.Value(tag='%d_input' % i,
                                                     image=image_summary)
            summaries.append(image_value)

    except AttributeError:
        # Building the v1 summary protos failed with AttributeError
        # (e.g. tf.compat.v1.Summary is unavailable); fall back to
        # tf.summary.image.
        image_previews = np.array(image_previews)
        summaries = tf.summary.image(name='image_summary',
                                     data=image_previews,
                                     step=current_step,
                                     max_outputs=max_images)

    return summaries
def _propose_rois(scores, boxes, anchor_boxes, height, width, scale,
                  rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
                  rpn_min_size, bbox_reg_weights, topk_before_nms,
                  nms_on_hpu):
    """Selects RoI proposals from a group of scored candidate boxes.

    The candidates are optionally reduced to the top-k by score, decoded
    against their anchors, clipped to the image, optionally filtered by
    minimum size, and finally reduced either by batched NMS or by a plain
    top-k when NMS is disabled.

    Args:
      scores: a tensor with a shape of [batch_size, num_boxes].
      boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the
        encoded form.
      anchor_boxes: the anchors matching `boxes`, with a shape of
        [batch_size, num_boxes, 4].
      height: a tensor of shape [batch_size, 1, 1] with the image height.
      width: a tensor of shape [batch_size, 1, 1] with the image width.
      scale: a tensor of shape [batch_size, 1, 1] with the image scale.
      rpn_pre_nms_topn: an integer, the number of top scoring proposals
        kept before NMS. This is *per FPN level* (not total).
      rpn_post_nms_topn: an integer, the number of top scoring proposals
        kept after NMS.
      rpn_nms_threshold: a float, the IoU threshold used by NMS. A value
        that is not greater than 0 skips NMS and keeps the top-k instead.
      rpn_min_size: the minimum proposal height and width, at original
        image scale (not the scale used during training or inference).
        Size filtering only runs when this is greater than 0.
      bbox_reg_weights: None or a list of four weights used when decoding
        the boxes.
      topk_before_nms: a boolean, whether top-k selection runs before
        decoding and NMS.
      nms_on_hpu: a boolean. If False the NMS op is pinned to 'CPU:0';
        otherwise no device is forced.

    Returns:
      scores: the scores of the selected proposals.
      boxes: the selected proposals. On the NMS path they are converted
        back with `box_utils.to_absolute_coordinates` before returning.
      NOTE(review): at most min(num_boxes, rpn_pre_nms_topn,
      rpn_post_nms_topn) proposals are requested per image; the exact
      output shapes depend on `box_utils.top_k` and
      `tf.image.combined_non_max_suppression` -- confirm before relying
      on a trailing singleton dimension for `scores`.
    """
    batch_size, num_boxes = scores.get_shape().as_list()
    topk_limit = min(num_boxes, rpn_pre_nms_topn)

    if topk_before_nms:
        scores, selected = box_utils.top_k(scores,
                                           k=topk_limit,
                                           boxes_list=[boxes, anchor_boxes])
        boxes, anchor_boxes = selected[0], selected[1]
        num_boxes = topk_limit

    decoded = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(decoded, height, width)

    if rpn_min_size > 0.0:
        # filter_boxes takes scores with a trailing singleton axis.
        boxes, expanded_scores = box_utils.filter_boxes(
            boxes, tf.expand_dims(scores, axis=-1), rpn_min_size, height,
            width, scale)
        scores = tf.squeeze(expanded_scores, axis=-1)

    # Never ask for more proposals than survived the pre-NMS selection.
    post_nms_topk_limit = min(rpn_post_nms_topn, topk_limit)

    if not rpn_nms_threshold > 0:
        # NMS disabled: keep the best-scoring boxes as they are.
        scores, top_boxes = box_utils.top_k(scores,
                                            k=post_nms_topk_limit,
                                            boxes_list=[boxes])
        return scores, top_boxes[0]

    # Normalize coordinates as combined_non_max_suppression currently
    # only support normalized coordinates.
    normalized = box_utils.to_normalized_coordinates(boxes, height, width)
    pre_nms_boxes = tf.reshape(normalized, [batch_size, num_boxes, 1, 4])
    pre_nms_scores = tf.reshape(scores, [batch_size, num_boxes, 1])

    # Both device variants run the very same NMS configuration.
    nms_kwargs = dict(
        max_output_size_per_class=topk_limit,
        max_total_size=post_nms_topk_limit,
        iou_threshold=rpn_nms_threshold,
        score_threshold=0.0,
        pad_per_class=False,
    )

    if nms_on_hpu:
        logging.info(
            "[ROI OPs] Using Batched NMS on HPU with %d input boxes" %
            num_boxes)
        boxes, scores, _, _ = tf.image.combined_non_max_suppression(
            pre_nms_boxes, pre_nms_scores, **nms_kwargs)
    else:
        logging.info(
            "[ROI OPs] Using Batched NMS on CPU/GPU with %d input boxes" %
            num_boxes)
        with tf.device('CPU:0'):
            boxes, scores, _, _ = tf.image.combined_non_max_suppression(
                pre_nms_boxes, pre_nms_scores, **nms_kwargs)

    boxes = box_utils.to_absolute_coordinates(boxes, height, width)
    return scores, boxes
def log_info(self, message):
    """Log `message` at INFO level, prefixed with `self.LOGGING_PREFIX`."""
    prefixed = "%s%s" % (self.LOGGING_PREFIX, message)
    logging.info(prefixed)