def build_default_flags(self):
    flag_dict = {}
    flag_dict["verbose"] = self.verbose

    # Handle plugins
    if self.name in plugin_map:
        plugins = plugin_map[self.name]
        for plugin in plugins:
            self.check_file_exists(plugin)
        flag_dict["plugins"] = ",".join(plugins)

    # Generate flags for logfile names.
    log_dir = self.get_full_log_dir()
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    flag_dict["logfile_outdir"] = log_dir
    flag_dict["logfile_prefix"] = "mlperf_log_"

    # Handle performance sample count
    perf_sample_count = dict_get(self.args, "performance_sample_count", None)
    if perf_sample_count is not None:
        flag_dict["performance_sample_count"] = perf_sample_count
    elif benchmark_qsl_size_map[self.name] > 0:
        flag_dict["performance_sample_count"] = benchmark_qsl_size_map[self.name]
    else:
        flag_dict["performance_sample_count"] = self.args["gpu_batch_size"]

    # Handle custom arguments
    for arg in self.flag_builder_custom_args:
        val = dict_get(self.args, arg, None)
        if val is not None:
            flag_dict[arg] = val

    return flag_dict
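# Illustration only: a minimal sketch of how a flag dict like the one returned
# above could be rendered into command-line arguments. The harness's real
# args_to_string helper (used by _build_custom_flags further down) may differ;
# here keys set to None are assumed to mean "omit this flag", and booleans are
# assumed to be emitted as bare switches.
def _example_args_to_string(flag_dict):
    parts = []
    for key, value in flag_dict.items():
        if value is None:
            continue  # flag explicitly dropped
        if isinstance(value, bool):
            if value:
                parts.append("--{:}".format(key))
        else:
            parts.append("--{:}={:}".format(key, value))
    return " ".join(parts)

# Example: {"verbose": True, "logfile_prefix": "mlperf_log_"} ->
# "--verbose --logfile_prefix=mlperf_log_"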
def build_scenario_specific_flags(self):
    flag_dict = {}

    prefix = self.qps_prefix

    if self.scenario == SCENARIOS.SingleStream:
        scenario_keys = common_args.SINGLE_STREAM_PARAMS
    elif self.scenario == SCENARIOS.Offline:
        scenario_keys = common_args.OFFLINE_PARAMS
    elif self.scenario == SCENARIOS.MultiStream:
        scenario_keys = common_args.MULTI_STREAM_PARAMS
    elif self.scenario == SCENARIOS.Server:
        scenario_keys = common_args.SERVER_PARAMS
    else:
        scenario_keys = []
        raise RuntimeError("Unknown Scenario \"{}\"".format(self.scenario))

    for arg in scenario_keys:
        val = dict_get(self.args, prefix + arg, None)
        if val is None:
            raise ValueError("Missing required key {:}".format(prefix + arg))
        flag_dict[arg] = val

    # Handle RUN_ARGS
    for arg in scenario_keys:
        val = dict_get(self.args, arg, None)
        if val is not None:
            flag_dict[arg] = val

    return flag_dict
def __init__(self, args): workspace_size = dict_get(args, "workspace_size", default=(5 << 30)) logging.info("Using workspace size: {:,}".format(workspace_size)) super().__init__(args, name=BENCHMARKS.BERT, workspace_size=workspace_size) self.bert_config_path = "code/bert/tensorrt/bert_config.json" self.seq_len = 384 # default sequence length self.batch_size = dict_get(args, "batch_size", default=1) self.num_profiles = 1 if 'gpu_inference_streams' in args: # use gpu_inference_streams to determine the number of duplicated profiles # in the engine when not using lwis mode self.num_profiles = args['gpu_inference_streams'] self.is_int8 = args['precision'] == 'int8' if self.is_int8: self.model_path = dict_get( args, "model_path", default="build/models/bert/bert_large_v1_1_fake_quant.onnx") else: self.model_path = dict_get( args, "model_path", default="build/models/bert/bert_large_v1_1.onnx") self.bert_config = BertConfig(self.bert_config_path) self.enable_interleaved = False if self.is_int8 and 'enable_interleaved' in args: self.enable_interleaved = args['enable_interleaved'] # Small-Tile GEMM Plugin # Since it doesn't support interleaved format, two options are mutually exclusive self.use_small_tile_gemm_plugin = self.args.get( "use_small_tile_gemm_plugin", False) self.gemm_plugin_fairshare_cache_size = self.args.get( "gemm_plugin_fairshare_cache_size", -1) if self.enable_interleaved and self.use_small_tile_gemm_plugin: assert False, "Small-Tile GEMM Plugin doesn't support interleaved format." # Query system id for architecture self.system = get_system() self.gpu_arch = self.system.arch if self.batch_size > 512: # tactics selection is limited at very large batch sizes self.builder_config.max_workspace_size = 7 << 30 if 'nx' in self.system.gpu.lower(): # use 1GB only for XavierNX self.builder_config.max_workspace_size = 1 << 30
def _build_custom_flags(self, flag_dict):
    # Rename gpu_batch_size to batch_size
    batch_size = dict_get(self.args, "gpu_batch_size", default=None)
    flag_dict["batch_size"] = batch_size
    flag_dict["gpu_batch_size"] = None

    # Rename use_graphs to cuda_graph
    use_graphs = dict_get(self.args, "use_graphs", default=False)
    flag_dict["cuda_graph"] = use_graphs
    flag_dict["use_graphs"] = None

    # Rename max_seq_length to hp_max_seq_length
    max_seq_length = dict_get(self.args, "max_seq_length", default=None)
    flag_dict["hp_max_seq_length"] = max_seq_length
    flag_dict["max_seq_length"] = None

    # Handle more harness_rnnt knobs: disable pipelined execution
    no_pipelined = dict_get(self.args, "nopipelined_execution", default=False)
    flag_dict["pipelined_execution"] = not no_pipelined
    flag_dict["nopipelined_execution"] = None

    # Handle more harness_rnnt knobs: disable batch sorting by sequence length
    no_sorting = dict_get(self.args, "nobatch_sorting", default=False)
    flag_dict["batch_sorting"] = not no_sorting
    flag_dict["nobatch_sorting"] = None

    # Yet another harness_rnnt knob: turn off DALI preprocessing for debugging
    no_dali = dict_get(self.args, "noenable_audio_processing", default=False)
    flag_dict["enable_audio_processing"] = not no_dali
    flag_dict["noenable_audio_processing"] = None

    # Yet another harness_rnnt knob: disable DALI's scatter-gather kernel
    no_copy_kernel = dict_get(self.args, "nouse_copy_kernel", default=False)
    flag_dict["use_copy_kernel"] = not no_copy_kernel
    flag_dict["nouse_copy_kernel"] = None

    # Rename gpu_inference_streams to streams_per_gpu
    num_inference = dict_get(self.args, "gpu_inference_streams", default=None)
    flag_dict["streams_per_gpu"] = num_inference
    flag_dict["gpu_inference_streams"] = None

    audio_fp16_input = dict_get(self.args, "audio_fp16_input", default=True)
    flag_dict["audio_fp16_input"] = audio_fp16_input

    start_from_device = dict_get(self.args, "start_from_device", default=False)
    flag_dict["start_from_device"] = start_from_device

    audio_input_suffix = "fp16" if audio_fp16_input else "fp32"
    flag_dict["audio_serialized_pipeline_file"] = "build/bin/dali" + "/dali_pipeline_gpu_" + audio_input_suffix + ".pth"

    argstr = args_to_string(flag_dict) + " --scenario {:} --model {:}".format(self.scenario, self.name)

    # Handle engine dir
    argstr += " --engine_dir={:}".format(self.engine_dir)

    return argstr
def __init__(self, args, name="", skip_file_checks=False): self.args = args self.name = name self.verbose = dict_get(args, "verbose", default=None) if self.verbose: logging.info("===== Harness arguments for {:} =====".format(name)) for key in args: logging.info("{:}={:}".format(key, args[key])) self.system_id = args["system_id"] self.scenario = args["scenario"] self.config_ver = args["config_ver"] self.engine_dir = "./build/engines/{:}/{:}/{:}".format( self.system_id, self.name, self.scenario) self.precision = args["precision"] # Detect devices used to set field prefixes self.has_gpu = dict_get(args, "gpu_batch_size", default=None) is not None self.has_dla = dict_get(args, "dla_batch_size", default=None) is not None self.qps_prefix = "" if self.has_gpu and self.has_dla: self.qps_prefix = "concurrent_" elif self.has_gpu: self.qps_prefix = "gpu_" elif self.has_dla: self.qps_prefix = "dla_" # Check if we actually need to execute the harness self.generate_conf_files_only = False if dict_get(self.args, "generate_conf_files_only", False): logging.info("Only generating measurements/ configuration entries") self.generate_conf_files_only = True self.args["generate_conf_files_only"] = None # Enumerate engine files # Engine not needed if we are only generating measurements/ entries self.skip_file_checks = skip_file_checks or self.generate_conf_files_only self.gpu_engine = None self.dla_engine = None self.enumerate_engines() # Enumerate harness executable self.executable = self._get_harness_executable() self.check_file_exists(self.executable) self.use_jemalloc = False self.env_vars = os.environ.copy() self.flag_builder_custom_args = []
def _get_submission_benchmark_name(self):
    full_benchmark_name = self.name
    if dict_get(self.args, "accuracy_level", "99%") == "99.9%":
        full_benchmark_name += "-99.9"
    elif self.name in BENCHMARKS.HIGH_ACC_ENABLED:
        full_benchmark_name += "-99"
    return full_benchmark_name
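# Example of the naming rule above (the benchmark name strings are illustrative):
# a benchmark run at accuracy_level "99.9%" is reported as, e.g., "bert-99.9",
# a benchmark listed in BENCHMARKS.HIGH_ACC_ENABLED at the default "99%" level is
# reported as "bert-99", and all other benchmarks keep their plain name.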
def __init__(self, args): workspace_size = dict_get(args, "workspace_size", default=(1 << 30)) logging.info("Use workspace_size: {:}".format(workspace_size)) super().__init__(args, name=BENCHMARKS.ResNet50, workspace_size=workspace_size) # Model path self.model_path = dict_get( args, "model_path", default="code/resnet50/tensorrt/ofa_autosinian_is176.onnx") logging.info("Using AutoSinian optimized once-for-all network") self.cache_file = None self.need_calibration = False if self.precision == "int8": # Get calibrator variables calib_batch_size = dict_get(self.args, "calib_batch_size", default=1) calib_max_batches = dict_get(self.args, "calib_max_batches", default=500) force_calibration = dict_get(self.args, "force_calibration", default=False) cache_file = dict_get( self.args, "cache_file", default="code/resnet50/tensorrt/calibrator.cache") preprocessed_data_dir = dict_get(self.args, "preprocessed_data_dir", default="build/preprocessed_data") calib_data_map = dict_get(self.args, "calib_data_map", default="data_maps/imagenet/cal_map.txt") calib_image_dir = os.path.join(preprocessed_data_dir, "imagenet/ResNet50/fp32") # Set up calibrator self.calibrator = RN50Calibrator( calib_batch_size=calib_batch_size, calib_max_batches=calib_max_batches, force_calibration=force_calibration, cache_file=cache_file, image_dir=calib_image_dir, calib_data_map=calib_data_map) self.builder_config.int8_calibrator = self.calibrator self.cache_file = cache_file self.need_calibration = force_calibration or not os.path.exists( cache_file)
def __init__(self, args): workspace_size = dict_get(args, "workspace_size", default=(5 << 30)) logging.info("Use workspace_size: {:}".format(workspace_size)) super().__init__(args, name=BENCHMARKS.BERT, workspace_size=workspace_size) self.bert_config_path = "code/bert/tensorrt/bert_config.json" self.seq_len = 384 # default sequence length assert 'batch_size' in args, 'batch_size is not specified' self.batch_size = args['batch_size'] self.num_profiles = 1 if 'gpu_inference_streams' in args: # use gpu_inference_streams to determine the number of duplicated profiles # in the engine when not using lwis mode self.num_profiles = args['gpu_inference_streams'] self.is_int8 = args['precision'] == 'int8' if self.is_int8: self.model_path = dict_get( args, "model_path", default="build/models/bert/bert_large_v1_1_fake_quant.onnx") else: self.model_path = dict_get( args, "model_path", default="build/models/bert/bert_large_v1_1.onnx") self.bert_config = BertConfig(self.bert_config_path) self.enable_il = False if self.is_int8 and 'enable_interleaved' in args: self.enable_il = args['enable_interleaved'] if self.batch_size > 512: # tactics selection is limited at very large batch sizes self.builder_config.max_workspace_size = 7 << 30 if 'nx' in self.system_id.lower(): # use 1GB only for XavierNX self.builder_config.max_workspace_size = 1 << 30
def get_system_name(self):
    override_system_name = dict_get(self.args, "system_name", default=None)
    if override_system_name not in {None, ""}:
        return override_system_name

    system_name = self.system_id
    for kw in system_name_map.keys():
        if kw in self.system_id:
            system_name = "_".join([system_name_map[kw], system_name])
            break

    return "{:}_TRT{:}".format(system_name, TENSORRT_VERSION)
def get_system_name(self):
    override_system_name = dict_get(self.args, "system_name", default=None)
    if override_system_name not in {None, ""}:
        return override_system_name

    system_name = self.system_id
    for kw, prepend_name in system_name_map:
        if kw in self.system_id:
            system_name = "_".join([prepend_name, system_name])
            break

    full_system_name = "{:}_TRT{:}".format(system_name, TENSORRT_VERSION)
    return self._append_config_ver_name(full_system_name)
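# Illustration only: the two get_system_name variants above assume different
# shapes for system_name_map. The first iterates dict keys and looks up the
# prepend name by key; the second unpacks (keyword, prepend_name) pairs from a
# sequence. The values below are made up purely for the example.
system_name_map_as_dict = {"A100-SXM4": "DGXA100"}
system_name_map_as_pairs = [("A100-SXM4", "DGXA100")]

# With a system_id of "A100-SXM4x8" and TENSORRT_VERSION "7.2", either variant
# would yield a name along the lines of "DGXA100_A100-SXM4x8_TRT7.2"; the second
# variant additionally appends the config_ver suffix via _append_config_ver_name.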
def build_scenario_specific_flags(self):
    """Return flags specific to current scenario."""
    flag_dict = {}

    prefix = self.qps_prefix
    scenario_keys = common_args.getScenarioMetricArgs(self.scenario)

    for arg in scenario_keys:
        val = dict_get(self.args, prefix + arg, None)
        if val is None:
            raise ValueError("Missing required key {:}".format(prefix + arg))
        flag_dict[arg] = val

    # Handle RUN_ARGS
    for arg in scenario_keys:
        val = dict_get(self.args, arg, None)
        if val is not None:
            flag_dict[arg] = val

    return flag_dict
def __init__(self, args): """Set up the config and calibrator for DLRM. Does not initialize.""" workspace_size = dict_get(args, "workspace_size", default=(4 << 30)) logging.info("Using workspace size: {:,}".format(workspace_size)) super().__init__(args, name=BENCHMARKS.DLRM, workspace_size=workspace_size) with open("code/dlrm/tensorrt/mlperf_40m.limit.json") as f: self.dlrm_config = json.load(f) logging.info("DLRM config: {:}".format(self.dlrm_config)) self.num_numerical_inputs = self.dlrm_config["num_numerical_features"] self.num_features = len(self.dlrm_config["categorical_feature_sizes"]) self.num_interactions = (self.num_features + 1) * self.num_features // 2 self.embedding_size = self.dlrm_config["embedding_dim"] self.embedding_rows = self.dlrm_config["categorical_feature_sizes"] self.embedding_rows_bound = 40000000 self.embedding_rows = [min(i, self.embedding_rows_bound) for i in self.embedding_rows] self.embedding_rows_total = np.sum(np.array(self.embedding_rows)) self.bottom_mlp_channels = self.dlrm_config["bottom_mlp_sizes"] self.bottom_mlp_names = ["bot_l.0", "bot_l.2", "bot_l.4"] self.output_padding = self.args.get("output_padding_granularity", 32) self.top_mlp_input_size = (self.num_interactions + self.embedding_size + self.output_padding - 1) // self.output_padding * self.output_padding self.top_mlp_channels = self.dlrm_config["top_mlp_sizes"] self.top_mlp_names = ["top_l.0", "top_l.2", "top_l.4", "top_l.6", "top_l.8"] self.model_filepath = "build/models/dlrm/tb00_40M.pt" self.embedding_weights_binary_filepath = "build/models/dlrm/40m_limit/dlrm_embedding_weights_int8_v3.bin" self.model_without_embedding_weights_filepath = "build/models/dlrm/40m_limit/model_test_without_embedding_weights_v3.pt" self.row_frequencies_binary_filepath = "build/models/dlrm/40m_limit/row_frequencies.bin" self.row_frequencies_src_dir = "build/models/dlrm/40m_limit/row_freq" self.embedding_weights_on_gpu_part = self.args.get("embedding_weights_on_gpu_part", 1.0) self.use_row_frequencies = True if self.embedding_weights_on_gpu_part < 1.0 else False self.num_profiles = self.args.get("gpu_inference_streams", 1) self.use_small_tile_gemm_plugin = self.args.get("use_small_tile_gemm_plugin", False) self.gemm_plugin_fairshare_cache_size = self.args.get("gemm_plugin_fairshare_cache_size", -1) self.enable_interleaved_top_mlp = self.args.get("enable_interleaved_top_mlp", False) if self.precision == "fp16": self.apply_flag(trt.BuilderFlag.FP16) elif self.precision == "int8": self.apply_flag(trt.BuilderFlag.INT8) if self.precision == "int8": # Get calibrator variables calib_batch_size = dict_get(self.args, "calib_batch_size", default=512) calib_max_batches = dict_get(self.args, "calib_max_batches", default=500) force_calibration = dict_get(self.args, "force_calibration", default=False) cache_file = dict_get(self.args, "cache_file", default="code/dlrm/tensorrt/calibrator.cache") preprocessed_data_dir = dict_get(self.args, "preprocessed_data_dir", default="build/preprocessed_data") calib_data_dir = os.path.join(preprocessed_data_dir, "criteo/full_recalib/val_data_128000") # Set up calibrator self.calibrator = DLRMCalibrator(calib_batch_size=calib_batch_size, calib_max_batches=calib_max_batches, force_calibration=force_calibration, cache_file=cache_file, data_dir=calib_data_dir) self.builder_config.int8_calibrator = self.calibrator self.cache_file = cache_file self.need_calibration = force_calibration or not os.path.exists(cache_file) else: self.need_calibration = False
def initialize(self):
    """Parse the processed model to create the network."""
    # Create network.
    self.network = self.builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

    channel_idx = 1

    # Input shape
    input_tensor_dim = [-1] + self.input_volume_dim
    input_tensor_dim.insert(channel_idx, self.num_input_channel)

    # Parse from onnx file.
    parser = trt.OnnxParser(self.network, self.logger)
    model = self.preprocess_onnx(onnx.load(self.model_path))
    success = parser.parse(onnx._serialize(model))
    if not success:
        raise RuntimeError(
            "3D-Unet onnx model parsing failed! Error: {:}".format(parser.get_error(0).desc()))

    # Set input/output tensor dtype and formats
    input_tensor = self.network.get_input(0)
    output_tensor = self.network.get_output(0)
    input_tensor.shape = input_tensor_dim

    if self.input_dtype == "int8":
        input_tensor.dtype = trt.int8
    elif self.input_dtype == "fp16":
        input_tensor.dtype = trt.float16
    elif self.input_dtype == "fp32":
        input_tensor.dtype = trt.float32

    if self.input_format == "linear":
        input_tensor.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)
    elif self.input_format == "dhwc8":
        input_tensor.allowed_formats = 1 << int(trt.TensorFormat.DHWC8)
    elif self.input_format == "cdhw32":
        input_tensor.allowed_formats = 1 << int(trt.TensorFormat.CDHW32)

    # Always use FP16 output
    # Workaround for calibration not working properly with the identity layer
    force_calibration = dict_get(self.args, "force_calibration", default=False)
    output_tensor.dtype = trt.float16 if force_calibration == False else trt.float32
    output_tensor.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)

    self.initialized = True
def __init__(self, args): workspace_size = dict_get(args, "workspace_size", default=(2 << 30)) logging.info("Using workspace size: {:,}".format(workspace_size)) super().__init__(args, name=BENCHMARKS.SSDResNet34, workspace_size=workspace_size) # Model path self.model_path = dict_get( args, "model_path", default="build/models/SSDResNet34/resnet34-ssd1200.pytorch") if self.precision == "int8": force_calibration = dict_get(self.args, "force_calibration", default=False) calib_batch_size = dict_get(self.args, "calib_batch_size", default=10) calib_max_batches = dict_get(self.args, "calib_max_batches", default=50) cache_file = dict_get( self.args, "cache_file", default="code/ssd-resnet34/tensorrt/calibrator.cache") preprocessed_data_dir = dict_get(self.args, "preprocessed_data_dir", default="build/preprocessed_data") calib_data_map = dict_get(self.args, "calib_data_map", default="data_maps/coco/cal_map.txt") calib_image_dir = os.path.join(preprocessed_data_dir, "coco/train2017/SSDResNet34/fp32") self.calibrator = SSDResNet34EntropyCalibrator( calib_image_dir, cache_file, calib_batch_size, calib_max_batches, force_calibration, calib_data_map) self.builder_config.int8_calibrator = self.calibrator self.cache_file = cache_file
def __init__(self, args): workspace_size = dict_get(args, "workspace_size", default=(2 << 31)) logging.info("Use workspace_size: {:}".format(workspace_size)) super().__init__(args, name="ssd-mobilenet", workspace_size=workspace_size) # Model path self.model_path = dict_get( args, "model_path", default="build/models/SSDMobileNet/frozen_inference_graph.pb") if self.precision == "int8": calib_batch_size = dict_get(self.args, "calib_batch_size", default=1) calib_max_batches = dict_get(self.args, "calib_max_batches", default=500) force_calibration = dict_get(self.args, "force_calibration", default=False) cache_file = dict_get( self.args, "cache_file", default="code/ssd-mobilenet/tensorrt/calibrator.cache") preprocessed_data_dir = dict_get(self.args, "preprocessed_data_dir", default="build/preprocessed_data") calib_data_map = dict_get(self.args, "calib_data_map", default="data_maps/coco/cal_map.txt") calib_image_dir = os.path.join(preprocessed_data_dir, "coco/train2017/SSDMobileNet/fp32") self.calibrator = SSDMobileNetEntropyCalibrator( calib_batch_size, calib_max_batches, force_calibration, cache_file, calib_image_dir, calib_data_map) self.builder_config.int8_calibrator = self.calibrator self.cache_file = cache_file
def __init__(self, args): workspace_size = dict_get(args, "workspace_size", default=(1 << 30)) logging.info("Use workspace_size: {:}".format(workspace_size)) super().__init__(args, name=BENCHMARKS.ResNet50, workspace_size=workspace_size) # Model path self.model_path = dict_get( args, "model_path", default="code/resnet50/tensorrt/resnet50_inspur_open.onnx") if self.precision == "int8": # Get calibrator variables calib_batch_size = dict_get(self.args, "calib_batch_size", default=1) calib_max_batches = dict_get(self.args, "calib_max_batches", default=500) force_calibration = dict_get(self.args, "force_calibration", default=False) cache_file = dict_get( self.args, "cache_file", default="code/resnet50/tensorrt/calibrator.cache") calib_data_map = dict_get(self.args, "calib_data_map", default="data_maps/imagenet/cal_map.txt") calib_image_dir = "build/data/imagenet" # Set up calibrator self.calibrator = RN50Calibrator( calib_batch_size=calib_batch_size, calib_max_batches=calib_max_batches, force_calibration=force_calibration, cache_file=cache_file, image_dir=calib_image_dir, calib_data_map=calib_data_map) self.builder_config.int8_calibrator = self.calibrator self.cache_file = cache_file self.need_calibration = force_calibration or not os.path.exists( cache_file)
def _build_custom_flags(self, flag_dict):
    # Handle use_jemalloc
    self.use_jemalloc = dict_get(flag_dict, "use_jemalloc", False)
    flag_dict['use_jemalloc'] = None

    argstr = args_to_string(flag_dict) + " --scenario " + self.scenario + " --model " + self.name
    return argstr
def __init__(self, args, name="", workspace_size=(1 << 30)): """ Constructor :param args: arguments represented by a dictionary :param name: name of the benchmark """ self.name = name self.args = args # Configuration variables self.verbose = dict_get(args, "verbose", default=False) if self.verbose: logging.info("========= BenchmarkBuilder Arguments =========") for arg in args: logging.info("{:}={:}".format(arg, args[arg])) self.system_id = args["system_id"] self.scenario = args["scenario"] self.config_ver = args["config_ver"] self.engine_dir = "./build/engines/{:}/{:}/{:}".format( self.system_id, self.name, self.scenario) # Set up logger, builder, and network. self.logger = trt.Logger( trt.Logger.VERBOSE if self.verbose else trt.Logger.INFO) trt.init_libnvinfer_plugins(self.logger, "") self.builder = trt.Builder(self.logger) self.builder_config = self.builder.create_builder_config() self.builder_config.max_workspace_size = workspace_size if dict_get(args, "verbose_nvtx", default=False): self.builder_config.profiling_verbosity = trt.ProfilingVerbosity.VERBOSE # Precision variables self.input_dtype = dict_get(args, "input_dtype", default="fp32") self.input_format = dict_get(args, "input_format", default="linear") self.precision = dict_get(args, "precision", default="int8") self.clear_flag(trt.BuilderFlag.TF32) if self.precision == "fp16": self.apply_flag(trt.BuilderFlag.FP16) elif self.precision == "int8": self.apply_flag(trt.BuilderFlag.INT8) # Device variables self.device_type = "gpu" self.dla_core = args.get("dla_core", None) if self.dla_core is not None: logging.info("Using DLA: Core {:}".format(self.dla_core)) self.device_type = "dla" self.apply_flag(trt.BuilderFlag.GPU_FALLBACK) self.builder_config.default_device_type = trt.DeviceType.DLA self.builder_config.DLA_core = int(self.dla_core) if self.scenario == SCENARIOS.SingleStream: self.batch_size = 1 elif self.scenario in [ SCENARIOS.Server, SCENARIOS.Offline, SCENARIOS.MultiStream ]: self.batch_size = self.args.get("batch_size", 1) else: raise ValueError("Invalid scenario: {:}".format(self.scenario)) # Currently, TRT has limitation that we can only create one execution # context for each optimization profile. Therefore, create more profiles # so that LWIS can create multiple contexts. self.num_profiles = self.args.get("gpu_copy_streams", 4) self.initialized = False
def __init__(self, args): workspace_size = dict_get(args, "workspace_size", default=(8 << 30)) super().__init__(args, name=BENCHMARKS.UNET, workspace_size=workspace_size) # input channel self.num_input_channel = 4 # input volume dimension self.input_volume_dim = [224, 224, 160] # use InstNorm3D plugin self.use_instnorm3d_plugin = True # use pixelShuffle plugin self.enable_pixelshuffle3d_plugin = True self.enable_pixelshuffle3d_plugin_concat_fuse = True # Deconv->Conv conversion self.use_conv_for_deconv = True self.pixel_shuffle_cdwh = True # If false, do dhwc # use last layer plugin self.use_conv3d1x1x1k4_plugin = True # Model is imported from ONNX self.model_path = dict_get( args, "model_path", default="build/models/3d-unet/3dUNetBraTS.onnx") force_calibration = dict_get(self.args, "force_calibration", default=False) # Calibrator if self.precision == "int8" or force_calibration: self.apply_flag(trt.BuilderFlag.INT8) preprocessed_data_dir = dict_get( self.args, "preprocessed_data_dir", default="build/preprocessed_data/brats/calibration") calib_batch_size = dict_get(self.args, "calib_batch_size", default=2) calib_max_batches = dict_get(self.args, "calib_max_batches", default=20) calib_data_map = dict_get(self.args, "calib_data_map", default="data_maps/brats/cal_map.txt") calib_volume_dir = os.path.join(preprocessed_data_dir, "brats_npy/fp32") input_shape = [self.num_input_channel] + self.input_volume_dim cache_file = dict_get( self.args, "cache_file", default="code/3d-unet/tensorrt/calibrator.cache") self.calibrator = UNet3DLegacyCalibrator( calib_volume_dir, cache_file, calib_batch_size, calib_max_batches, force_calibration, calib_data_map, input_shape) assert self.calibrator, "Calibrator is not init'ed" assert self.calibrator.get_algorithm( ) == trt.CalibrationAlgoType.LEGACY_CALIBRATION, "Calibrator type is not Legacy" self.builder_config.int8_calibrator = self.calibrator self.cache_file = cache_file # TRT builder flag if self.precision == "fp16": self.apply_flag(trt.BuilderFlag.FP16) elif self.precision == "int8": self.apply_flag(trt.BuilderFlag.FP16) self.apply_flag(trt.BuilderFlag.INT8)