def run_harness(self):
    flag_dict = self.build_default_flags()
    flag_dict.update(self.build_scenario_specific_flags())

    # Handle engines
    if self.has_gpu:
        flag_dict["gpu_engines"] = self.gpu_engine

    # Generates the entries in the `measurements/` directory, and updates flag_dict accordingly
    generate_measurements_entry(
        self.get_system_name(),
        self.name,
        self._get_submission_benchmark_name(),
        self.scenario,
        self.args["input_dtype"],
        self.args["precision"],
        flag_dict)

    # Stop here if we are only generating .conf files in measurements
    if self.generate_conf_files_only:
        return "Generated conf files"

    argstr = self._build_custom_flags(flag_dict)
    if isinstance(argstr, dict):
        argstr = args_to_string(flag_dict)

    # Handle environment variables
    if self.use_jemalloc:
        self.prepend_ld_preload("/usr/lib/x86_64-linux-gnu/libjemalloc.so.2")

    cmd = "{:} {:}".format(self.executable, argstr)
    output = run_command(cmd, get_output=True, custom_env=self.env_vars)

    # Return harness result.
    return self._handle_harness_result(
        self.harness_get_result(output, scenario_result_regex[self.scenario]))
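# The last call above assumes scenario_result_regex maps each scenario to a regex that is
# matched against the captured harness output. A minimal sketch of a compatible
# harness_get_result-style helper under that assumption; the name, signature, and return
# format here are illustrative, not the repo's actual implementation.
import re

def harness_get_result_sketch(output_lines, result_regex):
    # output_lines is assumed to be the list of lines captured by run_command(get_output=True)
    for line in output_lines:
        if re.search(result_regex, line):
            return line.strip()  # e.g. the scenario's summary metric line from the loadgen log
    return "Cannot find performance result."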
def _build_custom_flags(self, flag_dict):
    # Triton does not use gpu_engines flag
    flag_dict["gpu_engines"] = None

    # Force performance sample count
    flag_dict["performance_sample_count"] = benchmark_qsl_size_map[self.name]

    # Server harness binary assumes GPU and uses --batch_size instead of --gpu_batch_size
    flag_dict["batch_size"] = flag_dict["gpu_batch_size"]
    flag_dict["gpu_batch_size"] = None

    engine_info = self.get_engine_info()
    flag_dict["model_store_path"] = self.model_store_path
    flag_dict["model_name"] = self.model_name
    flag_dict["model_version"] = self.model_version
    flag_dict["buffer_manager_thread_count"] = self.args.get("buffer_manager_thread_count", 0)
    flag_dict["pinned_input"] = (flag_dict["buffer_manager_thread_count"] == 0)
    flag_dict["batch_triton_requests"] = self.args.get("batch_triton_requests", False)
    flag_dict["check_contiguity"] = flag_dict["batch_triton_requests"] and (self.scenario == "Offline")

    # Inform the server to use different QSL
    flag_dict["use_dlrm_qsl"] = (self.name == BENCHMARKS.DLRM)

    # Set up Triton model repo
    self.setup_triton_model_repo(engine_info)

    argstr = args_to_string(flag_dict) + " --scenario " + self.scenario + " --model " + self.name

    if self.name in [BENCHMARKS.SSDMobileNet, BENCHMARKS.SSDResNet34]:
        argstr += " --response_postprocess coco"

    return argstr
def run_harness(self):
    flag_dict = self.build_default_flags()
    flag_dict.update(self.build_scenario_specific_flags())

    # Handle engines
    if self.has_gpu:
        flag_dict["gpu_engines"] = self.gpu_engine

    # MLPINF-853: Special handling of --fast. Use min_duration=60000, and in Offline or
    # MultiStream, use min_query_count=1.
    if flag_dict.get("fast", False):
        if "min_duration" not in flag_dict:
            flag_dict["min_duration"] = 60000
        if self.scenario in [SCENARIOS.Offline, SCENARIOS.MultiStream]:
            if "min_query_count" not in flag_dict:
                flag_dict["min_query_count"] = 1
        flag_dict["fast"] = None

    # Generates the entries in the `measurements/` directory, and updates flag_dict accordingly
    generate_measurements_entry(
        self.get_system_name(),
        self.name,
        self._get_submission_benchmark_name(),
        self.scenario,
        self.args["input_dtype"],
        self.args["precision"],
        flag_dict)

    # Stop here if we are only generating .conf files in measurements
    if self.generate_conf_files_only:
        return "Generated conf files"

    argstr = self._build_custom_flags(flag_dict)
    if isinstance(argstr, dict):
        argstr = args_to_string(flag_dict)

    # Handle environment variables
    if self.use_jemalloc:
        self.prepend_ld_preload("/usr/lib/x86_64-linux-gnu/libjemalloc.so.2")

    cmd = "{:} {:}".format(self.executable, argstr)
    output = run_command(cmd, get_output=True, custom_env=self.env_vars)

    # Return harness result.
    scenario_key = scenario_loadgen_log_keys[self.scenario]
    results = from_loadgen_by_keys(
        os.path.join(
            self.args["log_dir"],
            self.get_system_name(),
            self._get_submission_benchmark_name(),
            self.scenario),
        ["result_validity", scenario_key])

    if scenario_key not in results:
        result_string = "Cannot find performance result. Maybe you are running in AccuracyOnly mode."
    elif "result_validity" not in results:
        result_string = "{}: {}, Result validity unknown".format(scenario_key, results[scenario_key])
    else:
        result_string = "{}: {}, Result is {}".format(
            scenario_key, results[scenario_key], results["result_validity"])

    return self._handle_harness_result(result_string)
def _build_custom_flags(self, flag_dict):
    # Rename gpu_batch_size to batch_size
    batch_size = dict_get(self.args, "gpu_batch_size", default=None)
    flag_dict["batch_size"] = batch_size
    flag_dict["gpu_batch_size"] = None

    # Rename use_graphs to cuda_graph
    use_graphs = dict_get(self.args, "use_graphs", default=False)
    flag_dict["cuda_graph"] = use_graphs
    flag_dict["use_graphs"] = None

    # Rename max_seq_length to hp_max_seq_length
    max_seq_length = dict_get(self.args, "max_seq_length", default=None)
    flag_dict["hp_max_seq_length"] = max_seq_length
    flag_dict["max_seq_length"] = None

    # Handle more harness_rnnt knobs
    no_pipelined = dict_get(self.args, "nopipelined_execution", default=False)
    flag_dict["pipelined_execution"] = not no_pipelined
    flag_dict["nopipelined_execution"] = None

    # Handle more harness_rnnt knobs: disable batch sorting by sequence length
    no_sorting = dict_get(self.args, "nobatch_sorting", default=False)
    flag_dict["batch_sorting"] = not no_sorting
    flag_dict["nobatch_sorting"] = None

    # Handle yet another harness_rnnt knob: turning off DALI preprocessing for debug
    no_dali = dict_get(self.args, "noenable_audio_processing", default=False)
    flag_dict["enable_audio_processing"] = not no_dali
    flag_dict["noenable_audio_processing"] = None

    # Handle yet another harness_rnnt knob: disable DALI's scatter gather kernel
    no_copy_kernel = dict_get(self.args, "nouse_copy_kernel", default=False)
    flag_dict["use_copy_kernel"] = not no_copy_kernel
    flag_dict["nouse_copy_kernel"] = None

    # Rename gpu_inference_streams to streams_per_gpu
    num_inference = dict_get(self.args, "gpu_inference_streams", default=None)
    flag_dict["streams_per_gpu"] = num_inference
    flag_dict["gpu_inference_streams"] = None

    audio_fp16_input = dict_get(self.args, "audio_fp16_input", default=True)
    flag_dict["audio_fp16_input"] = audio_fp16_input

    start_from_device = dict_get(self.args, "start_from_device", default=False)
    flag_dict["start_from_device"] = start_from_device

    audio_input_suffix = "fp16" if audio_fp16_input else "fp32"
    flag_dict["audio_serialized_pipeline_file"] = "build/bin/dali/dali_pipeline_gpu_" + audio_input_suffix + ".pth"

    argstr = args_to_string(flag_dict) + " --scenario {:} --model {:}".format(self.scenario, self.name)

    # Handle engine dir
    argstr += " --engine_dir={:}".format(self.engine_dir)

    return argstr
def _build_custom_flags(self, flag_dict):
    # Triton does not use gpu_engines flag
    flag_dict["gpu_engines"] = None

    # Force performance sample count
    flag_dict["performance_sample_count"] = benchmark_qsl_size_map[self.name]

    flag_dict["model_store_path"] = self.model_store_path
    flag_dict["model_name"] = self.model_name
    flag_dict["model_version"] = self.model_version
    flag_dict["buffer_manager_thread_count"] = self.args.get("buffer_manager_thread_count", 0)
    flag_dict["pinned_input"] = True

    # Inform the server to use different QSL
    flag_dict["use_dlrm_qsl"] = (self.name == BENCHMARKS.DLRM)

    # Specify harness-specific flags here
    flag_dict["tensor_path"] = self.tensor_path
    if self.test_mode:
        flag_dict["test_mode"] = self.test_mode
    if self.map_path:
        flag_dict["map_path"] = self.map_path
    if self.coalesced:
        flag_dict["coalesced_tensor"] = self.coalesced

    self.setup_triton_model_repo()

    argstr = args_to_string(flag_dict) + " --scenario " + self.scenario + " --model " + self.name

    # Assign proper callback function here
    if self.name == BENCHMARKS.ResNet50:
        argstr += " --response_postprocess ovrn50"
    elif self.name in [BENCHMARKS.SSDMobileNet, BENCHMARKS.SSDResNet34]:
        argstr += " --response_postprocess ovcoco"

    return argstr
def _build_custom_flags(self, flag_dict):
    if self.has_dla:
        flag_dict["dla_engines"] = self.dla_engine

    if self.has_gpu and self.has_dla:
        pass
    elif self.has_gpu:
        flag_dict["max_dlas"] = 0
    elif self.has_dla:
        flag_dict["max_dlas"] = 1
    else:
        raise ValueError("Cannot specify --no_gpu and --gpu_only at the same time")

    argstr = args_to_string(flag_dict) + " --scenario " + self.scenario + " --model " + self.name

    if self.name in response_postprocess_map:
        argstr += " --response_postprocess " + response_postprocess_map[self.name]

    return argstr
def _build_custom_flags(self, flag_dict):
    return args_to_string(flag_dict) + " --scenario " + self.scenario + " --model " + self.name
def _build_custom_flags(self, flag_dict):
    # Handle use_jemalloc
    self.use_jemalloc = dict_get(flag_dict, "use_jemalloc", False)
    flag_dict["use_jemalloc"] = None

    argstr = args_to_string(flag_dict) + " --scenario " + self.scenario + " --model " + self.name
    return argstr
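# Every _build_custom_flags above leans on args_to_string to flatten the flag dict into a
# command line, and suppresses flags by setting them to None (e.g. gpu_batch_size, fast,
# use_jemalloc). The helper itself is not shown in this section; the sketch below only
# illustrates the convention the callers assume (None dropped, booleans as --flag=true/false,
# everything else as --flag=value) and is an assumption, not the repo's implementation.
def args_to_string_sketch(flag_dict):
    parts = []
    for key, value in flag_dict.items():
        if value is None:
            continue  # None means "do not emit this flag at all"
        if isinstance(value, bool):
            parts.append("--{:}={:}".format(key, "true" if value else "false"))
        else:
            parts.append("--{:}={:}".format(key, value))
    return " ".join(parts)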