def prepend_ld_preload(self, so_path):
    if "LD_PRELOAD" in self.env_vars:
        self.env_vars["LD_PRELOAD"] = ":".join([so_path, self.env_vars["LD_PRELOAD"]])
    else:
        self.env_vars["LD_PRELOAD"] = so_path
    logging.info("Updated LD_PRELOAD: " + self.env_vars["LD_PRELOAD"])

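# Hedged usage sketch (illustration only; _demo_prepend_ld_preload and the .so
# paths are hypothetical): LD_PRELOAD is a colon-separated list and the dynamic
# loader resolves symbols from the leftmost entry first, which is why the new
# library is prepended rather than appended.
def _demo_prepend_ld_preload():
    env_vars = {"LD_PRELOAD": "/usr/lib/old.so"}
    so_path = "/usr/lib/new.so"
    if "LD_PRELOAD" in env_vars:
        env_vars["LD_PRELOAD"] = ":".join([so_path, env_vars["LD_PRELOAD"]])
    else:
        env_vars["LD_PRELOAD"] = so_path
    assert env_vars["LD_PRELOAD"] == "/usr/lib/new.so:/usr/lib/old.so"
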
def run(self):
    self.load_val_images()
    logging.info("Running accuracy check on {:} images.".format(self.num_images))
    class_predictions = []
    batch_idx = 0
    for image_idx in range(0, self.num_images, self.batch_size):
        # The last batch may be smaller than the configured batch size.
        actual_batch_size = self.batch_size if image_idx + self.batch_size <= self.num_images \
            else self.num_images - image_idx
        batch_images = self.image_list[image_idx:image_idx + actual_batch_size]

        # DLA does not support batches that are less than the engine's configured
        # batch size. Pad with junk; the padded predictions are discarded below.
        while len(batch_images) < self.batch_size:
            batch_images.append(self.image_list[0])

        batch_images = np.ascontiguousarray(
            np.stack([np.load(os.path.join(self.image_dir, name + ".npy")) for name in batch_images]))

        start_time = time.time()
        outputs = self.runner([batch_images], self.batch_size)
        if self.verbose:
            logging.info("Batch {:d} (Size {:}) >> Inference time: {:f}".format(
                batch_idx, actual_batch_size, time.time() - start_time))

        # Keep only the predictions for the real (non-padded) images.
        class_predictions.extend(outputs[0][:actual_batch_size])
        batch_idx += 1

    class_list = self.class_list[:self.num_images]
    num_matches = np.sum(np.array(class_list) == np.array(class_predictions))
    accuracy = float(num_matches) / len(class_list)
    return accuracy

def get_score(predictions):
    logging.info("Evaluating predictions...")
    input_file = "build/data/squad/dev-v1.1.json"
    with open(input_file) as f:
        data = json.load(f)["data"]

    f1_score_total = 0.0
    exact_score_total = 0.0
    sample_idx = 0
    for task in data:
        title = task["title"]
        for paragraph_idx, paragraph in enumerate(task["paragraphs"]):
            context = paragraph["context"]
            for q_idx, qas in enumerate(paragraph["qas"]):
                if sample_idx < len(predictions):
                    answers = qas["answers"]
                    # Score the prediction against every reference answer and
                    # keep the best match, as in the official SQuAD evaluation.
                    f1_score_this = 0.0
                    exact_score_this = 0.0
                    for answer in answers:
                        f1_score_this = max(f1_score_this, f1_score(predictions[sample_idx], answer["text"]))
                        exact_score_this = max(exact_score_this, exact_match_score(predictions[sample_idx], answer["text"]))
                    f1_score_total += f1_score_this
                    exact_score_total += exact_score_this
                    sample_idx += 1

    f1_score_avg = f1_score_total / len(predictions) * 100
    exact_score_avg = exact_score_total / len(predictions) * 100
    return (exact_score_avg, f1_score_avg)

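# Hedged sketch (assumption, not the f1_score helper imported above): in the
# official SQuAD evaluation, F1 is a bag-of-tokens overlap between prediction
# and reference. A minimal version, omitting the official answer normalization
# (lowercasing, stripping punctuation and articles):
def _token_f1_sketch(prediction, ground_truth):
    from collections import Counter
    pred_tokens = prediction.split()
    gold_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
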
def infer_decoder(self):
    batch_idx = 0
    max_seq_length = 1152 // 2
    (infer_ndtype, dtype_esize) = get_dtype_info(self.args.input_dtype)
    self.outputs = []
    for image_idx in range(0, self.num_samples, self.batch_size):
        # Actual batch size might be smaller than max batch size
        actual_batch_size = self.batch_size if image_idx + self.batch_size <= self.num_samples \
            else self.num_samples - image_idx
        start_time = time.time()
        input_port = np.ascontiguousarray(np.random.randint(
            0, high=self.hyperP.labels_size, size=(actual_batch_size, 1)), dtype=np.int32)

        # Iterate over sequence ids; each step's argmax is fed back as the
        # next step's input.
        for seq_id in range(max_seq_length):
            predictions = self.decoder._run_decoder([input_port], seq_id, actual_batch_size)
            predictions = predictions.reshape(
                (actual_batch_size, self.hyperP.decoder_hidden_size))
            winners = np.argmax(predictions, axis=1)
            self.outputs.extend(winners[:actual_batch_size])
            input_port = np.minimum(winners, self.hyperP.labels_size - 1)
            input_port = np.ascontiguousarray(input_port.reshape(
                (actual_batch_size, 1)), dtype=np.int32)

        logging.info("Batch {:d} (Size {:}) >> Inference time: {:f}".format(
            batch_idx, actual_batch_size, time.time() - start_time))
        batch_idx += 1

def add_fc(self):
    """Add FC layer."""
    logging.info("Adding FC layer")

    # Fetch some attrs from old fc1000; note MatMul doesn't have bias
    old_fc_op = [_n for _n in self.graph.nodes if _n.name == "fc1000"][0]
    old_fc_kernel = old_fc_op.inputs[1]
    fc_kernel_weights = old_fc_kernel.values[:, 1:]

    # Instantiate FC weight
    # NOTE: expects KM weight, if transpose is not set (default not set)
    fc_weight = gs.Constant("fc_replaced_weight", values=fc_kernel_weights)

    # Find input to the FC to be added
    squeeze_replaced_op = [_n for _n in self.graph.nodes if _n.name == "squeeze_replaced"][0]
    squeeze_replaced_out = squeeze_replaced_op.outputs[0]

    # Reshape input
    reshape_shape = np.array([-1, fc_kernel_weights.shape[0]], dtype=np.int64)
    fc_reshape_shape = gs.Constant("fc_reshape_shape", values=reshape_shape)

    # Add FC: Reshape => MatMul
    fc_reshape_out = self.graph.Reshape("fc_reshape_input", squeeze_replaced_out, fc_reshape_shape)
    fc_out = self.graph.MatMul("fc_replaced", fc_reshape_out, fc_weight)

def _generate_harness_object(config, profile):
    # Refactors harness generation for use by functions other than handle_run_harness
    benchmark_name = config['benchmark']
    if config.get("use_triton"):
        from code.common.server_harness import TritonHarness
        harness = TritonHarness(config, name=benchmark_name)
        config["inference_server"] = "triton"
    elif benchmark_name == BENCHMARKS.BERT:
        from code.bert.tensorrt.harness import BertHarness
        harness = BertHarness(config, name=benchmark_name)
        config["inference_server"] = "custom"
    elif benchmark_name == BENCHMARKS.DLRM:
        from code.dlrm.tensorrt.harness import DLRMHarness
        harness = DLRMHarness(config, name=benchmark_name)
        config["inference_server"] = "custom"
    elif benchmark_name == BENCHMARKS.RNNT:
        from code.rnnt.tensorrt.harness import RNNTHarness
        harness = RNNTHarness(config, name=benchmark_name)
        config["inference_server"] = "custom"
    else:
        from code.common.lwis_harness import LWISHarness
        harness = LWISHarness(config, name=benchmark_name)

    # Attempt to run profiler. Note that this is only available internally.
    if profile is not None:
        try:
            from code.internal.profiler import ProfilerHarness
            harness = ProfilerHarness(harness, profile)
        except BaseException:
            logging.info("Could not load profiler: Are you an internal user?")

    return harness, config

def infer_joint(self):
    (infer_ndtype, dtype_esize) = get_dtype_info(self.args.input_dtype)
    batch_idx = 0
    self.outputs = []
    for image_idx in range(0, self.num_samples, self.batch_size):
        # Actual batch size might be smaller than max batch size
        actual_batch_size = self.batch_size if image_idx + self.batch_size <= self.num_samples \
            else self.num_samples - image_idx
        start_time = time.time()
        max_seq_length = 1152 + 1152 // 2  # U=1152//2 + T=1152 FIXME
        encoder_input_size = self.hyperP.encoder_hidden_size
        decoder_input_size = self.hyperP.decoder_hidden_size
        for seq_idx in range(max_seq_length):
            # Input ports
            enc_input_port = np.ascontiguousarray(np.random.rand(
                actual_batch_size, 1, encoder_input_size), dtype=infer_ndtype)
            dec_input_port = np.ascontiguousarray(np.random.rand(
                actual_batch_size, 1, decoder_input_size), dtype=infer_ndtype)
            inputs = [enc_input_port, dec_input_port]
            outputs = self.joint(inputs, actual_batch_size)
            self.outputs.extend(outputs[0][:actual_batch_size])
        logging.info("Batch {:d} (Size {:}) >> Inference time: {:f}".format(
            batch_idx, actual_batch_size, time.time() - start_time))
        batch_idx += 1

def convert(self, image_list):
    for idx, img_file in enumerate(image_list):
        logging.info("Processing image No.{:d}/{:d}...".format(idx, len(image_list)))
        output_files = [self.get_filename(fmt, img_file) for fmt in self.run_formats]
        if all([os.path.exists(i) for i in output_files]) and not self.overwrite:
            logging.info("Skipping {:} because it already exists.".format(img_file))
            continue
        image_fp32 = self.loader(os.path.join(self.src_dir, img_file))
        if "fp32" in self.run_formats:
            np.save(self.get_filename("fp32", img_file), image_fp32)
        image_int8_linear = self.quantizer(image_fp32)
        if "int8_linear" in self.run_formats:
            np.save(self.get_filename("int8_linear", img_file), image_int8_linear)
        image_int8_chw4 = self.linear_to_chw4(image_int8_linear)
        if "int8_chw4" in self.run_formats:
            np.save(self.get_filename("int8_chw4", img_file), image_int8_chw4)

def copy_default_engine(benchmark):
    new_path = benchmark._get_engine_name(None, None)  # Use default values
    benchmark.config_ver = "default"
    default_path = benchmark._get_engine_name(None, None)
    logging.info("Copying {:} to {:}".format(default_path, new_path))
    shutil.copyfile(default_path, new_path)

def main():
    args = common_args.parse_args(common_args.ACCURACY_ARGS)
    logging.info("Running accuracy test...")
    run_SSDResNet34_accuracy(args["engine_file"], args["batch_size"], args["num_samples"],
                             verbose=args["verbose"])

def dump_embedding_weights_to_binary_file(self):
    logging.info("Writing quantized embedding weights to " + self.embedding_weights_binary_filepath)
    with open(self.embedding_weights_binary_filepath, 'wb') as f:
        f.write(struct.pack('i', self.num_features))

        # Calculate the maximum absolute value of embedding weights for each table
        mults = np.ndarray(shape=(self.num_features))
        for feature_id in range(self.num_features):
            weight_tensor_name = "emb_l." + str(feature_id) + ".weight"
            embeddings = self.weights[weight_tensor_name].numpy()
            maxAbsVal = abs(max(embeddings.max(), embeddings.min(), key=abs))
            mults[feature_id] = 127.5 / maxAbsVal
            embeddingsScale = 1.0 / mults[feature_id]
            f.write(struct.pack('f', embeddingsScale))

        for feature_id in range(self.num_features):
            weight_tensor_name = "emb_l." + str(feature_id) + ".weight"
            embeddings = self.weights[weight_tensor_name].numpy()
            if embeddings.shape[0] != self.embedding_rows[feature_id]:
                raise IOError("Expected " + str(self.embedding_rows[feature_id]) + " embedding rows, but got "
                              + str(embeddings.shape[0]) + " rows for feature " + str(feature_id))
            embeddingsQuantized = np.minimum(np.maximum(
                np.rint(np.multiply(embeddings, mults[feature_id])), -127), 127).astype('int8')

            # Remove the embedding weights, we don't need them any longer
            del self.weights[weight_tensor_name]

            # Write quantized embeddings to file
            embeddingsQuantized.tofile(f)

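# Hedged worked example (illustration only): the loop above performs symmetric
# int8 quantization with a per-table scale of 127.5 / max|w|, rounding and
# clamping to [-127, 127]; multiplying by embeddingsScale = 1 / scale
# approximately reconstructs the weights.
def _demo_embedding_quantization():
    import numpy as np
    weights = np.array([-0.8, 0.0, 0.4], dtype=np.float32)
    mult = 127.5 / np.abs(weights).max()
    q = np.clip(np.rint(weights * mult), -127, 127).astype(np.int8)
    dequant = q.astype(np.float32) / mult
    # Worst-case error is one quantization step (rounding plus edge clamping).
    assert np.allclose(dequant, weights, atol=1.0 / mult)
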
def cleanup():
    """Delete files for audit cleanup."""
    tmp_files = ["audit.config", "verify_accuracy.txt", "verify_performance.txt",
                 "mlperf_log_accuracy_baseline.json", "accuracy.txt", "predictions.json"]
    for fname in tmp_files:
        if os.path.exists(fname):
            logging.info('Audit cleanup: Removing file {}'.format(fname))
            os.remove(fname)

def turn_on_mps(active_sms):
    # MPS is not used on Xavier; on other systems, restart the daemon with the
    # requested active SM percentage.
    if not is_xavier():
        turn_off_mps()
        cmd = "export CUDA_MPS_ACTIVE_THREAD_PERCENTAGE={:d} && nvidia-cuda-mps-control -d".format(active_sms)
        logging.info("Turn on MPS with active_sms = {:d}.".format(active_sms))
        run_command(cmd)

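# Hedged sketch (assumption; the original turn_off_mps is not shown here): the
# counterpart typically stops the MPS control daemon by piping "quit" into
# nvidia-cuda-mps-control, ignoring the failure when no daemon is running.
def _turn_off_mps_sketch():
    import subprocess
    subprocess.run("echo quit | nvidia-cuda-mps-control", shell=True, check=False)
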
def run_dlrm_accuracy(engine_file, batch_size, num_pairs=10000000, verbose=False):
    if verbose:
        logging.info("Running DLRM accuracy test with:")
        logging.info("    engine_file: {:}".format(engine_file))
        logging.info("    batch_size: {:}".format(batch_size))
        logging.info("    num_pairs: {:}".format(num_pairs))

    runner = EngineRunner(engine_file, verbose=verbose)
    pair_dir = os.path.join(os.getenv("PREPROCESSED_DATA_DIR", "build/preprocessed_data"),
                            "criteo", "full_recalib")

    input_dtype, input_format = get_input_format(runner.engine)
    if input_dtype == trt.DataType.FLOAT:
        format_string = "fp32"
    elif input_dtype == trt.DataType.HALF:
        format_string = "fp16"
    elif input_dtype == trt.DataType.INT8:
        format_string = "int8"
        if input_format == trt.TensorFormat.CHW4:
            format_string += "_chw4"
    else:
        raise NotImplementedError("Unsupported DataType {:}".format(input_dtype))

    numerical_inputs = np.load(os.path.join(pair_dir, "numeric_{:}.npy".format(format_string)))
    categ_inputs = np.load(os.path.join(pair_dir, "categorical_int32.npy"))

    predictions = []
    refs = []
    batch_idx = 0
    for pair_idx in range(0, int(num_pairs), batch_size):
        actual_batch_size = batch_size if pair_idx + batch_size <= num_pairs else num_pairs - pair_idx
        numerical_input = np.ascontiguousarray(numerical_inputs[pair_idx:pair_idx + actual_batch_size])
        categ_input = np.ascontiguousarray(categ_inputs[pair_idx:pair_idx + actual_batch_size])

        start_time = time.time()
        outputs = runner([numerical_input, categ_input], actual_batch_size)
        if verbose:
            logging.info("Batch {:d} (Size {:}) >> Inference time: {:f}".format(
                batch_idx, actual_batch_size, time.time() - start_time))

        predictions.extend(outputs[0][:actual_batch_size])
        batch_idx += 1

    ground_truths = np.load(os.path.join(pair_dir, "ground_truth.npy"))[:num_pairs].tolist()
    return evaluate(ground_truths, predictions)

def __init__(self, args):
    """Set up the config and calibrator for DLRM. Does not initialize."""
    workspace_size = dict_get(args, "workspace_size", default=(4 << 30))
    logging.info("Using workspace size: {:,}".format(workspace_size))
    super().__init__(args, name=BENCHMARKS.DLRM, workspace_size=workspace_size)

    with open("code/dlrm/tensorrt/mlperf_40m.limit.json") as f:
        self.dlrm_config = json.load(f)
    logging.info("DLRM config: {:}".format(self.dlrm_config))

    self.num_numerical_inputs = self.dlrm_config["num_numerical_features"]
    self.num_features = len(self.dlrm_config["categorical_feature_sizes"])
    self.num_interactions = (self.num_features + 1) * self.num_features // 2
    self.embedding_size = self.dlrm_config["embedding_dim"]
    self.embedding_rows = self.dlrm_config["categorical_feature_sizes"]
    self.embedding_rows_bound = 40000000
    self.embedding_rows = [min(i, self.embedding_rows_bound) for i in self.embedding_rows]
    self.embedding_rows_total = np.sum(np.array(self.embedding_rows))
    self.bottom_mlp_channels = self.dlrm_config["bottom_mlp_sizes"]
    self.bottom_mlp_names = ["bot_l.0", "bot_l.2", "bot_l.4"]
    self.output_padding = self.args.get("output_padding_granularity", 32)
    self.top_mlp_input_size = (self.num_interactions + self.embedding_size + self.output_padding - 1) \
        // self.output_padding * self.output_padding
    self.top_mlp_channels = self.dlrm_config["top_mlp_sizes"]
    self.top_mlp_names = ["top_l.0", "top_l.2", "top_l.4", "top_l.6", "top_l.8"]

    self.model_filepath = "build/models/dlrm/tb00_40M.pt"
    self.embedding_weights_binary_filepath = "build/models/dlrm/40m_limit/dlrm_embedding_weights_int8_v3.bin"
    self.model_without_embedding_weights_filepath = "build/models/dlrm/40m_limit/model_test_without_embedding_weights_v3.pt"
    self.row_frequencies_binary_filepath = "build/models/dlrm/40m_limit/row_frequencies.bin"
    self.row_frequencies_src_dir = "build/models/dlrm/40m_limit/row_freq"

    self.embedding_weights_on_gpu_part = self.args.get("embedding_weights_on_gpu_part", 1.0)
    self.use_row_frequencies = True if self.embedding_weights_on_gpu_part < 1.0 else False
    self.num_profiles = self.args.get("gpu_inference_streams", 1)
    self.use_small_tile_gemm_plugin = self.args.get("use_small_tile_gemm_plugin", False)
    self.gemm_plugin_fairshare_cache_size = self.args.get("gemm_plugin_fairshare_cache_size", -1)
    self.enable_interleaved_top_mlp = self.args.get("enable_interleaved_top_mlp", False)

    if self.precision == "fp16":
        self.apply_flag(trt.BuilderFlag.FP16)
    elif self.precision == "int8":
        self.apply_flag(trt.BuilderFlag.INT8)

    if self.precision == "int8":
        # Get calibrator variables
        calib_batch_size = dict_get(self.args, "calib_batch_size", default=512)
        calib_max_batches = dict_get(self.args, "calib_max_batches", default=500)
        force_calibration = dict_get(self.args, "force_calibration", default=False)
        cache_file = dict_get(self.args, "cache_file", default="code/dlrm/tensorrt/calibrator.cache")
        preprocessed_data_dir = dict_get(self.args, "preprocessed_data_dir", default="build/preprocessed_data")
        calib_data_dir = os.path.join(preprocessed_data_dir, "criteo/full_recalib/val_data_128000")

        # Set up calibrator
        self.calibrator = DLRMCalibrator(calib_batch_size=calib_batch_size,
                                         calib_max_batches=calib_max_batches,
                                         force_calibration=force_calibration,
                                         cache_file=cache_file,
                                         data_dir=calib_data_dir)
        self.builder_config.int8_calibrator = self.calibrator
        self.cache_file = cache_file
        self.need_calibration = force_calibration or not os.path.exists(cache_file)
    else:
        self.need_calibration = False

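# Hedged arithmetic check (illustration only; 26 categorical features and
# embedding_dim = 128 are the usual Criteo/DLRM values read from the config
# file above): top_mlp_input_size rounds (num_interactions + embedding_size)
# up to the next multiple of output_padding. With 26 features,
# num_interactions = 27 * 26 // 2 = 351, so the padded size is
# (351 + 128 + 31) // 32 * 32 = 480.
assert (351 + 128 + 32 - 1) // 32 * 32 == 480
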
def main():
    # Parse arguments to identify the data directory with the input images
    # and the output directory for the preprocessed images.
    # The data directory is assumed to have the following structure:
    # <data_dir>
    #  └── imagenet
    # And the output directory will have the following structure:
    # <preprocessed_data_dir>
    #  └── imagenet
    #      └── ResNet50
    #          ├── fp32
    #          └── int8_linear
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir", "-d",
        help="Specifies the directory containing the input images.",
        default="build/data"
    )
    parser.add_argument(
        "--preprocessed_data_dir", "-o",
        help="Specifies the output directory for the preprocessed data.",
        default="build/preprocessed_data"
    )
    parser.add_argument(
        "--formats", "-t",
        help="Comma-separated list of formats. Choices: fp32, int8_linear, int8_chw4.",
        default="default"
    )
    parser.add_argument(
        "--overwrite", "-f",
        help="Overwrite existing files.",
        action="store_true"
    )
    parser.add_argument(
        "--cal_only",
        help="Only preprocess calibration set.",
        action="store_true"
    )
    parser.add_argument(
        "--val_only",
        help="Only preprocess validation set.",
        action="store_true"
    )
    args = parser.parse_args()
    data_dir = args.data_dir
    preprocessed_data_dir = args.preprocessed_data_dir
    formats = args.formats.split(",")
    overwrite = args.overwrite
    cal_only = args.cal_only
    val_only = args.val_only
    default_formats = ["int8_linear"]

    # Now, actually preprocess the input images
    logging.info("Loading and preprocessing images. This might take a while...")
    if args.formats == "default":
        formats = default_formats
    preprocess_imagenet_for_resnet50(data_dir, preprocessed_data_dir, formats,
                                     overwrite, cal_only, val_only)
    logging.info("Preprocessing done.")

def main():
    args = common_args.parse_args(common_args.ACCURACY_ARGS)
    logging.info("Running accuracy test...")
    acc = run_dlrm_accuracy(args["engine_file"], args["batch_size"], args["num_samples"],
                            verbose=args["verbose"])
    logging.info("Accuracy: {:}".format(acc))

def copy_default_engine(benchmark):
    """Copy engine file from default path to new path."""
    new_path = benchmark._get_engine_fpath(None, None)  # Use default values
    benchmark.config_ver = "default"
    default_path = benchmark._get_engine_fpath(None, None)
    logging.info("Copying {:} to {:}".format(default_path, new_path))
    shutil.copyfile(default_path, new_path)

def remove_obsolete(self):
    """Remove obsolete layers."""
    logging.info("Removing obsolete layers")
    topk_op = [_n for _n in self.graph.nodes if _n.name == "topk_layer"][0]
    self.graph.outputs = topk_op.outputs
    self.cleanup_graph()

def print_match(pattern, match):
    for node in pattern:
        key = node["name"]
        value = match[key]
        if isinstance(value, trt.ILayer):
            logging.info(key + "=" + value.name)
        else:
            logging.info(key + "=" + value.__str__())

def verify_test01(harness):
    # Compute path to results dir
    script_path = 'build/inference/compliance/nvidia/TEST01/run_verification.py'
    results_path = os.path.join('results', harness.get_system_name(),
                                harness._get_submission_benchmark_name(), harness.scenario)
    logging.info('AUDIT HARNESS: ' + results_path + '/accuracy' + '\n' + results_path + '/performance')
    verification_command = 'python3 {} --results={} --compliance={} --output_dir={}'.format(
        script_path, results_path, harness.get_full_log_dir(), harness.get_full_log_dir())
    return run_command(verification_command, get_output=True)

def handle_calibrate(config):
    benchmark_name = config["benchmark"]
    logging.info("Generating calibration cache for Benchmark \"{:}\"".format(benchmark_name))
    config = apply_overrides(config, common_args.CALIBRATION_ARGS)
    config["dla_core"] = None
    config["force_calibration"] = True
    b = get_benchmark(config)
    b.calibrate()

def get_engine_info(self):
    if self.verbose:
        logging.info("Loading engine to get engine info")

    def extract_dtype(s):
        if "INT8" in s:
            return "TYPE_INT8"
        elif "FP32" in s:
            return "TYPE_FP32"
        elif "INT32" in s:
            return "TYPE_INT32"
        elif "FP16" in s:
            return "TYPE_FP16"
        else:
            raise ValueError("Data type must be INT8, FP16, FP32, or INT32, got {:}".format(s))

    format_rgx = re.compile(r"\(k[A-Z]+[0-9]*\)")

    # EngineRunner is the convention to load engines
    plugins = None
    if self.name in plugin_map:
        plugins = plugin_map[self.name]
        for plugin in plugins:
            self.check_file_exists(plugin)
    runner = EngineRunner(self.gpu_engine, verbose=self.verbose, plugins=plugins)

    inputs = []
    outputs = []
    # FIXME exploit the use of optimization profile if needed
    num_profiles = runner.engine.num_optimization_profiles
    num_bindings_per_profile = runner.engine.num_bindings // num_profiles
    has_dynamic_shape = False
    for idx in range(num_bindings_per_profile):
        tensor = {}
        tensor["name"] = runner.engine.get_binding_name(idx)
        binding_shape = runner.engine.get_binding_shape(idx)
        if -1 in binding_shape:
            tensor["dims"] = binding_shape[1:]
            has_dynamic_shape = True
        else:
            tensor["dims"] = binding_shape
        tensor["format"] = runner.engine.get_binding_format_desc(idx)
        tensor["dtype"] = extract_dtype(tensor["format"])
        match = format_rgx.search(tensor["format"])
        if match is None:
            raise ValueError("Invalid input format: {:}".format(tensor["format"]))
        tensor["dformat"] = match.group(0).strip("()")
        if runner.engine.binding_is_input(idx):
            inputs.append(tensor)
        else:
            outputs.append(tensor)

    is_static = not has_dynamic_shape and not runner.engine.has_implicit_batch_dimension

    # Clean up runner
    del runner

    return (inputs, outputs, [0], is_static)

def autosinian_optimize(self):
    logging.info("Applying AutoSinian Optimization...")
    optimize_points = [(10, 15), (21, 26), (27, 32), (38, 43), (44, 49), (55, 60),
                       (61, 66), (67, 72), (78, 83), (84, 89), (90, 95), (0, 4),
                       (5, 9), (16, 20), (33, 37), (50, 54), (73, 77), (96, 100)]
    optimizer = AutoSinian_Optimizer(self.cache_file)
    for point in optimize_points:
        optimizer.optimize(self.network, point)

def __init__(self, args):
    workspace_size = dict_get(args, "workspace_size", default=(5 << 30))
    logging.info("Using workspace size: {:,}".format(workspace_size))
    super().__init__(args, name=BENCHMARKS.BERT, workspace_size=workspace_size)

    self.bert_config_path = "code/bert/tensorrt/bert_config.json"
    self.seq_len = 384  # default sequence length
    self.batch_size = dict_get(args, "batch_size", default=1)

    self.num_profiles = 1
    if 'gpu_inference_streams' in args:
        # Use gpu_inference_streams to determine the number of duplicated profiles
        # in the engine when not using lwis mode
        self.num_profiles = args['gpu_inference_streams']

    self.is_int8 = args['precision'] == 'int8'
    if self.is_int8:
        self.model_path = dict_get(args, "model_path",
                                   default="build/models/bert/bert_large_v1_1_fake_quant.onnx")
    else:
        self.model_path = dict_get(args, "model_path",
                                   default="build/models/bert/bert_large_v1_1.onnx")

    self.bert_config = BertConfig(self.bert_config_path)

    self.enable_interleaved = False
    if self.is_int8 and 'enable_interleaved' in args:
        self.enable_interleaved = args['enable_interleaved']

    # Small-Tile GEMM Plugin
    # Since it doesn't support interleaved format, the two options are mutually exclusive
    self.use_small_tile_gemm_plugin = self.args.get("use_small_tile_gemm_plugin", False)
    self.gemm_plugin_fairshare_cache_size = self.args.get("gemm_plugin_fairshare_cache_size", -1)
    if self.enable_interleaved and self.use_small_tile_gemm_plugin:
        assert False, "Small-Tile GEMM Plugin doesn't support interleaved format."

    # Query system id for architecture
    self.system = get_system()
    self.gpu_arch = self.system.arch

    if self.batch_size > 512:
        # Tactic selection is limited at very large batch sizes
        self.builder_config.max_workspace_size = 7 << 30
    if 'nx' in self.system.gpu.lower():
        # Use 1GB only for Xavier NX
        self.builder_config.max_workspace_size = 1 << 30

def print_matches(pattern, matches):
    matchNumber = 1
    if isinstance(matches, list):
        for match in matches:
            logging.info("Match number: {:d}".format(matchNumber))
            network_search.print_match(pattern, match)
            logging.info("")
            matchNumber = matchNumber + 1
    else:
        print_match(pattern, matches)

def rename_ops(self):
    """Rename op names as in self.op_name_map."""
    logging.info("Renaming layers")
    for node in self.graph.nodes:
        if node.name in self.op_name_map:
            new_name = self.op_name_map[node.name]
            # logging.info("Renaming layer: {} -> {}".format(node.name, new_name))
            node.name = new_name

def infer(args):
    hyperParam = RnnHyperParam(args)
    runner = RNNTRunner(args, hyperParam)
    logging.info("Start running inference -- topology : {:}".format(args.topology))
    start = time.time()
    runner.infer()
    end = time.time()
    elapsed = end - start
    logging.info("Inference takes {:f} secs. Throughput = {:f}/s".format(
        elapsed, args.num_samples / elapsed))

def __init__(self, args):
    workspace_size = dict_get(args, "workspace_size", default=(1 << 30))
    logging.info("Use workspace_size: {:}".format(workspace_size))
    super().__init__(args, name=BENCHMARKS.ResNet50, workspace_size=workspace_size)

    # Model path
    self.model_path = dict_get(args, "model_path",
                               default="code/resnet50/tensorrt/ofa_autosinian_is176.onnx")
    logging.info("Using AutoSinian optimized once-for-all network")

    self.cache_file = None
    self.need_calibration = False

    if self.precision == "int8":
        # Get calibrator variables
        calib_batch_size = dict_get(self.args, "calib_batch_size", default=1)
        calib_max_batches = dict_get(self.args, "calib_max_batches", default=500)
        force_calibration = dict_get(self.args, "force_calibration", default=False)
        cache_file = dict_get(self.args, "cache_file",
                              default="code/resnet50/tensorrt/calibrator.cache")
        preprocessed_data_dir = dict_get(self.args, "preprocessed_data_dir",
                                         default="build/preprocessed_data")
        calib_data_map = dict_get(self.args, "calib_data_map",
                                  default="data_maps/imagenet/cal_map.txt")
        calib_image_dir = os.path.join(preprocessed_data_dir, "imagenet/ResNet50/fp32")

        # Set up calibrator
        self.calibrator = RN50Calibrator(calib_batch_size=calib_batch_size,
                                         calib_max_batches=calib_max_batches,
                                         force_calibration=force_calibration,
                                         cache_file=cache_file,
                                         image_dir=calib_image_dir,
                                         calib_data_map=calib_data_map)
        self.builder_config.int8_calibrator = self.calibrator
        self.cache_file = cache_file
        self.need_calibration = force_calibration or not os.path.exists(cache_file)

def load(audit_test, benchmark):
    # Calculates path to audit.config
    src_config = os.path.join('build/inference/compliance/nvidia', audit_test, benchmark, 'audit.config')
    logging.info('AUDIT HARNESS: Looking for audit.config in {}...'.format(src_config))
    if not os.path.isfile(src_config):
        # For tests that have one central audit.config instead of per-benchmark
        src_config = os.path.join('build/inference/compliance/nvidia', audit_test, 'audit.config')
        logging.info('AUDIT HARNESS: Search failed. Looking for audit.config in {}...'.format(src_config))

    # Destination is audit.config in the current working directory
    dest_config = 'audit.config'

    # Copy the file
    shutil.copyfile(src_config, dest_config)
    return dest_config

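# Hedged usage sketch (TEST01/resnet50 are example values, not prescribed):
#
#   dest = load("TEST01", "resnet50")   # stage audit.config into the CWD
#   ...                                 # run the harness under audit
#   cleanup()                           # remove audit.config and audit logs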