# DLRM harness ("pytorch-native-dlrm"): one consumer process per socket.
def main():
    global num_sockets
    global start_time
    global item_total
    global last_timeing

    args = get_args()
    log.info(args)

    config = os.path.abspath(args.config)
    user_config = os.path.abspath(args.user_config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)
    if not os.path.exists(user_config):
        log.error("{} not found".format(user_config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    # Communication channels between loadgen, the consumers, and the
    # response thread
    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    total_samples = multiprocessing.Value("i", 0)
    dsQueue = multiprocessing.Queue()
    outQueue = multiprocessing.Queue()
    inQueue = multiprocessing.JoinableQueue(num_sockets * 4)
    consumers = [
        Consumer(inQueue, outQueue, dsQueue, lock, init_counter,
                 total_samples, i, args)
        for i in range(num_sockets)
    ]
    for c in consumers:
        c.start()

    # Wait until all consumer subprocesses are ready
    while init_counter.value < num_sockets:
        time.sleep(2)

    # Start the response thread that feeds completions back to loadgen
    response_worker = threading.Thread(
        target=response_loadgen, args=(outQueue, args.accuracy))
    response_worker.daemon = True
    response_worker.start()

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](inQueue, max_batchsize=args.max_batchsize)

    def issue_queries(response_ids, query_sample_indexes):
        runner.enqueue(response_ids, query_sample_indexes)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # Called by loadgen to report the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model, args.scenario)
    settings.FromConfig(user_config, args.model, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
        settings.performance_sample_count_override = total_samples.value
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance
    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration
    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)
    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries
    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream
    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    def load_query_samples(sample_list):
        # Hand the sample list to every consumer, then wait until each one
        # reports it has finished loading before starting the clock.
        global start_time
        for _ in range(num_sockets):
            dsQueue.put(sample_list)
        while init_counter.value < 2 * num_sockets:
            time.sleep(2)
        start_time = time.time()

    def unload_query_samples(sample_list):
        pass

    # Imported here, after the consumer processes have been started
    import torch
    import criteo

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(
        total_samples.value,
        min(total_samples.value, args.samples_per_query_offline),
        load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "roc_auc": 0,
                   "scenario": str(scenario)}
    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = item_timing
    if args.accuracy:
        result_dict["good"] = item_good
        result_dict["total"] = item_total
        result_dict["roc_auc"] = criteo.auc_score(item_results)

    final_results = {
        "runtime": "pytorch-native-dlrm",
        "version": torch.__version__,
        "time": int(time.time()),
        "cmdline": str(args),
    }
    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing, time.time() - start_time, args.accuracy)

    # Drain the input queue, then shut down the consumers and the response
    # thread with None sentinels
    inQueue.join()
    for _ in consumers:
        inQueue.put(None)
    for c in consumers:
        c.join()
    outQueue.put(None)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    # Write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
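# ---------------------------------------------------------------------------
# response_loadgen() is referenced above but not defined in this listing.
# A minimal sketch of what such a worker might look like, assuming the
# consumers put (query_ids, results) pairs on outQueue and None as the
# shutdown sentinel; the real implementation also accumulates item_good,
# item_total, and item_results for accuracy runs.
# ---------------------------------------------------------------------------
def response_loadgen(out_queue, accuracy):
    import array
    import numpy as np
    import mlperf_loadgen as lg
    while True:
        item = out_queue.get()
        if item is None:  # sentinel posted by main() during shutdown
            break
        query_ids, results = item
        for qid, result in zip(query_ids, results):
            # loadgen copies raw bytes, so serialize each prediction first
            buf = array.array("B", np.asarray(result, np.float32).tobytes())
            addr, length = buf.buffer_info()
            lg.QuerySamplesComplete([lg.QuerySampleResponse(qid, addr, length)])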
# ResNet50 harness: multi-instance ImageNet benchmark driven by shared queues.
def main():
    global num_ins
    global num_phy_cpus
    global in_queue_cnt
    global out_queue_cnt

    args = get_args()
    log.info(args)

    num_ins = args.num_instance
    num_phy_cpus = args.num_phy_cpus
    log.info('Run with {} instances on {} cpus'.format(num_ins, num_phy_cpus))

    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)
    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    image_format = 'NCHW'
    dataset = "imagenet"
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        cache_dir=args.cache_dir,
                        count=args.count,
                        use_int8=args.use_int8_dataset,
                        num_workers=num_phy_cpus,
                        **kwargs)

    # Establish communication queues
    log.info('Start consumer queue and response thread')
    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    in_queue = multiprocessing.JoinableQueue()
    out_queue = multiprocessing.Queue()
    ds_queue = multiprocessing.Queue()

    # Start consumers
    consumers = [Consumer(in_queue, out_queue, ds_queue, lock, init_counter, i, args)
                 for i in range(num_ins)]
    for c in consumers:
        c.start()

    # Wait until all sub-processes are ready
    block_until(init_counter, num_ins, 2)

    # Start response thread
    response_worker = threading.Thread(
        target=response_loadgen, args=(out_queue,))
    response_worker.daemon = True
    response_worker.start()

    scenario = SCENARIO_MAP[args.scenario]
    runner = QueueRunner(in_queue, args.batch_size)

    def issue_queries(response_ids, query_sample_indexes):
        runner.put(response_ids, query_sample_indexes)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        log.info("Average latency: {}".format(np.mean(latencies_ns)))
        log.info("Median latency: {}".format(np.percentile(latencies_ns, 50)))
        log.info("90 percentile latency: {}".format(np.percentile(latencies_ns, 90)))

    def load_query_samples(sample_list):
        for _ in range(num_ins):
            ds_queue.put(sample_list)
        block_until(init_counter, 2 * num_ins, 2)

    def unload_query_samples(sample_list):
        pass

    settings = lg.TestSettings()
    settings.FromConfig(mlperf_conf, "resnet50", args.scenario)
    settings.FromConfig(user_conf, "resnet50", args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.qps:
        qps = float(args.qps)
        settings.server_target_qps = qps
        settings.offline_expected_qps = qps

    count = ds.get_item_count()
    perf_count = 1024  # loadgen performance sample count
    if args.accuracy:
        perf_count = count

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, perf_count, load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    lg.StartTest(sut, qsl, settings)

    # Wait until the out_queue has drained every issued sample
    while out_queue_cnt < in_queue_cnt:
        time.sleep(0.2)

    in_queue.join()
    for i in range(num_ins):
        in_queue.put('DONE')
    for c in consumers:
        c.join()
    out_queue.put('DONE')

    if args.accuracy:
        output_file = 'accuracy.txt'
        if args.output_file:
            output_file = args.output_file
        cmd = "python tools/accuracy-imagenet.py " \
              "--mlperf-accuracy-file=mlperf_log_accuracy.json " \
              "--imagenet-val-file=val_map.txt --output-file={}".format(output_file)
        cmd = cmd.split(' ')
        subprocess.check_call(cmd)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)
    log.info('Test done.')
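# ---------------------------------------------------------------------------
# block_until() is not shown in this listing; a plausible sketch inferred
# from the call sites above, where block_until(init_counter, num_ins, 2)
# polls a shared counter every 2 seconds until num_ins workers are ready.
# ---------------------------------------------------------------------------
def block_until(counter, target, interval):
    import time
    while counter.value < target:
        time.sleep(interval)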
# DLRM multi-process harness: partitions cores into per-instance slices
# across sockets, with dedicated response threads per output queue.
def main():
    global qcount
    global num_sockets
    global cpus_per_socket
    global cpus_per_process
    global cpus_per_instance
    global total_instances
    global start_time
    global item_total
    global last_timeing

    args = get_args()
    log.info(args)

    config = os.path.abspath(args.config)
    user_config = os.path.abspath(args.user_config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)
    if not os.path.exists(user_config):
        log.error("{} not found".format(user_config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        if os.path.exists("./audit.config"):
            copyfile("./audit.config", output_dir + "/audit.config")
        os.chdir(output_dir)

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model, args.scenario)
    settings.FromConfig(user_config, args.model, args.scenario)
    settings.mode = lg.TestMode.PerformanceOnly

    # Reserve cores for loadgen; if cores are left over after dividing the
    # sockets among the worker processes, give them all to loadgen and let
    # the first instance start at core 0.
    cpus_for_loadgen = 1
    left_cores = cpus_per_socket * num_sockets - total_procs * cpus_per_process
    first_instance_start_core = cpus_for_loadgen
    if left_cores > cpus_for_loadgen:
        first_instance_start_core = 0
        cpus_for_loadgen = left_cores

    # Count how many inference instances fit; the first process may host one
    # fewer (or a half-width extra) instance when it cedes cores to loadgen.
    total_instances = 0
    instances_per_proc = cpus_per_process // cpus_per_instance
    for i in range(total_procs):
        if i == 0 and first_instance_start_core > 0:
            total_instances += (cpus_per_process - first_instance_start_core) // cpus_per_instance
            if (cpus_per_instance - first_instance_start_core) >= (cpus_per_instance // 2):
                total_instances += 1
        else:
            total_instances += instances_per_proc
    #print("Setup {} Instances !!".format(total_instances))

    lock = multiprocessing.Lock()
    barrier = multiprocessing.Barrier(total_instances)
    init_counter = multiprocessing.Value("i", 0)
    total_samples = multiprocessing.Value("i", 0)
    finished_samples = multiprocessing.Value("i", 0)
    dsQueue = multiprocessing.Queue()
    numOutQ = num_sockets
    outQueues = [multiprocessing.Queue() for i in range(numOutQ)]
    #inQueue = multiprocessing.JoinableQueue()
    inQueue = multiprocessing.Queue()
    consumers = [Consumer(inQueue, outQueues[i % numOutQ], dsQueue, lock,
                          init_counter, finished_samples, barrier, i, args,
                          settings.min_query_count, first_instance_start_core)
                 for i in range(total_procs)]
    for c in consumers:
        c.start()

    # Wait until all consumer subprocesses are ready
    while init_counter.value < total_procs:
        time.sleep(2)

    # Imported here, after the consumer processes have been started
    import torch
    import criteo
    torch.set_num_threads(cpus_per_socket * num_sockets)

    dlrm_dataset = get_dataset(args)
    total_samples.value = dlrm_dataset.get_item_count()

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    settings.scenario = scenario
    runner = runner_map[scenario](inQueue, dlrm_dataset, total_samples.value,
                                  args.max_sample_size, args.max_batchsize)

    # Start response threads, one per output queue
    response_workers = [threading.Thread(
        target=response_loadgen, args=(outQueues[i], args.accuracy))
        for i in range(numOutQ)]
    for response_worker in response_workers:
        response_worker.daemon = True
        response_worker.start()

    def issue_queries(response_ids, query_sample_indexes):
        runner.enqueue(response_ids, query_sample_indexes)

    def flush_queries():
        runner.unload_query_samples()

    def process_latencies(latencies_ns):
        # Called by loadgen to report the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
        settings.performance_sample_count_override = total_samples.value
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance
    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration
    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)
    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries
    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream
    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    if args.accuracy:
        qcount = total_samples.value
    else:
        qcount = settings.min_query_count

    def load_query_samples(sample_list):
        # Hand the sample list to every instance, then wait until each one
        # reports it has finished loading before starting the clock.
        global start_time
        global total_instances
        for _ in range(total_instances):
            dsQueue.put(sample_list)
        while init_counter.value < total_procs + total_instances:
            time.sleep(2)
        start_time = time.time()

    def unload_query_samples(sample_list):
        pass

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(total_samples.value,
                          min(total_samples.value, args.samples_per_query_offline),
                          load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "roc_auc": 0,
                   "scenario": str(scenario)}
    torch.set_num_threads(cpus_for_loadgen)
    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = item_timing
    if args.accuracy:
        result_dict["good"] = item_good
        result_dict["total"] = item_total
        result_dict["roc_auc"] = criteo.auc_score(item_results)

    final_results = {
        "runtime": "pytorch-native-dlrm",
        "version": torch.__version__,
        "time": int(time.time()),
        "cmdline": str(args),
    }
    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing, time.time() - start_time, args.accuracy)

    # Shut down the consumers and the response threads with None sentinels
    #inQueue.join()
    for _ in range(total_instances):
        inQueue.put(None)
    for c in consumers:
        c.join()
    for i in range(numOutQ):
        outQueues[i].put(None)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    # Write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
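# ---------------------------------------------------------------------------
# The QueueRunner used by these harnesses is not shown; a hypothetical
# sketch of the simplest variant, the one constructed in the first main()
# with only a queue and max_batchsize. Its job is to split each loadgen
# query into fixed-size chunks for the consumer processes; the DLRM variant
# above additionally receives the dataset and sample-size limits.
# ---------------------------------------------------------------------------
class QueueRunner:
    def __init__(self, in_queue, max_batchsize=128):
        self.in_queue = in_queue
        self.max_batchsize = max_batchsize

    def enqueue(self, response_ids, query_sample_indexes):
        # One loadgen query may cover thousands of samples in Offline mode;
        # chunk it so each worker handles at most max_batchsize at a time.
        bs = self.max_batchsize
        for i in range(0, len(query_sample_indexes), bs):
            self.in_queue.put((response_ids[i:i + bs],
                               query_sample_indexes[i:i + bs]))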
# OFAnet-AutoSinian harness on TensorRT.
def main():
    global last_timeing

    args = get_args()
    log.info(args)

    backend = BackendTensorRT()
    ds = Imagenet(data_path=args.dataset_path,
                  use_cache=args.cache,
                  batch_size=args.batch_size,
                  image_size=args.image_size,
                  calib_file='cal_image_list_option_%d.txt' % args.calib_file)
    model = backend.load(args, ds=ds)

    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    config = os.path.abspath(args.config)
    assert os.path.exists(config), "%s does not exist!" % config
    user_config = os.path.abspath(args.user_config)
    assert os.path.exists(user_config), "%s does not exist!" % user_config

    base_path = os.path.dirname(os.path.realpath(__file__))
    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    post_proc = PostProcessCommon(offset=0)
    runner = QueueRunner(model, ds, args.threads, post_proc=post_proc,
                         batch_size=args.batch_size)

    def issue_queries(ids, indices):
        runner.enqueue(ids, indices)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # Called by loadgen to report the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    model_name = 'OFAnet-AutoSinian'
    settings.FromConfig(config, model_name, args.scenario)
    settings.FromConfig(user_config, model_name, args.scenario)
    if args.audit_test:
        audit_config_path = base_path + '/audit%s.config' % args.audit_test
        settings.FromConfig(audit_config_path, model_name, args.scenario)

    scenario = SCENARIO_MAP[args.scenario]
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(ds.get_item_count(), args.batch_size,
                          ds.load_query_samples, ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
    runner.start_run(result_dict, args.accuracy)
    start = time.time()
    lg.StartTest(sut, qsl, settings)
    post_proc.finalize(result_dict)
    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing, runner.finishTime - ds.last_loaded, args)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=2)
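# ---------------------------------------------------------------------------
# All four entry points share SCENARIO_MAP and NANO_SEC; a minimal sketch,
# assuming the standard loadgen scenario names:
# ---------------------------------------------------------------------------
import mlperf_loadgen as lg

SCENARIO_MAP = {
    "SingleStream": lg.TestScenario.SingleStream,
    "MultiStream": lg.TestScenario.MultiStream,
    "Server": lg.TestScenario.Server,
    "Offline": lg.TestScenario.Offline,
}
NANO_SEC = 1e9  # process_latencies() converts loadgen's nanoseconds to seconds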