Code Example #1
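A multiprocessing MLPerf loadgen harness for DLRM ("pytorch-native-dlrm"): it forks one Consumer process per CPU socket, feeds them through a JoinableQueue, and runs a daemon thread that relays results from the output queue back to loadgen.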
def main():
    global num_sockets
    global start_time
    global item_total
    global last_timeing

    args = get_args()
    log.info(args)
    config = os.path.abspath(args.config)
    user_config = os.path.abspath(args.user_config)

    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)

    if not os.path.exists(user_config):
        log.error("{} not found".format(user_config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    total_samples = multiprocessing.Value("i", 0)
    dsQueue = multiprocessing.Queue()
    outQueue = multiprocessing.Queue()
    inQueue = multiprocessing.JoinableQueue(num_sockets * 4)
    consumers = [
        Consumer(inQueue, outQueue, dsQueue, lock, init_counter, total_samples,
                 i, args) for i in range(num_sockets)
    ]
    for c in consumers:
        c.start()

    # Wait until all consumer subprocesses are ready
    while init_counter.value < num_sockets:
        time.sleep(2)

    # Start response thread
    response_worker = threading.Thread(target=response_loadgen,
                                       args=(outQueue, args.accuracy))
    response_worker.daemon = True
    response_worker.start()

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }

    runner = runner_map[scenario](inQueue, max_batchsize=args.max_batchsize)

    def issue_queries(response_ids, query_sample_indexes):
        runner.enqueue(response_ids, query_sample_indexes)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model, args.scenario)
    settings.FromConfig(user_config, args.model, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly

    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
        settings.performance_sample_count_override = total_samples.value

    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration

    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)

    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries

    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream

    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency *
                                                      NANO_SEC)

    def load_query_samples(sample_list):
        # Broadcast the sample list to every consumer, then wait until
        # each one reports that it has loaded the samples
        global start_time
        for _ in range(num_sockets):
            dsQueue.put(sample_list)
        while init_counter.value < 2 * num_sockets:
            time.sleep(2)
        start_time = time.time()

    def unload_query_samples(sample_list):
        pass

    # Imported only after the consumer processes have been started
    import torch
    import criteo

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(
        total_samples.value,
        min(total_samples.value, args.samples_per_query_offline),
        load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {
        "good": 0,
        "total": 0,
        "roc_auc": 0,
        "scenario": str(scenario)
    }

    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = item_timing
    if args.accuracy:
        result_dict["good"] = item_good
        result_dict["total"] = item_total
        result_dict["roc_auc"] = criteo.auc_score(item_results)

    final_results = {
        "runtime": "pytorch-native-dlrm",
        "version": torch.__version__,
        "time": int(time.time()),
        "cmdline": str(args),
    }

    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing,
                time.time() - start_time, args.accuracy)

    inQueue.join()
    for _ in consumers:
        inQueue.put(None)
    for c in consumers:
        c.join()
    outQueue.put(None)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    # write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
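The three multiprocessing examples (#1-#3) all start a response thread whose target, response_loadgen, is defined outside these excerpts. The sketch below is a minimal reconstruction of what such a worker must do, assuming each queue item is a (query_ids, results) tuple and that None is the shutdown sentinel (example #2 uses the string 'DONE' instead); the real item layout is not shown above, so treat the names and shapes as hypothetical.

import array

import numpy as np
import mlperf_loadgen as lg


def response_loadgen(out_queue, accuracy=False):
    # Drain consumer results and report each completion to loadgen.
    while True:
        item = out_queue.get()
        if item is None:  # shutdown sentinel posted by main()
            break
        query_ids, results = item
        for qid, result in zip(query_ids, results):
            # loadgen expects a raw (address, byte-length) view of the result
            buf = array.array("B", np.asarray(result).tobytes())
            addr, length = buf.buffer_info()
            lg.QuerySamplesComplete(
                [lg.QuerySampleResponse(qid, addr, buf.itemsize * length)])

In accuracy mode the real worker additionally keeps the raw predictions around; the examples accumulate the item_good, item_total, and item_results globals for exactly that purpose.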
Code Example #2
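A ResNet-50 ImageNet harness: Consumer processes are spread across the physical CPUs, readiness is synchronized with a block_until helper, and in accuracy mode the results are post-processed by shelling out to tools/accuracy-imagenet.py.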
def main():
    global num_ins
    global num_phy_cpus
    global in_queue_cnt
    global out_queue_cnt

    args = get_args()
    log.info(args)

    num_ins = args.num_instance
    num_phy_cpus = args.num_phy_cpus
    log.info('Running with {} instances on {} physical CPUs'.format(num_ins, num_phy_cpus))

    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)

    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    image_format = 'NCHW'
    dataset = "imagenet"
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[dataset]

    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        cache_dir=args.cache_dir,
                        count=args.count,
                        use_int8=args.use_int8_dataset,
                        num_workers=num_phy_cpus,
                        **kwargs)

    # Establish communication queues
    log.info('Start consumer queues and response thread')
    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    in_queue = multiprocessing.JoinableQueue()
    out_queue = multiprocessing.Queue()
    ds_queue = multiprocessing.Queue()

    # Start consumers
    consumers = [Consumer(in_queue, out_queue, ds_queue, lock, init_counter, i, args)
                 for i in range(num_ins)]
    for c in consumers:
        c.start()

    # Wait until all subprocesses are ready
    block_until(init_counter, num_ins, 2)

    # Start response thread
    response_worker = threading.Thread(
        target=response_loadgen, args=(out_queue,))
    response_worker.daemon = True
    response_worker.start()

    scenario = SCENARIO_MAP[args.scenario]
    runner = QueueRunner(in_queue, args.batch_size)

    def issue_queries(response_ids, query_sample_indexes):
        runner.put(response_ids, query_sample_indexes)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        log.info("Average latency: {}".format(np.mean(latencies_ns)))
        log.info("Median latency: {}".format(np.percentile(latencies_ns, 50)))
        log.info("90 percentile latency: {}".format(np.percentile(latencies_ns, 90)))

    def load_query_samples(sample_list):
        for _ in range(num_ins):
            ds_queue.put(sample_list)
        block_until(init_counter, 2 * num_ins, 2)

    def unload_query_samples(sample_list):
        pass

    settings = lg.TestSettings()
    settings.FromConfig(mlperf_conf, "resnet50", args.scenario)
    settings.FromConfig(user_conf, "resnet50", args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.qps:
        qps = float(args.qps)
        settings.server_target_qps = qps
        settings.offline_expected_qps = qps

    count = ds.get_item_count()
    perf_count = 1024
    if args.accuracy:
        perf_count = count
    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, perf_count, load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    lg.StartTest(sut, qsl, settings)

    # Wait until the output queue has drained
    while out_queue_cnt < in_queue_cnt:
        time.sleep(0.2)

    in_queue.join()
    for i in range(num_ins):
        in_queue.put('DONE')
    for c in consumers:
        c.join()
    out_queue.put('DONE')

    if args.accuracy:
        output_file = 'accuracy.txt'
        if args.output_file:
            output_file = args.output_file
        cmd = "python tools/accuracy-imagenet.py " \
              "--mlperf-accuracy-file=mlperf_log_accuracy.json " \
              "--imagenet-val-file=val_map.txt --output-file={}".format(output_file)
        cmd = cmd.split(' ')
        subprocess.check_call(cmd)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    log.info('Test done.')
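Example #2 synchronizes through a block_until helper that examples #1 and #3 write out as inline polling loops. A minimal implementation matching the call sites block_until(init_counter, num_ins, 2) above (the parameter names are guesses; the real helper may differ):

import time


def block_until(counter, target, interval_s):
    # Poll the shared multiprocessing.Value until it reaches the target.
    while counter.value < target:
        time.sleep(interval_s)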
Code Example #3
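A core-pinned DLRM harness: it budgets cores between loadgen and the inference instances, synchronizes the instances with a Barrier, and fans results out over one output queue (and response thread) per socket.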
def main():
    global qcount
    global num_sockets
    global cpus_per_socket
    global cpus_per_process
    global cpus_per_instance
    global total_instances
    global start_time
    global item_total
    global last_timeing

    args = get_args()
    log.info(args)
    config = os.path.abspath(args.config)
    user_config = os.path.abspath(args.user_config)

    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)

    if not os.path.exists(user_config):
        log.error("{} not found".format(user_config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        if os.path.exists("./audit.config"):
            copyfile("./audit.config", output_dir + "/audit.config")
        os.chdir(output_dir)

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model, args.scenario)
    settings.FromConfig(user_config, args.model, args.scenario)
    settings.mode = lg.TestMode.PerformanceOnly

    # Reserve cores for loadgen: one by default; if the consumer processes
    # leave more cores unused, give loadgen all of the leftovers and start
    # the first instance at core 0.
    cpus_for_loadgen = 1
    left_cores = cpus_per_socket * num_sockets - total_procs * cpus_per_process
    first_instance_start_core = cpus_for_loadgen
    if left_cores > cpus_for_loadgen:
        first_instance_start_core = 0
        cpus_for_loadgen = left_cores

    # Count how many inference instances fit. The first process may start
    # its first instance at an offset (cores reserved for loadgen), so it
    # hosts fewer instances than the others, plus one leading partial
    # instance if that slot still covers at least half an instance's cores.
    total_instances = 0
    instances_per_proc = cpus_per_process // cpus_per_instance
    for i in range(total_procs):
        if i == 0 and first_instance_start_core > 0:
            total_instances += (cpus_per_process - first_instance_start_core) // cpus_per_instance
            if (cpus_per_instance - first_instance_start_core) >= (cpus_per_instance // 2):
                total_instances += 1
        else:
            total_instances += instances_per_proc

    lock = multiprocessing.Lock()
    barrier = multiprocessing.Barrier(total_instances)
    init_counter = multiprocessing.Value("i", 0)
    total_samples = multiprocessing.Value("i", 0)
    finished_samples = multiprocessing.Value("i", 0)
    dsQueue = multiprocessing.Queue()
    numOutQ = num_sockets
    outQueues = [multiprocessing.Queue() for i in range(numOutQ)]
    #inQueue = multiprocessing.JoinableQueue()
    inQueue = multiprocessing.Queue()
    consumers = [
        Consumer(inQueue, outQueues[i % numOutQ], dsQueue, lock, init_counter,
                 finished_samples, barrier, i, args, settings.min_query_count,
                 first_instance_start_core)
        for i in range(total_procs)
    ]
    for c in consumers:
        c.start()

    # Wait until all consumer subprocesses are ready
    while init_counter.value < total_procs:
        time.sleep(2)

    # Imported only after the consumer processes have been started
    import torch
    import criteo
    torch.set_num_threads(cpus_per_socket * num_sockets)

    dlrm_dataset = get_dataset(args)
    total_samples.value = dlrm_dataset.get_item_count()
    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }

    settings.scenario = scenario
    runner = runner_map[scenario](inQueue, dlrm_dataset, total_samples.value, args.max_sample_size, args.max_batchsize)

    # Start response thread
    response_workers = [threading.Thread(
        target=response_loadgen, args=(outQueues[i], args.accuracy)) for i in range(numOutQ)]
    for response_worker in response_workers:
        response_worker.daemon = True
        response_worker.start()

    def issue_queries(response_ids, query_sample_indexes):
        runner.enqueue(response_ids, query_sample_indexes)

    def flush_queries():
        runner.unload_query_samples()

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
        settings.performance_sample_count_override = total_samples.value

    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration

    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)

    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries

    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream

    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    if args.accuracy:
        qcount = total_samples.value
    else:
        qcount = settings.min_query_count

    def load_query_samples(sample_list):
        # Broadcast the sample list and wait until every instance has
        # loaded it
        global start_time
        global total_instances
        for _ in range(total_instances):
            dsQueue.put(sample_list)
        while init_counter.value < total_procs + total_instances:
            time.sleep(2)
        start_time = time.time()

    def unload_query_samples(sample_list):
        pass

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(
        total_samples.value,
        min(total_samples.value, args.samples_per_query_offline),
        load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)}

    torch.set_num_threads(cpus_for_loadgen)
    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = item_timing
    if args.accuracy:
        result_dict["good"] = item_good
        result_dict["total"] = item_total
        result_dict["roc_auc"] = criteo.auc_score(item_results)

    final_results = {
        "runtime": "pytorch-native-dlrm",
        "version": torch.__version__,
        "time": int(time.time()),
        "cmdline": str(args),
    }

    add_results(final_results, "{}".format(scenario),
                result_dict, last_timeing, time.time() - start_time, args.accuracy)

    #inQueue.join()
    for _ in range(total_instances):
        inQueue.put(None)
    for c in consumers:
        c.join()
    for i in range(numOutQ):
        outQueues[i].put(None)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    # write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
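None of the excerpts define the Consumer class, but all three multiprocessing harnesses assume the same handshake: each consumer increments init_counter once when its process is up, waits for the sample list on the dataset queue, increments init_counter again once the samples are loaded, then serves the input queue until it sees the sentinel (None here and in example #1, 'DONE' in example #2). A hypothetical skeleton of that contract, with core pinning and the actual model omitted:

import multiprocessing


class Consumer(multiprocessing.Process):
    def __init__(self, in_queue, out_queue, ds_queue, lock, init_counter,
                 proc_idx, args):
        super().__init__()
        self.in_queue = in_queue
        self.out_queue = out_queue
        self.ds_queue = ds_queue
        self.lock = lock
        self.init_counter = init_counter
        self.proc_idx = proc_idx
        self.args = args

    def run(self):
        # ... pin this process to its cores and build the model ...
        with self.lock:
            self.init_counter.value += 1  # handshake 1: process is ready
        samples = self.ds_queue.get()     # posted by load_query_samples()
        # ... load `samples` into memory ...
        with self.lock:
            self.init_counter.value += 1  # handshake 2: samples are loaded
        while True:
            job = self.in_queue.get()
            if job is None:               # shutdown sentinel from main()
                break
            # ... run inference on `job` and post the results ...
            self.out_queue.put(job)

Examples #1 and #2 use a JoinableQueue on the input side, so their real consumers must also call in_queue.task_done() after every get(), or the inQueue.join() in main() would never return.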
Code Example #4
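A single-process TensorRT ImageNet harness for the OFAnet-AutoSinian model: a thread-based QueueRunner replaces the consumer processes, and an optional audit config can override the loadgen settings.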
def main():
  global last_timeing
  args = get_args()

  log.info(args)

  backend = BackendTensorRT()

  ds = Imagenet(
      data_path=args.dataset_path,
      use_cache=args.cache,
      batch_size=args.batch_size,
      image_size=args.image_size,
      calib_file='cal_image_list_option_%d.txt' % args.calib_file)

  model = backend.load(args, ds=ds)

  final_results = {
      "runtime": model.name(),
      "version": model.version(),
      "time": int(time.time()),
      "cmdline": str(args),
  }

  config = os.path.abspath(args.config)
  assert os.path.exists(config), "%s does not exist!" % config
  user_config = os.path.abspath(args.user_config)
  assert os.path.exists(user_config), "%s does not exist!" % user_config

  base_path = os.path.dirname(os.path.realpath(__file__))
  if args.output:
    output_dir = os.path.abspath(args.output)
    os.makedirs(output_dir, exist_ok=True)
    os.chdir(output_dir)

  post_proc = PostProcessCommon(offset=0)
  runner = QueueRunner(
      model, ds, args.threads, post_proc=post_proc, batch_size=args.batch_size)

  def issue_queries(ids, indices):
    runner.enqueue(ids, indices)

  def flush_queries():
    pass

  def process_latencies(latencies_ns):
    global last_timeing
    last_timeing = [t / NANO_SEC for t in latencies_ns]

  settings = lg.TestSettings()
  model_name = 'OFAnet-AutoSinian'
  settings.FromConfig(config, model_name, args.scenario)
  settings.FromConfig(user_config, model_name, args.scenario)

  if args.audit_test:
    audit_config_path = base_path + '/audit%s.config' % args.audit_test
    settings.FromConfig(audit_config_path, model_name, args.scenario)

  scenario = SCENARIO_MAP[args.scenario]
  settings.scenario = scenario
  settings.mode = lg.TestMode.PerformanceOnly
  if args.accuracy:
    settings.mode = lg.TestMode.AccuracyOnly

  sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
  qsl = lg.ConstructQSL(ds.get_item_count(), args.batch_size,
                        ds.load_query_samples, ds.unload_query_samples)

  log.info("starting {}".format(scenario))
  result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
  runner.start_run(result_dict, args.accuracy)
  start = time.time()
  lg.StartTest(sut, qsl, settings)
  post_proc.finalize(result_dict)
  add_results(final_results, "{}".format(scenario), result_dict, last_timeing,
              runner.finishTime - ds.last_loaded, args)

  runner.finish()
  lg.DestroyQSL(qsl)
  lg.DestroyFastSUT(sut)

  if args.output:
    with open("results.json", "w") as f:
      json.dump(final_results, f, sort_keys=True, indent=2)
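All four examples also lean on the same module-level plumbing that the excerpts omit: the loadgen bindings imported as lg, a SCENARIO_MAP from the --scenario string to a loadgen scenario, the NANO_SEC divisor used by process_latencies, and a module logger. In the MLPerf reference harnesses this prelude looks roughly like the following (a sketch, not copied from these files):

import logging

import mlperf_loadgen as lg

NANO_SEC = 1e9

SCENARIO_MAP = {
    "SingleStream": lg.TestScenario.SingleStream,
    "MultiStream": lg.TestScenario.MultiStream,
    "Server": lg.TestScenario.Server,
    "Offline": lg.TestScenario.Offline,
}

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("main")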