Beispiel #1
0
def main(argv):
    """Pre-process every image in ``calib_image_dir`` and time a threaded run.

    One runner is created per worker thread from the single subgraph of the
    deserialized graph. The images are pre-processed through
    ``waa_rt.PreProcess``, split into contiguous slices (the last thread also
    takes the remainder) and handed to ``runResnet50`` on separate threads.
    The overall throughput is printed in frames per second.

    Sets the module globals ``threadnum`` and ``runTotall``.

    Args:
        argv: Command-line style sequence. ``argv[1]`` is the number of worker
            threads and ``argv[2]`` is the path of the serialized graph.
    """
    global threadnum
    global runTotall

    listimage = os.listdir(calib_image_dir)
    threadnum = int(argv[1])
    runTotall = len(listimage)

    g = xir.graph.Graph.deserialize(pathlib.Path(argv[2]))
    subgraphs = get_subgraph(g)
    assert len(subgraphs) == 1  # only one DPU kernel
    all_dpu_runners = [
        runner.Runner(subgraphs[0], "run") for _ in range(threadnum)
    ]

    # Hardware pre-processing handle used for every image in the list.
    xclbin_p = "/mnt/dpu.xclbin"
    kernelName_p = "pp_pipeline_accel"
    deviceIdx_p = 0
    fpga_pp = waa_rt.PreProcess(xclbin_p, kernelName_p, deviceIdx_p)

    # The timed region covers pre-processing as well as the threaded run.
    time1 = int(round(time.time() * 1000))
    img = []
    for name in listimage:
        path = os.path.join(calib_image_dir, name)
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals an unreadable / non-image file with None;
            # skip it instead of crashing on ``image.shape``.
            print("Skipping unreadable image: %s" % path)
            continue
        rows, cols = image.shape[:2]
        img.append(fpga_pp.preprocess_input(image, rows, cols))

    # Give each thread a contiguous slice; the last one absorbs the remainder.
    threadAll = []
    start = 0
    for index, dpu in enumerate(all_dpu_runners):
        if index == threadnum - 1:
            end = len(img)
        else:
            end = start + (len(img) // threadnum)
        batch = img[start:end]
        threadAll.append(
            threading.Thread(target=runResnet50,
                             args=(dpu, batch, len(batch))))
        start = end
    for x in threadAll:
        x.start()
    for x in threadAll:
        x.join()

    time2 = int(round(time.time() * 1000))
    timetotal = time2 - time1
    # Count only the frames that were actually loaded, and avoid dividing by
    # zero when the run finishes within the millisecond timer resolution
    # (e.g. an empty image directory).
    fps = float(len(img) * 1000 / timetotal) if timetotal > 0 else 0.0
    print("Performance : %.2f FPS" % fps)
Beispiel #2
0
def pre_process(q, args):
    """Pre-process every input image and push the resulting arrays onto ``q``.

    Args:
        q: Object with a ``put`` method; receives one array per image.
        args: Mapping read for the keys ``'xclbin'`` (directory prefix of the
            xclbin file), ``'deviceid'``, ``'batch_sz'`` and ``'images'``.
    """
    xclbin_path = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    # NOTE(review): meaning of the trailing ``1`` argument is not visible
    # here -- confirm against the waa_rt API.
    preprocessor = waa_rt.PreProcess(
        xclbin_path, "pp_pipeline_accel", args['deviceid'], 1)

    batch_sz = args['batch_sz']
    img_paths = xdnn_io.getFilePaths(args['images'])

    # Walk the path list one batch-sized window at a time.
    for batch_start in range(0, len(img_paths), batch_sz):
        batch = img_paths[batch_start:batch_start + batch_sz]
        for image_path in batch:
            # Only the array is queued; the second returned value is unused.
            arr, _ = preprocessor.preprocess_input(image_path)
            q.put(arr)
Beispiel #3
0
def main(argv):
    """Pre-process the images in ``calib_image_dir`` and time a threaded run.

    Creates one runner per worker thread from the single subgraph of the
    deserialized graph, pre-processes every image path through
    ``waa_rt.PreProcess``, starts ``runResnet50`` on each thread with the
    complete image list, and prints throughput statistics.

    Sets the module globals ``threadnum`` and ``runTotall``.

    Args:
        argv: Command-line style sequence. ``argv[1]`` is the number of worker
            threads and ``argv[2]`` is the serialized graph to load.
    """
    global threadnum
    global runTotall

    listimage = os.listdir(calib_image_dir)
    threadnum = int(argv[1])
    runTotall = len(listimage)

    g = xir.Graph.deserialize(argv[2])
    subgraphs = get_child_subgraph_dpu(g)
    assert len(subgraphs) == 1  # only one DPU kernel
    all_dpu_runners = [
        vart.Runner.create_runner(subgraphs[0], "run")
        for _ in range(int(threadnum))
    ]

    # Hardware pre-processing handle shared by all images.
    fpga_pp = waa_rt.PreProcess("/usr/lib/dpu.xclbin", "pp_pipeline_accel", 0)

    # Timing starts before pre-processing, so it is part of the measurement.
    time_start = time.time()
    img = [
        fpga_pp.preprocess_input(os.path.join(calib_image_dir, name))
        for name in listimage
    ]

    # Every worker is handed the full image list and the same count.
    cnt = 1
    workers = [
        threading.Thread(target=runResnet50, args=(dpu, img, cnt))
        for dpu in all_dpu_runners
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Drop the runners before taking the end timestamp, as the original did.
    del all_dpu_runners
    time_end = time.time()

    timetotal = time_end - time_start
    total_frames = runTotall
    fps = float(total_frames / timetotal)
    print("FPS=%.2f, total frames = %.2f , time=%.6f seconds"
          % (fps, total_frames, timetotal))
Beispiel #4
0
def pre_process(q_img, q_shape, args):
    """Pre-process every input image and queue its array and shape.

    Exits the process via ``sys.exit()`` when the pre-processing handle
    compares equal to ``-1``.

    Args:
        q_img: Object with a ``put`` method; receives one array per image.
        q_shape: Object with a ``put`` method; receives the matching shape
            value returned alongside each array.
        args: Mapping read for the keys ``'xclbin'`` (directory prefix of the
            xclbin file), ``'deviceid'``, ``'batch_sz'`` and ``'images'``.
    """
    xclbin_path = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    # NOTE(review): meaning of the trailing ``1`` argument is not visible
    # here -- confirm against the waa_rt API.
    preprocessor = waa_rt.PreProcess(
        xclbin_path, "pp_pipeline_accel", args['deviceid'], 1)
    if preprocessor == -1:
        print("Unable to Create handle for pre-processing kernel. Only U200 device is supported")
        sys.exit()

    batch_sz = args['batch_sz']
    img_paths = xdnn_io.getFilePaths(args['images'])
    print("Pre-processing handle created. Populating Queue")

    # Walk the path list one batch-sized window at a time; the two queues are
    # filled in lockstep so entries stay paired.
    for batch_start in range(0, len(img_paths), batch_sz):
        batch = img_paths[batch_start:batch_start + batch_sz]
        for image_path in batch:
            arr, shape = preprocessor.preprocess_input(image_path)
            q_img.put(arr)
            q_shape.put(shape)