def main(argv):
    """Preprocess every file in ``calib_image_dir`` and run them on worker threads.

    The files are read with ``cv2.imread``, passed through a
    ``waa_rt.PreProcess`` handle, split into one contiguous slice per thread
    and handed to ``runResnet50``. The elapsed wall-clock time (preprocessing
    included) is then used to print a frames-per-second figure.

    Args:
        argv: Command-line style sequence. ``argv[1]`` is the number of
            worker threads; ``argv[2]`` is the path of the serialized xir
            graph to deserialize.

    Raises:
        OSError: If ``cv2.imread`` cannot read one of the listed files.

    Side effects:
        Sets the module globals ``threadnum`` and ``runTotall`` and prints
        the measured performance to stdout.
    """
    global threadnum, runTotall

    listimage = os.listdir(calib_image_dir)
    threadnum = int(argv[1])
    runTotall = len(listimage)

    g = xir.graph.Graph.deserialize(pathlib.Path(argv[2]))
    subgraphs = get_subgraph(g)
    assert len(subgraphs) == 1  # only one DPU kernel

    # One runner per worker thread, all created from the same subgraph.
    all_dpu_runners = [
        runner.Runner(subgraphs[0], "run") for _ in range(threadnum)
    ]

    # Handle used to preprocess each image before inference.
    xclbin_p = "/mnt/dpu.xclbin"
    kernelName_p = "pp_pipeline_accel"
    deviceIdx_p = 0
    fpga_pp = waa_rt.PreProcess(xclbin_p, kernelName_p, deviceIdx_p)

    time1 = int(round(time.time() * 1000))

    # Image list to be run.
    img = []
    for name in listimage:
        path = os.path.join(calib_image_dir, name)
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure by returning None rather than
            # raising; fail here with the offending path instead of an
            # opaque AttributeError on ``image.shape``.
            raise OSError("Unable to read image file: %s" % path)
        rows, cols = image.shape[:2]
        img.append(fpga_pp.preprocess_input(image, rows, cols))

    threadAll = []
    start = 0
    for i in range(threadnum):
        # Every thread gets len(img) // threadnum images; the last thread
        # additionally takes whatever remains after the integer division.
        if i == threadnum - 1:
            end = len(img)
        else:
            end = start + (len(img) // threadnum)
        batch = img[start:end]
        threadAll.append(
            threading.Thread(
                target=runResnet50,
                args=(all_dpu_runners[i], batch, len(batch)),
            )
        )
        start = end

    for x in threadAll:
        x.start()
    for x in threadAll:
        x.join()

    time2 = int(round(time.time() * 1000))
    # The timestamps are whole milliseconds, so an empty directory or a very
    # short run can yield a difference of 0; clamp to 1 ms to avoid a
    # ZeroDivisionError.
    timetotal = max(time2 - time1, 1)
    fps = float(runTotall * 1000 / timetotal)
    print("Performance : %.2f FPS" % fps)
def pre_process(q, args):
    """Preprocess every image path and push the results onto ``q``.

    Args:
        q: Object exposing ``put``; receives the first element of each
            ``preprocess_input`` result, in the order the paths are listed.
        args: Mapping that supplies ``'xclbin'`` (prefix of the xclbin file
            path), ``'deviceid'``, ``'batch_sz'`` and ``'images'``.
    """
    xclbin_file = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    kernel_name = "pp_pipeline_accel"
    device_index = args['deviceid']
    preprocessor = waa_rt.PreProcess(xclbin_file, kernel_name, device_index, 1)

    step = args['batch_sz']
    paths = xdnn_io.getFilePaths(args['images'])

    # Walk the path list in windows of ``step`` entries.
    for offset in range(0, len(paths), step):
        window = paths[offset:offset + step]
        for image_path in window:
            # The second element of the result is not used here.
            data, _ = preprocessor.preprocess_input(image_path)
            q.put(data)
def main(argv):
    """Preprocess all files in ``calib_image_dir`` and run them on worker threads.

    Args:
        argv: Command-line style sequence. ``argv[1]`` is the number of
            worker threads; ``argv[2]`` is passed to
            ``xir.Graph.deserialize``.

    Side effects:
        Sets the module globals ``threadnum`` and ``runTotall`` and prints
        the FPS, frame count and elapsed seconds to stdout.
    """
    global threadnum, runTotall

    listimage = os.listdir(calib_image_dir)
    threadnum = int(argv[1])
    runTotall = len(listimage)

    graph = xir.Graph.deserialize(argv[2])
    subgraphs = get_child_subgraph_dpu(graph)
    assert len(subgraphs) == 1  # only one DPU kernel

    # One runner per worker thread, all created from the same subgraph.
    all_dpu_runners = [
        vart.Runner.create_runner(subgraphs[0], "run")
        for _ in range(int(threadnum))
    ]

    # Image list to be run.
    xclbin_path = str("/usr/lib/dpu.xclbin")
    kernel_name = "pp_pipeline_accel"
    device_index = 0
    fpga_pp = waa_rt.PreProcess(xclbin_path, kernel_name, device_index)

    time1 = int(round(time.time() * 1000))  # captured but not read afterwards
    time_start = time.time()
    img = [
        fpga_pp.preprocess_input(os.path.join(calib_image_dir, file_name))
        for file_name in listimage
    ]

    # Run with batch: every thread receives the complete image list.
    cnt = 1
    threadAll = [
        threading.Thread(target=runResnet50, args=(dpu_runner, img, cnt))
        for dpu_runner in all_dpu_runners
    ]

    for worker in threadAll:
        worker.start()
    for worker in threadAll:
        worker.join()

    del all_dpu_runners

    time_end = time.time()
    timetotal = time_end - time_start
    total_frames = runTotall
    fps = float(total_frames / timetotal)
    print(
        "FPS=%.2f, total frames = %.2f , time=%.6f seconds"
        % (fps, total_frames, timetotal)
    )
def pre_process(q_img, q_shape, args):
    """Preprocess every image path and queue the resulting array and shape.

    Args:
        q_img: Object exposing ``put``; receives the first element of each
            ``preprocess_input`` result.
        q_shape: Object exposing ``put``; receives the second element of
            each ``preprocess_input`` result.
        args: Mapping that supplies ``'xclbin'`` (prefix of the xclbin file
            path), ``'deviceid'``, ``'batch_sz'`` and ``'images'``.

    Raises:
        SystemExit: With status 1 when ``waa_rt.PreProcess`` returns ``-1``.
    """
    xclbin_p = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    kernelName_p = "pp_pipeline_accel"
    deviceIdx_p = args['deviceid']
    handle_p = waa_rt.PreProcess(xclbin_p, kernelName_p, deviceIdx_p, 1)
    if handle_p == -1:
        print("Unable to Create handle for pre-processing kernel. Only U200 device is supported")
        # Exit with a non-zero status so the failure is visible to whoever
        # launched this process; a bare sys.exit() would report success.
        sys.exit(1)

    batch_sz = args['batch_sz']
    img_paths = xdnn_io.getFilePaths(args['images'])
    print("Pre-processing handle created. Populating Queue")

    # Walk the path list in windows of ``batch_sz`` entries.
    for i in range(0, len(img_paths), batch_sz):
        for p in img_paths[i:i + batch_sz]:
            arr, shape = handle_p.preprocess_input(p)
            q_img.put(arr)
            q_shape.put(shape)