def faceDetection(args, FDDB_list, FDDB_results_file):
    """Run face detection over an FDDB image list and record the results.

    For each entry of ``FDDB_list`` the image at
    ``args.fddbPath + <entry> + '.jpg'`` is read and passed to ``detect``.
    The results file receives, per image: the stripped entry, the number of
    rectangles, then one line per rectangle holding
    ``rect[0] rect[1] rect[2]-rect[0] rect[3]-rect[1] rect[4]``
    (presumably left, top, width, height, score -- confirm against ``detect``).

    Args:
        args: Object exposing ``vitisrundir`` (passed to ``Runner``) and
            ``fddbPath`` (prefix prepended to every entry).
        FDDB_list: Iterable of image entries; surrounding whitespace is
            stripped from each one.
        FDDB_results_file: Writable text file object. It is closed before
            this function returns.
    """
    runner = Runner(args.vitisrundir)
    batch_sz = 1

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers.
    # Each buffer takes the tensor's dims with the leading dim replaced by
    # batch_sz.
    fpgaBlobs = []
    for tensors in (runner.get_input_tensors(), runner.get_output_tensors()):
        buffers = []
        for tensor in tensors:
            trailing_dims = tuple(tensor.dims[k] for k in range(1, tensor.ndims))
            buffers.append(
                np.empty((batch_sz,) + trailing_dims, dtype=np.float32, order='C'))
        fpgaBlobs.append(buffers)

    for line in FDDB_list:
        entry = line.strip()
        FDDB_results_file.write('%s\n' % entry)

        image_ori = cv2.imread(args.fddbPath + entry + '.jpg', cv2.IMREAD_COLOR)
        rects = detect(runner, fpgaBlobs, image_ori)

        FDDB_results_file.write('%d\n' % len(rects))
        for rect in rects:
            record = (rect[0], rect[1],
                      rect[2] - rect[0], rect[3] - rect[1],
                      rect[4])
            FDDB_results_file.write('%d %d %d %d %f\n' % record)

    FDDB_results_file.close()
def fpga_process(vitisrundir, shared_trans_arrs, shared_output_arrs, ready_fpga):
    """Feed input buffers to the runner asynchronously until the input ends.

    A helper thread running ``fpga_wait`` receives
    ``(write_slot, read_slot, job_id)`` tuples through an internal queue.
    The loop stops when ``shared_trans_arrs.openReadId()`` returns ``None``;
    a ``(None, None, None)`` sentinel is then queued and the helper thread is
    joined. The submission rate is printed at the end.

    Args:
        vitisrundir: Run directory passed to ``Runner``.
        shared_trans_arrs: Source of input slots (``openReadId`` /
            ``accessNumpyBuffer``).
        shared_output_arrs: Source of output slots (``openWriteId`` /
            ``accessNumpyBuffer``).
        ready_fpga: Queue-like object; ``1`` is put on it once the runner has
            been created (presumably a readiness signal -- confirm with the
            caller).
    """
    runner = Runner(vitisrundir)
    pending_jobs = mp.Queue(maxsize=100)
    ready_fpga.put(1)

    waiter = threading.Thread(
        target=fpga_wait,
        args=(runner, pending_jobs, shared_output_arrs, shared_trans_arrs))
    waiter.start()

    submitted = 0
    began = time.time()
    while True:
        # The output slot is reserved before the input slot, as in the
        # original ordering.
        out_slot = shared_output_arrs.openWriteId()
        out_buffers = shared_output_arrs.accessNumpyBuffer(out_slot)

        in_slot = shared_trans_arrs.openReadId()
        if in_slot is None:
            break
        in_buffers = shared_trans_arrs.accessNumpyBuffer(in_slot)

        # Only the buffer at index 1 of the input slot is handed to the runner.
        job_id = runner.execute_async([in_buffers[1]], out_buffers)
        pending_jobs.put((out_slot, in_slot, job_id))
        submitted += 1

    # Sentinel for the waiter thread.
    pending_jobs.put((None, None, None))
    waiter.join()

    elapsed = time.time() - began
    print("FPGA_process: ", float(submitted) / elapsed, "img/s")
def faceDetection(vitis_rundir,outpath, rsz_h, rsz_w, path):
    """Detect faces in every ``*.jpg`` under ``path`` and save annotated copies.

    Images are processed in order of increasing file size. Each one is
    resized to ``(rsz_w, rsz_h)`` and passed to ``detect``. For every
    returned rectangle a line ``<file> x1 y1 x2 y2`` is printed and a green
    box is drawn on a copy of the resized image. Images with at least one
    detection are written into ``outpath`` under their original base name.

    Args:
        vitis_rundir: Run directory passed to ``Runner``.
        outpath: Output directory; created (including parents) if missing.
        rsz_h: Height the image is resized to before detection.
        rsz_w: Width the image is resized to before detection.
        path: Directory searched for ``*.jpg`` files.
    """
    runner = Runner(vitis_rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = 1

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers.
    # Each buffer takes the tensor's dims with the leading dim replaced by
    # batch_sz.
    fpgaBlobs = [
        [np.empty((batch_sz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                  dtype=np.float32, order='C')
         for t in tensors]
        for tensors in (inTensors, outTensors)
    ]

    os.makedirs(outpath, exist_ok=True)

    for fn in sorted(glob.glob(path + '/*.jpg'), key=os.path.getsize):
        filename = os.path.basename(fn)
        src_img = cv2.imread(fn)
        input_img = cv2.resize(src_img, (rsz_w, rsz_h))
        face_rects = detect(runner, fpgaBlobs, input_img)
        if len(face_rects) == 0:
            continue

        # Draw on a copy so the array handed to detect() stays untouched.
        dst_img = input_img.copy()
        for face_rect in face_rects:
            print ("{} {} {} {} {}".format(fn, face_rect[0],face_rect[1],face_rect[2],face_rect[3]))
            cv2.rectangle(dst_img, (face_rect[0], face_rect[1]),
                          (face_rect[2], face_rect[3]), (0, 255, 0), 2)

        # os.path.join puts the image inside outpath whether or not the
        # caller supplied a trailing separator (plain concatenation did not).
        cv2.imwrite(os.path.join(outpath, filename), dst_img)
def run(rundir, n, q):
    """Create ``n`` runners for ``rundir`` and keep them until told to exit.

    The dims of the first runner's first input tensor are put on ``q`` as a
    list (this doubles as the "ready" message); the function then blocks on
    ``q.get()`` until something is received.

    Args:
        rundir: Run directory passed to every ``Runner``.
        n: Number of runners to create.
        q: Queue-like object used for the ready/exit handshake.
    """
    runners = [Runner(rundir) for _ in range(n)]

    first_input = runners[0].get_input_tensors()[0]
    input_shape = [first_input.dims[axis] for axis in range(first_input.ndims)]

    q.put(input_shape)  # ready for work
    q.get()  # wait for exit signal
def run(rundir, chanIdx, q, args):
    """Serve classification requests arriving on an xstream channel.

    After the runner has been created, ``1`` is put on ``q``. Each received
    payload is a ``(meta, buf)`` pair: ``buf`` is interpreted as float32 data
    of shape ``meta['shape']``, run through the runner, then through the
    CPU fully-connected and softmax ops, and the classification is printed.
    ``"success"`` is sent to ``meta['from']`` after each processed request.
    The loop ends when an empty payload is received.

    Buffers are allocated on the first request using its batch size
    (``meta['shape'][0]``); later requests must match the same input shape.
    Any exception raised while handling a request is logged with its
    traceback and the loop continues with the next request.

    Args:
        rundir: Run directory for ``Runner``; also holds ``weights.h5``.
        chanIdx: Channel index, converted with ``chanIdx2Str`` for the
            subscription.
        q: Queue-like object used to signal readiness.
        args: Mapping providing at least ``'labels'`` and ``'outsz'``.
    """
    xspub = xstream.Publisher()
    xssub = xstream.Subscribe(chanIdx2Str(chanIdx))
    runner = Runner(rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()

    q.put(1)  # ready for work

    fpgaBlobs = None
    fcOutput = None
    labels = xdnn_io.get_labels(args['labels'])
    xdnnCPUOp = xdnn.XDNNCPUOp("%s/weights.h5" % rundir)
    while True:
        try:
            payload = xssub.get()
            if not payload:
                break
            (meta, buf) = payload

            if fpgaBlobs is None:
                # Allocate buffers. Build everything in locals first so a
                # failure here cannot leave fpgaBlobs set while fcOutput is
                # still None for all following requests.
                batchsz = meta['shape'][0]
                new_blobs = [
                    [np.empty((batchsz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                              dtype=np.float32, order='C')
                     for t in tensors]
                    for tensors in (inTensors, outTensors)
                ]
                new_fc = np.empty((batchsz, args['outsz'],), dtype=np.float32, order='C')
                fpgaBlobs, fcOutput = new_blobs, new_fc

            fpgaInput = fpgaBlobs[0][0]
            # Explicit check instead of a bare assert: it survives ``-O`` and
            # produces a non-empty message in the log.
            if tuple(meta['shape']) != fpgaInput.shape:
                raise ValueError(
                    "Request shape %s does not match FPGA input shape %s"
                    % (tuple(meta['shape']), fpgaInput.shape))

            data = np.frombuffer(buf, dtype=np.float32).reshape(fpgaInput.shape)
            np.copyto(fpgaInput, data)

            jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
            runner.wait(jid)

            xdnnCPUOp.computeFC(fpgaBlobs[1][0], fcOutput)
            softmaxOut = xdnnCPUOp.computeSoftmax(fcOutput)
            xdnn_io.printClassification(softmaxOut, meta['images'], labels)
            sys.stdout.flush()

            if meta['id'] % 1000 == 0:
                print("Recvd query %d" % meta['id'])
                sys.stdout.flush()

            # Drop references to the request data before the next request.
            del data
            del buf
            del payload

            xspub.send(meta['from'], "success")
        except Exception as e:
            # Keep the worker alive, but record the traceback as well.
            logging.exception("Worker exception %s", e)
def __init__(self, rundir, nFPGA, nDispatchers, batchsz=-1):
    """Prepare the run directory, acquire a runner and start worker threads.

    ``<rundir>/meta.json`` is rewritten with ``num_fpga`` set to ``nFPGA``
    and with any ``publish_id`` / ``subscribe_id`` entries removed. A
    ``Runner`` is then created and stored, together with its tensors and the
    first input/output shapes, as class attributes on ``Dispatcher``.
    Finally ``nDispatchers`` threads running ``self._run`` are started.

    Args:
        rundir: Run directory containing ``meta.json``.
        nFPGA: Value stored under ``num_fpga`` in ``meta.json``.
        nDispatchers: Number of worker threads; the work queue holds up to
            four times this many items.
        batchsz: If not ``-1``, replaces the leading dimension of both the
            input and the output shape.
    """
    # update meta.json with nFPGA
    meta_path = "%s/meta.json" % rundir
    with open(meta_path) as f:
        meta = json.load(f)
    meta['num_fpga'] = nFPGA
    for stale_key in ('publish_id', 'subscribe_id'):
        meta.pop(stale_key, None)
    with open(meta_path, "w") as f:
        json.dump(meta, f)

    # acquire FPGA
    runner = Runner(rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()

    first_in = inTensors[0]
    first_out = outTensors[0]
    inshape = [first_in.dims[axis] for axis in range(first_in.ndims)]
    outshape = [first_out.dims[axis] for axis in range(first_out.ndims)]
    if batchsz != -1:
        # update batch size
        inshape[0] = batchsz
        outshape[0] = batchsz

    # Shared state lives on the class, not on the instance.
    Dispatcher.runner = runner
    Dispatcher.inTensors = inTensors
    Dispatcher.outTensors = outTensors
    Dispatcher.inshape = inshape
    Dispatcher.outshape = outshape

    self.q = Queue(maxsize=nDispatchers * 4)
    self.workers = []
    for _ in range(nDispatchers):
        sys.stdout.flush()
        worker = threading.Thread(target=self._run,
                                  args=(self.q, inshape, outshape))
        self.workers.append(worker)
        worker.start()
def faceDetection(vitis_rundir, outpath, rsz_h, rsz_w, path):
    """Detect faces in every ``*.jpg`` under ``path`` and record the results.

    Images are processed in order of increasing file size and passed to
    ``detect`` at their original size. For every returned rectangle a line
    ``<base name> x1 y1 x2 y2`` is written to ``<outpath>/output.txt``, a
    line with the full path is printed, and a green box is drawn on the
    image. Images with at least one detection are saved into ``outpath``
    under their original base name.

    Args:
        vitis_rundir: Run directory passed to ``Runner``.
        outpath: Output directory; created (including parents) if missing.
        rsz_h: Unused here; kept for interface compatibility.
        rsz_w: Unused here; kept for interface compatibility.
        path: Directory searched for ``*.jpg`` files.
    """
    runner = Runner(vitis_rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = 1

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers.
    # Each buffer takes the tensor's dims with the leading dim replaced by
    # batch_sz.
    fpgaBlobs = [
        [np.empty((batch_sz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                  dtype=np.float32, order='C')
         for t in tensors]
        for tensors in (inTensors, outTensors)
    ]

    os.makedirs(outpath, exist_ok=True)

    # The context manager closes the results file even if detection fails.
    with open(os.path.join(outpath, "output.txt"), 'w') as fp:
        for fn in sorted(glob.glob(path + '/*.jpg'), key=os.path.getsize):
            filename = os.path.basename(fn)
            image_ori = cv2.imread(fn, cv2.IMREAD_COLOR)
            face_rects = detect(runner, fpgaBlobs, image_ori)
            if len(face_rects) == 0:
                continue

            for face_rect in face_rects:
                print("{} {} {} {} {}".format(fn, face_rect[0], face_rect[1],
                                              face_rect[2], face_rect[3]))
                cv2.rectangle(image_ori, (face_rect[0], face_rect[1]),
                              (face_rect[2], face_rect[3]), (0, 255, 0), 2)

            # Same directory as output.txt; plain string concatenation used
            # to drop the image next to outpath when it had no trailing
            # separator.
            cv2.imwrite(os.path.join(outpath, filename), image_ori)

            for face_rect in face_rects:
                fp.write("{} {} {} {} {}".format(filename, face_rect[0],
                                                 face_rect[1], face_rect[2],
                                                 face_rect[3]) + '\n')
def fpga_process(args, num_img, compJson, shared_trans_arrs,shared_output_arrs):
    """Submit batches to the runner asynchronously and report throughput.

    A helper thread running ``fpga_wait`` receives
    ``(write_slot, read_slot, job_id)`` tuples through an internal queue.
    For each batch the last buffer of the input slot is copied into the last
    buffer of the output slot, and the remaining buffers are handed to
    ``runner.execute_async``. The loop ends after ``num_img`` batches (unless
    ``args['perpetual']`` is set) or when ``openReadId()`` returns ``None``.
    A ``(None, None, None)`` sentinel is then queued, the helper thread is
    joined, and the batch count and rate are printed.

    Args:
        args: Mapping providing ``'vitis_rundir'`` and ``'perpetual'``.
        num_img: Number of batches to process when not running perpetually.
        compJson: Not used by this function.
        shared_trans_arrs: Source of input slots.
        shared_output_arrs: Source of output slots.
    """
    runner = Runner(args['vitis_rundir'])
    pending_jobs = mp.Queue(maxsize=100)

    waiter = threading.Thread(
        target=fpga_wait,
        args=(runner, pending_jobs, shared_output_arrs, shared_trans_arrs))
    waiter.start()

    submitted = 0
    began = time.time()
    while submitted < num_img or args['perpetual']:
        # The output slot is reserved before the input slot, as in the
        # original ordering.
        out_slot = shared_output_arrs.openWriteId()
        out_buffers = shared_output_arrs.accessNumpyBuffer(out_slot)

        in_slot = shared_trans_arrs.openReadId()
        if in_slot is None:
            break
        in_buffers = shared_trans_arrs.accessNumpyBuffer(in_slot)

        # The trailing buffer travels from input to output unchanged
        # (the original comment calls it meta data).
        out_buffers[-1][:] = in_buffers[-1][:]

        job_id = runner.execute_async(in_buffers[:-1], out_buffers[:-1])
        pending_jobs.put((out_slot, in_slot, job_id))
        submitted += 1

        if args['perpetual'] == False and submitted == num_img:
            break

    # Sentinel for the waiter thread.
    pending_jobs.put((None, None, None))
    waiter.join()

    elapsed = time.time() - began
    print( "FPGA_process: ", float(submitted), "batch")
    print( "FPGA_process: ", float(submitted)/elapsed, "batch/s")
def main():
    """Classify the images given on the command line, batch by batch.

    Each batch is loaded into the runner's first input buffer, executed, and
    passed through the CPU fully-connected and softmax ops. With ``--golden``
    the top-1/top-5 hit counts are accumulated and the average accuracy is
    printed at the end; otherwise the classification of every batch is
    printed.

    The batch size comes from ``args['batch_sz']``; ``-1`` selects the
    leading dimension of the runner's first input tensor.
    """
    args = xdnn_io.processCommandLine()

    runner = Runner(args['vitis_rundir'])
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = args['batch_sz']
    if batch_sz == -1:
        # use Runner's suggested batch size
        batch_sz = inTensors[0].dims[0]

    goldenMap = None
    top5Count = 0
    top1Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers.
    fpgaBlobs = [
        [np.empty((batch_sz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                  dtype=np.float32, order='C')
         for t in tensors]
        for tensors in (inTensors, outTensors)
    ]

    img_paths = xdnn_io.getFilePaths(args['images'])
    num_images = len(img_paths)
    labels = xdnn_io.get_labels(args['labels'])
    xdnnCPUOp = xdnn.XDNNCPUOp("%s/weights.h5" % args['vitis_rundir'])
    fcOutput = np.empty((batch_sz, args['outsz'],), dtype=np.float32, order='C')

    fpgaInput = fpgaBlobs[0][0]
    for i in range(0, num_images, batch_sz):
        pl = list(img_paths[i:i + batch_sz])

        # fill tensor input data from image file
        # NOTE(review): when the last batch is short, the remaining rows of
        # fpgaInput still hold data from the previous batch.
        for j, p in enumerate(pl):
            img, _ = xdnn_io.loadImageBlobFromFile(p,
                                                   args['img_raw_scale'],
                                                   args['img_mean'],
                                                   args['img_input_scale'],
                                                   fpgaInput.shape[2],
                                                   fpgaInput.shape[3])
            np.copyto(fpgaInput[j], img)

        jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
        runner.wait(jid)

        xdnnCPUOp.computeFC(fpgaBlobs[1][0], fcOutput)
        softmaxOut = xdnnCPUOp.computeSoftmax(fcOutput)
        if args['golden']:
            for j, p in enumerate(pl):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 5)
        else:
            xdnn_io.printClassification(softmaxOut, pl, labels)

    # Skip the summary for an empty image list; the averages would otherwise
    # divide by zero.
    if args['golden'] and num_images > 0:
        print ( ("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n") % (num_images, float(top1Count)/float(num_images)*100., float(top5Count)/float(num_images)*100.) )
def main(argv):
    """Run the calibration images through the DPU on several threads.

    Every file listed in ``calib_image_dir`` is read with OpenCV and
    preprocessed with ``input_fn.preprocess_fn``; the stacked result is
    transposed with axes ``(0, 3, 1, 2)``. ``threadnum`` threads are then
    started, each calling ``runInceptionV1(dpu, images, i * batchSize)`` for
    its thread index ``i``, and the overall frames-per-second figure is
    printed.

    Sets the module globals ``threadnum`` and ``runTotall``.

    Args:
        argv: Argument list; ``argv[1]`` is the thread count and ``argv[2]``
            is passed to ``Runner``.
    """
    global threadnum
    global runTotall

    # create runner
    dpu = Runner(argv[2])

    listimage = os.listdir(calib_image_dir)
    threadnum = int(argv[1])
    runTotall = len(listimage)

    # Image list to be run
    img = [
        input_fn.preprocess_fn(cv2.imread(os.path.join(calib_image_dir, name)))
        for name in listimage
    ]
    imgT = np.transpose(img, (0, 3, 1, 2))

    # run with batch
    time1 = time.time()
    threadAll = [
        threading.Thread(target=runInceptionV1,
                         args=(dpu, imgT, idx * batchSize))
        for idx in range(threadnum)
    ]
    for worker in threadAll:
        worker.start()
    for worker in threadAll:
        worker.join()
    time2 = time.time()

    timetotal = time2 - time1
    fps = float(runTotall / timetotal)
    print("%.2f FPS" % fps)

    del dpu
def run(rundir, chanIdx, q, args):
    """Serve YOLO detection requests arriving on an xstream channel.

    After the runner has been created, ``1`` is put on ``q``. Each received
    payload is a ``(meta, buf)`` pair: ``buf`` is interpreted as float32 data
    of shape ``meta['shape']``, run through the runner and post-processed
    with the YOLO v2/v3 routine. Depending on ``args`` the box counts are
    printed, detections are saved as ``<results_dir>/<image stem>.txt`` and
    visualizations as ``<results_dir>/<image stem>.png``. ``"success"`` is
    sent to ``meta['from']`` after each processed request. The loop ends when
    an empty payload is received.

    Buffers are allocated on the first request using its batch size
    (``meta['shape'][0]``); later requests must match the same input shape.
    Any exception raised while handling a request is logged with its
    traceback and the loop continues with the next request.

    Args:
        rundir: Run directory passed to ``Runner``.
        chanIdx: Channel index, converted with ``chanIdx2Str`` for the
            subscription.
        q: Queue-like object used to signal readiness.
        args: Mapping providing ``'labels'``, ``'yolo_version'``,
            ``'visualize'``, ``'profile'`` and ``'results_dir'`` (plus
            whatever ``bias_selector`` and the post-processing read).

    Raises:
        ValueError: If ``args['yolo_version']`` is neither ``'v2'`` nor
            ``'v3'``.
    """
    xspub = xstream.Publisher()
    xssub = xstream.Subscribe(chanIdx2Str(chanIdx))
    runner = Runner(rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()

    q.put(1)  # ready for work

    fpgaBlobs = None
    labels = xdnn_io.get_labels(args['labels'])

    if args['yolo_version'] == 'v2':
        yolo_postproc = yolo.yolov2_postproc
    elif args['yolo_version'] == 'v3':
        yolo_postproc = yolo.yolov3_postproc
    else:
        # Explicit raise: an assert would be stripped under ``-O`` and leave
        # yolo_postproc unbound.
        raise ValueError("--yolo_version should be <v2|v3>")

    biases = bias_selector(args)
    colors = generate_colors(len(labels)) if args['visualize'] else None

    while True:
        try:
            payload = xssub.get()
            if not payload:
                break
            (meta, buf) = payload

            if fpgaBlobs is None:
                # Allocate buffers on the first request, sized by its batch.
                batchsz = meta['shape'][0]
                fpgaBlobs = [
                    [np.empty((batchsz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                              dtype=np.float32, order='C')
                     for t in tensors]
                    for tensors in (inTensors, outTensors)
                ]

            fpgaInput = fpgaBlobs[0][0]
            # Explicit check instead of a bare assert: it survives ``-O`` and
            # produces a non-empty message in the log.
            if tuple(meta['shape']) != fpgaInput.shape:
                raise ValueError(
                    "Request shape %s does not match FPGA input shape %s"
                    % (tuple(meta['shape']), fpgaInput.shape))

            data = np.frombuffer(buf, dtype=np.float32).reshape(fpgaInput.shape)
            np.copyto(fpgaInput, data)

            jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
            runner.wait(jid)

            boxes = yolo_postproc(fpgaBlobs[1], args, meta['image_shapes'],
                                  biases=biases)

            # A request may carry fewer images than the buffer's batch size.
            num_valid = min(fpgaInput.shape[0], len(meta['image_shapes']))

            if not args['profile']:
                for idx in range(num_valid):
                    print("Detected {} boxes in {}".format(
                        len(boxes[idx]), meta['images'][idx]), flush=True)

            # Save the result
            if args['results_dir']:
                for idx in range(num_valid):
                    fname = meta['images'][idx]
                    filename = os.path.splitext(os.path.basename(fname))[0]
                    out_file_txt = os.path.join(args['results_dir'],
                                                filename + '.txt')
                    print("Saving {} boxes to {}".format(
                        len(boxes[idx]), out_file_txt))
                    sys.stdout.flush()
                    saveDetectionDarknetStyle(out_file_txt, boxes[idx],
                                              meta['image_shapes'][idx])
                    if args['visualize']:
                        out_file_png = os.path.join(args['results_dir'],
                                                    filename + '.png')
                        print("Saving result to {}".format(out_file_png))
                        sys.stdout.flush()
                        draw_boxes(fname, boxes[idx], labels, colors,
                                   out_file_png)

            if meta['id'] % 1000 == 0:
                print("Recvd query %d" % meta['id'])
                sys.stdout.flush()

            # Drop references to the request data before the next request.
            del data
            del buf
            del payload

            xspub.send(meta['from'], "success")
        except Exception as e:
            # Keep the worker alive, but record the traceback as well.
            logging.exception("Worker exception %s", e)
def main():
    """Run YOLO detection over the images given on the command line.

    Images are processed in batches: each batch is loaded into the runner's
    first input buffer, executed and post-processed with the YOLO v2/v3
    routine. The number of boxes per image is printed. With ``--results_dir``
    the detections are saved as ``<results_dir>/<image stem>.txt`` (and as
    ``.png`` when ``--visualize`` is set). ``--profile`` prints the average
    per-image latency of each stage, and ``--golden`` triggers the mAP
    calculation over the saved results.

    Raises:
        ValueError: If ``args['yolo_version']`` is neither ``'v2'`` nor
            ``'v3'``.
    """
    parser = xdnn_io.default_parser_args()
    parser = yolo_parser_args(parser)
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    # Setup the environment
    img_paths = xdnn_io.getFilePaths(args['images'])
    num_images = len(img_paths)
    labels = None
    colors = None
    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args['results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    if args['yolo_version'] == 'v2':
        yolo_postproc = yolo.yolov2_postproc
    elif args['yolo_version'] == 'v3':
        yolo_postproc = yolo.yolov3_postproc
    else:
        # Fail early instead of hitting an unbound yolo_postproc in the loop.
        raise ValueError("--yolo_version should be <v2|v3>")

    runner = Runner(args['vitis_rundir'])

    # Setup the blobs
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = args['batch_sz']
    if batch_sz == -1:
        batch_sz = inTensors[0].dims[0]

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers.
    fpgaBlobs = [
        [np.empty((batch_sz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                  dtype=np.float32, order='C')
         for t in tensors]
        for tensors in (inTensors, outTensors)
    ]
    fpgaInput = fpgaBlobs[0][0]

    # Setup the YOLO config
    net_h, net_w = fpgaInput.shape[-2:]
    args['net_h'] = net_h
    args['net_w'] = net_w
    biases = bias_selector(args)

    # Setup profiling env
    prep_time = 0
    exec_time = 0
    post_time = 0

    # Start the execution
    for start in range(0, num_images, batch_sz):
        pl = []
        img_shapes = []

        # Prep images
        t1 = timeit.default_timer()
        for j, p in enumerate(img_paths[start:start + batch_sz]):
            fpgaInput[j, ...], img_shape = xdnn_io.loadYoloImageBlobFromFile(
                p, net_h, net_w)
            pl.append(p)
            img_shapes.append(img_shape)
        t2 = timeit.default_timer()

        # Execute
        jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
        runner.wait(jid)

        # Post Proc
        t3 = timeit.default_timer()
        boxes = yolo_postproc(fpgaBlobs[1], args, img_shapes, biases=biases)
        t4 = timeit.default_timer()

        prep_time += (t2 - t1)
        exec_time += (t3 - t2)
        post_time += (t4 - t3)

        # The last batch may hold fewer images than batch_sz.
        num_valid = min(batch_sz, len(img_shapes))

        for k in range(num_valid):
            print("Detected {} boxes in {}".format(len(boxes[k]), pl[k]))

        # Save the result
        if args['results_dir']:
            for k in range(num_valid):
                filename = os.path.splitext(os.path.basename(pl[k]))[0]
                out_file_txt = os.path.join(args['results_dir'],
                                            filename + '.txt')
                print("Saving {} boxes to {}".format(len(boxes[k]), out_file_txt))
                sys.stdout.flush()
                saveDetectionDarknetStyle(out_file_txt, boxes[k], img_shapes[k])
                if args['visualize']:
                    out_file_png = os.path.join(args['results_dir'],
                                                filename + '.png')
                    print("Saving result to {}".format(out_file_png))
                    sys.stdout.flush()
                    draw_boxes(pl[k], boxes[k], labels, colors, out_file_png)

    # Profiling results (skipped for an empty image list, where the averages
    # would divide by zero)
    if args['profile'] and num_images > 0:
        print("\nAverage Latency in ms:")
        print(" Image Prep: {0:3f}".format(prep_time * 1000.0 / num_images))
        print(" Exec: {0:3f}".format(exec_time * 1000.0 / num_images))
        print(" Post Proc: {0:3f}".format(post_time * 1000.0 / num_images))
        sys.stdout.flush()

    # mAP calculation
    if args['golden']:
        print()
        print("Computing mAP score : ")
        print("Class names are : {} ".format(labels))
        # The return value was never used; presumably the helper reports the
        # score itself -- TODO confirm.
        calc_detector_mAP(args['results_dir'], args['golden'], len(labels),
                          labels, args['prob_threshold'],
                          args['mapiouthresh'], args['points'])
        sys.stdout.flush()