def __init__(self, param_str):
    """Open the FPGA, create the XDNN runtime and allocate I/O buffers.

    Args:
        param_str: Text taken from the prototxt that evaluates to the
            argument mapping passed to ``xdnn_io.make_dict_args``.

    Raises:
        Exception: If ``xdnn.createHandle`` returns a non-zero status.
    """
    self.input_mean_value_ = 128.0
    self.input_scale_ = 1.0

    # SECURITY NOTE: eval() executes arbitrary code, so param_str must come
    # from a trusted prototxt. It is left as eval() because the accepted
    # syntax is not visible from here; if only literals are ever passed,
    # ast.literal_eval would be the safer choice.
    param_dict = eval(param_str)  # Get args from prototxt
    self._args = xdnn_io.make_dict_args(param_dict)
    self._numPE = self._args["batch_sz"]  # Bryan hack to determine number of PEs in FPGA

    # Establish FPGA Communication, Load bitstream
    ret, handles = xdnn.createHandle(self._args["xclbin"], "kernelSxdnn_0")
    if ret != 0:
        raise Exception("Failed to open FPGA handle.")

    self._args["scaleB"] = 1
    self._args["PE"] = -1

    # Instantiate runtime interface object
    self._fpgaRT = xdnn.XDNNFPGAOp(handles, self._args)
    self._parser = xdnn.CompilerJsonParser(self._args["netcfg"])

    self._indictnames = self._parser.getInputs()
    self._outdictnames = self._parser.getOutputs()

    # Pair every tensor name with its own shape straight from the mapping.
    # .items() exists on both Python 2 and 3, unlike .itervalues(), and it
    # avoids indexing a map() result, which is a lazy iterator on Python 3.
    self._indict = {
        name: np.empty(tuple(shape), dtype=np.float32)
        for name, shape in self._indictnames.items()
    }
    self._outdict = {
        name: np.empty(tuple(shape), dtype=np.float32)
        for name, shape in self._outdictnames.items()
    }
def initialize(self, args):
    """Prepare the pre-processing node, then hand control to ``self.run()``.

    Creates the xstream handle, parses the compiler JSON named by
    ``args['netcfg']`` and records the shape of the first network input.
    """
    self._xsIn = xstream.Base()

    compiler_json = xdnn.CompilerJsonParser(args['netcfg'])
    self._compJson = compiler_json
    self._firstInputShape = next(itervalues(compiler_json.getInputs()))

    # A mean array is deliberately not used for now. In theory it would
    # avoid broadcasting, but it is harder to pass in from the recipe.
    # It can be hacked in if that performance boost becomes necessary
    # (HWC layout, since that is the native format out of jpeg decode):
    # self._meanarr = np.zeros ( (self._firstInputShape[2], self._firstInputShape[3], self._firstInputShape[1],), dtype = np.float32, order='C' )
    # self._meanarr += args['img_mean']

    if self.master:
        print("Pre is starting loop")
    self.run()
def initialize(self, args):
    """Prepare the post-processing node, then hand control to ``self.run()``.

    Loads the Caffe net given by ``args['deploymodel']`` and
    ``args['caffemodel']``, allocates the ``netOut`` buffer, records the
    network input height/width in ``self._args`` and precomputes the
    offsets used to split a concatenated FPGA output buffer.
    """
    self.numProcessed = 0
    self.startTime = timeit.default_timer()

    self.net = caffe.Net(args['deploymodel'], args['caffemodel'], caffe.TEST)
    blobs = self.net.blobs
    conv_tail_shape = blobs['layer31-conv'].data.shape[1:]
    self.netOut = np.empty((args['batch_sz'], ) + conv_tail_shape,
                           dtype=np.float32)
    self.biases = bias_selector(args)

    data_shape = blobs['data'].data.shape
    self._args['net_h'] = data_shape[2]
    self._args['net_w'] = data_shape[3]

    # Output shapes come from the compiler JSON; their leading dimension is
    # overwritten with the configured batch size.
    compiler_json = xdnn.CompilerJsonParser(self._args['netcfg'])
    self.fpgaOutputShapes = list(itervalues(compiler_json.getOutputs()))
    for shape in self.fpgaOutputShapes:
        shape[0] = self._args['batch_sz']

    # indices for unpacking concatenated arrays to individual array.
    self.buf_indices = [0]
    for shape in self.fpgaOutputShapes:
        next_offset = self.buf_indices[-1] + np.prod(shape)
        self.buf_indices.append(next_offset)

    print("Post is starting loop")
    self.run()
def run(args=None):
    """Run the pre-process -> FPGA -> post-process classification pipeline.

    Images found under ``args['images']`` are split into chunks of
    ``args['batch_sz']``. A pool of ``args['numprepproc']`` workers runs
    ``run_pre_process`` on the chunks, while one ``fpga_process`` and one
    ``post_process`` child process exchange data through shared-memory
    queues.

    Args:
        args: Argument dict. When falsy, the command line is parsed with
            ``xdnn_io.default_parser_args()`` plus the options added below
            and converted with ``xdnn_io.make_dict_args``.
    """
    if not args:
        parser = xdnn_io.default_parser_args()
        parser.add_argument('--numprepproc', type=int, default=1,
                            help='number of parallel processes used to decode and quantize images')
        parser.add_argument('--numstream', type=int, default=16,
                            help='number of FPGA streams')
        parser.add_argument('--deviceID', type=int, default=0,
                            help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
        parser.add_argument('--benchmarkmode', type=int, default=0,
                            help='bypass pre/post processing for benchmarking')
        parser.add_argument('--profile', action='store_true',
                            help='Print average latencies for preproc/exec/postproc')
        args = parser.parse_args()
        args = xdnn_io.make_dict_args(args)

    fpgaOutputs = []
    batch_sz = args['batch_sz']

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])
    input_shapes = list(compilerJSONObj.getInputs().values())
    output_shapes = list(compilerJSONObj.getOutputs().values())
    # Overwrite the leading dimension of every shape, in place, with the
    # configured batch size.
    for shape in input_shapes:
        shape[0] = batch_sz
    for shape in output_shapes:
        shape[0] = batch_sz

    num_shared_slots = args['numstream']

    # shared memory from preprocessing to fpga forward
    shared_trans_arrs = SharedMemoryQueue(
        "trans",
        num_shared_slots * (args['numprepproc'] * batch_sz),
        input_shapes + [(batch_sz, 4)])
    # shared memory from fpga forward to postprocessing
    shared_output_arrs = SharedMemoryQueue(
        "output", num_shared_slots, output_shapes + [(batch_sz, 4)])

    # Form list of images to chunks of batch_sz
    img_paths = xdnn_io.getFilePaths(args['images'])
    imgids = list(range(len(img_paths)))
    imgid_chunks = [imgids[i:i + batch_sz]
                    for i in range(0, len(img_paths), batch_sz)]

    # Start all processes
    p = mp.Pool(initializer=init_pre_process,
                initargs=(args, img_paths, input_shapes, shared_trans_arrs, ),
                processes=args['numprepproc'])

    xdnnProc = mp.Process(target=fpga_process,
                          args=(args, len(imgid_chunks), compilerJSONObj,
                                shared_trans_arrs, shared_output_arrs,))
    postProc = mp.Process(target=post_process,
                          args=(args, img_paths, fpgaOutputs, output_shapes,
                                shared_output_arrs,))

    xdnnProc.start()
    postProc.start()
    t1 = timeit.default_timer()

    if args['perpetual']:
        # Feed the same chunks forever, waiting for each pass to finish
        # before queueing the next. This loop never exits, so the joins
        # below are only reached in the non-perpetual case.
        while True:
            p.map_async(run_pre_process, imgid_chunks).wait()
    else:
        # Fire and forget: completion is observed by joining the FPGA and
        # post-processing processes below.
        p.map_async(run_pre_process, imgid_chunks)

    xdnnProc.join()
    postProc.join()

    p.close()
    p.join()
    t2 = timeit.default_timer()
    total_t = t2 - t1
    if args['profile']:
        print("Total time taken: {} s\n Total images: {}\nAverage FPS: {}".format(
            total_t, len(img_paths), len(img_paths) / total_t))
def run(args=None):
    """Build the "yolo_v2" xstream graph, push images through it and report.

    Args:
        args: Argument dict. When falsy, the command line is parsed
            (default xdnn_io options, the YOLO options and the flags
            added below) and a default ``preprocseq`` is installed.

    With ``args['golden']`` set, the detector mAP is computed at the end
    from the detections saved in ``args['results_dir']``.
    """
    if not args:
        arg_parser = yolo_parser_args(xdnn_io.default_parser_args())
        arg_parser.add_argument('--startxstream', default=True,
                                action='store_true',
                                help='automatically start obj store server')
        arg_parser.add_argument('--servermode', default=False,
                                action='store_true',
                                help='accept images from another process')
        arg_parser.add_argument("--deploymodel", type=str, default='',
                                help='Original prototxt')
        arg_parser.add_argument("--caffemodel", type=str, default='',
                                help='Original caffemodel')
        args = xdnn_io.make_dict_args(arg_parser.parse_args())
        args['preprocseq'] = [
            ('resize', (224, 224)),
            ('meansub', [104.007, 116.669, 122.679]),
            ('chtranspose', (2, 0, 1)),
        ]

    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args['results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    # Both switches are forced here regardless of what was passed in.
    args['startxstream'] = True
    args['servermode'] = False

    args['timerQ'] = Queue()

    net_inputs = xdnn.CompilerJsonParser(args['netcfg']).getInputs()
    first_shape = next(itervalues(net_inputs))
    args['net_h'] = first_shape[2]
    args['net_w'] = first_shape[3]

    # start object store
    # (make sure to 'pip install pyarrow')
    xserver = xstream.Server() if args['startxstream'] else None

    graph = grapher.Graph("yolo_v2")
    graph.node("prep", yolov2_pre.Node, args)
    graph.node("fpga", yolov2_fpga.Node, args)
    graph.node("post", yolov2_post.Node, args)

    wiring = (
        ("START", None, "prep"),
        ("prep", "prep", "fpga"),
        ("fpga", "fpga", "post"),
        ("DONE", "post", "fpga"),
        ("DONE", "post", None),
    )
    for edge_spec in wiring:
        graph.edge(*edge_spec)

    if args['servermode']:
        print("Serving %s -> %s" % (graph._in[0], graph._out[0]))
        graph.serve()
    else:
        graph.serve(background=True)
        img_paths = xdnn_io.getFilePaths(args['images'])
        requester = mp.Process(
            target=request_process,
            args=(args, img_paths, graph._in[0], graph._out[0], ))

        started_at = timeit.default_timer()
        requester.start()
        requester.join()
        graph.stop(kill=False)

        # The end timestamp is read from the timer queue rather than taken
        # locally.
        finished_at = args['timerQ'].get()
        elapsed = finished_at - started_at
        args['timerQ'].close()

        print("Total time : {}s for {} images".format(elapsed, len(img_paths)))
        print("Average FPS : {} imgs/sec".format(len(img_paths) / elapsed))

    # mAP calculation
    if args['golden']:
        print(flush=True)
        print("Computing mAP score : ", flush=True)
        print("Class names are : {} ".format(labels), flush=True)
        mAP = calc_detector_mAP(args['results_dir'], args['golden'],
                                len(labels), labels,
                                args['prob_threshold'], args['mapiouthresh'],
                                args['points'])
        sys.stdout.flush()
def initialize(self, args):
    """Set up buffers, queues, workers and threads, then enter ``self.run()``.

    Reads ``args['netcfg']``, ``args['numstream']``, ``args['batch_sz']``
    and ``args['numprepproc']``. Starts ``numprepproc`` ``ingest_worker``
    processes plus one ingest thread and one wait thread before calling
    ``self.run()``.
    """
    # make sure to set up subscribe sockets first, so we don't miss messages
    self.sub_0 = self.get_sub(0)
    self.sub_1 = self.get_sub(1)

    self._compJson = xdnn.CompilerJsonParser(args['netcfg'])
    self._fpgaRT = xdnn.XDNNFPGAOp(args)
    self._numStreams = args['numstream']

    # allocate twice as many buffers than streams here because...
    # we know when 'a' stream/buffer completes, but we don't keep track
    # of exactly which stream/buffer is freed.
    # using double the buffers ensures that we are never clobbering
    # existing streams/buffers
    self._numStreamBuffers = self._numStreams * 2
    self._numStreamsActive = 0
    self._currStreamIdx = 0
    self._bsz = args['batch_sz']
    self._inputBuffers = []
    self._outputBuffers = []

    self._firstInputShape = next(itervalues(self._compJson.getInputs()))
    # firstOutputShape = next(itervalues(self._compJson.getOutputs()))
    outputShapes = list(itervalues(self._compJson.getOutputs()))
    outputNames = list(iterkeys(self._compJson.getOutputs()))

    # One shared float array per (batch slot x stream buffer), and alongside
    # each one a list holding one batch-sized ndarray per network output.
    for si in range(self._bsz * self._numStreamBuffers):
        # .tolist() converts the NumPy scalar from np.prod into a plain
        # Python number for the mp.Array size argument.
        self._inputBuffers.append(
            mp.Array(ctypes.c_float,
                     np.prod(tuple(self._firstInputShape)).tolist()))
        bufs = []
        for outputShape in outputShapes:
            # Leading dimension is replaced by the batch size.
            bufs.append(
                np.empty((self._bsz, ) + tuple(outputShape[1:]),
                         dtype=np.float32,
                         order='C'))
        self._outputBuffers.append(bufs)

    # print("outputBuffer : ", [len(item) for item in self._outputBuffers])

    # Pipeline:
    # 1) ingest
    #    collect individual requests into 1 batch for 1 stream
    # 2) ingest_worker(s)
    #    copy individual object store blobs into local buffers for 1 stream
    # 3) loop
    #    submit fpga job
    # 4) wait
    #    wait for fpga job
    # Both queues are bounded by the number of input buffers.
    self._qingest = mp.Queue(maxsize=len(self._inputBuffers))
    self._qfpga = mp.Queue(maxsize=len(self._inputBuffers))

    # spawn ingest_workers to copy remote buffers to local buffer
    self._ingestWorkers = []
    for pi in range(args['numprepproc']):
        p = mp.Process(target=ingest_worker,
                       args=(
                           self._qingest,
                           self._qfpga,
                           self._firstInputShape,
                           self._inputBuffers,
                       ))
        p.start()
        self._ingestWorkers.append(p)

    # ingest thread dispatches incoming work to ingest_workers
    self._ingestThread = threading.Thread(target=self.ingest,
                                          args=(self._qingest, self._qfpga))
    self._ingestThread.start()

    # wait thread collects completed FPGA results and sends forward
    self._qwait = mp.Queue(maxsize=len(self._inputBuffers))
    self._waitThread = threading.Thread(target=self.wait,
                                        args=(
                                            self._qwait,
                                            outputNames,
                                        ))
    self._waitThread.start()

    print("Starting FPGA loop")
    self.run()
sys.stdout.flush() mp_classify.register_pre(YoloPreProcess) mp_classify.register_post(YoloPostProcess) if __name__ == '__main__': parser = xdnn_io.default_parser_args() parser = yolo_parser_args(parser) args = parser.parse_args() args = xdnn_io.make_dict_args(args) if(args['golden'] or args['visualize']): assert args['labels'], "Provide --labels to compute mAP." assert args['results_dir'], "For accuracy measurements, provide --results_dir to save the detections." compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg']) input_shapes = [v for k,v in compilerJSONObj.getInputs().items()] output_shapes = [v for k,v in compilerJSONObj.getOutputs().items()] for out_idx in range(len(output_shapes)): output_shapes[out_idx][0] = args['batch_sz'] input_sizes = map(lambda x: np.prod(x), input_shapes) output_sizes = map(lambda x: np.prod(x), output_shapes) out_w = output_shapes[0][2] out_h = output_shapes[0][3] args['net_h'] = int(input_shapes[0][2]) args['net_w'] = int(input_shapes[0][3])
cap = cv2.VideoCapture(args.videofile) if not cap.isOpened(): raise ValueError("Couldn't read the video file {}".format( args.videofile)) frame_q = mp.Queue() resize_q = mp.Queue() trans_q = mp.Queue() output_q = mp.Queue() face_q = mp.Queue() scale_q = mp.Queue() ready_fpga = mp.Queue() sharedInputArrs = [] compilerJSONObj = xdnn.CompilerJsonParser(args.vitisrundir + '/compiler.json') input_shapes = [v for k, v in compilerJSONObj.getInputs().items()] output_shapes = [v for k, v in compilerJSONObj.getOutputs().items()] input_sizes = list(map(lambda x: np.prod(x), input_shapes)) output_sizes = map(lambda x: np.prod(x), output_shapes) input_list = input_shapes[0] N, C, H, W = input_list[0], input_list[1], input_list[2], input_list[3] # shared memory from video capture to preprocessing shared_frame_arrs = SharedMemoryQueue("frame", num_shared_slots, [(H, W, C)]) # shared memory from preprocessing to fpga forward shared_trans_arrs = SharedMemoryQueue("trans", num_shared_slots, [(H, W, C)] + input_shapes)
print('drawing boxes time: {0} seconds'.format(end_time - start_time)) if __name__ == '__main__': frame_q = mp.Queue() resize_q = mp.Queue() trans_q = mp.Queue() output_q = mp.Queue() face_q = mp.Queue() ready_fpga = mp.Queue() sharedInputArrs = [] compilerJSONObj = xdnn.CompilerJsonParser('deploy.compiler.json') input_shapes = map(lambda x: tuple(x), compilerJSONObj.getInputs().itervalues()) output_shapes = map(lambda x: tuple(x), compilerJSONObj.getOutputs().itervalues()) input_sizes = map(lambda x: np.prod(x), input_shapes) output_sizes = map(lambda x: np.prod(x), output_shapes) print input_shapes print output_shapes # shared memory from video capture to preprocessing shared_frame_arrs = SharedMemoryQueue("frame", num_shared_slots, [(320, 320, 3)])