# assumes module-level imports: sys, ctypes, numpy as np, multiprocessing as mp, xdnn, xdnn_io
def main():
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--numprepproc', type=int, default=1,
                        help='number of parallel processes used to decode and quantize images')
    parser.add_argument('--numstream', type=int, default=16,
                        help='number of FPGA streams')
    parser.add_argument('--deviceID', type=int, default=0,
                        help='FPGA device ID to use when multiple FPGAs are present')
    parser.add_argument('--benchmarkmode', type=int, default=0,
                        help='bypass pre/post processing for benchmarking')
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    ret = xdnn.createManager()
    if ret != True:
        sys.exit(1)

    sharedInputArrs = []
    fpgaOutputs = []

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

    qPrep = mp.Queue(maxsize=args['numprepproc'] * 10)
    qFpga = mp.Queue(maxsize=100)
    streamQ = mp.Queue(maxsize=args['numstream'])
    prepProcQ = mp.Queue(maxsize=100)

    firstOutputShape = compilerJSONObj.getOutputs().itervalues().next()
    firstInputShape = compilerJSONObj.getInputs().itervalues().next()

    # one shared output buffer per FPGA stream
    for i in range(args['numstream']):
        fpgaOutputs.append(mp.Array(ctypes.c_float,
                                    args['batch_sz'] * np.prod(tuple(firstOutputShape[1:]))))
        streamQ.put(i)

    # pool of shared input buffers, recycled through prepProcQ
    for i in range(100):
        bufSize = np.prod(tuple(firstInputShape))
        sharedInputArrs.append(mp.Array(ctypes.c_float, bufSize))
        prepProcQ.put(i)

    img_paths = xdnn_io.getFilePaths(args['images'])

    p = mp.Pool(initializer=init_prepImage,
                initargs=(args, qPrep, img_paths, sharedInputArrs, prepProcQ, compilerJSONObj,),
                processes=args['numprepproc'])

    xdnnProc = mp.Process(target=fpga_process_async,
                          args=(qPrep, qFpga, args, len(img_paths), sharedInputArrs,
                                prepProcQ, streamQ, fpgaOutputs, compilerJSONObj,))
    xdnnProc.start()

    postProc = mp.Process(target=post_process,
                          args=(qFpga, args, img_paths, streamQ, fpgaOutputs,))
    postProc.start()

    if args['perpetual']:
        while True:
            res = [p.map_async(run_prepImage, range(len(img_paths)))]
            for j in res:
                j.wait()
                del j
    else:
        p.map_async(run_prepImage, range(len(img_paths)))

    xdnnProc.join()
    postProc.join()

    p.close()
    p.join()
def init_fpga():
    global g_inputs
    global g_inputbuf
    global g_fpgaOutput
    global g_weightsBlob
    global g_fcWeight
    global g_fcBias

    print(" --- INIT FPGA --- \n")
    print("xclbin: {0}.\n".format(g_xclbin))
    print("xdnnLib: {0}.\n".format(g_xdnnLib))

    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        raise SystemExit("Error: xdnn createManager failed.")

    (g_fcWeight, g_fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    ret = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib, g_numDevices)
    if ret:
        raise SystemExit("ERROR: Unable to create handle to FPGA")
    else:
        print("INFO: Successfully created handle to FPGA.")

    # magic values; see the ml-suite/notebooks tutorial. Should we override PE?
    args = {
        'datadir': g_xdnnTestDataDir,
        'quantizecfg': g_fpgaCfgFile,
        'scaleA': g_scaleA,
        'scaleB': g_scaleB,
        'PE': -1,
        'netcfg': g_netFile
    }

    print(" --- load weights --- \n")
    g_weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

    print(" --- read label file --- \n")
    with open(g_lableFile, 'r') as f:
        for line in f:
            g_labelarray.append(line.strip())

    print(" --- prepare inputs --- \n")
    g_inputs = np.zeros((g_batchSize, g_img_c * g_img_h * g_img_w), dtype=np.float32)
    g_inputbuf = np.zeros((g_batchSize, g_img_c, g_img_h, g_img_w), dtype=np.float32)
    print("g_inputs %s" % g_inputs)

    print(" --- prepare outputs --- \n")
    g_fpgaOutput, fpgaHandle = xdnn.makeFPGAFloatArray(g_fpgaOutputSize * g_batchSize)
def prep_process(q):
    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    while True:
        (inputs, inputImageFiles) = prepareImages()
        if inputs is None:
            break

        fpgaInputs = xdnn.quantizeInputs(g_firstFpgaLayerName, g_fpgaCfgFile,
                                         g_scaleB, inputs)
        q.put((fpgaInputs, inputImageFiles))

    # signal the downstream consumer that no more work is coming
    q.put((None, None))
def __init__(self, args, q, img_paths, sharedInputArrs, prepProcQ, compJson):
    ret = xdnn.createManager()
    if ret != True:
        sys.exit(1)

    np.random.seed(123)  # for reproducibility

    self._args = args
    self._firstInputShape = compJson.getInputs().itervalues().next()
    self._q = q
    self._imgpaths = img_paths

    current = mp.current_process()
    self._procid = (int(current._identity[0]) - 1) % args['numprepproc']

    self._sharedmem = sharedInputArrs
    self._prepQ = prepProcQ

    # HWC format, as this is the native format that comes out of jpeg decode
    self._meanarr = np.zeros((self._firstInputShape[2],
                              self._firstInputShape[3],
                              self._firstInputShape[1],),
                             dtype=np.float32, order='C')
    self._meanarr += args['img_mean']
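# The class above keeps its mean array in HWC order because decoded JPEGs arrive
# as height x width x channel. A minimal sketch of how such a mean array might be
# applied before filling the shared FPGA input buffer (hypothetical helper, not
# the actual run_prepImage; assumes `img` is an HWC numpy array already resized
# to the network's input height and width):
def _subtract_mean_and_transpose(img, meanarr):
    # subtract the per-channel mean while still in HWC layout
    img = img.astype(np.float32) - meanarr
    # reorder to CHW, matching the compiler's (N, C, H, W) input shape
    return np.ascontiguousarray(img.transpose(2, 0, 1))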
def prep_process(q, sharedInputArrs):
    global g_numImages
    global g_numProcessed

    #p_history = {}
    #p_history["y"] = []
    #p_history["t"] = []

    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    shMemIdx = -1
    while True:
        #p_history["y"].append(1)
        #p_history["t"].append(timeit.default_timer())

        shMemIdx = (shMemIdx + 1) % len(sharedInputArrs)

        # WARNING: shared mem below is not synchronized.
        # Currently relies on shared mem banks being consumed faster
        # than the next cycle of writes can come along.
        # Be sure to add enough shared mem banks to feed the FPGA.
        sharedInputArr = sharedInputArrs[shMemIdx]
        sharedNpArr = np.frombuffer(sharedInputArr, np.int16)

        if g_ldPreProcImgsDir is not None:
            (fpgaInputs, inputImageFiles) = loadNpyImages(sharedNpArr)
        else:
            (fpgaInputs, inputImageFiles) = prepareImages(sharedNpArr)

        if fpgaInputs is None:
            break

        putImages(shMemIdx, q)

        #p_history["y"].append(0)
        #p_history["t"].append(timeit.default_timer())

    #plt.plot(np.array(p_history["t"]), np.array(p_history["y"]))
    #plt.show()
    #print p_history

    # signal the downstream consumer that no more work is coming
    q.put(None)
    g_perfProf.syncToShared()
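# The WARNING in prep_process() notes that the shared memory banks are not
# synchronized and simply assume the consumer keeps up. One way to make the
# hand-off explicit, mirroring the prepProcQ free-list used in the mp_classify
# main() above, is to recycle bank indices through a queue. Sketch only, under
# that assumption; the consumer would return each index to freeQ once it has
# copied the data out.
def prep_process_synced(q, freeQ, sharedInputArrs):
    while True:
        shMemIdx = freeQ.get()   # block until a bank has been released
        sharedNpArr = np.frombuffer(sharedInputArrs[shMemIdx], np.int16)
        (fpgaInputs, inputImageFiles) = prepareImages(sharedNpArr)
        if fpgaInputs is None:
            freeQ.put(shMemIdx)  # nothing written; give the bank back
            break
        # consumer calls freeQ.put(shMemIdx) when it is done with this bank
        q.put((shMemIdx, inputImageFiles))
    q.put(None)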
# assumes module-level imports: sys, timeit, xdnn, xdnn_io, and
# `from multiprocessing import Process, Queue`
def main():
    processCommandLine()

    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    #
    # Spawn the first 2 stages of our pipeline
    # Stage 1: Process JPG
    # Stage 2: Run FPGA "classify"
    #
    qPrep = Queue(maxsize=1)
    qFpga = Queue(maxsize=1)
    prepProc = Process(target=prep_process, args=(qPrep,))
    xdnnProc = Process(target=xdnn_process, args=(qPrep, qFpga))
    prepProc.start()
    xdnnProc.start()

    #
    # The rest of this function post-processes FPGA output:
    # 1) Compute the final FC + Softmax layers
    # 2) Print classification & accuracy
    #
    zmqPub = None
    if g_zmqPub:
        zmqPub = ZmqResultPublisher()

    goldenMap = None
    if g_goldenFile:
        goldenMap = getGoldenMap(g_goldenFile, g_labelFile)

    numProcessed = 0
    allTop1 = 0
    allTop5 = 0
    while True:
        loopTime = timeit.default_timer()

        (fpgaOutput, inputImageFiles) = qFpga.get()
        if fpgaOutput is None and inputImageFiles is None:
            break

        startTime = timeit.default_timer()
        fcOutput = xdnn.computeFC(fcWeight, fcBias, fpgaOutput,
                                  g_batchSize, g_outputSize, g_fpgaOutputSize, g_useBlas)
        elapsedTime = timeit.default_timer() - startTime
        print("[time] FC (%.2f ms)" % (elapsedTime * 1000))

        startTime = timeit.default_timer()
        smaxOutput = xdnn.computeSoftmax(fcOutput, g_batchSize)
        elapsedTime = timeit.default_timer() - startTime
        #print("\nAfter Softmax (%.2f ms):" % (elapsedTime * 1000))

        numProcessed += g_batchSize
        (top1, top5) = printClassification(smaxOutput.flatten().tolist(),
                                           g_outputSize, inputImageFiles, g_labelFile,
                                           goldenMap, zmqPub=zmqPub)
        if goldenMap:
            print("Accuracy (i=%d) Top-1: %d, Top-5: %d"
                  % (numProcessed / g_batchSize, top1, top5))
            allTop1 += top1
            allTop5 += top5

        print("Num processed: %d" % numProcessed)
        print("\n[time] Total loop (%.2f ms)"
              % ((timeit.default_timer() - loopTime) * 1000))

    if goldenMap and numProcessed:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n"
              % (numProcessed,
                 float(allTop1) / float(numProcessed) * 100.,
                 float(allTop5) / float(numProcessed) * 100.))

    prepProc.join()
    xdnnProc.join()
def post_process():
    global g_numProcessed

    processCommandLine()

    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    loadImages()

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    # sharedInputArrs = rolling bank of shared memory blocks
    # -- 1 bank for each stream
    sharedInputArrs = []
    for i in range(4):
        sharedInputArrs.append(sharedctypes.RawArray(
            ctypes.c_short, g_fpgaBatchSize * g_paddedImageSize))

    # Spawn the first 2 stages of our pipeline
    # Stage 1: Process JPG
    # Stage 2: Run FPGA "classify"
    qFpga = Queue(maxsize=1)
    qPrep = Queue(maxsize=1)
    qMsgFromXdnn = Queue(maxsize=1)

    # start FPGA proc first to make sure FPGA is done initializing
    xdnnProc = Process(target=xdnn_process,
                       args=(qPrep, qFpga, qMsgFromXdnn, sharedInputArrs))
    xdnnProc.start()

    # only start prep proc after FPGA xdnn proc is ready
    xdnnReady = qMsgFromXdnn.get()
    prepProc = Process(target=prep_process, args=(qPrep, sharedInputArrs))
    prepProc.start()

    #
    # The rest of this function post-processes FPGA output:
    # 1) Compute the final FC + Softmax layers
    # 2) Print classification & accuracy
    #
    zmqPub = None
    if g_zmqPub:
        zmqPub = ZmqResultPublisher()

    goldenMap = None
    if g_goldenFile:
        goldenMap = getGoldenMap(g_goldenFile)

    g_numProcessed = 0
    allTop1 = 0
    allTop5 = 0
    startTime = None
    while True:
        loopTime = timeit.default_timer() * (-1)

        fpgaOutput = getFpgaOutputs(qFpga)
        if g_numImages is not None and g_numProcessed >= g_numImages:
            break
        if fpgaOutput is None:
            break

        inputImageFiles = []
        for i in range(g_batchSize):
            idx = (g_numProcessed + i) % len(g_allInputImageFiles)
            inputImageFiles.append(g_allInputImageFiles[idx])

        if g_bypassFC:
            fcOutput = np.zeros(g_batchSize * g_outputSize)
        else:
            fcOutput = fullyConnected(fcWeight, fcBias, fpgaOutput,
                                      g_batchSize, g_outputSize, g_fpgaOutputSize, g_useBlas)

        smaxOutput = softmax(fcOutput, g_batchSize)

        loopTime += timeit.default_timer()
        loopTime *= 1000  # ms

        g_numProcessed += g_batchSize

        if not g_bypassLoad:
            (top1, top5) = reportAccuracy(smaxOutput.flatten().tolist(),
                                          g_outputSize, inputImageFiles, g_labels,
                                          goldenMap, zmqPub, True)
            allTop1 += top1
            allTop5 += top5

        #g_perfProf.drawBars(g_batchSize, loopTime)

        if startTime is None:
            # set startTime after skipping 1st iteration
            startTime = timeit.default_timer()

    endTime = timeit.default_timer()
    elapsed = (endTime - startTime) * 1000  # ms

    prepProc.join()
    xdnnProc.join()

    g_perfProf.syncToShared()
    g_perfProf.printSummary()

    if g_numProcessed > 1:
        numProfiled = g_numProcessed - 1  # we skipped 1 iter to flush pipe
        print("===========================================")
        print("Performance Summary\n")
        print("  Images: %d" % (g_numProcessed))
        if goldenMap is not None:
            print("  Top1: %.2f%%" % (100 * allTop1 / float(g_numProcessed)))
            print("  Top5: %.2f%%" % (100 * allTop5 / float(g_numProcessed)))
        print("  Batch Size: %d" % (g_batchSize))
        print("  Total Batches: %d" % (numProfiled / g_batchSize))
        print("  Total Time: %.2f ms" % (elapsed))
        print("  Time/Batch: %.2f ms" % (g_batchSize * elapsed / numProfiled))
        print("  Time/Image: %.2f ms" % (elapsed / numProfiled))
        print("  Images/Second: %f" % (1000 * numProfiled / elapsed))
        print("===========================================\n")
def main():
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--numprepproc', type=int, default=1,
                        help='number of parallel processes used to decode and quantize images')
    parser.add_argument('--numstream', type=int, default=16,
                        help='number of FPGA streams')
    parser.add_argument('--deviceID', type=int, default=0,
                        help='FPGA device ID to use when multiple FPGAs are present')
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    ret = xdnn.createManager(args['xlnxlib'])
    if ret != True:
        sys.exit(1)

    sharedInputArrs = []
    fpgaOutputs = []

    qPrep = mp.Queue(maxsize=args['numprepproc'] * 10)
    qFpga = mp.Queue(maxsize=100)
    streamQ = mp.Queue(maxsize=args['numstream'])
    prepProcQ = mp.Queue(maxsize=100)

    # one shared output buffer per FPGA stream
    for i in range(args['numstream']):
        shared_arr = mp.Array(ctypes.c_float, args['batch_sz'] * args['fpgaoutsz'])
        fpgaOutputs.append(shared_arr)
        streamQ.put(i)

    # pool of shared input buffers, recycled through prepProcQ
    for i in range(100):
        bufSize = np.prod(args['in_shape'])
        sharedInputArrs.append(mp.Array(ctypes.c_float, bufSize))
        prepProcQ.put(i)

    img_paths = xdnn_io.getFilePaths(args['images'])

    p = mp.Pool(initializer=init_prepImage,
                initargs=(args, qPrep, img_paths, sharedInputArrs, prepProcQ,),
                processes=args['numprepproc'])

    xdnnProc = mp.Process(target=fpga_process_async,
                          args=(qPrep, qFpga, args, len(img_paths), sharedInputArrs,
                                prepProcQ, streamQ, fpgaOutputs,))
    xdnnProc.start()

    postProc = mp.Process(target=post_process,
                          args=(qFpga, args, img_paths, streamQ, fpgaOutputs,))
    postProc.start()

    if args['perpetual']:
        while True:
            res = [p.map_async(run_prepImage, range(len(img_paths)))]
            for j in res:
                j.wait()
                del j
    else:
        p.map_async(run_prepImage, range(len(img_paths)))

    xdnnProc.join()
    postProc.join()

    p.close()
    p.join()
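# Each of the main()/post_process() entry points above lives in its own script,
# so the usual guard applies to this last main(). The invocation below is a
# hypothetical example; the script name and flag values are illustrative only,
# though --images, --numprepproc, and --numstream are defined by the parser above.
#
#   python mp_classify.py --images ./images --numprepproc 4 --numstream 16
#
if __name__ == '__main__':
    main()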