def main(argv):
    """Classify one batch per configured network using async FPGA execution.

    Every entry of ``args['jsoncfg']`` gets its own ``xdnn.XDNNFPGAOp``
    runtime.  One batch of images is loaded for each entry and submitted
    with ``exec_async`` on a stream whose id is the entry's index.  After
    all submissions the results are collected, then the fully-connected
    layer and softmax are computed on the host and the classification is
    printed for each network.

    Args:
        argv: Command-line arguments, forwarded to
            ``xdnn_io.processCommandLine``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(args['labels'])

    # TODO dict of tuples instead?
    fpgaRT = {}
    fpgaOutputs = {}
    fcWeights = {}
    fcBiases = {}
    confNames = []

    args = args['jsoncfg']  # we do not use other args' keys
    for netconf_args in args:
        confName = str(netconf_args['name'])
        confNames.append(confName)
        fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)

        # Input shape = requested batch size + the first input descriptor
        # with its leading dimension dropped.  next(iter(...)) works on both
        # Python 2 and 3, unlike .itervalues().next().
        first_descriptor = next(
            iter(fpgaRT[confName].getInputDescriptors().values()))
        netconf_args['in_shape'] = (
            (netconf_args['batch_sz'],) + tuple(first_descriptor[1:]))

        fcWeights[confName], fcBiases[confName] = \
            xdnn_io.loadFCWeightsBias(netconf_args)
        fpgaOutputs[confName] = np.empty(
            (netconf_args['batch_sz'], int(netconf_args['fpgaoutsz'])),
            dtype=np.float32, order='C')

    # Every input buffer stays referenced from this list until get_result()
    # has been called for all streams.
    batchArrays = []
    for streamId, netconf_args in enumerate(args):
        batch = np.empty(netconf_args['in_shape'], dtype=np.float32, order='C')
        batchArrays.append(batch)

        img_paths = xdnn_io.getFilePaths(netconf_args['images'])
        for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
            batch[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p,
                netconf_args['img_raw_scale'],
                netconf_args['img_mean'],
                netconf_args['img_input_scale'],
                netconf_args['in_shape'][2],
                netconf_args['in_shape'][3])

        confName = str(netconf_args['name'])
        firstInputName = next(iter(fpgaRT[confName].getInputs()))
        firstOutputName = next(iter(fpgaRT[confName].getOutputs()))
        fpgaRT[confName].exec_async({firstInputName: batch},
                                    {firstOutputName: fpgaOutputs[confName]},
                                    streamId)

    # Collect the results in submission order (stream id == config index).
    for streamId, confName in enumerate(confNames):
        fpgaRT[confName].get_result(streamId)

    for netconf_args in args:
        confName = str(netconf_args['name'])
        fcOut = np.empty((netconf_args['batch_sz'], netconf_args['outsz']),
                         dtype=np.float32, order='C')
        xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                       fpgaOutputs[confName], fcOut)
        softmaxOut = xdnn.computeSoftmax(fcOut)
        xdnn_io.printClassification(softmaxOut, netconf_args['images'], labels)

    xdnn.closeHandle()
def main():
    """Classify images batch by batch on the FPGA, optionally scoring accuracy.

    Images found via ``args['images']`` are processed in chunks of
    ``args['batch_sz']``: each chunk is loaded into a shared batch buffer,
    executed on the FPGA, passed through the host-side fully-connected layer
    and softmax, and printed.  When ``args['golden']`` is set, top-1 and
    top-5 hits are accumulated with ``xdnn_io.isTopK`` and an average
    accuracy line is printed at the end.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine()
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    num_images = len(img_paths)
    batch_sz = args['batch_sz']

    fpgaOutput = np.empty((batch_sz, args['fpgaoutsz']),
                          dtype=np.float32, order='C')
    fcOutput = np.empty((batch_sz, args['outsz']),
                        dtype=np.float32, order='C')
    batch_array = np.empty((batch_sz,) + args['in_shape'],
                           dtype=np.float32, order='C')
    labels = xdnn_io.get_labels(args['labels'])

    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])
        top5Count = 0
        top1Count = 0

    for i in range(0, num_images, batch_sz):
        pl = list(img_paths[i:i + batch_sz])
        # NOTE(review): a final partial batch leaves the tail of batch_array
        # holding images from the previous batch -- confirm that is intended.
        for j, p in enumerate(pl):
            batch_array[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p,
                args['img_raw_scale'],
                args['img_mean'],
                args['img_input_scale'],
                args['in_shape'][2],
                args['in_shape'][1])

        fpgaRT.execute(batch_array, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                       args['outsz'], args['fpgaoutsz'], fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)

        if args['golden']:
            for j, p in enumerate(pl):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 5)

    xdnn.closeHandle()

    # Skip the summary when no images were found (would divide by zero).
    if args['golden'] and num_images > 0:
        # The %-formatting must happen inside the print call; applying it to
        # the call's result only works with the Python 2 print statement.
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            num_images,
            float(top1Count) / float(num_images) * 100.,
            float(top5Count) / float(num_images) * 100.))
def fpga_process_async(qFrom, qTo, args, num_img, sharedInputArrs, prepProcQ,
                       streamQ, fpgaOutputs):
    """Pull prepared images from ``qFrom`` and submit them to the FPGA in batches.

    A helper thread running ``xdnn_wait`` is started with the runtime, an
    internal wait queue, ``qTo`` and ``prepProcQ``.  For every batch a stream
    id is taken from ``streamQ``, up to ``args['batch_sz']`` entries of
    ``(shared-memory index, image index)`` are read from ``qFrom``, and the
    batch is launched with ``exec_async``.  A ``(stream id, image indices,
    shared-memory indices)`` tuple is then put on the wait queue.  The loop
    runs until ``num_img`` images were consumed, or forever when
    ``args['perpetual']`` is truthy.  Finally ``(None, None, None)`` is put
    on the wait queue, the thread is joined and the handle is closed.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    status, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                                        [args["deviceID"]])
    if status != 0:
        sys.exit(1)

    runtime = xdnn.XDNNFPGAOp(handles, args)
    wait_queue = mp.Queue(maxsize=100)
    stream_count = args['numstream']
    batch_size = args['batch_sz']
    # One pointer list per stream; the list for a stream is replaced each
    # time that stream is reused.
    input_ptrs = [[] for _ in range(stream_count)]
    processed = 0

    waiter = threading.Thread(target=xdnn_wait,
                              args=(runtime, wait_queue, qTo, prepProcQ))
    waiter.start()

    while processed < num_img or args['perpetual']:
        # Slots that are never filled keep the -1 marker.
        image_ids = np.full((batch_size,), -1, dtype=np.int32)
        stream = streamQ.get()
        batch_ptrs = []
        input_ptrs[stream] = batch_ptrs
        shm_indices = []

        for slot in range(batch_size):
            shm_idx, image_id = qFrom.get()
            processed += 1
            image_ids[slot] = image_id

            flat_view = np.frombuffer(sharedInputArrs[shm_idx].get_obj(),
                                      dtype=np.float32)
            batched_view = flat_view[np.newaxis, ...]
            batch_ptrs.append(
                batched_view.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
            shm_indices.append(shm_idx)

            if processed == num_img:
                break

        output_view = np.frombuffer(fpgaOutputs[stream].get_obj(),
                                    dtype=np.float32)
        runtime.exec_async(batch_ptrs, output_view, stream)
        wait_queue.put((stream, image_ids, shm_indices))

    wait_queue.put((None, None, None))
    waiter.join()
    xdnn.closeHandle()
def fpga_process_async(qFrom, qTo, args, num_img, sharedInputArrs, prepProcQ,
                       streamQ, fpgaOutputs, compJson):
    """Pull prepared images from ``qFrom`` and submit them to the FPGA in batches.

    Same flow as the pointer-based variant, but inputs and outputs are passed
    to ``exec_async`` as dicts keyed by the first input / output name taken
    from ``compJson``.  The output buffer is viewed as
    ``(args['batch_sz'],) + <first output shape without its leading dim>``.

    A helper thread running ``xdnn_wait`` is started with the runtime, an
    internal wait queue, ``qTo`` and ``prepProcQ``.  After each launch a
    ``(stream id, image indices, shared-memory indices)`` tuple is put on
    the wait queue.  The loop runs until ``num_img`` images were consumed,
    or forever when ``args['perpetual']`` is truthy.  Finally
    ``(None, None, None)`` is put on the wait queue, the thread is joined
    and the handle is closed.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                                     [args["deviceID"]])
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    qWait = mp.Queue(maxsize=100)
    numStreams = args['numstream']
    bsz = args['batch_sz']
    input_ptrs = [[] for _ in range(numStreams)]
    numProcessed = 0

    t = threading.Thread(target=xdnn_wait,
                         args=(fpgaRT, qWait, qTo, prepProcQ))
    t.start()

    # next(iter(...)) replaces the Python-2-only .iterkeys().next() calls.
    outputs = compJson.getOutputs()
    firstInputName = next(iter(compJson.getInputs()))
    firstOutputName = next(iter(outputs))
    firstOutputShape = outputs[firstOutputName]
    out_view_shape = (bsz,) + tuple(firstOutputShape[1:])

    while numProcessed < num_img or args['perpetual']:
        # Slots that are never filled keep the -1 marker.
        img_list = np.full((bsz,), -1, dtype=np.int32)
        sId = streamQ.get()
        input_ptrs[sId] = []
        shMemIdxArr = []

        for j in range(bsz):
            sMemIdx, img_idx = qFrom.get()
            numProcessed += 1
            img_list[j] = img_idx
            nparr_view = np.frombuffer(sharedInputArrs[sMemIdx].get_obj(),
                                       dtype=np.float32)
            input_ptrs[sId].append(nparr_view)
            shMemIdxArr.append(sMemIdx)
            if numProcessed == num_img:
                break

        npout_view = np.frombuffer(fpgaOutputs[sId].get_obj(),
                                   dtype=np.float32).reshape(out_view_shape)
        fpgaRT.exec_async({firstInputName: input_ptrs[sId]},
                          {firstOutputName: npout_view},
                          sId)
        qWait.put((sId, img_list, shMemIdxArr))

    qWait.put((None, None, None))
    t.join()
    xdnn.closeHandle()
def xdnn_process(qFrom, qTo):
    """Worker loop: run every input batch from ``qFrom`` on the FPGA.

    Weights are loaded once, using the v3 loader when ``g_xdnnv3 == True``.
    Each ``(inputs, inputImageFiles)`` item read from ``qFrom`` is prepared
    with ``prepareFpgaInputs`` and executed; ``(fpgaOutput, inputImageFiles)``
    is put on ``qTo`` and the execution time is printed.  The loop stops when
    ``inputs`` is ``None`` or the prepared inputs are falsy, after which
    ``(None, None)`` is put on ``qTo`` and the handle is closed.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    status = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib,
                               g_numDevices)
    if status != 0:
        sys.exit(1)

    weight_args = {
        'datadir': g_xdnnTestDataDir,
        'quantizecfg': g_fpgaCfgFile,
        'scaleA': g_scaleA,
        'scaleB': g_scaleB,
        'PE': -1,
        'netcfg': g_netFile,
    }
    # The explicit "== True" comparison is kept exactly as in the original.
    if g_xdnnv3 == True:
        weights = xdnn_io.loadWeightsBiasQuantv3(weight_args)
    else:
        weights = xdnn_io.loadWeightsBiasQuant(weight_args)

    # A single output buffer is reused for every batch.
    output_buffer = prepareOutput(g_batchSize)

    while True:
        batch_inputs, image_files = qFrom.get()
        if batch_inputs is None:
            break

        prepared = prepareFpgaInputs(batch_inputs)
        if not prepared:
            break

        began = timeit.default_timer()
        xdnn.execute(
            g_netFile,
            weights,
            prepared,
            output_buffer,
            g_batchSize,  # num batches
            g_fpgaCfgFile,
            g_scaleB,
            g_PE)
        qTo.put((output_buffer, image_files))
        print("[time] FPGA xdnn execute (%.2f ms):" % (
            (timeit.default_timer() - began) * 1000))

    qTo.put((None, None))
    xdnn.closeHandle()
def main():
    """Run one classification pass on the FPGA and print stage timings.

    Loads weights and inputs from the parsed command-line arguments, executes
    the network once with ``xdnn.execute``, then computes the fully-connected
    layer and softmax on the host.  The elapsed time of each of the three
    stages is printed, followed by the classification and a success banner.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine()
    status = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                               args['xlnxlib'])
    if status != 0:
        sys.exit(1)

    weights, fc_weight, fc_bias = xdnn_io.loadWeights(args)
    fpga_inputs, batch_sz = xdnn_io.prepareInput(args)
    fpga_output = xdnn_io.prepareOutput(args['fpgaoutsz'], batch_sz)

    # The original wrapped the stages below in "for i in range(1)", i.e. a
    # single pass.
    tick = timeit.default_timer()
    xdnn.execute(
        args['netcfg'],
        weights,
        fpga_inputs,
        fpga_output,
        batch_sz,  # num batches
        args['quantizecfg'],
        args['scaleB'],
        args['PE'])
    print("\nAfter FPGA (%f ms)" % ((timeit.default_timer() - tick) * 1000))

    tick = timeit.default_timer()
    fc_out = xdnn.computeFC(fc_weight, fc_bias, fpga_output, batch_sz,
                            args['outsz'], args['fpgaoutsz'], args['useblas'])
    print("\nAfter FC (%f ms)" % ((timeit.default_timer() - tick) * 1000))

    tick = timeit.default_timer()
    softmax_out = xdnn.computeSoftmax(fc_out, batch_sz)
    print("\nAfter Softmax (%f ms)" % (
        (timeit.default_timer() - tick) * 1000))

    xdnn_io.printClassification(softmax_out, args)

    print("\nSuccess!\n")
    xdnn.closeHandle()
def main():
    """Run every network in ``args['jsoncfg']`` concurrently and classify.

    Phases, each followed by a printed elapsed time:

    1. create the FPGA handle;
    2. per network: parse its PE list, load weights, prepare inputs and
       allocate the output buffer;
    3. launch every network with ``xdnn.exec_async``;
    4. wait for every network with ``xdnn.get_result``;
    5. per network: fully-connected layer, softmax and classification print.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine()

    tick = timeit.default_timer()
    status = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                               args['xlnxlib'])
    if status != 0:
        sys.exit(1)
    print("\nAfter createHandle (%f ms):" % (
        (timeit.default_timer() - tick) * 1000))

    tick = timeit.default_timer()
    # TODO dict of tuples instead?
    fpga_inputs = {}
    fpga_outputs = {}
    weights_blobs = {}
    fc_weights = {}
    fc_biases = {}
    batch_sizes = {}
    fpga_output_sizes = {}
    pes = {}
    net_files = {}
    conf_names = []

    net_cfgs = args['jsoncfg']
    for net_cfg in net_cfgs:
        name = str(net_cfg['name'])
        conf_names.append(name)

        # make a tuple instead
        pe_list = [int(token) for token in net_cfg['PE'].split()]
        # (a disjointness check on cuMasks was disabled here in the original)

        # Value unused; the lookup is kept so a missing key still fails here.
        str(net_cfg['datadir'])
        out_size = int(net_cfg['fpgaoutsz'])
        net_file = str(net_cfg['netcfg'])

        pes[name] = pe_list
        (weights_blobs[name],
         fc_weights[name],
         fc_biases[name]) = xdnn_io.loadWeights(net_cfg)
        fpga_output_sizes[name] = out_size
        fpga_inputs[name], batch_sz = xdnn_io.prepareInput(net_cfg, pe_list)
        batch_sizes[name] = batch_sz
        fpga_outputs[name] = xdnn_io.prepareOutput(int(net_cfg['fpgaoutsz']),
                                                   batch_sz)
        net_files[name] = net_file
    print("\nAfter init (%f ms):" % ((timeit.default_timer() - tick) * 1000))

    tick = timeit.default_timer()
    for net_cfg in net_cfgs:
        name = str(net_cfg['name'])
        xdnn.exec_async(net_files[name],
                        weights_blobs[name],
                        fpga_inputs[name],
                        fpga_outputs[name],
                        int(batch_sizes[name]),
                        net_cfg['quantizecfg'],
                        net_cfg['scaleB'],
                        pes[name])
    print("\nAfter Execonly (%f ms):" % (
        (timeit.default_timer() - tick) * 1000))

    tick = timeit.default_timer()
    for name in conf_names:
        xdnn.get_result(pes[name])
    print("\nAfter wait (%f ms):" % ((timeit.default_timer() - tick) * 1000))

    tick = timeit.default_timer()
    for net_cfg in net_cfgs:
        name = str(net_cfg['name'])
        fc_out = xdnn.computeFC(fc_weights[name], fc_biases[name],
                                fpga_outputs[name], batch_sizes[name],
                                net_cfg['outsz'], net_cfg['fpgaoutsz'],
                                net_cfg['useblas'])
        print("\nAfter FC (%f ms):" % (
            (timeit.default_timer() - tick) * 1000))

        tick = timeit.default_timer()
        softmax_out = xdnn.computeSoftmax(fc_out, batch_sizes[name])
        print("\nAfter Softmax (%f ms):" % (
            (timeit.default_timer() - tick) * 1000))

        xdnn_io.printClassification(softmax_out, net_cfg)

    print("\nSuccess!\n")
    xdnn.closeHandle()
def executeOnFPGA(sProtoBufPath, Qmode, Inference_Data, handle, name,
                  num_models):
    """Benchmark ``num_models`` copies of a model running in parallel.

    A random batch is generated once and submitted asynchronously to every
    model instance (model ``i`` uses PE ``i`` and stream ``i``).  Results are
    then collected and passed through the host-side fully-connected layer and
    softmax.  Overall duration and throughput are printed, appended to
    ``Inference_Data`` and merged into ``multinet_results.csv``.

    Args:
        sProtoBufPath: Forwarded to ``initializeFpgaModel``.
        Qmode: Forwarded to ``initializeFpgaModel``; also used to label the
            experiment as ``"<Qmode>_bit_mode"``.
        Inference_Data: List of result dicts; one new dict is appended to it
            in place.
        handle: Ignored -- a fresh handle is always created here.
        name: Unused.
        num_models: Number of model instances to run in parallel.
    """
    TOTAL_IMAGES = 128

    # Create handle for FPGA.
    # NOTE(review): the return code is not checked here -- confirm whether a
    # failed createHandle should abort.
    ret, handle = xdnn.createHandle(
        "../overlaybins/" + "aws" + "/overlay_1.xclbin", "kernelSxdnn_0")

    # Per-model state, keyed by the model index as a string.
    fpgaRT = {}
    fpgaOutput = {}
    fcWeight = {}
    fcBias = {}

    batch_array = generateRandomBatch(TOTAL_IMAGES, None)

    for i in range(num_models):
        key = str(i)
        config = initializeFpgaModel(sProtoBufPath, Qmode)
        config["PE"] = i
        config["name"] = config["name"] + "_" + key
        # Load weights to FPGA
        config = TransferWeightsFPGA(len(batch_array), config, handle, i)
        fpgaRT[key] = xdnn.XDNNFPGAOp(handle, config)
        fcWeight[key], fcBias[key] = xdnn_io.loadFCWeightsBias(config)
        # fcOutput and config from the last model are reused for all models
        # in the collection loop below.
        fpgaOutput[key], fcOutput, config = AllocateMemoryToHost(config)

    start0 = time.time()

    # Schedule FPGA execution asynchronously
    for i in range(num_models):
        fpgaRT[str(i)].exec_async(batch_array, fpgaOutput[str(i)], i)

    # Fetch results of all parallel executions
    for i in range(num_models):
        key = str(i)
        fpgaRT[key].get_result(i)
        # Inner product (fully connected layer), then softmax.  The softmax
        # result is discarded; the call is kept because it is timed.
        xdnn.computeFC(fcWeight[key], fcBias[key], fpgaOutput[key],
                       config['batch_sz'], config['outsz'],
                       config['fpgaoutsz'], fcOutput)
        xdnn.computeSoftmax(fcOutput)

    end = time.time()
    print("throughput", (num_models * len(batch_array) / (end - start0)), "duration", end - start0)

    Inference_Data.append({
        "experiment": str(Qmode) + "_bit_mode",
        "duration_overall": end - start0,
        "imgsPerSecAll": num_models * len(batch_array) / (end - start0),
        "num_models_parallel": num_models
    })
    xdnn.closeHandle()

    Inference_Data = pd.DataFrame(Inference_Data)
    try:
        previous = pd.read_csv('multinet_results.csv')
    except IOError:
        # First run: there is no results file to merge with yet.
        result = Inference_Data
    else:
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        result = pd.concat([previous, Inference_Data])
    result.to_csv('multinet_results.csv')
def _run_on_fpga(netCfg, exec_label, result_label):
    """Stage inputs, launch one network and wait for it, timing each step.

    ``netCfg`` is updated in place with ``fpgaInputs``, ``batch_sz`` and
    ``shapes`` as returned by ``xdnn_io.prepareInput``.  ``exec_label`` and
    ``result_label`` are the network names used in the printed messages.
    """
    net_args = netCfg['args']

    startTime = timeit.default_timer()
    (netCfg['fpgaInputs'],
     netCfg['batch_sz'],
     netCfg['shapes']) = xdnn_io.prepareInput(net_args, net_args['PE'])
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to transfer input image to FPGA (%f ms):" % (
        elapsedTime * 1000))

    startTime = timeit.default_timer()
    xdnn.exec_async(net_args['netcfg'],
                    netCfg['weightsBlobs'],
                    netCfg['fpgaInputs'],
                    netCfg['fpgaOutputs'],
                    netCfg['batch_sz'],
                    net_args['quantizecfg'],
                    net_args['scaleB'],
                    net_args['PE'],
                    netCfg['streamId'])
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to execute %s on FPGA (%f ms):" % (
        exec_label, elapsedTime * 1000))

    startTime = timeit.default_timer()
    xdnn.get_result(net_args['PE'], netCfg['streamId'])
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to retrieve %s outputs from FPGA (%f ms):" % (
        result_label, elapsedTime * 1000))


def main(argv=None):
    """Detect objects with YOLO, then classify the first detection.

    Weights for every network in ``args['jsoncfg']`` are loaded up front.
    The network named ``'yolo'`` is run on the FPGA; its output is
    post-processed on the host (sigmoid, softmax, non-max suppression) and
    the detections are printed.  The first detection is cropped from the
    input image with a 10-pixel margin, written to
    ``cropped_yolo_output.jpg`` and passed to the network named
    ``'googlenet'``, whose classification is printed.  Timings for every
    stage are printed along the way.

    Args:
        argv: Command-line arguments for ``xdnn_io.processCommandLine``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code
    or when YOLO reports no detections.
    """
    args = xdnn_io.processCommandLine(argv)

    startTime = timeit.default_timer()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to createHandle (%f ms):" % (elapsedTime * 1000))

    # we do not need other args keys except 'jsoncfg'
    args = args['jsoncfg']

    netCfgs = defaultdict(dict)
    confNames = []
    startTime = timeit.default_timer()
    for streamId, netCfg_args in enumerate(args):
        confName = str(netCfg_args['name'])
        confNames.append(confName)
        netCfg_args['netcfg'] = './data/{}_{}.cmd'.format(
            netCfg_args['net'], netCfg_args['dsp'])

        cfg = netCfgs[confName]
        cfg['streamId'] = streamId
        cfg['args'] = netCfg_args
        (cfg['weightsBlobs'],
         cfg['fcWeights'],
         cfg['fcBiases']) = xdnn_io.loadWeights(netCfg_args)
        cfg['batch_sz'] = 1
        cfg['fpgaOutputs'] = xdnn_io.prepareOutput(netCfg_args["fpgaoutsz"],
                                                   cfg['batch_sz'])
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to init (%f ms):" % (elapsedTime * 1000))

    ## run YOLO
    netCfg = netCfgs['yolo']
    _run_on_fpga(netCfg, 'Yolo', 'yolo')

    startTime = timeit.default_timer()
    # Floor division keeps the grid size an int under true division too;
    # reshape() rejects floats.
    out_h = out_w = netCfg['args']['in_shape'][1] // 32
    anchor_boxes = 5
    objectness = 1
    coordinates = 4
    classes = 80
    out_c = objectness + coordinates + classes

    # Reshape the fpgaOutputs into a 4D volume
    yolo_outputs = netCfg['fpgaOutputs'].reshape(anchor_boxes, out_c,
                                                 out_h, out_w)

    # Apply sigmoid to channels 0-1 and channel 4 for all anchor boxes
    yolo_outputs[:, 0:2, :, :] = sigmoid(
        yolo_outputs[:, 0:2, :, :])  # (X,Y) Predictions
    yolo_outputs[:, 4, :, :] = sigmoid(
        yolo_outputs[:, 4, :, :])  # Objectness / Box Confidence

    # Apply softmax on the class scores for each anchor box
    for box in range(anchor_boxes):
        yolo_outputs[box, 5:, :, :] = softmax(yolo_outputs[box, 5:, :, :])

    # Perform Non-Max Suppression
    # Non-Max Suppression filters out detections with a score lower than 0.24.
    # Additionally, if two predictions overlap by more than 30%, the one with
    # the lower score is filtered out.
    scorethresh = 0.24
    iouthresh = 0.3
    bboxes = nms.do_baseline_nms(yolo_outputs.flat,
                                 netCfg['shapes'][0][1],
                                 netCfg['shapes'][0][0],
                                 netCfg['args']['in_shape'][2],
                                 netCfg['args']['in_shape'][1],
                                 out_w, out_h, anchor_boxes, classes,
                                 scorethresh, iouthresh)

    with open(netCfg['args']['labels']) as f:
        names = [line.strip() for line in f.readlines()]

    # Let's print the detections our model made
    for j in range(len(bboxes)):
        print("Obj %d: %s" % (j, names[bboxes[j]['classid']]))
        print("\t score = %f" % (bboxes[j]['prob']))
        print("\t (xlo,ylo) = (%d,%d)" % (bboxes[j]['ll']['x'],
                                          bboxes[j]['ll']['y']))
        print("\t (xhi,yhi) = (%d,%d)" % (bboxes[j]['ur']['x'],
                                          bboxes[j]['ur']['y']))
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to execute on CPU (%f ms):" % (elapsedTime * 1000))

    if len(bboxes) == 0:
        # Nothing to crop: release the FPGA and stop rather than failing
        # with an IndexError below.
        print("\nNo objects detected by YOLO; nothing to pass to googlenet")
        xdnn.closeHandle()
        sys.exit(1)

    startTime = timeit.default_timer()
    img = cv2.imread(netCfg['args']['images'][0])
    #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # YOLO was trained with RGB, not BGR like Caffe

    # choose one of the bounding boxes
    obj_idx = 0
    # specify a margin added to the selected bounding box
    margin = 10
    chosen = bboxes[obj_idx]
    H_slice = slice(max(0, chosen['ur']['y'] - margin),
                    min(img.shape[0], chosen['ll']['y'] + margin))
    W_slice = slice(max(0, chosen['ll']['x'] - margin),
                    min(img.shape[1], chosen['ur']['x'] + margin))
    img = img[H_slice, W_slice, :]
    print('pass obj {}: {} with size {} to googlenet'.format(
        obj_idx, names[chosen['classid']], img.shape))
    cv2.imwrite('cropped_yolo_output.jpg', img)

    ## run GOOGLENET
    netCfg = netCfgs['googlenet']
    # The original carried a disabled (string-literal) rescale and centre
    # crop to the googlenet input size here; the cropped detection is passed
    # on as-is.
    netCfg['args']['images'] = [img]
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to prepare googlenet image on CPU (%f ms):" % (
        elapsedTime * 1000))

    _run_on_fpga(netCfg, 'googlenet', 'googlenet')

    startTime = timeit.default_timer()
    fcOut = np.empty(netCfg['batch_sz'] * netCfg['args']['outsz'],
                     dtype=np.float32, order='C')
    xdnn.computeFC(netCfg['fcWeights'], netCfg['fcBiases'],
                   netCfg['fpgaOutputs'], netCfg['batch_sz'],
                   netCfg['args']['outsz'], netCfg['args']['fpgaoutsz'],
                   fcOut)
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to run FC layers on CPU (%f ms):" % (elapsedTime * 1000))

    startTime = timeit.default_timer()
    softmaxOut = xdnn.computeSoftmax(fcOut, netCfg['batch_sz'])
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to run Softmax on CPU (%f ms):" % (elapsedTime * 1000))

    xdnn_io.printClassification(softmaxOut, netCfg['args'])

    print("\nSuccess!\n")
    xdnn.closeHandle()
def networkForward(netcfg, layername):
    """Execute ``netcfg`` once on the FPGA and return its execution time.

    Command-line options are parsed with the default xdnn parser plus a
    required ``--layerindex`` option.  ``batch_sz`` is forced to 1 and ``PE``
    to 0 so that only a single PE is measured.  The first image found via
    ``args['images']`` is loaded into the first FPGA input before executing.

    Args:
        netcfg: Path of the network JSON file to run.
        layername: Unused by this function.

    Returns:
        The value of ``fpgaRT.get_exec_time()`` averaged over the runs
        (currently a single run).

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--layerindex', type=int, default=0,
                        help='Index value for layer in json', required=True)
    args = xdnn_io.make_dict_args(parser.parse_args())
    args['netcfg'] = netcfg

    # Hardcode these parameters, so we only have to look at performance of 1 PE
    args["batch_sz"] = 1
    args["PE"] = 0

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    # The original called closeHandle() after the return statement, so it
    # never ran; try/finally releases the handle on every exit path.
    try:
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
        fpgaOutput = fpgaRT.getOutputs()
        fpgaInput = fpgaRT.getInputs()

        img_paths = xdnn_io.getFilePaths(args['images'])
        first_descriptor = next(iter(fpgaRT.getInputDescriptors().values()))
        inShape = (args['batch_sz'],) + tuple(first_descriptor[1:])
        firstInput = next(iter(fpgaInput.values()))

        # The original reloaded img_paths[0] into slot 0 once per image
        # path; a single load leaves the buffer in the same state.
        if len(img_paths) > 0:
            firstInput[0, ...], _ = xdnn_io.loadImageBlobFromFile(
                img_paths[0],
                args['img_raw_scale'],
                args['img_mean'],
                args['img_input_scale'],
                inShape[2],
                inShape[3])

        with open(args['netcfg']) as fp:
            data = json.load(fp)
        # Strip nodes that don't run in hardware.
        # NOTE(review): the filtered list is not used further; it is kept so
        # that a malformed netcfg still fails here as before.
        nodes = [x for x in data['network'] if x['xdnn_kv']]

        # How many iterations to run, and average across
        iterations = 1
        exec_times = []
        for _ in range(iterations):
            fpgaRT.execute(fpgaInput, fpgaOutput)
            exec_times.append(fpgaRT.get_exec_time())

        return sum(exec_times) / iterations
    finally:
        xdnn.closeHandle()
def xdnn_process(qFrom, qTo, qMsgFromXdnn, sharedInputArrs):
    """Streaming FPGA worker with one in-flight job per shared input array.

    After loading weights, one warm-up execution is run per stream.  The
    current ``timeit`` timestamp is then put on ``qMsgFromXdnn`` and the
    streaming loop starts: ``getImages(qFrom)`` returns the stream id to
    run, the job is launched asynchronously, and once as many jobs are
    pending as there are output buffers the oldest one is completed and its
    output forwarded with ``putFpgaOutputs``.  When ``getImages`` returns
    ``None`` all pending jobs are drained.  Finally ``None`` is put on
    ``qTo``, profiling samples are synced and the handle is closed.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    global g_numImages
    global g_numProcessed
    global g_img_c
    global g_img_h
    global g_img_w

    status = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib,
                               g_numDevices)
    if status != 0:
        sys.exit(1)

    # One output buffer per shared input array (i.e. per stream).
    out_buffers = [xdnn_io.prepareOutput(g_fpgaOutputSize, g_batchSize)
                   for _ in sharedInputArrs]

    # load weights
    weight_args = {
        'datadir': g_xdnnTestDataDir,
        'quantizecfg': g_fpgaCfgFile,
        'scaleA': g_scaleA,
        'scaleB': g_scaleB,
        'PE': -1,
        'netcfg': g_netFile,
    }
    # The explicit "== True" comparison is kept exactly as in the original.
    if g_xdnnv3 == True:
        weights = xdnn_io.loadWeightsBiasQuantv3(weight_args)
    else:
        weights = xdnn_io.loadWeightsBiasQuant(weight_args)

    # Dummy calls to load script
    for warm_id in range(len(sharedInputArrs)):
        staged = xdnn.passThruInputsForFpga(
            sharedInputArrs[warm_id], g_fpgaBatchSize, g_paddedImageSize,
            g_fpgaCfgFile, g_scaleB, -1, g_firstFpgaLayerName, warm_id)
        xdnn.exec_async(g_netFile, weights, staged, out_buffers[warm_id],
                        g_batchSize, g_fpgaCfgFile, g_scaleB, -1, warm_id)
        xdnn.get_result(-1, warm_id)

    # XDNN is ready: share the start time with the parent.
    qMsgFromXdnn.put(timeit.default_timer())
    print("Streaming...")

    in_flight = []  # (stream id, launch time), oldest first
    while True:
        stream = getImages(qFrom)
        if stream is None:
            # finish pending jobs & quit
            for done_stream, launched_at in in_flight:
                xdnn.get_result(-1, done_stream)
                finished_at = timeit.default_timer()
                g_perfProf.addSample("execute (latency)",
                                     finished_at - launched_at)
                g_perfProf.addSample("execute (thruput)",
                                     finished_at - launched_at)
                putFpgaOutputs(out_buffers[done_stream], qTo)
            break

        stage_start = timeit.default_timer()
        staged = xdnn.passThruInputsForFpga(
            sharedInputArrs[stream], g_fpgaBatchSize, g_paddedImageSize,
            g_fpgaCfgFile, g_scaleB, -1, g_firstFpgaLayerName, stream)
        g_perfProf.addSample("passThruInputsForFpga",
                             timeit.default_timer() - stage_start)
        if not staged:
            break

        launch_time = timeit.default_timer()
        xdnn.exec_async(g_netFile, weights, staged, out_buffers[stream],
                        g_batchSize, g_fpgaCfgFile, g_scaleB, -1, stream)
        in_flight.append((stream, launch_time))

        if len(in_flight) >= len(out_buffers):
            # pop oldest job off the q and get_result
            oldest_stream, oldest_launch = in_flight.pop(0)
            xdnn.get_result(-1, oldest_stream)
            finished_at = timeit.default_timer()
            g_perfProf.addSample("execute (latency)",
                                 finished_at - oldest_launch)
            # Throughput is measured against the most recent launch.
            g_perfProf.addSample("execute (thruput)",
                                 finished_at - launch_time)
            putFpgaOutputs(out_buffers[oldest_stream], qTo)

    qTo.put(None)
    g_perfProf.syncToShared()
    xdnn.closeHandle()
def benchmark():
    """Measure FPGA inference throughput on random input data.

    The first entry of the ``jsoncfg`` command-line configuration is used.
    Two streams (0 and 1) are set up with random, quantized inputs and their
    own output buffers.  In "Non-Blocking" mode both streams are kept busy
    by alternating ``get_result`` / ``exec_async`` calls; in "Blocking" mode
    ``xdnn.execute`` is called ``iterations`` times on stream 0's buffers.
    A performance summary is printed at the end.

    Exits with status 1 when ``xdnn.createHandle`` returns a truthy value.
    """
    mode = "Non-Blocking"
    #mode = "Blocking"

    # Extract Arguments from json
    args = xdnn_io.processCommandLine()["jsoncfg"][0]
    if "platform" in args:
        args["xclbin"] = "../../overlaybins/" + str(
            args["platform"]) + "/" + args["xclbin"]

    # Establish Communication w/ FPGA
    if xdnn.createHandle(args['xclbin'], libFile=args['xlnxlib']):
        sys.exit(1)

    # Transfer weights to device memory
    if "usexdnnv3" in args and args["usexdnnv3"] == "1":
        weightsBlob = xdnn_io.loadWeightsBiasQuantv3(args)
    else:
        weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

    stream_ids = (0, 1)
    batchsz = args["batchsz"]

    # Create random input data, one flattened batch per stream
    flat_len = reduce(mul, args["in_shape"], 1)
    fpgaInputs = []
    for sid in stream_ids:
        data = np.float32(np.random.standard_normal((batchsz, flat_len)))
        data = xdnn.quantizeInputs(args["firstfpgalayer"],
                                   args["quantizecfg"], args["scaleB"], data)
        data = xdnn.prepareInputsForFpga(data, args["quantizecfg"],
                                         args["scaleB"], -1,
                                         args["firstfpgalayer"], sid)
        fpgaInputs.append(data)

    # Create buffers in host memory for result
    fpgaOutputs = [xdnn_io.prepareOutput(args['fpgaoutsz'], batchsz)
                   for _ in stream_ids]

    # Load network schedule to accelerator
    for sid in stream_ids:
        xdnn.initScript(args['netcfg'], weightsBlob, batchsz,
                        args['quantizecfg'], args['scaleB'], args['PE'], sid)

    def submit(sid):
        # Launch one asynchronous batch on stream `sid`.
        xdnn.exec_async(args['netcfg'], weightsBlob, fpgaInputs[sid],
                        fpgaOutputs[sid], batchsz, args['quantizecfg'],
                        args['scaleB'], args['PE'], sid)

    # Run forward propagation N times
    print("Running inference...\n")
    cumulative_time = -1 * timeit.default_timer()
    if mode == "Non-Blocking":
        for sid in stream_ids:
            submit(sid)
        # Floor division: range() rejects the float that "/" produces under
        # true division.
        # NOTE(review): with an odd "iterations" one batch fewer is executed
        # than the summary reports -- confirm.
        for _ in range(args["iterations"] // 2 - 1):
            for sid in stream_ids:
                xdnn.get_result(-1, sid)
                submit(sid)
        for sid in stream_ids:
            xdnn.get_result(-1, sid)
    else:
        for _ in range(args["iterations"]):
            xdnn.execute(args['netcfg'], weightsBlob, fpgaInputs[0],
                         fpgaOutputs[0], batchsz, args['quantizecfg'],
                         args['scaleB'], args['PE'])
    cumulative_time += timeit.default_timer()

    # Summarize
    print("===========================================")
    print("Performance Summary\n")
    print(" Network: %s" % (args["name"]))
    print(" Precision: %d" % (args["precision"]))
    print(" Images: %d" % (args["iterations"] * batchsz))
    print(" Batch Size: %d" % (batchsz))
    print(" Total Batches: %d" % (args["iterations"]))
    print(" Total Time: %.2f ms" % (1000 * cumulative_time))
    # Time per batch (labelled single image latency)
    print(" SIL: %.2f ms" % (1000 * cumulative_time / args["iterations"]))
    print(" FPS: %.2f" % (args["iterations"] * batchsz / cumulative_time))
    print(" GOPS: %.2f" % (args["ops"] * args["iterations"] * batchsz /
                           cumulative_time / 1000000000))
    print("===========================================\n")

    # Release FPGA
    xdnn.closeHandle()