def main(argv):
    """Classify one batch per configured network, all networks in flight at once.

    ``argv`` is parsed by ``xdnn_io.processCommandLine``.  One
    ``XDNNFPGAOp`` is built for every entry of the parsed ``jsoncfg`` list,
    one batch per network is submitted with ``exec_async`` (the stream id is
    the entry's position in that list), every stream is awaited, and the
    fully-connected layer plus softmax are then computed through ``xdnn``
    before the classification is printed.

    Exits the process with status 1 if the FPGA handle cannot be created.
    """
    cli_args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(cli_args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(cli_args['labels'])

    # Per-network state, keyed by the network's configured name.
    runtimes = {}
    fpga_outputs = {}
    fc_weights = {}
    fc_biases = {}
    net_files = {}
    conf_names = []

    # Only the per-network entries are needed from here on.
    net_configs = cli_args['jsoncfg']

    # Phase 1: build a runtime and allocate the output buffer for each network.
    for net_cfg in net_configs:
        conf_name = str(net_cfg['name'])
        conf_names.append(conf_name)

        runtime = xdnn.XDNNFPGAOp(handles, net_cfg)
        runtimes[conf_name] = runtime

        # NOTE(review): itervalues()/iterkeys() are Python 2 dict APIs.
        batch_dim = (net_cfg['batch_sz'],)
        first_descriptor = next(runtime.getInputDescriptors().itervalues())
        net_cfg['in_shape'] = batch_dim + tuple(first_descriptor[1:])

        weights, biases = xdnn_io.loadFCWeightsBias(net_cfg)
        fc_weights[conf_name] = weights
        fc_biases[conf_name] = biases

        fpga_outputs[conf_name] = np.empty(
            (net_cfg['batch_sz'], int(net_cfg['fpgaoutsz'])),
            dtype=np.float32,
            order='C',
        )
        net_files[conf_name] = str(net_cfg['netcfg'])

    # Phase 2: fill one input batch per network and submit it asynchronously.
    # The batches are kept in a list so they stay referenced until the
    # results have been collected.
    batch_arrays = []
    for stream_id, net_cfg in enumerate(net_configs):
        batch = np.empty(net_cfg['in_shape'], dtype=np.float32, order='C')
        batch_arrays.append(batch)

        image_paths = xdnn_io.getFilePaths(net_cfg['images'])
        for row, path in enumerate(image_paths[:net_cfg['batch_sz']]):
            blob, _ = xdnn_io.loadImageBlobFromFile(
                path,
                net_cfg['img_raw_scale'],
                net_cfg['img_mean'],
                net_cfg['img_input_scale'],
                net_cfg['in_shape'][2],
                net_cfg['in_shape'][3],
            )
            batch[row, ...] = blob

        conf_name = str(net_cfg['name'])
        runtime = runtimes[conf_name]
        input_name = next(runtime.getInputs().iterkeys())
        output_name = next(runtime.getOutputs().iterkeys())
        runtime.exec_async(
            {input_name: batch},
            {output_name: fpga_outputs[conf_name]},
            stream_id,
        )

    # Phase 3: wait for every stream, in submission order.
    for stream_id, conf_name in enumerate(conf_names):
        runtimes[conf_name].get_result(stream_id)

    # Phase 4: fully-connected layer, softmax and report, per network.
    for net_cfg in net_configs:
        conf_name = str(net_cfg['name'])
        fc_out = np.empty(
            (net_cfg['batch_sz'], net_cfg['outsz']),
            dtype=np.float32,
            order='C',
        )
        xdnn.computeFC(
            fc_weights[conf_name],
            fc_biases[conf_name],
            fpga_outputs[conf_name],
            fc_out,
        )
        softmax_out = xdnn.computeSoftmax(fc_out)
        xdnn_io.printClassification(softmax_out, net_cfg['images'], labels)

    xdnn.closeHandle()
def init_fpga():
    """Initialise the FPGA runtime and fill the module-level context ``g_ctxt``.

    The arguments come from ``build_xdnn_args()`` rather than the real
    command line; they are parsed into the global ``g_args``.  The function
    then opens the FPGA handle, creates the ``XDNNFPGAOp`` runtime, loads the
    fully-connected weights/bias and the labels, and allocates the
    input/output buffers.  Everything is stored in ``g_ctxt`` under the keys
    ``fpgaRT``, ``fcWeight``, ``fcBias``, ``fpgaOutput``, ``fcOutput`` and
    ``labels``, plus ``fpgaInput``/``inShape`` in deploy mode or
    ``batch_array`` otherwise.

    Exits the process with status 1 if the FPGA handle cannot be created.
    """
    global g_args
    global g_ctxt

    print(" --- INIT FPGA --- \n")
    hardcoded_args = build_xdnn_args()
    print(hardcoded_args)
    g_args = xdnn_io.processCommandLine(hardcoded_args)
    print(" --- After parsing --- \n")
    print(g_args)

    print(" --- Create handle --- \n")
    ret, handles = xdnn.createHandle(g_args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        print(" --- !!! FAILED: Cannot create handle. --- \n")
        sys.exit(1)

    print(" --- Create fpgaRT --- \n")
    runtime = xdnn.XDNNFPGAOp(handles, g_args)
    g_ctxt["fpgaRT"] = runtime

    print(" --- Weight and Bias --- \n")
    weights, bias = xdnn_io.loadFCWeightsBias(g_args)
    g_ctxt["fcWeight"] = weights
    g_ctxt["fcBias"] = bias

    print(" --- Init input input/output area --- \n")
    batch_dim = (g_args['batch_sz'],)
    if is_deploymode():
        # Deploy mode: use the buffers owned by the runtime itself.
        g_ctxt['fpgaOutput'] = runtime.getOutputs()
        g_ctxt['fpgaInput'] = runtime.getInputs()
        # NOTE(review): itervalues() is a Python 2 dict API.
        first_descriptor = next(runtime.getInputDescriptors().itervalues())
        g_ctxt['inShape'] = batch_dim + tuple(first_descriptor[1:])
    else:
        # Otherwise allocate host-side buffers from the parsed sizes.
        g_ctxt['fpgaOutput'] = np.empty(
            batch_dim + (g_args['fpgaoutsz'],),
            dtype=np.float32,
            order='C',
        )
        g_ctxt['batch_array'] = np.empty(
            batch_dim + g_args['in_shape'],
            dtype=np.float32,
            order='C',
        )

    # The fully-connected output buffer is allocated in both modes.
    g_ctxt['fcOutput'] = np.empty(
        batch_dim + (g_args['outsz'],),
        dtype=np.float32,
        order='C',
    )

    print(" --- Get lables --- \n")
    g_ctxt['labels'] = xdnn_io.get_labels(g_args['labels'])

    print(" --- FPGA INITIALIZED! ---\n")
def main():
    """Classify every image found under ``args['images']`` in batches.

    Command-line arguments are parsed by ``xdnn_io.processCommandLine``.
    Each batch is executed on the FPGA, then the fully-connected layer and
    softmax are computed through ``xdnn`` and the classification is printed.
    When ``args['golden']`` is set, Top-1/Top-5 hits are counted against the
    golden map and the average accuracy is printed at the end.

    Exits the process with status 1 if the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine()
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    num_images = len(img_paths)
    batch_sz = args['batch_sz']

    fpgaOutput = np.empty(
        (batch_sz, args['fpgaoutsz']), dtype=np.float32, order='C')
    fcOutput = np.empty(
        (batch_sz, args['outsz']), dtype=np.float32, order='C')
    batch_array = np.empty(
        (batch_sz,) + args['in_shape'], dtype=np.float32, order='C')
    labels = xdnn_io.get_labels(args['labels'])

    # The counters always exist, so the final report never depends on
    # whether the golden branch ran.
    goldenMap = None
    top1Count = 0
    top5Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    # range() instead of xrange() so the loop runs on Python 2 and 3 alike.
    for i in range(0, num_images, batch_sz):
        pl = []
        # NOTE: batch_array is reused across batches; on a short final batch
        # the rows past len(pl) still hold the previous batch's data.
        for j, p in enumerate(img_paths[i:i + batch_sz]):
            # NOTE(review): in_shape[2] is passed before in_shape[1]; confirm
            # against loadImageBlobFromFile's parameter order.
            batch_array[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, args['img_raw_scale'], args['img_mean'],
                args['img_input_scale'],
                args['in_shape'][2], args['in_shape'][1])
            pl.append(p)

        fpgaRT.execute(batch_array, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, fpgaOutput,
                       batch_sz, args['outsz'], args['fpgaoutsz'], fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)

        if args['golden']:
            for j, p in enumerate(pl):
                top1Count += xdnn_io.isTopK(
                    softmaxOut[j], goldenMap, p, labels, 1)
                top5Count += xdnn_io.isTopK(
                    softmaxOut[j], goldenMap, p, labels, 5)

    xdnn.closeHandle()

    # Skip the report when no images were found (avoids dividing by zero).
    if args['golden'] and num_images > 0:
        # The formatting happens INSIDE the call: applying "%" to the result
        # of print(...) fails wherever print is a function.
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            num_images,
            float(top1Count) / float(num_images) * 100.,
            float(top5Count) / float(num_images) * 100.))
def fpga_process_async(qFrom, qTo, args, num_img, sharedInputArrs, prepProcQ,
                       streamQ, fpgaOutputs):
    """Feed queued images to the FPGA in batches using asynchronous streams.

    Opens a handle on the device given by ``args["deviceID"]`` and starts a
    thread running ``xdnn_wait``.  It then repeatedly takes a free stream id
    from ``streamQ``, pulls up to ``args['batch_sz']``
    ``(shared-array index, image index)`` pairs from ``qFrom``, and submits
    the batch with ``exec_async``.  Each submitted batch is announced to the
    waiter thread through an internal queue; a ``(None, None, None)`` item
    is put on that queue once the loop ends.

    The loop runs until ``num_img`` images have been consumed, or forever
    when ``args['perpetual']`` is truthy.  Exits the process with status 1
    if the FPGA handle cannot be created.
    """
    ret, handles = xdnn.createHandle(
        args['xclbin'], "kernelSxdnn_0", [args["deviceID"]])
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    wait_queue = mp.Queue(maxsize=100)
    stream_count = args['numstream']
    batch_size = args['batch_sz']
    # One list of input pointers per stream.
    input_ptrs = [[] for _ in range(stream_count)]
    consumed = 0

    waiter = threading.Thread(
        target=xdnn_wait,
        args=(fpgaRT, wait_queue, qTo, prepProcQ),
    )
    waiter.start()

    while consumed < num_img or args['perpetual']:
        # Slots that stay unfilled keep the -1 placeholder.
        image_ids = np.full((batch_size,), -1, dtype=np.int32)
        stream_id = streamQ.get()
        stream_ptrs = []
        input_ptrs[stream_id] = stream_ptrs
        shared_indices = []

        for slot in range(batch_size):
            shared_idx, image_idx = qFrom.get()
            consumed += 1
            image_ids[slot] = image_idx

            flat_view = np.frombuffer(
                sharedInputArrs[shared_idx].get_obj(), dtype=np.float32)
            batched_view = flat_view[np.newaxis, ...]
            float_ptr = batched_view.ctypes.data_as(
                ctypes.POINTER(ctypes.c_float))
            stream_ptrs.append(float_ptr)
            shared_indices.append(shared_idx)

            # Stop filling the batch once the requested total is reached.
            if consumed == num_img:
                break

        output_view = np.frombuffer(
            fpgaOutputs[stream_id].get_obj(), dtype=np.float32)
        fpgaRT.exec_async(stream_ptrs, output_view, stream_id)
        wait_queue.put((stream_id, image_ids, shared_indices))

    # End-of-work marker for the waiter thread.
    wait_queue.put((None, None, None))
    waiter.join()
    xdnn.closeHandle()
def fpga_process_async(qFrom, qTo, args, num_img, sharedInputArrs, prepProcQ,
                       streamQ, fpgaOutputs, compJson):
    """Feed queued images to the FPGA in batches, addressing tensors by name.

    Opens a handle on the device given by ``args["deviceID"]`` and starts a
    thread running ``xdnn_wait``.  It then repeatedly takes a free stream id
    from ``streamQ``, pulls up to ``args['batch_sz']``
    ``(shared-array index, image index)`` pairs from ``qFrom``, and submits
    the batch with ``exec_async``.  Inputs and outputs are passed as
    dictionaries keyed by the first input/output name reported by
    ``compJson``; the output view is reshaped to the batch size followed by
    the first output shape without its leading dimension.

    The loop runs until ``num_img`` images have been consumed, or forever
    when ``args['perpetual']`` is truthy.  Exits the process with status 1
    if the FPGA handle cannot be created.
    """
    ret, handles = xdnn.createHandle(
        args['xclbin'], "kernelSxdnn_0", [args["deviceID"]])
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    wait_queue = mp.Queue(maxsize=100)
    stream_count = args['numstream']
    batch_size = args['batch_sz']
    # One list of input arrays per stream.
    input_ptrs = [[] for _ in range(stream_count)]
    consumed = 0

    waiter = threading.Thread(
        target=xdnn_wait,
        args=(fpgaRT, wait_queue, qTo, prepProcQ),
    )
    waiter.start()

    # NOTE(review): iterkeys()/itervalues() are Python 2 dict APIs.
    input_name = next(compJson.getInputs().iterkeys())
    output_name = next(compJson.getOutputs().iterkeys())
    output_shape = next(compJson.getOutputs().itervalues())
    # The first input shape is looked up but not used below.
    next(compJson.getInputs().itervalues())

    while consumed < num_img or args['perpetual']:
        # Slots that stay unfilled keep the -1 placeholder.
        image_ids = np.full((batch_size,), -1, dtype=np.int32)
        stream_id = streamQ.get()
        stream_inputs = []
        input_ptrs[stream_id] = stream_inputs
        shared_indices = []

        for slot in range(batch_size):
            shared_idx, image_idx = qFrom.get()
            consumed += 1
            image_ids[slot] = image_idx

            flat_view = np.frombuffer(
                sharedInputArrs[shared_idx].get_obj(), dtype=np.float32)
            stream_inputs.append(flat_view)
            shared_indices.append(shared_idx)

            # Stop filling the batch once the requested total is reached.
            if consumed == num_img:
                break

        flat_output = np.frombuffer(
            fpgaOutputs[stream_id].get_obj(), dtype=np.float32)
        output_view = flat_output.reshape(
            (args['batch_sz'],) + tuple(output_shape[1:]))
        fpgaRT.exec_async(
            {input_name: stream_inputs},
            {output_name: output_view},
            stream_id,
        )
        wait_queue.put((stream_id, image_ids, shared_indices))

    # End-of-work marker for the waiter thread.
    wait_queue.put((None, None, None))
    waiter.join()
    xdnn.closeHandle()
def setup(self, bottom, top):
    """Parse the layer parameters and open the FPGA runtime.

    ``self.param_str`` is evaluated into ``self.param_dict`` and converted
    to an argument dictionary with ``xdnn_io.make_dict_args``.  The FPGA
    handle is then created, ``scaleB`` and ``PE`` are forced to ``1`` and
    ``-1``, and the runtime object, the input/output name lists and a
    ``CompilerJsonParser`` for ``netcfg`` are stored on ``self``.

    ``bottom`` and ``top`` are not used here (presumably this is a Caffe
    Python-layer hook, which fixes the signature; confirm against the
    enclosing class).

    Raises:
        Exception: if the FPGA handle cannot be created.
    """
    # NOTE(review): eval() runs whatever expression param_str contains.  If
    # the string only ever holds a literal dict, ast.literal_eval would be a
    # safer parser; confirm the prototxt contents before changing it.
    self.param_dict = eval(self.param_str)

    options = xdnn_io.make_dict_args(self.param_dict)
    self._args = options
    # The batch size doubles as the number of PEs (marked as a hack upstream).
    self._numPE = options["batch_sz"]

    # Open the FPGA handle.
    ret, handles = xdnn.createHandle(options["xclbin"], "kernelSxdnn_0")
    if ret != 0:
        raise Exception("Failed to open FPGA handle.")

    options["scaleB"] = 1
    options["PE"] = -1

    # Runtime interface object and name lists taken from the options.
    self.fpgaRT = xdnn.XDNNFPGAOp(handles, options)
    self._indictnames = options["input_names"]
    self._outdictnames = options["output_names"]
    self._parser = xdnn.CompilerJsonParser(options["netcfg"])
def __init__(self, maxNumStreams):
    """Open the FPGA runtime and set up bookkeeping for ``maxNumStreams`` streams.

    The configuration comes from ``xdnn_io.processCommandLine()``.  After
    construction ``_streamsAvailable`` holds the ids ``0 .. maxNumStreams-1``
    and ``_streamInputs`` / ``_streamOutputs`` hold one ``None`` placeholder
    per stream.

    Exits the process with status 1 if the FPGA handle cannot be created.
    """
    self._maxNumStreams = maxNumStreams
    self._streamsAvailable = []
    self._streamInputs = []
    self._streamOutputs = []

    config = xdnn_io.processCommandLine()
    self._config = config

    ret, handles = xdnn.createHandle(config['xclbin'])
    if ret != 0:
        sys.exit(1)

    self._fpgaRT = xdnn.XDNNFPGAOp(handles, config)
    weights, bias = xdnn_io.loadFCWeightsBias(config)
    self._fcWeight = weights
    self._fcBias = bias
    self._labels = xdnn_io.get_labels(config['labels'])

    # Every stream starts out free, with no input/output attached yet.
    placeholders = [None] * maxNumStreams
    self._streamsAvailable.extend(range(maxNumStreams))
    self._streamInputs.extend(placeholders)
    self._streamOutputs.extend(placeholders)
def executeOnFPGA(sProtoBufPath, Qmode, Inference_Data, handle, name,
                  num_models):
    """Run ``num_models`` copies of a model in parallel and log the throughput.

    A random batch of 128 images is generated once and submitted to every
    model asynchronously (model ``i`` uses ``PE`` and stream id ``i``).
    After all results are fetched, the fully-connected layer and softmax are
    computed for each model, the overall throughput is printed, and a result
    row is appended both to ``Inference_Data`` and to
    ``multinet_results.csv``.

    Args:
        sProtoBufPath: forwarded to ``initializeFpgaModel``.
        Qmode: forwarded to ``initializeFpgaModel``; also used to label the
            experiment as ``"<Qmode>_bit_mode"``.
        Inference_Data: list of result dicts; the new row is appended to it
            in place.
        handle: ignored; a fresh handle is always created here.
        name: unused.
        num_models: number of model instances to run in parallel.

    Raises:
        RuntimeError: if the FPGA handle cannot be created.
    """
    TOTAL_IMAGES = 128
    results_csv = 'multinet_results.csv'

    # Create handle for FPGA; fail early instead of continuing with a bad one.
    ret, handle = xdnn.createHandle(
        "../overlaybins/" + "aws" + "/overlay_1.xclbin", "kernelSxdnn_0")
    if ret != 0:
        raise RuntimeError("Failed to open FPGA handle.")

    # Per-model state, keyed by the model index as a string.
    fpgaRT = {}
    fpgaOutput = {}
    fcOutput = {}
    fcWeight = {}
    fcBias = {}
    configs = {}

    # One shared input batch for every model.
    batch_array = generateRandomBatch(TOTAL_IMAGES, None)
    num_images = len(batch_array)

    for i in range(num_models):
        key = str(i)
        config = initializeFpgaModel(sProtoBufPath, Qmode)
        config["PE"] = i
        config["name"] = config["name"] + "_" + key
        # Load weights to FPGA
        config = TransferWeightsFPGA(num_images, config, handle, i)
        fpgaRT[key] = xdnn.XDNNFPGAOp(handle, config)
        fcWeight[key], fcBias[key] = xdnn_io.loadFCWeightsBias(config)
        # Keep each model's own buffers/config so the FC step below does not
        # depend on whatever the last loop iteration left behind.
        fpgaOutput[key], fcOutput[key], configs[key] = \
            AllocateMemoryToHost(config)

    start0 = time.time()

    # Schedule FPGA execution asynchronously
    for i in range(num_models):
        key = str(i)
        fpgaRT[key].exec_async(batch_array, fpgaOutput[key], i)

    # Fetch results of all parallel executions
    for i in range(num_models):
        key = str(i)
        fpgaRT[key].get_result(i)
        model_cfg = configs[key]
        # Inner product (fully connected layer), then softmax.
        xdnn.computeFC(fcWeight[key], fcBias[key], fpgaOutput[key],
                       model_cfg['batch_sz'], model_cfg['outsz'],
                       model_cfg['fpgaoutsz'], fcOutput[key])
        xdnn.computeSoftmax(fcOutput[key])

    end = time.time()
    duration = end - start0
    print("throughput", (num_models * num_images / duration),
          "duration", duration)

    Inference_Data.append({
        "experiment": str(Qmode) + "_bit_mode",
        "duration_overall": duration,
        "imgsPerSecAll": num_models * num_images / duration,
        "num_models_parallel": num_models
    })

    xdnn.closeHandle()

    # Merge the new rows into the results file.  On the first run there is
    # no file to read yet, so the new rows are written on their own.
    new_rows = pd.DataFrame(Inference_Data)
    try:
        previous = pd.read_csv(results_csv)
    except IOError:
        combined = new_rows
    else:
        # pd.concat replaces DataFrame.append, which newer pandas removed.
        combined = pd.concat([previous, new_rows])
    combined.to_csv(results_csv)
def networkForward(netcfg, layername):
    """Execute the network described by ``netcfg`` once and return its runtime.

    Command-line options are parsed with the default xdnn parser plus a
    required ``--layerindex`` option.  ``batch_sz`` is forced to 1 and
    ``PE`` to 0 so only a single PE is measured.  The first image found
    under ``args['images']`` is loaded into the first input buffer before
    the network is executed.

    Args:
        netcfg: path of the compiler JSON; stored as ``args['netcfg']``.
        layername: unused; kept for interface compatibility.

    Returns:
        The average of ``fpgaRT.get_exec_time()`` over the executed
        iterations (currently a single iteration).

    Exits the process with status 1 if the FPGA handle cannot be created.
    """
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--layerindex', type=int, default=0,
                        help='Index value for layer in json', required=True)
    argvt = parser.parse_args()
    args = xdnn_io.make_dict_args(argvt)
    args['netcfg'] = netcfg

    # Hardcode these parameters, so we only have to look at performance of 1 PE
    args["batch_sz"] = 1
    args["PE"] = 0

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    # try/finally so the handle is released on every path, including the
    # normal return below.
    try:
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
        fpgaOutput = fpgaRT.getOutputs()
        fpgaInput = fpgaRT.getInputs()

        img_paths = xdnn_io.getFilePaths(args['images'])
        first_descriptor = tuple(fpgaRT.getInputDescriptors().values())[0]
        inShape = (args['batch_sz'],) + tuple(first_descriptor[1:])
        firstInput = list(fpgaInput.values())[0]

        # Only the first image is ever used as input data, so it is loaded
        # once instead of once per file in the directory.
        if len(img_paths) > 0:
            firstInput[0, ...], _ = xdnn_io.loadImageBlobFromFile(
                img_paths[0], args['img_raw_scale'], args['img_mean'],
                args['img_input_scale'], inShape[2], inShape[3])

        # How many iterations to run, and average across
        iterations = 1
        total_time = 0.0
        for _ in range(iterations):
            fpgaRT.execute(fpgaInput, fpgaOutput)
            total_time += fpgaRT.get_exec_time()

        return total_time / iterations
    finally:
        xdnn.closeHandle()