def fpga_process(qin, qout):
    """Worker loop: pull input batches from *qin*, run each through the FPGA,
    and push the raw FPGA output blob onto *qout*.

    A ``None`` received on *qin* — or a falsy result from input
    preparation — ends the loop; ``None`` is then forwarded on *qout*
    as the end-of-stream marker before the FPGA handle is released.
    Relies on the module-level ``config`` dict and ``g_batchSize``.
    """
    init_fpga()
    # iter() with a None sentinel keeps calling qin.get() until it yields None.
    for batch in iter(qin.get, None):
        prepped = pyxfdnn.prepareInputsForFpga(
            batch, config["quantizecfg"], config["scaleB"], -1,
            config["firstfpgalayer"])
        if not prepped:
            # Preparation failed: bail out, but still signal end-of-stream below.
            break
        pyxfdnn.execute(
            config["fpgacommands"], config["weightsBlob"], prepped,
            config["g_fpgaOutput"], g_batchSize, config["quantizecfg"],
            config["scaleB"])
        qout.put(config["g_fpgaOutput"])
    # Tell the downstream consumer no more results are coming.
    qout.put(None)
    pyxfdnn.closeHandle()
def __exit__(self, *exc_info):
    """Context-manager teardown: stop the pipeline, wait for both worker
    processes to finish, then release the FPGA handle (if one was opened)."""
    self.stop()
    # Join both workers before touching the shared FPGA handle.
    for worker in (self.proc_fpga, self.proc_bbox):
        worker.join()
    if self.xdnn_handle:
        xdnn.closeHandle()
def main():
    """Classify a directory of images on the FPGA in batches.

    Pipeline: load each batch into the FPGA input blob, run the network,
    apply the fully-connected layer and softmax on the host, print the
    per-image classification, and — when a golden-results file is given —
    report Top-1/Top-5 accuracy over the whole set.

    Exits the process with status 1 if the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine()
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fpgaOutput = fpgaRT.getOutputs()
    fpgaInput = fpgaRT.getInputs()
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    # Host-side buffer for the fully-connected layer output.
    fcOutput = np.empty((
        args['batch_sz'],
        args['outsz'],
    ), dtype=np.float32, order='C')
    # Prepend the batch dimension to the first input descriptor's shape.
    inShape = (args['batch_sz'], ) + tuple(
        tuple(fpgaRT.getInputDescriptors().values())[0][1:])
    labels = xdnn_io.get_labels(args['labels'])
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])
        top5Count = 0
        top1Count = 0
    firstInput = list(fpgaInput.values())[0]
    firstOutput = list(fpgaOutput.values())[0]
    for i in range(0, len(img_paths), args['batch_sz']):
        pl = []
        # Fill the FPGA input blob with this batch's preprocessed images.
        for j, p in enumerate(img_paths[i:i + args['batch_sz']]):
            firstInput[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, args['img_raw_scale'], args['img_mean'],
                args['img_input_scale'], inShape[2], inShape[3])
            pl.append(p)
        fpgaRT.execute(fpgaInput, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, firstOutput, fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)
        if args['golden']:
            for j, p in enumerate(img_paths[i:i + args['batch_sz']]):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 5)
    xdnn.closeHandle()
    if args['golden']:
        # BUG FIX: the original applied `%` to the return value of print()
        # — `print("...") % (...)` — which raises TypeError (None % tuple)
        # under Python 3 and never showed the accuracy. Format first, then print.
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            len(img_paths),
            float(top1Count) / float(len(img_paths)) * 100.,
            float(top5Count) / float(len(img_paths)) * 100.))
def main(argv):
    """Run several networks concurrently on the FPGA, one async stream each.

    For every network config in args['jsoncfg']: create an XDNNFPGAOp, load
    its FC weights, preprocess one batch of images, and launch it with
    exec_async on its own stream id.  After all streams are launched, wait
    for each result, then apply the host-side FC + softmax per network and
    print the classifications.  Exits with status 1 if the FPGA handle
    cannot be created.  (Python 2 only: uses iterkeys()/itervalues().)
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    # ret = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib)
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(args['labels'])

    # Per-network state, keyed by the config's 'name' field.
    # TODO dict of tuples instead?
    fpgaRT = {}
    fpgaOutputs = {}
    fcWeights = {}
    fcBiases = {}
    netFiles = {}
    confNames = []

    args = args['jsoncfg']  # we do not use other args' keys

    # Phase 1: instantiate one FPGA op per network and allocate its buffers.
    for netconf_args in args:
        confName = str(netconf_args['name'])
        confNames += [confName]
        # netconf_args['netcfg'] = './data/{}_{}.json'.format(netconf_args['net'], netconf_args['dsp'])
        fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)
        # Input shape = (batch,) + first input descriptor's non-batch dims.
        netconf_args['in_shape'] = tuple((netconf_args['batch_sz'], ) + tuple(
            fpgaRT[confName].getInputDescriptors().itervalues().next()[1:]))
        (fcWeights[confName],
         fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
        fpgaOutputs[confName] = np.empty((
            netconf_args['batch_sz'],
            int(netconf_args['fpgaoutsz']),
        ), dtype=np.float32, order='C')
        netFiles[confName] = str(netconf_args['netcfg'])

    # Phase 2: preprocess one batch per network and launch all streams
    # asynchronously (streamId == position in the jsoncfg list).
    batchArrays = []
    for streamId, netconf_args in enumerate(args):
        batchArrays.append(
            np.empty(netconf_args['in_shape'], dtype=np.float32, order='C'))
        pl = []
        img_paths = xdnn_io.getFilePaths(netconf_args['images'])
        # Only the first batch_sz images are processed for this network.
        for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
            batchArrays[-1][j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, netconf_args['img_raw_scale'], netconf_args['img_mean'],
                netconf_args['img_input_scale'], netconf_args['in_shape'][2],
                netconf_args['in_shape'][3])
            pl.append(p)
        confName = str(netconf_args['name'])
        # Bind the single (first) input/output tensor of this network.
        firstInputName = fpgaRT[confName].getInputs().iterkeys().next()
        firstOutputName = fpgaRT[confName].getOutputs().iterkeys().next()
        fpgaRT[confName].exec_async({firstInputName: batchArrays[-1]},
                                    {firstOutputName: fpgaOutputs[confName]},
                                    streamId)

    # Phase 3: block until every stream has produced its output.
    for streamId, confName in enumerate(confNames):
        fpgaRT[confName].get_result(streamId)

    # Phase 4: host-side FC + softmax, then print classifications per network.
    for netconf_args in args:
        confName = str(netconf_args['name'])
        fcOut = np.empty((netconf_args['batch_sz'], netconf_args['outsz']),
                         dtype=np.float32, order='C')
        xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                       fpgaOutputs[confName], fcOut)
        softmaxOut = xdnn.computeSoftmax(fcOut)
        xdnn_io.printClassification(softmaxOut, netconf_args['images'], labels)

    xdnn.closeHandle()
def __exit__(self, *exc_info):
    """Context-manager teardown: stop the pipeline and, if an FPGA handle
    was acquired, release it."""
    self.stop()
    if self.xdnn_handle:
        xdnn.closeHandle()
# Notebook tail: print the classification result, display the input image,
# and release the FPGA handle.
config["labels"] = "../models/caffe/flowers102/data/synset_words.txt"
pyxfdnn_io.printClassification(softmaxOut, config)

#Print Original Image for Reference
img = cv2.imread(config["images"][0])
# OpenCV loads BGR; convert to RGB so matplotlib renders true colors.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
# NOTE(review): config["images"] appears to be a list (cv2.imread above uses
# [0]), so the title is set to the whole list — confirm whether
# config["images"][0] was intended here.
plt.title(config["images"])
plt.show()

# ### 13. Close the handle

# In[15]:

pyxfdnn.closeHandle()

# ### 14. Your Turn!
# Great work! Now it is your turn!
#
# We have another trained model which leverages the Inception v1 architecture.
# This one is trained on the flowers dataset which has 102 classes.
#
# The final, fully connected layer has only 102 outputs for 102 output categories.
#
# This means that the graph and weights are different.
#
# Update this notebook to classify pretty flowers instead!
#
# Start by clicking **Kernel** from the menu, and then select **Reset & Clear Output**.
#
def fpga_process(fpgaRT, args, num_img, compJson, shared_trans_arrs, shared_output_arrs):
    """FPGA dispatch process: assemble batches from the shared input ring and
    launch them asynchronously on the FPGA.

    Reads preprocessed images from *shared_trans_arrs*, copies their ids into
    a write slot of *shared_output_arrs*, and calls exec_async per batch.  A
    companion fpga_wait thread (fed through a local queue) collects results
    and recycles the slot ids.  Runs until *num_img* images are processed,
    or forever when args['perpetual'] is set.  (Python 2 only: uses the
    print statement and iterkeys()/itervalues().)
    """
    # Create the FPGA handle/op lazily unless the caller already supplied one.
    if fpgaRT is None:
        ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", [args["deviceID"]])
        if ret != 0:
            sys.exit(1)
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    else:
        print "fpga process handle was ready:"

    # Queue of in-flight work items consumed by the fpga_wait thread.
    qWait = mp.Queue(maxsize=100)

    numStreams = args['numstream']
    bsz = args['batch_sz']
    input_ptrs = [[] for i in range(numStreams)]

    numProcessed = 0
    t = threading.Thread(target=fpga_wait,
                         args=(fpgaRT, qWait, shared_output_arrs, shared_trans_arrs))
    t.start()

    # Tensor names/shapes from the compiler JSON, in declaration order.
    input_shapes = map(lambda x: (x), compJson.getInputs().itervalues())
    output_shapes = map(lambda x: (x), compJson.getOutputs().itervalues())
    InputName_list = map(lambda x: str(x), compJson.getInputs().iterkeys())
    OutputName_list = map(lambda x: str(x), compJson.getOutputs().iterkeys())
    num_inputs = len(input_shapes)
    num_outputs = len(output_shapes)

    startTime = time.time()
    while numProcessed < num_img or args['perpetual']:
        # Reserve an output slot and expose its per-output numpy views.
        write_slot = shared_output_arrs.openWriteId()
        write_slot_arrs = shared_output_arrs.accessNumpyBuffer(write_slot)

        in_dict = {}
        out_dict = {}
        for out_idx in range(num_outputs):
            out_dict[OutputName_list[out_idx]] = write_slot_arrs[out_idx]

        # Gather up to batch_sz preprocessed images from the input ring.
        read_slot_arrs_list = []
        read_slot_list = []
        for img_num in range(args['batch_sz']):
            read_slot = shared_trans_arrs.openReadId()
            if read_slot is None:
                break
            read_slot_arrs = shared_trans_arrs.accessNumpyBuffer(read_slot)
            read_slot_arrs_list.append(read_slot_arrs)
            read_slot_list.append(read_slot)
            # Last buffer entry carries the image id; copy it to the output slot.
            write_slot_arrs[-1][img_num][:] = read_slot_arrs[-1][:]
            numProcessed += 1
            if (args['perpetual'] == False):
                if numProcessed == num_img:
                    break

        images_added = len(read_slot_arrs_list)
        # when number of images available are less than the batch size, fill the rest of the out buffer image-id slots with -1
        for img_num in range(images_added, args['batch_sz']):
            write_slot_arrs[-1][img_num][:] = -1

        # Build the per-input list-of-images dict expected by exec_async.
        for in_idx in range(num_inputs):
            in_dict[InputName_list[in_idx]] = []
            for img_idx in range(len(read_slot_arrs_list)):
                in_dict[InputName_list[in_idx]].append(
                    read_slot_arrs_list[img_idx][in_idx])

        # Launch the batch; the write slot id doubles as the stream id.
        fpgaRT.exec_async(in_dict, out_dict, write_slot)
        # Hand the in-flight slots to fpga_wait for result collection/recycling.
        qWait.put((write_slot, read_slot_list, img_num))
        #shared_trans_arrs.closeReadId(read_slot)

    # Sentinel: tell the wait thread to finish, then report throughput.
    qWait.put((None, None, None))
    t.join()
    elapsedTime = (time.time() - startTime)
    print("FPGA_process: ", float(numProcessed) / elapsedTime, "img/s")
    xdnn.closeHandle()