def main(argv):
    """Classify one batch per network, running all networks concurrently.

    The command line is parsed by ``xdnn_io.processCommandLine``; its
    ``'jsoncfg'`` entry is a list of per-network configuration dicts.  For
    each configuration an ``XDNNFPGAOp`` is created on the shared handle, one
    batch of images is loaded and submitted with ``exec_async`` on its own
    stream (the stream id is the configuration's position in the list), and
    once every stream has finished the fully-connected layer and softmax are
    computed on the host and the classification is printed.

    Args:
        argv: Command-line arguments forwarded to
            ``xdnn_io.processCommandLine``.

    Exits the process with status 1 when ``xdnn.createHandle`` reports a
    non-zero return code.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    # From here on the handle is open; make sure it is closed on every path.
    try:
        labels = xdnn_io.get_labels(args['labels'])

        # Per-network state, keyed by configuration name.
        fpga_rt = {}
        fpga_outputs = {}
        fc_weights = {}
        fc_biases = {}
        conf_names = []

        # Only the per-network configurations are used past this point.
        net_configs = args['jsoncfg']

        for net_cfg in net_configs:
            name = str(net_cfg['name'])
            conf_names.append(name)

            runtime = xdnn.XDNNFPGAOp(handles, net_cfg)
            fpga_rt[name] = runtime

            # Input shape = configured batch size + the first input
            # descriptor with its leading dimension dropped.
            first_descriptor = next(
                iter(runtime.getInputDescriptors().values()))
            net_cfg['in_shape'] = (
                (net_cfg['batch_sz'],) + tuple(first_descriptor[1:]))

            fc_weights[name], fc_biases[name] = (
                xdnn_io.loadFCWeightsBias(net_cfg))
            fpga_outputs[name] = np.empty(
                (net_cfg['batch_sz'], int(net_cfg['fpgaoutsz'])),
                dtype=np.float32, order='C')

        # Input buffers are kept in a list so they stay alive until the
        # asynchronous executions have been collected below.
        batch_arrays = []
        for stream_id, net_cfg in enumerate(net_configs):
            batch = np.empty(
                net_cfg['in_shape'], dtype=np.float32, order='C')
            batch_arrays.append(batch)

            img_paths = xdnn_io.getFilePaths(net_cfg['images'])
            # Only the first batch_sz images are used; if fewer are
            # available the remaining slots keep np.empty's contents.
            for slot, path in enumerate(img_paths[:net_cfg['batch_sz']]):
                batch[slot, ...], _ = xdnn_io.loadImageBlobFromFile(
                    path,
                    net_cfg['img_raw_scale'],
                    net_cfg['img_mean'],
                    net_cfg['img_input_scale'],
                    net_cfg['in_shape'][2],
                    net_cfg['in_shape'][3])

            name = str(net_cfg['name'])
            runtime = fpga_rt[name]
            first_input_name = next(iter(runtime.getInputs().keys()))
            first_output_name = next(iter(runtime.getOutputs().keys()))
            runtime.exec_async(
                {first_input_name: batch},
                {first_output_name: fpga_outputs[name]},
                stream_id)

        # Collect every stream before touching any output buffer.
        for stream_id, name in enumerate(conf_names):
            fpga_rt[name].get_result(stream_id)

        for net_cfg in net_configs:
            name = str(net_cfg['name'])
            fc_out = np.empty(
                (net_cfg['batch_sz'], net_cfg['outsz']),
                dtype=np.float32, order='C')
            xdnn.computeFC(
                fc_weights[name], fc_biases[name], fpga_outputs[name], fc_out)

            softmax_out = xdnn.computeSoftmax(fc_out)
            # NOTE(review): this passes the configured 'images' value, not
            # the list of paths that were actually loaded above -- confirm
            # that printClassification expects that.
            xdnn_io.printClassification(
                softmax_out, net_cfg['images'], labels)
    finally:
        xdnn.closeHandle()
def main():
    """Run the streaming classification pipeline across several processes.

    Three stages are wired together with multiprocessing queues and shared
    ``c_float`` arrays:

    * a pool of ``--numprepproc`` workers (``init_prepImage`` /
      ``run_prepImage``) that is handed one image index per task,
    * one process running ``fpga_process_async``,
    * one process running ``post_process``.

    ``streamQ`` is pre-filled with the stream indices ``0..numstream-1`` and
    ``prepProcQ`` with the input-buffer indices ``0..99``; one shared output
    array is allocated per stream and one shared input array per buffer
    index.  Buffer sizes come from the first input/output shape reported by
    ``xdnn.CompilerJsonParser`` for ``args['netcfg']``.

    With ``args['perpetual']`` set the image list is fed to the pool forever
    and the function never returns.  Otherwise the list is submitted once and
    the function waits for the FPGA and post-processing processes, then shuts
    the pool down.

    Exits the process with status 1 when ``xdnn.createManager`` does not
    return ``True``.
    """
    parser = xdnn_io.default_parser_args()
    parser.add_argument(
        '--numprepproc', type=int, default=1,
        help='number of parallel processes used to decode and quantize images')
    parser.add_argument(
        '--numstream', type=int, default=16,
        help='number of FPGA streams')
    parser.add_argument(
        '--deviceID', type=int, default=0,
        help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
    parser.add_argument(
        '--benchmarkmode', type=int, default=0,
        help='bypass pre/post processing for benchmarking')
    args = xdnn_io.make_dict_args(parser.parse_args())

    ret = xdnn.createManager()
    if ret != True:  # kept as an equality test to match the original check
        sys.exit(1)

    num_input_buffers = 100

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

    qPrep = mp.Queue(maxsize=args['numprepproc'] * 10)
    qFpga = mp.Queue(maxsize=100)
    streamQ = mp.Queue(maxsize=args['numstream'])
    prepProcQ = mp.Queue(maxsize=num_input_buffers)

    first_output_shape = next(iter(compilerJSONObj.getOutputs().values()))
    first_input_shape = next(iter(compilerJSONObj.getInputs().values()))

    # np.prod returns a NumPy scalar (a float for an empty shape); mp.Array
    # only treats a plain int as a size, so convert once up front.
    output_len = int(
        args['batch_sz'] * np.prod(tuple(first_output_shape[1:])))
    input_len = int(np.prod(tuple(first_input_shape)))

    fpgaOutputs = []
    for stream_idx in range(args['numstream']):
        fpgaOutputs.append(mp.Array(ctypes.c_float, output_len))
        streamQ.put(stream_idx)

    sharedInputArrs = []
    for buf_idx in range(num_input_buffers):
        sharedInputArrs.append(mp.Array(ctypes.c_float, input_len))
        prepProcQ.put(buf_idx)

    img_paths = xdnn_io.getFilePaths(args['images'])
    image_indices = range(len(img_paths))

    prep_pool = mp.Pool(
        initializer=init_prepImage,
        initargs=(args, qPrep, img_paths, sharedInputArrs, prepProcQ,
                  compilerJSONObj),
        processes=args['numprepproc'])

    xdnnProc = mp.Process(
        target=fpga_process_async,
        args=(qPrep, qFpga, args, len(img_paths), sharedInputArrs,
              prepProcQ, streamQ, fpgaOutputs, compilerJSONObj))
    xdnnProc.start()

    postProc = mp.Process(
        target=post_process,
        args=(qFpga, args, img_paths, streamQ, fpgaOutputs))
    postProc.start()

    if args['perpetual']:
        # Resubmit the whole image list each time the previous pass is done.
        while True:
            prep_pool.map_async(run_prepImage, image_indices).wait()
    else:
        # Fire and forget: completion is observed by joining the downstream
        # processes rather than by waiting on the pool result.
        prep_pool.map_async(run_prepImage, image_indices)

    xdnnProc.join()
    postProc.join()

    prep_pool.close()
    prep_pool.join()
def main():
    """Classify every image in batches and optionally report accuracy.

    Images from ``args['images']`` are processed ``args['batch_sz']`` at a
    time: each batch is loaded into a reusable input array, executed with
    ``XDNNFPGAOp.execute``, passed through the host-side fully-connected
    layer and softmax, and printed.  When ``args['golden']`` is set, Top-1
    and Top-5 hits are counted against the golden map and the average
    accuracy is printed at the end.

    Exits the process with status 1 when ``xdnn.createHandle`` reports a
    non-zero return code.
    """
    args = xdnn_io.processCommandLine()
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    top1Count = 0
    top5Count = 0
    batch_sz = args['batch_sz']

    # The handle is open from here on; close it on every exit path.
    try:
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
        fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
        img_paths = xdnn_io.getFilePaths(args['images'])

        fpgaOutput = np.empty(
            (batch_sz, args['fpgaoutsz']), dtype=np.float32, order='C')
        fcOutput = np.empty(
            (batch_sz, args['outsz']), dtype=np.float32, order='C')
        batch_array = np.empty(
            (batch_sz,) + args['in_shape'], dtype=np.float32, order='C')
        labels = xdnn_io.get_labels(args['labels'])

        if args['golden']:
            goldenMap = xdnn_io.getGoldenMap(args['golden'])

        for start in range(0, len(img_paths), batch_sz):
            batch_paths = img_paths[start:start + batch_sz]
            # A short final batch leaves the trailing slots of batch_array
            # holding whatever the previous batch wrote there.
            for slot, path in enumerate(batch_paths):
                # NOTE(review): in_shape indices are passed as [2] then [1];
                # confirm this matches loadImageBlobFromFile's expected
                # argument order for non-square inputs.
                batch_array[slot, ...], _ = xdnn_io.loadImageBlobFromFile(
                    path,
                    args['img_raw_scale'],
                    args['img_mean'],
                    args['img_input_scale'],
                    args['in_shape'][2],
                    args['in_shape'][1])

            fpgaRT.execute(batch_array, fpgaOutput)
            xdnn.computeFC(
                fcWeight, fcBias, fpgaOutput,
                batch_sz, args['outsz'], args['fpgaoutsz'], fcOutput)
            softmaxOut = xdnn.computeSoftmax(fcOutput)
            xdnn_io.printClassification(softmaxOut, batch_paths, labels)

            if args['golden']:
                for slot, path in enumerate(batch_paths):
                    top1Count += xdnn_io.isTopK(
                        softmaxOut[slot], goldenMap, path, labels, 1)
                    top5Count += xdnn_io.isTopK(
                        softmaxOut[slot], goldenMap, path, labels, 5)
    finally:
        xdnn.closeHandle()

    # Skip the summary when there were no images: the averages are undefined.
    if args['golden'] and img_paths:
        total = float(len(img_paths))
        # Format inside the call so this also works where print is a function.
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            len(img_paths),
            float(top1Count) / total * 100.,
            float(top5Count) / total * 100.))
def networkForward(netcfg, layername):
    """Execute the network described by ``netcfg`` once and return its time.

    Command-line options are read through ``xdnn_io.default_parser_args``
    (plus a required ``--layerindex``); ``netcfg`` overrides the parsed
    network config, and ``batch_sz`` / ``PE`` are forced to 1 / 0 so that
    only a single PE is measured.  The first image found under
    ``args['images']`` is loaded into slot 0 of the first input buffer, the
    network is executed, and the value of ``XDNNFPGAOp.get_exec_time()``
    averaged over the iterations is returned.

    Args:
        netcfg: Path of the network JSON to run; stored in ``args['netcfg']``.
        layername: Not used by the computation (it only appeared in a
            disabled debug print); kept for caller compatibility.

    Returns:
        The average execution time over ``iterations`` runs (currently 1).

    Exits the process with status 1 when ``xdnn.createHandle`` reports a
    non-zero return code.
    """
    parser = xdnn_io.default_parser_args()
    parser.add_argument(
        '--layerindex', type=int, default=0,
        help='Index value for layer in json', required=True)
    args = xdnn_io.make_dict_args(parser.parse_args())
    args['netcfg'] = netcfg

    # Hardcode these parameters, so we only have to look at performance of
    # one PE.
    args["batch_sz"] = 1
    args["PE"] = 0

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    # The cleanup used to sit after the return statement and never ran;
    # try/finally guarantees the handle is released on every path.
    try:
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
        fpgaOutput = fpgaRT.getOutputs()
        fpgaInput = fpgaRT.getInputs()

        img_paths = xdnn_io.getFilePaths(args['images'])
        first_descriptor = tuple(fpgaRT.getInputDescriptors().values())[0]
        inShape = (args['batch_sz'],) + tuple(first_descriptor[1:])
        firstInput = list(fpgaInput.values())[0]

        # Only the first image is ever used as input; load it once instead
        # of once per available image path.
        if img_paths:
            firstInput[0, ...], _ = xdnn_io.loadImageBlobFromFile(
                img_paths[0],
                args['img_raw_scale'],
                args['img_mean'],
                args['img_input_scale'],
                inShape[2],
                inShape[3])

        # How many iterations to run, and average across.
        iterations = 1
        total_time = 0.0
        for _ in range(iterations):
            fpgaRT.execute(fpgaInput, fpgaOutput)
            total_time += fpgaRT.get_exec_time()

        return total_time / iterations
    finally:
        xdnn.closeHandle()
def main():
    """Run the streaming classification pipeline across several processes.

    Three stages are wired together with multiprocessing queues and shared
    ``c_float`` arrays:

    * a pool of ``--numprepproc`` workers (``init_prepImage`` /
      ``run_prepImage``) that is handed one image index per task,
    * one process running ``fpga_process_async``,
    * one process running ``post_process``.

    ``streamQ`` is pre-filled with the stream indices ``0..numstream-1`` and
    ``prepProcQ`` with the input-buffer indices ``0..99``.  Each stream gets
    a shared output array of ``batch_sz * fpgaoutsz`` floats and each buffer
    index a shared input array of ``prod(in_shape)`` floats.

    With ``args['perpetual']`` set the image list is fed to the pool forever
    and the function never returns.  Otherwise the list is submitted once and
    the function waits for the FPGA and post-processing processes, then shuts
    the pool down.

    Exits the process with status 1 when ``xdnn.createManager`` does not
    return ``True``.
    """
    parser = xdnn_io.default_parser_args()
    parser.add_argument(
        '--numprepproc', type=int, default=1,
        help='number of parallel processes used to decode and quantize images')
    parser.add_argument(
        '--numstream', type=int, default=16,
        help='number of FPGA streams')
    parser.add_argument(
        '--deviceID', type=int, default=0,
        help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
    args = xdnn_io.make_dict_args(parser.parse_args())

    ret = xdnn.createManager(args['xlnxlib'])
    if ret != True:  # kept as an equality test to match the original check
        sys.exit(1)

    num_input_buffers = 100

    qPrep = mp.Queue(maxsize=args['numprepproc'] * 10)
    qFpga = mp.Queue(maxsize=100)
    streamQ = mp.Queue(maxsize=args['numstream'])
    prepProcQ = mp.Queue(maxsize=num_input_buffers)

    fpgaOutputs = []
    for stream_idx in range(args['numstream']):
        fpgaOutputs.append(
            mp.Array(ctypes.c_float, args['batch_sz'] * args['fpgaoutsz']))
        streamQ.put(stream_idx)

    # np.prod returns a NumPy scalar; mp.Array only treats a plain int as a
    # size, so convert once here instead of on every loop iteration.
    input_len = int(np.prod(args['in_shape']))
    sharedInputArrs = []
    for buf_idx in range(num_input_buffers):
        sharedInputArrs.append(mp.Array(ctypes.c_float, input_len))
        prepProcQ.put(buf_idx)

    img_paths = xdnn_io.getFilePaths(args['images'])
    image_indices = range(len(img_paths))

    prep_pool = mp.Pool(
        initializer=init_prepImage,
        initargs=(args, qPrep, img_paths, sharedInputArrs, prepProcQ),
        processes=args['numprepproc'])

    xdnnProc = mp.Process(
        target=fpga_process_async,
        args=(qPrep, qFpga, args, len(img_paths), sharedInputArrs,
              prepProcQ, streamQ, fpgaOutputs))
    xdnnProc.start()

    postProc = mp.Process(
        target=post_process,
        args=(qFpga, args, img_paths, streamQ, fpgaOutputs))
    postProc.start()

    if args['perpetual']:
        # Resubmit the whole image list each time the previous pass is done.
        while True:
            prep_pool.map_async(run_prepImage, image_indices).wait()
    else:
        # Fire and forget: completion is observed by joining the downstream
        # processes rather than by waiting on the pool result.
        prep_pool.map_async(run_prepImage, image_indices)

    xdnnProc.join()
    postProc.join()

    prep_pool.close()
    prep_pool.join()