def post_process(qFrom, args, img_paths, streamQ, fpgaOutputs):
    """Drain FPGA results from ``qFrom`` and finish classification on the host.

    For every ``(sId, img_idx)`` message the fully-connected layer and the
    softmax are computed over ``fpgaOutputs[sId]``; ``sId`` is then put on
    ``streamQ``.  When ``args['golden']`` is set the batch is scored against
    the golden map, and when ``args['zmqpub']`` is set the predictions are
    sent through a ``ZmqResultPublisher``.  The loop stops at the first
    message whose ``sId`` or ``img_idx`` is ``None``; throughput (and the
    accuracy summary, if enabled) is printed afterwards.

    Args:
        qFrom: Queue yielding ``(sId, img_idx)`` tuples.
        args: Option mapping.  Keys read here: ``labels``, ``zmqpub``,
            ``deviceID``, ``golden``, ``batch_sz``, ``outsz``, ``fpgaoutsz``.
            The whole mapping is also passed to ``loadFCWeightsBias``.
        img_paths: Sequence indexed by the non-negative entries of ``img_idx``.
        streamQ: Queue that receives ``sId`` after ``computeFC`` has run on
            that stream's buffer.
        fpgaOutputs: Indexable by ``sId``; each entry provides ``get_obj()``,
            whose buffer is viewed as float32.
    """
    numProcessed = 0
    labels = xdnn_io.get_labels(args['labels'])
    zmqPub = None
    if args['zmqpub']:
        zmqPub = ZmqResultPublisher(args['deviceID'])
    goldenMap = None
    top1Count = 0
    top5Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(args)
    bsz = args['batch_sz']
    fcOutput = np.empty((bsz, args['outsz']), dtype=np.float32, order='C')
    start = timeit.default_timer()
    while True:
        (sId, img_idx) = qFrom.get()
        if numProcessed == 0:
            # Restart the clock until the first image has been counted, so
            # the wait for the first result is not part of the throughput.
            start = timeit.default_timer()
        if sId is None or img_idx is None:
            break

        # Negative indices are skipped; only the rest count as images.
        imgList = []
        for x in np.nditer(img_idx):
            if x >= 0:
                imgList.append(img_paths[x])
                numProcessed += 1

        npout_view = np.frombuffer(fpgaOutputs[sId].get_obj(),
                                   dtype=np.float32)
        xdnn.computeFC(fcWeight, fcBias, npout_view, bsz, args['outsz'],
                       args['fpgaoutsz'], fcOutput)
        streamQ.put(sId)

        smaxOutput = xdnn.computeSoftmax(fcOutput)
        if args['golden']:
            for i, p in enumerate(imgList):
                top1Count += xdnn_io.isTopK(smaxOutput[i], goldenMap, p,
                                            labels, 1)
                top5Count += xdnn_io.isTopK(smaxOutput[i], goldenMap, p,
                                            labels, 5)

        if zmqPub is not None:
            predictMsg = xdnn_io.getClassification(smaxOutput,
                                                   imgList,
                                                   labels,
                                                   zmqPub=True)
            zmqPub.send(predictMsg)

    # Use the same clock that produced ``start``; time.time() and
    # timeit.default_timer() are different clocks on Python 3.
    elapsed = timeit.default_timer() - start
    rate = float(numProcessed) / elapsed if elapsed > 0 else 0.0
    print("%g images/s" % rate)

    # Format inside the call so this works as a print statement (Python 2)
    # and as a print function (Python 3); skip when there is nothing to
    # average over.
    if args['golden'] and numProcessed:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n"
              % (numProcessed,
                 float(top1Count) / float(numProcessed) * 100.,
                 float(top5Count) / float(numProcessed) * 100.))
Example #2
0
def main(argv):
    """Run one batch through every network listed under ``jsoncfg``.

    Steps, in order:
      1. Parse ``argv``, create the FPGA handle (exit code 1 on failure) and
         load the labels.
      2. For each network configuration build an ``XDNNFPGAOp``, derive its
         ``in_shape``, load FC weights/bias and allocate its output buffer.
      3. For each configuration load up to ``batch_sz`` images and start an
         asynchronous execution, using the configuration's position as the
         stream id.
      4. Collect every result, then run FC + softmax and print the
         classification per configuration.

    Args:
        argv: Argument list handed to ``xdnn_io.processCommandLine``.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(args['labels'])

    # TODO dict of tuples instead?
    fpgaRT = {}
    fpgaOutputs = {}
    fcWeights = {}
    fcBiases = {}
    confNames = []

    # Only the per-network entries are needed from here on.
    netconfs = args['jsoncfg']
    for netconf_args in netconfs:
        confName = str(netconf_args['name'])
        confNames.append(confName)
        fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)
        # next(iter(...)) works on Python 2 and 3, unlike .itervalues().next().
        firstInputDesc = next(
            iter(fpgaRT[confName].getInputDescriptors().values()))
        # Replace the descriptor's leading dimension with the batch size.
        netconf_args['in_shape'] = (
            (netconf_args['batch_sz'],) + tuple(firstInputDesc[1:]))
        (fcWeights[confName],
         fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
        fpgaOutputs[confName] = np.empty(
            (netconf_args['batch_sz'], int(netconf_args['fpgaoutsz'])),
            dtype=np.float32, order='C')

    # Every input batch stays referenced by this list while the asynchronous
    # executions are in flight.
    batchArrays = []
    for streamId, netconf_args in enumerate(netconfs):
        batch = np.empty(netconf_args['in_shape'], dtype=np.float32,
                         order='C')
        batchArrays.append(batch)
        img_paths = xdnn_io.getFilePaths(netconf_args['images'])
        for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
            batch[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p,
                netconf_args['img_raw_scale'],
                netconf_args['img_mean'],
                netconf_args['img_input_scale'],
                netconf_args['in_shape'][2],
                netconf_args['in_shape'][3])

        confName = str(netconf_args['name'])
        firstInputName = next(iter(fpgaRT[confName].getInputs().keys()))
        firstOutputName = next(iter(fpgaRT[confName].getOutputs().keys()))
        fpgaRT[confName].exec_async({firstInputName: batch},
                                    {firstOutputName: fpgaOutputs[confName]},
                                    streamId)

    # Stream ids were assigned in configuration order above.
    for streamId, confName in enumerate(confNames):
        fpgaRT[confName].get_result(streamId)

    for netconf_args in netconfs:
        confName = str(netconf_args['name'])
        fcOut = np.empty((netconf_args['batch_sz'], netconf_args['outsz']),
                         dtype=np.float32, order='C')
        xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                       fpgaOutputs[confName], fcOut)

        softmaxOut = xdnn.computeSoftmax(fcOut)
        # NOTE(review): this passes the raw 'images' setting rather than the
        # paths actually loaded into the batch -- confirm that is what
        # printClassification expects.
        xdnn_io.printClassification(softmaxOut, netconf_args['images'],
                                    labels)

    xdnn.closeHandle()
Example #3
0
def init_fpga():
    """Initialise the FPGA runtime and fill the module-level ``g_ctxt``.

    Arguments come from ``build_xdnn_args()`` instead of the real command
    line and are parsed into the global ``g_args``.  The function then creates
    the FPGA handle (exit code 1 on failure) and stores in ``g_ctxt``:
    ``fpgaRT``, ``fcWeight``, ``fcBias``, ``fcOutput``, ``labels`` and either
    ``fpgaOutput``/``fpgaInput``/``inShape`` (deploy mode) or
    ``fpgaOutput``/``batch_array`` (otherwise).
    """
    # Instead of using command line, we hard code it here.
    # Typing correct args is almost impossible so either do it in .sh or .py
    global g_args
    global g_ctxt
    print(" --- INIT FPGA --- \n")
    xdnnArgs = build_xdnn_args()
    print(xdnnArgs)
    g_args = xdnn_io.processCommandLine(xdnnArgs)
    print(" --- After parsing --- \n")
    print(g_args)

    print(" --- Create handle --- \n")
    ret, handles = xdnn.createHandle(g_args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        print(" --- !!! FAILED: Cannot create handle. --- \n")
        sys.exit(1)

    print(" --- Create fpgaRT --- \n")
    fpgaRT = xdnn.XDNNFPGAOp(handles, g_args)
    g_ctxt["fpgaRT"] = fpgaRT

    print(" --- Weight and Bias --- \n")
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(g_args)
    g_ctxt["fcWeight"] = fcWeight
    g_ctxt["fcBias"] = fcBias

    print(" --- Init input input/output area --- \n")
    batch_sz = g_args['batch_sz']
    if is_deploymode():
        g_ctxt['fpgaOutput'] = fpgaRT.getOutputs()
        g_ctxt['fpgaInput'] = fpgaRT.getInputs()
        # next(iter(...)) works on Python 2 and 3, unlike .itervalues().next().
        firstInputDesc = next(iter(fpgaRT.getInputDescriptors().values()))
        # Replace the descriptor's leading dimension with the batch size.
        g_ctxt['inShape'] = (batch_sz, ) + tuple(firstInputDesc[1:])
    else:
        g_ctxt['fpgaOutput'] = np.empty((batch_sz, g_args['fpgaoutsz']),
                                        dtype=np.float32,
                                        order='C')
        g_ctxt['batch_array'] = np.empty((batch_sz, ) + g_args['in_shape'],
                                         dtype=np.float32,
                                         order='C')

    g_ctxt['fcOutput'] = np.empty((batch_sz, g_args['outsz']),
                                  dtype=np.float32,
                                  order='C')

    print(" --- Get lables --- \n")
    g_ctxt['labels'] = xdnn_io.get_labels(g_args['labels'])
    # The 'golden' option is not handled here.

    print(" --- FPGA INITIALIZED! ---\n")
Example #4
0
def main():
    """Classify every image from ``args['images']`` in fixed-size batches.

    Each batch is loaded into a reusable input array, executed on the FPGA,
    passed through the FC layer and softmax, and printed.  When
    ``args['golden']`` is set, top-1/top-5 hits are accumulated and an
    average accuracy over all image paths is printed at the end.  Exits with
    code 1 if the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine()

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    batch_sz = args['batch_sz']
    fpgaOutput = np.empty((batch_sz, args['fpgaoutsz']),
                          dtype=np.float32,
                          order='C')
    fcOutput = np.empty((batch_sz, args['outsz']),
                        dtype=np.float32,
                        order='C')
    batch_array = np.empty((batch_sz, ) + args['in_shape'],
                           dtype=np.float32,
                           order='C')
    labels = xdnn_io.get_labels(args['labels'])
    goldenMap = None
    top1Count = 0
    top5Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    # range() instead of xrange() so the loop runs on Python 2 and 3.
    for i in range(0, len(img_paths), batch_sz):
        # Paths in this batch; the final batch may be shorter than batch_sz,
        # in which case the tail of batch_array keeps its previous contents.
        pl = list(img_paths[i:i + batch_sz])
        for j, p in enumerate(pl):
            batch_array[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, args['img_raw_scale'], args['img_mean'],
                args['img_input_scale'], args['in_shape'][2],
                args['in_shape'][1])

        fpgaRT.execute(batch_array, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                       args['outsz'], args['fpgaoutsz'], fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)
        if args['golden']:
            for j, p in enumerate(pl):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 5)

    xdnn.closeHandle()
    numImages = len(img_paths)
    # Format inside the call (valid on Python 2 and 3) and skip the report
    # when there are no images to average over.
    if args['golden'] and numImages:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            numImages, float(top1Count) / float(numImages) * 100.,
            float(top5Count) / float(numImages) * 100.))
Example #5
0
def init_fpga():
    """Create the xdnn manager and FPGA handle and set up module globals.

    Assigns the globals ``g_fcWeight``, ``g_fcBias``, ``g_weightsBlob``,
    ``g_inputs``, ``g_inputbuf`` and ``g_fpgaOutput``, and appends every
    stripped line of ``g_lableFile`` to ``g_labelarray``.

    Raises:
        SystemExit: If ``createManager`` or ``createHandle`` reports failure.
    """
    global g_inputs
    global g_inputbuf
    global g_fpgaOutput
    global g_weightsBlob
    global g_fcWeight
    global g_fcBias
    print(" --- INIT FPGA --- \n")
    print("xclbin: {0}.\n".format(g_xclbin))
    print("xdnnLib: {0}.\n".format(g_xdnnLib))
    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        raise SystemExit("Error: xdnn createManager failed.")
    (g_fcWeight, g_fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    # Here a truthy return value signals failure.
    ret = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib, g_numDevices)
    if ret:
        raise SystemExit("ERROR: Unable to create handle to FPGA")
    else:
        print("INFO: Sucessfully create handle to FPGA.")

    # magics.   See ml-suite/notebooks tutorial.   Should we overwrite PE?
    args = {
        'datadir': g_xdnnTestDataDir,
        'quantizecfg': g_fpgaCfgFile,
        'scaleA': g_scaleA,
        'scaleB': g_scaleB,
        'PE': -1,
        'netcfg': g_netFile
    }

    print(" --- load weights --- \n")
    g_weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

    print(" --- read lable file --- \n")
    with open(g_lableFile, 'r') as f:
        for line in f:
            g_labelarray.append(line.strip())

    print(" --- prepare inputs --- \n")
    g_inputs = np.zeros((g_batchSize, g_img_c * g_img_h * g_img_w),
                        dtype=np.float32)
    g_inputbuf = np.zeros((g_batchSize, g_img_c, g_img_h, g_img_w),
                          dtype=np.float32)

    # Same text as the former ``print "g_inputs", g_inputs`` statement, but
    # valid on both Python 2 and Python 3.
    print("g_inputs %s" % (g_inputs,))

    print(" --- prepare outputs --- \n")
    # The second return value is not used in this function.
    g_fpgaOutput, fpgaHandle = xdnn.makeFPGAFloatArray(g_fpgaOutputSize *
                                                       g_batchSize)
Example #6
0
    def __init__(self, maxNumStreams):
        """Set up the FPGA runtime and ``maxNumStreams`` stream slots.

        Parses the command line into ``self._config``, creates the FPGA
        handle (exit code 1 on failure), builds the ``XDNNFPGAOp`` runtime and
        loads FC weights/bias and labels.  Afterwards ``_streamsAvailable``
        holds the ids ``0 .. maxNumStreams - 1`` and ``_streamInputs`` /
        ``_streamOutputs`` hold one ``None`` placeholder per stream.
        """
        self._maxNumStreams = maxNumStreams
        self._streamsAvailable = []
        self._streamInputs = []
        self._streamOutputs = []

        config = xdnn_io.processCommandLine()
        self._config = config
        status, fpgaHandles = xdnn.createHandle(config['xclbin'])
        if status != 0:
            sys.exit(1)

        self._fpgaRT = xdnn.XDNNFPGAOp(fpgaHandles, config)
        self._fcWeight, self._fcBias = xdnn_io.loadFCWeightsBias(config)
        self._labels = xdnn_io.get_labels(config['labels'])

        # Populate the per-stream bookkeeping in bulk.
        streamIds = range(maxNumStreams)
        placeholders = [None for _ in streamIds]
        self._streamsAvailable.extend(streamIds)
        self._streamInputs.extend(placeholders)
        self._streamOutputs.extend(placeholders)
Example #7
0
def main():
    """Run the three-stage classification pipeline and report accuracy.

    Two child processes are started (``prep_process`` feeding ``qPrep`` and
    ``xdnn_process`` moving work from ``qPrep`` to ``qFpga``).  This process
    then reads ``(fpgaOutput, inputImageFiles)`` pairs from ``qFpga``, runs
    the FC layer and softmax, and prints the classification, until a pair of
    ``None`` values arrives.  With a golden file configured, per-batch and
    average top-1/top-5 figures are printed as well.  Exits with code 1 if
    ``createManager`` does not return ``True``.
    """
    processCommandLine()
    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    #
    # Spawn the first 2 stages of our pipeline
    # Stage 1: Process JPG
    # Stage 2: Run FPGA "classify"
    qPrep = Queue(maxsize=1)
    qFpga = Queue(maxsize=1)
    prepProc = Process(target=prep_process, args=(qPrep, ))
    xdnnProc = Process(target=xdnn_process, args=(qPrep, qFpga))
    prepProc.start()
    xdnnProc.start()

    #
    # The rest of this function post-processes FPGA output:
    # 1) Compute the final FC + Softmax layers
    # 2) Print classification & accuracy
    #
    zmqPub = None
    if g_zmqPub:
        zmqPub = ZmqResultPublisher()
    goldenMap = None
    if g_goldenFile:
        goldenMap = getGoldenMap(g_goldenFile, g_labelFile)
    numProcessed = 0
    allTop1 = 0
    allTop5 = 0
    while True:
        loopTime = timeit.default_timer()
        (fpgaOutput, inputImageFiles) = qFpga.get()

        # A (None, None) pair is the end-of-stream sentinel.
        if fpgaOutput is None and inputImageFiles is None:
            break

        startTime = timeit.default_timer()

        fcOutput = xdnn.computeFC(fcWeight, fcBias, fpgaOutput, g_batchSize,
                                  g_outputSize, g_fpgaOutputSize, g_useBlas)

        elapsedTime = timeit.default_timer() - startTime
        # print(... % ...) with a single argument behaves the same as a
        # print statement on Python 2 and as a function call on Python 3.
        print("[time] FC (%.2f ms)" % (elapsedTime * 1000))

        smaxOutput = xdnn.computeSoftmax(fcOutput, g_batchSize)

        numProcessed += g_batchSize

        (top1, top5) = printClassification(smaxOutput.flatten().tolist(),
                                           g_outputSize,
                                           inputImageFiles,
                                           g_labelFile,
                                           goldenMap,
                                           zmqPub=zmqPub)
        if goldenMap:
            print("Accuracy (i=%d) Top-1: %d, Top-5: %d"
                  % (numProcessed / g_batchSize, top1, top5))
        allTop1 += top1
        allTop5 += top5

        print("Num processed: %d" % numProcessed)
        print("\n[time] Total loop (%.2f ms)" % (
            (timeit.default_timer() - loopTime) * 1000))

    if goldenMap and numProcessed:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n"
              % (numProcessed,
                 float(allTop1) / float(numProcessed) * 100.,
                 float(allTop5) / float(numProcessed) * 100.))

    prepProc.join()
    xdnnProc.join()
def executeOnFPGA(sProtoBufPath, Qmode, Inference_Data, handle, name,
                  num_models):
    """Run ``num_models`` copies of a model in parallel and log throughput.

    A random batch of 128 images is generated once, each model instance is
    configured with ``PE`` set to its index, all executions are started
    asynchronously and then collected, with FC + softmax computed for each.
    A timing record is appended to ``Inference_Data`` and the records are
    appended to ``multinet_results.csv``.

    Args:
        sProtoBufPath: Passed to ``initializeFpgaModel``.
        Qmode: Passed to ``initializeFpgaModel``; also used in the
            ``experiment`` field of the timing record.
        Inference_Data: List of result dicts; one dict is appended in place.
        handle: Ignored -- a new handle is always created here.
        name: Unused.
        num_models: Number of model instances to run in parallel.

    Raises:
        SystemExit: If the FPGA handle cannot be created.
    """
    TOTAL_IMAGES = 128

    # Create handle for FPGA (this replaces the ``handle`` argument).
    ret, handle = xdnn.createHandle(
        "../overlaybins/" + "aws" + "/overlay_1.xclbin", "kernelSxdnn_0")
    if ret != 0:
        raise SystemExit("ERROR: Unable to create handle to FPGA")

    # Per-model state, keyed by the model index as a string.
    fpgaRT = {}
    fpgaOutput = {}
    fcWeight = {}
    fcBias = {}
    fcOutputs = {}
    configs = {}

    # Generate batch
    batch_array = generateRandomBatch(TOTAL_IMAGES, None)

    for i in range(0, num_models):
        key = str(i)
        config = initializeFpgaModel(sProtoBufPath, Qmode)
        config["PE"] = i
        config["name"] = config["name"] + "_" + key
        # Load weights to FPGA
        config = TransferWeightsFPGA(len(batch_array), config, handle, i)
        fpgaRT[key] = xdnn.XDNNFPGAOp(handle, config)
        (fcWeight[key], fcBias[key]) = xdnn_io.loadFCWeightsBias(config)
        fpgaOutput[key], fcOutputs[key], config = AllocateMemoryToHost(config)
        # Keep each model's own config and FC buffer instead of reusing
        # whichever model happened to be set up last.
        configs[key] = config

    start0 = time.time()
    # Schedule FPGA execution asynchronously
    for i in range(0, num_models):
        fpgaRT[str(i)].exec_async(batch_array, fpgaOutput[str(i)], i)

    # Fetch results of all parallel executions
    for i in range(0, num_models):
        key = str(i)
        # Get FPGA output
        fpgaRT[key].get_result(i)
        # Compute Inner product - fully connected layer
        xdnn.computeFC(fcWeight[key], fcBias[key], fpgaOutput[key],
                       configs[key]['batch_sz'], configs[key]['outsz'],
                       configs[key]['fpgaoutsz'], fcOutputs[key])
        # Compute output softmax; the result is not used, only timed.
        xdnn.computeSoftmax(fcOutputs[key])

    end = time.time()
    print("throughput", (num_models * len(batch_array) / (end - start0)),
          "duration", end - start0)
    # Append results
    Inference_Data.append({
        "experiment":
        str(Qmode) + "_bit_mode",
        "duration_overall":
        end - start0,
        "imgsPerSecAll":
        num_models * len(batch_array) / (end - start0),
        "num_models_parallel":
        num_models
    })
    xdnn.closeHandle()

    new_rows = pd.DataFrame(Inference_Data)
    # The results file must already exist; it is read, extended and rewritten.
    result = pd.read_csv('multinet_results.csv')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    result = pd.concat([result, new_rows])
    # NOTE(review): the index is written as a column, so each read/write
    # round trip adds an unnamed column -- confirm whether that is wanted.
    result.to_csv('multinet_results.csv')
Example #9
0
def post_process():
    """Drive the pipeline, post-process FPGA output and print a summary.

    Starts ``xdnn_process`` first, waits for one message on ``qMsgFromXdnn``,
    then starts ``prep_process``.  Each loop iteration fetches one FPGA
    output, runs FC (unless ``g_bypassFC``) and softmax, and accumulates
    top-1/top-5 counts (unless ``g_bypassLoad``).  The loop ends when
    ``getFpgaOutputs`` returns ``None`` or ``g_numImages`` images have been
    processed.  Timing starts after the first iteration, so the throughput
    figures exclude the first batch.  Exits with code 1 if ``createManager``
    does not return ``True``.
    """
    global g_numProcessed
    processCommandLine()
    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    loadImages()
    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    # sharedInputArrs = rolling bank of shared memory blocks
    # -- 1 bank for each stream
    sharedInputArrs = []
    for _ in range(4):
        sharedInputArrs.append(
            sharedctypes.RawArray(ctypes.c_short,
                                  g_fpgaBatchSize * g_paddedImageSize))

    # Spawn the first 2 stages of our pipeline
    # Stage 1: Process JPG
    # Stage 2: Run FPGA "classify"
    qFpga = Queue(maxsize=1)
    qPrep = Queue(maxsize=1)
    qMsgFromXdnn = Queue(maxsize=1)

    # start FPGA proc first to make sure FPGA is done initializing
    xdnnProc = Process(target=xdnn_process,
                       args=(qPrep, qFpga, qMsgFromXdnn, sharedInputArrs))
    xdnnProc.start()

    # only start prep proc after FPGA xdnn proc is ready; the message
    # content itself is not used.
    qMsgFromXdnn.get()
    prepProc = Process(target=prep_process, args=(qPrep, sharedInputArrs))
    prepProc.start()

    #
    # The rest of this function post-processes FPGA output:
    # 1) Compute the final FC + Softmax layers
    # 2) Print classification & accuracy
    #
    zmqPub = None
    if g_zmqPub:
        zmqPub = ZmqResultPublisher()
    goldenMap = None
    if g_goldenFile:
        goldenMap = getGoldenMap(g_goldenFile)
    g_numProcessed = 0
    allTop1 = 0
    allTop5 = 0

    startTime = None
    while True:
        fpgaOutput = getFpgaOutputs(qFpga)

        if g_numImages is not None and g_numProcessed >= g_numImages:
            break

        if fpgaOutput is None:
            break

        # Image names for this batch, wrapping around the input list.
        inputImageFiles = []
        for i in range(g_batchSize):
            idx = (g_numProcessed + i) % len(g_allInputImageFiles)
            inputImageFiles.append(g_allInputImageFiles[idx])

        if g_bypassFC:
            fcOutput = np.zeros(g_batchSize * g_outputSize)
        else:
            fcOutput = fullyConnected(fcWeight, fcBias, fpgaOutput,
                                      g_batchSize, g_outputSize,
                                      g_fpgaOutputSize, g_useBlas)
        smaxOutput = softmax(fcOutput, g_batchSize)
        g_numProcessed += g_batchSize

        if not g_bypassLoad:
            (top1, top5) = reportAccuracy(smaxOutput.flatten().tolist(),
                                          g_outputSize, inputImageFiles,
                                          g_labels, goldenMap, zmqPub, True)
            allTop1 += top1
            allTop5 += top5

        if startTime is None:
            # set startTime after skipping 1st iteration
            startTime = timeit.default_timer()

    # startTime stays None when the loop ended before any batch completed;
    # computing the difference unconditionally would raise and skip the
    # joins below.
    elapsed = None
    if startTime is not None:
        elapsed = (timeit.default_timer() - startTime) * 1000  # ms

    prepProc.join()
    xdnnProc.join()

    g_perfProf.syncToShared()
    g_perfProf.printSummary()

    if g_numProcessed > 1:
        # The clock starts after the first iteration, so one whole batch
        # (not one image) is outside the timed window.
        numProfiled = g_numProcessed - g_batchSize
        print("===========================================")
        print("Performance Summary\n")
        print("  Images: %d" % (g_numProcessed))
        if goldenMap is not None:
            print("  Top1: %.2f%%" % (100 * allTop1 / float(g_numProcessed)))
            print("  Top5: %.2f%%" % (100 * allTop5 / float(g_numProcessed)))
        print("  Batch Size: %d" % (g_batchSize))
        # Timing lines only make sense when something was actually timed.
        if numProfiled > 0 and elapsed is not None and elapsed > 0:
            print("  Total Batches: %d" % (numProfiled / g_batchSize))
            print("  Total Time: %.2f ms" % (elapsed))
            print("  Time/Batch: %.2f ms"
                  % (g_batchSize * elapsed / numProfiled))
            print("  Time/Image: %.2f ms" % (elapsed / numProfiled))
            print("  Images/Second: %f" % (1000 * numProfiled / elapsed))
        print("===========================================\n")