Beispiel #1
0
def main(argv):
    """Classify one batch of images on every network in the JSON config.

    Every entry of ``args['jsoncfg']`` gets its own ``XDNNFPGAOp`` runtime and
    is submitted with ``exec_async`` on a stream id equal to its position in
    the config.  Once all networks have been submitted, the results are
    collected and the fully-connected layer and softmax are computed on the
    host for each network before its classification is printed.

    Args:
        argv: Command-line arguments forwarded to
            ``xdnn_io.processCommandLine``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(args['labels'])

    # Per-network state, keyed by the configuration name.
    fpgaRT = {}
    fpgaOutputs = {}
    fcWeights = {}
    fcBiases = {}
    confNames = []

    # Only the per-network entries are needed from here on.
    args = args['jsoncfg']
    for netconf_args in args:
        confName = str(netconf_args['name'])
        confNames.append(confName)
        fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)

        # Input shape = requested batch size + the first input descriptor's
        # dimensions after its leading entry.  next(iter(...values())) works
        # on both Python 2 and Python 3 dictionaries.
        firstInputDesc = next(
            iter(fpgaRT[confName].getInputDescriptors().values()))
        netconf_args['in_shape'] = (
            (netconf_args['batch_sz'],) + tuple(firstInputDesc[1:]))

        (fcWeights[confName],
         fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
        fpgaOutputs[confName] = np.empty(
            (netconf_args['batch_sz'], int(netconf_args['fpgaoutsz'])),
            dtype=np.float32, order='C')

    # The input buffers are kept in a list so they stay referenced while the
    # asynchronous executions are in flight.
    batchArrays = []
    for streamId, netconf_args in enumerate(args):
        batch = np.empty(netconf_args['in_shape'], dtype=np.float32,
                         order='C')
        batchArrays.append(batch)
        img_paths = xdnn_io.getFilePaths(netconf_args['images'])
        for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
            batch[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p,
                netconf_args['img_raw_scale'],
                netconf_args['img_mean'],
                netconf_args['img_input_scale'],
                netconf_args['in_shape'][2],
                netconf_args['in_shape'][3])

        confName = str(netconf_args['name'])
        firstInputName = next(iter(fpgaRT[confName].getInputs()))
        firstOutputName = next(iter(fpgaRT[confName].getOutputs()))
        fpgaRT[confName].exec_async({firstInputName: batch},
                                    {firstOutputName: fpgaOutputs[confName]},
                                    streamId)

    # Wait for every submitted stream, in submission order.
    for streamId, confName in enumerate(confNames):
        fpgaRT[confName].get_result(streamId)

    # Host-side post-processing: fully-connected layer, then softmax.
    for netconf_args in args:
        confName = str(netconf_args['name'])
        fcOut = np.empty((netconf_args['batch_sz'], netconf_args['outsz']),
                         dtype=np.float32, order='C')
        xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                       fpgaOutputs[confName], fcOut)

        softmaxOut = xdnn.computeSoftmax(fcOut)
        xdnn_io.printClassification(softmaxOut, netconf_args['images'],
                                    labels)

    xdnn.closeHandle()
Beispiel #2
0
def main():
    """Classify all images found under ``args['images']`` in fixed-size batches.

    For each batch the images are loaded into a reusable host buffer, run
    through the FPGA runtime, and finished on the host with the
    fully-connected layer and softmax.  When ``args['golden']`` is set, top-1
    and top-5 hits are counted against the golden map and the average
    accuracy is printed at the end.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine()

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    batch_sz = args['batch_sz']

    # Host buffers are allocated once and reused for every batch.
    fpgaOutput = np.empty((batch_sz, args['fpgaoutsz']),
                          dtype=np.float32, order='C')
    fcOutput = np.empty((batch_sz, args['outsz']),
                        dtype=np.float32, order='C')
    batch_array = np.empty(((batch_sz, ) + args['in_shape']),
                           dtype=np.float32, order='C')
    labels = xdnn_io.get_labels(args['labels'])

    goldenMap = None
    top1Count = 0
    top5Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    # range() instead of xrange() so the loop runs on Python 2 and Python 3.
    for i in range(0, len(img_paths), batch_sz):
        batch_paths = img_paths[i:i + batch_sz]
        pl = []
        for j, p in enumerate(batch_paths):
            batch_array[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, args['img_raw_scale'], args['img_mean'],
                args['img_input_scale'], args['in_shape'][2],
                args['in_shape'][1])
            pl.append(p)

        # NOTE(review): a final partial batch leaves the trailing rows of
        # batch_array holding the previous batch's images; only the paths in
        # `pl` are reported and scored.
        fpgaRT.execute(batch_array, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                       args['outsz'], args['fpgaoutsz'], fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)
        if args['golden']:
            for j, p in enumerate(batch_paths):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 5)

    xdnn.closeHandle()

    # The formatting is applied inside the call so the statement behaves the
    # same as a Python 2 print statement and as the Python 3 print function.
    # The report is skipped when no images were found, avoiding a division
    # by zero.
    num_images = len(img_paths)
    if args['golden'] and num_images > 0:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            num_images,
            float(top1Count) / float(num_images) * 100.,
            float(top5Count) / float(num_images) * 100.))
def fpga_process_async(qFrom, qTo, args, num_img, sharedInputArrs, prepProcQ,
                       streamQ, fpgaOutputs):
    """Feed batches of shared-memory images to the FPGA asynchronously.

    A handle and an ``XDNNFPGAOp`` runtime are created for the device named
    by ``args["deviceID"]``.  A helper thread running ``xdnn_wait`` consumes
    ``(stream id, image indices, shared-memory indices)`` tuples from an
    internal queue; this function produces them, one per submitted batch,
    and finishes with a ``(None, None, None)`` sentinel.

    Args:
        qFrom: Queue yielding ``(shared-memory index, image index)`` pairs.
        qTo: Queue handed to ``xdnn_wait``.
        args: Run configuration (``xclbin``, ``deviceID``, ``numstream``,
            ``batch_sz``, ``perpetual`` are read here).
        num_img: Number of images after which submission stops unless
            ``args['perpetual']`` is truthy.
        sharedInputArrs: Shared arrays exposing ``get_obj()``, indexed by
            shared-memory index.
        prepProcQ: Queue handed to ``xdnn_wait``.
        streamQ: Queue yielding the stream id to use for the next batch.
        fpgaOutputs: Shared output arrays exposing ``get_obj()``, indexed by
            stream id.
    """
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                                     [args["deviceID"]])
    if ret != 0:
        sys.exit(1)
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)

    qWait = mp.Queue(maxsize=100)

    numStreams = args['numstream']
    bsz = args['batch_sz']
    # One pointer list per stream; each list is replaced when the stream is
    # reused for a new batch.
    input_ptrs = [[] for _ in range(numStreams)]

    numProcessed = 0
    waiter = threading.Thread(target=xdnn_wait,
                              args=(fpgaRT, qWait, qTo, prepProcQ))
    waiter.start()

    while numProcessed < num_img or args['perpetual']:
        # Slots that stay unfilled (short final batch) keep the value -1.
        img_list = np.full((bsz, ), -1, dtype=np.int32)
        sId = streamQ.get()
        stream_ptrs = []
        input_ptrs[sId] = stream_ptrs
        shMemIdxArr = []

        for slot in range(bsz):
            sMemIdx, img_idx = qFrom.get()
            numProcessed += 1
            img_list[slot] = img_idx

            flat_view = np.frombuffer(sharedInputArrs[sMemIdx].get_obj(),
                                      dtype=np.float32)
            batched_view = flat_view[np.newaxis, ...]
            float_ptr = batched_view.ctypes.data_as(
                ctypes.POINTER(ctypes.c_float))
            stream_ptrs.append(float_ptr)
            shMemIdxArr.append(sMemIdx)

            if numProcessed == num_img:
                break

        npout_view = np.frombuffer(fpgaOutputs[sId].get_obj(),
                                   dtype=np.float32)
        fpgaRT.exec_async(input_ptrs[sId], npout_view, sId)

        qWait.put((sId, img_list, shMemIdxArr))

    # Sentinel for the waiting thread, then wait for it before closing.
    qWait.put((None, None, None))
    waiter.join()
    xdnn.closeHandle()
Beispiel #4
0
def fpga_process_async (qFrom, qTo, args, num_img, sharedInputArrs, prepProcQ,  streamQ, fpgaOutputs, compJson):
    """Submit batches of shared-memory images to the FPGA by tensor name.

    Works like a producer for the ``xdnn_wait`` helper thread: every batch
    submitted with ``exec_async`` is announced on an internal queue as
    ``(stream id, image indices, shared-memory indices)``, and a
    ``(None, None, None)`` sentinel is sent once submission stops.

    The input/output names and the output shape are taken from the first
    entries of ``compJson.getInputs()`` / ``compJson.getOutputs()``.

    Args:
        qFrom: Queue yielding ``(shared-memory index, image index)`` pairs.
        qTo: Queue handed to ``xdnn_wait``.
        args: Run configuration (``xclbin``, ``deviceID``, ``numstream``,
            ``batch_sz``, ``perpetual`` are read here).
        num_img: Number of images after which submission stops unless
            ``args['perpetual']`` is truthy.
        sharedInputArrs: Shared arrays exposing ``get_obj()``.
        prepProcQ: Queue handed to ``xdnn_wait``.
        streamQ: Queue yielding the stream id for the next batch.
        fpgaOutputs: Shared output arrays exposing ``get_obj()``, indexed by
            stream id.
        compJson: Object providing ``getInputs()`` and ``getOutputs()``.
    """
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", [args["deviceID"]])
    if ret != 0:
        sys.exit(1)
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)

    qWait = mp.Queue(maxsize=100)

    numStreams = args['numstream']
    bsz = args['batch_sz']
    input_ptrs = []
    for _ in range(numStreams):
        input_ptrs.append([])

    numProcessed = 0
    waiter = threading.Thread(target=xdnn_wait,
                              args=(fpgaRT, qWait, qTo, prepProcQ))
    waiter.start()

    firstInputName = compJson.getInputs().iterkeys().next()
    firstOutputName = compJson.getOutputs().iterkeys().next()
    firstOutputShape = compJson.getOutputs().itervalues().next()
    # Looked up as in the original flow; not used further in this function.
    firstInputShape = compJson.getInputs().itervalues().next()

    while numProcessed < num_img or args['perpetual']:
        # Slots that stay unfilled (short final batch) keep the value -1.
        img_list = np.full((bsz,), -1, dtype=np.int32)
        sId = streamQ.get()
        stream_inputs = []
        input_ptrs[sId] = stream_inputs
        shMemIdxArr = []

        for slot in range(bsz):
            sMemIdx, img_idx = qFrom.get()
            numProcessed += 1
            img_list[slot] = img_idx
            stream_inputs.append(
                np.frombuffer(sharedInputArrs[sMemIdx].get_obj(),
                              dtype=np.float32))
            shMemIdxArr.append(sMemIdx)
            if numProcessed == num_img:
                break

        out_shape = (args['batch_sz'],) + tuple(firstOutputShape[1:])
        flat_out = np.frombuffer(fpgaOutputs[sId].get_obj(), dtype=np.float32)
        npout_view = flat_out.reshape(out_shape)
        fpgaRT.exec_async({firstInputName: input_ptrs[sId]},
                          {firstOutputName: npout_view},
                          sId)

        qWait.put((sId, img_list, shMemIdxArr))

    # Sentinel for the waiting thread, then wait for it before closing.
    qWait.put((None, None, None))
    waiter.join()
    xdnn.closeHandle()
Beispiel #5
0
def xdnn_process(qFrom, qTo):
    """Worker loop: run queued inputs through ``xdnn.execute``.

    Reads ``(inputs, inputImageFiles)`` tuples from ``qFrom`` until
    ``inputs`` is ``None`` or ``prepareFpgaInputs`` returns something falsy.
    After each execution ``(fpgaOutput, inputImageFiles)`` is put on ``qTo``
    and the elapsed time is printed.  ``(None, None)`` is put on ``qTo``
    when the loop ends.

    Args:
        qFrom: Queue delivering work items.
        qTo: Queue receiving results and the final sentinel.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    xdnn_handle = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib,
                                    g_numDevices)
    if xdnn_handle != 0:
        sys.exit(1)

    args = dict(datadir=g_xdnnTestDataDir,
                quantizecfg=g_fpgaCfgFile,
                scaleA=g_scaleA,
                scaleB=g_scaleB,
                PE=-1,
                netcfg=g_netFile)

    # Pick the weight loader matching the configured xDNN version.
    load_weights = (xdnn_io.loadWeightsBiasQuantv3 if g_xdnnv3 == True
                    else xdnn_io.loadWeightsBiasQuant)
    weightsBlob = load_weights(args)

    # A single output buffer is reused for every execution.
    fpgaOutput = prepareOutput(g_batchSize)

    while True:
        inputs, inputImageFiles = qFrom.get()
        if inputs is None:
            break

        fpgaInputs = prepareFpgaInputs(inputs)
        if not fpgaInputs:
            break

        began = timeit.default_timer()
        xdnn.execute(g_netFile, weightsBlob, fpgaInputs, fpgaOutput,
                     g_batchSize,  # num batches
                     g_fpgaCfgFile, g_scaleB, g_PE)

        qTo.put((fpgaOutput, inputImageFiles))

        # The measured span includes the qTo.put() above.
        elapsed_ms = (timeit.default_timer() - began) * 1000
        print("[time] FPGA xdnn execute (%.2f ms):" % elapsed_ms)

    qTo.put((None, None))
    xdnn.closeHandle()
Beispiel #6
0
def main():
    """Run one network once on the FPGA and classify the result on the host.

    Steps, each timed and reported: ``xdnn.execute`` on the FPGA,
    ``xdnn.computeFC`` and ``xdnn.computeSoftmax`` on the host.  The
    classification is then printed with ``xdnn_io.printClassification``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)

    weightsBlob, fcWeight, fcBias = xdnn_io.loadWeights(args)
    fpgaInputs, batch_sz = xdnn_io.prepareInput(args)
    fpgaOutput = xdnn_io.prepareOutput(args['fpgaoutsz'], batch_sz)

    # FPGA part of the network (executed exactly once).
    began = timeit.default_timer()
    xdnn.execute(args['netcfg'], weightsBlob, fpgaInputs, fpgaOutput,
                 batch_sz,  # num batches
                 args['quantizecfg'], args['scaleB'], args['PE'])
    fpga_seconds = timeit.default_timer() - began
    print("\nAfter FPGA (%f ms)" % (fpga_seconds * 1000))

    # Fully-connected layer on the host.
    began = timeit.default_timer()
    fcOut = xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                           args['outsz'], args['fpgaoutsz'], args['useblas'])
    fc_seconds = timeit.default_timer() - began
    print("\nAfter FC (%f ms)" % (fc_seconds * 1000))

    # Softmax on the host.
    began = timeit.default_timer()
    softmaxOut = xdnn.computeSoftmax(fcOut, batch_sz)
    softmax_seconds = timeit.default_timer() - began
    print("\nAfter Softmax (%f ms)" % (softmax_seconds * 1000))

    xdnn_io.printClassification(softmaxOut, args)

    print("\nSuccess!\n")
    xdnn.closeHandle()
Beispiel #7
0
def main():
    """Run every network of the JSON config concurrently and classify each.

    For each entry of ``args['jsoncfg']`` the weights, inputs and output
    buffer are prepared, then all networks are submitted with
    ``xdnn.exec_async`` and collected with ``xdnn.get_result``.  The
    fully-connected layer and softmax are computed on the host per network.
    The duration of every phase is printed in milliseconds.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine()

    tick = timeit.default_timer()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)
    print("\nAfter createHandle (%f ms):" %
          ((timeit.default_timer() - tick) * 1000))
    tick = timeit.default_timer()

    # Per-network state, keyed by the configuration name.
    fpgaInputs, fpgaOutputs = {}, {}
    weightsBlobs, fcWeights, fcBiases = {}, {}, {}
    batch_sizes, fpgaOutputSizes = {}, {}
    PEs, netFiles = {}, {}
    confNames = []

    for net in args['jsoncfg']:
        key = str(net['name'])
        confNames.append(key)
        pe_list = [int(token) for token in net['PE'].split()]
        # Read as in the original flow; the value is not used further here.
        datadir = str(net['datadir'])
        out_size = int(net['fpgaoutsz'])
        net_file = str(net['netcfg'])

        PEs[key] = pe_list
        loaded = xdnn_io.loadWeights(net)
        weightsBlobs[key], fcWeights[key], fcBiases[key] = loaded
        fpgaOutputSizes[key] = out_size
        fpgaInputs[key], batch_sizes[key] = xdnn_io.prepareInput(net, pe_list)
        fpgaOutputs[key] = xdnn_io.prepareOutput(int(net['fpgaoutsz']),
                                                 batch_sizes[key])
        netFiles[key] = net_file

    print("\nAfter init (%f ms):" % ((timeit.default_timer() - tick) * 1000))
    tick = timeit.default_timer()

    # Submit all networks before waiting on any of them.
    for net in args['jsoncfg']:
        key = str(net['name'])
        xdnn.exec_async(netFiles[key], weightsBlobs[key], fpgaInputs[key],
                        fpgaOutputs[key], int(batch_sizes[key]),
                        net['quantizecfg'], net['scaleB'], PEs[key])

    print("\nAfter Execonly (%f ms):" %
          ((timeit.default_timer() - tick) * 1000))
    tick = timeit.default_timer()

    for key in confNames:
        xdnn.get_result(PEs[key])

    print("\nAfter wait (%f ms):" % ((timeit.default_timer() - tick) * 1000))
    tick = timeit.default_timer()

    for net in args['jsoncfg']:
        key = str(net['name'])
        fcOut = xdnn.computeFC(fcWeights[key], fcBiases[key],
                               fpgaOutputs[key], batch_sizes[key],
                               net['outsz'], net['fpgaoutsz'],
                               net['useblas'])

        print("\nAfter FC (%f ms):" %
              ((timeit.default_timer() - tick) * 1000))
        tick = timeit.default_timer()

        softmaxOut = xdnn.computeSoftmax(fcOut, batch_sizes[key])

        print("\nAfter Softmax (%f ms):" %
              ((timeit.default_timer() - tick) * 1000))

        xdnn_io.printClassification(softmaxOut, net)

    print("\nSuccess!\n")

    xdnn.closeHandle()
def executeOnFPGA(sProtoBufPath, Qmode, Inference_Data, handle, name,
                  num_models):
    """Benchmark ``num_models`` copies of a model running in parallel.

    A random batch is generated once and submitted asynchronously to every
    model instance; the results are then collected and finished on the host
    (fully-connected layer and softmax).  A summary row is appended to
    ``Inference_Data`` and to the existing ``multinet_results.csv`` file.

    Args:
        sProtoBufPath: Forwarded to ``initializeFpgaModel``.
        Qmode: Forwarded to ``initializeFpgaModel``; also used to label the
            experiment as ``"<Qmode>_bit_mode"``.
        Inference_Data: List that receives one result dictionary (mutated
            in place).
        handle: Ignored; a new handle is always created here.
        name: Unused.
        num_models: Number of model instances to run in parallel.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    import sys  # local import: only needed for the failure path below

    TOTAL_IMAGES = 128

    # Create handle for FPGA; bail out on failure like the other entry points.
    ret, handle = xdnn.createHandle(
        "../overlaybins/" + "aws" + "/overlay_1.xclbin", "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    # Per-model state, indexed by model number.
    fpgaRT = []
    fpgaOutput = []
    fcWeight = []
    fcBias = []
    fcOutput = []
    configs = []

    # One random batch is shared by all model instances.
    batch_array = generateRandomBatch(TOTAL_IMAGES, None)

    for i in range(num_models):
        config = initializeFpgaModel(sProtoBufPath, Qmode)
        config["PE"] = i
        config["name"] = config["name"] + "_" + str(i)
        # Load weights to FPGA
        config = TransferWeightsFPGA(len(batch_array), config, handle, i)
        fpgaRT.append(xdnn.XDNNFPGAOp(handle, config))
        weights, bias = xdnn_io.loadFCWeightsBias(config)
        fcWeight.append(weights)
        fcBias.append(bias)
        model_output, model_fc_output, config = AllocateMemoryToHost(config)
        fpgaOutput.append(model_output)
        # Each model keeps its own FC buffer and config so the host
        # post-processing below uses matching sizes.
        fcOutput.append(model_fc_output)
        configs.append(config)

    start0 = time.time()
    # Schedule FPGA execution asynchronously
    for i in range(num_models):
        fpgaRT[i].exec_async(batch_array, fpgaOutput[i], i)

    # Fetch results of all parallel executions
    for i in range(num_models):
        fpgaRT[i].get_result(i)
        # Compute inner product - fully connected layer
        xdnn.computeFC(fcWeight[i], fcBias[i], fpgaOutput[i],
                       configs[i]['batch_sz'], configs[i]['outsz'],
                       configs[i]['fpgaoutsz'], fcOutput[i])
        # Compute output softmax (result intentionally not reported)
        xdnn.computeSoftmax(fcOutput[i])

    end = time.time()
    print("throughput", (num_models * len(batch_array) / (end - start0)),
          "duration", end - start0)

    Inference_Data.append({
        "experiment":
        str(Qmode) + "_bit_mode",
        "duration_overall":
        end - start0,
        "imgsPerSecAll":
        num_models * len(batch_array) / (end - start0),
        "num_models_parallel":
        num_models
    })
    xdnn.closeHandle()

    # Merge with the results already on disk.  pd.concat replaces
    # DataFrame.append, which was removed in pandas 2.0.
    # NOTE(review): 'multinet_results.csv' must already exist, and the index
    # is written on every run - confirm this is the intended file layout.
    new_rows = pd.DataFrame(Inference_Data)
    result = pd.read_csv('multinet_results.csv')
    result = pd.concat([result, new_rows])
    result.to_csv('multinet_results.csv')
def main(argv=None):
    """Detect objects with the 'yolo' config, then classify a crop with 'googlenet'.

    Flow, with each phase timed and printed in milliseconds:

    1. Create the FPGA handle and load weights for every entry of
       ``args['jsoncfg']`` (the ``netcfg`` path of each entry is overwritten
       with ``./data/<net>_<dsp>.cmd``).
    2. Run the configuration named ``'yolo'`` on the FPGA, post-process its
       output on the host (sigmoid, softmax, non-max suppression) and print
       the detections.
    3. Crop the first detection (plus a margin) out of the input image,
       write it to ``cropped_yolo_output.jpg`` and feed it to the
       configuration named ``'googlenet'``.
    4. Compute the fully-connected layer and softmax on the host and print
       the classification.

    Args:
        argv: Command-line arguments forwarded to
            ``xdnn_io.processCommandLine``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine(argv)

    startTime = timeit.default_timer()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to createHandle (%f ms):" % (elapsedTime * 1000)

    # we do not need other args keys except 'jsoncfg'
    args = args['jsoncfg']

    # Per-network state keyed by configuration name; each value is a dict
    # holding 'streamId', 'args', the loaded weights, 'batch_sz' and
    # 'fpgaOutputs'.
    netCfgs = defaultdict(dict)
    confNames = []
    startTime = timeit.default_timer()
    for streamId, netCfg_args in enumerate(args):
        confName = str(netCfg_args['name'])
        confNames += [confName]

        # The command file path is derived from the net and dsp settings,
        # replacing whatever 'netcfg' held before.
        netCfg_args['netcfg'] = './data/{}_{}.cmd'.format(
            netCfg_args['net'], netCfg_args['dsp'])
        netCfgs[confName]['streamId'] = streamId
        netCfgs[confName]['args'] = netCfg_args
        (netCfgs[confName]['weightsBlobs'], netCfgs[confName]['fcWeights'],
         netCfgs[confName]['fcBiases']) = xdnn_io.loadWeights(netCfg_args)
        # Output buffers are sized for a batch of one image.
        netCfgs[confName]['batch_sz'] = 1
        netCfgs[confName]['fpgaOutputs'] = xdnn_io.prepareOutput(
            netCfg_args["fpgaoutsz"], netCfgs[confName]['batch_sz'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to init (%f ms):" % (elapsedTime * 1000)

    ## run YOLO
    # NOTE(review): netCfgs is a defaultdict, so a config list without a
    # 'yolo' entry yields an empty dict here and fails on the first key access.
    confName = 'yolo'
    netCfg = netCfgs[confName]

    startTime = timeit.default_timer()
    (netCfg['fpgaInputs'], netCfg['batch_sz'],
     netCfg['shapes']) = xdnn_io.prepareInput(netCfg['args'],
                                              netCfg['args']['PE'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to transfer input image to FPGA (%f ms):" % (elapsedTime *
                                                               1000)

    startTime = timeit.default_timer()
    xdnn.exec_async(netCfg['args']['netcfg'], netCfg['weightsBlobs'],
                    netCfg['fpgaInputs'], netCfg['fpgaOutputs'],
                    netCfg['batch_sz'], netCfg['args']['quantizecfg'],
                    netCfg['args']['scaleB'], netCfg['args']['PE'],
                    netCfg['streamId'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to execute Yolo on FPGA (%f ms):" % (elapsedTime * 1000)

    startTime = timeit.default_timer()
    xdnn.get_result(netCfg['args']['PE'], netCfg['streamId'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to retrieve yolo outputs from FPGA (%f ms):" % (elapsedTime *
                                                                  1000)

    startTime = timeit.default_timer()
    # Output grid height and width are both in_shape[1] / 32.
    # NOTE(review): '/' is integer division only under Python 2 with integer
    # in_shape values; the reshape below needs integers.
    out_h         = \
    out_w         = netCfg['args']['in_shape'][1] / 32
    anchor_boxes = 5
    objectness = 1
    coordinates = 4
    classes = 80
    # Channels per anchor box: objectness + box coordinates + class scores.
    out_c = objectness + coordinates + classes

    # Reshape the fpgaOutputs into a 4D volume
    yolo_outputs = netCfg['fpgaOutputs'].reshape(anchor_boxes, out_c, out_h,
                                                 out_w)

    # Apply sigmoid to 1st, 2nd, 4th channel for all anchor boxes
    yolo_outputs[:, 0:2, :, :] = sigmoid(
        yolo_outputs[:, 0:2, :, :])  # (X,Y) Predictions
    yolo_outputs[:, 4, :, :] = sigmoid(
        yolo_outputs[:, 4, :, :])  # Objectness / Box Confidence

    # Apply softmax on the class scores foreach anchor box
    for box in range(anchor_boxes):
        yolo_outputs[box, 5:, :, :] = softmax(yolo_outputs[box, 5:, :, :])

    # Perform Non-Max Suppression
    # Non-Max Suppression filters out detections with a score lesser than 0.24
    # Additionally if there are two predections with an overlap > 30%, the prediction with the lower score will be filtered
    scorethresh = 0.24
    iouthresh = 0.3
    bboxes = nms.do_baseline_nms(yolo_outputs.flat, netCfg['shapes'][0][1],
                                 netCfg['shapes'][0][0],
                                 netCfg['args']['in_shape'][2],
                                 netCfg['args']['in_shape'][1], out_w, out_h,
                                 anchor_boxes, classes, scorethresh, iouthresh)

    # Class names, one per line of the labels file.
    with open(netCfg['args']['labels']) as f:
        namez = f.readlines()
        names = [x.strip() for x in namez]

    # Lets print the detections our model made
    for j in range(len(bboxes)):
        print("Obj %d: %s" % (j, names[bboxes[j]['classid']]))
        print("\t score = %f" % (bboxes[j]['prob']))
        print("\t (xlo,ylo) = (%d,%d)" %
              (bboxes[j]['ll']['x'], bboxes[j]['ll']['y']))
        print("\t (xhi,yhi) = (%d,%d)" %
              (bboxes[j]['ur']['x'], bboxes[j]['ur']['y']))

    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to execute on CPU (%f ms):" % (elapsedTime * 1000)

    startTime = timeit.default_timer()

    img = cv2.imread(netCfg['args']['images'][0])
    #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # YOLO was trained with RGB, not BGR like Caffe

    # choose one of the bounding boxes
    # NOTE(review): indexing bboxes[0] below raises IndexError when the
    # detector returned no boxes - confirm whether that case can occur.
    obj_idx = 0

    # specify a margin added to the selected bounding box
    margin = 10

    # Row range runs from the box's 'ur' y to its 'll' y, column range from
    # 'll' x to 'ur' x, each widened by the margin and clamped to the image.
    H_slice = slice(max(0, bboxes[obj_idx]['ur']['y'] - margin),
                    min(img.shape[0], bboxes[obj_idx]['ll']['y'] + margin))
    W_slice = slice(max(0, bboxes[obj_idx]['ll']['x'] - margin),
                    min(img.shape[1], bboxes[obj_idx]['ur']['x'] + margin))
    img = img[H_slice, W_slice, :]

    print('pass obj {}: {} with size {} to googlenet'.format(
        obj_idx, names[bboxes[obj_idx]['classid']], img.shape))

    cv2.imwrite('cropped_yolo_output.jpg', img)
    '''
    if img.shape[-1] == 1 or img.shape[-1] == 3:
        # [H, W, C]
        old_dims = np.array(img.shape[:2], dtype=float)
    else:
        # [C, H, W]
        old_dims = np.array(img.shape[1:], dtype=float)
    '''

    ## run GOOGLENET
    confName = 'googlenet'
    netCfg = netCfgs[confName]
    '''
    new_dims = netCfg['args']['in_shape']
    if new_dims[-1] == 1 or new_dims[-1] == 3:
        # [H, W, C]
        new_dims = np.array(new_dims[:2], dtype=int)
    else:
        # [C, H, W]
        new_dims = np.array(new_dims[1:], dtype=int)

    scale_dims    = new_dims.copy()
    min_scale_idx = np.argmin(old_dims/new_dims)
    if min_scale_idx == 0:
      scale_dims[1] = scale_dims[0] * old_dims[1] / old_dims[0]
    else:
      scale_dims[0] = scale_dims[1] * old_dims[0] / old_dims[1]

    scale_dims = scale_dims.astype(int)

    # transform input image to match googlenet
    # scale the image
    print('scale image to {}'.format(scale_dims))
    img = resize_image(img, list(scale_dims))
    cv2.imwrite('rescaled_scaled.jpg', img)

    # crop the image
    crop_idxs = [np.arange(new_dims[i]) + int((scale_dims[i]-new_dims[i])/2) for i in range(2)]

    if img.shape[-1] == 1 or img.shape[-1] == 3:
        # [H, W, C]
        img = img[crop_idxs[0].reshape(-1,1), crop_idxs[1], :]
    else:
        # [C, H, W]
        img = img[:, crop_idxs[0].reshape(-1,1), crop_idxs[1]]

    print('crop image to {}'.format(img.shape))
    cv2.imwrite('rescaled_cropped.jpg', img)

    #img = np.transpose(img, (2, 0, 1))
    #cv2.imwrite('rescaled_transposed.jpg', img)
    '''

    # The in-memory crop replaces the configured image list for this network.
    netCfg['args']['images'] = [img]
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to prepare googlenet image on CPU (%f ms):" % (elapsedTime *
                                                                 1000)

    startTime = timeit.default_timer()
    (netCfg['fpgaInputs'], netCfg['batch_sz'],
     netCfg['shapes']) = xdnn_io.prepareInput(netCfg['args'],
                                              netCfg['args']['PE'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to transfer input image to FPGA (%f ms):" % (elapsedTime *
                                                               1000)

    startTime = timeit.default_timer()
    xdnn.exec_async(netCfg['args']['netcfg'], netCfg['weightsBlobs'],
                    netCfg['fpgaInputs'], netCfg['fpgaOutputs'],
                    netCfg['batch_sz'], netCfg['args']['quantizecfg'],
                    netCfg['args']['scaleB'], netCfg['args']['PE'],
                    netCfg['streamId'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to execute googlenet on FPGA (%f ms):" % (elapsedTime * 1000)

    startTime = timeit.default_timer()
    xdnn.get_result(netCfg['args']['PE'], netCfg['streamId'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to retrieve googlenet outputs from FPGA (%f ms):" % (
        elapsedTime * 1000)

    startTime = timeit.default_timer()
    # Flat host buffer with batch_sz * outsz elements for the FC result.
    fcOut = np.empty((netCfg['batch_sz'] * netCfg['args']['outsz']),
                     dtype=np.float32,
                     order='C')
    xdnn.computeFC(netCfg['fcWeights'], netCfg['fcBiases'],
                   netCfg['fpgaOutputs'], netCfg['batch_sz'],
                   netCfg['args']['outsz'], netCfg['args']['fpgaoutsz'], fcOut)
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to run FC layers on CPU (%f ms):" % (elapsedTime * 1000)

    startTime = timeit.default_timer()
    softmaxOut = xdnn.computeSoftmax(fcOut, netCfg['batch_sz'])
    elapsedTime = timeit.default_timer() - startTime
    print "\nTime to run Softmax on CPU (%f ms):" % (elapsedTime * 1000)

    xdnn_io.printClassification(softmaxOut, netCfg['args'])

    print "\nSuccess!\n"

    xdnn.closeHandle()
Beispiel #10
0
def networkForward(netcfg, layername):
    """Run the network described by ``netcfg`` on one PE and time it.

    The command line is parsed with the default xdnn parser plus a required
    ``--layerindex`` option.  The batch size is forced to 1 and the PE to 0
    so that the measurement covers a single processing element.

    Args:
        netcfg: Path of the network JSON file; stored in ``args['netcfg']``
            and also parsed here.
        layername: Unused; kept for interface compatibility.

    Returns:
        The execution time reported by ``fpgaRT.get_exec_time()``, averaged
        over ``iterations`` (currently 1) runs of the last processed batch.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.

    NOTE(review): when no image paths are found the loop body never runs and
    the final ``return`` raises ``UnboundLocalError``, as before.
    """
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--layerindex', type=int, default=0, help='Index value for layer in json', required=True)
    argvt = parser.parse_args()
    args = xdnn_io.make_dict_args(argvt)

    args['netcfg'] = netcfg
    # Hardcode these parameters, so we only have to look at performance of 1 PE
    args["batch_sz"] = 1
    args["PE"] = 0

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    # try/finally guarantees the handle is released.  Previously
    # closeHandle() sat after the return statement and was never reached.
    try:
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
        fpgaOutput = fpgaRT.getOutputs()
        fpgaInput = fpgaRT.getInputs()

        img_paths = xdnn_io.getFilePaths(args['images'])
        inShape = (args['batch_sz'],) + tuple(
            tuple(fpgaRT.getInputDescriptors().values())[0][1:])

        firstInput = list(fpgaInput.values())[0]

        # How many iterations to run, and average across
        iterations = 1

        # range() instead of xrange() so the loop runs on Python 2 and 3.
        for i in range(0, len(img_paths), args['batch_sz']):
            # NOTE(review): every batch loads img_paths[0] into slot 0, as
            # the original code did; only the runtime is measured here.
            for _ in img_paths[i:i + args['batch_sz']]:
                firstInput[0, ...], _unused = xdnn_io.loadImageBlobFromFile(
                    img_paths[0], args['img_raw_scale'], args['img_mean'],
                    args['img_input_scale'], inShape[2], inShape[3])

            with open(args['netcfg']) as fp:
                data = json.load(fp)

            # Strip nodes that don't run in hardware (kept from the original
            # flow; the filtered list is not used for the measurement).
            nodes = [x for x in data['network'] if x['xdnn_kv']]

            # Accumulated runtime per iteration
            t1 = [0.0] * iterations
            for k in range(iterations):
                fpgaRT.execute(fpgaInput, fpgaOutput)
                t1[k] += fpgaRT.get_exec_time()

            # Average it
            avetime = sum(t1) / iterations

        return avetime
    finally:
        xdnn.closeHandle()
Beispiel #11
0
def xdnn_process(qFrom, qTo, qMsgFromXdnn, sharedInputArrs):
    """Streaming FPGA worker: keep one job in flight per shared input array.

    After creating the handle and loading the weights, every stream is
    executed once as a warm-up.  The current timer value is then put on
    ``qMsgFromXdnn`` and the main loop starts: for every stream id returned
    by ``getImages(qFrom)`` a job is submitted with ``xdnn.exec_async``;
    once as many jobs are pending as there are output buffers, the oldest
    one is collected and its output is forwarded with ``putFpgaOutputs``.
    A ``None`` stream id drains the pending jobs and ends the loop, after
    which ``None`` is put on ``qTo``.

    Args:
        qFrom: Queue passed to ``getImages`` to obtain the next stream id.
        qTo: Queue receiving FPGA outputs and the final ``None`` sentinel.
        qMsgFromXdnn: Queue that receives the start timestamp.
        sharedInputArrs: Input arrays, one per stream id.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """

    # Declared global but not assigned anywhere in this function.
    global g_numImages
    global g_numProcessed

    global g_img_c
    global g_img_h
    global g_img_w

    xdnn_handle = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib,
                                    g_numDevices)
    if xdnn_handle != 0:
        sys.exit(1)

    # One output buffer per shared input array (i.e. per stream).
    fpgaOutputs = []
    for inp in sharedInputArrs:
        fpgaOutputs.append(xdnn_io.prepareOutput(g_fpgaOutputSize,
                                                 g_batchSize))

    # load weights
    args = {
        'datadir': g_xdnnTestDataDir,
        'quantizecfg': g_fpgaCfgFile,
        'scaleA': g_scaleA,
        'scaleB': g_scaleB,
        'PE': -1,
        'netcfg': g_netFile
    }
    if g_xdnnv3 == True:
        weightsBlob = xdnn_io.loadWeightsBiasQuantv3(args)
    else:
        weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

    # Dummy calls to load script
    for streamId in range(len(sharedInputArrs)):
        fpgaInputs = xdnn.passThruInputsForFpga(sharedInputArrs[streamId],
                                                g_fpgaBatchSize,
                                                g_paddedImageSize,
                                                g_fpgaCfgFile, g_scaleB, -1,
                                                g_firstFpgaLayerName, streamId)
        xdnn.exec_async(g_netFile, weightsBlob, fpgaInputs,
                        fpgaOutputs[streamId], g_batchSize, g_fpgaCfgFile,
                        g_scaleB, -1, streamId)
        xdnn.get_result(-1, streamId)

    # XDNN Is Ready to Rock
    qMsgFromXdnn.put(timeit.default_timer())  # Share Start Time

    print("Streaming...")
    # Pending jobs as (streamId, submit time), oldest first.
    pendingJobQ = []
    while True:
        streamId = getImages(qFrom)
        if streamId is None:
            # finish pending jobs & quit
            for (streamId, startTime) in pendingJobQ:
                xdnn.get_result(-1, streamId)
                now = timeit.default_timer()
                g_perfProf.addSample("execute (latency)", now - startTime)
                g_perfProf.addSample("execute (thruput)", now - startTime)

                putFpgaOutputs(fpgaOutputs[streamId], qTo)
            break

        startTime = timeit.default_timer()
        fpgaInputs = xdnn.passThruInputsForFpga(sharedInputArrs[streamId],
                                                g_fpgaBatchSize,
                                                g_paddedImageSize,
                                                g_fpgaCfgFile, g_scaleB, -1,
                                                g_firstFpgaLayerName, streamId)
        g_perfProf.addSample("passThruInputsForFpga",
                             timeit.default_timer() - startTime)
        # NOTE(review): leaving here skips the drain above, so jobs still in
        # pendingJobQ are never collected - confirm this is intended.
        if not fpgaInputs:
            break

        startTime = timeit.default_timer()
        xdnn.exec_async(g_netFile, weightsBlob, fpgaInputs,
                        fpgaOutputs[streamId], g_batchSize, g_fpgaCfgFile,
                        g_scaleB, -1, streamId)
        pendingJobQ.append((streamId, startTime))

        if len(pendingJobQ) >= len(fpgaOutputs):
            # pop oldest job off the q and get_result
            (streamId, jobStartTime) = pendingJobQ.pop(0)
            xdnn.get_result(-1, streamId)
            now = timeit.default_timer()
            # Latency is measured from the collected job's own submit time;
            # the thruput sample uses the submit time of the job just queued.
            g_perfProf.addSample("execute (latency)", now - jobStartTime)
            g_perfProf.addSample("execute (thruput)", now - startTime)

            putFpgaOutputs(fpgaOutputs[streamId], qTo)

    qTo.put(None)
    g_perfProf.syncToShared()
    xdnn.closeHandle()
Beispiel #12
0
def benchmark(mode="Non-Blocking"):
    """Time repeated FPGA forward passes on random input and print a summary.

    The configuration is the first entry of the ``"jsoncfg"`` list returned
    by ``xdnn_io.processCommandLine()``.  Two input/output buffer pairs are
    prepared, one per stream id (0 and 1), and the network schedule is
    initialised once per stream.

    Args:
        mode: ``"Non-Blocking"`` (default) keeps one ``xdnn.exec_async`` job
            in flight on each of the two streams, collecting a result with
            ``xdnn.get_result`` before re-submitting on that stream.  Any
            other value runs ``xdnn.execute`` sequentially
            ``args["iterations"]`` times on the first buffer pair.

    Side effects:
        Calls ``sys.exit(1)`` when ``xdnn.createHandle`` returns a truthy
        value, prints the performance summary to stdout, and calls
        ``xdnn.closeHandle()`` before returning.
    """
    stream_ids = (0, 1)

    # Extract Arguments from json
    args = xdnn_io.processCommandLine()["jsoncfg"][0]

    if "platform" in args:
        args["xclbin"] = "../../overlaybins/" + str(
            args["platform"]) + "/" + args["xclbin"]

    # Establish Communication w/ FPGA
    if xdnn.createHandle(args['xclbin'], libFile=args['xlnxlib']):
        sys.exit(1)

    # Load weights/biases; the "usexdnnv3" flag selects the v3 loader
    if "usexdnnv3" in args and args["usexdnnv3"] == "1":
        weightsBlob = xdnn_io.loadWeightsBiasQuantv3(args)
    else:
        weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

    batch_size = args["batchsz"]
    iterations = args["iterations"]

    # Create random input data, one quantized + prepared blob per stream
    fpgaInputs = []
    for stream_id in stream_ids:
        blob = np.float32(
            np.random.standard_normal(
                (batch_size, reduce(mul, args["in_shape"], 1))))
        blob = xdnn.quantizeInputs(args["firstfpgalayer"],
                                   args["quantizecfg"], args["scaleB"], blob)
        fpgaInputs.append(
            xdnn.prepareInputsForFpga(blob, args["quantizecfg"],
                                      args["scaleB"], -1,
                                      args["firstfpgalayer"], stream_id))

    # Create buffers in host memory for result, one per stream
    fpgaOutputs = [
        xdnn_io.prepareOutput(args['fpgaoutsz'], batch_size)
        for _ in stream_ids
    ]

    # Load network schedule to accelerator, once per stream
    for stream_id in stream_ids:
        xdnn.initScript(args['netcfg'], weightsBlob, batch_size,
                        args['quantizecfg'], args['scaleB'], args['PE'],
                        stream_id)

    def submit(stream_id):
        # Queue one asynchronous batch on the given stream.
        xdnn.exec_async(args['netcfg'], weightsBlob, fpgaInputs[stream_id],
                        fpgaOutputs[stream_id], batch_size,
                        args['quantizecfg'], args['scaleB'], args['PE'],
                        stream_id)

    # Run forward propagation N times
    print("Running inference...\n")
    cumulative_time = -1 * timeit.default_timer()

    if mode == "Non-Blocking":
        # Prime both streams, then repeatedly collect and re-submit.
        for stream_id in stream_ids:
            submit(stream_id)

        # Floor division keeps the count an int on Python 3 as well; the
        # priming round above already accounts for one pass per stream.
        rounds = iterations // len(stream_ids)
        for _ in range(rounds - 1):
            for stream_id in stream_ids:
                xdnn.get_result(-1, stream_id)
                submit(stream_id)

        # Drain the jobs still in flight.
        for stream_id in stream_ids:
            xdnn.get_result(-1, stream_id)

        # The priming round always runs, so at least one batch per stream is
        # executed; an odd iteration count is rounded down to an even one.
        batches_run = len(stream_ids) * max(rounds, 1)
    else:
        for _ in range(iterations):
            xdnn.execute(args['netcfg'], weightsBlob, fpgaInputs[0],
                         fpgaOutputs[0], batch_size, args['quantizecfg'],
                         args['scaleB'], args['PE'])
        batches_run = iterations

    cumulative_time += timeit.default_timer()

    # Summarize, using the number of batches actually executed so the
    # derived rates match the measured time.
    images_run = batches_run * batch_size
    print("===========================================")
    print("Performance Summary\n")
    print("  Network: %s" % (args["name"]))
    print("  Precision: %d" % (args["precision"]))
    print("  Images: %d" % (images_run))
    print("  Batch Size: %d" % (batch_size))
    print("  Total Batches: %d" % (batches_run))
    print("  Total Time: %.2f ms" % (1000 * cumulative_time))
    print("  SIL: %.2f ms" %
          (1000 * cumulative_time / batches_run))  # Time per batch
    print("  FPS: %.2f" % (images_run / cumulative_time))
    print("  GOPS: %.2f" %
          (args["ops"] * images_run / cumulative_time / 1000000000))
    print("===========================================\n")

    # Release FPGA
    xdnn.closeHandle()