Beispiel #1
0
def init_fpga():
    """Fill the module-level ``config`` and open the FPGA (GoogLeNet paths).

    Stores the overlay and runtime-library paths, creates the FPGA handle,
    records the quantisation / image pre-processing settings, loads the
    weights and the label file, and allocates the input and FPGA output
    buffers.  Everything is written into ``config``; nothing is returned.

    Raises:
        SystemExit: if ``pyxfdnn.createHandle`` returns a truthy status.
    """
    # asw or nimbix
    device = "nimbix"
    xclbin_path = mlsuite + "/overlaybins/" + device + "/overlay_3.xclbin"
    library_path = mlsuite + "/xfdnn/rt/xdnn_cpp/lib/libxfdnn.so"
    config["device"] = device
    config["xclbin"] = xclbin_path
    config["xfdnn_library"] = library_path

    print(" --- INIT FPGA ------\n")
    print("xclbin: {0}.\n".format(xclbin_path))
    print("xfdnn_library: {0}.\n".format(library_path))

    status = pyxfdnn.createHandle(xclbin_path, "kernelSxdnn_0", library_path)
    if status:
        raise SystemExit("ERROR: Unable to create handle to FPGA")
    print("INFO: Sucessfully create handle to FPGA.")

    # magics.   See ml-suite/notebooks tutorial.
    # The input shape is [3, 224, 224] (g_img c/h/w) and is not stored here.
    settings = (
        ("quantizecfg", "./gnet/quantization_params.json"),
        ("bitwidths", [16, 16, 16]),
        ("transpose", [2, 0, 1]),
        ("channel_swap", [2, 1, 0]),
        ("raw_scale", 255.0),
        ("img_mean", [104.007, 116.669, 122.679]),
        ("input_scale", 1.0),
        ("calibration_size", 15),
        ("calibration_directory",
         mlsuite + "/xfdnn/tools/quantize/calibration_directory"),
        ("datadir", "./gnet/bvlc_googlenet_without_lrn.caffemodel_data"),
        ("scaleA", 10000),
        ("scaleB", 30),
        ("PE", -1),
        ("transform", "resize"),
        ("firstfpgalayer", "conv1/7x7_s2"),
        ("fpgacommands", "./gnet/fpga.cmds"),
        ("fpgaoutsz", 1024),
        ("outsz", 1000),
        ("useblas", True),
        ("labels", mlsuite + "/examples/classification/synset_words.txt"),
    )
    for key, value in settings:
        config[key] = value

    # loadWeights reads its parameters from the settings stored above.
    (config["weightsBlob"], config["fcWeight"],
     config["fcBias"]) = pyxfdnn_io.loadWeights(config)

    # One label per line of the labels file, surrounding whitespace removed.
    config["labelarray"] = []
    with open(config["labels"], 'r') as label_file:
        config["labelarray"].extend(row.strip() for row in label_file)

    # prepare inputs, see ml-suite examples batch_classify.py
    flat_image_size = g_imgc * g_imgh * g_imgw
    config["g_inputs"] = np.zeros((g_batchSize, flat_image_size),
                                  dtype=np.float32)
    config["g_fpgaOutput"] = pyxfdnn_io.prepareOutput(config["fpgaoutsz"],
                                                      g_batchSize)
Beispiel #2
0
    def createHandle(self):
        """Open the FPGA and store the returned handles in ``_xdnnParams``.

        Calls ``xdnn.createHandle`` with the configured xclbin and, on
        success, saves the handles under ``self._xdnnParams['handles']``.

        Raises:
            RuntimeError: if ``xdnn.createHandle`` returns a non-zero status.
        """
        (ret, handles) = xdnn.createHandle(self._xdnnParams['xclbin'])

        if ret != 0:
            # The raise ends the method; the sys.exit(1) that used to follow
            # it could never run and has been removed.
            raise RuntimeError("Could not init FPGA: xclbin %s lib_path %s" % (self._xdnnParams['xclbin'], self._xdnnParams['lib_path']))

        self._xdnnParams['handles'] = handles
Beispiel #3
0
    def fpga_stage(config, q_fpga, q_bbox):
        """Pipeline stage that runs queued detection jobs on the FPGA.

        Opens the FPGA, loads the weights and allocates one output buffer,
        then loops forever: take a job from ``q_fpga``, prepare its images,
        execute the network and put ``(job, fpgaOutput)`` on ``q_bbox``.

        Args:
            config: settings mapping; this function reads (among others)
                'xclbin', 'xlnxlib', 'netcfg', 'batch_sz', 'quantizecfg',
                'scaleB' and 'PE', and writes 'xdnn_handle', 'weightsBlob'
                and 'images'.
            q_fpga: queue of job mappings with 'images', 'display' and 'coco'
                entries; a ``None`` job is the stop signal.
            q_bbox: queue receiving the results; ``None`` is forwarded to it
                when the stop signal arrives.

        The function does not return: it calls ``sys.exit(1)`` when the FPGA
        cannot be opened and ``sys.exit(0)`` on the stop signal.
        """
        config['xdnn_handle'] = xdnn.createHandle(
            config['xclbin'], "kernelSxdnn_0", config['xlnxlib'])
        if config['xdnn_handle'] != 0:
            # Format the message into ONE string.  Passing two positional
            # strings made the second a %-argument for a message without
            # placeholders, so the error was never logged properly.
            log.error(
                "Failed to start FPGA process - could not open xclbin %s %s!"
                % (config['xclbin'], config['xlnxlib']))
            sys.exit(1)

        # Load Weights
        config['weightsBlob'] = xdnn_io.loadWeightsBiasQuant(config)
        # Allocate FPGA Outputs
        fpgaOutput = xdnn_io.prepareOutput(
            config['out_w'] * config['out_h'] * config['bboxplanes'] *
            (config['classes'] + config['coords'] + 1), config['batch_sz'])

        while True:
            job = q_fpga.get()
            if job is None:
                q_bbox.put(None)  # propagate 'stop' signal downstream
                sys.exit(0)

            images = job['images']
            # Not used in this stage; the lookups are kept so that a job
            # without these entries still fails here.
            display = job['display']
            coco = job['coco']

            if images is None:
                log.error("Detect requires images as a parameter")
                continue

            log.info("Running Image(s):")
            log.info(images)
            config['images'] = images

            log.info("Preparing Input...")
            (fpgaInputs, shapes, _) = xdnn_io.prepareInput(config)
            job['shapes'] = shapes  # pass shapes to next stage

            # EXECUTE XDNN
            log.info("Running %s image(s)" % (config['batch_sz']))
            startTime = timeit.default_timer()
            xdnn.execute(
                config['netcfg'],
                config['weightsBlob'],
                fpgaInputs,
                fpgaOutput,
                config['batch_sz'],  # num batches
                config['quantizecfg'],
                config['scaleB'],
                config['PE'])
            elapsedTime = timeit.default_timer() - startTime

            log.info("\nTotal FPGA: %f ms" % (elapsedTime * 1000))
            log.info("Image Time: (%f ms/img):" %
                     (elapsedTime * 1000 / config['batch_sz']))

            # NOTE: the same fpgaOutput buffer is reused for every job.
            q_bbox.put((job, fpgaOutput))
Beispiel #4
0
  def fpga_stage(config, q_fpga, q_bbox, maxNumIters=-1):
    """Pipeline stage that runs queued YOLO detection jobs on the FPGA.

    Opens the FPGA, builds the runtime object and allocates the input and
    output arrays, then loops: take a job from ``q_fpga``, load its images,
    execute the network and put ``(job, fpgaOutput)`` on ``q_bbox``.

    Args:
      config: settings mapping; reads 'xclbin', 'out_w', 'out_h',
        'bboxplanes', 'classes', 'coords', 'batch_sz', 'in_shape' and 'PE',
        and writes 'xdnn_handle' and 'images'.
      q_fpga: queue of job mappings with 'images', 'display' and 'coco'
        entries; a ``None`` job is the stop signal.
      q_bbox: queue receiving the results; ``None`` is forwarded on stop.
      maxNumIters: when positive, return after that many loop iterations.

    Calls ``sys.exit(1)`` when the FPGA cannot be opened and ``sys.exit(0)``
    on the stop signal.
    """
    config['xdnn_handle'], handles = xdnn.createHandle(config['xclbin'], "kernelSxdnn_0")
    if config['xdnn_handle'] != 0:
      # Format the message into ONE string; passing two positional strings
      # made the second a %-argument for a message without placeholders.
      # 'xlnxlib' is not otherwise needed here, so read it with .get() to
      # avoid a KeyError hiding the real failure (assumes config is a dict).
      log.error(
        "Failed to start FPGA process - could not open xclbin %s %s!"
        % (config['xclbin'], config.get('xlnxlib')))
      sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, config)

    # Allocate FPGA Outputs
    fpgaOutSize = config['out_w']*config['out_h']*config['bboxplanes']*(config['classes']+config['coords']+1)
    fpgaOutput = np.empty((config['batch_sz'], fpgaOutSize,), dtype=np.float32, order='C')
    raw_img = np.empty(((config['batch_sz'],) + config['in_shape']), dtype=np.float32, order='C')

    numIters = 0
    while True:
      numIters += 1
      if maxNumIters > 0 and numIters > maxNumIters:
        break

      job = q_fpga.get()
      if job is None:
        q_bbox.put(None) # propagate 'stop' signal downstream
        sys.exit(0)

      images = job['images']
      # Not used in this stage; the lookups are kept so that a job without
      # these entries still fails here.
      display = job['display']
      coco = job['coco']

      if images is None:
        log.error("Detect requires images as a parameter")
        continue

      log.info("Running Image(s):")
      log.info(images)
      config['images'] = images

      log.info("Preparing Input...")
      shapes = []
      for i, img in enumerate(images):
        # Each image is written into row i of the shared input array.
        raw_img[i,...], s = xdnn_io.loadYoloImageBlobFromFile(img,  config['in_shape'][1], config['in_shape'][2])
        shapes.append(s)

      job['shapes'] = shapes # pass shapes to next stage

      # EXECUTE XDNN
      log.info("Running %s image(s)"%(config['batch_sz']))
      startTime = timeit.default_timer()
      fpgaRT.execute(raw_img, fpgaOutput, config['PE'])
      elapsedTime = timeit.default_timer() - startTime

      log.info("\nTotal FPGA: %f ms" % (elapsedTime*1000))
      log.info("Image Time: (%f ms/img):" % (elapsedTime*1000/config['batch_sz']))

      # NOTE: the same fpgaOutput buffer is reused for every job.
      q_bbox.put((job, fpgaOutput))
Beispiel #5
0
def main():
    """Classify the images named on the command line, one batch at a time.

    Opens the FPGA, runs every batch through the FPGA network followed by
    the fully-connected layer and softmax, and prints the classification.
    When a golden file is given, the Top-1 / Top-5 accuracy is printed at
    the end.

    Exits with status 1 if the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine()

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fpgaOutput = fpgaRT.getOutputs()
    fpgaInput = fpgaRT.getInputs()

    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    batch_sz = args['batch_sz']
    fcOutput = np.empty((batch_sz, args['outsz']),
                        dtype=np.float32,
                        order='C')

    # Batch size followed by the first input descriptor minus its own
    # leading dimension.
    inShape = (batch_sz, ) + tuple(
        tuple(fpgaRT.getInputDescriptors().values())[0][1:])

    labels = xdnn_io.get_labels(args['labels'])
    goldenMap = None
    top5Count = 0
    top1Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    firstInput = list(fpgaInput.values())[0]
    firstOutput = list(fpgaOutput.values())[0]

    for i in range(0, len(img_paths), batch_sz):
        # The last batch may hold fewer than batch_sz paths.
        pl = list(img_paths[i:i + batch_sz])
        for j, p in enumerate(pl):
            firstInput[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, args['img_raw_scale'], args['img_mean'],
                args['img_input_scale'], inShape[2], inShape[3])

        fpgaRT.execute(fpgaInput, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, firstOutput, fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)
        if args['golden']:
            for j, p in enumerate(pl):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                            labels, 5)

    xdnn.closeHandle()
    # Skip the summary when there were no images (avoids dividing by zero).
    if args['golden'] and len(img_paths) > 0:
        num_images = float(len(img_paths))
        # The %-formatting must happen inside the print call: applying it to
        # the result of print() raises TypeError on Python 3.
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" %
              (len(img_paths), float(top1Count) / num_images * 100.,
               float(top5Count) / num_images * 100.))
Beispiel #6
0
def fpga_init():
    """Parse the command line, open the FPGA and allocate output buffers.

    Returns:
        A 5-tuple ``(fpgaRT, output_buffers, input_shapes_by_name, fcWeight,
        fcBias)`` where ``output_buffers`` holds one ``{output name: float32
        array}`` dict per stream (``N_STREAMS`` of them) and
        ``input_shapes_by_name`` maps each input node name to its shape.

    Raises:
        Exception: if the manager or the FPGA handle cannot be created.
    """
    # Parse arguments
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--deviceID', type=int, default=0,
                        help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    # Create manager
    if not xdnn.createManager():
        raise Exception("Failed to create manager")

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

    # Get input and output shape.  values()/keys() exist on both Python 2
    # and Python 3 dicts, unlike itervalues()/iterkeys().
    net_inputs = compilerJSONObj.getInputs()
    net_outputs = compilerJSONObj.getOutputs()
    input_shapes = list(net_inputs.values())
    output_shapes = list(net_outputs.values())

    # Overwrite the leading dimension of every shape with the batch size
    # (the shape lists are modified in place).
    for shape in input_shapes:
        shape[0] = args['batch_sz']
    for shape in output_shapes:
        shape[0] = args['batch_sz']

    input_node_names = [str(name) for name in net_inputs.keys()]
    output_node_names = [str(name) for name in net_outputs.keys()]

    # Create runtime
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", [args["deviceID"]])
    if ret != 0:
        raise Exception("Failed to create handle, return value: {error}".format(error=ret))
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)

    print("Batch size:", args['batch_sz'])
    print("Input shapes:", input_shapes)
    print("Input nodes:", input_node_names)
    print("Ouput shapes:", output_shapes)
    print("Ouput nodes:", output_node_names)

    # One independent set of output arrays per stream.
    output_buffers = [
        {name: np.empty(shape=shape, dtype=np.float32)
         for name, shape in zip(output_node_names, output_shapes)}
        for _ in range(N_STREAMS)
    ]

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(args)

    return fpgaRT, output_buffers,\
        {name: shape for name, shape in zip(input_node_names, input_shapes)},\
        fcWeight, fcBias
Beispiel #7
0
    def setup(self, bottom, top):
        """Read the layer parameters and open the FPGA runtime.

        ``bottom`` and ``top`` are accepted but not used here.

        Raises:
            Exception: if ``xdnn.createHandle`` returns a non-zero status.
        """
        # Get args from prototxt.
        # NOTE(review): eval() executes whatever expression param_str holds,
        # so this is only safe for trusted prototxt files.
        self.param_dict = eval(self.param_str)
        self._args = xdnn_io.make_dict_args(self.param_dict)
        layer_args = self._args

        # Bryan hack to determine number of PEs in FPGA
        self._numPE = layer_args["batch_sz"]

        # Establish FPGA Communication, Load bitstream
        status, fpga_handles = xdnn.createHandle(layer_args["xclbin"],
                                                 "kernelSxdnn_0")
        if status != 0:
            raise Exception("Failed to open FPGA handle.")

        layer_args["scaleB"] = 1
        layer_args["PE"] = -1

        # Instantiate runtime interface object
        self.fpgaRT = xdnn.XDNNFPGAOp(fpga_handles, layer_args)
        self._indictnames = layer_args["input_names"]
        self._outdictnames = layer_args["output_names"]
        self._parser = xdnn.CompilerJsonParser(layer_args["netcfg"])
Beispiel #8
0
def init_fpga():
    """Fill the module-level ``config`` and open the FPGA (YOLO paths).

    Stores the overlay and runtime-library paths, creates the FPGA handle,
    records the quantisation / image pre-processing settings, loads the
    weights and allocates the FPGA output buffer.  Everything is written
    into ``config``; nothing is returned.

    Raises:
        SystemExit: if ``pyxfdnn.createHandle`` returns a truthy status.
    """
    # asw or nimbix
    device = "nimbix"
    xclbin_path = mlsuite + "/overlaybins/" + device + "/overlay_3.xclbin"
    library_path = mlsuite + "/xfdnn/rt/xdnn_cpp/lib/libxfdnn.so"
    config["device"] = device
    config["xclbin"] = xclbin_path
    config["xfdnn_library"] = library_path

    status = pyxfdnn.createHandle(xclbin_path, "kernelSxdnn_0", library_path)
    if status:
        raise SystemExit("ERROR: Unable to create handle to FPGA")
    print("INFO: Sucessfully create handle to FPGA.")

    # magics
    settings = (
        ("fpgacommands", "./yolo/fpga.cmds"),
        ("memory", 5),
        ("dsp", 56),
        ("quantizecfg", "./yolo/quantization_params.json"),
        ("bitwidths", [16, 16, 16]),
        ("in_shape", [3, 608, 608]),
        ("transpose", [2, 0, 1]),
        ("channel_swap", [2, 1, 0]),
        ("raw_scale", 1.0),
        ("img_mean", [0.0, 0.0, 0.0]),
        ("input_scale", 1.0),
        ("calibration_size", 15),
        ("calibration_directory",
         mlsuite + "/xfdnn/tools/quantize/calibration_directory"),
        ("datadir", "./yolo/yolov2.caffemodel_data"),
        ("scaleA", 10000),
        ("scaleB", 30),
        ("transform", "yolo"),
        ("firstfpgalayer", "layer1-conv"),
    )
    for key, value in settings:
        config[key] = value

    print ("Loading weights ...")
    # The notebook example sets PE explicitly; here -1 (auto select) is tried.
    config["PE"] = -1
    (config["weightsBlob"], config["fcWeight"],
     config["fcBias"]) = pyxfdnn_io.loadWeights(config)
    print ("Weights loaded.")

    # Allocate output memory.  See the notebook example for the details.
    output_size = g_anchor_boxes * g_outc * g_outh * g_outw
    config["fpgaoutsz"] = output_size
    config["g_fpgaOutput"] = pyxfdnn_io.prepareOutput(output_size, g_batchSize)
Beispiel #9
0
    def __init__(self, params):
        """Convert ``params`` to an argument dict and open the FPGA runtime.

        Raises:
            Exception: if ``xdnn.createHandle`` returns a non-zero status.
        """
        self._args = xdnn_io.make_dict_args(params)
        runtime_args = self._args

        # Bryan hack to determine number of PEs in FPGA
        self._numPE = runtime_args["batch_sz"]

        # Establish FPGA Communication, Load bitstream
        status, fpga_handles = xdnn.createHandle(runtime_args["xclbin"],
                                                 "kernelSxdnn_0")
        if status != 0:
            raise Exception("Failed to open FPGA handle.")

        runtime_args["scaleB"] = 1
        runtime_args["PE"] = -1
        self._streamIds = list(range(8))  # Allow 8 streams

        # Instantiate runtime interface object
        self.fpgaRT = xdnn.XDNNFPGAOp(fpga_handles, runtime_args)
        self._indictnames = runtime_args["input_names"]
        self._outdictnames = runtime_args["output_names"]
        self._parser = xdnn.CompilerJsonParser(runtime_args["netcfg"])
Beispiel #10
0
def fpga_init():
    """Parse the command line, open the FPGA(s) and allocate output buffers.

    Side effects: sets the global ``PORT`` from ``--port`` and multiplies the
    global ``N_STREAMS`` by the number of device IDs given.

    Returns:
        A 7-tuple ``(fpgaRT, output_buffers, first_output_name,
        input_shapes_by_name, fcWeight, fcBias, batch_sz)`` where
        ``output_buffers`` holds one ``{output name: float32 array}`` dict
        per stream and ``input_shapes_by_name`` maps each input node name to
        its shape.

    Raises:
        Exception: if the manager or the FPGA handle cannot be created.
    """
    global PORT
    global N_STREAMS
    # Parse arguments
    parser = xdnn_io.default_parser_args()
    parser.add_argument('--device-ids',
                        type=int,
                        default=[0],
                        nargs="+",
                        help='a list of device IDs for FPGA')
    parser.add_argument('--port',
                        type=int,
                        default=5000,
                        help='port to listen on')
    args = parser.parse_args()
    device_ids = args.device_ids
    PORT = args.port
    N_STREAMS *= len(device_ids)
    args = xdnn_io.make_dict_args(args)

    # Create manager
    if not xdnn.createManager():
        raise Exception("Failed to create manager")

    compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

    # Get input and output shape.  values()/keys() exist on both Python 2
    # and Python 3 dicts, unlike itervalues()/iterkeys().
    net_inputs = compilerJSONObj.getInputs()
    net_outputs = compilerJSONObj.getOutputs()
    input_shapes = list(net_inputs.values())
    output_shapes = list(net_outputs.values())

    # Overwrite the leading dimension of every shape with the batch size
    # (the shape lists are modified in place).
    for shape in input_shapes:
        shape[0] = args['batch_sz']
    for shape in output_shapes:
        shape[0] = args['batch_sz']

    input_node_names = [str(name) for name in net_inputs.keys()]
    output_node_names = [str(name) for name in net_outputs.keys()]

    # Create runtime
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                                     device_ids)
    if ret != 0:
        raise Exception(
            "Failed to create handle, return value: {error}".format(error=ret))
    fpgaRT = xdnn.XDNNFPGAOp(handles, args)

    print("Batch size:", args['batch_sz'])
    print("Input shapes:", input_shapes)
    print("Input nodes:", input_node_names)
    print("Ouput shapes:", output_shapes)
    print("Ouput nodes:", output_node_names)
    print("Using model {path}".format(path=args["netcfg"]))
    print("Using FPGA device:", device_ids)

    # One independent set of output arrays per stream.
    output_buffers = [
        {
            name: np.empty(shape=shape, dtype=np.float32)
            for name, shape in zip(output_node_names, output_shapes)
        }
        for _ in range(N_STREAMS)
    ]

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(args)

    return fpgaRT, output_buffers, output_node_names[0],\
        {name: shape for name, shape in zip(input_node_names, input_shapes)},\
        fcWeight, fcBias, args['batch_sz']
Beispiel #11
0
def main(argv):
    """Run one batch through each network listed in the JSON config.

    Every entry of ``args['jsoncfg']`` gets its own runtime object on the
    shared FPGA handle.  One batch per network is submitted asynchronously
    (stream id = position in the list), all results are collected, and each
    is then passed through the fully-connected layer and softmax and printed.

    Exits with status 1 if the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(args['labels'])

    # Per-network state, keyed by the network's configured name.
    fpgaRT = {}
    fpgaOutputs = {}
    fcWeights = {}
    fcBiases = {}
    netFiles = {}
    confNames = []

    netconfs = args['jsoncfg']  # we do not use other args' keys
    for netconf_args in netconfs:
        confName = str(netconf_args['name'])
        confNames.append(confName)
        fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)
        # next(iter(...)) works on Python 2 and 3, unlike .itervalues().next()
        firstDescriptor = next(
            iter(fpgaRT[confName].getInputDescriptors().values()))
        netconf_args['in_shape'] = tuple((netconf_args['batch_sz'], ) +
                                         tuple(firstDescriptor[1:]))
        (fcWeights[confName],
         fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
        fpgaOutputs[confName] = np.empty(
            (netconf_args['batch_sz'], int(netconf_args['fpgaoutsz'])),
            dtype=np.float32,
            order='C')
        netFiles[confName] = str(netconf_args['netcfg'])

    # Submit one batch per network.  The input arrays are kept in a list so
    # they stay alive until the results have been collected.
    batchArrays = []
    for streamId, netconf_args in enumerate(netconfs):
        batchArrays.append(
            np.empty(netconf_args['in_shape'], dtype=np.float32, order='C'))
        img_paths = xdnn_io.getFilePaths(netconf_args['images'])
        for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
            batchArrays[-1][j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, netconf_args['img_raw_scale'], netconf_args['img_mean'],
                netconf_args['img_input_scale'], netconf_args['in_shape'][2],
                netconf_args['in_shape'][3])

        confName = str(netconf_args['name'])
        firstInputName = next(iter(fpgaRT[confName].getInputs().keys()))
        firstOutputName = next(iter(fpgaRT[confName].getOutputs().keys()))
        fpgaRT[confName].exec_async({firstInputName: batchArrays[-1]},
                                    {firstOutputName: fpgaOutputs[confName]},
                                    streamId)

    # Collect every stream's result before post-processing.
    for streamId, confName in enumerate(confNames):
        fpgaRT[confName].get_result(streamId)

    for netconf_args in netconfs:
        confName = str(netconf_args['name'])
        fcOut = np.empty((netconf_args['batch_sz'], netconf_args['outsz']),
                         dtype=np.float32,
                         order='C')
        xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                       fpgaOutputs[confName], fcOut)

        softmaxOut = xdnn.computeSoftmax(fcOut)
        xdnn_io.printClassification(softmaxOut, netconf_args['images'], labels)

    xdnn.closeHandle()
Beispiel #12
0
    def fpga_stage(config, q_fpga, q_bbox, maxNumIters=-1):
        """Pipeline stage that runs queued YOLO detection jobs on the FPGA.

        Opens the FPGA, then loops: take a job from ``q_fpga``, load its
        images into the FPGA input buffer, execute the network and put
        ``(job, output)`` on ``q_bbox``.  What ``output`` is depends on
        ``config['yolo_model']``:

        * 'xilinx_yolo_v2', 'xilinx_prelu_yolo_v2', 'tiny_yolo_v2_voc':
          the first FPGA output buffer.
        * 'standard_yolo_v2', 'tiny_yolo_v2': a single array produced by
          finishing the network on the CPU with Caffe.
        * 'tiny_yolo_v3', 'standard_yolo_v3', 'spp_yolo_v3': a list of arrays
          copied from the named FPGA outputs.

        Any other model name prints "model not supported" and produces no
        output for the job.

        Args:
            config: settings mapping; reads 'xclbin', 'yolo_model',
                'batch_sz', 'in_shape', 'PE' and, for the Caffe-backed
                models, 'caffe_prototxt' and 'caffe_model'; writes
                'xdnn_handle' and 'images'.
            q_fpga: queue of job mappings with 'images', 'display' and 'coco'
                entries; a ``None`` job is the stop signal.
            q_bbox: queue receiving the results; ``None`` is forwarded on
                stop.
            maxNumIters: when positive, return after that many iterations.

        Calls ``sys.exit(1)`` when the FPGA cannot be opened and
        ``sys.exit(0)`` on the stop signal.
        """
        config['xdnn_handle'], handles = xdnn.createHandle(
            config['xclbin'], "kernelSxdnn_0")
        if config['xdnn_handle'] != 0:
            # Format the message into ONE string; passing two positional
            # strings made the second a %-argument for a message without
            # placeholders.  'xlnxlib' is not otherwise needed here, so it is
            # read with .get() (assumes config is a dict) to avoid a KeyError
            # hiding the real failure.
            log.error(
                "Failed to start FPGA process - could not open xclbin %s %s!"
                % (config['xclbin'], config.get('xlnxlib')))
            sys.exit(1)

        fpgaRT = xdnn.XDNNFPGAOp(handles, config)

        fpgaInput = fpgaRT.getInputs()
        fpgaOutput = fpgaRT.getOutputs()
        # next(iter(...)) works on Python 2 and 3, unlike .itervalues().next()
        firstInput = next(iter(fpgaInput.values()))
        firstOutput = next(iter(fpgaOutput.values()))

        # Models whose first FPGA output is forwarded unchanged.
        fpga_only_models = ('xilinx_yolo_v2', 'xilinx_prelu_yolo_v2',
                            'tiny_yolo_v2_voc')
        # Caffe blob / FPGA output names that make up the result of the
        # remaining models.
        output_layers = {
            'standard_yolo_v2': ('layer31-conv', ),
            'tiny_yolo_v2': ('layer15-conv', ),
            'tiny_yolo_v3': ('layer14-conv', 'layer21-conv'),
            'standard_yolo_v3': ('layer81-conv', 'layer93-conv',
                                 'layer105-conv'),
            'spp_yolo_v3': ('layer88-conv', 'layer100-conv', 'layer112-conv'),
        }
        model = config['yolo_model']
        fpga_only = model in fpga_only_models

        # The Caffe net and the output shapes do not change between jobs, so
        # build them once here and not once per job.
        net = None
        out_layers = ()
        out_data_shape = []
        if not fpga_only:
            net = caffe.Net(config['caffe_prototxt'], config['caffe_model'],
                            caffe.TEST)
            out_layers = output_layers.get(model, ())
            out_data_shape = [
                (config['batch_sz'], ) +
                tuple(net.blobs[layer].data.shape[1:4])
                for layer in out_layers
            ]

        def report_timing(fpga_s, cpu_s, with_log=True):
            # Print the FPGA / CPU / per-image times in ms; optionally log them.
            per_img_ms = (fpga_s + cpu_s) * 1000 / config['batch_sz']
            print(fpga_s * 1000, (cpu_s * 1000), per_img_ms)
            if with_log:
                log.info("\nTotal FPGA: %f ms" % (fpga_s * 1000))
                log.info("\nTotal CPU: %f ms" % (cpu_s * 1000))
                log.info("Image Time: (%f ms/img):" % per_img_ms)

        numIters = 0
        while True:
            numIters += 1
            if maxNumIters > 0 and numIters > maxNumIters:
                break

            job = q_fpga.get()
            if job is None:
                q_bbox.put(None)  # propagate 'stop' signal downstream
                sys.exit(0)

            images = job['images']
            # Not used in this stage; the lookups are kept so that a job
            # without these entries still fails here.
            display = job['display']
            coco = job['coco']

            if images is None:
                log.error("Detect requires images as a parameter")
                continue

            log.info("Running Image(s):")
            log.info(images)
            config['images'] = images

            # Fresh result arrays for every job (empty list for the
            # FPGA-only and unsupported models).
            softmaxOut = [np.empty(shape) for shape in out_data_shape]

            log.info("Preparing Input...")
            shapes = []
            for i, img in enumerate(images):
                firstInput[i, ...], s = xdnn_io.loadYoloImageBlobFromFile(
                    img, config['in_shape'][1], config['in_shape'][2])
                shapes.append(s)

            job['shapes'] = shapes  # pass shapes to next stage
            # EXECUTE XDNN
            log.info("Running %s image(s)" % (config['batch_sz']))

            if not fpga_only and model not in output_layers:
                print("model not supported")
                continue

            startTime = timeit.default_timer()
            fpgaRT.execute(fpgaInput, fpgaOutput, config['PE'])
            elapsedTime = timeit.default_timer() - startTime

            if fpga_only:
                log.info("\nTotal FPGA: %f ms" % (elapsedTime * 1000))
                log.info("Image Time: (%f ms/img):" %
                         (elapsedTime * 1000 / config['batch_sz']))
                q_bbox.put((job, firstOutput))
                continue

            # CPU post-processing, timed separately from the FPGA run.
            startTime = timeit.default_timer()
            if model == 'standard_yolo_v2':
                # Feed two FPGA outputs back into Caffe and run the tail of
                # the network on the CPU, one image at a time.
                for bt_idx in range(config['batch_sz']):
                    net.blobs['layer25-conv'].data[
                        ...] = fpgaOutput['layer25-conv'][bt_idx, ...]
                    net.blobs['layer27-conv'].data[
                        ...] = fpgaOutput['layer27-conv'][bt_idx, ...]
                    net.forward(start='layer28-reorg', end='layer31-conv')
                    softmaxOut[0][bt_idx,
                                  ...] = net.blobs['layer31-conv'].data[...]
                result = softmaxOut[0]

            elif model == 'tiny_yolo_v2':
                maxpool_out = np.empty_like(firstOutput)
                darknet_maxpool_k2x2_s1(firstOutput, maxpool_out)
                for bt_idx in range(config['batch_sz']):
                    net.blobs['data'].data[...] = maxpool_out[bt_idx, ...]
                    net.forward()
                    softmaxOut[0][bt_idx,
                                  ...] = net.blobs['layer15-conv'].data[...]
                result = softmaxOut[0]

            else:
                # v3 variants: copy each named FPGA output into its result
                # array, one batch entry at a time.
                for dst, layer in zip(softmaxOut, out_layers):
                    for bt_idx in range(config['batch_sz']):
                        dst[bt_idx, ...] = fpgaOutput[layer][bt_idx, ...]
                result = softmaxOut
            elapsedTime_cpu = timeit.default_timer() - startTime

            # Reporting differs per model: 'tiny_yolo_v3' reports nothing and
            # 'standard_yolo_v3' only prints.
            if model == 'standard_yolo_v3':
                report_timing(elapsedTime, elapsedTime_cpu, with_log=False)
            elif model != 'tiny_yolo_v3':
                report_timing(elapsedTime, elapsedTime_cpu)

            q_bbox.put((job, result))
Beispiel #13
0
# [here]: https://github.com/Xilinx/ml-suite

# In[6]:

# Create a handle with which to communicate to the FPGA
# The actual handle is managed by pyxfdnn

# Path of the FPGA overlay, built from the device name already stored in
# config["device"].
config["xclbin"] = "../overlaybins/" + config[
    "device"] + "/xdnn_56_16b_5m.awsxclbin"  # Chosen Hardware Overlay
## NOTE: If you change the xclbin, we likely need to change some arguments provided to the compiler
## Specifically, the DSP array width, and the memory arguments

config[
    "xfdnn_library"] = "../xfdnn/rt/xdnn_cpp/lib/libxfdnn.so"  # Library functions called by pyXFDNN

# Open the FPGA with the overlay and runtime library chosen above.
ret = pyxfdnn.createHandle(config['xclbin'], "kernelSxdnn_0",
                           config['xfdnn_library'])
# A truthy return value is treated as failure.  The failure is only printed;
# execution continues with the statements that follow.
if ret:
    print("ERROR: Unable to create handle to FPGA")
else:
    print("INFO: Successfully created handle to FPGA")

# ### 6. Apply quantization scaling and transfer model weights to the FPGA.

# In[7]:

# Quantize, and transfer the weights to FPGA DDR

# config["datadir"] = "work/" + config["caffemodel"].split("/")[-1]+"_data" # From Compiler
config[
    "scaleA"] = 10000  # Global scaler for weights (Must be defined, although not used)
config[
Beispiel #14
0
def fpga_process(fpgaRT, args, num_img, compJson, shared_trans_arrs,
                 shared_output_arrs):
    """Submit batches of inputs to the FPGA runtime and print the throughput.

    Each iteration opens one write slot in ``shared_output_arrs``, pulls up to
    ``args['batch_sz']`` read slots from ``shared_trans_arrs``, launches
    ``fpgaRT.exec_async`` on them and hands the slot ids to the ``fpga_wait``
    thread through a queue. When the loop ends, a ``(None, None, None)``
    sentinel is queued, the thread is joined and ``xdnn.closeHandle()`` is
    called.

    Args:
        fpgaRT: Runtime object exposing ``exec_async``. When ``None``, a
            handle is created from ``args['xclbin']`` / ``args['deviceID']``
            and a new ``xdnn.XDNNFPGAOp`` is built from it.
        args: Mapping of options; the keys read here are ``'xclbin'``,
            ``'deviceID'``, ``'batch_sz'`` and ``'perpetual'``.
        num_img: Number of inputs to process when ``args['perpetual']`` is
            falsy.
        compJson: Object whose ``getInputs()`` / ``getOutputs()`` return
            mappings keyed by tensor name.
        shared_trans_arrs: Provider of input slots (``openReadId`` /
            ``accessNumpyBuffer``).
        shared_output_arrs: Provider of output slots (``openWriteId`` /
            ``accessNumpyBuffer``).

    Raises:
        SystemExit: With status 1 when ``xdnn.createHandle`` reports failure.
    """
    if fpgaRT is None:
        ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                                         [args["deviceID"]])
        if ret != 0:
            sys.exit(1)
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    else:
        # Function-call form prints the same text on Python 2 and Python 3.
        print("fpga process handle was ready:")

    qWait = mp.Queue(maxsize=100)
    batch_sz = args['batch_sz']

    numProcessed = 0
    t = threading.Thread(target=fpga_wait,
                         args=(fpgaRT, qWait, shared_output_arrs,
                               shared_trans_arrs))
    t.start()

    # Materialize the tensor names as real lists so that len() and indexing
    # work on Python 3 as well (map() and dict views are lazy there).
    InputName_list = [str(name) for name in compJson.getInputs().keys()]
    OutputName_list = [str(name) for name in compJson.getOutputs().keys()]
    num_inputs = len(InputName_list)

    startTime = time.time()
    while numProcessed < num_img or args['perpetual']:

        write_slot = shared_output_arrs.openWriteId()
        write_slot_arrs = shared_output_arrs.accessNumpyBuffer(write_slot)

        # Output tensor name -> buffer of the freshly opened write slot.
        out_dict = {name: write_slot_arrs[out_idx]
                    for out_idx, name in enumerate(OutputName_list)}

        read_slot_arrs_list = []
        read_slot_list = []
        for img_num in range(batch_sz):
            read_slot = shared_trans_arrs.openReadId()

            if read_slot is None:
                break
            read_slot_arrs = shared_trans_arrs.accessNumpyBuffer(read_slot)
            read_slot_arrs_list.append(read_slot_arrs)
            read_slot_list.append(read_slot)

            # Carry the last buffer of the read slot over to the matching
            # row of the last buffer of the write slot (the image id,
            # judging by the padding comment below).
            write_slot_arrs[-1][img_num][:] = read_slot_arrs[-1][:]

            numProcessed += 1
            # Same truthiness test as the while condition above.
            if not args['perpetual'] and numProcessed == num_img:
                break

        images_added = len(read_slot_arrs_list)

        # When fewer images are available than the batch size, fill the
        # remaining image-id slots of the output buffer with -1.
        for img_num in range(images_added, batch_sz):
            write_slot_arrs[-1][img_num][:] = -1

        # Input tensor name -> list with one array per image in this batch.
        in_dict = {}
        for in_idx in range(num_inputs):
            in_dict[InputName_list[in_idx]] = [
                slot_arrs[in_idx] for slot_arrs in read_slot_arrs_list
            ]

        fpgaRT.exec_async(in_dict, out_dict, write_slot)
        # NOTE(review): after the two loops above img_num is always
        # batch_sz - 1; how fpga_wait uses it is not visible here.
        qWait.put((write_slot, read_slot_list, img_num))

    # Sentinel queued for the fpga_wait thread before joining it.
    qWait.put((None, None, None))
    t.join()
    elapsedTime = (time.time() - startTime)
    # Guard against a zero interval (e.g. nothing processed on a coarse clock).
    rate = float(numProcessed) / elapsedTime if elapsedTime > 0 else 0.0
    # A single formatted string, so Python 2 does not print a tuple repr.
    print("FPGA_process:  %s img/s" % rate)

    xdnn.closeHandle()