Exemple #1
0
def faceDetection(args, FDDB_list, FDDB_results_file):
    """Run face detection over an FDDB image list and write the results.

    For every list entry the stripped image name, the number of
    detections and one line per detection are written to
    ``FDDB_results_file``.  The file is closed before returning.

    Args:
        args: Object providing ``vitisrundir`` (handed to ``Runner``) and
            ``fddbPath`` (prefix prepended to each image name).
        FDDB_list: Iterable of image names without the ``.jpg`` suffix;
            surrounding whitespace is stripped from each entry.
        FDDB_results_file: Writable text file object receiving the output.
    """
    runner = Runner(args.vitisrundir)
    batch_sz = 1
    # One float32 buffer per tensor, inputs first and outputs second.  The
    # first dimension reported by the runner is replaced by batch_sz.
    fpgaBlobs = [[
        np.empty((batch_sz, ) +
                 tuple([tensor.dims[d] for d in range(tensor.ndims)][1:]),
                 dtype=np.float32,
                 order='C') for tensor in tensors
    ] for tensors in (runner.get_input_tensors(),
                      runner.get_output_tensors())]

    for line in FDDB_list:
        entry = line.strip()
        FDDB_results_file.write('%s\n' % entry)
        image_ori = cv2.imread(args.fddbPath + entry + '.jpg',
                               cv2.IMREAD_COLOR)
        rects = detect(runner, fpgaBlobs, image_ori)
        FDDB_results_file.write('%d\n' % len(rects))

        for rect in rects:
            # presumably rect is (x0, y0, x1, y1, score); the third and
            # fourth written fields are the differences x1 - x0, y1 - y0.
            x0, y0, x1, y1, score = (rect[0], rect[1], rect[2], rect[3],
                                     rect[4])
            FDDB_results_file.write('%d %d %d %d %f\n' %
                                    (x0, y0, x1 - x0, y1 - y0, score))
    FDDB_results_file.close()
Exemple #2
0
def fpga_process(vitisrundir, shared_trans_arrs, shared_output_arrs,
                 ready_fpga):
    """Submit input buffers to the runner until the input stream ends.

    A helper thread running ``fpga_wait`` receives every submitted
    ``(write_slot, read_slot, job_id)`` triple through a bounded queue; a
    triple of ``None`` values is queued once no more input is available.
    The throughput is printed before returning.

    Args:
        vitisrundir: Run directory handed to ``Runner``.
        shared_trans_arrs: Shared buffer pool that supplies the inputs;
            ``openReadId()`` returning ``None`` ends the loop.
        shared_output_arrs: Shared buffer pool that receives the outputs.
        ready_fpga: Queue-like object; ``1`` is put on it once the runner
            has been created.
    """
    runner = Runner(vitisrundir)
    pending = mp.Queue(maxsize=100)
    ready_fpga.put(1)
    waiter = threading.Thread(target=fpga_wait,
                              args=(runner, pending, shared_output_arrs,
                                    shared_trans_arrs))
    waiter.start()

    submitted = 0
    began = time.time()
    while True:
        # Reserve the destination buffers for this job.
        out_slot = shared_output_arrs.openWriteId()
        out_arrs = shared_output_arrs.accessNumpyBuffer(out_slot)

        # Fetch the next input; None means there is nothing left to run.
        in_slot = shared_trans_arrs.openReadId()
        if in_slot is None:
            break
        in_arrs = shared_trans_arrs.accessNumpyBuffer(in_slot)

        # Only the array at index 1 of the input slot is sent to the runner.
        job = runner.execute_async([in_arrs[1]], out_arrs)
        # The job is not waited on here; it is handed to the helper thread.
        pending.put((out_slot, in_slot, job))

        submitted += 1

    pending.put((None, None, None))
    waiter.join()
    elapsed = time.time() - began
    print("FPGA_process: ", float(submitted) / elapsed, "img/s")
Exemple #3
0
def faceDetection(vitis_rundir,outpath, rsz_h, rsz_w, path):
    """Detect faces in every ``*.jpg`` under ``path`` and save annotated copies.

    Images are processed in order of increasing file size.  Each image is
    resized to ``(rsz_w, rsz_h)`` before detection.  For every detection a
    line ``<image path> <r0> <r1> <r2> <r3>`` is printed and a green
    rectangle is drawn on a copy of the resized image; images with at
    least one detection are written to ``outpath`` under their base name.

    Args:
        vitis_rundir: Run directory handed to ``Runner``.
        outpath: Directory for the annotated images; created if missing.
        rsz_h: Height the input images are resized to.
        rsz_w: Width the input images are resized to.
        path: Directory that is searched for ``*.jpg`` files.
    """
    runner = Runner(vitis_rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = 1
    # One float32 buffer per tensor (inputs first, outputs second); the
    # first dimension reported by the runner is replaced by batch_sz.
    fpgaBlobs = [[
        np.empty((batch_sz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                 dtype=np.float32, order='C') for t in io
    ] for io in (inTensors, outTensors)]

    if not os.path.exists(outpath):
        # makedirs also creates missing parent directories.
        os.makedirs(outpath)

    for fn in sorted(glob.glob(path+ '/*.jpg'), key=os.path.getsize):
        filename = os.path.basename(fn)
        src_img = cv2.imread(fn)
        input_img = cv2.resize(src_img, (rsz_w, rsz_h))
        face_rects = detect(runner, fpgaBlobs, input_img)
        if len(face_rects) == 0:
            continue

        dst_img = input_img.copy()
        for face_rect in face_rects:
            print ("{} {} {} {} {}".format(fn, face_rect[0],face_rect[1],face_rect[2],face_rect[3]))
            cv2.rectangle(dst_img,(face_rect[0],face_rect[1]),(face_rect[2],face_rect[3]),(0,255,0),2)
        # Join with a separator so the image lands inside outpath, and write
        # it once after all rectangles have been drawn.
        cv2.imwrite(os.path.join(outpath, filename), dst_img)
Exemple #4
0
 def run(rundir, n, q):
     """Create ``n`` runners, report the input shape on ``q`` and wait.

     Args:
         rundir: Run directory handed to every ``Runner``.
         n: Number of runners to create.
         q: Queue-like object.  The dimensions of the first input tensor
             are put on it, then ``get()`` is called once; the function
             returns after that call completes.
     """
     # The list keeps every runner referenced until the function returns.
     runners = [Runner(rundir) for _ in range(n)]
     first_input = runners[0].get_input_tensors()[0]
     input_shape = [first_input.dims[axis] for axis in range(first_input.ndims)]
     q.put(input_shape)  # ready for work
     q.get()  # wait for exit signal
Exemple #5
0
  def run(rundir, chanIdx, q, args):
    """Serve classification requests received on an xstream channel.

    After the runner is created, ``1`` is put on ``q``.  Each payload is a
    ``(meta, buf)`` pair: ``buf`` is read as float32 data of shape
    ``meta['shape']``, executed on the runner, passed through the CPU
    fully-connected and softmax ops, and the classification is printed
    for ``meta['images']``.  ``"success"`` is then sent to
    ``meta['from']``.  An empty payload ends the loop.  Exceptions raised
    while handling a payload are logged and the loop continues.

    Args:
        rundir: Run directory handed to ``Runner``; ``weights.h5`` is
            loaded from it.
        chanIdx: Channel index, converted with ``chanIdx2Str`` to the
            channel name to subscribe to.
        q: Queue-like object used to signal readiness.
        args: Mapping providing ``'labels'`` and ``'outsz'``.
    """
    xspub = xstream.Publisher()
    xssub = xstream.Subscribe(chanIdx2Str(chanIdx))
    runner = Runner(rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()

    q.put(1) # ready for work

    fpgaBlobs = None
    fcOutput = None
    labels = xdnn_io.get_labels(args['labels'])
    xdnnCPUOp = xdnn.XDNNCPUOp("%s/weights.h5" % rundir)
    while True:
      try:
        payload = xssub.get()
        if not payload:
          break
        (meta, buf) = payload

        if fpgaBlobs is None:
          # Allocate the buffers on the first request, sized by its batch
          # dimension.  Build into temporaries and publish them only once
          # everything succeeded, so a failure here is retried on the next
          # request instead of leaving half-initialised buffers behind.
          batchsz = meta['shape'][0] # inTensors[0].dims[0]
          newBlobs = [
            [np.empty((batchsz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
                      dtype=np.float32, order='C') for t in io]
            for io in (inTensors, outTensors)]
          newFcOutput = np.empty((batchsz, args['outsz'],), dtype=np.float32, order='C')
          fpgaBlobs, fcOutput = newBlobs, newFcOutput

        fpgaInput = fpgaBlobs[0][0]
        assert(tuple(meta['shape']) == fpgaInput.shape)
        data = np.frombuffer(buf, dtype=np.float32).reshape(fpgaInput.shape)
        np.copyto(fpgaInput, data)

        jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
        runner.wait(jid)

        xdnnCPUOp.computeFC(fpgaBlobs[1][0], fcOutput)
        softmaxOut = xdnnCPUOp.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, meta['images'], labels)
        sys.stdout.flush()

        if meta['id'] % 1000 == 0:
          print("Recvd query %d" % meta['id'])
          sys.stdout.flush()

        # Drop the references to the received buffer before replying.
        del data
        del buf
        del payload

        xspub.send(meta['from'], "success")

      except Exception as e:
        # Best effort: one bad request must not stop the worker.
        logging.error("Worker exception " + str(e))
Exemple #6
0
    def __init__(self, rundir, nFPGA, nDispatchers, batchsz=-1):
        """Rewrite ``meta.json``, create the runner and start worker threads.

        ``<rundir>/meta.json`` is loaded, ``num_fpga`` is set to ``nFPGA``,
        any ``publish_id`` / ``subscribe_id`` entries are removed, and the
        file is written back.  The runner, its tensors and the derived
        input/output shapes are stored as class attributes on
        ``Dispatcher``.  ``nDispatchers`` threads running ``self._run`` are
        started and kept in ``self.workers``; they share ``self.q``.

        Args:
            rundir: Run directory containing ``meta.json``.
            nFPGA: Value stored under ``num_fpga`` in ``meta.json``.
            nDispatchers: Number of worker threads; the queue capacity is
                four times this number.
            batchsz: When not ``-1``, replaces the first dimension of both
                the input and the output shape.
        """
        # update meta.json with nFPGA
        meta_path = "%s/meta.json" % rundir
        with open(meta_path) as f:
            meta = json.load(f)
            meta['num_fpga'] = nFPGA
            for stale_key in ('publish_id', 'subscribe_id'):
                meta.pop(stale_key, None)
        with open(meta_path, "w") as f:
            json.dump(meta, f)

        # acquire FPGA
        runner = Runner(rundir)
        inTensors = runner.get_input_tensors()
        outTensors = runner.get_output_tensors()
        first_in = inTensors[0]
        inshape = [first_in.dims[axis] for axis in range(first_in.ndims)]
        first_out = outTensors[0]
        outshape = [first_out.dims[axis] for axis in range(first_out.ndims)]
        if batchsz != -1:
            # update batch size of both shapes
            inshape[0] = outshape[0] = batchsz

        # Stored on the class, not the instance.
        Dispatcher.runner = runner
        Dispatcher.inTensors = inTensors
        Dispatcher.outTensors = outTensors
        Dispatcher.inshape = inshape
        Dispatcher.outshape = outshape

        self.q = Queue(maxsize=nDispatchers * 4)
        self.workers = []
        for _ in range(nDispatchers):
            sys.stdout.flush()
            worker = threading.Thread(target=self._run,
                                      args=(self.q, inshape, outshape))
            self.workers.append(worker)
            worker.start()
Exemple #7
0
def faceDetection(vitis_rundir, outpath, rsz_h, rsz_w, path):
    """Detect faces in every ``*.jpg`` under ``path`` and record the results.

    Images are processed in order of increasing file size.  For every
    detection a line ``<image name> <r0> <r1> <r2> <r3>`` is appended to
    ``<outpath>/output.txt`` (the same values are printed, prefixed with
    the full image path) and a green rectangle is drawn on the image.
    Images with at least one detection are saved to ``outpath`` under
    their base name.

    Args:
        vitis_rundir: Run directory handed to ``Runner``.
        outpath: Directory for ``output.txt`` and the annotated images;
            created if missing.
        rsz_h: Unused here; kept so the signature stays unchanged.
        rsz_w: Unused here; kept so the signature stays unchanged.
        path: Directory that is searched for ``*.jpg`` files.
    """
    runner = Runner(vitis_rundir)
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = 1
    # One float32 buffer per tensor (inputs first, outputs second); the
    # first dimension reported by the runner is replaced by batch_sz.
    fpgaBlobs = [[
        np.empty((batch_sz, ) + tuple(t.dims[i] for i in range(1, t.ndims)),
                 dtype=np.float32,
                 order='C') for t in io
    ] for io in (inTensors, outTensors)]

    if not os.path.exists(outpath):
        # makedirs also creates missing parent directories.
        os.makedirs(outpath)

    # The with-block closes output.txt even if detection raises.
    with open(os.path.join(outpath, "output.txt"), 'w') as fp:
        for fn in sorted(glob.glob(path + '/*.jpg'), key=os.path.getsize):
            filename = os.path.basename(fn)
            image_ori = cv2.imread(fn, cv2.IMREAD_COLOR)
            face_rects = detect(runner, fpgaBlobs, image_ori)
            if len(face_rects) == 0:
                continue

            for face_rect in face_rects:
                fp.write("{} {} {} {} {}\n".format(filename, face_rect[0],
                                                   face_rect[1], face_rect[2],
                                                   face_rect[3]))
                print("{} {} {} {} {}".format(fn, face_rect[0], face_rect[1],
                                              face_rect[2], face_rect[3]))
                cv2.rectangle(image_ori, (face_rect[0], face_rect[1]),
                              (face_rect[2], face_rect[3]), (0, 255, 0), 2)
            # Join with a separator so the image lands inside outpath, and
            # write it once after all rectangles have been drawn.
            cv2.imwrite(os.path.join(outpath, filename), image_ori)
Exemple #8
0
def fpga_process(args, num_img,  compJson, shared_trans_arrs,shared_output_arrs):
    """Submit batches to the runner and report the achieved throughput.

    A helper thread running ``fpga_wait`` receives every submitted
    ``(write_slot, read_slot, job_id)`` triple through a bounded queue; a
    triple of ``None`` values is queued after the last submission.

    Args:
        args: Mapping providing ``'vitis_rundir'`` and ``'perpetual'``.
            When ``'perpetual'`` is truthy the loop only stops once the
            input pool returns ``None``.
        num_img: Number of batches to submit when not running perpetually.
        compJson: Not used by this function.
        shared_trans_arrs: Shared buffer pool supplying the inputs.
        shared_output_arrs: Shared buffer pool receiving the outputs.
    """
    runner = Runner(args['vitis_rundir'])
    pending = mp.Queue(maxsize=100)

    waiter = threading.Thread(target=fpga_wait,
                              args=(runner, pending, shared_output_arrs,
                                    shared_trans_arrs))
    waiter.start()

    submitted = 0
    began = time.time()
    while submitted < num_img or args['perpetual']:
        # Reserve the destination buffers for this job.
        out_slot = shared_output_arrs.openWriteId()
        out_arrs = shared_output_arrs.accessNumpyBuffer(out_slot)

        # Fetch the next input; None means there is nothing left to run.
        in_slot = shared_trans_arrs.openReadId()
        if in_slot is None:
            break
        in_arrs = shared_trans_arrs.accessNumpyBuffer(in_slot)

        # The last array of each slot is copied input -> output unchanged
        # (metadata); all other arrays are passed to the runner.
        out_arrs[-1][:] = in_arrs[-1][:]

        job = runner.execute_async(in_arrs[:-1], out_arrs[:-1])
        # The job is not waited on here; it is handed to the helper thread.
        pending.put((out_slot, in_slot, job))

        submitted += 1
        if args['perpetual'] == False and submitted == num_img:
            break

    pending.put((None, None, None))
    waiter.join()
    elapsed = time.time() - began
    print( "FPGA_process: ", float(submitted), "batch")
    print( "FPGA_process: ", float(submitted)/elapsed, "batch/s")
def main():
  """Classify the images named on the command line, batch by batch.

  Every batch is executed on the runner and passed through the CPU
  fully-connected and softmax ops.  When ``args['golden']`` is set, top-1
  and top-5 hits are counted against the golden map and the average
  accuracy is printed at the end; otherwise the classification of each
  batch is printed.
  """
  args = xdnn_io.processCommandLine()

  runner = Runner(args['vitis_rundir'])
  inTensors = runner.get_input_tensors()
  outTensors = runner.get_output_tensors()
  batch_sz = args['batch_sz']
  if batch_sz == -1:
    # use Runner's suggested batch size
    batch_sz = inTensors[0].dims[0]

  if args['golden']:
    goldenMap = xdnn_io.getGoldenMap(args['golden'])
    top5Count = 0
    top1Count = 0

  # One float32 buffer per tensor (inputs first, outputs second); the first
  # dimension reported by the runner is replaced by batch_sz.
  fpgaBlobs = [
    [np.empty((batch_sz,) + tuple(t.dims[i] for i in range(1, t.ndims)),
              dtype=np.float32, order='C') for t in io]
    for io in (inTensors, outTensors)]

  img_paths = xdnn_io.getFilePaths(args['images'])
  labels = xdnn_io.get_labels(args['labels'])
  xdnnCPUOp = xdnn.XDNNCPUOp("%s/weights.h5" % args['vitis_rundir'])
  fcOutput = np.empty((batch_sz, args['outsz'],), dtype=np.float32, order='C')

  fpgaInput = fpgaBlobs[0][0]
  for i in range(0, len(img_paths), batch_sz):
    # Paths of the images in this batch; the last batch may be shorter, in
    # which case the remaining input rows keep their previous contents.
    pl = img_paths[i:i + batch_sz]
    # fill tensor input data from image file
    for j, p in enumerate(pl):
      img, _ = xdnn_io.loadImageBlobFromFile(p,
        args['img_raw_scale'], args['img_mean'], args['img_input_scale'],
        fpgaInput.shape[2], fpgaInput.shape[3])
      np.copyto(fpgaInput[j], img)

    jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
    runner.wait(jid)

    xdnnCPUOp.computeFC(fpgaBlobs[1][0], fcOutput)
    softmaxOut = xdnnCPUOp.computeSoftmax(fcOutput)
    if args['golden']:
      for j, p in enumerate(pl):
        top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 1)
        top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 5)
    else:
      xdnn_io.printClassification(softmaxOut, list(pl), labels)

  if args['golden']:
    numImages = len(img_paths)
    if numImages:
      print ( ("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n") % (numImages, float(top1Count)/float(numImages)*100., float(top5Count)/float(numImages)*100.) )
    else:
      # Avoid dividing by zero when the image list is empty.
      print("\nNo images were found; accuracy was not computed.\n")
Exemple #10
0
def main(argv):
    """Run the calibration images through ``runInceptionV1`` and print FPS.

    Every file in ``calib_image_dir`` is read and preprocessed, the
    resulting stack is transposed with axes ``(0, 3, 1, 2)``, and
    ``argv[1]`` threads are started, each receiving the runner, the
    transposed images and a start offset of ``index * batchSize``.  The
    globals ``threadnum`` and ``runTotall`` are updated.

    Args:
        argv: Argument list; ``argv[1]`` is the thread count and
            ``argv[2]`` is handed to ``Runner``.
    """
    global threadnum
    global runTotall

    # create runner
    dpu = Runner(argv[2])

    listimage = os.listdir(calib_image_dir)
    threadnum = int(argv[1])
    runTotall = len(listimage)

    # Image list to be run
    img = []
    for name in listimage:
        image = cv2.imread(os.path.join(calib_image_dir, name))
        img.append(input_fn.preprocess_fn(image))
    imgT = np.transpose(img, (0, 3, 1, 2))

    # run with batch
    time1 = time.time()
    threadAll = [
        threading.Thread(target=runInceptionV1,
                         args=(dpu, imgT, idx * batchSize))
        for idx in range(int(threadnum))
    ]
    for worker in threadAll:
        worker.start()
    for worker in threadAll:
        worker.join()
    time2 = time.time()

    fps = float(runTotall / (time2 - time1))
    print("%.2f FPS" % fps)

    del dpu
Exemple #11
0
    def run(rundir, chanIdx, q, args):
        """Serve YOLO detection requests received on an xstream channel.

        After the runner is created, ``1`` is put on ``q``.  Each payload
        is a ``(meta, buf)`` pair: ``buf`` is read as float32 data of shape
        ``meta['shape']``, executed on the runner and post-processed into
        boxes.  Depending on ``args``, the number of boxes is printed and
        the detections (plus an optional visualisation) are saved to
        ``args['results_dir']``.  ``"success"`` is then sent to
        ``meta['from']``.  An empty payload ends the loop.  Exceptions
        raised while handling a payload are logged and the loop continues.

        Args:
            rundir: Run directory handed to ``Runner``.
            chanIdx: Channel index, converted with ``chanIdx2Str`` to the
                channel name to subscribe to.
            q: Queue-like object used to signal readiness.
            args: Mapping providing ``'labels'``, ``'yolo_version'``,
                ``'visualize'``, ``'profile'`` and ``'results_dir'``.

        Raises:
            ValueError: If ``args['yolo_version']`` is neither ``'v2'`` nor
                ``'v3'``.
        """
        xspub = xstream.Publisher()
        xssub = xstream.Subscribe(chanIdx2Str(chanIdx))
        runner = Runner(rundir)
        inTensors = runner.get_input_tensors()
        outTensors = runner.get_output_tensors()

        q.put(1)  # ready for work

        fpgaBlobs = None
        labels = xdnn_io.get_labels(args['labels'])
        if args['yolo_version'] == 'v2':
            yolo_postproc = yolo.yolov2_postproc
        elif args['yolo_version'] == 'v3':
            yolo_postproc = yolo.yolov3_postproc
        else:
            # Raise explicitly: an assert would be stripped under -O and
            # leave yolo_postproc undefined.
            raise ValueError("--yolo_version should be <v2|v3>")

        biases = bias_selector(args)
        if (args['visualize']):
            colors = generate_colors(len(labels))

        while True:
            try:
                payload = xssub.get()
                if not payload:
                    break
                (meta, buf) = payload

                if fpgaBlobs is None:
                    # Allocate the buffers on the first request, sized by
                    # its batch dimension.  Build into a temporary and
                    # publish it only once everything succeeded, so a
                    # failure here is retried on the next request instead
                    # of leaving half-initialised buffers behind.
                    batchsz = meta['shape'][0]  # inTensors[0].dims[0]
                    newBlobs = [[
                        np.empty((batchsz, ) + tuple(
                            t.dims[i] for i in range(1, t.ndims)),
                                 dtype=np.float32,
                                 order='C') for t in io
                    ] for io in (inTensors, outTensors)]
                    fpgaBlobs = newBlobs

                fpgaInput = fpgaBlobs[0][0]
                assert (tuple(meta['shape']) == fpgaInput.shape)
                data = np.frombuffer(buf,
                                     dtype=np.float32).reshape(fpgaInput.shape)
                np.copyto(fpgaInput, data)

                jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
                runner.wait(jid)

                boxes = yolo_postproc(fpgaBlobs[1],
                                      args,
                                      meta['image_shapes'],
                                      biases=biases)

                # A request may carry fewer images than the batch size.
                numImages = min(batchsz, len(meta['image_shapes']))

                if (not args['profile']):
                    for i in range(numImages):
                        print("Detected {} boxes in {}".format(
                            len(boxes[i]), meta['images'][i]),
                              flush=True)

                # Save the result
                if (args['results_dir']):
                    for i in range(numImages):
                        fname = meta['images'][i]
                        filename = os.path.splitext(os.path.basename(fname))[0]
                        out_file_txt = os.path.join(args['results_dir'],
                                                    filename + '.txt')
                        print("Saving {} boxes to {}".format(
                            len(boxes[i]), out_file_txt))
                        sys.stdout.flush()
                        saveDetectionDarknetStyle(out_file_txt, boxes[i],
                                                  meta['image_shapes'][i])

                        if (args['visualize']):
                            out_file_png = os.path.join(
                                args['results_dir'], filename + '.png')
                            print("Saving result to {}".format(out_file_png))
                            sys.stdout.flush()
                            draw_boxes(fname, boxes[i], labels, colors,
                                       out_file_png)

                if meta['id'] % 1000 == 0:
                    print("Recvd query %d" % meta['id'])
                    sys.stdout.flush()

                # Drop the references to the received buffer before replying.
                del data
                del buf
                del payload

                xspub.send(meta['from'], "success")

            except Exception as e:
                # Best effort: one bad request must not stop the worker.
                logging.error("Worker exception " + str(e))
def main():
    """Run YOLO detection over the images named on the command line.

    Images are processed in batches: each batch is loaded, executed on the
    runner and post-processed into boxes.  The number of boxes per image
    is printed; when ``results_dir`` is set the detections (and, with
    ``visualize``, an annotated PNG) are saved there.  With ``profile``
    the average per-image latencies are printed, and with ``golden`` the
    mAP calculation is run on the saved results.

    Raises:
        ValueError: If ``yolo_version`` is neither ``'v2'`` nor ``'v3'``.
    """
    parser = xdnn_io.default_parser_args()
    parser = yolo_parser_args(parser)
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    # Setup the environment
    img_paths = xdnn_io.getFilePaths(args['images'])
    if (args['golden'] or args['visualize']):
        assert args['labels'], "Provide --labels to compute mAP."
        assert args[
            'results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    if args['yolo_version'] == 'v2':
        yolo_postproc = yolo.yolov2_postproc
    elif args['yolo_version'] == 'v3':
        yolo_postproc = yolo.yolov3_postproc
    else:
        # Fail early; otherwise the first batch would run on the device and
        # then die with a NameError on the undefined post-processor.
        raise ValueError("--yolo_version should be <v2|v3>")

    runner = Runner(args['vitis_rundir'])

    # Setup the blobs
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = args['batch_sz']
    if batch_sz == -1:
        # fall back to the first dimension of the first input tensor
        batch_sz = inTensors[0].dims[0]

    # One float32 buffer per tensor (inputs first, outputs second); the
    # first dimension reported by the runner is replaced by batch_sz.
    fpgaBlobs = [[
        np.empty((batch_sz, ) + tuple(t.dims[i] for i in range(1, t.ndims)),
                 dtype=np.float32,
                 order='C') for t in io
    ] for io in (inTensors, outTensors)]
    fpgaInput = fpgaBlobs[0][0]

    # Setup the YOLO config
    net_h, net_w = fpgaInput.shape[-2:]
    args['net_h'] = net_h
    args['net_w'] = net_w
    biases = bias_selector(args)

    # Setup profiling env
    prep_time = 0
    exec_time = 0
    post_time = 0

    # Start the execution
    for start in range(0, len(img_paths), batch_sz):
        pl = []
        img_shapes = []

        # Prep images
        t1 = timeit.default_timer()
        for j, p in enumerate(img_paths[start:start + batch_sz]):
            fpgaInput[j, ...], img_shape = xdnn_io.loadYoloImageBlobFromFile(
                p, net_h, net_w)
            pl.append(p)
            img_shapes.append(img_shape)
        t2 = timeit.default_timer()

        # Execute
        jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
        runner.wait(jid)

        # Post Proc
        t3 = timeit.default_timer()
        boxes = yolo_postproc(fpgaBlobs[1], args, img_shapes, biases=biases)
        t4 = timeit.default_timer()

        prep_time += (t2 - t1)
        exec_time += (t3 - t2)
        post_time += (t4 - t3)

        # pl holds one path per loaded image, so it never exceeds batch_sz
        # and has the same length as img_shapes.
        for k, img_path in enumerate(pl):
            print("Detected {} boxes in {}".format(len(boxes[k]), img_path))

        # Save the result
        if (args['results_dir']):
            for k, img_path in enumerate(pl):
                filename = os.path.splitext(os.path.basename(img_path))[0]
                out_file_txt = os.path.join(args['results_dir'],
                                            filename + '.txt')
                print("Saving {} boxes to {}".format(len(boxes[k]),
                                                     out_file_txt))
                sys.stdout.flush()
                saveDetectionDarknetStyle(out_file_txt, boxes[k],
                                          img_shapes[k])
                if (args['visualize']):
                    out_file_png = os.path.join(args['results_dir'],
                                                filename + '.png')
                    print("Saving result to {}".format(out_file_png))
                    sys.stdout.flush()
                    draw_boxes(img_path, boxes[k], labels, colors,
                               out_file_png)

    # Profiling results
    if (args['profile']):
        # Guard against an empty image list to avoid dividing by zero.
        num_images = len(img_paths) or 1
        print("\nAverage Latency in ms:")
        print("  Image Prep: {0:3f}".format(prep_time * 1000.0 / num_images))
        print("  Exec: {0:3f}".format(exec_time * 1000.0 / num_images))
        print("  Post Proc: {0:3f}".format(post_time * 1000.0 / num_images))
        sys.stdout.flush()

    # mAP calculation
    if (args['golden']):
        print()
        print("Computing mAP score  : ")
        print("Class names are  : {} ".format(labels))
        calc_detector_mAP(args['results_dir'], args['golden'],
                          len(labels), labels, args['prob_threshold'],
                          args['mapiouthresh'], args['points'])
        sys.stdout.flush()