def main(): """ Attach to DPU driver and prepare for running """ n2cube.dpuOpen() """ Create DPU Kernels for CONV NODE in imniResNet """ kernel = n2cube.dpuLoadKernel(KERNEL_CONV) """ Create DPU Tasks for CONV NODE in miniResNet """ task = n2cube.dpuCreateTask(kernel, 0) listimage = os.listdir(calib_image_dir) for i in range(len(listimage)): path = os.path.join(calib_image_dir, listimage[i]) if os.path.splitext(path)[1] != ".png": continue print("Loading %s" %listimage[i]) """ Load image and Set image into CONV Task """ imageRun=graph_input_fn.calib_input(path) imageRun=imageRun.reshape((imageRun.shape[0]*imageRun.shape[1]*imageRun.shape[2])) input_len=len(imageRun) n2cube.dpuSetInputTensorInHWCFP32(task,CONV_INPUT_NODE,imageRun,input_len) """ Launch miniRetNet task """ n2cube.dpuRunTask(task) """ Get output tensor address of CONV """ conf = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE) """ Get output channel of CONV """ channel = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE) """ Get output size of CONV """ size = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE) softmax = [0 for i in range(size)] """ Get output scale of CONV """ scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE) batchSize=size//channel """ Calculate softmax and show TOP5 classification result """ n2cube.dpuRunSoftmax(conf, softmax, channel, batchSize, scale) TopK(softmax, calib_image_list) """ Destroy DPU Tasks & free resources """ n2cube.dpuDestroyTask(task) """ Destroy DPU Kernels & free resources """ rtn = n2cube.dpuDestroyKernel(kernel) """ Dettach from DPU driver & free resources """ n2cube.dpuClose()
def run(self):
    overlay = DpuOverlay("./bitstream/dpu.bit")
    overlay.load_model("./model/dpu_tf_efficientnet.elf")
    cv2.setUseOptimized(True)
    cv2.setNumThreads(4)
    threadnum = 4
    num_iterations = 0
    listimage = [[] for _ in range(threadnum)]
    result = [[] for _ in range(threadnum)]
    img_processed = [[] for _ in range(threadnum)]
    cnt = 0
    thread = 0
    list_image = sorted([i for i in os.listdir(image_folder) if i.endswith("JPEG")])
    picture_num = len(list_image)

    # Distribute the images evenly across the worker threads
    for i in list_image:
        listimage[thread].append(i)
        if cnt % math.ceil(picture_num / threadnum) == 0 and cnt != 0:
            thread = thread + 1
        cnt = cnt + 1

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
    threadAll = []
    for i in range(threadnum):
        t1 = threading.Thread(target=self.run_dpu_task,
                              args=(kernel, i, len(listimage[i]), listimage, result))
        threadAll.append(t1)
    for x in threadAll:
        x.start()
    for x in threadAll:
        x.join()

    # Run the whole data set and write the outputs to the result file.
    # See README and "classification_result.sample" for the result file format.
    with open(RESULT_FILE, 'w') as result_file:
        for item in result:
            for i in item:
                result_file.write("%s\n" % i)

    rtn = n2cube.dpuDestroyKernel(kernel)
    n2cube.dpuClose()
    return
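# `self.run_dpu_task` is not shown above. A per-thread worker sketch: each
# thread creates its own task from the shared kernel. The node names
# (INPUT_NODE/OUTPUT_NODE), the 224x224 / 255.0 preprocessing and the result
# line format are assumptions, not taken from the original source:
def run_dpu_task(self, kernel, index, count, listimage, result):
    task = n2cube.dpuCreateTask(kernel, 0)  # one DPU task per thread
    input_size = n2cube.dpuGetInputTensorSize(task, INPUT_NODE)
    out_ch = n2cube.dpuGetOutputTensorChannel(task, OUTPUT_NODE)
    out_size = n2cube.dpuGetOutputTensorSize(task, OUTPUT_NODE)
    out_addr = n2cube.dpuGetOutputTensorAddress(task, OUTPUT_NODE)
    out_scale = n2cube.dpuGetOutputTensorScale(task, OUTPUT_NODE)
    for name in listimage[index][:count]:
        img = cv2.imread(os.path.join(image_folder, name))
        img = cv2.resize(img, (224, 224)).astype(np.float32) / 255.0
        n2cube.dpuSetInputTensorInHWCFP32(task, INPUT_NODE, img.reshape(-1), input_size)
        n2cube.dpuRunTask(task)
        softmax = n2cube.dpuRunSoftmax(out_addr, out_ch, out_size // out_ch, out_scale)
        result[index].append("%s %d" % (name, np.argmax(softmax)))
    n2cube.dpuDestroyTask(task)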
def main(argv): """Attach to DPU driver and prepare for runing""" n2cube.dpuOpen() """Create DPU Kernels for GoogLeNet""" kernel = n2cube.dpuLoadKernel(KERNEL_CONV) image_path = "./../common/image_224_224/" listimage = os.listdir(image_path) path = os.path.join(image_path, listimage[0]) print("Loading %s" %listimage[0]) img = cv2.imread(path) threadAll = [] global threadnum threadnum = int(argv[1]) print("Input thread number is: %d" %threadnum) time1 = time.time() for i in range(int(threadnum)): t1 = threading.Thread(target=RunDPU, args=(kernel, img, i)) threadAll.append(t1) for x in threadAll: x.start() for x in threadAll: x.join() time2 = time.time() timetotal = time2 - time1 fps = float(1000 / timetotal) print("%.2f FPS" %fps) """Destroy DPU Tasks & free resources""" rtn = n2cube.dpuDestroyKernel(kernel) """Dettach from DPU driver & release resources""" n2cube.dpuClose()
def accel_fused(kernel_name, input_name, output_name, layout, out, *ins):
    # Attach to DPU driver and prepare for running
    n2cube.dpuOpen()

    # Create DPU Kernel
    kernel = n2cube.dpuLoadKernel(kernel_name)

    # Create DPU Task for the kernel
    task = n2cube.dpuCreateTask(kernel, 0)

    # Load image to DPU
    X = ins[0].asnumpy().reshape((-1))
    n2cube.dpuSetInputTensorInHWCFP32(task, input_name, X, len(X))

    # Run the model on the DPU
    n2cube.dpuRunTask(task)

    # Get the output tensor size
    size = n2cube.dpuGetOutputTensorSize(task, output_name)
    address = n2cube.dpuGetOutputTensorAddress(task, output_name)
    value = [0 for i in range(size)]

    # Get the output tensor data and rescale it to float
    n2cube.dpuGetTensorData(address, value, size)
    scale = n2cube.dpuGetOutputTensorScale(task, output_name, idx=0)
    value = np.array(value).astype(np.float32) * float(scale)

    value_shape = tuple(out.shape) if layout == 'NHWC' else \
        (out.shape[0], out.shape[2], out.shape[3], out.shape[1])
    value = np.reshape(value, value_shape)

    # DPU output is in NHWC
    if layout == 'NCHW':
        value = np.transpose(value, (0, 3, 1, 2))
    tvm.nd.array(value).copyto(out)
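# A hedged usage sketch for `accel_fused`: `out` and `ins` are TVM NDArrays,
# and the kernel/node names below are illustrative placeholders, not values
# from the original source:
import numpy as np
import tvm

x = tvm.nd.array(np.zeros((1, 224, 224, 3), dtype=np.float32))  # input image
y = tvm.nd.array(np.zeros((1, 1000), dtype=np.float32))         # output buffer
accel_fused("resnet50_0", "input_node", "output_node", "NHWC", y, x)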
def main(): print("STARTING UNETv2 on DPU...") if USE_DPU: # Attach to DPU driver n2cube.dpuOpen() # Load DPU Kernel and create a task kernel = n2cube.dpuLoadKernel(KERNEL_CONV) task = n2cube.dpuCreateTask(kernel, 0) # load and preprocess images and load segmentation labels assert os.path.isdir(IMG_TEST_DIR) #print(IMG_TEST_DIR) x_test, y_test, img_file, seg_file = dpu_get_data(IMG_TEST_DIR, SEG_TEST_DIR, cfg.NUM_CLASSES, cfg.WIDTH, cfg.HEIGHT) y_pred = [] # process all images for i in range(len(x_test)): # opened image as BGR, convert it to RGB #B,G,R = cv2.split(x_test[i]) #imageRun = cv2.merge((R,G,B)) imageRun = x_test[i] imageRun = imageRun.reshape( (imageRun.shape[0] * imageRun.shape[1] * imageRun.shape[2])) input_len = len(imageRun) if USE_DPU: # load pre-processed image as DPU input n2cube.dpuSetInputTensorInHWCFP32(task, CONV_INPUT_NODE, imageRun, input_len) dpu_in = n2cube.dpuGetInputTensor(task, CONV_INPUT_NODE) ti_scale = n2cube.dpuGetTensorScale(dpu_in) ti_h = n2cube.dpuGetTensorHeight(dpu_in) ti_w = n2cube.dpuGetTensorWidth(dpu_in) ti_sz = n2cube.dpuGetTensorSize(dpu_in) ti_ch = n2cube.dpuGetTensorChannel(dpu_in) if (i == 0): print( "Input tensor=%3d ch=%3d H=%3d W=%3d Size=%6d scale=%4d" % (i, ti_ch, ti_h, ti_w, ti_sz, ti_scale)) # run DPU task n2cube.dpuRunTask(task) # get output tensor address dpu_out = n2cube.dpuGetOutputTensorAddress(task, CONV_OUTPUT_NODE) # get number of channels in output tensor to_ch = n2cube.dpuGetOutputTensorChannel(task, CONV_OUTPUT_NODE) # get size in bytes of output tensor to_sz = n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE) # get width output tensor to_w = n2cube.dpuGetOutputTensorWidth(task, CONV_OUTPUT_NODE) # get height output tensor to_h = n2cube.dpuGetOutputTensorHeight(task, CONV_OUTPUT_NODE) # get output tensor scale to_scale = n2cube.dpuGetOutputTensorScale(task, CONV_OUTPUT_NODE) softmax = np.zeros(to_sz, dtype=np.float32) if (i == 0): print("Output tensor=%3d ch=%3d H=%3d W=%3d Size=%6d" % (i, to_ch, to_h, to_w, to_sz)) print("Output tensor scaling factor", to_scale) softmax = n2cube.dpuRunSoftmax(dpu_out, to_ch, to_sz // to_ch, to_scale) prediction = softmax.reshape((to_h, to_w, to_ch)) y_pred.append(prediction) if (i == 0): print("prediction shape: ", prediction.shape) # Calculate intersection over union for each segmentation class y_pred = np.asarray(y_pred) y_test = np.asarray(y_test) print("y_pred shape: ", y_pred.shape) print("y_test shape: ", y_test.shape) y_predi = np.argmax(y_pred, axis=3) y_testi = np.argmax(y_test, axis=3) print("shape of y_testi and y_predi ", y_testi.shape, y_predi.shape) dpu_IoU(y_testi, y_predi) # print results print("Processed", len(x_test), "images") print("FINISHED") if USE_DPU: # Destroy DPU Kernel & detach n2cube.dpuDestroyKernel(kernel) n2cube.dpuClose()
def runDPU(preedQueue, dpuresQueue, threadnum):
    KERNEL_CONV = "testnet"
    CONV_INPUT_NODE = "ssd_mobilenet_v2_conv2d_conv2d_conv2d_Conv2D"
    CONV_OUTPUT_NODE = [
        "ssd_mobilenet_v2_block8_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block7_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block6_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block5_box_conv_cls_2_conv_cls_2_Conv2D",
        "ssd_mobilenet_v2_block8_box_conv_loc_2_conv_loc_2_Conv2D",
        "ssd_mobilenet_v2_block7_box_conv_loc_2_conv_loc_2_Conv2D",
        "ssd_mobilenet_v2_block6_box_conv_loc_2_conv_loc_2_Conv2D",
        "ssd_mobilenet_v2_block5_box_conv_loc_2_conv_loc_2_Conv2D",
    ]
    reshapsize = [
        (1, 56, 96, 6, 10),
        (1, 28, 48, 6, 10),
        (1, 14, 24, 6, 10),
        (1, 7, 12, 6, 10),
        (1, 56, 96, 6, 4),
        (1, 28, 48, 6, 4),
        (1, 14, 24, 6, 4),
        (1, 7, 12, 6, 4),
    ]

    """ Attach to DPU driver and prepare for running """
    n2cube.dpuOpen()

    """ Create DPU Kernel for the SSD-MobileNetV2 network """
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)

    """ Create DPU Task """
    task = n2cube.dpuCreateTask(kernel, 0)

    conv_sbbox_size = []
    for i in range(8):
        conv_sbbox_size.append(n2cube.dpuGetOutputTensorSize(task, CONV_OUTPUT_NODE[i]))
        print("outputdata %d is %d" % (i, conv_sbbox_size[i]))
    print("finish set dpu")

    tdpu = 0
    tall1 = time.time()
    for j in range(runtimes):
        listin = preedQueue.get()
        orgimg = listin[0]
        imgo = listin[1]
        tpost1 = time.time()
        n2cube.dpuSetInputTensorInHWCInt8(task, CONV_INPUT_NODE, imgo, 1032192)  # 448*768*3

        """ Launch the DPU task """
        n2cube.dpuRunTask(task)

        outputData = [orgimg]
        for i in range(8):
            conv_out = n2cube.dpuGetOutputTensorInHWCInt8(task, CONV_OUTPUT_NODE[i],
                                                          conv_sbbox_size[i])
            conv_out = np.reshape(conv_out, reshapsize[i])
            outputData.append(conv_out)
        tpost2 = time.time()
        tdpu = tdpu + (tpost2 - tpost1)
        dpuresQueue.put(outputData)
    tall2 = time.time()
    print("total dpu wall time:", (tall2 - tall1))
    print("total dpu compute time:", tdpu)
    n2cube.dpuDestroyTask(task)
    return
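# A hedged wiring sketch for the queue-based pipeline above: a producer fills
# `preedQueue` with [original_image, preprocessed_int8_image] pairs and a
# consumer drains `dpuresQueue`. `runtimes` must equal the number of items the
# producer enqueues; `frames` and `preprocess` are hypothetical placeholders:
from queue import Queue
from threading import Thread

preedQueue = Queue(maxsize=8)
dpuresQueue = Queue(maxsize=8)
t = Thread(target=runDPU, args=(preedQueue, dpuresQueue, 1))
t.start()
for frame in frames:
    img8 = preprocess(frame)  # hypothetical: resize to 448x768 and quantize to int8
    preedQueue.put([frame, img8])
t.join()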
"""DPU Kernel Name for tf_yolov3_voc""" KERNEL_CONV="tf_yolov3" """DPU IN/OUT Name for tf_yolov3_voc""" CONV_INPUT_NODE="conv2d_1_convolution" CONV_OUTPUT_NODE1="conv2d_59_convolution" CONV_OUTPUT_NODE2="conv2d_67_convolution" CONV_OUTPUT_NODE3="conv2d_75_convolution" if __name__ == "__main__": """ Attach to DPU driver and prepare for running """ n2cube.dpuOpen() """ Create DPU Kernels for tf_yolov3_voc """ kernel = n2cube.dpuLoadKernel(KERNEL_CONV) """ Create DPU Tasks for tf_yolov3_voc """ task = n2cube.dpuCreateTask(kernel, 0) image_folder = "/home/xilinx/jupyter_notebooks/Cityscapes/JPEGImages/" """Load image to DPU""" # listimage = [i for i in os.listdir(image_folder) if i.endswith("jpg")] # listimage.sort() # print("Loading image...") cap = cv2.VideoCapture(0) print("\nYou can press ESC to quit") # imagenumber = len(listimage) # print("\nimagenumber = %d\n"%imagenumber) cv2.namedWindow("Display", cv2.WINDOW_AUTOSIZE) cv2.moveWindow("Display",50,50) # cv2.resizeWindow("Display", 1024, 512)
def main():
    # UI: DPU
    ui = UI()
    ui.update_boot_window('Initializing DPU...')

    from dnndk import n2cube
    from pynq_dpu import DpuOverlay

    # Set up the DPU IP
    overlay = DpuOverlay(str(fh.dir_dpu / fh.dpu_bit_file))
    overlay.load_model(str(fh.dir_dpu / fh.dpu_assembly_file))

    # Set up the Neural Network Runtime (N2Cube)
    kernel_name = fh.kernel_name
    kernel_conv_input = fh.kernel_conv_input
    kernel_fc_output = fh.kernel_fc_output

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(kernel_name)
    task = n2cube.dpuCreateTask(kernel, 0)

    input_tensor_size = n2cube.dpuGetInputTensorSize(task, kernel_conv_input)
    output_tensor_size = n2cube.dpuGetOutputTensorSize(task, kernel_fc_output)
    output_tensor_channel = n2cube.dpuGetOutputTensorChannel(task, kernel_fc_output)
    output_tensor_address = n2cube.dpuGetOutputTensorAddress(task, kernel_fc_output)
    output_tensor_scale = n2cube.dpuGetOutputTensorScale(task, kernel_fc_output)

    # UI: Camera
    ui.update_boot_window('Initializing Camera...')

    # libcamera
    libcamera = ctypes.CDLL(fh.dir_cam / fh.libcamera_file)

    # Getters
    libcamera.get_frame_ptr.restype = ctypes.POINTER(ctypes.c_ubyte)
    libcamera.get_frame_ptr.argtypes = [ctypes.c_uint]
    libcamera.get_throw_bgn_idx.restype = ctypes.c_uint
    libcamera.get_throw_bgn_idx.argtypes = None
    libcamera.get_throw_end_idx.restype = ctypes.c_uint
    libcamera.get_throw_end_idx.argtypes = None
    libcamera.get_throw_bgn.restype = ctypes.c_bool
    libcamera.get_throw_bgn.argtypes = None
    libcamera.get_throw_end.restype = ctypes.c_bool
    libcamera.get_throw_end.argtypes = None

    # Setters
    libcamera.set_frame_rate.restype = None
    libcamera.set_frame_rate.argtypes = [ctypes.c_double]
    libcamera.set_buff_size.restype = None
    libcamera.set_buff_size.argtypes = [ctypes.c_uint]
    libcamera.set_exposure_time.restype = None
    libcamera.set_exposure_time.argtypes = [ctypes.c_double]
    libcamera.set_camera_gain.restype = None
    libcamera.set_camera_gain.argtypes = [ctypes.c_double]
    libcamera.set_avg_diffs.restype = None
    libcamera.set_avg_diffs.argtypes = [ctypes.c_uint]
    libcamera.set_threshold_mult.restype = None
    libcamera.set_threshold_mult.argtypes = [ctypes.c_double]
    libcamera.set_frames_to_acquire.restype = None
    libcamera.set_frames_to_acquire.argtypes = [ctypes.c_uint]

    # Camera control
    libcamera.initialize.restype = ctypes.c_int
    libcamera.initialize.argtypes = None
    libcamera.reset_global_variables.restype = None
    libcamera.reset_global_variables.argtypes = None
    libcamera.start_acquisition.restype = ctypes.c_int
    libcamera.start_acquisition.argtypes = None
    libcamera.terminate.restype = ctypes.c_int
    libcamera.terminate.argtypes = None

    # Set the global variables according to the module `fhnwtoys.settings`
    libcamera.set_frame_rate(fh.frame_rate)
    libcamera.set_buff_size(fh.buff_size)
    libcamera.set_exposure_time(fh.exposure_time)
    libcamera.set_camera_gain(fh.camera_gain)
    libcamera.set_avg_diffs(fh.avg_diffs)
    libcamera.set_threshold_mult(fh.threshold_mult)
    libcamera.set_frames_to_acquire(fh.frames_to_acquire)

    # Initialize the camera (retry until it responds, prompting a replug on failure)
    initialize = fh.ReturnCodes.NOT_INITIALIZED
    initialization_tries = 0
    while initialize != fh.ReturnCodes.SUCCESS:
        if initialization_tries > 0:
            try:
                return_code = fh.ReturnCodes(initialize).name
            except ValueError:
                return_code = initialize
            ui.update_boot_window(f'Camera Error ({return_code}), try to replug the camera.')
        initialize = libcamera.initialize()
        initialization_tries += 1

    # UI: Ready
    ui.update_boot_window('READY')

    # Set up the `frames` array
    frames = np.empty((fh.frames_to_consider,) + fh.bgr_shape, dtype=np.uint8)

    while True:
        # Reset the predictions
        predictions = np.zeros((fh.frames_to_consider, fh.num_objects), dtype=np.float32)

        # Start the acquisition in a separate thread (a thread, not a process, because of ctypes)
        # todo: error handling ('Unexpected Error, system reboot required.')
        # start_acquisition = libcamera.start_acquisition()  # non-threaded approach
        t = Thread(target=libcamera.start_acquisition)  # threaded approach
        t.start()

        # Wait until the throw has ended (the Ultra96-V2 is not powerful enough
        # to process the data during the acquisition)
        while not libcamera.get_throw_end():
            pass

        throw_bgn_idx = libcamera.get_throw_bgn_idx()
        throw_end_idx = libcamera.get_throw_end_idx()

        num_frames = throw_end_idx - throw_bgn_idx - 1  # ignore the last two captured frames

        # Image processing (including inference)
        for idx, frame_id in enumerate(range(throw_bgn_idx, throw_end_idx - 1)):
            frame_ptr = libcamera.get_frame_ptr(frame_id)
            raw_frame = np.ctypeslib.as_array(frame_ptr, shape=fh.raw_shape)  # raw Baumer BayerRG8 frame

            # Transform Baumer BayerRG8 to BGR8 (Baumer BayerRG = OpenCV BayerBG)
            frames[idx] = cv2.cvtColor(raw_frame, cv2.COLOR_BayerBG2BGR)  # color space conversion

            # Image scaling using nearest-neighbor interpolation
            frame_resized = cv2.resize(frames[idx], fh.inf_dsize,
                                       interpolation=fh.Interpolation.NEAREST)
            frame_inference = frame_resized.astype(np.float32) / 255.0  # normalization (float32 precision)

            # Inference
            n2cube.dpuSetInputTensorInHWCFP32(task, kernel_conv_input, frame_inference,
                                              input_tensor_size)
            n2cube.dpuRunTask(task)

            # Softmax function (normalized exponential function)
            # Confident predictions lead to all zeros and a NaN when run through
            # `n2cube.dpuRunSoftmax(.)`. This section replaces the first occurrence of
            # NaN in the `prediction` array with 1.0 and sets everything else to 0.0
            prediction = n2cube.dpuRunSoftmax(output_tensor_address, output_tensor_channel,
                                              output_tensor_size // output_tensor_channel,
                                              output_tensor_scale)
            nan = np.isnan(prediction)
            if nan.any():
                nan_idx = nan.argmax()  # index of the first occurrence of NaN
                prediction = np.zeros((fh.num_objects,), dtype=np.float32)
                prediction[nan_idx] = 1.0
            predictions[idx] = prediction

            # Only consider `fh.frames_to_consider` frames
            if idx == fh.frames_to_consider - 1:  # (-1: idx starts at 0)
                break

        num_frames_considered = min(fh.frames_to_consider, num_frames)

        window = sine_squared_window(num_frames, num_frames_considered)  # weighting function
        weighted_prediction = np.matmul(window, predictions) / np.sum(window)  # weighted prediction

        # UI: Prepare data for the UI
        weighted_prediction_percent = weighted_prediction * 100
        weighted_prediction_sorted = np.sort(weighted_prediction_percent)[::-1]
        weighted_prediction_argsorted = np.argsort(weighted_prediction_percent)[::-1]

        # Index of the best guess (computed by weighting the `fh.frames_to_consider` frames)
        guess_idx = weighted_prediction_argsorted[0]

        relevant_pct_ui = np.asarray(weighted_prediction_percent >= 1.0).nonzero()[0]  # at least 1.0%
        relevant_pct_ui_len = len(relevant_pct_ui)
        predictions_ui_len = min(4, relevant_pct_ui_len)  # show at most the Top 4

        predictions_ui = []  # the object names
        percentages_ui = np.empty((predictions_ui_len + 1,), dtype=np.float32)  # the percentages (+1: 'Others')

        for i, w in enumerate(weighted_prediction_argsorted[0:predictions_ui_len]):
            predictions_ui.append(fh.objects_ui[w])
            percentages_ui[i] = weighted_prediction_percent[w]

        # the object names
        predictions_ui.append('Others')

        # the percentages
        percentages_ui[-1] = np.sum(weighted_prediction_sorted[predictions_ui_len:])
        percentages_ui = lrm_round(percentages_ui)

        # The frame shown in the UI: the frame where the weighted best guess peaks
        weighted_guesses = np.multiply(window, predictions[:, guess_idx])
        frame_ui_idx = weighted_guesses.argmax()

        frame_ui_resized = cv2.resize(frames[frame_ui_idx], fh.ui_dsize,
                                      interpolation=fh.Interpolation.NEAREST)
        _, frame_ui_png = cv2.imencode('.png', frame_ui_resized)
        frame_ui = frame_ui_png.tobytes()  # the frame

        # UI: Show results
        if percentages_ui[-1] == 0.0:
            predictions_ui = predictions_ui[:-1]
            percentages_ui = percentages_ui[:-1]

        # UI: Inference
        ui.update_inference_window(predictions_ui, percentages_ui, frame_ui)

        # Wait until the camera thread (thread due to ctypes) is terminated
        t.join()

        # Reset the global variables (has to be done manually to avoid race conditions)
        libcamera.reset_global_variables()

    # Under regular circumstances, this section should never be reached

    # Terminate Camera
    terminate = libcamera.terminate()

    # Clean up the DPU IP (destroy the task before the kernel)
    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
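# `sine_squared_window` is not shown above. A plausible sketch inferred from
# its name and call site (the exact shape of the authors' window is an
# assumption): it weights the considered frames with a sin^2 bump and pads
# with zeros so the matmul over all `fh.frames_to_consider` rows stays valid.
def sine_squared_window(num_frames, num_frames_considered):
    window = np.zeros((fh.frames_to_consider,), dtype=np.float32)
    n = np.arange(num_frames_considered)
    window[:num_frames_considered] = np.sin(np.pi * (n + 1) / (num_frames_considered + 1)) ** 2
    return window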
def run(image_folder, shortsize, KERNEL_CONV, KERNEL_CONV_INPUT, KERNEL_FC_OUTPUT, inputscale):
    start = time.time()
    # listimage = [i for i in os.listdir(image_folder) if i.endswith("JPEG")]
    listimage = [i for i in os.listdir(image_folder) if i.endswith("jpg")]
    listimage.sort()
    fo = open(resultname, "w")

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
    task = n2cube.dpuCreateTask(kernel, 0)

    height, width, inputchannel, mean = parameter(task, KERNEL_CONV_INPUT)
    outsize = n2cube.dpuGetOutputTensorSize(task, KERNEL_FC_OUTPUT)
    outputchannel = n2cube.dpuGetOutputTensorChannel(task, KERNEL_FC_OUTPUT)
    conf = n2cube.dpuGetOutputTensorAddress(task, KERNEL_FC_OUTPUT)
    # The passed-in `inputscale` is overridden by the scale reported by the DPU
    inputscale = n2cube.dpuGetInputTensorScale(task, KERNEL_CONV_INPUT)
    outputscale = n2cube.dpuGetOutputTensorScale(task, KERNEL_FC_OUTPUT)

    imagenumber = len(listimage)
    print("\nimagenumber = %d\n" % imagenumber)

    correct = 0
    wrong = 0
    for i in range(imagenumber):
        print(f"i = {i+1}")
        print(listimage[i])
        path = image_folder + listimage[i]
        img = cv2.imread(path)
        imageRun = predict_label(img, task, inputscale, mean, height, width,
                                 inputchannel, shortsize, KERNEL_CONV_INPUT)
        input_len = len(imageRun)
        softmax, listimage[i] = run_dpu_task(outsize, task, outputchannel, conf,
                                             outputscale, listimage[i], imageRun,
                                             KERNEL_CONV_INPUT, KERNEL_FC_OUTPUT)
        correct, wrong = TopK(softmax, listimage[i], fo, correct, wrong)
        print("")
    fo.close()

    accuracy = correct / imagenumber
    print('Correct:', correct, ' Wrong:', wrong, ' Accuracy:', accuracy)

    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
    n2cube.dpuClose()
    print("")

    end = time.time()
    total_time = end - start
    print('\nAll processing time: {} seconds.'.format(total_time))
    print('\n{} ms per frame\n'.format(1000 * total_time / imagenumber))
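# `parameter(.)` is a project helper that is not shown. A sketch consistent
# with its call site, built from the input-tensor getters used elsewhere in
# this collection; the per-channel mean values are an assumption:
def parameter(task, input_node):
    tensor = n2cube.dpuGetInputTensor(task, input_node)
    height = n2cube.dpuGetTensorHeight(tensor)
    width = n2cube.dpuGetTensorWidth(tensor)
    channel = n2cube.dpuGetTensorChannel(tensor)
    mean = [104.0, 117.0, 123.0]  # assumed model-specific BGR mean
    return height, width, channel, mean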
def __init__(self):
    self.model = n2cube.dpuOpen()  # dpuOpen() returns a status code
    self.kernel = n2cube.dpuLoadKernel(KERNEL_CONV)
    self.colors = np.random.uniform(0, 255, size=(len(CLASS_NAMES), 3))
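# A hedged companion for the class above: `dpuLoadKernel` acquires DPU
# resources that `__init__` never releases, so a clean-up method along these
# lines is plausible (hypothetical, not in the original source):
    def close(self):
        n2cube.dpuDestroyKernel(self.kernel)
        n2cube.dpuClose()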
def main():
    # Set up the DPU IP
    overlay = DpuOverlay(str(fh.dir_dpu / fh.dpu_bit_file))
    overlay.load_model(str(fh.dir_dpu / fh.dpu_assembly_file))

    # Set up the Neural Network Runtime (N2Cube)
    kernel_name = fh.kernel_name
    kernel_conv_input = fh.kernel_conv_input
    kernel_fc_output = fh.kernel_fc_output

    n2cube.dpuOpen()
    kernel = n2cube.dpuLoadKernel(kernel_name)
    task = n2cube.dpuCreateTask(kernel, 0)

    input_tensor_size = n2cube.dpuGetInputTensorSize(task, kernel_conv_input)
    output_tensor_size = n2cube.dpuGetOutputTensorSize(task, kernel_fc_output)
    output_tensor_channel = n2cube.dpuGetOutputTensorChannel(task, kernel_fc_output)
    output_tensor_address = n2cube.dpuGetOutputTensorAddress(task, kernel_fc_output)
    output_tensor_scale = n2cube.dpuGetOutputTensorScale(task, kernel_fc_output)

    # libcamera
    libcamera = ctypes.CDLL(fh.dir_cam / fh.libcamera_file)

    libcamera.get_frame_ptr.restype = ctypes.POINTER(ctypes.c_ubyte)
    libcamera.get_throw_bgn_idx.restype = ctypes.c_uint
    libcamera.get_throw_end_idx.restype = ctypes.c_uint
    libcamera.get_throw_bgn.restype = ctypes.c_bool
    libcamera.get_throw_end.restype = ctypes.c_bool
    libcamera.set_frame_rate.restype = None
    libcamera.set_buff_size.restype = None
    libcamera.set_exposure_time.restype = None
    libcamera.set_camera_gain.restype = None
    libcamera.set_avg_diffs.restype = None
    libcamera.set_threshold_mult.restype = None
    libcamera.set_frames_to_acquire.restype = None
    libcamera.initialize.restype = ctypes.c_int
    libcamera.start_acquisition.restype = ctypes.c_int
    libcamera.terminate.restype = ctypes.c_int

    # Set up of variables
    frames = np.empty((fh.frames_to_consider,) + fh.bgr_shape, dtype=np.uint8)

    # Initialize Camera
    initialize = libcamera.initialize()
    if initialize != fh.ReturnCodes.SUCCESS:
        try:
            return_code = fh.ReturnCodes(initialize).name
        except ValueError:
            return_code = initialize
        print(f'Initialization failed: {return_code}')
        sys.exit()
    else:
        print('================================= READY =================================')

    # Reset predictions
    predictions = np.zeros((fh.frames_to_consider, fh.num_objects), dtype=np.float32)

    # Start acquisition (threaded)
    t = Thread(target=libcamera.start_acquisition)
    t.start()

    # Wait until the throw has ended
    while not libcamera.get_throw_end():
        pass

    stages = ['Get raw bayer', 'Transform color', 'Resize', 'Normalize',
              'Run inference', 'Softmax', 'Weighting']
    meas_time = {s: get_dict() for s in stages}

    throw_bgn_idx = libcamera.get_throw_bgn_idx()
    throw_end_idx = libcamera.get_throw_end_idx()

    num_frames = throw_end_idx - throw_bgn_idx - 1  # ignore the last two captured frames

    for idx, frame_id in enumerate(range(throw_bgn_idx, throw_end_idx - 1)):
        meas_time['Get raw bayer']['start'].append(datetime.now())
        frame_ptr = libcamera.get_frame_ptr(frame_id)
        raw_frame = np.ctypeslib.as_array(frame_ptr, shape=fh.raw_shape)
        meas_time['Get raw bayer']['end'].append(datetime.now())

        # Transform Baumer BayerRG8 to BGR8 (Baumer BayerRG ≙ OpenCV BayerBG)
        meas_time['Transform color']['start'].append(datetime.now())
        frames[idx] = cv2.cvtColor(raw_frame, cv2.COLOR_BayerBG2BGR)
        meas_time['Transform color']['end'].append(datetime.now())

        meas_time['Resize']['start'].append(datetime.now())
        frame_resized = cv2.resize(frames[idx], fh.inf_dsize,
                                   interpolation=fh.Interpolation.NEAREST)
        meas_time['Resize']['end'].append(datetime.now())

        meas_time['Normalize']['start'].append(datetime.now())
        frame_inference = frame_resized.astype(np.float32) / 255.0
        meas_time['Normalize']['end'].append(datetime.now())

        meas_time['Run inference']['start'].append(datetime.now())
        n2cube.dpuSetInputTensorInHWCFP32(task, kernel_conv_input, frame_inference,
                                          input_tensor_size)
        n2cube.dpuRunTask(task)
        meas_time['Run inference']['end'].append(datetime.now())

        # n2cube.dpuRunSoftmax(.) sometimes returns all zeros except one NaN
        # This section replaces the first occurrence of NaN in the prediction array
        # with 1.0 and sets everything else to 0.0
        meas_time['Softmax']['start'].append(datetime.now())
        prediction = n2cube.dpuRunSoftmax(output_tensor_address, output_tensor_channel,
                                          output_tensor_size // output_tensor_channel,
                                          output_tensor_scale)
        nan = np.isnan(prediction)
        if nan.any():
            nan_idx = nan.argmax()  # index of the first occurrence of NaN
            prediction = np.zeros((fh.num_objects,), dtype=np.float32)
            prediction[nan_idx] = 1.0
        predictions[idx] = prediction
        meas_time['Softmax']['end'].append(datetime.now())

        if idx == fh.frames_to_consider - 1:
            break

    meas_time['Weighting']['start'].append(datetime.now())
    num_frames_considered = min(fh.frames_to_consider, num_frames)
    window = sine_window(num_frames, num_frames_considered)  # weighting
    weighted_prediction = np.matmul(window, predictions) / np.sum(window)
    meas_time['Weighting']['end'].append(datetime.now())

    # Average the per-stage measurements (in milliseconds)
    for k in meas_time:
        meas_time[k] = [(e - s).total_seconds() * 1000
                        for s, e in zip(meas_time[k]['start'], meas_time[k]['end'])]
        meas_time[k] = sum(meas_time[k]) / len(meas_time[k])

    # Create the output file
    mmax = max(len(s) for s in stages)

    output = f'Number of captured frames: {num_frames_considered}\n\n'
    for s in stages:
        output += f'{s}:{" " * (mmax - len(s))} {meas_time[s]:.3f} ms\n'
    output += f'\nSum:{" " * (mmax - len("Sum"))} {sum(meas_time.values()):.3f} ms\n'
    output += f'Frame rate:{" " * (mmax - len("Frame rate"))} {1000 / sum(meas_time.values()):.3f} fps\n'
    print(output)

    with open(fh.dir_verification / 'throughput.log', 'w') as f:
        f.write(output)

    # Wait until the camera thread (thread due to ctypes) is terminated
    t.join()

    # Terminate Camera
    terminate = libcamera.terminate()

    # Clean up the DPU IP (destroy the task before the kernel)
    n2cube.dpuDestroyTask(task)
    n2cube.dpuDestroyKernel(kernel)
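# `get_dict()` is not shown. Given how `meas_time[s]['start']` and
# `meas_time[s]['end']` are used above, it plausibly returns a fresh pair of
# timestamp lists (an inference from the call sites, not the original code):
def get_dict():
    return {'start': [], 'end': []}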