def detection_job(detection_model, image_name, num_inferences):
    """Runs detection job."""
    engine = DetectionEngine(detection_model)
    with open_image(image_name) as img:
        # Resized image.
        _, height, width, _ = engine.get_input_tensor_shape()
        tensor = np.asarray(img.resize((width, height), Image.NEAREST)).flatten()

    # Using `detect_with_input_tensor` to exclude image down-scale cost.
    for _ in range(num_inferences):
        engine.detect_with_input_tensor(tensor, top_k=1)
def run_two_models_one_tpu(classification_model, detection_model, image_name,
                           num_inferences, batch_size):
    """Runs two models ALTERNATELY using one Edge TPU.

    It runs the classification model `batch_size` times, then switches to the
    detection model for another `batch_size` runs, and repeats until each model
    has run `num_inferences` times.

    Args:
      classification_model: string, path to classification model.
      detection_model: string, path to detection model.
      image_name: string, path to input image.
      num_inferences: int, number of inferences to run for each model.
      batch_size: int, how many inferences to run on one model before switching
        to the other one.

    Returns:
      double, wall time it takes to finish the job.
    """
    start_time = time.perf_counter()
    engine_a = ClassificationEngine(classification_model)
    # `engine_b` shares the same Edge TPU as `engine_a`.
    engine_b = DetectionEngine(detection_model, engine_a.device_path())
    with open_image(image_name) as image:
        # Resized image for `engine_a`, `engine_b`.
        tensor_a = get_input_tensor(engine_a, image)
        tensor_b = get_input_tensor(engine_b, image)
    num_iterations = (num_inferences + batch_size - 1) // batch_size
    for _ in range(num_iterations):
        # Using `classify_with_input_tensor` and `detect_with_input_tensor` on
        # purpose to exclude image down-scale cost.
        for _ in range(batch_size):
            engine_a.classify_with_input_tensor(tensor_a, top_k=1)
        for _ in range(batch_size):
            engine_b.detect_with_input_tensor(tensor_b, top_k=1)
    return time.perf_counter() - start_time
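# The two snippets above call `open_image` and `get_input_tensor`, which are not
# defined in this excerpt. Below is a minimal sketch of what they are assumed to
# do (open a PIL image, and resize/flatten it to the engine's expected input
# shape); the names and exact behavior are assumptions, not the original helpers.
import contextlib

import numpy as np
from PIL import Image


@contextlib.contextmanager
def open_image(image_name):
    """Opens `image_name` as an RGB PIL image."""
    with Image.open(image_name) as img:
        yield img.convert('RGB')


def get_input_tensor(engine, image):
    """Resizes `image` to the engine's input shape and flattens it to 1-D uint8."""
    _, height, width, _ = engine.get_input_tensor_shape()
    resized = image.resize((width, height), Image.NEAREST)
    return np.asarray(resized, dtype=np.uint8).flatten()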
class CoralObjectDetector:
    """Performs inference on Edge TPU."""

    def __init__(self, model_path, device_path):
        self.__engine = DetectionEngine(
            model_path=os.path.join(model_path, 'edgetpu.tflite'),
            device_path=device_path)
        self.__model_shape = itemgetter(1, 2)(
            self.__engine.get_input_tensor_shape())

    @property
    def device_name(self):
        return "Coral"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass

    def detect(self, image_shape, image_np, detections: List[Detection]):
        image_np = cv2.resize(image_np, dsize=self.__model_shape,
                              interpolation=cv2.INTER_LINEAR)
        objs = self.__engine.detect_with_input_tensor(
            input_tensor=image_np.flatten(), top_k=len(detections))

        d = 0
        max_width = image_shape[1] - 1
        max_height = image_shape[0] - 1
        while d < len(objs) and d < len(detections):
            detection = detections[d]
            obj = objs[d]
            detection.label = obj.label_id + 1
            detection.confidence = obj.score
            detection.bounding_box.y_min = int(obj.bounding_box[0][1] * max_height)
            detection.bounding_box.x_min = int(obj.bounding_box[0][0] * max_width)
            detection.bounding_box.y_max = int(obj.bounding_box[1][1] * max_height)
            detection.bounding_box.x_max = int(obj.bounding_box[1][0] * max_width)
            d += 1

        return self.__engine.get_inference_time()
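# `CoralObjectDetector.detect` fills pre-allocated `Detection` objects that are
# not shown here. A minimal sketch of containers with the fields the code
# touches (`label`, `confidence`, `bounding_box.{x_min, y_min, x_max, y_max}`);
# these dataclasses are assumptions for illustration, not the project's types.
from dataclasses import dataclass, field


@dataclass
class BoundingBox:
    x_min: int = 0
    y_min: int = 0
    x_max: int = 0
    y_max: int = 0


@dataclass
class Detection:
    label: int = 0
    confidence: float = 0.0
    bounding_box: BoundingBox = field(default_factory=BoundingBox)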
class EdgeTPUInferencer:
    def __init__(self, model):
        self.engine = DetectionEngine(model)
        self.watch = Stopwatch()

    def inference(self, img):
        self.watch.start()
        initial_h, initial_w, _ = img.shape
        if (initial_h, initial_w) != (300, 300):
            frame = cv2.resize(img, (300, 300))
        else:
            frame = img
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        self.watch.stop(Stopwatch.MODE_PREPROCESS)

        self.watch.start()
        ans = self.engine.detect_with_input_tensor(frame.flatten(),
                                                   threshold=0.5, top_k=10)
        self.watch.stop(Stopwatch.MODE_INFER)

        # Post-process results into (bbox, class, score) tuples.
        self.watch.start()
        results = []
        for obj in ans:
            box = obj.bounding_box.flatten().tolist()
            bbox = [0] * 4
            bbox[0] = box[0] * initial_w
            bbox[1] = box[1] * initial_h
            bbox[2] = (box[2] - box[0]) * initial_w
            bbox[3] = (box[3] - box[1]) * initial_h
            result = (bbox, obj.label_id + 1, obj.score)
            results.append(result)
        self.watch.stop(Stopwatch.MODE_POSTPROCESS)

        return results
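# `EdgeTPUInferencer` (and the Relay-based main() further below) time each stage
# with a `Stopwatch` helper that is not shown. A minimal sketch of the assumed
# interface -- per-mode accumulators, a frame counter, and a periodic report --
# reconstructed from how the callers use it, not from the original class.
import time


class Stopwatch:
    MODE_PREPROCESS = 'preprocess'
    MODE_INFER = 'infer'
    MODE_POSTPROCESS = 'postprocess'

    def __init__(self, report_interval=10):
        self.totals = {self.MODE_PREPROCESS: 0.0,
                       self.MODE_INFER: 0.0,
                       self.MODE_POSTPROCESS: 0.0}
        self.numread = 0  # frames processed so far
        self._start = None
        self._last_report = time.monotonic()
        self._report_interval = report_interval

    def start(self):
        self._start = time.monotonic()

    def stop(self, mode):
        self.totals[mode] += time.monotonic() - self._start
        if mode == self.MODE_INFER:
            self.numread += 1

    def report(self):
        """Prints accumulated timings roughly every `report_interval` seconds."""
        now = time.monotonic()
        if now - self._last_report < self._report_interval:
            return False
        self._last_report = now
        print('frames:', self.numread, 'timings (s):', self.totals)
        return True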
class SmartPiCamContr(object):

    # Step 2: Constructor which defines default values for settings
    def __init__(self, appDuration=30, cameraResolution=(304, 304),
                 useVideoPort=True, minObjectScore=0.35):
        self.cameraResolution = cameraResolution
        self.useVideoPort = useVideoPort
        self.appDuration = appDuration  # seconds to run
        self.minObjectScore = minObjectScore
        modelFile = 'mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite'
        objectLabelsFile = 'coco_labels.txt'
        print("Reading Model: ", modelFile)
        self.engine = DetectionEngine(modelFile)
        print("Reading object labels: ", objectLabelsFile)
        self.labels = self.readLabelFile(objectLabelsFile)
        print("Minimal object score: ", self.minObjectScore)

    # Step 4: Configure PiCam
    # Return parameter: created PiCam
    def configurePiCam(self):
        print("\nConfigure and warm up PiCamera")
        self.cam = PiCamera()
        self.cam.resolution = self.cameraResolution
        print("Camera resolution: " + repr(self.cam.resolution))
        self.cam.start_preview()
        sleep(2)
        self.cam.stop_preview()
        return self.cam

    # Step 7: Take a photo returned as numpy array
    def takePhoto(self):
        picData = np.empty(
            (self.cameraResolution[1], self.cameraResolution[0], 3),
            dtype=np.uint8)
        self.cam.capture(picData, format='rgb',
                         use_video_port=self.useVideoPort)  # 24-bit RGB format
        # The COCO model requires a 300 x 300 resolution:
        # remove the last 4 rows and last 4 columns, keeping all 3 channels.
        picData = picData[:-4, :-4]
        return picData

    # Function to read labels from text files.
    def readLabelFile(self, file_path):
        with open(file_path, 'r') as f:
            lines = f.readlines()
        ret = {}
        for line in lines:
            pair = line.strip().split(maxsplit=1)
            ret[int(pair[0])] = pair[1].strip()
        return ret

    # Step 10: Predict the picture by running it on the TPU
    def predict(self, picData):
        print("\nPredicting image on TPU")
        print('Shape of data: ', picData.shape)
        flatArray = picData.flatten()  # 3D to 1D conversion
        print('Input array size: ', flatArray.shape)
        # Call the TPU to detect objects on the image with a neural network.
        result = self.engine.detect_with_input_tensor(
            flatArray, threshold=self.minObjectScore, top_k=10)
        return result

    # Step 12: Analyse the result of inferencing on the TPU.
    # Every detected object is reported if it belongs to the object IDs
    # of interest.
    def analyseResult(self, predResult, objectIdsOfInterest):
        print("Analysing results...")
        detectedObjList = []
        lbl = ''
        if predResult:
            for obj in predResult:
                if obj.label_id in objectIdsOfInterest:
                    if self.labels:
                        lbl = self.labels[obj.label_id]
                        print(lbl, obj.label_id)
                    print('score = ', obj.score)
                    box = obj.bounding_box.flatten()
                    box *= self.cameraResolution[1]  # scale up to resolution
                    print('box = ', box.tolist())
                    detectedObjList.append((lbl, box))
        if len(detectedObjList) == 0:
            print('No object detected!')
        return detectedObjList

    # Step 15: Depending on the detected objects and their location,
    # take the desired action.
    def processResult(self, detectedObjects):
        print('Number of detected objects: ', len(detectedObjects))
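# The numbered "Step" comments in SmartPiCamContr suggest a driver script that
# wires the steps together. A minimal sketch of such a main loop, assuming the
# steps are meant to run repeatedly for `appDuration` seconds; the object IDs of
# interest (0 = person in coco_labels.txt) are an illustrative choice.
import time

if __name__ == '__main__':
    contr = SmartPiCamContr()
    contr.configurePiCam()
    end_time = time.monotonic() + contr.appDuration
    while time.monotonic() < end_time:
        picData = contr.takePhoto()
        predResult = contr.predict(picData)
        detectedObjects = contr.analyseResult(predResult, objectIdsOfInterest=[0])
        contr.processResult(detectedObjects)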
def main():
    cam_w, cam_h = 640, 480
    default_model_dir = "../all_models"
    default_model = 'mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite'
    default_labels = 'coco_labels.txt'
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help='.tflite model path',
                        default=os.path.join(default_model_dir, default_model))
    parser.add_argument('--labels', help='label file path',
                        default=os.path.join(default_model_dir, default_labels))
    parser.add_argument('--top_k', type=int, default=5,
                        help='number of classes with highest score to display')
    parser.add_argument('--threshold', type=float, default=0.5,
                        help='class score threshold')
    args = parser.parse_args()

    with open(args.labels, 'r') as f:
        pairs = (l.strip().split(maxsplit=1) for l in f.readlines())
        labels = dict((int(k), v) for k, v in pairs)

    print("Loading %s with %s labels." % (args.model, args.labels))
    engine = DetectionEngine(args.model)

    pygame.init()
    pygame.font.init()
    font = pygame.font.SysFont("Arial", 20)

    pygame.camera.init()
    camlist = pygame.camera.list_cameras()

    # Input tensor shape is (1, height, width, 3).
    _, h, w, _ = engine.get_input_tensor_shape()

    print("By default using camera: ", camlist[-1])
    camera = pygame.camera.Camera(camlist[-1], (cam_w, cam_h))
    try:
        display = pygame.display.set_mode((cam_w, cam_h), 0)
    except pygame.error as e:
        sys.stderr.write(
            "\nERROR: Unable to open a display window. Make sure a monitor is "
            "attached and that the DISPLAY environment variable is set. Example:\n"
            ">export DISPLAY=\":0\"\n")
        raise e

    red = pygame.Color(255, 0, 0)

    camera.start()
    try:
        last_time = time.monotonic()
        while True:
            mysurface = camera.get_image()
            imagen = pygame.transform.scale(mysurface, (w, h))
            input = np.frombuffer(imagen.get_buffer(), dtype=np.uint8)
            start_time = time.monotonic()
            results = engine.detect_with_input_tensor(input,
                                                      threshold=args.threshold,
                                                      top_k=args.top_k)
            stop_time = time.monotonic()
            inference_ms = (stop_time - start_time) * 1000.0
            fps = 1.0 / (stop_time - last_time)
            last_time = stop_time
            annotate_text = "Inference: %5.2fms FPS: %3.1f" % (inference_ms, fps)
            for result in results:
                x0, y0, x1, y1 = result.bounding_box.flatten().tolist()
                rect = pygame.Rect(x0 * cam_w, y0 * cam_h,
                                   (x1 - x0) * cam_w, (y1 - y0) * cam_h)
                pygame.draw.rect(mysurface, red, rect, 1)
                label = "%.0f%% %s" % (100 * result.score, labels[result.label_id])
                text = font.render(label, True, red)
                print(label, ' ', end='')
                mysurface.blit(text, (x0 * cam_w, y0 * cam_h))
            text = font.render(annotate_text, True, red)
            print(annotate_text)
            mysurface.blit(text, (0, 0))
            display.blit(mysurface, (0, 0))
            pygame.display.flip()
    finally:
        camera.stop()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help='Path of the detection model.',
                        required=True, type=str)
    parser.add_argument('--ip', '-i', help='IP address of the image source.',
                        required=True, type=str)
    parser.add_argument('--report_interval', '-r',
                        help='Duration of reporting interval, in seconds',
                        default=10, type=int)
    parser.add_argument('-v', '--verbose',
                        help='Print information about detected objects',
                        action='store_true')
    args = parser.parse_args()

    relay = Relay(args.ip, args.verbose)
    engine = DetectionEngine(args.model)
    watch = Stopwatch()

    while True:
        if args.verbose:
            print("ready for next inference")
        img = relay.get_image()
        if img is None:
            break
        if args.verbose:
            print("Received image ", watch.numread)

        watch.start()
        initial_h, initial_w, _ = img.shape
        if (initial_h, initial_w) != (300, 300):
            frame = cv2.resize(img, (300, 300))
        else:
            frame = img
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        watch.stop(Stopwatch.MODE_PREPROCESS)

        watch.start()
        ans = engine.detect_with_input_tensor(frame.flatten(),
                                              threshold=0.5, top_k=10)
        watch.stop(Stopwatch.MODE_INFER)
        if args.verbose:
            print("Got inference results for frame ", watch.numread, ": ", ans)

        # Post-process results and send them back.
        watch.start()
        results = []
        for obj in ans:
            box = obj.bounding_box.flatten().tolist()
            bbox = [0] * 4
            bbox[0] = int(box[0] * initial_w)
            bbox[1] = int(box[1] * initial_h)
            bbox[2] = int(box[2] * initial_w)
            bbox[3] = int(box[3] * initial_h)
            result = (bbox, obj.label_id + 1, obj.score)
            results.append(result)
        relay.send_results(results)
        watch.stop(Stopwatch.MODE_POSTPROCESS)

        if watch.report():
            print("TCP Latency to source: ",
                  round(measure_latency(host=args.ip, port=relay.port)[0], 3),
                  "ms")

    relay.close()
class FaceDetectRPC(object):
    def __init__(self):
        # Init TPU engine.
        self.face_engine = DetectionEngine(FACE_DET_MODEL)

        # Load face recognition model and the label encoder.
        with open(FACE_CLASS_MODEL, 'rb') as fp:
            self.recognizer = pickle.load(fp)
        with open(FACE_LABEL_MAP, 'rb') as fp:
            self.le = pickle.load(fp)

    def detect_faces(self, test_image_paths):
        # List that will hold all images with any face detection information.
        objects_detected_faces = []

        # Loop over the image paths provided.
        for obj in test_image_paths:
            logging.debug('**********Find Face(s) for {}'.format(obj['image']))
            for label in obj['labels']:
                # If the object detected is a person then try to identify the face.
                if label['name'] == 'person':
                    # Read image from disk.
                    img = cv2.imread(MOUNT_POINT + obj['image'])
                    if img is None:
                        # Bad image was read.
                        logging.error('Bad image was read.')
                        label['face'] = None
                        continue

                    # First bound the roi using the coord info passed in.
                    # The roi is the area around person(s) detected in the image.
                    # (x1, y1) are the top-left roi coordinates.
                    # (x2, y2) are the bottom-right roi coordinates.
                    y1 = int(label['box']['ymin'])
                    x1 = int(label['box']['xmin'])
                    y2 = int(label['box']['ymax'])
                    x2 = int(label['box']['xmax'])
                    roi = img[y1:y2, x1:x2, :]
                    #cv2.imwrite('./roi.jpg', roi)
                    if roi.size == 0:
                        # Bad object roi...move on to next image.
                        logging.error('Bad object roi.')
                        label['face'] = None
                        continue

                    # Need roi shape for later conversion of face coords.
                    (h, w) = roi.shape[:2]

                    # Resize roi for face detection.
                    # The TPU face det model used requires (320, 320).
                    res = resize_to_square(img=roi, size=320,
                                           keep_aspect_ratio=True,
                                           interpolation=cv2.INTER_AREA)
                    #cv2.imwrite('./res.jpg', res)

                    # Detect the (x, y)-coordinates of the bounding boxes
                    # corresponding to a face in the input image using the TPU
                    # engine. It is assumed that only one face is in the image.
                    # NB: reshape(-1) converts the np img array into 1-D.
                    detection = self.face_engine.detect_with_input_tensor(
                        res.reshape(-1), threshold=0.05, top_k=1)
                    if not detection:
                        # No face detected...move on to next image.
                        logging.debug('No face detected.')
                        label['face'] = None
                        continue

                    # Convert coords and carve out face roi.
                    box = (detection[0].bounding_box.flatten().tolist()
                           ) * np.array([w, h, w, h])
                    (face_left, face_top,
                     face_right, face_bottom) = box.astype('int')
                    face_roi = roi[face_top:face_bottom, face_left:face_right, :]
                    #cv2.imwrite('./face_roi.jpg', face_roi)
                    (f_h, f_w) = face_roi.shape[:2]

                    # If face width or height are not sufficiently large then skip.
                    if f_h < FACE_MIN or f_w < FACE_MIN:
                        logging.debug('Face too small to recognize.')
                        label['face'] = None
                        continue

                    # Compute the focus measure of the face
                    # using the Variance of Laplacian method.
                    # See https://www.pyimagesearch.com/2015/09/07/blur-detection-with-opencv/
                    gray = cv2.cvtColor(face_roi, cv2.COLOR_BGR2GRAY)
                    fm = cv2.Laplacian(gray, cv2.CV_64F).var()

                    # If fm is below a threshold then the face probably isn't
                    # clear enough for face recognition to work, so skip it.
                    if fm < FACE_FOCUS_MEASURE_THRESHOLD:
                        logging.debug('Face too blurry to recognize.')
                        label['face'] = None
                        continue

                    # Find the 128-dimension face encoding for the face in the image.
                    # Convert image roi from BGR (OpenCV ordering) to dlib ordering (RGB).
                    rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)

                    # Convert face bbox into dlib format.
                    boxes = [(face_top, face_right, face_bottom, face_left)]

                    # Generate encodings. Only one face is assumed, so take the 1st element.
                    encoding = face_recognition.face_encodings(
                        face_image=rgb,
                        known_face_locations=boxes,
                        num_jitters=FACE_NUM_JITTERS)[0]
                    logging.debug('face encoding {}'.format(encoding))

                    # Perform svm classification on the encodings to recognize the face.
                    (name, proba) = face_classifier(recognizer=self.recognizer,
                                                    le=self.le,
                                                    encoding=encoding,
                                                    min_proba=FACE_MIN_PROBA)

                    # Add face name to label metadata.
                    label['face'] = name
                    # Add face confidence to label metadata.
                    # (First convert NumPy value to native Python type for json serialization.)
                    label['faceProba'] = proba.item()

            # Add processed image to output list.
            objects_detected_faces.append(obj)

        # Convert json to string and return data.
        return json.dumps(objects_detected_faces)
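# `face_classifier` is called above but not defined in this excerpt. A minimal
# sketch of what it is assumed to do: run the pickled SVM (with predict_proba)
# on the 128-d encoding, map the class index back to a name via the label
# encoder, and fall back to 'Unknown' below `min_proba`. This is an illustrative
# reconstruction, not the project's actual implementation.
import numpy as np


def face_classifier(recognizer, le, encoding, min_proba):
    """Returns (name, probability) for a single 128-d face encoding."""
    preds = recognizer.predict_proba(np.expand_dims(encoding, axis=0))[0]
    j = int(np.argmax(preds))
    proba = preds[j]
    name = le.classes_[j] if proba >= min_proba else 'Unknown'
    return name, proba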
class ObjDetectRPC(object):
    def __init__(self):
        self.obj_engine = DetectionEngine(OBJ_MODEL)
        self.labels_map = ReadLabelFile(OBJ_LABEL_MAP)

    def detect_objects(self, test_image_paths):
        objects_in_image = []  # holds all objects found in image
        labels = []            # labels of detected objects
        frame_num = 0          # ZoneMinder current alarm frame number
        monitor = ''           # ZoneMinder current monitor name
        for image_path in test_image_paths:
            logging.debug('**********Find object(s) for {}'.format(image_path))

            # If consecutive frames then repeat last label and skip inference.
            # This behavior is controlled by CON_IMG_SKIP.
            skip, frame_num, monitor = skip_inference(frame_num, monitor, labels,
                                                      image_path, objects_in_image)
            if skip is True:
                continue

            # Read image from disk.
            img = cv2.imread(MOUNT_POINT + image_path)
            #cv2.imwrite('./obj_img.jpg', img)
            if img is None:
                # Bad image was read.
                logging.error('Bad image was read.')
                objects_in_image.append({'image': image_path, 'labels': []})
                continue

            # Resize. The TPU obj det model requires (300, 300).
            res = resize_to_square(img=img, size=300, keep_aspect_ratio=True,
                                   interpolation=cv2.INTER_AREA)
            #cv2.imwrite('./obj_res.jpg', res)

            # Run object inference.
            detection = self.obj_engine.detect_with_input_tensor(
                res.reshape(-1), threshold=0.05, top_k=3)

            # Get labels and scores of detected objects.
            labels = []  # new detection, clear labels list.
            (h, w) = img.shape[:2]  # use original image size for box coords
            for obj in detection:
                logging.debug('id: {} name: {} score: {}'.format(
                    obj.label_id, self.labels_map[obj.label_id], obj.score))
                if obj.score > OBJ_MIN_SCORE_THRESH:
                    object_dict = {}
                    object_dict['id'] = obj.label_id
                    object_dict['name'] = self.labels_map[obj.label_id]
                    object_dict['score'] = float(obj.score)
                    (xmin, ymin, xmax, ymax) = (
                        obj.bounding_box.flatten().tolist()) * np.array([w, h, w, h])
                    object_dict['box'] = {'ymin': ymin, 'xmin': xmin,
                                          'ymax': ymax, 'xmax': xmax}
                    labels.append(object_dict)

            objects_in_image.append({'image': image_path, 'labels': labels})
        return json.dumps(objects_in_image)
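# Both RPC classes above resize images with `resize_to_square(img, size,
# keep_aspect_ratio, interpolation)`, which is not shown. A minimal sketch under
# the assumption that it letterboxes: scale the longer side to `size`, then pad
# the shorter side with black to get a square `size` x `size` image. This is an
# illustration of that idea, not the project's exact code.
import cv2


def resize_to_square(img, size, keep_aspect_ratio=True,
                     interpolation=cv2.INTER_AREA):
    """Resizes `img` to (size, size), optionally preserving aspect ratio by padding."""
    if not keep_aspect_ratio:
        return cv2.resize(img, (size, size), interpolation=interpolation)
    h, w = img.shape[:2]
    scale = size / max(h, w)
    resized = cv2.resize(img, (int(round(w * scale)), int(round(h * scale))),
                         interpolation=interpolation)
    new_h, new_w = resized.shape[:2]
    # Pad the short side (bottom/right) with black pixels to reach size x size.
    return cv2.copyMakeBorder(resized, 0, size - new_h, 0, size - new_w,
                              cv2.BORDER_CONSTANT, value=(0, 0, 0))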
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help='File path of Tflite model.',
                        required=True)
    parser.add_argument('--label', help='File path of label file.')
    args = parser.parse_args()

    labels = dataset_utils.read_label_file(args.label) if args.label else None
    engine = DetectionEngine(args.model)

    with picamera.PiCamera() as camera:
        preview_size = (640, 480)
        camera.resolution = preview_size
        camera.framerate = 30
        # camera.hflip = True
        # camera.vflip = True
        # camera.rotation = 90
        _, input_height, input_width, _ = engine.get_input_tensor_shape()
        input_size = (input_width, input_height)

        # Width is rounded up to the nearest multiple of 32,
        # height to the nearest multiple of 16.
        capture_size = (math.ceil(input_width / 32) * 32,
                        math.ceil(input_height / 16) * 16)

        # Actual detection area on preview.
        detect_size = (preview_size[0] * input_size[0] / capture_size[0],
                       preview_size[1] * input_size[1] / capture_size[1])

        # Make annotator smaller for efficiency.
        annotator_factor = 0.5
        annotator_size = (int(preview_size[0] * annotator_factor),
                          int(preview_size[1] * annotator_factor))

        # Font for drawing detection candidates
        font = ImageFont.truetype(
            '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf', size=12)

        camera.start_preview()
        annotator = Annotator(camera, dimensions=annotator_size,
                              default_color=(255, 255, 255, 64))

        def annotate(candidates):
            annotator.clear()

            # Get actual coordinates to draw
            def translate(relative_coord):
                return (detect_size[0] * relative_coord[0] * annotator_factor,
                        detect_size[1] * relative_coord[1] * annotator_factor)

            for c in candidates:
                top_left = translate(c.bounding_box[0])
                bottom_right = translate(c.bounding_box[1])
                annotator.bounding_box(top_left + bottom_right)

                text = '{} {:.2f}'.format(labels[c.label_id], c.score) \
                    if labels else '{:.2f}'.format(c.score)
                annotator.text(top_left, text, font=font)
            annotator.update()

        try:
            stream = io.BytesIO()
            for _ in camera.capture_continuous(
                    stream, format='rgb', use_video_port=True, resize=capture_size):
                stream.truncate()
                stream.seek(0)
                input_tensor = np.frombuffer(stream.getvalue(), dtype=np.uint8)
                if input_size != capture_size:
                    # Crop to input size. Note dimension order (height, width, channels)
                    input_tensor = input_tensor.reshape(
                        (capture_size[1], capture_size[0], 3))[
                            0:input_height, 0:input_width, :].ravel()
                start_ms = time.time()
                results = engine.detect_with_input_tensor(input_tensor, top_k=3)
                elapsed_ms = time.time() - start_ms
                annotate(results)
                camera.annotate_text = '{:.2f}ms'.format(elapsed_ms * 1000.0)
        finally:
            # Maybe should make this an annotator method
            camera.remove_overlay(annotator._overlay)
            camera.stop_preview()
    num_of_frames = 0
    while True:
        num_of_frames += 1
        success, img = vidcap.read()
        if not success:
            break

        # Process frame
        image_cv2 = resize_image(img)
        image_pil = Image.fromarray(image_cv2)
        draw = ImageDraw.Draw(image_pil)

        # Run inference and update tracks
        detections = engine.detect_with_input_tensor(image_cv2.flatten(),
                                                     threshold=0.3, top_k=10)
        iou_tracker.update_tracks(detections)

        # Draw detections and save output frame
        draw_boxes(detections, draw)
        output_frames.append(numpy.array(image_pil))

    # Print results
    print('Number of processed frames: {0}'.format(num_of_frames))
    print('Time {0}'.format(time.time() - start))
    print('Cars detected {0}'.format(iou_tracker.get_tracked_number()))

    # Generate video
    out = cv2.VideoWriter(args.output, cv2.VideoWriter_fourcc(*'DIVX'), 30,
                          (args.size, args.size))
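# The frame loop above calls `resize_image` and `draw_boxes`, which are not part
# of this excerpt. A minimal sketch of `draw_boxes`, assuming the detections are
# DetectionCandidate objects with normalized [0, 1] corner coordinates and that
# `resize_image` produces a square frame of a fixed size; the constant and the
# drawing style below are illustrative assumptions.
FRAME_SIZE = 300  # assumed square frame size produced by resize_image()


def draw_boxes(detections, draw):
    """Draws one rectangle per detection on a PIL ImageDraw canvas."""
    for obj in detections:
        # bounding_box is [[x_min, y_min], [x_max, y_max]] in relative coordinates.
        (x0, y0), (x1, y1) = obj.bounding_box
        draw.rectangle([x0 * FRAME_SIZE, y0 * FRAME_SIZE,
                        x1 * FRAME_SIZE, y1 * FRAME_SIZE],
                       outline=(255, 0, 0), width=2)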
class ObjectDetector(object):
    def __init__(self, model_path, label_path, use_coral_flag, use_tpu_flag,
                 res_x, res_y, min_conf_threshold):
        self.res_y = res_y
        self.res_x = res_x
        self.use_coral_flag = use_coral_flag
        if use_coral_flag:
            from edgetpu.detection.engine import DetectionEngine
            from edgetpu.utils import dataset_utils
        self.min_conf_threshold = min_conf_threshold

        # Load the label map
        with open(label_path, 'r') as f:
            self.labels = [line.strip() for line in f.readlines()]
        if self.labels[0] == '???':
            del (self.labels[0])

        if use_tpu_flag:
            self.interpreter = Interpreter(
                model_path=model_path,
                experimental_delegates=[load_delegate('libedgetpu.so.1.0')])
        else:
            self.interpreter = Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()

        # Get model details
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()
        self.height = self.input_details[0]['shape'][1]
        self.width = self.input_details[0]['shape'][2]
        self.is_floating_model = (self.input_details[0]['dtype'] == np.float32)
        self.input_mean = 127.5
        self.input_std = 127.5

        # Coral
        if use_coral_flag:
            self.engine = DetectionEngine(model_path)
            self.labels = dataset_utils.read_label_file(label_path)
            _, height, width, _ = self.engine.get_input_tensor_shape()

    def apply_coral_model(self, input_data):
        print("here")
        ans = self.engine.detect_with_input_tensor(input_data, threshold=0.05,
                                                   top_k=10)
        print("here2")
        for obj in ans:
            if self.labels:
                print(self.labels[obj.label_id])
            print('score = ', obj.score)
            box = obj.bounding_box.flatten().tolist()
            print('box = ', box)

    def apply_tflite_model(self, input_data):
        # Perform the actual detection by running the model with the image as input
        self.interpreter.set_tensor(self.input_details[0]['index'], input_data)
        self.interpreter.invoke()

        # Retrieve detection results
        boxes = self.interpreter.get_tensor(
            self.output_details[0]['index'])[0]  # Bounding box coordinates of detected objects
        classes = self.interpreter.get_tensor(
            self.output_details[1]['index'])[0]  # Class index of detected objects
        scores = self.interpreter.get_tensor(
            self.output_details[2]['index'])[0]  # Confidence of detected objects
        return (boxes, classes, scores)

    def process_frame(self, frame):
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_resized = cv2.resize(frame_rgb, (self.width, self.height))
        input_data = np.expand_dims(frame_resized, axis=0)

        # Normalize pixel values if using a floating model
        # (i.e. if the model is non-quantized)
        if self.is_floating_model:
            input_data = (np.float32(input_data) - self.input_mean) / self.input_std

        if self.use_coral_flag:
            self.apply_coral_model(input_data)
            # The Coral path only prints its detections, so return empty results.
            boxes, classes, scores = [], [], []
        else:
            (boxes, classes, scores) = self.apply_tflite_model(input_data)
        return (frame, boxes, classes, scores)

    def is_interesting_object(self, scores, classes):
        is_interesting_object = False
        interesting_classes = []
        for i in range(len(scores)):
            if ((scores[i] > self.min_conf_threshold) and (scores[i] <= 1.0)):
                is_interesting_object = True
                interesting_classes.append(self.labels[int(classes[i])])
        return is_interesting_object, interesting_classes

    def draw_frame(self, frame, boxes, classes, scores):
        # Loop over all detections and draw detection box if confidence is
        # above minimum threshold
        for i in range(len(scores)):
            if ((scores[i] > self.min_conf_threshold) and (scores[i] <= 1.0)):
                # Get bounding box coordinates and draw box.
                # Interpreter can return coordinates that are outside of image
                # dimensions, need to force them to be within image using max() and min()
                ymin = int(max(1, (boxes[i][0] * self.res_y)))
                xmin = int(max(1, (boxes[i][1] * self.res_x)))
                ymax = int(min(self.res_y, (boxes[i][2] * self.res_y)))
                xmax = int(min(self.res_x, (boxes[i][3] * self.res_x)))
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (10, 255, 0), 4)

                # Draw label
                object_name = self.labels[int(classes[i])]  # Look up object name from "labels" array using class index
                label = '%s: %d%%' % (object_name, int(scores[i] * 100))  # Example: 'person: 72%'
                labelSize, baseLine = cv2.getTextSize(label,
                                                      cv2.FONT_HERSHEY_SIMPLEX,
                                                      0.7, 2)  # Get font size
                label_ymin = max(ymin, labelSize[1] + 10)  # Make sure not to draw label too close to top of window
                cv2.rectangle(frame, (xmin, label_ymin - labelSize[1] - 10),
                              (xmin + labelSize[0], label_ymin + baseLine - 10),
                              (255, 255, 255), cv2.FILLED)  # Draw white box to put label text in
                cv2.putText(frame, label, (xmin, label_ymin - 7),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2)  # Draw label text

        (flag, encodedImage) = cv2.imencode(".jpg", frame)
        return encodedImage
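# A minimal usage sketch for ObjectDetector, assuming a USB camera and the same
# COCO SSD model/labels used elsewhere in this file; the paths, resolution, and
# threshold values below are illustrative assumptions, not part of the class.
import cv2

if __name__ == '__main__':
    detector = ObjectDetector(
        model_path='mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite',
        label_path='coco_labels.txt',
        use_coral_flag=False, use_tpu_flag=True,
        res_x=640, res_y=480, min_conf_threshold=0.5)

    cap = cv2.VideoCapture(0)
    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            frame, boxes, classes, scores = detector.process_frame(frame)
            interesting, names = detector.is_interesting_object(scores, classes)
            if interesting:
                print('Detected:', names)
            encoded = detector.draw_frame(frame, boxes, classes, scores)  # JPEG bytes
    finally:
        cap.release()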