def parse_annotation(self, annotation, mAP='False'):
    """Turn one annotation record into an ``(image, bboxes)`` pair.

    ``annotation[0]`` is used as the image path and ``annotation[1]`` as an
    iterable of comma-separated integer strings, one per box.  When
    ``TRAIN_LOAD_IMAGES_TO_RAM`` is truthy the image is taken from
    ``annotation[2]`` instead of being read from disk.

    When ``self.data_aug`` is truthy the sample is passed through
    ``random_horizontal_flip``, ``random_crop`` and ``random_translate`` in
    that order.

    Returns the (possibly augmented) image and boxes untouched when
    ``mAP == True``; otherwise returns what ``image_preprocess`` produces for
    a ``[self.input_sizes, self.input_sizes]`` target.
    """
    image_path = annotation[0]
    if TRAIN_LOAD_IMAGES_TO_RAM:
        image = annotation[2]
    else:
        image = cv2.imread(image_path)

    bboxes = np.array(
        [[int(value) for value in box.split(',')] for box in annotation[1]])

    if self.data_aug:
        augmentations = (
            self.random_horizontal_flip,
            self.random_crop,
            self.random_translate,
        )
        for augment in augmentations:
            image, bboxes = augment(np.copy(image), np.copy(bboxes))

    # The default is the *string* 'False', so a truthiness test would be
    # wrong here; the equality comparison is kept on purpose.
    if mAP == True:  # noqa: E712
        return image, bboxes

    target_size = [self.input_sizes, self.input_sizes]
    image, bboxes = image_preprocess(np.copy(image), target_size,
                                     np.copy(bboxes))
    return image, bboxes
def get_bounding_boxes(image_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors=''):
    """Run the shared ``VehicleDetector.yolo_obj`` model on one image file.

    Args:
        image_path: Path handed to ``cv2.imread``.
        output_path, show, CLASSES, rectangle_colors: Accepted for signature
            compatibility; not used by this function.
        input_size: Side length passed to ``image_preprocess`` and
            ``postprocess_boxes``.
        score_threshold: Forwarded to ``postprocess_boxes``.
        iou_threshold: Forwarded to ``nms``.

    Returns:
        ``(bboxes, original_image)`` where ``bboxes`` is the output of ``nms``
        and ``original_image`` is the loaded image after the two colour
        conversions below.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``image_path``.
        ValueError: If ``YOLO_FRAMEWORK`` is neither ``"tf"`` nor ``"trt"``.
    """
    original_image = cv2.imread(image_path)
    if original_image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    # NOTE(review): the conversion is applied twice, exactly as in the
    # original code; two channel swaps restore the initial channel order.
    # Kept unchanged because downstream results depend on it.
    original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
    original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)

    image_data = image_preprocess(np.copy(original_image),
                                  [input_size, input_size])
    image_data = image_data[np.newaxis, ...].astype(np.float32)

    if YOLO_FRAMEWORK == "tf":
        pred_bbox = VehicleDetector.yolo_obj.predict(image_data)
    elif YOLO_FRAMEWORK == "trt":
        result = VehicleDetector.yolo_obj(tf.constant(image_data))
        pred_bbox = [value.numpy() for value in result.values()]
    else:
        # Previously this fell through and failed later with an unbound
        # ``pred_bbox``; fail here with a message naming the real cause.
        raise ValueError(f"Unsupported YOLO_FRAMEWORK: {YOLO_FRAMEWORK!r}")

    # Flatten every output head to (N, C) and stack them into one tensor.
    pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
    pred_bbox = tf.concat(pred_bbox, axis=0)

    bboxes = postprocess_boxes(pred_bbox, original_image, input_size,
                               score_threshold)
    bboxes = nms(bboxes, iou_threshold, method='nms')
    return bboxes, original_image
def image_crop_2_array(img_path, debug=True):
    """Detect objects in an image and return the crop of the first box.

    The image is opened with PIL, converted to RGB and run through the
    module-level ``yolo`` model.  The first box returned by ``nms`` is used
    to slice the original pixel array.

    Args:
        img_path: Path of the image to open.
        debug: When truthy, show the input image and the annotated
            detection image with matplotlib / PIL.

    Returns:
        The cropped ``numpy`` array for the first detection, or ``None`` when
        no box survives post-processing.
    """
    pil_image = PIL.Image.open(img_path).convert('RGB')
    print(pil_image)
    original_image = np.array(pil_image)

    if debug:
        plt.figure(figsize=(30, 15))
        plt.imshow(original_image)
        pil_image.show()

    image_data = image_preprocess(np.copy(original_image),
                                  [YOLO_INPUT_SIZE, YOLO_INPUT_SIZE])
    image_data = image_data[np.newaxis, ...].astype(np.float32)
    pred_bbox = yolo.predict(image_data)

    # Fix: the original passed ``image_path`` here, a name that is not
    # defined in this function (it would raise NameError, or silently pick up
    # an unrelated global).  The image being processed is ``img_path``.
    # NOTE(review): the annotated image is only displayed in debug mode, but
    # the call is kept unconditional in case detect_image has side effects.
    image = detect_image(yolo, img_path, "", input_size=YOLO_INPUT_SIZE,
                         show=False, CLASSES=TRAIN_CLASSES,
                         rectangle_colors=(255, 0, 0))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if debug:
        plt.figure(figsize=(30, 15))
        plt.imshow(image)

    # Flatten every output head to (N, C) and stack them into one tensor.
    pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
    pred_bbox = tf.concat(pred_bbox, axis=0)
    bboxes = postprocess_boxes(pred_bbox, original_image, YOLO_INPUT_SIZE,
                               TEST_SCORE_THRESHOLD)
    bboxes = nms(bboxes, TEST_IOU_THRESHOLD, method='nms')

    if len(bboxes) == 0:
        return None

    # Box layout used below: [x_min, y_min, x_max, y_max, ...].
    x_min, y_min, x_max, y_max = (int(v) for v in bboxes[0][:4])
    return original_image[y_min:y_max, x_min:x_max]
def detect_fall(YoloV3, img, input_size=416, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors=''):
    """Run detection on one frame and report whether any box is a fall.

    Each box kept by ``nms`` is passed to ``check_fall`` together with its
    class index, width and height.  If at least one box qualifies, the
    qualifying boxes are drawn on the frame and the result is written to
    ``fall-detection.jpg``.

    Args:
        YoloV3: Model object exposing ``predict``.
        img: Image array accepted by ``cv2.cvtColor``.
        input_size: Side length used for preprocessing and post-processing.
        CLASSES: Forwarded to ``check_fall``.
        score_threshold: Forwarded to ``postprocess_boxes``.
        iou_threshold: Forwarded to ``nms``.
        rectangle_colors: Forwarded to ``draw_bbox``.

    Returns:
        ``True`` when a fall was detected (and the image written), else
        ``False``.

    Raises:
        TypeError: If ``img`` cannot be colour-converted.  The original code
            did ``raise ('Invalid image!')``, which Python turns into a
            ``TypeError`` about raising a non-exception; the type is kept so
            existing handlers still match, but the message is now the
            intended one and the underlying error is chained.
    """
    try:
        # NOTE(review): the conversion is applied twice as in the original.
        original_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
    except Exception as err:
        raise TypeError('Invalid image!') from err

    image_data = image_preprocess(np.copy(original_image),
                                  [input_size, input_size])
    image_data = tf.expand_dims(image_data, 0)

    t1 = time.time()
    pred_bbox = YoloV3.predict(image_data)
    t2 = time.time()

    # Flatten every output head to (N, C) and stack them into one tensor.
    pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
    pred_bbox = tf.concat(pred_bbox, axis=0)

    bboxes = postprocess_boxes(pred_bbox, original_image, input_size,
                               score_threshold)
    bboxes = nms(bboxes, iou_threshold, method='nms')

    ms = (t2 - t1) * 1000
    # Guard against a zero elapsed time on coarse clocks.
    fps = 1000 / ms if ms > 0 else float('inf')
    print('Time: {:.2f}ms, {:.1f} FPS'.format(ms, fps))

    fall_bboxes = []
    for bbox in bboxes:
        x1, y1, x2, y2 = np.array(bbox[:4], dtype=np.int32)
        class_ind = int(bbox[5])
        if check_fall(CLASSES, class_ind, x2 - x1, y2 - y1):
            fall_bboxes.append(bbox)

    if not fall_bboxes:
        return False

    image = draw_bbox(original_image, fall_bboxes,
                      rectangle_colors=rectangle_colors)
    cv2.imwrite('fall-detection.jpg', image)
    return True
def parse_annotation(self, annotation):
    """Build a preprocessed ``(image, bboxes)`` training sample.

    ``annotation[0]`` is either the image itself (when
    ``TRAIN_LOAD_IMAGES_TO_RAM`` is truthy) or a path read with
    ``cv2.imread``.  Each entry of ``annotation[1]`` is split on commas; all
    fields except the last are converted to ``int`` and the last element of
    the entry itself (``box[-1]``) is appended unconverted.

    With ``self.data_aug`` set, flip / crop / translate augmentations run in
    that order.  The image is then colour-converted and passed with the
    boxes through ``image_preprocess`` at ``self.train_input_size``.
    """
    source = annotation[0]
    image = source if TRAIN_LOAD_IMAGES_TO_RAM else cv2.imread(source)

    rows = []
    for box in annotation[1]:
        leading_fields = box.split(',')[:-1]
        # box[-1] indexes the entry itself, not the split fields.
        rows.append([int(field) for field in leading_fields] + [box[-1]])
    bboxes = np.array(rows)

    if self.data_aug:
        for augment in (self.random_horizontal_flip,
                        self.random_crop,
                        self.random_translate):
            image, bboxes = augment(np.copy(image), np.copy(bboxes))

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    target_size = [self.train_input_size, self.train_input_size]
    image, bboxes = image_preprocess(np.copy(image), target_size,
                                     np.copy(bboxes))
    return image, bboxes
def parse_annotation(self, annotation):
    """Parse a whitespace-separated annotation line into a training sample.

    The first token is the image path; every following token is a
    comma-separated list of integers describing one box.

    Raises:
        KeyError: If the image path does not exist on disk.

    Returns:
        ``(image, bboxes)`` as produced by ``image_preprocess`` for a
        ``[self.train_input_size, self.train_input_size]`` target.
    """
    tokens = annotation.split()
    image_path = tokens[0]
    if not os.path.exists(image_path):
        raise KeyError("%s does not exist ... " % image_path)

    image = cv2.imread(image_path)
    bboxes = np.array(
        [[int(value) for value in token.split(',')] for token in tokens[1:]])

    if self.data_aug:
        # Augmentations are applied in a fixed order, each on fresh copies.
        for augment in (self.random_horizontal_flip,
                        self.random_crop,
                        self.random_translate):
            image, bboxes = augment(np.copy(image), np.copy(bboxes))

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    target_size = [self.train_input_size, self.train_input_size]
    image, bboxes = image_preprocess(np.copy(image), target_size,
                                     np.copy(bboxes))
    return image, bboxes
def parse_annotation(self, annotation, mAP='False'):
    """Turn one annotation record into an ``(image, bboxes)`` pair.

    ``annotation[0]`` is used as the image path and ``annotation[1]`` as an
    iterable of comma-separated integer strings, one per box.  When
    ``TRAIN_LOAD_IMAGES_TO_RAM`` is truthy the image comes from
    ``annotation[2]`` instead of disk.

    With ``self.data_aug`` set, the sample goes through ``random_crop`` and
    ``random_translate`` (image + boxes), then ``random_color`` and
    ``random_noise`` (image only).  Horizontal flip, Gaussian blur,
    brightness jitter and the debug box preview are disabled in this
    variant, as is the BGR->RGB conversion.

    Returns the raw pair when ``mAP == True``; otherwise the output of
    ``image_preprocess`` for ``[self.input_sizes, self.input_sizes]``.
    """
    image_path = annotation[0]
    if TRAIN_LOAD_IMAGES_TO_RAM:
        image = annotation[2]
    else:
        image = cv2.imread(image_path)

    bboxes = np.array(
        [[int(value) for value in box.split(',')] for box in annotation[1]])

    if self.data_aug:
        # Geometric augmentations move the boxes together with the pixels.
        for geometric in (self.random_crop, self.random_translate):
            image, bboxes = geometric(np.copy(image), np.copy(bboxes))
        # Photometric augmentations only touch the pixels.
        for photometric in (self.random_color, self.random_noise):
            image = photometric(np.copy(image))

    # The default is the *string* 'False', so a truthiness test would be
    # wrong here; the equality comparison is kept on purpose.
    if mAP == True:  # noqa: E712
        return image, bboxes

    target_size = [self.input_sizes, self.input_sizes]
    image, bboxes = image_preprocess(np.copy(image), target_size,
                                     np.copy(bboxes))
    return image, bboxes
def Object_tracking(Yolo, video_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors='', Track_only = []):
    """Detect and track objects in a video while learning its background.

    Every frame is fed to a MOG2 background subtractor and to the YOLO
    model; detections are associated across frames with a DeepSORT
    ``Tracker``.  Each confirmed track is saved through ``ts.save`` as
    ``[x1, y1, x2, y2, tracking_id, class_index, frame_no]`` and drawn on the
    frame.  After the last frame the learned background image is written to
    ``ts.out_bg_img``.

    Args:
        Yolo: Model; ``Yolo.predict`` is used for the "tf" framework, a
            direct call for "trt".
        video_path: Video file path; a falsy value selects camera 0.
        output_path: Destination of the annotated video.  Frames are only
            written when this is not ``''``.
        input_size: Side length used for preprocessing / post-processing.
        show: When truthy, display frames and stop on the ``q`` key.
        CLASSES: Forwarded to ``read_class_names`` and ``draw_bbox``.
        score_threshold: Forwarded to ``postprocess_boxes``.
        iou_threshold: Forwarded to ``nms``.
        rectangle_colors: Unused in this variant.
        Track_only: Class names to keep; empty keeps every class.  The
            list is only read, never mutated.

    Raises:
        ValueError: If ``YOLO_FRAMEWORK`` is neither ``"tf"`` nor ``"trt"``.
    """
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize deep sort object
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    # A falsy path means "detect from webcam".
    vid = cv2.VideoCapture(video_path if video_path else 0)

    # By default VideoCapture returns float instead of int
    length = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_fps = int(vid.get(cv2.CAP_PROP_FPS))
    print("VIDEO PROPERTIES:FrameCount:{}\tWidth:{}\tHeight:{}\tFps:{}\t".format(length, width, height, video_fps))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, video_fps, (width, height))

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    # 1. Background detection
    backSub = cv2.createBackgroundSubtractorMOG2(
        history=400, varThreshold=16, detectShadows=False)
    bgMask = None
    frame_no = 0

    try:
        while True:
            ok, frame = vid.read()
            frame_no += 1
            if not ok or frame is None:
                # End of stream (the original relied on cvtColor raising).
                break

            # NOTE(review): conversion applied twice, as in the original.
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_frame = cv2.cvtColor(original_frame, cv2.COLOR_BGR2RGB)

            # 1.1 Background update; only the model's state is needed here.
            backSub.apply(original_frame)
            bgMask = backSub.getBackgroundImage()
            if frame_no % 100 == 0:
                print(frame_no)

            image_data = image_preprocess(np.copy(original_frame),
                                          [input_size, input_size])
            image_data = image_data[np.newaxis, ...].astype(np.float32)

            t1 = time.time()
            if YOLO_FRAMEWORK == "tf":
                pred_bbox = Yolo.predict(image_data)
            elif YOLO_FRAMEWORK == "trt":
                result = Yolo(tf.constant(image_data))
                pred_bbox = [value.numpy() for value in result.values()]
            else:
                raise ValueError(
                    f"Unsupported YOLO_FRAMEWORK: {YOLO_FRAMEWORK!r}")
            t2 = time.time()

            # Flatten every output head to (N, C) and stack them.
            pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            pred_bbox = tf.concat(pred_bbox, axis=0)

            bboxes = postprocess_boxes(pred_bbox, original_frame, input_size,
                                       score_threshold)
            bboxes = nms(bboxes, iou_threshold, method='nms')

            # Extract bboxes to boxes (x, y, width, height), scores and names
            boxes, scores, names = [], [], []
            for bbox in bboxes:
                class_name = NUM_CLASS[int(bbox[5])]
                if len(Track_only) == 0 or class_name in Track_only:
                    x1, y1 = bbox[0].astype(int), bbox[1].astype(int)
                    x2, y2 = bbox[2].astype(int), bbox[3].astype(int)
                    boxes.append([x1, y1, x2 - x1, y2 - y1])
                    scores.append(bbox[4])
                    names.append(class_name)

            # Obtain all the detections for the given frame.
            boxes = np.array(boxes)
            names = np.array(names)
            scores = np.array(scores)
            features = np.array(encoder(original_frame, boxes))
            detections = [
                Detection(box, score, class_name, feature)
                for box, score, class_name, feature
                in zip(boxes, scores, names, features)
            ]

            # Pass detections to the deepsort object and obtain the tracks.
            tracker.predict()
            tracker.update(detections)

            tracked_bboxes = []
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 5:
                    continue
                bbox = track.to_tlbr()  # corrected/predicted bounding box
                class_name = track.get_class()
                tracking_id = track.track_id
                # Map the class name back to its index.
                index = key_list[val_list.index(class_name)]
                # Layout expected by draw_bbox with tracking=True.
                tracked_bboxes.append(bbox.tolist() + [tracking_id, index])
                # Save to file
                ts.save(bbox.tolist() + [tracking_id, index, frame_no])

            # Draw detection on frame
            image = draw_bbox(original_frame, tracked_bboxes, CLASSES=CLASSES,
                              tracking=True)

            t3 = time.time()
            times.append(t2 - t1)
            times_2.append(t3 - t1)
            # Rolling average over the last 20 frames.
            times = times[-20:]
            times_2 = times_2[-20:]

            ms = sum(times) / len(times) * 1000
            fps = 1000 / ms
            fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

            image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30),
                                cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                                (0, 0, 255), 2)

            print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2))
            if output_path != '':
                out.write(image)
            if show:
                cv2.imshow('output', image)
                if cv2.waitKey(25) & 0xFF == ord("q"):
                    cv2.destroyAllWindows()
                    break
    finally:
        # Release the capture and finalize the output file even on errors;
        # the original never released either object.
        vid.release()
        out.release()
        cv2.destroyAllWindows()

    # No frame read -> no background image; imwrite would reject None.
    if bgMask is not None:
        cv2.imwrite(ts.out_bg_img, bgMask)
def Object_tracking(Yolo, video_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors='', Track_only = [], custom_yolo=None, custom_classes=YOLO_CUSTOM_CLASSES, Custom_track_only=[]):
    """Track players, ball possession and made baskets in a video.

    Per frame:
      1. If ``custom_yolo`` is given, it is run to find the best-scoring
         ``'basketball'`` and ``'made-basket'`` detections.  A basket is
         counted after 3 consecutive made-basket frames and credited to the
         team currently in possession; counting is re-armed 60 frames later.
      2. ``Yolo`` detections are size-filtered, colour-classified with
         ``find_color`` and tracked with DeepSORT.
      3. If the ball centre lies inside a confirmed track's box, that track's
         colour ratio (``<= .2`` light, otherwise dark) is appended to the
         possession history, from which the current possession is derived.

    Args:
        Yolo: Person model (``predict`` for "tf", direct call for "trt").
        video_path: Video file path; a falsy value selects camera 0.
        output_path: Annotated video path; frames are written only when this
            is not ``''``.
        input_size: Side length used for preprocessing / post-processing.
        show: When truthy, display frames and stop on the ``q`` key.
        CLASSES: Class file for ``Yolo``.
        score_threshold: Forwarded to ``postprocess_boxes`` for ``Yolo``.
        iou_threshold: Forwarded to ``nms`` for ``Yolo``.
        rectangle_colors: Unused in this variant.
        Track_only: Class names to keep from ``Yolo``; empty keeps all.
        custom_yolo: Optional ball / made-basket model ("tf" only).
        custom_classes: Class file for ``custom_yolo``.
        Custom_track_only: Class names to keep from ``custom_yolo``.

    Returns:
        ``{"baskets_dict": {...}, "basket_frame_list": [...]}``.

    Raises:
        ValueError: If ``YOLO_FRAMEWORK`` is unsupported for a model in use.
    """
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize deep sort object
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    # A falsy path means "detect from webcam".
    vid = cv2.VideoCapture(video_path if video_path else 0)

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, video_fps, (width, height))

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    # Flags and state for made baskets and possessions
    possession = None
    possession_list = []
    combined_possession_avg = 0.5
    basket_frame_list = []
    baskets_dict = {"Dark": 0, "Light": 0}
    made_basket_first_frame = 0
    made_basket_frames = 0
    basket_marked = False

    if custom_yolo:
        NUM_CUSTOM_CLASS = read_class_names(custom_classes)

    frame_counter = 0
    try:
        # Loop through each frame in video
        while True:
            ok, frame = vid.read()
            if not ok or frame is None:
                # End of stream (the original relied on cvtColor raising).
                break
            first_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_frame = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
            frame_counter += 1

            image_data = image_preprocess(np.copy(first_frame),
                                          [input_size, input_size])
            image_data = image_data[np.newaxis, ...].astype(np.float32)

            t1 = time.time()

            # Defined for every frame so the possession check below is safe
            # even when no custom model is supplied.
            basketball_center = None

            # CUSTOM BLOCK FOR BASKETBALL
            if custom_yolo:
                if YOLO_FRAMEWORK != "tf":
                    raise ValueError(
                        "custom_yolo is only supported with the 'tf' "
                        f"framework, got {YOLO_FRAMEWORK!r}")
                custom_pred_bbox = custom_yolo.predict(image_data)
                custom_pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1]))
                                    for x in custom_pred_bbox]
                custom_pred_bbox = tf.concat(custom_pred_bbox, axis=0)

                # Get boxes based on threshold; NMS is intentionally not
                # applied, the best box per class is picked below instead.
                custom_bboxes = postprocess_boxes(
                    custom_pred_bbox, original_frame, input_size, 0.3)

                # Keep box, name and score together.  The original indexed
                # the filtered names/scores with the index of the unfiltered
                # boxes, which misaligned them (or raised IndexError) as soon
                # as Custom_track_only removed a detection.
                custom_detections = []
                for bbox in custom_bboxes:
                    name = NUM_CUSTOM_CLASS[int(bbox[5])]
                    if len(Custom_track_only) == 0 or name in Custom_track_only:
                        custom_detections.append((bbox, name, bbox[4]))

                # The model sometimes detects two of an object that can only
                # exist once, so keep the highest-scoring one of each.
                highest_scoring_basketball = 0
                basketball_box = None
                highest_scoring_made_basket = 0
                made_basket_box = None
                for bbox, name, raw_score in custom_detections:
                    score = round(raw_score, 3)
                    if name == 'basketball':
                        if score > highest_scoring_basketball:
                            highest_scoring_basketball = score
                            basketball_box = bbox
                    if name == 'made-basket':
                        if score > .85 and score > highest_scoring_made_basket:
                            highest_scoring_made_basket = score
                            made_basket_box = bbox

                # Box the basketball and note its centre (for possession)
                if basketball_box is not None:
                    cv2.rectangle(original_frame, (int(basketball_box[0]), int(basketball_box[1])), (int(basketball_box[2]), int(basketball_box[3])), (0,0,255), 1)
                    cv2.rectangle(original_frame, (int(basketball_box[0]), int(basketball_box[1]-30)), (int(basketball_box[0])+(10)*17, int(basketball_box[1])), (0,0,255), -1)
                    cv2.putText(original_frame, "basketball" + "-" + str(highest_scoring_basketball),(int(basketball_box[0]), int(basketball_box[1]-10)),0, 0.5, (255,255,255),1)
                    basketball_center = (
                        (basketball_box[2] + basketball_box[0]) / 2,
                        (basketball_box[3] + basketball_box[1]) / 2,
                    )

                if made_basket_box is not None:
                    # Box the made basket
                    cv2.rectangle(original_frame, (int(made_basket_box[0]), int(made_basket_box[1])), (int(made_basket_box[2]), int(made_basket_box[3])), (0,255,0), 1)
                    cv2.rectangle(original_frame, (int(made_basket_box[0]), int(made_basket_box[1]-30)), (int(made_basket_box[0])+(15)*17, int(made_basket_box[1])), (0,255,0), -1)
                    cv2.putText(original_frame, "made-basket" + " " + str(highest_scoring_made_basket),(int(made_basket_box[0]), int(made_basket_box[1]-10)),0, 0.6, (0,0,0),1)

                    if made_basket_frames == 0:
                        # First frame of the sequence
                        made_basket_first_frame = frame_counter
                    made_basket_frames += 1

                    # Three consecutive frames and not yet marked: count it.
                    if made_basket_frames >= 3 and not basket_marked:
                        basket_marked = True
                        basket_frame_list.append(made_basket_first_frame)
                        if possession:
                            # Record which "team" scored the basket
                            baskets_dict[possession] += 1
                else:
                    # No made basket: reset the consecutive-frame counter
                    made_basket_frames = 0

                # 60 frames after a made basket, start looking again.
                if basket_marked and frame_counter > basket_frame_list[-1] + 60:
                    basket_marked = False
            # END CUSTOM BLOCK

            # PERSON PREDICTION and TRACKING BLOCK
            if YOLO_FRAMEWORK == "tf":
                pred_bbox = Yolo.predict(image_data)
            elif YOLO_FRAMEWORK == "trt":
                result = Yolo(tf.constant(image_data))
                pred_bbox = [value.numpy() for value in result.values()]
            else:
                raise ValueError(
                    f"Unsupported YOLO_FRAMEWORK: {YOLO_FRAMEWORK!r}")

            t2 = time.time()

            pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            pred_bbox = tf.concat(pred_bbox, axis=0)

            bboxes = postprocess_boxes(pred_bbox, original_frame, input_size,
                                       score_threshold)
            bboxes = nms(bboxes, iou_threshold, method='nms')

            # Extract bboxes to boxes (x, y, width, height), scores and names
            boxes, scores, names = [], [], []
            for bbox in bboxes:
                class_name = NUM_CLASS[int(bbox[5])]
                if len(Track_only) == 0 or class_name in Track_only:
                    w = bbox[2].astype(int) - bbox[0].astype(int)
                    h = bbox[3].astype(int) - bbox[1].astype(int)
                    # Size gate: taller than 120 px, but under a third of the
                    # frame height and a quarter of its width.
                    if 120 < h < height / 3 and w < width / 4:
                        boxes.append([bbox[0].astype(int),
                                      bbox[1].astype(int), w, h])
                        scores.append(bbox[4])
                        names.append(class_name)

            # Obtain all the detections for the given frame.
            boxes = np.array(boxes)
            names = np.array(names)
            scores = np.array(scores)

            # Detect jersey colour using each person's bounding box
            patches = [gdet.extract_image_patch(frame, box, [box[3], box[2]])
                       for box in boxes]
            color_ratios = [find_color(patch) for patch in patches]

            features = np.array(encoder(original_frame, boxes))

            detections = [
                Detection(box, score, class_name, feature, color_ratio)
                for box, score, class_name, feature, color_ratio
                in zip(boxes, scores, names, features, color_ratios)
            ]

            # Pass detections to the deepsort object and obtain the tracks.
            tracker.predict()
            tracker.update(detections)

            tracked_bboxes = []
            color_ratio_list = []
            check_possession = False
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 5:
                    continue
                color_ratio = track.get_color_ratio()
                color_ratio_list.append(color_ratio)
                bbox = track.to_tlbr()  # corrected/predicted bounding box
                class_name = track.get_class()
                tracking_id = track.track_id
                # Map the class name back to its index.
                index = key_list[val_list.index(class_name)]
                # Layout expected by draw_bbox with tracking=True.
                tracked_bboxes.append(bbox.tolist() + [tracking_id, index])

                # If the ball centre is inside this person's box, record
                # which team the person belongs to.
                if basketball_center is not None:
                    inside_x = bbox[0] <= basketball_center[0] <= bbox[2]
                    inside_y = bbox[1] <= basketball_center[1] <= bbox[3]
                    if inside_x and inside_y:
                        check_possession = True
                        # 0 = light team, 1 = dark team
                        possession_list.append(0 if color_ratio <= .2 else 1)

            # If the ball is in a bounding box, update the possession tracker
            if check_possession:
                if len(possession_list) > 60:
                    # Average of nested windows over the last 60 marks, so
                    # the most recent detections weigh more.  WIP heuristic.
                    possession_list = possession_list[-60:]
                    last_60_avg = sum(possession_list[-60:]) / 60
                    last_30_avg = sum(possession_list[-30:]) / 30
                    last_15_avg = sum(possession_list[-15:]) / 15
                    last_5_avg = sum(possession_list[-5:]) / 5
                    combined_possession_avg = round(
                        (last_60_avg + last_30_avg + last_15_avg + last_5_avg) / 4, 3)
                else:
                    combined_possession_avg = round(
                        sum(possession_list) / len(possession_list), 3)

                # An average of exactly 0.5 leaves possession unchanged.
                if combined_possession_avg < 0.5:
                    possession = "Light"
                elif combined_possession_avg > 0.5:
                    possession = "Dark"

            # Draw detection on frame
            image = draw_bbox(original_frame, tracked_bboxes,
                              color_ratios=color_ratio_list, CLASSES=CLASSES,
                              tracking=True)

            t3 = time.time()
            times.append(t2 - t1)
            times_2.append(t3 - t1)
            # Rolling average over the last 20 frames.
            times = times[-20:]
            times_2 = times_2[-20:]

            ms = sum(times) / len(times) * 1000
            fps = 1000 / ms
            fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

            possession_color = (50, 255, 255) if possession == "Light" else (0, 0, 255)
            image = cv2.putText(image, "Posession: {}".format(possession), (width-400, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, possession_color, 2)
            image = cv2.putText(image, "Posession Avg: {}".format(combined_possession_avg), (400, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
            image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)

            print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2))
            if output_path != '':
                out.write(image)
            if show:
                cv2.imshow('output', image)
                if cv2.waitKey(25) & 0xFF == ord("q"):
                    cv2.destroyAllWindows()
                    break
    finally:
        # Release the capture and finalize the output file even on errors;
        # the original never released either object.
        vid.release()
        out.release()
        cv2.destroyAllWindows()

    return_data = {"baskets_dict": baskets_dict,
                   "basket_frame_list": basket_frame_list}
    print("video saved to {}".format(output_path))
    return return_data
# video_path = "./IMAGES/street_drive.mp4"
# Build the model and load the Darknet weights (paths are relative to the
# parent directory of this script/notebook).
yolo = Create_Yolov3(input_size=input_size, CLASSES='../' + YOLO_COCO_CLASSES)
load_yolo_weights(yolo, '../' + Darknet_weights)  # use Darknet weights
print(f'weight data load ok {Darknet_weights}')

# %%
image_path = "../IMAGES/kite.jpg"

# Load and preprocess the image
original_image = cv2.imread(image_path)
original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)

# Converts 0~255 values to floats between 0 and 1 and fits the image inside
# the input_size x input_size (416) canvas.
image_data = image_preprocess(np.copy(original_image), [input_size, input_size])

# plt.figure()
# plt.imshow(image_data)
# plt.colorbar()
# plt.grid(False)
# plt.show()

# Scale back to 0~255 for display.  The original multiplied by 256, which
# maps a full-intensity value of 1.0 to 256 and wraps to 0 on the uint8 cast.
display(Image.fromarray(np.clip(image_data * 255, 0, 255).astype('uint8')))

# %%
# Compute the detection boxes: add the batch dimension expected by the model
image_data = tf.expand_dims(image_data, 0)
YoloV3 = yolo
def get_mAP(model, dataset, score_threshold=0.25, iou_threshold=0.50, TEST_INPUT_SIZE=TEST_INPUT_SIZE):
    """Evaluate ``model`` on ``dataset`` and return the mAP in percent.

    Steps:
      1. Recreate ``mAP/ground-truth`` and write one
         ``<index>_ground_truth.json`` per sample.
      2. Run the model on every sample and write one
         ``<class>_predictions.json`` per ground-truth class, sorted by
         descending confidence.
      3. Match predictions to ground truth greedily by IoU, compute
         precision/recall and the AP through ``voc_ap`` per class, and write
         a report to ``mAP/results.txt``.

    Args:
        model: Object exposing ``predict``.
        dataset: Object exposing ``num_samples``, ``annotations`` and
            ``parse_annotation(annotation, True)``.
        score_threshold: Forwarded to ``postprocess_boxes``.
        iou_threshold: Forwarded to ``nms`` and used in the progress message.
            NOTE(review): the TP/FP matching uses the fixed ``MINOVERLAP``
            below, not this value.
        TEST_INPUT_SIZE: Side length used for preprocessing.

    Returns:
        ``mAP * 100`` (``0.0`` when the dataset has no ground-truth class).
    """
    from itertools import accumulate

    MINOVERLAP = 0.5  # default value (defined in the PASCAL VOC2012 challenge)
    NUM_CLASS = read_class_names(TRAIN_CLASSES)

    ground_truth_dir_path = 'mAP/ground-truth'
    if os.path.exists(ground_truth_dir_path):
        shutil.rmtree(ground_truth_dir_path)
    # Creates 'mAP' as well when it is missing.
    os.makedirs(ground_truth_dir_path)

    print(f'\ncalculating mAP{int(iou_threshold*100)}...\n')

    # ---- 1. dump ground truth -------------------------------------------
    gt_counter_per_class = {}
    for index in range(dataset.num_samples):
        ann_dataset = dataset.annotations[index]
        _, bbox_data_gt = dataset.parse_annotation(ann_dataset, True)

        if len(bbox_data_gt) == 0:
            bboxes_gt, classes_gt = [], []
        else:
            bboxes_gt, classes_gt = bbox_data_gt[:, :4], bbox_data_gt[:, 4]

        bounding_boxes = []
        for box_gt, class_gt in zip(bboxes_gt, classes_gt):
            class_name = NUM_CLASS[class_gt]
            bounding_boxes.append({
                "class_name": class_name,
                "bbox": " ".join(map(str, box_gt)),
                "used": False
            })
            # Count that object
            gt_counter_per_class[class_name] = (
                gt_counter_per_class.get(class_name, 0) + 1)

        with open(f'{ground_truth_dir_path}/{str(index)}_ground_truth.json', 'w') as outfile:
            json.dump(bounding_boxes, outfile)

    # Sort the classes alphabetically
    gt_classes = sorted(gt_counter_per_class)
    n_classes = len(gt_classes)
    class_slot = {name: slot for slot, name in enumerate(gt_classes)}

    # ---- 2. run the model and dump predictions --------------------------
    times = []
    json_pred = [[] for _ in range(n_classes)]
    for index in range(dataset.num_samples):
        ann_dataset = dataset.annotations[index]
        original_image, _ = dataset.parse_annotation(ann_dataset, True)

        image = image_preprocess(np.copy(original_image),
                                 [TEST_INPUT_SIZE, TEST_INPUT_SIZE])
        image_data = tf.expand_dims(image, 0)

        t1 = time.time()
        pred_bbox = model.predict(image_data)
        t2 = time.time()
        times.append(t2 - t1)

        # Flatten every output head to (N, C) and stack them.
        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, TEST_INPUT_SIZE,
                                   score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        for bbox in bboxes:
            class_name = NUM_CLASS[int(bbox[5])]
            slot = class_slot.get(class_name)
            if slot is None:
                # A class without any ground-truth box has no AP entry; the
                # original crashed here with ValueError from list.index.
                continue
            coor = np.array(bbox[:4], dtype=np.int32)
            json_pred[slot].append({
                "confidence": '%.4f' % bbox[4],
                "file_id": str(index),
                "bbox": " ".join(map(str, coor))
            })

    # Guard the timing statistics against an empty dataset.
    ms = sum(times) / len(times) * 1000 if times else 0.0
    fps = 1000 / ms if ms > 0 else 0.0

    for class_name in gt_classes:
        class_predictions = json_pred[class_slot[class_name]]
        class_predictions.sort(key=lambda x: float(x['confidence']),
                               reverse=True)
        with open(f'{ground_truth_dir_path}/{class_name}_predictions.json', 'w') as outfile:
            json.dump(class_predictions, outfile)

    # ---- 3. calculate the AP for each class -----------------------------
    sum_AP = 0.0
    with open("mAP/results.txt", 'w') as results_file:
        results_file.write("# AP and precision/recall per class\n")
        for class_name in gt_classes:
            # Load predictions of that class
            predictions_file = f'{ground_truth_dir_path}/{class_name}_predictions.json'
            with open(predictions_file) as infile:
                predictions_data = json.load(infile)

            # Assign predictions to ground truth objects
            nd = len(predictions_data)
            tp = [0] * nd
            fp = [0] * nd
            for idx, prediction in enumerate(predictions_data):
                file_id = prediction["file_id"]
                # The ground-truth file carries the "used" flags, so it is
                # re-read for every prediction and rewritten on every match.
                gt_file = f'{ground_truth_dir_path}/{str(file_id)}_ground_truth.json'
                with open(gt_file) as infile:
                    ground_truth_data = json.load(infile)

                ovmax = -1
                gt_match = None
                bb = [float(x) for x in prediction["bbox"].split()]
                for obj in ground_truth_data:
                    # Look for a class_name match
                    if obj["class_name"] != class_name:
                        continue
                    bbgt = [float(x) for x in obj["bbox"].split()]
                    bi = [
                        max(bb[0], bbgt[0]), max(bb[1], bbgt[1]),
                        min(bb[2], bbgt[2]), min(bb[3], bbgt[3])
                    ]
                    iw = bi[2] - bi[0] + 1
                    ih = bi[3] - bi[1] + 1
                    if iw > 0 and ih > 0:
                        # IoU = area of intersection / area of union
                        ua = ((bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1)
                              + (bbgt[2] - bbgt[0] + 1) * (bbgt[3] - bbgt[1] + 1)
                              - iw * ih)
                        ov = iw * ih / ua
                        if ov > ovmax:
                            ovmax = ov
                            gt_match = obj

                # Assign prediction as true positive / false positive
                if ovmax >= MINOVERLAP and not bool(gt_match["used"]):
                    # True positive
                    tp[idx] = 1
                    gt_match["used"] = True
                    # Update the ".json" file
                    with open(gt_file, 'w') as f:
                        f.write(json.dumps(ground_truth_data))
                else:
                    # False positive (low overlap or multiple detection)
                    fp[idx] = 1

            # Compute precision/recall from the running TP / FP counts
            fp = list(accumulate(fp))
            tp = list(accumulate(tp))
            gt_count = gt_counter_per_class[class_name]
            rec = [float(tp_n) / gt_count for tp_n in tp]
            prec = [float(tp_n) / (fp_n + tp_n) for tp_n, fp_n in zip(tp, fp)]

            ap, mrec, mprec = voc_ap(rec, prec)
            sum_AP += ap
            text = "{0:.3f}%".format(ap * 100) + " = " + class_name + " AP "

            rounded_prec = ['%.3f' % elem for elem in prec]
            rounded_rec = ['%.3f' % elem for elem in rec]
            # Write to results.txt
            results_file.write(text + "\n Precision: " + str(rounded_prec) +
                               "\n Recall :" + str(rounded_rec) + "\n\n")
            print(text)

        results_file.write("\n# mAP of all classes\n")
        # No ground-truth class at all -> report 0 instead of dividing by 0.
        mAP = sum_AP / n_classes if n_classes else 0.0

        text = "mAP = {:.3f}%, {:.2f} FPS".format(mAP * 100, fps)
        results_file.write(text + "\n")
        print(text)

        return mAP * 100
def Object_tracking(YoloV3, video_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors='', Track_only=[]):
    """Detect objects with YOLO and track them across frames with DeepSORT.

    Args:
        YoloV3: Model object exposing ``predict``.
        video_path: Video file path; a falsy value selects camera 0.
        output_path: Destination of the annotated video.  Frames are only
            written when this is not ``''``.
        input_size: Side length used for preprocessing / post-processing.
        show: When truthy, display frames and stop on the ``q`` key.
        CLASSES: Forwarded to ``read_class_names`` and ``draw_bbox``.
        score_threshold: Forwarded to ``postprocess_boxes``.
        iou_threshold: Forwarded to ``nms``.
        rectangle_colors: Unused in this variant.
        Track_only: Class names to keep; empty keeps every class.  The list
            is only read, never mutated.
    """
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize deep sort object
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times = []

    # A falsy path means "detect from webcam".
    vid = cv2.VideoCapture(video_path if video_path else 0)

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, video_fps, (width, height))

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    try:
        while True:
            ok, img = vid.read()
            if not ok or img is None:
                # End of stream (the original relied on cvtColor raising).
                break

            # NOTE(review): conversion applied twice, as in the original.
            original_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)

            image_data = image_preprocess(np.copy(original_image),
                                          [input_size, input_size])
            image_data = tf.expand_dims(image_data, 0)

            t1 = time.time()
            pred_bbox = YoloV3.predict(image_data)
            t2 = time.time()

            # Rolling window of the last 20 inference times.
            times.append(t2 - t1)
            times = times[-20:]

            # Flatten every output head to (N, C) and stack them.
            pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            pred_bbox = tf.concat(pred_bbox, axis=0)

            bboxes = postprocess_boxes(pred_bbox, original_image, input_size,
                                       score_threshold)
            bboxes = nms(bboxes, iou_threshold, method='nms')

            # Extract bboxes to boxes (x, y, width, height), scores and names
            boxes, scores, names = [], [], []
            for bbox in bboxes:
                class_name = NUM_CLASS[int(bbox[5])]
                if len(Track_only) == 0 or class_name in Track_only:
                    x1, y1 = bbox[0].astype(int), bbox[1].astype(int)
                    x2, y2 = bbox[2].astype(int), bbox[3].astype(int)
                    boxes.append([x1, y1, x2 - x1, y2 - y1])
                    scores.append(bbox[4])
                    names.append(class_name)

            # Obtain all the detections for the given frame.
            boxes = np.array(boxes)
            names = np.array(names)
            scores = np.array(scores)
            features = np.array(encoder(original_image, boxes))
            detections = [
                Detection(box, score, class_name, feature)
                for box, score, class_name, feature
                in zip(boxes, scores, names, features)
            ]

            # Pass detections to the deepsort object and obtain the tracks.
            tracker.predict()
            tracker.update(detections)

            tracked_bboxes = []
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 5:
                    continue
                bbox = track.to_tlbr()  # corrected/predicted bounding box
                class_name = track.get_class()
                tracking_id = track.track_id
                # Map the class name back to its index.
                index = key_list[val_list.index(class_name)]
                # Layout expected by draw_bbox with tracking=True.
                tracked_bboxes.append(bbox.tolist() + [tracking_id, index])

            ms = sum(times) / len(times) * 1000
            fps = 1000 / ms

            # Draw detection on frame
            image = draw_bbox(original_image, tracked_bboxes, CLASSES=CLASSES,
                              tracking=True)
            image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30),
                                cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                                (0, 0, 255), 2)

            if output_path != '':
                out.write(image)
            if show:
                cv2.imshow('output', image)
                if cv2.waitKey(25) & 0xFF == ord("q"):
                    cv2.destroyAllWindows()
                    break
    finally:
        # Release the capture and finalize the output file even on errors;
        # the original never released either object.
        vid.release()
        out.release()
        cv2.destroyAllWindows()
def track_object(Yolo,
                 video_path,
                 vid_output_path,
                 text_output_path,
                 input_size=416,
                 show=False,
                 CLASSES=YOLO_COCO_CLASSES,
                 score_threshold=0.3,
                 iou_threshold=0.45,
                 rectangle_colors='',
                 tracking=True,
                 track_only=(),
                 tracker_max_age=30,
                 passenger_det=False,
                 face_score_threshold=0.3,
                 color="bincount"):
    """
    Do detection and tracking on video

    :param Yolo: <model_obj> YOLO model for vehicle detection. A falsy value
        makes the function load one with ``load_yolo_model``
    :param video_path: <str> Path to video file. Leave empty to use camera
    :param vid_output_path: <str> Path to save processed video. Leave empty
        to not save
    :param text_output_path: <str> Path handed to ``write_csv`` (header row)
        and to ``draw_bbox``. Leave empty to skip writing the header
    :param input_size: <int> YOLO model input size
    :param show: <bool> True if you want to see processing live
    :param CLASSES: <obj> YOLO model classes. By default they are taken from
        the config file
    :param score_threshold: <float> minimum confidence for vehicle detection
    :param iou_threshold: <float> threshold forwarded to ``nms``
    :param rectangle_colors: bounding box colors. Currently does nothing
    :param tracking: currently not read by this function
    :param track_only: <list> List of objects to track if detector detects
        more. Empty means every class is tracked
    :param tracker_max_age: <int> forwarded to ``Tracker`` as ``max_age``
    :param passenger_det: <bool> whether to create a ``FaceDetector`` and
        hand it to ``draw_bbox``
    :param face_score_threshold: <float> forwarded to ``draw_bbox`` as
        ``passenger_threshold``
    :param color: <str> Color detection method to use. None if neither one
    :return: None
    :raises ValueError: if ``YOLO_FRAMEWORK`` is neither "tf" nor "trt"
    """
    if not Yolo:
        Yolo = load_yolo_model()

    passenger_detector = FaceDetector() if passenger_det else None

    if text_output_path:
        # Header row; optional columns are written as None when disabled.
        write_csv([[
            "x1", "y1", "x2", "y2", "id", "class", "probability",
            "color" if color else None,
            "passengers" if passenger_detector else None
        ]], text_output_path)

    # Deep SORT parameters.
    max_cosine_distance = 0.7
    nn_budget = None

    # Deep SORT appearance encoder and tracker.
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                       max_cosine_distance,
                                                       nn_budget)
    tracker = Tracker(metric, max_age=tracker_max_age)

    times = []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)  # detect from webcam

    # VideoCapture reports these properties as floats.
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    source_fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(vid_output_path, codec, source_fps, (width, height))

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    # Loop-invariant: an empty filter means every class is tracked.
    track_all = len(track_only) == 0

    try:
        while True:
            grabbed, frame = vid.read()
            if not grabbed:
                # End of stream (or the device stopped delivering frames).
                break

            # NOTE: the two conversions cancel each other, so the channel
            # order stays exactly as it was read from the capture.
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_frame = cv2.cvtColor(original_frame, cv2.COLOR_BGR2RGB)

            image_data = image_preprocess(np.copy(original_frame),
                                          [input_size, input_size])
            image_data = image_data[np.newaxis, ...].astype(np.float32)

            t1 = time.time()
            if YOLO_FRAMEWORK == "tf":
                pred_bbox = Yolo.predict(image_data)
            elif YOLO_FRAMEWORK == "trt":
                batched_input = tf.constant(image_data)
                result = Yolo(batched_input)
                pred_bbox = [value.numpy() for value in result.values()]
            else:
                # Previously this fell through and failed later with a
                # NameError on the unbound ``pred_bbox``.
                raise ValueError(
                    "Unsupported YOLO_FRAMEWORK: {!r}".format(YOLO_FRAMEWORK))
            t2 = time.time()

            pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            pred_bbox = tf.concat(pred_bbox, axis=0)
            bboxes = postprocess_boxes(pred_bbox, original_frame, input_size,
                                       score_threshold)
            bboxes = nms(bboxes, iou_threshold, method='nms')

            # Convert (x1, y1, x2, y2, score, class) rows into
            # (x, y, width, height) boxes plus parallel score/name lists.
            boxes, scores, names = [], [], []
            for bbox in bboxes:
                class_name = NUM_CLASS[int(bbox[5])]
                if not track_all and class_name not in track_only:
                    continue
                left = bbox[0].astype(int)
                top = bbox[1].astype(int)
                boxes.append([
                    left, top,
                    bbox[2].astype(int) - left,
                    bbox[3].astype(int) - top
                ])
                scores.append(bbox[4])
                names.append(class_name)

            # Build the Deep SORT detections for this frame.
            boxes = np.array(boxes)
            names = np.array(names)
            scores = np.array(scores)
            features = np.array(encoder(original_frame, boxes))
            detections = [
                Detection(box, score, class_name, feature)
                for box, score, class_name, feature in zip(
                    boxes, scores, names, features)
            ]

            # Advance the tracker with the new detections.
            tracker.predict()
            tracker.update(detections)

            # Collect [x1, y1, x2, y2, track_id, class_index, confidence]
            # for every confirmed track that was updated recently.
            tracked_bboxes = []
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 5:
                    continue
                bbox = track.to_tlbr()
                class_name = track.get_class()
                tracking_id = track.track_id
                index = key_list[val_list.index(class_name)]
                tracked_bboxes.append(
                    bbox.tolist() +
                    [tracking_id, index, track.class_confidence])

            # Draw the tracked boxes on the frame.
            image = draw_bbox(original_frame,
                              tracked_bboxes,
                              CLASSES=CLASSES,
                              tracking=True,
                              color=color,
                              text_output_path=text_output_path,
                              passenger_detector=passenger_detector,
                              passenger_threshold=face_score_threshold)

            # Detection rate averaged over the last 20 inference durations.
            times.append(t2 - t1)
            times = times[-20:]
            ms = sum(times) / len(times) * 1000
            detection_fps = 1000 / ms

            image = cv2.putText(image,
                                "Time: {:.1f} FPS".format(detection_fps),
                                (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                                (0, 0, 255), 2)

            if vid_output_path != '':
                out.write(image)
            if show:
                cv2.imshow('output', image)
                if cv2.waitKey(25) & 0xFF == ord("q"):
                    break
    finally:
        # Release the capture and flush/close the writer even on errors.
        vid.release()
        out.release()
        cv2.destroyAllWindows()
def Object_tracking(Yolo,
                    video_path,
                    output_path,
                    input_size=416,
                    show=True,
                    CLASSES=YOLO_COCO_CLASSES,
                    score_threshold=0.3,
                    iou_threshold=0.45,
                    rectangle_colors='',
                    Track_only=(),
                    annotation_csv_path=None,
                    encoder_model_path=None):
    """
    Track objects on a video and append per-frame annotations to a CSV file.

    Every frame is run through the YOLO model, filtered by ``Track_only`` and
    handed to a Deep SORT ``Tracker``. The CSV writer is passed to
    ``draw_bbox`` together with the tracked boxes, frame index and scores.

    :param Yolo: YOLO model; ``predict`` is used when ``YOLO_FRAMEWORK`` is
        "tf", the object is called directly when it is "trt"
    :param video_path: <str> input video; a falsy value opens capture device 0
    :param output_path: <str> output video; frames are only written when this
        is not ''
    :param input_size: <int> size handed to ``image_preprocess`` and
        ``postprocess_boxes``
    :param show: <bool> display every processed frame; pressing "q" stops
    :param CLASSES: forwarded to ``read_class_names`` and ``draw_bbox``
    :param score_threshold: <float> forwarded to ``postprocess_boxes``
    :param iou_threshold: <float> forwarded to ``nms``
    :param rectangle_colors: not used by this function
    :param Track_only: class names to keep; empty means keep every class
    :param annotation_csv_path: <str> CSV file opened in append mode; None
        selects the path that used to be hard-coded
    :param encoder_model_path: <str> Deep SORT encoder model file; None
        selects the path that used to be hard-coded
    :return: None
    :raises ValueError: if ``YOLO_FRAMEWORK`` is neither "tf" nor "trt"
    """
    if annotation_csv_path is None:
        annotation_csv_path = "D:/PELUSO/ITSligo/lectures_MENG/4-Symulation and Testing/assignments/assignment 2 - group/simV5_anotation.CSV"
    if encoder_model_path is None:
        encoder_model_path = "D:/PELUSO/ITSligo/lectures_MENG/4-Symulation and Testing/assignments/assignment 2 - group/TensorFlow-2.x-YOLOv3-master/model_data/mars-small128.pb"

    # Deep SORT parameters.
    max_cosine_distance = 0.9
    nn_budget = None

    # newline='' lets the csv module control line endings itself.
    csv_file = open(annotation_csv_path, mode='a', newline='')
    vid = None
    out = None
    try:
        results_csv = csv.writer(csv_file,
                                 delimiter=',',
                                 quotechar='"',
                                 quoting=csv.QUOTE_MINIMAL)
        results_csv.writerow([
            'Frame_index', 'Score', 'Confidence', 'Pixel_Area', 'X1', 'Y1',
            'X2', 'Y2', 'ClassID'
        ])

        # Deep SORT appearance encoder and tracker.
        encoder = gdet.create_box_encoder(encoder_model_path, batch_size=1)
        metric = nn_matching.NearestNeighborDistanceMetric(
            "cosine", max_cosine_distance, nn_budget)
        tracker = Tracker(metric)

        times, times_2 = [], []

        if video_path:
            vid = cv2.VideoCapture(video_path)  # detect on video
        else:
            vid = cv2.VideoCapture(0)  # detect from webcam

        # VideoCapture reports these properties as floats.
        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
        source_fps = int(vid.get(cv2.CAP_PROP_FPS))
        codec = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, codec, source_fps, (width, height))
        print("FPS:::::" + str(source_fps))

        NUM_CLASS = read_class_names(CLASSES)
        key_list = list(NUM_CLASS.keys())
        val_list = list(NUM_CLASS.values())

        # Loop-invariant: an empty filter means every class is tracked.
        track_all = len(Track_only) == 0

        frame_idx = 0
        while True:
            grabbed, frame = vid.read()
            if not grabbed:
                # End of stream (or the device stopped delivering frames).
                break

            # NOTE: the two conversions cancel each other, so the channel
            # order stays exactly as it was read from the capture.
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_frame = cv2.cvtColor(original_frame, cv2.COLOR_BGR2RGB)

            image_data = image_preprocess(np.copy(original_frame),
                                          [input_size, input_size])
            image_data = image_data[np.newaxis, ...].astype(np.float32)

            t1 = time.time()
            if YOLO_FRAMEWORK == "tf":
                pred_bbox = Yolo.predict(image_data)
            elif YOLO_FRAMEWORK == "trt":
                batched_input = tf.constant(image_data)
                result = Yolo(batched_input)
                pred_bbox = [value.numpy() for value in result.values()]
            else:
                # Previously this fell through and failed later with a
                # NameError on the unbound ``pred_bbox``.
                raise ValueError(
                    "Unsupported YOLO_FRAMEWORK: {!r}".format(YOLO_FRAMEWORK))
            t2 = time.time()

            pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            pred_bbox = tf.concat(pred_bbox, axis=0)
            bboxes = postprocess_boxes(pred_bbox, original_frame, input_size,
                                       score_threshold)
            bboxes = nms(bboxes, iou_threshold, method='nms', sigma=0.4)
            print(np.argmax(pred_bbox[:, 5:], axis=-1))

            # Convert (x1, y1, x2, y2, score, class) rows into
            # (x, y, width, height) boxes plus parallel score/name lists.
            boxes, scores, names = [], [], []
            for bbox in bboxes:
                class_name = NUM_CLASS[int(bbox[5])]
                if not track_all and class_name not in Track_only:
                    continue
                left = bbox[0].astype(int)
                top = bbox[1].astype(int)
                boxes.append([
                    left, top,
                    bbox[2].astype(int) - left,
                    bbox[3].astype(int) - top
                ])
                scores.append(bbox[4])
                names.append(class_name)

            # Build the Deep SORT detections for this frame.
            boxes = np.array(boxes)
            names = np.array(names)
            scores = np.array(scores)
            features = np.array(encoder(original_frame, boxes))
            detections = [
                Detection(box, score, class_name, feature)
                for box, score, class_name, feature in zip(
                    boxes, scores, names, features)
            ]

            # Advance the tracker with the new detections.
            tracker.predict()
            tracker.update(detections)

            # Collect [x1, y1, x2, y2, track_id, class_index] and the
            # matching confidence for every confirmed, recent track.
            tracked_bboxes = []
            tracked_scores = []
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 5:
                    continue
                bbox = track.to_tlbr()
                class_name = track.get_class()
                tracking_id = track.track_id
                index = key_list[val_list.index(class_name)]
                tracked_bboxes.append(bbox.tolist() + [tracking_id, index])
                try:
                    tracked_scores.append(track.Confidence)
                except AttributeError:
                    # Tracks without a Confidence attribute are skipped.
                    print("skip")

            # NOTE(review): the original also called
            # draw_bbox(image, bboxes, ...) at this point. It read ``image``
            # before its first assignment (UnboundLocalError on the first
            # frame) and discarded the result, so the call was dropped.

            frame_idx += 1
            print("------frame_idx-------" + str(frame_idx))

            # Draw the tracked boxes; draw_bbox also receives the CSV writer.
            image = draw_bbox(original_frame,
                              results_csv,
                              tracked_bboxes,
                              frame_idx,
                              tracked_scores,
                              show_label=True,
                              show_confidence=True,
                              CLASSES=CLASSES,
                              tracking=True)

            t3 = time.time()
            # Sliding windows (last 20 frames) of detection and total time.
            times.append(t2 - t1)
            times_2.append(t3 - t1)
            times = times[-20:]
            times_2 = times_2[-20:]

            ms = sum(times) / len(times) * 1000
            detection_fps = 1000 / ms
            total_fps = 1000 / (sum(times_2) / len(times_2) * 1000)

            print(
                "Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(
                    ms, detection_fps, total_fps))

            if output_path != '':
                out.write(image)
            if show:
                cv2.imshow('output', image)
                if cv2.waitKey(25) & 0xFF == ord("q"):
                    break
    finally:
        # csv.writer has no close(); the underlying file is what gets closed.
        if vid is not None:
            vid.release()
        if out is not None:
            out.release()
        csv_file.close()
        cv2.destroyAllWindows()
def Object_tracking(Yolo,
                    video_path,
                    input_size=416,
                    CLASSES=YOLO_COCO_CLASSES,
                    score_threshold=0.3,
                    iou_threshold=0.45,
                    Track_only=("person",),
                    max_frames=120):
    """
    Track objects on the first frames of a video and record the tracked boxes.

    At most ``max_frames`` frames are read. For every frame the list of
    tracked boxes is appended to ``out_tracked_bboxes``.

    NOTE(review): ``out_tracked_bboxes`` is not defined inside this function;
    presumably it is a module-level list - confirm against the rest of the
    file.

    :param Yolo: YOLO model; ``predict`` is used when ``YOLO_FRAMEWORK`` is
        "tf", the object is called directly when it is "trt"
    :param video_path: <str> input video handed to ``cv2.VideoCapture``
    :param input_size: <int> size handed to ``image_preprocess`` and
        ``postprocess_boxes``
    :param CLASSES: forwarded to ``read_class_names`` and ``draw_bbox``
    :param score_threshold: <float> forwarded to ``postprocess_boxes``
    :param iou_threshold: <float> forwarded to ``nms``
    :param Track_only: class names to keep; empty means keep every class
    :param max_frames: <int> maximum number of frames to process
    :return: None
    :raises ValueError: if ``YOLO_FRAMEWORK`` is neither "tf" nor "trt"
    """
    # Deep SORT parameters.
    max_cosine_distance = 0.7
    nn_budget = None

    # Deep SORT appearance encoder and tracker.
    model_filename = 'data/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                       max_cosine_distance,
                                                       nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    vid = cv2.VideoCapture(video_path)  # detect on video

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    # Loop-invariant: an empty filter means every class is tracked.
    track_all = len(Track_only) == 0

    try:
        for _ in range(max_frames):
            grabbed, frame = vid.read()
            if not grabbed:
                # The video ended before the frame limit was reached.
                break

            # NOTE: the two conversions cancel each other, so the channel
            # order stays exactly as it was read from the capture.
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_frame = cv2.cvtColor(original_frame, cv2.COLOR_BGR2RGB)

            image_data = image_preprocess(np.copy(original_frame),
                                          [input_size, input_size])
            image_data = image_data[np.newaxis, ...].astype(np.float32)

            t1 = time.time()
            if YOLO_FRAMEWORK == "tf":
                pred_bbox = Yolo.predict(image_data)
            elif YOLO_FRAMEWORK == "trt":
                batched_input = tf.constant(image_data)
                result = Yolo(batched_input)
                pred_bbox = [value.numpy() for value in result.values()]
            else:
                # Previously this fell through and failed later with a
                # NameError on the unbound ``pred_bbox``.
                raise ValueError(
                    "Unsupported YOLO_FRAMEWORK: {!r}".format(YOLO_FRAMEWORK))
            t2 = time.time()

            pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            pred_bbox = tf.concat(pred_bbox, axis=0)
            bboxes = postprocess_boxes(pred_bbox, original_frame, input_size,
                                       score_threshold)
            bboxes = nms(bboxes, iou_threshold, method='nms')

            # Convert (x1, y1, x2, y2, score, class) rows into
            # (x, y, width, height) boxes plus parallel score/name lists.
            boxes, scores, names = [], [], []
            for bbox in bboxes:
                class_name = NUM_CLASS[int(bbox[5])]
                if not track_all and class_name not in Track_only:
                    continue
                left = bbox[0].astype(int)
                top = bbox[1].astype(int)
                boxes.append([
                    left, top,
                    bbox[2].astype(int) - left,
                    bbox[3].astype(int) - top
                ])
                scores.append(bbox[4])
                names.append(class_name)

            # Build the Deep SORT detections for this frame.
            boxes = np.array(boxes)
            names = np.array(names)
            scores = np.array(scores)
            features = np.array(encoder(original_frame, boxes))
            detections = [
                Detection(box, score, class_name, feature)
                for box, score, class_name, feature in zip(
                    boxes, scores, names, features)
            ]

            # Advance the tracker with the new detections.
            tracker.predict()
            tracker.update(detections)

            # Collect [x1, y1, x2, y2, track_id, class_index] for every
            # confirmed track that was updated recently.
            tracked_bboxes = []
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 5:
                    continue
                bbox = track.to_tlbr()
                class_name = track.get_class()
                tracking_id = track.track_id
                index = key_list[val_list.index(class_name)]
                tracked_bboxes.append(bbox.tolist() + [tracking_id, index])

            # Draw the tracked boxes on the frame.
            image = draw_bbox(original_frame,
                              tracked_bboxes,
                              CLASSES=CLASSES,
                              tracking=True)

            t3 = time.time()
            # Sliding windows (last 20 frames) of detection and total time.
            times.append(t2 - t1)
            times_2.append(t3 - t1)
            times = times[-20:]
            times_2 = times_2[-20:]

            ms = sum(times) / len(times) * 1000
            detection_fps = 1000 / ms
            total_fps = 1000 / (sum(times_2) / len(times_2) * 1000)

            image = cv2.putText(image,
                                "Time: {:.1f} FPS".format(detection_fps),
                                (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                                (0, 0, 255), 2)

            out_tracked_bboxes.append(tracked_bboxes)

            print(
                "Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(
                    ms, detection_fps, total_fps))
    finally:
        # Release the capture even when processing fails midway.
        vid.release()
def Object_tracking(YoloV3,
                    webapi,
                    recording_id,
                    video_path,
                    model,
                    cate_predictor,
                    landmark_tensor,
                    input_size=416,
                    CLASSES=YOLO_COCO_CLASSES,
                    score_threshold=0.3,
                    iou_threshold=0.45,
                    rectangle_colors='',
                    Track_only=()):
    """
    Track objects on a video and publish classifier results as bookmarks.

    Tracks that the tracker reports as deleted are passed to
    ``predict_tracks_cate`` while the video is processed; the tracks still
    alive at the end are passed once more after the last frame. All resulting
    texts are collected in a dict and sent through ``webapi.add_bookmark``.

    :param YoloV3: model object exposing ``predict``
    :param webapi: object exposing ``add_bookmark``
    :param recording_id: forwarded to ``webapi.add_bookmark``
    :param video_path: <str> input video; a falsy value opens capture device
        0. The last "-"-separated part of the name without extension is
        parsed as an integer timestamp
    :param model: forwarded to ``predict_tracks_cate``
    :param cate_predictor: forwarded to ``predict_tracks_cate``
    :param landmark_tensor: forwarded to ``predict_tracks_cate``
    :param input_size: <int> size handed to ``image_preprocess`` and
        ``postprocess_boxes``
    :param CLASSES: forwarded to ``read_class_names``
    :param score_threshold: <float> forwarded to ``postprocess_boxes``
    :param iou_threshold: <float> forwarded to ``nms``
    :param rectangle_colors: not used by this function
    :param Track_only: class names to keep; empty means keep every class
    :return: None
    """
    # Deep SORT parameters.
    max_cosine_distance = 0.7
    nn_budget = None

    # Deep SORT appearance encoder and tracker.
    encoder = gdet.create_box_encoder(DEEP_SORT_MODEL_FILE, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                       max_cosine_distance,
                                                       nn_budget)
    tracker = Tracker(metric)

    times = []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)  # detect from webcam

    NUM_CLASS = read_class_names(CLASSES)

    # Loop-invariant: an empty filter means every class is tracked.
    track_all = len(Track_only) == 0

    bookmarks = {}
    try:
        while True:
            grabbed, img = vid.read()
            print(vid.get(cv2.CAP_PROP_POS_MSEC))
            if not grabbed:
                # End of stream (or the device stopped delivering frames).
                break

            # NOTE: the two conversions cancel each other, so the channel
            # order stays exactly as it was read from the capture.
            original_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)

            image_data = image_preprocess(np.copy(original_image),
                                          [input_size, input_size])
            image_data = tf.expand_dims(image_data, 0)

            t1 = time.time()
            pred_bbox = YoloV3.predict(image_data)
            t2 = time.time()

            # Keep a sliding window of the last 20 inference durations.
            times.append(t2 - t1)
            times = times[-20:]

            pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            pred_bbox = tf.concat(pred_bbox, axis=0)
            bboxes = postprocess_boxes(pred_bbox, original_image, input_size,
                                       score_threshold)
            bboxes = nms(bboxes, iou_threshold, method='nms')

            # Convert (x1, y1, x2, y2, score, class) rows into
            # (x, y, width, height) boxes plus parallel score/name lists.
            boxes, scores, names = [], [], []
            for bbox in bboxes:
                class_name = NUM_CLASS[int(bbox[5])]
                if not track_all and class_name not in Track_only:
                    continue
                left = bbox[0].astype(int)
                top = bbox[1].astype(int)
                boxes.append([
                    left, top,
                    bbox[2].astype(int) - left,
                    bbox[3].astype(int) - top
                ])
                scores.append(bbox[4])
                names.append(class_name)

            # Build the Deep SORT detections for this frame.
            boxes = np.array(boxes)
            names = np.array(names)
            scores = np.array(scores)
            features = np.array(encoder(original_image, boxes))
            detections = [
                Detection(box, score, class_name, feature)
                for box, score, class_name, feature in zip(
                    boxes, scores, names, features)
            ]

            # Advance the tracker; this update() also receives the stream
            # position and the frame, and returns the tracks it deleted.
            tracker.predict()
            deleted_tracks = tracker.update(detections,
                                            vid.get(cv2.CAP_PROP_POS_MSEC),
                                            original_image)

            # Classify tracks as soon as the tracker drops them.
            marks = predict_tracks_cate(model, cate_predictor, deleted_tracks,
                                        landmark_tensor, video_path)
            add_text_to_bookmarks(bookmarks, marks)

            ms = sum(times) / len(times) * 1000
            detection_fps = 1000 / ms
            print("Time: {:.2f}ms, {:.1f} FPS".format(ms, detection_fps))
    finally:
        # Release the capture even when processing fails midway.
        vid.release()

    # Classify the tracks that were still alive when the video ended.
    marks = predict_tracks_cate(model, cate_predictor, tracker.tracks,
                                landmark_tensor, video_path)
    add_text_to_bookmarks(bookmarks, marks)

    # Bookmark times are offsets added to the timestamp from the file name.
    timestamp = int(os.path.splitext(video_path)[0].split('-')[-1])
    for sec_since_start, texts in bookmarks.items():
        webapi.add_bookmark(recording_id, ' | '.join(texts), '',
                            timestamp + sec_since_start)