def init_tracker(self):
    """Create one tracker object per class listed in ``self.cls_out``.

    Classes whose id belongs to the detector-specific id set receive a
    ``Tracker`` built on a cosine ``NearestNeighborDistanceMetric`` and are
    stored in ``self.tracker``. Every other class receives a ``Sort``
    tracker with ``max_age=300`` stored in ``self.other_trackers``.
    """
    for cls in self.cls_out:
        # The id set depends on which detector produced the class ids.
        if self.od_model == 'yolo':
            metric_tracked_ids = (0, 2, 5, 7)
        else:
            metric_tracked_ids = (0, 1, 4, 5)

        if cls not in metric_tracked_ids:
            self.other_trackers[cls] = Sort(max_age=300)
            continue

        cosine_metric = nn_matching.NearestNeighborDistanceMetric(
            "cosine", self.max_cosine_distance)
        self.tracker[cls] = Tracker(cosine_metric)
def main(yolo, sequence_file, fps_render_rate, writeVideo_flag, labels_file,
         hide_window):
    """Track detections through a video and export crops or an annotated video.

    Each frame is passed to ``yolo.detect_image``; the returned boxes are
    encoded, filtered with non-maxima suppression and handed to the tracker.
    The bounding box of every confirmed, recently updated track is recorded
    per frame and the whole record is saved as JSON under
    ``output/tracked_bounding_boxes/``.

    Args:
        yolo: Detector object exposing ``detect_image(image)``.
        sequence_file: Path of the input video. The string ``'0'`` is
            replaced by the integer ``0`` before the capture is opened.
        fps_render_rate: Frame rate passed to the output ``cv2.VideoWriter``.
        writeVideo_flag: When true, write an annotated video to
            ``output/annotated_videos/`` plus ``detection.txt``. When false,
            save one JPEG crop per track and frame under
            ``output/action_tubes/<name>/<track_id>/``.
        labels_file: Optional file parsed with ``parse_labels_file`` into a
            track-id -> label map; ``None`` disables the labels.
        hide_window: When true, the preview window is not shown.
    """
    # Compute output file name
    if sequence_file == '0':
        file_name = '0'
        sequence_file = 0
    else:
        file_name = os.path.splitext(os.path.basename(sequence_file))[0]

    # Compute the action map if labels provided
    action_map = dict()
    if labels_file is not None:
        action_map = parse_labels_file(labels_file)
        print(action_map)

    # Build directory path; crop mode starts from an empty directory.
    frames_root = "output/action_tubes/" + file_name
    if not writeVideo_flag:
        if os.path.exists(frames_root):
            shutil.rmtree(frames_root)
        # makedirs (not mkdir) so a missing parent directory is not fatal.
        os.makedirs(frames_root)

    # Output locations for the coordinates and the annotated movie
    coords_path = 'output/tracked_bounding_boxes/' + file_name + '.json'
    output_seq = 'output/annotated_videos/' + file_name + '.avi'
    # Create the target directories up front so a missing directory fails
    # (or is fixed) before any frame is processed, not after the whole run.
    os.makedirs(os.path.dirname(coords_path), exist_ok=True)
    if writeVideo_flag:
        os.makedirs(os.path.dirname(output_seq), exist_ok=True)

    # Dict of coordinates for each tracked individual
    track_map = dict()

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    video_capture = cv2.VideoCapture(sequence_file)
    out = None
    list_file = None
    frame_index = -1
    fps = 0.0
    frame_number = 0

    try:
        if writeVideo_flag:
            # Define the codec and create VideoWriter object
            w = int(video_capture.get(3))
            h = int(video_capture.get(4))
            fourcc = cv2.VideoWriter_fourcc(*'XVID')  # *'MJPG'
            out = cv2.VideoWriter(output_seq, fourcc, fps_render_rate, (w, h))
            list_file = open('detection.txt', 'w')

        while video_capture.isOpened():
            frame_number += 1
            ret, frame = video_capture.read()
            if not ret:
                break
            t1 = time.time()

            image = Image.fromarray(frame[..., ::-1])  # bgr to rgb
            boxs = yolo.detect_image(image)
            features = encoder(frame, boxs)

            # Every detection is given a fixed confidence score of 1.0.
            detections = [
                Detection(bbox, 1.0, feature)
                for bbox, feature in zip(boxs, features)
            ]

            # Run non-maxima suppression.
            boxes = np.array([d.tlwh for d in detections])
            scores = np.array([d.confidence for d in detections])
            indices = preprocessing.non_max_suppression(
                boxes, nms_max_overlap, scores)
            detections = [detections[i] for i in indices]

            # Call the tracker
            tracker.predict()
            tracker.update(detections)

            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue
                track_id = track.track_id
                x1, y1, x2, y2 = (int(v) for v in track.to_tlbr())

                # Append coordinates for individual to track map
                track_map.setdefault(track_id, []).append(
                    (frame_number, [x1, y1, x2, y2]))

                if writeVideo_flag:
                    # Annotate the frame with the track box and its label.
                    cv2.rectangle(frame, (x1, y1), (x2, y2),
                                  (255, 255, 255), 2)
                    append_str = str(track_id) + ": Person"
                    if track_id in action_map:
                        append_str += ' ' + action_map[track_id]
                    cv2.putText(frame, append_str, (x1, y1), 0, 5e-3 * 200,
                                (0, 255, 0), 2)
                    continue

                # Crop mode: a negative coordinate would wrap around in the
                # slice and yield an empty crop, so clamp to the frame origin.
                crop_img = frame[max(y1, 0):max(y2, 0),
                                 max(x1, 0):max(x2, 0)].copy()
                if crop_img.size == 0:
                    # Box lies entirely outside the frame; nothing to save.
                    continue
                track_dir = ("output/action_tubes/" + file_name + '/'
                             + str(track_id))
                os.makedirs(track_dir, exist_ok=True)
                cv2.imwrite(track_dir + "/" + str(frame_number) + ".jpg",
                            crop_img)

            if writeVideo_flag:
                # Raw detections are drawn on top of the track boxes.
                for det in detections:
                    bbox = det.to_tlbr()
                    cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                                  (int(bbox[2]), int(bbox[3])),
                                  (255, 0, 0), 2)

            if not hide_window:
                cv2.imshow('', cv2.resize(frame, (1200, 675)))

            if writeVideo_flag:
                # save a frame and log its raw detector boxes
                out.write(frame)
                frame_index = frame_index + 1
                list_file.write(str(frame_index) + ' ')
                for box in boxs:
                    list_file.write(
                        ' '.join(str(box[k]) for k in range(4)) + ' ')
                list_file.write('\n')

            fps = (fps + (1. / (time.time() - t1))) / 2
            print("fps= %f" % (fps))

            # Press Q to stop!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release everything even when a frame raised part-way through.
        video_capture.release()
        if out is not None:
            out.release()
        if list_file is not None:
            list_file.close()
        cv2.destroyAllWindows()
        # The coordinates are written once here instead of once per frame;
        # doing it in ``finally`` keeps partial results after an interruption.
        with open(coords_path, 'w') as fp:
            json.dump(track_map, fp)
print(COLOR)

# Input video and YOLO detector files.
cap = cv2.VideoCapture("videos/01.mp4")
weights = "models/yolo/weights/yolov4_tiny.weights"
config = "models/yolo/configs/yolov4_tiny.cfg"
classes = "models/yolo/classes.txt"
detector = Detector(weights, config, gpu=False, classes_name=classes)
detector.init_yolo()
print("===============================================================")

# Tracker built on a cosine nearest-neighbour metric over encoder features.
MAX_COSINE_DISTANCE = 0.3
nn_budget = None
model_filename = "models/deepsort_model/mars-small128.pb"
encoder = gdet.create_box_encoder(model_filename, batch_size=1)
metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                   MAX_COSINE_DISTANCE,
                                                   nn_budget)
tracker = Tracker(metric)

# Define face detection model
face_detector = get_detector(hog=True)

# The output writer uses the input's frame size; the named constants replace
# the bare property ids 3 and 4, and the codec is now computed only once.
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"XVID")
out = cv2.VideoWriter("Test_Result.avi", fourcc, 30, (w, h))

# Preview window that forwards mouse events to ``mouse_event``.
cv2.namedWindow("Test")
cv2.setMouseCallback("Test", mouse_event)
def main(yolo, hide_window, weights_file):
    """Run detection, tracking and per-track action prediction on a video.

    Reads ``./web_server/input.mp4``, tracks the boxes returned by
    ``yolo.detect_image`` and collects the cropped frames of each confirmed
    track. Whenever a track has collected ``FRAME_NUM`` crops they are passed
    to the model and the highest-scoring entry of ``actions_header`` becomes
    the track's label. The annotated frames are written to
    ``./web_server/output.avi`` and the raw detector boxes to
    ``detection.txt``.

    Args:
        yolo: Detector object exposing ``detect_image(image)``.
        hide_window: When true, the preview window is not shown.
        weights_file: Base name (without ``.hdf5``) of the weights file in
            ``action_recognition/architectures/weights/``.
    """
    print('Starting pipeline...')

    input_file = './web_server/input.mp4'
    output_file = './web_server/output.avi'

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    # Build the model and load its weights
    metrics = MetricsAtTopK(k=2)
    losses = LossFunctions()
    model = cnn_lstm(INPUT_SHAPE, KERNEL_SHAPE, POOL_SHAPE, CLASSES)
    model.load_weights('action_recognition/architectures/weights/'
                       + weights_file + '.hdf5')
    model.compile(loss=losses.weighted_binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy',
                           metrics.recall_at_k,
                           metrics.precision_at_k,
                           metrics.f1_at_k,
                           losses.hamming_loss])

    # Track id frame batch
    track_tubeMap = {}
    # Track id action
    track_actionMap = {}

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    video_capture = cv2.VideoCapture(input_file)

    # Define the codec and create VideoWriter object
    w = int(video_capture.get(3))
    h = int(video_capture.get(4))
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')  # *'XVID'
    out = cv2.VideoWriter(output_file, fourcc, 11, (w, h))

    frame_index = -1
    fps = 0.0

    try:
        with open('detection.txt', 'w') as list_file:
            while video_capture.isOpened():
                ret, frame = video_capture.read()
                if not ret:
                    break
                t1 = time.time()

                image = Image.fromarray(frame[..., ::-1])  # bgr to rgb
                boxs = yolo.detect_image(image)
                features = encoder(frame, boxs)

                # Every detection is given a fixed confidence score of 1.0.
                detections = [
                    Detection(bbox, 1.0, feature)
                    for bbox, feature in zip(boxs, features)
                ]

                # Run non-maxima suppression.
                boxes = np.array([d.tlwh for d in detections])
                scores = np.array([d.confidence for d in detections])
                indices = preprocessing.non_max_suppression(
                    boxes, nms_max_overlap, scores)
                detections = [detections[i] for i in indices]

                # Call the tracker
                tracker.predict()
                tracker.update(detections)

                for track in tracker.tracks:
                    if (not track.is_confirmed()
                            or track.time_since_update > 1):
                        continue
                    track_id = track.track_id
                    x1, y1, x2, y2 = (int(v) for v in track.to_tlbr())

                    # Init the label and the frame bucket on first sight.
                    track_actionMap.setdefault(track_id, 'Unknown')
                    tube = track_tubeMap.setdefault(track_id, [])

                    # Add frame segment to track dict. A negative coordinate
                    # would wrap around in the slice and give an empty crop,
                    # so clamp to the frame origin and skip empty crops.
                    # NOTE(review): assumes process_batch cannot handle an
                    # empty frame segment - confirm.
                    block = frame[max(y1, 0):max(y2, 0),
                                  max(x1, 0):max(x2, 0)]
                    if block.size:
                        tube.append(block / 255.0)

                    # Check size of track bucket
                    if len(tube) == FRAME_NUM:
                        # Process action tube
                        batch = process_batch(tube)
                        batch = batch.reshape(1, FRAME_NUM, FRAME_LENGTH,
                                              FRAME_WIDTH, 3)

                        # Generate predictions
                        preds = model.predict(batch)[0].tolist()
                        print(preds)

                        # Start a fresh bucket for the next tube
                        track_tubeMap[track_id] = []

                        # Update action label to match corresponding action
                        action_label = actions_header[preds.index(max(preds))]
                        print(f"Person {track.track_id} is {action_label}")
                        track_actionMap[track_id] = action_label

                    # Create bbox and text label
                    append_str = str(track_id) + ' ' + track_actionMap[track_id]
                    cv2.rectangle(frame, (x1, y1), (x2, y2),
                                  (255, 255, 255), 2)
                    cv2.putText(frame, append_str, (x1, y1), 0, 5e-3 * 200,
                                (0, 255, 0), 2)

                # Raw detections are drawn on top of the track boxes.
                for det in detections:
                    bbox = det.to_tlbr()
                    cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                                  (int(bbox[2]), int(bbox[3])),
                                  (255, 0, 0), 2)

                # Display video as processed if necessary
                if not hide_window:
                    cv2.imshow('', cv2.resize(frame, (1200, 675)))

                # save a frame and log its raw detector boxes
                out.write(frame)
                frame_index = frame_index + 1
                list_file.write(str(frame_index) + ' ')
                for box in boxs:
                    list_file.write(
                        ' '.join(str(box[k]) for k in range(4)) + ' ')
                list_file.write('\n')

                fps = (fps + (1. / (time.time() - t1))) / 2
                print("fps= %f" % (fps))

                # Press Q to stop!
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Release everything even when a frame raised part-way through.
        video_capture.release()
        out.release()
        cv2.destroyAllWindows()
def main(yolo, hide_window, weights_file, test_mode, test_output, bayesian,
         batch_factor, input_file, progress_recorder):
    """Run the detection, tracking and action-recognition pipeline on a video.

    Detection runs on a 640x360 copy of every ``skip``-th frame. Frames in
    between are buffered and receive boxes interpolated between the two
    surrounding detection frames. Finished frames are handed to
    ``processFrame`` and then to ``writeFrame``.

    Args:
        yolo: Detector object exposing ``detect_image(image)``.
        hide_window: Forwarded to ``writeFrame``.
        weights_file: Base name (without ``.hdf5``) of the weights file in
            ``action_recognition/architectures/weights/``.
        test_mode: When true, ``initialiseTestMode`` is called, detections
            are logged to ``object_detection_file`` and no video writer is
            created.
        test_output: Forwarded to ``initialiseTestMode``.
        bayesian: Forwarded to ``processFrame``.
        batch_factor: Forwarded to ``processFrame``.
        input_file: Path of the input video.
        progress_recorder: Object whose ``set_progress(current, total)`` is
            called every 10 frames.
    """
    global object_detection_file
    global object_tracking_directory
    global action_recognition_directory
    if test_mode:
        initialiseTestMode(test_output)

    print('Starting pipeline...')

    # Fixed output path (kept constant for easier Django display).
    output_file = './gui/webapp/static/output.mp4'

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    model = TS_CNN_LSTM(INPUT_SHAPE, CLASSES)
    model.load_weights('action_recognition/architectures/weights/'
                       + weights_file + '.hdf5')

    # Track id frame batch
    track_tubeMap = {}
    # Track id action
    track_actionMap = {}

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    # Count total number of frames (used for progress reporting).
    video_count = cv2.VideoCapture(input_file)
    frame_total = int(video_count.get(cv2.CAP_PROP_FRAME_COUNT))
    print("FRAME COUNT: ", frame_total)
    video_count.release()

    video_capture = FileVideoStream(input_file).start()
    # Let input stream load some frames
    time.sleep(5)

    # Define the codec and create VideoWriter object
    # NOTE(review): the frame size is hard-coded; input of another
    # resolution would get wrong scale factors - confirm the 4K assumption.
    w = 3840
    h = 2160
    fourcc = cv2.VideoWriter_fourcc(*'H264')  # Now mp4!

    # No video writer in test mode
    out = None
    if not test_mode:
        out = cv2.VideoWriter(output_file, fourcc, 11, (w, h))

    # Detection runs on a downscaled frame; boxes are scaled back by these.
    scaledX = 640
    scaledY = 360
    xScale = w / scaledX
    yScale = h / scaledY

    fps = 0.0
    location = (0, 0)
    frame_number = 0
    track_buffer = []
    unprocessedFrames = []
    processedFrames = []
    processedTracks = []
    locations = []
    skip = 1

    try:
        try:
            while video_capture.more():
                frame = video_capture.read()
                if not isinstance(frame, np.ndarray):
                    break
                t1 = time.time()

                if frame_number % skip == 0:
                    # Every 5th frame searches the whole downscaled image;
                    # otherwise the previous location is rescaled.
                    if (not location) or frame_number % 5 == 0:
                        location = [0, 0, scaledX, scaledY]
                    else:
                        location = rescale(location, xScale, yScale, 0, 0,
                                           False)

                    frameCopy = frame.copy()
                    frame = cv2.resize(frame, (scaledX, scaledY),
                                       interpolation=cv2.INTER_AREA)
                    image = Image.fromarray(frame[..., ::-1])  # bgr to rgb
                    diffx = location[0]
                    diffy = location[1]
                    image = image.crop((location[0], location[1],
                                        location[2], location[3]))
                    boxs = yolo.detect_image(image)
                    features = encoder(frame, boxs)

                    # Every detection is given a fixed confidence of 1.0 and
                    # is rescaled with the crop offset.
                    detections = [
                        Detection(bbox, 1.0, feature).rescale(
                            xScale, yScale, diffx, diffy)
                        for bbox, feature in zip(boxs, features)
                    ]

                    # Run non-maxima suppression.
                    boxes = np.array([d.tlwh for d in detections])
                    scores = np.array([d.confidence for d in detections])
                    indices = preprocessing.non_max_suppression(
                        boxes, nms_max_overlap, scores)
                    detections = [detections[i] for i in indices]

                    if test_mode:
                        for det in detections:
                            bbox = det.to_tlbr()
                            object_detection_file.write(
                                str(frame_number) + ' ' + str(int(bbox[0]))
                                + ' ' + str(int(bbox[1])) + ' '
                                + str(int(bbox[2])) + ' '
                                + str(int(bbox[3])) + '\n')

                    # Call the tracker
                    tracker.predict()
                    tracker.update(detections)

                    # Continue with the full-resolution frame.
                    frame = frameCopy
                    tracks = dict()
                    for track in tracker.tracks:
                        if (not track.is_confirmed()
                                or track.time_since_update > 1):
                            continue
                        tracks[track.track_id] = track.to_tlbr()

                    if frame_number == 0:
                        track_buffer.append(tracks)
                        processedFrames.append(frame)
                        processedTracks.append(tracks)
                    else:
                        firstTrack = track_buffer.pop(0)
                        secondTrack = tracks.copy()
                        # Give every buffered in-between frame boxes
                        # interpolated between the two detection frames.
                        for i, oldFrame in enumerate(unprocessedFrames, 1):
                            tracks = dict()
                            for trackId in secondTrack:
                                if trackId not in firstTrack:
                                    continue
                                offsets = interpolateBbox(
                                    firstTrack[trackId],
                                    secondTrack[trackId], i / skip)
                                min_x, min_y, max_x, max_y = offsets
                                tracks[trackId] = [
                                    delta + start for delta, start in zip(
                                        [min_x, min_y, max_x, max_y],
                                        firstTrack[trackId])
                                ]
                            processedFrames.append(oldFrame)
                            processedTracks.append(tracks.copy())
                        unprocessedFrames = []
                        processedFrames.append(frame)
                        processedTracks.append(secondTrack)
                        track_buffer.append(secondTrack)

                    location = rescale(location, xScale, yScale, 0, 0, True)
                    locations.append(location)
                    if frame_number >= skip:
                        frame = processFrame(locations, processedFrames,
                                             processedTracks, track_tubeMap,
                                             track_actionMap, model,
                                             test_mode, bayesian,
                                             batch_factor)

                    # Box corners of this frame, used for the next location.
                    currentXs = []
                    currentYs = []
                    for det in detections:
                        bbox = det.to_tlbr()
                        currentXs.extend([int(bbox[0]), int(bbox[2])])
                        currentYs.extend([int(bbox[1]), int(bbox[3])])
                else:
                    # Skipped frame: buffer it until the next detection.
                    unprocessedFrames.append(frame)
                    locations.append([0, 0, 0, 0])
                    if frame_number >= skip:
                        frame = processFrame(locations, processedFrames,
                                             processedTracks, track_tubeMap,
                                             track_actionMap, model,
                                             test_mode, bayesian,
                                             batch_factor)

                # Display video as processed if necessary / save a frame
                if frame_number >= skip:
                    writeFrame(frame, out, hide_window, test_mode)

                frame_number += 1

                # Updates progress bar every 10 frames
                if frame_number % 10 == 0:
                    progress_recorder.set_progress(frame_number, frame_total)

                if frame_number % 5 != 0:
                    location = calculateLocation(currentXs, currentYs)

                fps = (fps + (1. / (time.time() - t1))) / 2
                print("fps= %f" % (fps / skip))

                # Press Q to stop!
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Stop the reader even when a frame raised.
            video_capture.stop()

        # Drain the frames that are still queued for processing.
        while processedFrames:
            frame = processFrame(locations, processedFrames, processedTracks,
                                 track_tubeMap, track_actionMap, model,
                                 test_mode, bayesian, batch_factor)
            writeFrame(frame, out, hide_window, test_mode)

        # Flush leftover buffered frames oldest first; popping from the end
        # would write them in reverse chronological order.
        for frame in unprocessedFrames:
            writeFrame(frame, out, hide_window, test_mode)
    finally:
        # Release the writer, the windows and the test log even on error.
        if out is not None:
            out.release()
        cv2.destroyAllWindows()
        if test_mode:
            object_detection_file.close()
if re.findall('([-\w]+\.(?:jpg|gif|png))', os.path.basename(file_path[0].lower())): return render_template('person_reid.html') else: return render_template('person_det.html') print('Load Input Arguments') args = parse_args() print('Load Tracker ...') encoder = None if args.tracking_type == "deep_sort": encoder = gdet.create_box_encoder(model_filename=args.tracker_weights, batch_size=8) metric = nn_matching.NearestNeighborDistanceMetric( "cosine", args.max_cosine_distance) print('Load Object Detection model ...') person_handler = PersonHandler(args, encoder=encoder) print('Load Label Map') cls_dict = load_cls_dict(args.data_path) # grab image and do object detection (until stopped by user) print('starting to loop and detect') vis = BBoxVisualization(cls_dict) @app.route('/person_reid', methods=['GET']) def person_reid(): return Response(person_handler.loop_and_detect(loader, vis, file_path),