def init_tracker(self):
    for cls in self.cls_out:
        if self.od_model == 'yolo':
            # COCO ids 0, 2, 5, 7 (person, car, bus, truck) get the
            # appearance-based deep_sort Tracker; everything else falls back to SORT.
            if cls in [0, 2, 5, 7]:
                self.tracker[cls] = Tracker(
                    nn_matching.NearestNeighborDistanceMetric(
                        "cosine", self.max_cosine_distance))
            else:
                self.other_trackers[cls] = Sort(max_age=300)
        else:
            # Same split for the other detector's label map (class ids 0, 1, 4, 5).
            if cls in [0, 1, 4, 5]:
                self.tracker[cls] = Tracker(
                    nn_matching.NearestNeighborDistanceMetric(
                        "cosine", self.max_cosine_distance))
            else:
                self.other_trackers[cls] = Sort(max_age=300)
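The two tracker dicts built above would then be stepped once per frame. A minimal sketch of that usage, assuming hypothetical inputs `detections_by_class` (class id to a list of deep_sort `Detection` objects) and `boxes_by_class` (class id to an (N, 5) array of [x1, y1, x2, y2, score] rows); neither name appears in the snippet above:

import numpy as np

def step_trackers(self, detections_by_class, boxes_by_class):
    # Appearance-based deep_sort trackers for the classes of interest
    for cls, trk in self.tracker.items():
        trk.predict()
        trk.update(detections_by_class.get(cls, []))
    # Plain SORT trackers for the remaining classes; SORT expects an (N, 5) array
    for cls, trk in self.other_trackers.items():
        trk.update(boxes_by_class.get(cls, np.empty((0, 5))))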
Code Example #2
File: demo.py  Project: SpecDI/cs407
def main(yolo, sequence_file, fps_render_rate, writeVideo_flag, labels_file,
         hide_window):
    # Compute output file name; a literal '0' selects the default webcam
    file_name = os.path.splitext(
        os.path.basename(sequence_file))[0] if sequence_file != '0' else '0'
    if sequence_file == '0':
        sequence_file = 0

    # Compute the action map if labels provided
    action_map = dict()
    if labels_file is not None:
        action_map = parse_labels_file(labels_file)
    print(action_map)

    # Build directory path
    frames_dir_path = "output/action_tubes/" + file_name
    if not writeVideo_flag:
        if os.path.exists(frames_dir_path):
            shutil.rmtree(frames_dir_path)
        os.makedirs(frames_dir_path)

    # Output path for tracked bounding box coordinates (JSON)
    coords_path = 'output/tracked_bounding_boxes/' + file_name + '.json'

    output_seq = 'output/annotated_videos/' + file_name + '.avi'

    # Dict of coordinates for each tracked individual
    track_map = dict()

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)

    metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                       max_cosine_distance,
                                                       nn_budget)
    tracker = Tracker(metric)

    video_capture = cv2.VideoCapture(sequence_file)

    if writeVideo_flag:
        # Define the codec and create VideoWriter object
        w = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'XVID')  # alternative: *'MJPG'
        # Build video output handler only if we are not cropping
        out = cv2.VideoWriter(output_seq, fourcc, fps_render_rate, (w, h))
        list_file = open('detection.txt', 'w')
        frame_index = -1

    fps = 0.0
    frame_number = 0
    while video_capture.isOpened():
        frame_number += 1
        ret, frame = video_capture.read()  # frame shape 640*480*3
        if not ret:
            break
        t1 = time.time()

        # image = Image.fromarray(frame)
        image = Image.fromarray(frame[..., ::-1])  #bgr to rgb
        boxs = yolo.detect_image(image)
        # print("box_num",len(boxs))
        features = encoder(frame, boxs)

        # The detector does not report confidences, so every detection score is set to 1.0 here.
        detections = [
            Detection(bbox, 1.0, feature)
            for bbox, feature in zip(boxs, features)
        ]

        # Run non-maxima suppression.
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        indices = preprocessing.non_max_suppression(boxes, nms_max_overlap,
                                                    scores)
        detections = [detections[i] for i in indices]

        # Call the tracker
        tracker.predict()
        tracker.update(detections)
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlbr()
            crop_img = frame[int(bbox[1]):int(bbox[3]),
                             int(bbox[0]):int(bbox[2])].copy()

            # Append coordinates for this individual to the track map
            track_map.setdefault(track.track_id, []).append(
                (frame_number,
                 [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])]))

            # Build directory path
            frames_dir_path = "output/action_tubes/" + file_name + '/' + str(
                track.track_id)
            if not os.path.exists(frames_dir_path) and not writeVideo_flag:
                os.mkdir(frames_dir_path)
            # Write frame or annotate frame
            if not writeVideo_flag:
                cv2.imwrite(frames_dir_path + "/" + str(frame_number) + ".jpg",
                            crop_img)
            else:
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 255, 255), 2)

                append_str = str(track.track_id) + ": Person"
                if track.track_id in action_map:
                    append_str += ' ' + action_map[track.track_id]
                cv2.putText(frame, append_str, (int(bbox[0]), int(bbox[1])), 0,
                            5e-3 * 200, (0, 255, 0), 2)

        # Persist the coordinate map every frame (the JSON file is rewritten each iteration)
        with open(coords_path, 'w') as fp:
            json.dump(track_map, fp)

        for det in detections:
            bbox = det.to_tlbr()
            if writeVideo_flag:
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)

        if not hide_window:
            cv2.imshow('', cv2.resize(frame, (1200, 675)))

        if writeVideo_flag:
            # save a frame
            out.write(frame)
            frame_index = frame_index + 1
            list_file.write(str(frame_index) + ' ')
            if len(boxs) != 0:
                for i in range(0, len(boxs)):
                    list_file.write(
                        str(boxs[i][0]) + ' ' + str(boxs[i][1]) + ' ' +
                        str(boxs[i][2]) + ' ' + str(boxs[i][3]) + ' ')
            list_file.write('\n')

        fps = (fps + (1. / (time.time() - t1))) / 2
        print("fps= %f" % (fps))

        # Press Q to stop!
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    video_capture.release()
    if writeVideo_flag:
        out.release()
        list_file.close()
    cv2.destroyAllWindows()
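`parse_labels_file` is called above but not shown on this page. Since `action_map` is later indexed by `track.track_id` to retrieve an action string, a minimal sketch might look like the following; the whitespace-separated "track_id action" line format is an assumption, not the project's actual file format:

def parse_labels_file(labels_file):
    # Hypothetical sketch: build {track_id: action_label} from a text file
    # whose lines are assumed to look like "<track_id> <action label>".
    action_map = {}
    with open(labels_file) as fp:
        for line in fp:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                action_map[int(parts[0])] = parts[1]
    return action_map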
Code Example #3
print(COLOR)
cap = cv2.VideoCapture("videos/01.mp4")

weights = "models/yolo/weights/yolov4_tiny.weights"
config = "models/yolo/configs/yolov4_tiny.cfg"
classes = "models/yolo/classes.txt"

detector = Detector(weights, config, gpu=False, classes_name=classes)
detector.init_yolo()
print("===============================================================")
MAX_COSINE_DISTANCE = 0.3
nn_budget = None

model_filename = "models/deepsort_model/mars-small128.pb"
encoder = gdet.create_box_encoder(model_filename, batch_size=1)
metric = nn_matching.NearestNeighborDistanceMetric("cosine", MAX_COSINE_DISTANCE, nn_budget)
tracker = Tracker(metric)

# Define face detection model
face_detector = get_detector(hog=True)

# Define the codec and create the video writer
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"XVID")
out = cv2.VideoWriter("Test_Result.avi", fourcc, 30, (w, h))


cv2.namedWindow("Test")
cv2.setMouseCallback("Test", mouse_event)
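The snippet above only performs setup (capture, YOLO detector, deep_sort encoder and tracker, video writer, display window); the per-frame loop itself is not shown. A minimal sketch of such a loop follows, assuming deep_sort's `Detection` class is imported as in the other examples and that this Detector exposes a `detect(frame)` method returning [x, y, w, h] boxes (that method name and box format are assumptions):

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    boxes = detector.detect(frame)  # assumed API; returns [x, y, w, h] boxes
    features = encoder(frame, boxes)
    detections = [Detection(bbox, 1.0, feature) for bbox, feature in zip(boxes, features)]

    tracker.predict()
    tracker.update(detections)

    for track in tracker.tracks:
        if not track.is_confirmed() or track.time_since_update > 1:
            continue
        x1, y1, x2, y2 = map(int, track.to_tlbr())
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 2)
        cv2.putText(frame, str(track.track_id), (x1, y1), 0, 1.0, (0, 255, 0), 2)

    out.write(frame)
    cv2.imshow("Test", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
out.release()
cv2.destroyAllWindows()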
Code Example #4
File: pipeline.py  Project: SpecDI/cs407
def main(yolo, hide_window, weights_file):
    print('Starting pipeline...')

    input_file = './web_server/input.mp4'
    output_file = './web_server/output.avi'

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    metrics = MetricsAtTopK(k=2)
    losses = LossFunctions()

    # Load in model
    # model = load_model(
    #     'action_recognition/architectures/weights/lstm_1_2.hdf5',
    #     custom_objects={
    #         "weighted_binary_crossentropy": losses.weighted_binary_crossentropy,
    #         "recall_at_k": metrics.recall_at_k, 
    #         "precision_at_k": metrics.precision_at_k, 
    #         "f1_at_k": metrics.f1_at_k,
    #         "hamming_loss": hamming_loss,
    #     }
    # )

    model = cnn_lstm(INPUT_SHAPE, KERNEL_SHAPE, POOL_SHAPE, CLASSES)
    model.load_weights('action_recognition/architectures/weights/' + weights_file + '.hdf5')

    model.compile(loss=losses.weighted_binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy',
                           metrics.recall_at_k,
                           metrics.precision_at_k,
                           metrics.f1_at_k,
                           losses.hamming_loss])

    # Track id frame batch
    track_tubeMap = {}

    # Track id action
    track_actionMap = {}

    # Image data generator
    datagen = ImageDataGenerator()

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)

    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    video_capture = cv2.VideoCapture(input_file)

    # Define the codec and create VideoWriter object
    w = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')  # alternative: *'XVID'
    # Build video output handler only if we are not cropping
    out = cv2.VideoWriter(output_file, fourcc, 11, (w, h))
    list_file = open('detection.txt', 'w')
    frame_index = -1

    fps = 0.0
    frame_number = 0
    while video_capture.isOpened():
        frame_number += 1
        ret, frame = video_capture.read()  # frame shape 640*480*3
        if not ret:
            break
        t1 = time.time()

        # image = Image.fromarray(frame)
        image = Image.fromarray(frame[..., ::-1])  # BGR to RGB
        boxs = yolo.detect_image(image)
        # print("box_num", len(boxs))
        features = encoder(frame, boxs)

        # The detector does not report confidences, so every detection score is set to 1.0 here.
        detections = [Detection(bbox, 1.0, feature) for bbox, feature in zip(boxs, features)]
        
        # Run non-maxima suppression.
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        indices = preprocessing.non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # Call the tracker
        tracker.predict()
        tracker.update(detections)
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlbr()
            
            # Init text to be appended to bbox
            append_str = str(track.track_id)

            if track.track_id not in track_actionMap:
                track_actionMap[track.track_id] = 'Unknown'

            # Init new key if necessary
            if track.track_id not in track_tubeMap:
                track_tubeMap[track.track_id] = []

            # Add frame segment to track dict
            block = frame[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])].copy()
            track_tubeMap[track.track_id].append(block / 255.0)

            # Check size of track bucket
            if len(track_tubeMap[track.track_id]) == FRAME_NUM:
                # Process action tube
                batch = process_batch(track_tubeMap[track.track_id])
                batch = batch.reshape(1, FRAME_NUM, FRAME_LENGTH, FRAME_WIDTH, 3)
                
                # Generate predictions
                preds = model.predict(batch)[0].tolist()
                print(preds)

                # Clear the list
                track_tubeMap[track.track_id] = []
                # Update action label to match corresponding action
                action_label = actions_header[preds.index(max(preds))]
                print(f"Person {track.track_id} is {action_label}")

                # Update map
                track_actionMap[track.track_id] = action_label

            # Update text to be appended
            append_str += ' ' + track_actionMap[track.track_id]
            # Create bbox and text label
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 255, 255), 2)
            cv2.putText(frame, append_str, (int(bbox[0]), int(bbox[1])), 0, 5e-3 * 200, (0, 255, 0), 2)

        # Draw raw detections in blue (BGR)
        for det in detections:
            bbox = det.to_tlbr()
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)

        # Display video as processed if necessary
        if not hide_window:
            cv2.imshow('', cv2.resize(frame, (1200, 675)))

        # save a frame
        out.write(frame)
        frame_index += 1
        list_file.write(str(frame_index) + ' ')
        for box in boxs:
            list_file.write(str(box[0]) + ' ' + str(box[1]) + ' ' + str(box[2]) + ' ' + str(box[3]) + ' ')
        list_file.write('\n')


        fps = (fps + (1. / (time.time() - t1))) / 2
        print("fps= %f" % fps)

        # Press Q to stop!
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    video_capture.release()
    out.release()
    list_file.close()
    cv2.destroyAllWindows()
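`process_batch` is used above to turn the buffered person crops into model input but is not shown on this page. Given that its result is reshaped to (1, FRAME_NUM, FRAME_LENGTH, FRAME_WIDTH, 3), a plausible sketch is below; the resize-and-stack behaviour is an assumption about the project's helper:

import cv2
import numpy as np

def process_batch(frames):
    # Hypothetical sketch: resize each normalised crop to the model's expected
    # spatial size and stack into shape (FRAME_NUM, FRAME_LENGTH, FRAME_WIDTH, 3).
    resized = [cv2.resize(f, (FRAME_WIDTH, FRAME_LENGTH)) for f in frames]
    return np.array(resized)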
Code Example #5
def main(yolo, hide_window, weights_file, test_mode, test_output, bayesian,
         batch_factor, input_file, progress_recorder):
    if test_mode:
        global object_detection_file
        global object_tracking_directory
        global action_recognition_directory
        initialiseTestMode(test_output)

    print('Starting pipeline...')

    # Define output path based on file name and web_server dir
    file_name = os.path.splitext(os.path.basename(input_file))[0]
    #output_file = f'./web_server/output_{file_name}.avi'
    #output_file = f'./gui/webapp/static/output_{file_name}.avi' # Existing pipeline version. Commented for easier Django display.
    output_file = f'./gui/webapp/static/output.mp4'  ###

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    model = TS_CNN_LSTM(INPUT_SHAPE, CLASSES)
    model.load_weights('action_recognition/architectures/weights/' +
                       weights_file + '.hdf5')

    # Track id frame batch
    track_tubeMap = {}

    # Track id action
    track_actionMap = {}

    # Image data generator
    datagen = ImageDataGenerator()

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)

    metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                       max_cosine_distance,
                                                       nn_budget)
    tracker = Tracker(metric)

    ################################################################################
    # New: Count total number of frames!
    video_count = cv2.VideoCapture(input_file)
    frame_total = int(video_count.get(cv2.CAP_PROP_FRAME_COUNT))
    print("FRAME COUNT: ", frame_total)
    video_count.release()
    ################################################################################

    video_capture = FileVideoStream(input_file).start()

    # Let input stream load some frames
    time.sleep(5)

    # Define the codec and create VideoWriter object
    w = 3840
    h = 2160
    #fourcc = cv2.VideoWriter_fourcc(*'MJPG') #*'XVID'
    fourcc = cv2.VideoWriter_fourcc(*'H264')  # Now mp4!
    # Build video output handler only if we are not cropping

    out = None
    if not test_mode:
        out = cv2.VideoWriter(output_file, fourcc, 11, (w, h))

    fps = 0.0
    location = (0, 0)

    frame_number = 0

    track_buffer = []

    unprocessedFrames = []

    processedFrames = []

    processedTracks = []

    locations = []

    skip = 1
    while video_capture.more():  ####
        frame = video_capture.read()  # frame shape 640*480*3
        if not isinstance(frame, np.ndarray):
            break
        t1 = time.time()

        x = w
        y = h

        scaledX = 640
        scaledY = 360

        xScale = x / scaledX
        yScale = y / scaledY

        if (frame_number % skip == 0):
            if (not location) or frame_number % 5 == 0:
                location = [0, 0, scaledX, scaledY]
            else:
                location = rescale(location, xScale, yScale, 0, 0, False)

            frameCopy = frame.copy()

            frame = cv2.resize(frame, (scaledX, scaledY),
                               interpolation=cv2.INTER_AREA)
            # image = Image.fromarray(frame)
            image = Image.fromarray(frame[..., ::-1])  #bgr to rgb

            diffx = location[0]
            diffy = location[1]

            image = image.crop(
                (location[0], location[1], location[2], location[3]))

            boxs = yolo.detect_image(image)

            features = encoder(frame, boxs)

            # The detector does not report confidences, so every detection score is set to 1.0 here.
            detections = [
                Detection(bbox, 1.0, feature).rescale(xScale, yScale, diffx,
                                                      diffy)
                for bbox, feature in zip(boxs, features)
            ]

            # Run non-maxima suppression.
            boxes = np.array([d.tlwh for d in detections])
            scores = np.array([d.confidence for d in detections])
            indices = preprocessing.non_max_suppression(
                boxes, nms_max_overlap, scores)
            detections = [detections[i] for i in indices]

            if test_mode:
                if len(detections) != 0:
                    for i in range(0, len(detections)):
                        bbox = detections[i].to_tlbr()
                        object_detection_file.write(
                            str(frame_number) + ' ' + str(int(bbox[0])) + ' ' +
                            str(int(bbox[1])) + ' ' + str(int(bbox[2])) + ' ' +
                            str(int(bbox[3])) + '\n')
            # Call the tracker
            tracker.predict()
            tracker.update(detections)

            frame = frameCopy

            tracks = dict()
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue
                bbox = track.to_tlbr()

                tracks[track.track_id] = bbox

            if frame_number == 0:
                track_buffer.append(tracks)

                processedFrames.append(frame)
                processedTracks.append(tracks)
            else:
                firstTrack = track_buffer.pop(0)
                secondTrack = tracks.copy()

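                # For every frame skipped since the last detection pass, linearly
                # interpolate each surviving track's box between the previous
                # processed tracks (firstTrack) and the new ones (secondTrack).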
                for i, oldFrame in enumerate(unprocessedFrames, 1):
                    tracks = dict()
                    for trackId in secondTrack:
                        if trackId in firstTrack:
                            min_x, min_y, max_x, max_y = interpolateBbox(
                                firstTrack[trackId], secondTrack[trackId],
                                i / skip)
                            result = [
                                x + y
                                for x, y in zip([min_x, min_y, max_x, max_y],
                                                firstTrack[trackId])
                            ]
                            tracks[trackId] = result

                    processedFrames.append(oldFrame)
                    processedTracks.append(tracks.copy())
                    unprocessedFrames = []
                processedFrames.append(frame)
                processedTracks.append(secondTrack)
                track_buffer.append(secondTrack)

            location = rescale(location, xScale, yScale, 0, 0, True)
            locations.append(location)

            if (frame_number >= skip):
                frame = processFrame(locations, processedFrames,
                                     processedTracks, track_tubeMap,
                                     track_actionMap, model, test_mode,
                                     bayesian, batch_factor)

            currentXs = []
            currentYs = []
            for det in detections:
                bbox = det.to_tlbr()

                currentXs.extend([int(bbox[0]), int(bbox[2])])
                currentYs.extend([int(bbox[1]), int(bbox[3])])

        else:
            unprocessedFrames.append(frame)
            locations.append([0, 0, 0, 0])
            if (frame_number >= skip):
                frame = processFrame(locations, processedFrames,
                                     processedTracks, track_tubeMap,
                                     track_actionMap, model, test_mode,
                                     bayesian, batch_factor)

        # Display video as processed if necessary

        # save a frame

        if (frame_number >= skip):
            writeFrame(frame, out, hide_window, test_mode)

        frame_number += 1

        # Updates progress bar every 10 frames
        if frame_number % 10 == 0:
            progress_recorder.set_progress(frame_number, frame_total)  ####

        if frame_number % 5 != 0:
            location = calculateLocation(currentXs, currentYs)

        fps = (fps + (1. / (time.time() - t1))) / 2
        print("fps= %f" % (fps / skip))

        # Press Q to stop!
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    video_capture.stop()

    while len(processedFrames) != 0:
        frame_number += 1
        frame = processFrame(locations, processedFrames, processedTracks,
                             track_tubeMap, track_actionMap, model, test_mode,
                             bayesian, batch_factor)
        writeFrame(frame, out, hide_window, test_mode)

    while len(unprocessedFrames) != 0:
        frame_number += 1
        frame = unprocessedFrames.pop()
        writeFrame(frame, out, hide_window, test_mode)

    if not test_mode:
        out.release()

    cv2.destroyAllWindows()

    if test_mode:
        object_detection_file.close()
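`interpolateBbox` is used in the example above to fill in boxes for skipped frames but is not shown on this page. Because its return value is added back onto `firstTrack[trackId]`, it presumably returns a fractional per-coordinate offset between the two boxes; a minimal sketch under that assumption:

import numpy as np

def interpolateBbox(first, second, fraction):
    # Hypothetical sketch: delta = fraction * (second - first); the caller adds
    # this delta onto `first` to get the interpolated box for a skipped frame.
    return fraction * (np.array(second, dtype=float) - np.array(first, dtype=float))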
Code Example #6
        if re.findall(r'([-\w]+\.(?:jpg|gif|png))',
                      os.path.basename(file_path[0].lower())):
            return render_template('person_reid.html')
        else:
            return render_template('person_det.html')


print('Load Input Arguments')
args = parse_args()

print('Load Tracker ...')
encoder = None
if args.tracking_type == "deep_sort":
    encoder = gdet.create_box_encoder(model_filename=args.tracker_weights,
                                      batch_size=8)
    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", args.max_cosine_distance)

print('Load Object Detection model ...')
person_handler = PersonHandler(args, encoder=encoder)

print('Load Label Map')
cls_dict = load_cls_dict(args.data_path)

# grab image and do object detection (until stopped by user)
print('starting to loop and detect')
vis = BBoxVisualization(cls_dict)


@app.route('/person_reid', methods=['GET'])
def person_reid():
    return Response(person_handler.loop_and_detect(loader, vis, file_path),