Example #1
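These snippets are excerpts from larger scripts. Example #1, for instance, relies on the Yet-Another-EfficientDet-Pytorch helpers; a plausible import block is sketched below (module paths are assumed from that project's usual layout, and display() is a user-defined drawing helper that is not shown in the excerpt).

import os
import cv2
import imutils
import torch
from shutil import copyfile
from torch.backends import cudnn
from backbone import EfficientDetBackbone
from efficientdet.utils import BBoxTransform, ClipBoxes
from utils.utils import preprocess_video, invert_affine, postprocess
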
def excuteModel(videoname):
    # Video's path
    # set int to use webcam, set str to read from a video file

    if videoname is not None:
        video_src = os.path.join(r'D:\GitHub\Detection\server\uploads', f"{videoname}.mp4")
    else:
        video_src = r'D:\GitHub\Detection\server\AImodel\videotest\default.mp4'

    compound_coef = 2
    trained_weights = 'D:\\GitHub\\Detection\\server\\AImodel\\weights\\efficientdet-video.pth'

    force_input_size = None  # set None to use default size

    threshold = 0.2
    iou_threshold = 0.2

    use_cuda = True
    use_float16 = False
    cudnn.fastest = True
    cudnn.benchmark = True

    obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
                'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
                'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie',
                'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
                'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
                'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
                'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv',
                'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
                'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
                'toothbrush']

    # tf bilinear interpolation is different from any other's, just make do
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    input_size = input_sizes[compound_coef] if force_input_size is None else force_input_size

    # load model
    model = EfficientDetBackbone(
        compound_coef=compound_coef, num_classes=len(obj_list))
    model.load_state_dict(torch.load(trained_weights))

    model.requires_grad_(False)
    model.eval()

    if use_cuda:
        model = model.cuda()
    if use_float16:
        model = model.half()

    # display() used below for drawing boxes is assumed to be defined elsewhere

    # Box
    regressBoxes = BBoxTransform()
    clipBoxes = ClipBoxes()

    # Video capture
    cap = cv2.VideoCapture(video_src)
    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    writer = None
    # try to determine the total number of frames in the video file
    try:
        prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
            else cv2.CAP_PROP_FRAME_COUNT
        total = int(cap.get(prop))
        print("[INFO] {} total frames in video".format(total))

    # an error occurred while trying to determine the total
    # number of frames in the video file
    except Exception:
        print("[INFO] could not determine # of frames in video")
        total = -1

    path_out = os.path.join(os.path.dirname(
        os.path.abspath(__file__)), 'outvideo')

    path_result = r"D:\GitHub\Detection\server\AImodel\videotest\default.mp4"
    path_asset = r"D:\GitHub\Detection\client\src\assets"
    for i in range(0, length):
        ret, frame = cap.read()
        if not ret:
            break

        # frame preprocessing
        ori_imgs, framed_imgs, framed_metas = preprocess_video(
            frame, max_size=input_size)

        if use_cuda:
            x = torch.stack([torch.from_numpy(fi).cuda()
                             for fi in framed_imgs], 0)
        else:
            x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)

        x = x.to(torch.float32 if not use_float16 else torch.float16).permute(
            0, 3, 1, 2)

        # model predict
        with torch.no_grad():
            features, regression, classification, anchors = model(x)

            out = postprocess(x,
                              anchors, regression, classification,
                              regressBoxes, clipBoxes,
                              threshold, iou_threshold)

        # result
        out = invert_affine(framed_metas, out)
        img_show = display(out, ori_imgs, obj_list)

        if writer is None:

            # initialize our video writer
            fourcc = 0x00000021
            #fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            if videoname is not None:
                path_result = os.path.join(path_out, f"{videoname}.mp4")
            else:
                path_result = os.path.join(path_out, "default.mp4")

            writer = cv2.VideoWriter(path_result, fourcc, 30, (img_show.shape[1], img_show.shape[0]), True)


        # write the output frame to disk
        writer.write(img_show)
        print("Processing data... " + str(round((i+1)/length, 3)*100) + " %")
        # show frame by frame
        #cv2.imshow('frame', img_show)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    print("[INFO] cleaning up...")

    if writer is not None:
        writer.release()
    cap.release()
    cv2.destroyAllWindows()

    if videoname is not None:
        path_asset = os.path.join(path_asset, f"{videoname}.mp4")
    else:
        path_asset = os.path.join(path_asset, "default.mp4")
    copyfile(path_result, path_asset)
    return path_asset
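
A minimal call, assuming a previously uploaded file uploads/sample.mp4 (the name is hypothetical):

result = excuteModel("sample")  # writes outvideo/sample.mp4 and returns the path of the copy placed in the client assets folder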
Example #2

# Box
regressBoxes = BBoxTransform()
clipBoxes = ClipBoxes()

# Video capture
cap = cv2.VideoCapture(video_src)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # frame preprocessing
    ori_imgs, framed_imgs, framed_metas = preprocess_video(frame,
                                                           max_size=input_size)

    if use_cuda:
        x = torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0)
    else:
        x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)

    x = x.to(torch.float32 if not use_float16 else torch.float16).permute(
        0, 3, 1, 2)

    # model predict
    with torch.no_grad():
        features, regression, classification, anchors = model(x)

        out = postprocess(x, anchors, regression, classification, regressBoxes,
                          clipBoxes, threshold, iou_threshold)
Example #3

frame_width = int(cap.get(3))
frame_height = int(cap.get(4))
fourcc = cv2.VideoWriter_fourcc(*'MPEG')
fps = cap.get(cv2.CAP_PROP_FPS)
print(fps)
outp = cv2.VideoWriter(dir + "/" + "output_" + vid, fourcc, fps,
                       (frame_width, frame_height))

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # frame preprocessing
    ori_imgs, framed_imgs, framed_metas, t1 = preprocess_video(
        frame, width=input_size, height=input_size)
    print("Preprocessing time", time.time() - t1)
    if use_cuda:
        x = torch.stack([fi.cuda() for fi in framed_imgs], 0)
    else:
        x = torch.stack(
            [torch.from_numpy(fi) for fi in framed_imgs], 0)

    t1 = time.time()
    # model predict
    with torch.no_grad():
        features, regression, classification, anchors = model(x)

        out = postprocess(x, anchors, regression, classification,
                          regressBoxes, clipBoxes, threshold,
                          iou_threshold)
Example #4

def efficientDet_video_inference(video_src, compound_coef=0, force_input_size=None,
                                 frame_skipping=3,
                                 threshold=0.2, out_path=None, imshow=False,
                                 display_fps=False):

    #deep-sort variables
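    # NOTE: the tracking pieces assume the standard deep_sort package layout, e.g. (imports not shown in the snippet):
    #   from deep_sort import nn_matching, preprocessing
    #   from deep_sort.detection import Detection
    #   from deep_sort.tracker import Tracker
    #   from tools import generate_detections as gdet
    # obj_list and input_sizes are assumed to be module-level globals, as in Example #1.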

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0


    model_filename = '/home/shaheryar/Desktop/Projects/Football-Monitoring/deep_sort/model_weights/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric,n_init=5)

    # efficientDet-pytorch variables
    iou_threshold = 0.4
    use_cuda = True
    use_float16 = False
    cudnn.fastest = True
    cudnn.benchmark = True

    input_size = input_sizes[compound_coef] if force_input_size is None else force_input_size

    # load model
    model = EfficientDetBackbone(compound_coef=compound_coef, num_classes=len(obj_list))
    model.load_state_dict(torch.load(f'weights/efficientdet-d{compound_coef}.pth'))
    model.requires_grad_(False)
    model.eval()

    if use_cuda:
        model = model.cuda()
    if use_float16:
        model = model.half()

    regressBoxes = BBoxTransform()
    clipBoxes = ClipBoxes()

    # Video capture
    cap = cv2.VideoCapture(video_src)
    frame_width = int(cap.get(3))
    frame_height = int(cap.get(4))
    fourcc = cv2.VideoWriter_fourcc(*'MPEG')
    fps = cap.get(cv2.CAP_PROP_FPS)
    print("Video fps",fps)
    if(out_path is not None):
        outp = cv2.VideoWriter(out_path, fourcc, fps, (frame_width, frame_height))
    i=0
    start= time.time()
    current_frame_fps=0
    while True:

        ret, frame = cap.read()

        if not ret:
            break
        t1=time.time()
        if (frame_skipping==0 or i%frame_skipping==0):
        # if(True):


            # frame preprocessing (running detections)
            ori_imgs, framed_imgs, framed_metas, t1 = preprocess_video(frame, width=input_size, height=input_size)
            if use_cuda:
                x = torch.stack([fi.cuda() for fi in framed_imgs], 0)
            else:
                x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)
            # model predict
            t1=time.time()
            with torch.no_grad():
                features, regression, classification, anchors = model(x)

                out = postprocess(x,
                                  anchors, regression, classification,
                                  regressBoxes, clipBoxes,
                                  threshold, iou_threshold)
            # Post processing
            out = invert_affine(framed_metas, out)
            # decoding bbox ,object name and scores
            boxes,classes,scores =decode_predictions(out[0])
            org_boxes = boxes.copy()
            t2 = time.time() - t1

            # feature extraction for deep sort
            boxes = [convert_bbox_to_deep_sort_format(frame.shape, b) for b in boxes]

            features = encoder(frame,boxes)
            detections = [Detection(bbox, 1.0, feature) for bbox, feature in zip(boxes, features)]
            boxes = np.array([d.tlwh for d in detections])
            # print(boxes)
            scores = np.array([d.confidence for d in detections])
            indices = preprocessing.non_max_suppression(boxes, nms_max_overlap, scores)
            detections = [detections[i] for i in indices]
            tracker.predict()
            tracker.update(detections)



        i = i + 1
        img_show=frame.copy()
        for j in range(len(org_boxes)):
            img_show =drawBoxes(img_show,org_boxes[j],(255,255,0),str(tracker.tracks[j].track_id))

        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlbr()
            x1=int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2=int(bbox[3])
            roi= frame[y1:y2,x1:x2]
            cv2.rectangle(img_show, (x1, y1), (x2, y2), update_color_association(roi, track.track_id), 2)
            cv2.putText(img_show, str(track.track_id), (x1, y1), 0, 5e-3 * 100, (255, 255, 0), 1)


        if display_fps:
            current_frame_fps=1/t2
        else:
            current_frame_fps=0

        cv2.putText(img_show, 'FPS: {0:.2f}'.format(current_frame_fps), (30, 50), cv2.FONT_HERSHEY_SIMPLEX, 1,
                    (255, 255, 0),
                    2, cv2.LINE_AA)
        if (i % int(fps) == 0):
            print("Processed ", str(int(i / fps)), "seconds")
            print("Time taken",time.time()-start)
            # print(color_dict)

        if imshow:
            img_show=cv2.resize(img_show,(0,0),fx=0.75,fy=0.75)
            cv2.imshow('Frame',img_show)
            # Press Q on keyboard to  exit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        if out_path is not None:
            outp.write(img_show)

    cap.release()
    if out_path is not None:
        outp.release()
Example #5
def effdet_detection(content, effdet):

    video_src = 0  # set int to use webcam, set str to read from a video file

    compound_coef = 0
    force_input_size = None  # set None to use default size

    threshold = 0.5
    iou_threshold = 0.2

    use_cuda = True
    use_float16 = False
    cudnn.fastest = True
    cudnn.benchmark = True

    obj_list = [
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
        'truck', 'boat', 'traffic light', 'fire hydrant', '', 'stop sign',
        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
        'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack',
        'umbrella', '', '', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
        'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass',
        'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
        'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
        'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '',
        'toilet', '', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
        'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
        '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
        'toothbrush'
    ]

    # tf bilinear interpolation is different from any other's, just make do
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    input_size = input_sizes[
        compound_coef] if force_input_size is None else force_input_size

    # load model
    model = EfficientDetBackbone(compound_coef=compound_coef,
                                 num_classes=len(obj_list))
    model.load_state_dict(
        torch.load(f'weights/efficientdet-d{compound_coef}.pth'))
    model.requires_grad_(False)
    model.eval()

    if use_cuda:
        model = model.cuda()
    if use_float16:
        model = model.half()

    # function for display
    def display(preds, imgs, content, effdet):
        for i in range(len(imgs)):
            if len(preds[i]['rois']) == 0:
                return imgs[i]

            for j in range(len(preds[i]['rois'])):
                (x1, y1, x2, y2) = preds[i]['rois'][j].astype(int)
                #cv2.rectangle(imgs[i], (x1, y1), (x2, y2), (255, 255, 0), 2)
                obj = obj_list[preds[i]['class_ids'][j]]
                score = float(preds[i]['scores'][j])

                if obj == content:
                    effdet.send_message_to_scratch(
                        (x1 + x2) * 0.5 * 0.625 - 200)  # send the detected box position for the requested class to Scratch
                    print((x1 + x2) * 0.5 * 0.625 - 200)
                    cv2.rectangle(imgs[i], (x1, y1), (x2, y2), (255, 255, 0),
                                  2)

                    cv2.putText(imgs[i], '{}, {:.3f}'.format(obj, score),
                                (x1, y1 + 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                (255, 255, 0), 1)

            return imgs[i]

    # Box
    regressBoxes = BBoxTransform()
    clipBoxes = ClipBoxes()

    # Video capture
    cap = cv2.VideoCapture(video_src)

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # frame preprocessing
        ori_imgs, framed_imgs, framed_metas = preprocess_video(
            frame, max_size=input_size)

        if use_cuda:
            x = torch.stack(
                [torch.from_numpy(fi).cuda() for fi in framed_imgs], 0)
        else:
            x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)

        x = x.to(torch.float32 if not use_float16 else torch.float16).permute(
            0, 3, 1, 2)

        # model predict
        with torch.no_grad():
            features, regression, classification, anchors = model(x)

            out = postprocess(x, anchors, regression, classification,
                              regressBoxes, clipBoxes, threshold,
                              iou_threshold)

        # result
        out = invert_affine(framed_metas, out)
        img_show = display(out, ori_imgs, content, effdet)

        # show frame by frame
        cv2.imshow('frame', img_show)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()
Example #6

    def detect_video(self):
        # Box
        regressBoxes = BBoxTransform()
        clipBoxes = ClipBoxes()

        # Video capture
        cap = cv2.VideoCapture(self.video_src)
        if not cap.isOpened():
            raise IOError("Couldn't open webcam or video")
        video_FourCC = int(cap.get(cv2.CAP_PROP_FOURCC))
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        video_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                      int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

        # video recorder
        isVideoOutput = (self.video_output is not None) and (self.video_output != "")
        if isVideoOutput:
            # print("TYPE:", type(self.video_output), type(video_FourCC), type(video_fps), type(video_size))
            output2video = cv2.VideoWriter(self.video_output, video_FourCC, video_fps, video_size)

        # text recorder
        isTextOutput = (self.text_output is not None) and (self.text_output != "")
        if isTextOutput:
            output2text = open(self.text_output, 'w', encoding='utf-8')
            output2text.write("Frame,Obj_ID,Type,x1,y1,x2,y2\n")
            track_result = {}
            for obj_cls in self.obj_list:
                track_result[obj_cls] = set([])
        else:
            output2text = None
            track_result = None

        accum_time = 0
        curr_fps = 0
        fps = "FPS: ??"
        prev_time = timer()

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # frame preprocessing
            ori_imgs, framed_imgs, framed_metas = preprocess_video(frame, max_size=self.input_size)

            if self.use_cuda:
                x = torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0)
            else:
                x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)

            x = x.to(torch.float32 if not self.use_float16 else torch.float16).permute(0, 3, 1, 2)

            # model predict
            with torch.no_grad():
                features, regression, classification, anchors = self.detector(x)

                out = postprocess(x,
                                  anchors, regression, classification,
                                  regressBoxes, clipBoxes,
                                  self.threshold, self.iou_threshold)

            # detector result
            out = invert_affine(framed_metas, out)
            # out = [{[xyxy], [class], [scores]}, ...]
            bbox_xyxy = out[0]['rois']
            bbox_xywh = xyxy_to_xywh(bbox_xyxy)
            cls_ids = out[0]['class_ids']
            cls_conf = out[0]['scores']

            # tracker results
            # frame, class, conf, object identification
            tracker_out = {'rois': np.empty(shape=(0, 4)), 'class_ids': np.empty(shape=(0,), dtype=int),
                           'obj_ids': np.empty(shape=(0,), dtype=int)}
            for index, target in enumerate(self.selected_target):
                mask = cls_ids == target
                bbox = bbox_xywh[mask]
                conf = cls_conf[mask]

                outputs = self.trackers[index].update(bbox, conf, frame)

                if len(outputs) > 0:
                    tracker_out['rois'] = np.append(tracker_out['rois'], outputs[:, 0:4], axis=0)
                    tracker_out['class_ids'] = np.append(tracker_out['class_ids'], np.repeat(target, outputs.shape[0]))
                    tracker_out['obj_ids'] = np.append(tracker_out['obj_ids'], outputs[:, -1])

            # show bbox info and results
            img_show = self._display(tracker_out, ori_imgs[0], output2text, track_result)

            # show frame by frame
            curr_time = timer()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1
            if accum_time > 1:
                accum_time = accum_time - 1
                fps = "FPS: " + str(curr_fps)
                curr_fps = 0

            # show FPS
            cv2.putText(img_show, text=fps, org=(3, 15), fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=0.50, color=(255, 255, 0), thickness=2)
            cv2.namedWindow("frame", cv2.WINDOW_NORMAL)
            cv2.imshow("frame", img_show)

            if isVideoOutput:
                output2video.write(img_show)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        cap.release()
        cv2.destroyAllWindows()
        if output2text is not None:
            output2text.close()
        if track_result is not None:
            for obj_cls in self.obj_list:
                print(obj_cls + ': ' + str(len(track_result[obj_cls])))