Esempio n. 1
0
def convertData(gesture):
    """Extract pose/hand features from every video of *gesture* and save them.

    The rows of ``sample.csv`` are used as the head of the output table. For
    each video listed under ``dataset[gesture]["videos"]`` a handful of frames
    is sampled, the first detected person's body and hand keypoints are
    reduced to one value per keypoint with a 1-component PCA, and one row per
    sampled frame is added. The result is written to
    ``dataset720new/<gesture>.csv``.

    Note: the ``--gpu`` option is parsed from ``sys.argv`` on every call.

    Args:
        gesture: Key into the dictionary returned by ``buildGestureDict``;
            also used as the output file stem.
    """
    parser = argparse.ArgumentParser(description='Pose detector')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    # load model
    pose_detector = PoseDetector("posenet",
                                 "models/coco_posenet.npz",
                                 device=args.gpu)
    hand_detector = HandDetector("handnet",
                                 "models/handnet.npz",
                                 device=args.gpu)
    dataset = buildGestureDict("dataset/")

    # Collect all pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copies the whole table on every call.
    pieces = [pd.read_csv("sample.csv")]

    # Initialised here so the timing message below works even when the
    # gesture has no videos; it is reset per video, so the message reports
    # the time spent on the last video.
    startvideo = time.time()
    for video in dataset[gesture]["videos"]:
        print("Currently processing the video for " + video["filename"])
        startvideo = time.time()
        cap = cv2.VideoCapture(video["filepath"])
        try:
            pieces.extend(
                _sample_video(cap, video, pose_detector, hand_detector))
        finally:
            # Release the capture and the preview window even if a frame
            # fails to process.
            cap.release()
            cv2.destroyAllWindows()

    gesturedf = pd.concat(pieces, sort=False)
    gesturedf.to_csv("dataset720new/" + gesture + ".csv", index=False)
    print("Done Recording for: " + gesture)
    print("Took " + str(time.time() - startvideo) + "seconds")


# Coordinates substituted for any keypoint that could not be detected.
_MISSING_POINT = (1000, 1000)
# Number of keypoints in the placeholder used when a hand crop is missing.
_HAND_KEYPOINT_COUNT = 21
# Number of body keypoints that get a "head_N" column name.
_BODY_KEYPOINT_COUNT = 14
# The video length is divided by this to get the sampling interval.
_SAMPLES_PER_VIDEO = 12
# Rows of the person pose that are discarded before feature extraction.
_DROPPED_BODY_ROWS = [9, 10, 12, 13]


def _strip_third_component(keypoints):
    """Return keypoints as int lists with the element at index 2 removed.

    ``None`` entries are passed through unchanged.
    """
    return [
        None if keypoint is None else
        [int(value) for value in np.delete(keypoint, 2)]
        for keypoint in keypoints
    ]


def _fill_missing(keypoints):
    """Return a copy of *keypoints* with ``None`` replaced by the placeholder."""
    return [
        list(_MISSING_POINT) if keypoint is None else keypoint
        for keypoint in keypoints
    ]


def _hand_points(hand, hand_type, hand_detector, res_img):
    """Detect the keypoints of one cropped hand and draw them on *res_img*.

    Returns ``(keypoints, res_img)``. When *hand* is ``None`` the keypoints
    are an all-placeholder list and the image is returned untouched.
    """
    if hand is None:
        placeholder = [
            list(_MISSING_POINT) for _ in range(_HAND_KEYPOINT_COUNT)
        ]
        return placeholder, res_img

    bbox = hand["bbox"]
    keypoints = _strip_third_component(
        hand_detector(hand["img"], hand_type=hand_type))
    res_img = draw_hand_keypoints(res_img, keypoints, (bbox[0], bbox[1]))
    cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                  (255, 255, 255), 1)
    return keypoints, res_img


def _pca_row(points, prefix, count):
    """Project *points* onto one principal component as a one-row DataFrame.

    The first *count* columns are named ``<prefix>_1`` .. ``<prefix>_<count>``.
    """
    projected = sklearnPCA(n_components=1).fit_transform(points)
    row = pd.DataFrame(data=projected).T
    return row.rename(
        columns={i: f"{prefix}_{i + 1}" for i in range(count)})


def _frame_features(img, pose_detector, hand_detector):
    """Compute the feature row for one frame.

    Returns ``(row, res_img)`` where *row* is a one-row DataFrame built from
    the first detected person, or ``None`` when nobody was detected, and
    *res_img* is the frame with the detections drawn on it.
    """
    person_pose_array, _ = pose_detector(img)
    res_img = cv2.addWeighted(img, 0.6,
                              draw_person_pose(img, person_pose_array), 0.4,
                              0)

    # Only the first person is used; the loop body returns immediately.
    for person_pose in person_pose_array:
        unit_length = pose_detector.get_unit_length(person_pose)
        hands = pose_detector.crop_hands(img, person_pose, unit_length)
        left, res_img = _hand_points(hands["left"], "left", hand_detector,
                                     res_img)
        right, res_img = _hand_points(hands["right"], "right", hand_detector,
                                      res_img)

        print("Body Pose")
        body = _strip_third_component(
            np.delete(person_pose, _DROPPED_BODY_ROWS, 0).tolist())
        print(body)
        print("Left")
        print(left)
        print("Right")
        print(right)

        row = pd.concat([
            _pca_row(_fill_missing(body), "head", _BODY_KEYPOINT_COUNT),
            _pca_row(_fill_missing(left), "left", _HAND_KEYPOINT_COUNT),
            _pca_row(_fill_missing(right), "right", _HAND_KEYPOINT_COUNT),
        ], axis=1)
        return row, res_img

    # Nobody detected: there is nothing to record for this frame.
    return None, res_img


def _sample_video(cap, video, pose_detector, hand_detector):
    """Return the list of one-row feature DataFrames sampled from *cap*.

    Every ``frame_tracker``-th frame is analysed; the others are only shown
    in the preview window. Pressing ``q`` stops the video early.
    """
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
    amount_of_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    print("Amount of Frames:", amount_of_frames)
    cap.set(cv2.CAP_PROP_FPS, 5)

    # Clamp to 1 so videos shorter than _SAMPLES_PER_VIDEO frames do not
    # cause a modulo-by-zero below.
    frame_tracker = max(1, int(amount_of_frames / _SAMPLES_PER_VIDEO))

    rows = []
    counter = 1
    framecounter = 0
    # The first frame is consumed here only to check that the video can be
    # read; sampling starts with the second frame.
    ret, img = cap.read()
    while ret:
        ret, img = cap.read()
        if not ret:
            print("Failed to capture image")
            break

        shown = img
        # The detectors run only on sampled frames; their output for the
        # other frames was never used.
        if counter % frame_tracker == 0:
            row, shown = _frame_features(img, pose_detector, hand_detector)
            if row is not None:
                row["frame"] = framecounter
                row["gesture"] = video["gesture"]
                row["speaker"] = video["actor"]
                framecounter += 1
                rows.append(row)
        cv2.imshow("result", shown)

        # Advance on every frame; otherwise the counter sticks on the first
        # multiple of frame_tracker and every later frame is sampled.
        counter += 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    return rows
        unit_length = pose_detector.get_unit_length(person_pose)

        # face estimation
        print("Estimating face keypoints...")
        cropped_face_img, bbox = pose_detector.crop_face(
            img, person_pose, unit_length)
        if cropped_face_img is not None:
            face_keypoints = face_detector(cropped_face_img)
            res_img = draw_face_keypoints(res_img, face_keypoints,
                                          (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                          (255, 255, 255), 1)

        # hands estimation
        print("Estimating hands keypoints...")
        hands = pose_detector.crop_hands(img, person_pose, unit_length)
        if hands["left"] is not None:
            hand_img = hands["left"]["img"]
            bbox = hands["left"]["bbox"]
            hand_keypoints = hand_detector(hand_img, hand_type="left")
            res_img = draw_hand_keypoints(res_img, hand_keypoints,
                                          (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                          (255, 255, 255), 1)

        if hands["right"] is not None:
            hand_img = hands["right"]["img"]
            bbox = hands["right"]["bbox"]
            hand_keypoints = hand_detector(hand_img, hand_type="right")
            res_img = draw_hand_keypoints(res_img, hand_keypoints,
                                          (bbox[0], bbox[1]))
    # each person detected
    for person_pose in person_pose_array:
        unit_length = pose_detector.get_unit_length(person_pose)

        # face estimation
        print("Estimating face keypoints...")
        cropped_face_img, bbox = pose_detector.crop_face(img, person_pose, unit_length)
        if cropped_face_img is not None:
            face_keypoints = face_detector(cropped_face_img)
            res_img = draw_face_keypoints(res_img, face_keypoints, (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 255, 255), 1)

        # hands estimation
        print("Estimating hands keypoints...")
        hands = pose_detector.crop_hands(img, person_pose, unit_length)
        if hands["left"] is not None:
            hand_img = hands["left"]["img"]
            bbox = hands["left"]["bbox"]
            hand_keypoints = hand_detector(hand_img, hand_type="left")
            res_img = draw_hand_keypoints(res_img, hand_keypoints, (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 255, 255), 1)

        if hands["right"] is not None:
            hand_img = hands["right"]["img"]
            bbox = hands["right"]["bbox"]
            hand_keypoints = hand_detector(hand_img, hand_type="right")
            res_img = draw_hand_keypoints(res_img, hand_keypoints, (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 255, 255), 1)

    print('Saving result into result.png...')
def main(cap, im_scale=2, view_results=False):
    """Run pose and hand detection over a video and save hand point labels.

    Each frame read from *cap* is shrunk by *im_scale* before it is passed to
    the detectors. For every detected hand whose keypoints 5 and 8 are both
    present, that pair of points is mapped back to input-video coordinates
    (bounding-box offset added, then multiplied by *im_scale*) and stored
    under the current frame index. When the video ends, or Esc is pressed,
    the collected labels are written through ``VideoLabelFile``.

    Args:
        cap: Video source exposing ``video_fname``, ``eof``, ``read()`` and
            ``frame_idx()``.
        im_scale: Divisor applied to the frame size before detection.
        view_results: When true, a window named ``display`` shows the
            down-scaled input frame; otherwise a dot is printed per frame.
    """
    iteration = 0
    frame_durations = [0] * 16
    fps = 0

    # load model
    pose_device = 0
    pose_model_dir = '../../Chainer_Realtime_Multi-Person_Pose_Estimation/models'
    pose_detector = PoseDetector("posenet",
                                 f"{pose_model_dir}/coco_posenet.npz",
                                 device=pose_device)
    hand_detector = HandDetector("handnet",
                                 f"{pose_model_dir}/handnet.npz",
                                 device=pose_device)

    if view_results:
        cv2.namedWindow('display')

    video_label_file = VideoLabelFile(cap.video_fname,
                                      fname_add='pre_points_pose')
    labels_current = defaultdict(list)
    # The previous labels are loaded but not read again in this function.
    labels_all_previous = video_label_file.load_previous()

    # The first frame only provides the source size; the model input size is
    # derived from it once, since im_scale does not change.
    im_input = cap.read()
    src_height, src_width = im_input.shape[0:2]
    model_size = (round(src_width / im_scale), round(src_height / im_scale))

    marker_colours = ((255, 0, 0), (0, 255, 0))

    while not cap.eof:
        started = time.perf_counter()
        iteration += 1

        im_input = cap.read()
        current_frame_id = cap.frame_idx()

        im_pose = cv2.resize(im_input, model_size)
        if iteration == 1:
            print(
                f"Video size {im_input.shape} -> Model input size {im_pose.shape}"
            )

        # ---- detection -------------------------------------------------
        person_pose_array, _ = pose_detector(im_pose)
        skeleton = draw_person_pose(im_pose, person_pose_array)
        im_display = cv2.addWeighted(im_pose, 0.6, skeleton, 0.4, 0)

        for person_pose in person_pose_array:
            unit_length = pose_detector.get_unit_length(person_pose)
            hands = pose_detector.crop_hands(im_pose, person_pose, unit_length)

            # Left is handled before right, both in exactly the same way.
            for side in ("left", "right"):
                hand = hands[side]
                if hand is None:
                    continue

                bbox = hand["bbox"]
                origin = (bbox[0], bbox[1])
                hand_keypoints = hand_detector(hand["img"], hand_type=side)
                im_display = draw_hand_keypoints(im_display, hand_keypoints,
                                                 origin)
                cv2.rectangle(im_display, origin, (bbox[2], bbox[3]),
                              (255, 255, 255), 1)

                # Keypoints 5 and 8 are presumably two joints of one finger
                # (the original local is called f_points) - TODO confirm.
                if not (hand_keypoints[5] and hand_keypoints[8]):
                    continue
                pair = np.array(
                    [hand_keypoints[5][:2], hand_keypoints[8][:2]])
                # Crop coordinates -> model image -> input video coordinates.
                pair = (pair + np.array([bbox[0], bbox[1]])) * im_scale
                labels_current[current_frame_id].append(
                    pair.astype(int).tolist())

        # ---- overlay ---------------------------------------------------
        # Indexing the defaultdict also creates an empty entry for frames
        # without labels.
        for label in labels_current[current_frame_id]:
            for point, colour in zip(label, marker_colours):
                centre = (round(point[0] / im_scale),
                          round(point[1] / im_scale))
                cv2.circle(im_display, centre, 10, colour, 2)

        cv2.putText(im_display,
                    f"frame {int(current_frame_id)}, fps: {int(fps)}.",
                    (10, im_display.shape[0] - 10), cv2.FONT_HERSHEY_SIMPLEX,
                    0.8, (255, 255, 255), 2)

        if view_results:
            # The plain down-scaled frame is shown, not the annotated one.
            cv2.imshow('display', im_pose)
        else:
            print(".", end="")
            sys.stdout.flush()

        # ---- keyboard --------------------------------------------------
        key = cv2.waitKey(5)
        if key == 27:  # esc
            break
        if key == ord('c'):
            import ipdb
            ipdb.set_trace()

        # Frame rate estimate from the 16 most recent frame durations.
        frame_durations[iteration % 16] = time.perf_counter() - started
        fps = int(len(frame_durations) / sum(frame_durations))

    print(". ")
    video_label_file.save_current_labels(labels_current,
                                         append_previous=False,
                                         custom_lists=True)

    if view_results:
        cv2.destroyAllWindows()
Esempio n. 5
0
def estimate_pose(img_path, gpu=-1):
    """Detect the body and hands of one person in an image and save an overlay.

    The pose detector is run on the image at *img_path*. For the first
    detected person only, both hands are cropped, their keypoints are
    detected, printed and drawn. The annotated image is written to
    ``result.png`` in the current working directory.

    Args:
        img_path: Path of the image file to analyse.
        gpu: Device id passed to both detectors (default ``-1``).

    Raises:
        OSError: If the image cannot be read by ``cv2.imread``.
    """
    # Read the image before loading the models so that an unreadable path
    # fails immediately with a clear message; cv2.imread signals failure by
    # returning None instead of raising.
    img = cv2.imread(img_path)
    if img is None:
        raise OSError(f"Could not read image: {img_path}")

    # load model
    print("Loading pose detection model...")
    pose_detector = PoseDetector("posenet", "models/coco_posenet.npz", device=gpu)
    print("Loading hand detection model...")
    hand_detector = HandDetector("handnet", "models/handnet.npz", device=gpu)

    # inference
    print("Estimating pose...")
    person_pose_array, _ = pose_detector(img)

    res_img = cv2.addWeighted(img, 0.6, draw_person_pose(img, person_pose_array), 0.4, 0)

    hand_labels = (("left", "Left hand: "), ("right", "Right hand: "))

    for person_pose in person_pose_array:
        print("Body:", person_pose)
        unit_length = pose_detector.get_unit_length(person_pose)

        # hands estimation
        print("Estimating hands keypoints...")
        hands = pose_detector.crop_hands(img, person_pose, unit_length)
        for side, label in hand_labels:
            hand = hands[side]
            if hand is None:
                continue
            bbox = hand["bbox"]
            hand_keypoints = hand_detector(hand["img"], hand_type=side)
            print(label, print_arr(hand_keypoints))

            res_img = draw_hand_keypoints(res_img, hand_keypoints, (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 255, 255), 1)

        # Only the first detected person is recognised; stop instead of
        # iterating over the remaining detections.
        break

    print('Saving result into result.png...')
    cv2.imwrite('result.png', res_img)