Example 1
    def get_single_item(self, index):
        start_index, end_index = self.vid_indices[index]

        with h5py.File(self.h5_file, 'r') as db:
            self.db = db  # alias; the reads below go through self.db

            kp_2d = self.db['joints2D'][start_index:end_index + 1]
            kp_2d = convert_kps(kp_2d, src='insta', dst='spin')
            kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

            input = torch.from_numpy(
                self.db['features'][start_index:end_index + 1]).float()

            vid_name = self.db['vid_name'][start_index:end_index + 1]
            frame_id = self.db['frame_id'][start_index:end_index + 1].astype(str)
            instance_id = np.array(
                [v.decode('ascii') + f for v, f in zip(vid_name, frame_id)])

        for idx in range(self.seqlen):
            kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
            kp_2d_tensor[idx] = kp_2d[idx]

        target = {
            'features': input,
            'kp_2d': torch.from_numpy(kp_2d_tensor).float(),  # 2D keypoints transformed according to bbox cropping
            # 'instance_id': instance_id
        }

        return target
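
A self-contained sketch of the windowed HDF5 read pattern used above, with a throwaway file standing in for the real db; the dataset names mirror the snippet, while the feature dimension and joint count are illustrative assumptions:

import h5py
import numpy as np

# build a toy db with the same layout the loader expects (sizes are assumptions)
with h5py.File('/tmp/toy_db.h5', 'w') as db:
    db.create_dataset('features', data=np.random.rand(100, 2048).astype(np.float32))
    db.create_dataset('joints2D', data=np.random.rand(100, 25, 3))

start_index, end_index = 10, 25  # one (start, end) pair, as stored in vid_indices
with h5py.File('/tmp/toy_db.h5', 'r') as db:
    features = db['features'][start_index:end_index + 1]  # end index is inclusive
print(features.shape)  # (16, 2048)
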
Example 2
def read_data_train(dataset_path, debug=False):
    h, w = 2048, 2048
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'bbox': [],
        'img_name': [],
        'features': [],
    }

    model = spin.get_pretrained_hmr()

    # training data
    user_list = range(1, 9)
    seq_list = range(1, 3)
    vid_list = list(range(3)) + list(range(4, 9))

    # product = product(user_list, seq_list, vid_list)
    # user_i, seq_i, vid_i = product[process_id]

    for user_i in user_list:
        for seq_i in seq_list:
            seq_path = os.path.join(dataset_path, 'S' + str(user_i),
                                    'Seq' + str(seq_i))
            # mat file with annotations
            annot_file = os.path.join(seq_path, 'annot.mat')
            annot2 = sio.loadmat(annot_file)['annot2']
            annot3 = sio.loadmat(annot_file)['annot3']
            # calibration file and camera parameters
            for j, vid_i in enumerate(vid_list):
                # image folder
                imgs_path = os.path.join(seq_path, 'video_' + str(vid_i))
                # per frame
                pattern = os.path.join(imgs_path, '*.jpg')
                img_list = sorted(glob.glob(pattern))
                vid_used_frames = []
                vid_used_joints = []
                vid_used_bbox = []
                vid_segments = []
                vid_uniq_id = "subj" + str(user_i) + '_seq' + str(
                    seq_i) + "_vid" + str(vid_i) + "_seg0"
                for i, img_i in tqdm_enumerate(img_list):

                    # for each image we store the relevant annotations
                    img_name = img_i.split('/')[-1]
                    joints_2d_raw = np.reshape(annot2[vid_i][0][i], (1, 28, 2))
                    joints_2d_raw = np.append(joints_2d_raw,
                                              np.ones((1, 28, 1)),
                                              axis=2)
                    joints_2d = convert_kps(joints_2d_raw, "mpii3d",
                                            "spin").reshape((-1, 3))

                    # visualize = True
                    # if visualize == True and i == 500:
                    #     import matplotlib.pyplot as plt
                    #
                    #     frame = cv2.cvtColor(cv2.imread(img_i), cv2.COLOR_BGR2RGB)
                    #
                    #     for k in range(49):
                    #         kp = joints_2d[k]
                    #
                    #         frame = cv2.circle(
                    #             frame.copy(),
                    #             (int(kp[0]), int(kp[1])),
                    #             thickness=3,
                    #             color=(255, 0, 0),
                    #             radius=5,
                    #         )
                    #
                    #         cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1), cv2.FONT_HERSHEY_SIMPLEX, 1.5,
                    #                     (0, 255, 0),
                    #                     thickness=3)
                    #
                    #     plt.imshow(frame)
                    #     plt.show()

                    joints_3d_raw = np.reshape(annot3[vid_i][0][i],
                                               (1, 28, 3)) / 1000
                    joints_3d = convert_kps(joints_3d_raw, "mpii3d",
                                            "spin").reshape((-1, 3))

                    bbox = get_bbox_from_kp2d(
                        joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4)

                    joints_3d = joints_3d - joints_3d[39]  # index 39 is the pelvis (root) in the SPIN joint set

                    # check that all joints are visible
                    x_in = np.logical_and(joints_2d[:, 0] < w,
                                          joints_2d[:, 0] >= 0)
                    y_in = np.logical_and(joints_2d[:, 1] < h,
                                          joints_2d[:, 1] >= 0)
                    ok_pts = np.logical_and(x_in, y_in)
                    if np.sum(ok_pts) < joints_2d.shape[0]:
                        vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1]) + "_seg" + \
                            str(int(dataset['vid_name'][-1].split("_")[-1][3:]) + 1)
                        continue

                    dataset['vid_name'].append(vid_uniq_id)
                    dataset['frame_id'].append(img_name.split(".")[0])
                    dataset['img_name'].append(img_i)
                    dataset['joints2D'].append(joints_2d)
                    dataset['joints3D'].append(joints_3d)
                    dataset['bbox'].append(bbox)
                    vid_segments.append(vid_uniq_id)
                    vid_used_frames.append(img_i)
                    vid_used_joints.append(joints_2d)
                    vid_used_bbox.append(bbox)

                vid_segments = np.array(vid_segments)
                ids = np.zeros((len(set(vid_segments)) + 1))
                ids[-1] = len(vid_used_frames) + 1
                if (np.where(
                        vid_segments[:-1] != vid_segments[1:])[0]).size != 0:
                    ids[1:-1] = (np.where(
                        vid_segments[:-1] != vid_segments[1:])[0]) + 1

                for i in tqdm(range(len(set(vid_segments)))):
                    features = extract_features(
                        model,
                        np.array(vid_used_frames)[int(ids[i]):int(ids[i + 1])],
                        vid_used_bbox[int(ids[i]):int((ids[i + 1]))],
                        kp_2d=np.array(
                            vid_used_joints)[int(ids[i]):int(ids[i + 1])],
                        dataset='spin',
                        debug=False)
                    dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    dataset['features'] = np.concatenate(dataset['features'])

    return dataset
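
A sketch of how this reader is typically driven, following the convention of serializing one preprocessed db per split; the paths and output filename are illustrative assumptions:

import joblib

# hypothetical driver for the MPI-INF-3DHP training split
dataset = read_data_train('/path/to/mpi_inf_3dhp')
joblib.dump(dataset, '/path/to/vibe_db/mpii3d_train_db.pt')  # assumed output path
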
Example 3
def read_test_data(dataset_path):

    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'bbox': [],
        'img_name': [],
        'features': [],
        "valid_i": []
    }

    model = spin.get_pretrained_hmr()

    user_list = range(1, 7)

    for user_i in user_list:
        print('Subject', user_i)
        seq_path = os.path.join(dataset_path, 'mpi_inf_3dhp_test_set',
                                'TS' + str(user_i))
        # mat file with annotations
        annot_file = os.path.join(seq_path, 'annot_data.mat')
        mat_as_h5 = h5py.File(annot_file, 'r')
        annot2 = np.array(mat_as_h5['annot2'])
        annot3 = np.array(mat_as_h5['univ_annot3'])
        valid = np.array(mat_as_h5['valid_frame'])

        vid_used_frames = []
        vid_used_joints = []
        vid_used_bbox = []
        vid_segments = []
        vid_uniq_id = "subj" + str(user_i) + "_seg0"

        for frame_i, valid_i in tqdm(enumerate(valid)):

            img_i = os.path.join('mpi_inf_3dhp_test_set', 'TS' + str(user_i),
                                 'imageSequence',
                                 'img_' + str(frame_i + 1).zfill(6) + '.jpg')

            joints_2d_raw = np.expand_dims(annot2[frame_i, 0, :, :], axis=0)
            joints_2d_raw = np.append(joints_2d_raw,
                                      np.ones((1, 17, 1)),
                                      axis=2)

            joints_2d = convert_kps(joints_2d_raw,
                                    src="mpii3d_test",
                                    dst="spin").reshape((-1, 3))

            # visualize = True
            # if visualize == True:
            #     import matplotlib.pyplot as plt
            #
            #     frame = cv2.cvtColor(cv2.imread(os.path.join(dataset_path, img_i)), cv2.COLOR_BGR2RGB)
            #
            #     for k in range(49):
            #         kp = joints_2d[k]
            #
            #         frame = cv2.circle(
            #             frame.copy(),
            #             (int(kp[0]), int(kp[1])),
            #             thickness=3,
            #             color=(255, 0, 0),
            #             radius=5,
            #         )
            #
            #         cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0),
            #                     thickness=3)
            #
            #     plt.imshow(frame)
            #     plt.show()

            joints_3d_raw = np.reshape(annot3[frame_i, 0, :, :],
                                       (1, 17, 3)) / 1000
            joints_3d = convert_kps(joints_3d_raw, "mpii3d_test",
                                    "spin").reshape((-1, 3))
            joints_3d = joints_3d - joints_3d[39]  # subtract the pelvis (index 39), the root joint for the test split

            bbox = get_bbox_from_kp2d(
                joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4)

            # check that all joints are visible
            img_file = os.path.join(dataset_path, img_i)
            I = cv2.imread(img_file)
            h, w, _ = I.shape
            x_in = np.logical_and(joints_2d[:, 0] < w, joints_2d[:, 0] >= 0)
            y_in = np.logical_and(joints_2d[:, 1] < h, joints_2d[:, 1] >= 0)
            ok_pts = np.logical_and(x_in, y_in)

            if np.sum(ok_pts) < joints_2d.shape[0]:
                vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1]) + "_seg" + \
                              str(int(dataset['vid_name'][-1].split("_")[-1][3:]) + 1)
                continue

            dataset['vid_name'].append(vid_uniq_id)
            dataset['frame_id'].append(img_file.split("/")[-1].split(".")[0])
            dataset['img_name'].append(img_file)
            dataset['joints2D'].append(joints_2d)
            dataset['joints3D'].append(joints_3d)
            dataset['bbox'].append(bbox)
            dataset['valid_i'].append(valid_i)

            vid_segments.append(vid_uniq_id)
            vid_used_frames.append(img_file)
            vid_used_joints.append(joints_2d)
            vid_used_bbox.append(bbox)

        vid_segments = np.array(vid_segments)
        ids = np.zeros((len(set(vid_segments)) + 1))
        ids[-1] = len(vid_used_frames) + 1
        if (np.where(vid_segments[:-1] != vid_segments[1:])[0]).size != 0:
            ids[1:-1] = (np.where(
                vid_segments[:-1] != vid_segments[1:])[0]) + 1

        for i in tqdm(range(len(set(vid_segments)))):
            features = extract_features(
                model,
                np.array(vid_used_frames)[int(ids[i]):int(ids[i + 1])],
                vid_used_bbox[int(ids[i]):int(ids[i + 1])],
                kp_2d=np.array(vid_used_joints)[int(ids[i]):int(ids[i + 1])],
                dataset='spin',
                debug=False)
            dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    dataset['features'] = np.concatenate(dataset['features'])

    return dataset
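
The test-split reader pairs with the same driver pattern; the output path is again an illustrative assumption:

import joblib

dataset = read_test_data('/path/to/mpi_inf_3dhp')
joblib.dump(dataset, '/path/to/vibe_db/mpii3d_val_db.pt')  # assumed output path
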
Example 4
    def get_single_item(self, index):
        start_index, end_index = self.vid_indices[index]

        kp_2d = self.db['joints2D'][start_index:end_index + 1]
        if self.dataset_name != 'posetrack':
            kp_2d = convert_kps(kp_2d, src=self.dataset_name, dst='spin')
        kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

        bbox = self.db['bbox'][start_index:end_index + 1]

        input = torch.from_numpy(self.db['features'][start_index:end_index + 1]).float()

        for idx in range(self.seqlen):
            # crop image and transform 2d keypoints
            kp_2d[idx, :, :2], trans = transfrom_keypoints(
                kp_2d=kp_2d[idx, :, :2],
                center_x=bbox[idx, 0],
                center_y=bbox[idx, 1],
                width=bbox[idx, 2],
                height=bbox[idx, 3],
                patch_width=224,
                patch_height=224,
                do_augment=False,
            )

            kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
            kp_2d_tensor[idx] = kp_2d[idx]

        vid_name = self.db['vid_name'][start_index:end_index + 1]
        frame_id = self.db['img_name'][start_index:end_index + 1].astype(str)
        instance_id = np.array([v + f for v, f in zip(vid_name, frame_id)])

        target = {
            'features': input,
            'kp_2d': torch.from_numpy(kp_2d_tensor).float(),  # 2D keypoints transformed according to bbox cropping
            # 'instance_id': instance_id,
        }

        if self.debug:
            from lib.data_utils.img_utils import get_single_image_crop

            vid_name = self.db['vid_name'][start_index]

            if self.dataset_name == 'pennaction':
                vid_folder = "frames"
                vid_name = vid_name.split('/')[-1].split('.')[0]
                img_id = "img_name"
            elif self.dataset_name == 'posetrack':
                vid_folder = osp.join('images', vid_name.split('/')[-2])
                vid_name = vid_name.split('/')[-1].split('.')[0]
                img_id = "img_name"
            else:
                vid_name = '_'.join(vid_name.split('_')[:-1])
                vid_folder = 'imageFiles'
                img_id = 'frame_id'
            f = osp.join(self.folder, vid_folder, vid_name)
            video_file_list = [
                osp.join(f, x) for x in sorted(os.listdir(f))
                if x.endswith('.jpg')
            ]
            frame_idxs = self.db[img_id][start_index:end_index + 1]
            if self.dataset_name in ('pennaction', 'posetrack'):
                video = frame_idxs
            else:
                video = [video_file_list[i] for i in frame_idxs]

            video = torch.cat([
                get_single_image_crop(image, bbox).unsqueeze(0)
                for image, bbox in zip(video, bbox)
            ], dim=0)

            target['video'] = video

        return target
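
The per-frame loop above first maps keypoints into the 224x224 crop, then rescales them to [-1, 1]. A self-contained sketch of that normalization step, assuming normalize_2d_kp implements the usual linear mapping (inferred from how the values are consumed, not shown in the snippet):

import numpy as np

def normalize_2d_kp_sketch(kp_2d, crop_size=224):
    # map pixel coordinates in [0, crop_size] to [-1, 1]
    return 2.0 * kp_2d / crop_size - 1.0

pts = np.array([[0.0, 112.0], [224.0, 56.0]])
print(normalize_2d_kp_sketch(pts))  # [[-1.  0. ] [ 1. -0.5]]
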
Example 5
def read_data(folder, set):
    dataset = {
        'img_name': [],
        'joints2D': [],
        'bbox': [],
        'vid_name': [],
        'features': [],
    }

    model = spin.get_pretrained_hmr()

    file_names = glob.glob(
        osp.join(folder, 'posetrack_data/annotations/', f'{set}/*.json'))
    file_names = sorted(file_names)
    nn_corrupted = 0
    tot_frames = 0
    min_frame_number = 8

    for fid, fname in tqdm_enumerate(file_names):
        if fname == osp.join(folder,
                             'annotations/train/021133_mpii_train.json'):
            continue

        with open(fname, 'r') as entry:
            anns = json.load(entry)
        # num_frames = anns['images'][0]['nframes']
        anns['images'] = [
            item for item in anns['images'] if item['is_labeled']
        ]
        num_frames = len(anns['images'])
        frame2imgname = dict()
        for el in anns['images']:
            frame2imgname[el['frame_id']] = el['file_name']

        num_people = -1
        for x in anns['annotations']:
            if num_people < x['track_id']:
                num_people = x['track_id']
        num_people += 1
        posetrack_joints = get_posetrack_original_kp_names()
        idxs = [
            anns['categories'][0]['keypoints'].index(h)
            for h in posetrack_joints
            if h in anns['categories'][0]['keypoints']
        ]
        for x in anns['annotations']:
            kps = np.array(x['keypoints']).reshape((17, 3))
            kps = kps[idxs, :]
            x['keypoints'] = list(kps.flatten())

        tot_frames += num_people * num_frames
        for p_id in range(num_people):

            annot_pid = [(item['keypoints'], item['bbox'], item['image_id'])
                         for item in anns['annotations']
                         if item['track_id'] == p_id
                         and not (np.count_nonzero(item['keypoints']) == 0)]

            if len(annot_pid) < min_frame_number:
                nn_corrupted += len(annot_pid)
                continue

            bbox = np.zeros((len(annot_pid), 4))
            # perm_idxs = get_perm_idxs('posetrack', 'common')
            kp_2d = np.zeros((len(annot_pid), len(annot_pid[0][0]) // 3, 3))
            img_paths = np.zeros((len(annot_pid)))

            for i, (key2djnts, bbox_p, image_id) in enumerate(annot_pid):

                if (bbox_p[2] == 0 or bbox_p[3] == 0):
                    nn_corrupted += 1
                    continue

                img_paths[i] = image_id
                key2djnts[2::3] = len(key2djnts[2::3]) * [1]

                kp_2d[i, :] = np.array(key2djnts).reshape(
                    int(len(key2djnts) / 3), 3)  # [perm_idxs, :]
                for kp_loc in kp_2d[i, :]:
                    if kp_loc[0] == 0 and kp_loc[1] == 0:
                        kp_loc[2] = 0

                x_tl = bbox_p[0]
                y_tl = bbox_p[1]
                w = bbox_p[2]
                h = bbox_p[3]
                bbox_p[0] = x_tl + w / 2
                bbox_p[1] = y_tl + h / 2
                #

                w = h = np.where(w / h > 1, w, h)
                w = h = h * 0.8
                bbox_p[2] = w
                bbox_p[3] = h
                bbox[i, :] = bbox_p

            img_paths = list(img_paths)
            img_paths = [
                osp.join(folder, frame2imgname[item]) if item != 0 else 0
                for item in img_paths
            ]

            bbx_idxs = []
            for bbx_id, bbx in enumerate(bbox):
                if np.count_nonzero(bbx) == 0:
                    bbx_idxs += [bbx_id]

            kp_2d = np.delete(kp_2d, bbx_idxs, 0)
            img_paths = np.delete(np.array(img_paths), bbx_idxs, 0)
            bbox = np.delete(bbox, np.where(~bbox.any(axis=1))[0], axis=0)

            # Convert to common 2d keypoint format
            if bbox.size == 0 or bbox.shape[0] < min_frame_number:
                nn_corrupted += 1
                continue

            kp_2d = convert_kps(kp_2d, src='posetrack', dst='spin')

            dataset['vid_name'].append(
                np.array([f'{fname}_{p_id}'] * img_paths.shape[0]))
            dataset['img_name'].append(np.array(img_paths))
            dataset['joints2D'].append(kp_2d)
            dataset['bbox'].append(np.array(bbox))

            # compute_features
            features = extract_features(
                model,
                np.array(img_paths),
                bbox,
                kp_2d=kp_2d,
                dataset='spin',
                debug=False,
            )

            assert kp_2d.shape[0] == img_paths.shape[0] == bbox.shape[0]

            dataset['features'].append(features)

    print(nn_corrupted, tot_frames)
    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])

    for k in dataset.keys():
        dataset[k] = np.concatenate(dataset[k])

    for k, v in dataset.items():
        print(k, v.shape)

    return dataset
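
A sketch of the usual driver for this PoseTrack reader, one db per split; the paths are illustrative assumptions:

import joblib

for split in ('train', 'val'):
    db = read_data('/path/to/posetrack', split)
    joblib.dump(db, f'/path/to/vibe_db/posetrack_{split}_db.pt')  # assumed path
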
Example 6
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('YouTube URL is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    output_path = os.path.join(
        args.output_folder,
        os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    if torch.cuda.is_available():
        ckpt = torch.load(pretrained_file)
    else:
        ckpt = torch.load(pretrained_file, map_location=torch.device('cpu'))

    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = joints2d is not None

        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        with torch.no_grad():

            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run the Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )

    print(
        f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".'
    )

    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        # prepare results for rendering
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']

                mc = mesh_color[person_id]

                mesh_filename = None

                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes',
                                               f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder,
                                                 f'{frame_idx:06d}.obj')

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            cv2.imwrite(
                os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder,
                        output_vid_file=save_name)
        shutil.rmtree(output_img_folder)

    shutil.rmtree(image_folder)
    print('================= END =================')
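
A minimal sketch of the argument plumbing this demo expects. Only a subset of the flags referenced in main() is wired up, and the defaults shown are assumptions; the real script defines the remaining options (detector, batch sizes, rendering flags) the same way:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--vid_file', type=str, required=True)
    parser.add_argument('--output_folder', type=str, default='output/')
    parser.add_argument('--tracking_method', type=str, default='bbox', choices=['bbox', 'pose'])
    parser.add_argument('--run_smplify', action='store_true')
    parser.add_argument('--no_render', action='store_true')
    parser.add_argument('--display', action='store_true')
    args = parser.parse_args()
    main(args)
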
Example 7
    def get_single_item(self, index):
        start_index, end_index = self.vid_indices[index]

        is_train = self.set == 'train'

        if self.dataset_name == '3dpw':
            kp_2d = convert_kps(self.db['joints2D'][start_index:end_index + 1], src='common', dst='spin')
            kp_3d = self.db['joints3D'][start_index:end_index + 1]
        elif self.dataset_name == 'mpii3d':
            kp_2d = self.db['joints2D'][start_index:end_index + 1]
            if is_train:
                kp_3d = self.db['joints3D'][start_index:end_index + 1]
            else:
                kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common')
        elif self.dataset_name == 'h36m':
            kp_2d = self.db['joints2D'][start_index:end_index + 1]
            if is_train:
                kp_3d = self.db['joints3D'][start_index:end_index + 1]
            else:
                kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common')

        kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)
        nj = 14 if not is_train else 49
        kp_3d_tensor = np.zeros((self.seqlen, nj, 3), dtype=np.float16)

        if self.dataset_name == '3dpw':
            pose = self.db['pose'][start_index:end_index + 1]
            shape = self.db['shape'][start_index:end_index + 1]
            w_smpl = torch.ones(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()
        elif self.dataset_name == 'h36m':
            if not is_train:
                pose = np.zeros((kp_2d.shape[0], 72))
                shape = np.zeros((kp_2d.shape[0], 10))
                w_smpl = torch.zeros(self.seqlen).float()
                w_3d = torch.ones(self.seqlen).float()
            else:
                pose = self.db['pose'][start_index:end_index + 1]
                shape = self.db['shape'][start_index:end_index + 1]
                w_smpl = torch.ones(self.seqlen).float()
                w_3d = torch.ones(self.seqlen).float()
        elif self.dataset_name == 'mpii3d':
            pose = np.zeros((kp_2d.shape[0], 72))
            shape = np.zeros((kp_2d.shape[0], 10))
            w_smpl = torch.zeros(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()

        bbox = self.db['bbox'][start_index:end_index + 1]
        input = torch.from_numpy(self.db['features'][start_index:end_index + 1]).float()

        theta_tensor = np.zeros((self.seqlen, 85), dtype=np.float16)

        for idx in range(self.seqlen):
            # crop image and transform 2d keypoints
            kp_2d[idx, :, :2], trans = transfrom_keypoints(
                kp_2d=kp_2d[idx, :, :2],
                center_x=bbox[idx, 0],
                center_y=bbox[idx, 1],
                width=bbox[idx, 2],
                height=bbox[idx, 3],
                patch_width=224,
                patch_height=224,
                do_augment=False,
            )

            kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)

            # theta shape (85,)
            theta = np.concatenate((np.array([1., 0., 0.]), pose[idx], shape[idx]), axis=0)

            kp_2d_tensor[idx] = kp_2d[idx]
            theta_tensor[idx] = theta
            kp_3d_tensor[idx] = kp_3d[idx]

        target = {
            'features': input,
            'theta': torch.from_numpy(theta_tensor).float(), # camera, pose and shape
            'kp_2d': torch.from_numpy(kp_2d_tensor).float(), # 2D keypoints transformed according to bbox cropping
            'kp_3d': torch.from_numpy(kp_3d_tensor).float(), # 3D keypoints
            'w_smpl': w_smpl,
            'w_3d': w_3d,
        }

        if self.dataset_name == 'mpii3d' and not is_train:
            target['valid'] = self.db['valid_i'][start_index:end_index+1]

        if self.dataset_name == '3dpw' and not is_train:
            vn = self.db['vid_name'][start_index:end_index + 1]
            fi = self.db['frame_id'][start_index:end_index + 1]
            target['instance_id'] = [f'{v}/{f}' for v, f in zip(vn, fi)]

        # if self.dataset_name == '3dpw' and not self.is_train:
            # target['imgname'] = self.db['img_name'][start_index:end_index+1].tolist()
            # target['imgname'] = np.array(target['imgname'])
            # print(target['imgname'].dtype)
            # target['center'] = self.db['bbox'][start_index:end_index+1, :2]
            # target['valid'] = torch.from_numpy(self.db['valid'][start_index:end_index+1])

        if self.debug:
            from lib.data_utils.img_utils import get_single_image_crop

            if self.dataset_name == 'mpii3d':
                video = self.db['img_name'][start_index:end_index+1]
                # print(video)
            elif self.dataset_name == 'h36m':
                video = self.db['img_name'][start_index:end_index + 1]
            else:
                vid_name = self.db['vid_name'][start_index]
                vid_name = '_'.join(vid_name.split('_')[:-1])
                f = osp.join(self.folder, 'imageFiles', vid_name)
                video_file_list = [osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')]
                frame_idxs = self.db['frame_id'][start_index:end_index + 1]
                # print(f, frame_idxs)
                video = [video_file_list[i] for i in frame_idxs]

            video = torch.cat(
                [get_single_image_crop(image, bbox).unsqueeze(0) for image, bbox in zip(video, bbox)], dim=0
            )

            target['video'] = video

        return target
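
The 85-dimensional theta assembled in the loop packs a weak-perspective camera together with SMPL pose and shape. A self-contained sketch of that packing, using the standard SMPL parameter sizes implied above:

import numpy as np

cam = np.array([1., 0., 0.])  # scale and (x, y) translation, as initialized above
pose = np.zeros(72)           # 24 joints x 3 axis-angle parameters
shape = np.zeros(10)          # SMPL shape coefficients
theta = np.concatenate((cam, pose, shape))
assert theta.shape == (85,)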