Example #1
        prediction = camera_to_world(prediction, R=rot, t=0)
        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    
    anim_output = {'Reconstruction': prediction}
    if ground_truth is not None and not args.viz_no_ground_truth:
        anim_output['Ground truth'] = ground_truth
    
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=width_of, h=height_of)

    manual_fps = 25

    from common.visualization import render_animation
    render_animation(input_keypoints, anim_output,
                     dataset.skeleton(), manual_fps, args.viz_bitrate, cam['azimuth'], args.viz_output,
                     limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                     input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
                     input_video_skip=args.viz_skip)
    
else:
    print('Evaluating...')
    all_actions = {}
    all_actions_by_subject = {}
    for subject in subjects_test:
        if subject not in all_actions_by_subject:
            all_actions_by_subject[subject] = {}

        for action in dataset[subject].keys():
            action_name = action.split(' ')[0]
            if action_name not in all_actions:
                all_actions[action_name] = []
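The rendering branch above rotates camera-space predictions into world coordinates with camera_to_world(prediction, R=rot, t=0) and then rebases the height so the lowest joint sits on the floor. As a rough illustration only (not the repository's implementation), the quaternion rotation and the height rebase could be sketched like this, assuming a (w, x, y, z) quaternion layout and a (frames, joints, 3) pose array:

import numpy as np

def quat_rotate(q, v):
    # Rotate points v (..., 3) by the unit quaternion q = (w, x, y, z)
    # using v' = v + 2*w*(u x v) + 2*(u x (u x v)), where u = (x, y, z).
    w, u = q[0], q[1:]
    uv = np.cross(u, v)
    return v + 2.0 * (w * uv + np.cross(u, uv))

poses_cam = np.random.randn(10, 17, 3).astype(np.float32)   # dummy (frames, joints, xyz)
rot = np.array([0.1407, -0.1501, -0.7552, 0.6223], dtype=np.float32)
rot /= np.linalg.norm(rot)                                   # ensure a unit quaternion
poses_world = quat_rotate(rot, poses_cam)
poses_world[:, :, 2] -= poses_world[:, :, 2].min()           # rebase: lowest joint at z = 0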
Example #2
def main(args):
    print('==> Using settings {}'.format(args))

    print('==> Loading dataset...')
    dataset_path = path.join('data', 'data_3d_' + args.dataset + '.npz')
    if args.dataset == 'h36m':
        from common.h36m_dataset import Human36mDataset
        dataset = Human36mDataset(dataset_path)
    else:
        raise KeyError('Invalid dataset')

    print('==> Preparing data...')
    dataset = read_3d_data(dataset)

    print('==> Loading 2D detections...')
    keypoints = create_2d_data(path.join('data', 'data_2d_' + args.dataset + '.npz'), dataset)

    cudnn.benchmark = True
    device = torch.device("cuda")

    # Create model
    print("==> Creating model...")

    if args.architecture == 'pose_gtac':
        from models.pose_gtac import PoseGTAC
        from common.graph_utils import adj_mx_from_skeleton
        p_dropout = (None if args.dropout == 0.0 else args.dropout)
        model_pos = PoseGTAC(args.hid_dim, p_dropout=p_dropout).to(device)
    else:
        raise KeyError('Invalid model architecture')

    print("==> Total parameters: {:.2f}M".format(sum(p.numel() for p in model_pos.parameters()) / 1000000.0))

    # Resume from a checkpoint
    ckpt_path = args.evaluate

    if path.isfile(ckpt_path):
        print("==> Loading checkpoint '{}'".format(ckpt_path))
        ckpt = torch.load(ckpt_path)
        start_epoch = ckpt['epoch']
        error_best = ckpt['error']
        model_pos.load_state_dict(ckpt['state_dict'])
        print("==> Loaded checkpoint (Epoch: {} | Error: {})".format(start_epoch, error_best))
    else:
        raise RuntimeError("==> No checkpoint found at '{}'".format(ckpt_path))

    print('==> Rendering...')

    poses_2d = keypoints[args.viz_subject][args.viz_action]
    out_poses_2d = poses_2d[args.viz_camera]
    out_actions = [args.viz_camera] * out_poses_2d.shape[0]

    poses_3d = dataset[args.viz_subject][args.viz_action]['positions_3d']
    assert len(poses_3d) == len(poses_2d), 'Camera count mismatch'
    out_poses_3d = poses_3d[args.viz_camera]

    ground_truth = dataset[args.viz_subject][args.viz_action]['positions_3d'][args.viz_camera].copy()

    input_keypoints = out_poses_2d.copy()
    render_loader = DataLoader(PoseGenerator([out_poses_3d], [out_poses_2d], [out_actions]), batch_size=args.batch_size,
                               shuffle=False, num_workers=args.num_workers, pin_memory=True)

    prediction = evaluate(render_loader, model_pos, device, args.architecture)[0]

    # Invert camera transformation
    cam = dataset.cameras()[args.viz_subject][args.viz_camera]
    prediction = camera_to_world(prediction, R=cam['orientation'], t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    ground_truth = camera_to_world(ground_truth, R=cam['orientation'], t=0)
    ground_truth[:, :, 2] -= np.min(ground_truth[:, :, 2])

    anim_output = {'Regression': prediction, 'Ground truth': ground_truth}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h'])
    render_animation(input_keypoints, anim_output, dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'],
                     args.viz_output, limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                     input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
                     input_video_skip=args.viz_skip)
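The evaluate(render_loader, model_pos, device, args.architecture) helper is defined elsewhere in that repository. A minimal sketch of what such an inference loop might look like, assuming the PoseGenerator batches yield (targets_3d, inputs_2d, actions) and that the model maps 2D joints directly to 3D joints (names and layout are assumptions, not the repository's code):

import numpy as np
import torch

@torch.no_grad()
def evaluate_sketch(loader, model_pos, device):
    # Run the 2D-to-3D model over every batch and return the stacked predictions.
    model_pos.eval()
    predictions = []
    for targets_3d, inputs_2d, actions in loader:   # assumed batch layout
        inputs_2d = inputs_2d.to(device)
        predictions.append(model_pos(inputs_2d).cpu().numpy())
    return np.concatenate(predictions, axis=0)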
Example #3
        anim_output = {'Reconstruction': prediction}
        if ground_truth is not None and not args.viz_no_ground_truth:
            anim_output['Ground truth'] = ground_truth

        input_keypoints = image_coordinates(input_keypoints[..., :2],
                                            w=cam['res_w'],
                                            h=cam['res_h'])

        from common.visualization import render_animation
        render_animation(input_keypoints,
                         keypoints_metadata,
                         anim_output,
                         dataset.skeleton(),
                         dataset.fps(),
                         args.viz_bitrate,
                         cam['azimuth'],
                         args.viz_output,
                         limit=args.viz_limit,
                         downsample=args.viz_downsample,
                         size=args.viz_size,
                         input_video_path=args.viz_video,
                         viewport=(cam['res_w'], cam['res_h']),
                         input_video_skip=args.viz_skip)

else:
    print('Evaluating...')
    all_actions = {}
    all_actions_by_subject = {}
    for subject in subjects_test:
        if subject not in all_actions_by_subject:
            all_actions_by_subject[subject] = {}
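image_coordinates() undoes the screen-coordinate normalization before rendering, mapping normalized keypoints back to pixel space. In VideoPose3D-style code it is typically the inverse of normalize_screen_coordinates, roughly along these lines (a sketch, not verified against this exact repository):

def image_coordinates(X, w, h):
    # Map normalized coordinates (x in [-1, 1]) back to pixel coordinates of a w x h image.
    assert X.shape[-1] == 2
    return (X + [1, h / w]) * w / 2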
Example #4
def the_main_kaboose(args):
    print(args)

    try:
        # Create checkpoint directory if it does not exist
        os.makedirs(args.checkpoint)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise RuntimeError('Unable to create checkpoint directory:',
                               args.checkpoint)

    print('Loading dataset...')
    dataset_path = 'data/data_3d_' + args.dataset + '.npz'
    if args.dataset == 'h36m':
        from common.h36m_dataset import Human36mDataset
        dataset = Human36mDataset(dataset_path)
    elif args.dataset.startswith('humaneva'):
        from common.humaneva_dataset import HumanEvaDataset
        dataset = HumanEvaDataset(dataset_path)
    elif args.dataset.startswith('custom'):
        from common.custom_dataset import CustomDataset
        dataset = CustomDataset('data/data_2d_' + args.dataset + '_' +
                                args.keypoints + '.npz')
    else:
        raise KeyError('Invalid dataset')

    print('Preparing data...')
    for subject in dataset.subjects():
        for action in dataset[subject].keys():
            anim = dataset[subject][action]

            # 'positions' is only present when ground-truth 3D data is available (i.e. for training)
            if 'positions' in anim:
                positions_3d = []
                for cam in anim['cameras']:
                    pos_3d = world_to_camera(anim['positions'],
                                             R=cam['orientation'],
                                             t=cam['translation'])
                    # Remove global offset, but keep trajectory in first position
                    pos_3d[:, 1:] -= pos_3d[:, :1]
                    positions_3d.append(pos_3d)
                anim['positions_3d'] = positions_3d

    print('Loading 2D detections...')
    keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints +
                        '.npz',
                        allow_pickle=True)
    keypoints_metadata = keypoints['metadata'].item()
    keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right())
    keypoints = keypoints['positions_2d'].item()

    # Consistency checks between the 3D dataset and the 2D detections (only relevant for training)
    for subject in dataset.subjects():
        assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(subject)
        for action in dataset[subject].keys():
            assert action in keypoints[subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(action, subject)
            if 'positions_3d' not in dataset[subject][action]:
                continue

            for cam_idx in range(len(keypoints[subject][action])):

                # We check for >= instead of == because some videos in H3.6M contain extra frames
                mocap_length = dataset[subject][action]['positions_3d'][cam_idx].shape[0]
                assert keypoints[subject][action][cam_idx].shape[0] >= mocap_length

                if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                    # Shorten sequence
                    keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length]

            assert len(keypoints[subject][action]) == len(dataset[subject][action]['positions_3d'])

    # Normalize 2D keypoints to camera-frame coordinates in [-1, 1]
    for subject in keypoints.keys():
        for action in keypoints[subject]:
            for cam_idx, kps in enumerate(keypoints[subject][action]):
                # Normalize camera frame
                cam = dataset.cameras()[subject][cam_idx]
                kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                            w=cam['res_w'],
                                                            h=cam['res_h'])
                keypoints[subject][action][cam_idx] = kps

    subjects_train = args.subjects_train.split(',')
    subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(
        ',')
    if not args.render:
        subjects_test = args.subjects_test.split(',')
    else:
        subjects_test = [args.viz_subject]

    semi_supervised = len(subjects_semi) > 0
    if semi_supervised and not dataset.supports_semi_supervised():
        raise RuntimeError(
            'Semi-supervised training is not implemented for this dataset')

    def fetch(subjects, action_filter=None, subset=1, parse_3d_poses=True):
        out_poses_3d = []
        out_poses_2d = []
        out_camera_params = []
        for subject in subjects:
            print("gonna check actions for subject " + subject)

        for subject in subjects:
            for action in keypoints[subject].keys():
                if action_filter is not None:
                    found = False
                    for a in action_filter:
                        if action.startswith(a):
                            found = True
                            break
                    if not found:
                        continue

                poses_2d = keypoints[subject][action]
                for i in range(len(poses_2d)):  # Iterate across cameras
                    out_poses_2d.append(poses_2d[i])

                if subject in dataset.cameras():
                    cams = dataset.cameras()[subject]
                    assert len(cams) == len(poses_2d), 'Camera count mismatch'
                    for cam in cams:
                        if 'intrinsic' in cam:
                            out_camera_params.append(cam['intrinsic'])

                if parse_3d_poses and 'positions_3d' in dataset[subject][
                        action]:
                    poses_3d = dataset[subject][action]['positions_3d']
                    assert len(poses_3d) == len(
                        poses_2d), 'Camera count mismatch'
                    for i in range(len(poses_3d)):  # Iterate across cameras
                        out_poses_3d.append(poses_3d[i])

        if len(out_camera_params) == 0:
            out_camera_params = None
        if len(out_poses_3d) == 0:
            out_poses_3d = None

        stride = args.downsample
        if subset < 1:
            for i in range(len(out_poses_2d)):
                n_frames = int(
                    round(len(out_poses_2d[i]) // stride * subset) * stride)
                start = deterministic_random(
                    0,
                    len(out_poses_2d[i]) - n_frames + 1,
                    str(len(out_poses_2d[i])))
                out_poses_2d[i] = out_poses_2d[i][start:start +
                                                  n_frames:stride]
                if out_poses_3d is not None:
                    out_poses_3d[i] = out_poses_3d[i][start:start +
                                                      n_frames:stride]
        elif stride > 1:
            # Downsample as requested
            for i in range(len(out_poses_2d)):
                out_poses_2d[i] = out_poses_2d[i][::stride]
                if out_poses_3d is not None:
                    out_poses_3d[i] = out_poses_3d[i][::stride]

        return out_camera_params, out_poses_3d, out_poses_2d

    action_filter = None if args.actions == '*' else args.actions.split(',')
    if action_filter is not None:
        print('Selected actions:', action_filter)

    # At inference time this returns None, None, and the 2D keypoints as poses_valid_2d
    cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test,
                                                       action_filter)

    filter_widths = [int(x) for x in args.architecture.split(',')]
    if not args.disable_optimizations and not args.dense and args.stride == 1:
        # Use optimized model for single-frame predictions
        shape_2 = poses_valid_2d[0].shape[-2]
        shape_1 = poses_valid_2d[0].shape[-1]
        numJoints = dataset.skeleton().num_joints()
        model_pos_train = TemporalModelOptimized1f(shape_2,
                                                   shape_1,
                                                   numJoints,
                                                   filter_widths=filter_widths,
                                                   causal=args.causal,
                                                   dropout=args.dropout,
                                                   channels=args.channels)
    else:
        # Fall back to the generic model when the optimized one cannot be used (stride > 1, dense convolutions, or optimizations disabled)
        model_pos_train = TemporalModel(poses_valid_2d[0].shape[-2],
                                        poses_valid_2d[0].shape[-1],
                                        dataset.skeleton().num_joints(),
                                        filter_widths=filter_widths,
                                        causal=args.causal,
                                        dropout=args.dropout,
                                        channels=args.channels,
                                        dense=args.dense)

    model_pos = TemporalModel(poses_valid_2d[0].shape[-2],
                              poses_valid_2d[0].shape[-1],
                              dataset.skeleton().num_joints(),
                              filter_widths=filter_widths,
                              causal=args.causal,
                              dropout=args.dropout,
                              channels=args.channels,
                              dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(
            args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename,
                                map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(
            checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

        if args.evaluate and 'model_traj' in checkpoint:
            # Load the trajectory model if it is contained in the checkpoint (e.g. for in-the-wild inference)
            model_traj = TemporalModel(poses_valid_2d[0].shape[-2],
                                       poses_valid_2d[0].shape[-1],
                                       1,
                                       filter_widths=filter_widths,
                                       causal=args.causal,
                                       dropout=args.dropout,
                                       channels=args.channels,
                                       dense=args.dense)
            if torch.cuda.is_available():
                model_traj = model_traj.cuda()
            model_traj.load_state_dict(checkpoint['model_traj'])
        else:
            model_traj = None

    test_generator = UnchunkedGenerator(cameras_valid,
                                        poses_valid,
                                        poses_valid_2d,
                                        pad=pad,
                                        causal_shift=causal_shift,
                                        augment=False,
                                        kps_left=kps_left,
                                        kps_right=kps_right,
                                        joints_left=joints_left,
                                        joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator,
                 action=None,
                 return_predictions=False,
                 use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = \
                            predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos,
                                                  dim=0,
                                                  keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    if args.render:
        print('Rendering...')

        input_keypoints = keypoints[args.viz_subject][args.viz_action][args.viz_camera].copy()
        ground_truth = None
        if args.viz_subject in dataset.subjects() and args.viz_action in dataset[args.viz_subject]:
            if 'positions_3d' in dataset[args.viz_subject][args.viz_action]:
                ground_truth = dataset[args.viz_subject][args.viz_action]['positions_3d'][args.viz_camera].copy()
        if ground_truth is None:
            print('INFO: this action is unlabeled. Ground truth will not be rendered.')

        gen = UnchunkedGenerator(None,
                                 None, [input_keypoints],
                                 pad=pad,
                                 causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left,
                                 kps_right=kps_right,
                                 joints_left=joints_left,
                                 joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True)
        if model_traj is not None and ground_truth is None:
            prediction_traj = evaluate(gen,
                                       return_predictions=True,
                                       use_trajectory_model=True)
            prediction += prediction_traj

        if args.viz_export is not None:
            print('Exporting joint positions to', args.viz_export)
            # Predictions are in camera space
            np.save(args.viz_export, prediction)

        if args.viz_output is not None:
            if ground_truth is not None:
                # Reapply trajectory
                trajectory = ground_truth[:, :1]
                ground_truth[:, 1:] += trajectory
                prediction += trajectory

            # Invert camera transformation
            cam = dataset.cameras()[args.viz_subject][args.viz_camera]
            if ground_truth is not None:
                prediction = camera_to_world(prediction,
                                             R=cam['orientation'],
                                             t=cam['translation'])
                ground_truth = camera_to_world(ground_truth,
                                               R=cam['orientation'],
                                               t=cam['translation'])
            else:
                # If the ground truth is not available, take the camera extrinsic params from a random subject.
                # They are almost the same, and anyway, we only need this for visualization purposes.
                for subject in dataset.cameras():
                    if 'orientation' in dataset.cameras()[subject][args.viz_camera]:
                        rot = dataset.cameras()[subject][args.viz_camera]['orientation']
                        break
                prediction = camera_to_world(prediction, R=rot, t=0)
                # We don't have the trajectory, but at least we can rebase the height
                prediction[:, :, 2] -= np.min(prediction[:, :, 2])

            anim_output = {'Reconstruction': prediction}
            if ground_truth is not None and not args.viz_no_ground_truth:
                anim_output['Ground truth'] = ground_truth

            input_keypoints = image_coordinates(input_keypoints[..., :2],
                                                w=cam['res_w'],
                                                h=cam['res_h'])

            print("Writing to json")

            import json
            # Format the data like MediaPipe output so it can be loaded in Unity with the same script:
            # a list (one entry per frame) of lists of 3D landmarks. Note that prediction only
            # contains 17 landmarks, while the Unity script expects 25.
            unity_landmarks = prediction.tolist()

            with open(args.output_json, "w") as json_file:
                json.dump(unity_landmarks, json_file)

            if args.rendervideo == "yes":

                from common.visualization import render_animation
                render_animation(input_keypoints,
                                 keypoints_metadata,
                                 anim_output,
                                 dataset.skeleton(),
                                 dataset.fps(),
                                 args.viz_bitrate,
                                 cam['azimuth'],
                                 args.viz_output,
                                 limit=args.viz_limit,
                                 downsample=args.viz_downsample,
                                 size=args.viz_size,
                                 input_video_path=args.viz_video,
                                 viewport=(cam['res_w'], cam['res_h']),
                                 input_video_skip=args.viz_skip)
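The evaluate() routine above reports Protocol #1/#2/#3 errors. The core metric, MPJPE, is simply the mean Euclidean distance between predicted and ground-truth joints (reported in mm after multiplying by 1000). A minimal sketch of that metric, not necessarily the repository's exact implementation:

import torch

def mpjpe_sketch(predicted, target):
    # Mean per-joint position error: average Euclidean distance over joints and frames.
    assert predicted.shape == target.shape
    return torch.mean(torch.norm(predicted - target, dim=-1))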
Example #5
def main(args):
    assert alpha_pose, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # Load 2D keypoints from file, or generate them with the 2D detector
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = alpha_pose(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming fixed camera parameters (w=1000, h=1002)
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal, dropout=args.dropout, channels=args.channels, dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)  # map tensors to CPU storage
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    #  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # save 3D joint points
    np.save('outputs/{0}_3d_output.npy'.format(args.basename), prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    from common.visualization import render_animation
    render_animation(input_keypoints, anim_output, Skeleton(), 25, args.viz_bitrate, np.array(70., dtype=np.float32), args.viz_output,
                     limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                     input_video_path=args.viz_video, viewport=(1000, 1002), input_video_skip=args.viz_skip)

    ckpt, time4 = ckpt_time(time3)
    print('total spends {:.2f} seconds'.format(ckpt))
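This snippet normalizes the detections with assumed camera parameters (w=1000, h=1002). In VideoPose3D-style code, normalize_screen_coordinates maps the pixel range [0, w] to [-1, 1] while preserving the aspect ratio; a sketch of that behaviour (an assumption, not copied from this repository):

import numpy as np

def normalize_screen_coordinates_sketch(X, w, h):
    # Map pixel coordinates so that [0, w] becomes [-1, 1], keeping the aspect ratio.
    assert X.shape[-1] == 2
    return X / w * 2 - np.array([1, h / w])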
Example #6
ckpt, time3 = ckpt_time(time2)
print('------- generate reconstruction 3D data spends {:.2f} seconds'.format(
    ckpt))

if not args.viz_output:
    args.viz_output = 'xxxx.gif'

# args.viz_limit = 100

from common.visualization import render_animation
render_animation(input_keypoints,
                 anim_output,
                 skeleton(),
                 25,
                 args.viz_bitrate,
                 np.array(70., dtype=np.float32),
                 args.viz_output,
                 limit=args.viz_limit,
                 downsample=args.viz_downsample,
                 size=args.viz_size,
                 input_video_path=args.viz_video,
                 viewport=(1000, 1002),
                 input_video_skip=args.viz_skip)

ckpt, time4 = ckpt_time(time3)
print('------- generate video spends {:.2f} seconds'.format(ckpt))

ckpt, _ = ckpt_time(time0)
print(' =================== total spends {:.2f} seconds'.format(ckpt))
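ckpt_time() is a small timing helper used throughout this script. Based purely on how it is called (it is not shown here), it probably looks something like the following sketch:

import time

def ckpt_time_sketch(prev=None):
    # Return (elapsed seconds since prev, current timestamp); elapsed is None on the first call.
    now = time.time()
    return (None if prev is None else now - prev), now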
Example #7
    pace_net.load_weights('weights_pace_network.bin')
    
    model = PoseNetworkLongTerm(30, dataset.skeleton())
    if torch.cuda.is_available():
        model.cuda()
    model.load_weights(long_term_weights_path) # Load pretrained model
    
    if len(sys.argv) == 1:
        for subject in dataset.subjects():
            for action in dataset[subject].keys():
                if '_d0' not in action or '_m' in action:
                    continue
                print('Showing subject %s, action %s.' % (subject, action))
                annotated_spline = pace_net.predict(dataset[subject][action]['spline'])
                animation = model.generate_motion(annotated_spline, dataset[subject][action])
                render_animation(animation, dataset.skeleton(), dataset.fps(), output='interactive')
    else:
        # Visualize a particular action
        action = sys.argv[1]
        if action not in dataset[default_subject].keys():
            raise ValueError("The specified animation does not exist")
        annotated_spline = pace_net.predict(dataset[default_subject][action]['spline'])
        animation = model.generate_motion(annotated_spline, dataset[default_subject][action])
        if len(sys.argv) == 2:
            output_mode = 'interactive'
        else:
            plt.switch_backend('agg')
            output_mode = sys.argv[2]
        render_animation(animation, dataset.skeleton(), dataset.fps(), output=output_mode)
        
Example #8
def main():
    #cap = cv2.VideoCapture(0)
    cap = cv2.VideoCapture('D://data//videos//VID_29551_cam0_crop.mkv')

    #parser = argparse.ArgumentParser()
    opWrapper = op.WrapperPython()

    params = dict()
    params["model_folder"] = "D://models//"

    opWrapper.configure(params)

    opWrapper.start()

    if not glfw.init():
        return

    window = glfw.create_window(w_width, w_height, "My OpenGL window", None,
                                None)

    if not window:
        glfw.terminate()
        return

    glfw.make_context_current(window)
    glfw.set_window_size_callback(window, window_resize)

    vertex_shader = """
    #version 330
    in vec3 position;

    uniform mat4 view;
    uniform mat4 model;
    uniform mat4 projection;

    void main()
    {
        gl_Position = projection * view * model * vec4(position, 1.0f);
    }
    """

    fragment_shader = """
    #version 330
    out vec4 outColor;
    void main()
    {
        outColor = vec4(1.0f,1.0f,1.0f,1.0f);
    }
    """
    shader = OpenGL.GL.shaders.compileProgram(
        OpenGL.GL.shaders.compileShader(vertex_shader, GL_VERTEX_SHADER),
        OpenGL.GL.shaders.compileShader(fragment_shader, GL_FRAGMENT_SHADER))

    VBO = glGenBuffers(1)
    glBindBuffer(GL_ARRAY_BUFFER, VBO)
    glBufferData(GL_ARRAY_BUFFER, 17 * 3 * 4, None, GL_DYNAMIC_DRAW)

    EBO = glGenBuffers(1)
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, EBO)
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, 32 * 8, parentsIndices,
                 GL_STATIC_DRAW)

    position = glGetAttribLocation(shader, "position")
    glVertexAttribPointer(position, 3, GL_FLOAT, GL_FALSE, 0,
                          ctypes.c_void_p(0))
    glEnableVertexAttribArray(position)

    glUseProgram(shader)

    view = pyrr.matrix44.create_from_translation(pyrr.Vector3([0.0, 0.0,
                                                               -3.0]))
    projection = pyrr.matrix44.create_perspective_projection_matrix(
        45.0, w_width / w_height, 0.1, 100.0)
    model = pyrr.matrix44.create_from_translation(pyrr.Vector3([0.0, 0.0,
                                                                0.0]))

    view_loc = glGetUniformLocation(shader, "view")
    proj_loc = glGetUniformLocation(shader, "projection")
    model_loc = glGetUniformLocation(shader, "model")

    glUniformMatrix4fv(view_loc, 1, GL_FALSE, view)
    glUniformMatrix4fv(proj_loc, 1, GL_FALSE, projection)
    glUniformMatrix4fv(model_loc, 1, GL_FALSE, model)

    glClearColor(114.0 / 255.0, 144.0 / 255.0, 154.0 / 255.0, 1.0)
    glEnable(GL_DEPTH_TEST)
    glViewport(0, 0, w_width, w_height)

    args = parse_args()
    print(args)

    try:
        # Create checkpoint directory if it does not exist
        os.makedirs(args.checkpoint)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise RuntimeError('Unable to create checkpoint directory:',
                               args.checkpoint)

    print('Loading 2D detections...')

    keypoints = np.load('data/data_2d_' + args.keypoints + '.npz')

    keypoints = keypoints['positions_2d'].item()

    subject = 'S1'

    action = 'Directions 1'

    width_of = 410
    height_of = 374

    for cam_idx, kps in enumerate(keypoints[subject][action]):

        # Normalize camera frame
        # cam = dataset.cameras()[subject][cam_idx]
        kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                    w=width_of,
                                                    h=height_of)
        keypoints[subject][action][cam_idx] = kps

    subjects_train = args.subjects_train.split(',')
    subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(
        ',')
    subjects_test = args.subjects_test.split(',')

    semi_supervised = len(subjects_semi) > 0
    if semi_supervised and not dataset.supports_semi_supervised():
        raise RuntimeError(
            'Semi-supervised training is not implemented for this dataset')

    action_filter = None if args.actions == '*' else args.actions.split(',')
    if action_filter is not None:
        print('Selected actions:', action_filter)

    cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test,
                                                       keypoints,
                                                       args.downsample,
                                                       action_filter)

    filter_widths = [int(x) for x in args.architecture.split(',')]

    # IF RENDERING TO A VIDEO
    if args.viz_output:
        model_pos = TemporalModel(poses_valid_2d[0].shape[1],
                                  poses_valid_2d[0].shape[2],
                                  17,
                                  filter_widths=filter_widths,
                                  causal=args.causal,
                                  dropout=args.dropout,
                                  channels=args.channels,
                                  dense=args.dense)
    else:
        model_pos = TemporalModelOptimized1f(poses_valid_2d[0].shape[1],
                                             poses_valid_2d[0].shape[2],
                                             17,
                                             filter_widths=filter_widths,
                                             causal=args.causal,
                                             dropout=args.dropout,
                                             channels=args.channels)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
    #    model_pos_train = model_pos_train.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(
            args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename,
                                map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(
            checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # IF RENDERING TO A VIDEO
    if args.viz_output:

        print('Rendering...')
        my_action = 'Directions 1'

        input_keypoints = keypoints[args.viz_subject][my_action][
            args.viz_camera].copy()

        gen = UnchunkedGenerator(None,
                                 None, [input_keypoints],
                                 pad=pad,
                                 causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left,
                                 kps_right=kps_right,
                                 joints_left=joints_left,
                                 joints_right=joints_right)

        prediction = evaluate(gen, model_pos, return_predictions=True)

        ground_truth = None

        # These values are taken from a camera in the H3.6M dataset; it would be better to derive them from a stereo calibration of the pip cameras
        prediction = camera_to_world(
            prediction,
            R=[0.14070565, -0.15007018, -0.7552408, 0.62232804],
            t=[1.841107, 4.9552846, 0.5634454])
        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])

        anim_output = {'Reconstruction': prediction}

        input_keypoints = image_coordinates(input_keypoints[..., :2],
                                            w=width_of,
                                            h=height_of)

        manual_fps = 25

        np.savez('out_3D_vp3d', anim_output['Reconstruction'])
        camAzimuth = 70.0
        from common.visualization import render_animation
        render_animation(input_keypoints,
                         anim_output,
                         manual_fps,
                         args.viz_bitrate,
                         camAzimuth,
                         args.viz_output,
                         limit=args.viz_limit,
                         downsample=args.viz_downsample,
                         size=args.viz_size,
                         input_video_path=args.viz_video,
                         viewport=(width_of, height_of),
                         input_video_skip=args.viz_skip)
    # IF RENDERING LIVE

    else:
        print('Rendering...')
        my_action = 'Directions 1'

        input_keypoints = keypoints[args.viz_subject][my_action][
            args.viz_camera].copy()

        gen = UnchunkedGenerator(None,
                                 None, [input_keypoints],
                                 pad=pad,
                                 causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left,
                                 kps_right=kps_right,
                                 joints_left=joints_left,
                                 joints_right=joints_right)

        prediction = evaluateLive(gen,
                                  model_pos,
                                  VBO,
                                  window,
                                  model_loc,
                                  cap,
                                  opWrapper,
                                  return_predictions=True)
        glfw.terminate()
        cap.release()
        cv2.destroyAllWindows()
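The live path hands the GL objects to evaluateLive(), which is not shown in this example. A hedged sketch of what the per-frame buffer update and draw inside such a loop might look like (draw_frame, joints_3d and index_count are hypothetical names, and the element type is assumed to be GL_UNSIGNED_INT):

import numpy as np
import glfw
from OpenGL.GL import (GL_ARRAY_BUFFER, GL_COLOR_BUFFER_BIT, GL_DEPTH_BUFFER_BIT,
                       GL_LINES, GL_UNSIGNED_INT, glBindBuffer, glBufferSubData,
                       glClear, glDrawElements)

def draw_frame(window, VBO, joints_3d, index_count):
    # Overwrite the 17*3 float VBO with the current frame's joints and draw the skeleton edges.
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
    data = np.asarray(joints_3d, dtype=np.float32).ravel()
    glBindBuffer(GL_ARRAY_BUFFER, VBO)
    glBufferSubData(GL_ARRAY_BUFFER, 0, data.nbytes, data)
    glDrawElements(GL_LINES, index_count, GL_UNSIGNED_INT, None)
    glfw.swap_buffers(window)
    glfw.poll_events()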