Example #1
    def __getitem__(self, index):
        TParsing, SPose, SParsing, SFG, seq_idx = self.update_frame_idx_parser(
            self.sparsing_paths, index)
        sparsing_paths = self.sparsing_paths[seq_idx]
        sfg_paths = self.sfg_paths[seq_idx]
        n_frames_total, start_idx, t_step = get_video_params(
            self.opt, self.n_frames_total, len(sparsing_paths), self.frame_idx)

        sparsing = Image.open(sparsing_paths[start_idx])
        sfg = Image.open(sfg_paths[start_idx]).convert('RGB')
        size = sfg.size

        # True when the frame is landscape (wider than tall)
        BigSizeFlag = size[0] / size[1] > 1

        if BigSizeFlag:
            params = get_img_params(self.opt, (1920, 1080))
        else:
            params = get_img_params(self.opt, size)

        tparsing_path = self.tparsing_paths[seq_idx][0]

        TParsing = self.get_TImage(tparsing_path, size, params, BigSizeFlag)
        TParsing = self.crop(TParsing)

        frame_range = list(range(n_frames_total)) if (
            self.opt.isTrain
            or self.TPose is None) else [self.opt.n_frames_G - 1]
        for i in frame_range:
            sparsing_path = sparsing_paths[start_idx + i * t_step]
            spose_path = self.spose_paths[seq_idx][start_idx + i * t_step]
            sfg_path = sfg_paths[start_idx + i * t_step]

            SPose_i, SParsing_i, SFG_i = self.get_SImage(
                spose_path, sparsing_path, sfg_path, size, params, BigSizeFlag)

            SParsing_i = self.crop(SParsing_i)
            SPose_i = self.crop(SPose_i)
            SFG_i = self.crop(SFG_i)

            SPose = concat_frame(SPose, SPose_i, n_frames_total)
            SParsing = concat_frame(SParsing, SParsing_i, n_frames_total)
            SFG = concat_frame(SFG, SFG_i, n_frames_total)

        if not self.opt.isTrain:
            self.TParsing, self.SPose, self.SParsing, self.SFG = TParsing, SPose, SParsing, SFG
            self.frame_idx += 1
        change_seq = False if self.opt.isTrain else self.change_seq
        return_list = {
            'TParsing': TParsing,
            'SPose': SPose,
            'SParsing': SParsing,
            'SFG': SFG,
            'A_path': sparsing_path,
            'change_seq': change_seq
        }
        return return_list
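All of these __getitem__ variants lean on a concat_frame helper that is not shown here. Judging from its call sites, it keeps a sliding window of the most recent n_frames frames, concatenated along the channel axis. A minimal sketch consistent with that usage (an assumption, not the repository's actual helper):

import torch

def concat_frame(A, Ai, n_frames):
    # The examples seed the buffer with None or 0, so the first real
    # frame simply starts it.
    if A is None or isinstance(A, int):
        return Ai
    c = Ai.size(0)
    if A.size(0) == n_frames * c:
        A = A[c:]  # evict the oldest frame once the window is full
    return torch.cat([A, Ai], dim=0)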
Example #2
    def __getitem__(self, index):
        tG = self.opt.n_frames_G
        A_paths = self.A_paths[index % self.n_of_seqs]
        B_paths = self.B_paths[index % self.n_of_seqs]
        if self.opt.use_instance:
            I_paths = self.I_paths[index % self.n_of_seqs]

        # setting parameters
        n_frames_total, start_idx, t_step = get_video_params(self.opt, self.n_frames_total, len(A_paths), index)

        # setting transformers
        B_img = Image.open(B_paths[start_idx]).convert('RGB')
        params = get_img_params(self.opt, B_img.size)
        transform_scaleB = get_transform(self.opt, params)
        transform_scaleA = get_transform(self.opt, params, method=Image.NEAREST, normalize=False) if self.A_is_label else transform_scaleB

        # read in images
        A = B = inst = 0
        for i in range(n_frames_total):
            A_path = A_paths[start_idx + i * t_step]
            B_path = B_paths[start_idx + i * t_step]
            Ai = self.get_image(A_path, transform_scaleA, is_label=self.A_is_label)
            Bi = self.get_image(B_path, transform_scaleB)

            A = Ai if i == 0 else torch.cat([A, Ai], dim=0)
            B = Bi if i == 0 else torch.cat([B, Bi], dim=0)

            if self.opt.use_instance:
                I_path = I_paths[start_idx + i * t_step]
                Ii = self.get_image(I_path, transform_scaleA) * 255.0
                inst = Ii if i == 0 else torch.cat([inst, Ii], dim=0)

        return_list = {'A': A, 'B': B, 'inst': inst, 'A_path': A_path, 'B_paths': B_path}
        return return_list
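Note that the returned 'A' and 'B' tensors stack every frame of the clip along the channel axis (the torch.cat calls above use dim=0 on CHW tensors). A self-contained illustration of how a consumer recovers the per-frame view, using hypothetical sizes:

import torch

# Hypothetical dimensions for illustration; the real values come from opt.
n_frames_total, C, H, W = 6, 3, 256, 256

# __getitem__ returns frames stacked along the channel axis ...
sample_B = torch.randn(n_frames_total * C, H, W)

# ... so downstream code reshapes before treating them as a sequence.
frames = sample_B.view(n_frames_total, C, H, W)
print(frames.shape)  # torch.Size([6, 3, 256, 256])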
Example #3
    def __getitem__(self, index):
        self.A, self.B, self.I, seq_idx = self.update_frame_idx(self.A_paths, index)
        tG = self.opt.n_frames_G
              
        A_img = Image.open(self.A_paths[seq_idx][0]).convert('RGB')        
        params = get_img_params(self.opt, A_img.size)
        transform_scaleB = get_transform(self.opt, params)
        transform_scaleA = get_transform(self.opt, params, method=Image.NEAREST, normalize=False) if self.A_is_label else transform_scaleB
        frame_range = list(range(tG)) if self.A is None else [tG-1]
           
        for i in frame_range:                                                   
            A_path = self.A_paths[seq_idx][self.frame_idx + i]            
            Ai = self.get_image(A_path, transform_scaleA, is_label=self.A_is_label)            
            self.A = concat_frame(self.A, Ai, tG)

            if self.use_real:
                B_path = self.B_paths[seq_idx][self.frame_idx + i]
                Bi = self.get_image(B_path, transform_scaleB)                
                self.B = concat_frame(self.B, Bi, tG)
            else:
                self.B = 0

            if self.opt.use_instance:
                I_path = self.I_paths[seq_idx][self.frame_idx + i]
                Ii = self.get_image(I_path, transform_scaleA) * 255.0                
                self.I = concat_frame(self.I, Ii, tG)
            else:
                self.I = 0

        self.frame_idx += 1        
        return_list = {'A': self.A, 'B': self.B, 'inst': self.I, 'A_path': A_path, 'change_seq': self.change_seq}
        return return_list
Example #4
    def __getitem__(self, index):
        A, B, _, seq_idx = self.update_frame_idx(self.img_paths, index)
        img_paths = self.img_paths[seq_idx]
        n_frames_total, start_idx, t_step = get_video_params(
            self.opt, self.n_frames_total, len(img_paths), self.frame_idx)

        img = Image.open(img_paths[start_idx]).convert('RGB')
        size = img.size
        params = get_img_params(self.opt, size)

        frame_range = list(range(n_frames_total)) if (
            self.opt.isTrain or self.A is None) else [self.opt.n_frames_G - 1]
        for i in frame_range:
            img_path = img_paths[start_idx + i * t_step]
            if not self.opt.openpose_only:
                dp_path = self.dp_paths[seq_idx][start_idx + i * t_step]
                Di = self.get_image(dp_path,
                                    size,
                                    params,
                                    input_type='densepose')
                Di[2, :, :] = (
                    (Di[2, :, :] * 0.5 + 0.5) * 255 / 24 - 0.5) / 0.5
            if not self.opt.densepose_only:
                op_path = self.op_paths[seq_idx][start_idx + i * t_step]
                Oi = self.get_image(op_path,
                                    size,
                                    params,
                                    input_type='openpose')

            if self.opt.openpose_only:
                Ai = Oi
            elif self.opt.densepose_only:
                Ai = Di
            else:
                Ai = torch.cat([Di, Oi])
            Bi = self.get_image(img_path, size, params, input_type='img')

            Ai, Bi = self.crop(Ai), self.crop(
                Bi)  # only crop the central half region to save time
            A = concat_frame(A, Ai, n_frames_total)
            B = concat_frame(B, Bi, n_frames_total)

        if not self.opt.isTrain:
            self.A, self.B = A, B
            self.frame_idx += 1
        change_seq = False if self.opt.isTrain else self.change_seq
        return_list = {
            'A': A,
            'B': B,
            'inst': 0,
            'A_path': img_path,
            'change_seq': change_seq
        }

        return return_list
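The comment above says self.crop keeps only the central half of the image to save time. That helper is not shown; a plausible sketch under that assumption:

import torch

def crop(t: torch.Tensor) -> torch.Tensor:
    # Keep the central half of the width (a guess at self.crop's behavior,
    # based on the "central half region" comment in the example).
    w = t.size(-1)
    x0 = w // 4
    return t[..., x0:x0 + w // 2]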
Example #5
    def __getitem__(self, index):
        A, B, I, seq_idx = self.update_frame_idx(self.A_paths, index)
        A_paths = self.A_paths[seq_idx]
        B_paths = self.B_paths[seq_idx]
        n_frames_total, start_idx, t_step = get_video_params(
            self.opt, self.n_frames_total, len(A_paths), self.frame_idx)

        B_img = Image.open(B_paths[0]).convert('RGB')
        B_size = B_img.size
        points = np.loadtxt(A_paths[0], delimiter=',')
        is_first_frame = self.opt.isTrain or not hasattr(self, 'min_x')
        if is_first_frame:  # crop only the face region
            self.get_crop_coords(points, B_size)
        params = get_img_params(self.opt, self.crop(B_img).size)
        transform_scaleA = get_transform(self.opt,
                                         params,
                                         method=Image.BILINEAR,
                                         normalize=False)
        transform_label = get_transform(self.opt,
                                        params,
                                        method=Image.NEAREST,
                                        normalize=False)
        transform_scaleB = get_transform(self.opt, params)

        # read in images
        frame_range = list(range(n_frames_total)) if self.A is None else [
            self.opt.n_frames_G - 1
        ]
        for i in frame_range:
            A_path = A_paths[start_idx + i * t_step]
            B_path = B_paths[start_idx + i * t_step]
            B_img = Image.open(B_path)
            Ai, Li = self.get_face_image(A_path, transform_scaleA,
                                         transform_label, B_size, B_img)
            Bi = transform_scaleB(self.crop(B_img))
            A = concat_frame(A, Ai, n_frames_total)
            B = concat_frame(B, Bi, n_frames_total)
            I = concat_frame(I, Li, n_frames_total)

        if not self.opt.isTrain:
            self.A, self.B, self.I = A, B, I
            self.frame_idx += 1
        change_seq = False if self.opt.isTrain else self.change_seq
        return_list = {
            'A': A,
            'B': B,
            'inst': I,
            'A_path': A_path,
            'change_seq': change_seq
        }

        return return_list
Example #6
    def __getitem__(self, index):
        opt = self.opt
        if opt.isTrain:
            np.random.seed()
            seq_idx = np.random.randint(self.n_of_seqs)
            L_paths = self.L_paths[seq_idx]
            I_paths = self.I_paths[seq_idx]
            ref_L_paths, ref_I_paths = L_paths, I_paths
        else:
            L_paths, I_paths = self.L_paths, self.I_paths
            ref_L_paths, ref_I_paths = self.ref_L_paths, self.ref_I_paths

        n_frames_total, start_idx, t_step, ref_indices = get_video_params(
            opt, self.n_frames_total, len(I_paths), index)
        w, h = opt.fineSize, int(opt.fineSize / opt.aspect_ratio)
        img_params = get_img_params(opt, (w, h))
        is_first_frame = opt.isTrain or index == 0

        transform_L = get_transform(opt,
                                    img_params,
                                    method=Image.BILINEAR,
                                    normalize=False)
        transform_I = get_transform(opt, img_params, color_aug=opt.isTrain)

        ### read in reference images
        Lr, Ir = self.Lr, self.Ir
        if is_first_frame:
            # get crop coordinates and stroke width
            points = self.read_data(ref_L_paths[ref_indices[0]],
                                    data_type='np')
            ref_crop_coords = self.get_crop_coords(points)
            self.bw = max(1, (ref_crop_coords[1] - ref_crop_coords[0]) // 256)

            # get keypoints for all reference frames
            ref_L_paths = [ref_L_paths[idx] for idx in ref_indices]
            all_keypoints = self.read_all_keypoints(ref_L_paths,
                                                    ref_crop_coords,
                                                    is_ref=True)

            # read all reference images
            for i, idx in enumerate(ref_indices):
                keypoints = all_keypoints[i]
                ref_img = self.crop(self.read_data(ref_I_paths[idx]),
                                    ref_crop_coords)
                Li = self.get_face_image(keypoints, transform_L, ref_img.size)
                Ii = transform_I(ref_img)
                Lr = self.concat_frame(Lr, Li.unsqueeze(0))
                Ir = self.concat_frame(Ir, Ii.unsqueeze(0))
            if not opt.isTrain:
                self.Lr, self.Ir = Lr, Ir

        ### read in target images
        if is_first_frame:
            # get crop coordinates
            points = self.read_data(L_paths[start_idx], data_type='np')
            crop_coords = self.get_crop_coords(points)
            if not opt.isTrain:
                if self.fix_crop_pos:
                    self.crop_coords = crop_coords
                else:
                    # height/width of the crop region for later frames
                    self.crop_size = (crop_coords[1] - crop_coords[0],
                                      crop_coords[3] - crop_coords[2])
            self.bw = max(1, (crop_coords[1] - crop_coords[0]) // 256)

            # get keypoints for all frames
            end_idx = (start_idx +
                       n_frames_total * t_step) if opt.isTrain else (
                           start_idx + opt.how_many)
            L_paths = L_paths[start_idx:end_idx:t_step]
            crop_coords = crop_coords if self.fix_crop_pos else None
            all_keypoints = self.read_all_keypoints(L_paths,
                                                    crop_coords,
                                                    is_ref=False)
            if not opt.isTrain: self.all_keypoints = all_keypoints
        else:
            # use same crop coordinates as previous frames
            if self.fix_crop_pos:
                crop_coords = self.crop_coords
            else:
                points = self.read_data(L_paths[start_idx], data_type='np')
                crop_coords = self.get_crop_coords(points, self.crop_size)
            all_keypoints = self.all_keypoints

        L, I = self.L, self.I
        for t in range(n_frames_total):
            ti = t if opt.isTrain else start_idx + t
            keypoints = all_keypoints[ti]
            I_path = I_paths[start_idx + t * t_step]
            img = self.crop(self.read_data(I_path), crop_coords)
            Lt = self.get_face_image(keypoints, transform_L, img.size)
            It = transform_I(img)
            L = self.concat_frame(L, Lt.unsqueeze(0))
            I = self.concat_frame(I, It.unsqueeze(0))
        if not opt.isTrain:
            self.L, self.I = L, I
        seq = (path.basename(path.dirname(opt.ref_img_path)) + '-' +
               opt.ref_img_id + '_' +
               path.basename(path.dirname(opt.seq_path)))

        return_list = {
            'tgt_label': L,
            'tgt_image': I,
            'ref_label': Lr,
            'ref_image': Ir,
            'path': I_path,
            'seq': seq
        }
        return return_list
Example #7
    def __getitem__(self, index):
        opt = self.opt
        if opt.isTrain:
            np.random.seed(index)
            seq_idx = np.random.randint(self.n_of_seqs)  # which sequence to load

            img_paths = self.img_paths[seq_idx]
            op_paths = self.op_paths[seq_idx]
            dp_paths = self.dp_paths[seq_idx]
            ref_img_paths, ref_op_paths, ref_dp_paths = img_paths, op_paths, dp_paths
        else:
            img_paths, op_paths, dp_paths = self.img_paths, self.op_paths, self.dp_paths
            ref_img_paths, ref_op_paths, ref_dp_paths = self.ref_img_paths, self.ref_op_paths, self.ref_dp_paths

        ### setting parameters
        # n_frames_total: # of frames to train
        # start_idx: which frame index to start with
        # t_step: # of frames between neighboring frames
        # ref_indices: frame indices for the reference images
        n_frames_total, start_idx, t_step, ref_indices = get_video_params(
            self.opt, self.n_frames_total, len(img_paths), index)
        w, h = opt.fineSize, int(opt.fineSize / opt.aspect_ratio)
        img_params = get_img_params(opt, (w, h))
        is_first_frame = opt.isTrain or index == 0

        ### reference image
        Lr, Ir = self.Lr, self.Ir
        if is_first_frame:  # need to read reference images for every training iter or at beginning of inference
            ref_crop_coords = [None] * opt.n_shot
            for i, idx in enumerate(ref_indices):
                ref_size = self.read_data(ref_img_paths[idx]).size
                Li, Ii, ref_crop_coords[i], ref_face_pts = self.get_images(
                    ref_img_paths, ref_op_paths, ref_dp_paths, idx, ref_size,
                    img_params, self.ref_crop_coords[i])
                Lr = self.concat_frame(Lr, Li.unsqueeze(0))
                Ir = self.concat_frame(Ir, Ii.unsqueeze(0))

            if not opt.isTrain:  # keep track of non-changing variables during inference
                read_keypoints.face_ratio = None
                self.Lr, self.Ir = Lr, Ir
                self.ref_face_pts = None
                self.ref_crop_coords = ref_crop_coords

        ### target image
        size = self.read_data(img_paths[0]).size
        crop_coords = self.crop_coords if not opt.isTrain else ref_crop_coords[0]

        L, I = self.L, self.I
        for t in range(n_frames_total):
            idx = start_idx + t * t_step
            Lt, It, crop_coords, _ = self.get_images(img_paths, op_paths, dp_paths, idx, size, img_params, \
                crop_coords, self.ref_face_pts)
            L = self.concat_frame(L, Lt.unsqueeze(0))
            I = self.concat_frame(I, It.unsqueeze(0))

        if not opt.isTrain:
            self.L, self.I = L, I
            if index == 0: self.crop_coords = crop_coords
        seq = (path.basename(path.dirname(opt.ref_img_path)) + '-' +
               opt.ref_img_id + '_' +
               path.basename(path.dirname(opt.seq_path)))

        return_list = {
            'tgt_label': L,
            'tgt_image': I,
            'ref_label': Lr,
            'ref_image': Ir,
            'path': img_paths[idx],
            'seq': seq
        }

        return return_list
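The comment block in this example spells out the contract of get_video_params: training samples a random clip with a random temporal stride, while inference walks the sequence one frame per call. A deliberately simplified sketch of that contract (an assumption; the real implementation also caps clip length per GPU and handles more options):

import numpy as np

def get_video_params_sketch(opt, n_frames_total, seq_len, index):
    if opt.isTrain:
        # Random clip: pick a stride, then a start offset that keeps the
        # whole clip inside the sequence.
        n_frames_total = min(n_frames_total, seq_len)
        max_step = max(1, (seq_len - 1) // max(1, n_frames_total - 1))
        t_step = np.random.randint(1, max_step + 1)
        start_idx = np.random.randint(
            max(1, seq_len - (n_frames_total - 1) * t_step))
    else:
        # Inference: one new frame per call, starting at `index`.
        n_frames_total, start_idx, t_step = 1, index, 1
    ref_indices = [0]  # which frames serve as references (assumption)
    return n_frames_total, start_idx, t_step, ref_indices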
Example #8
    def __getitem__(self, index):
        TParsing, TFG, SPose, SParsing, SFG, SFG_full, BG, BG_flag, SI, seq_idx = self.update_frame_idx_composer(
            self.simg_paths, index)
        simg_paths = self.simg_paths[seq_idx]
        n_frames_total, start_idx, t_step = get_video_params(
            self.opt, self.n_frames_total, len(simg_paths), self.frame_idx)

        simg = Image.open(simg_paths[start_idx]).convert('RGB')
        size = simg.size

        # True when the frame is landscape (wider than tall)
        BigSizeFlag = size[0] / size[1] > 1

        if BigSizeFlag:
            params = get_img_params(self.opt, (1920, 1080))
        else:
            params = get_img_params(self.opt, size)

        tparsing_path = self.tparsing_paths[seq_idx][0]
        timg_path = self.timg_paths[seq_idx][0]

        video_name = timg_path[timg_path.rfind('video'):timg_path.rfind('/timg')]
        bg_path = self.dir_bg + '/' + video_name + '.jpg'
        BG_i, BG_flag = self.get_bg_image(bg_path, size, params, BigSizeFlag)

        TParsing, TFG = self.get_TImage(tparsing_path, timg_path, size, params,
                                        BigSizeFlag)
        TParsing, TFG = self.crop(TParsing), self.crop(TFG)

        frame_range = list(range(n_frames_total)) if (
            self.opt.isTrain
            or self.TPose is None) else [self.opt.n_frames_G - 1]
        for i in frame_range:
            simg_path = simg_paths[start_idx + i * t_step]
            sfg_path = self.sfg_paths[seq_idx][start_idx + i * t_step]
            spose_path = self.spose_paths[seq_idx][start_idx + i * t_step]
            sparsing_path = self.sparsing_paths[seq_idx][start_idx + i * t_step]

            SPose_i, SParsing_i, SFG_i, SFG_full_i, SI_i = self.get_SImage(
                spose_path, sparsing_path, sfg_path, simg_path, size, params,
                BigSizeFlag)

            SParsing_i = self.crop(SParsing_i)
            SFG_i = self.crop(SFG_i)
            SPose_i, SFG_full_i, SI_i = self.crop(SPose_i), self.crop(
                SFG_full_i), self.crop(SI_i)

            SPose = concat_frame(SPose, SPose_i, n_frames_total)
            SParsing = concat_frame(SParsing, SParsing_i, n_frames_total)
            SFG = concat_frame(SFG, SFG_i, n_frames_total)
            SFG_full = concat_frame(SFG_full, SFG_full_i, n_frames_total)
            SI = concat_frame(SI, SI_i, n_frames_total)
            BG = concat_frame(BG, BG_i, n_frames_total)

        if not self.opt.isTrain:
            self.TParsing, self.TFG, self.SPose, self.SParsing, self.SFG, self.SFG_full, self.BG, self.BG_flag, self.SI = TParsing, TFG, SPose, SParsing, SFG, SFG_full, BG, BG_flag, SI
            self.frame_idx += 1
        change_seq = False if self.opt.isTrain else self.change_seq
        return_list = {
            'TParsing': TParsing,
            'TFG': TFG,
            'SPose': SPose,
            'SParsing': SParsing,
            'SFG': SFG,
            'SFG_full': SFG_full,
            'BG': BG,
            'BG_flag': BG_flag,
            'SI': SI,
            'A_path': simg_path,
            'change_seq': change_seq
        }
        return return_list
Example #9
    def __getitem__(self, index):
        opt = self.opt
        if opt.isTrain:
            L_paths = self.L_paths[index % self.n_of_seqs]
            I_paths = self.I_paths[index % self.n_of_seqs]
            ref_L_paths, ref_I_paths = L_paths, I_paths
        else:
            L_paths, I_paths = self.L_paths, self.I_paths
            ref_L_paths, ref_I_paths = self.ref_L_paths, self.ref_I_paths

        ### setting parameters
        n_frames_total, start_idx, t_step, ref_indices = get_video_params(
            opt, self.n_frames_total, len(I_paths), index)
        w, h = opt.fineSize, int(opt.fineSize / opt.aspect_ratio)
        img_params = get_img_params(opt, (w, h))
        is_first_frame = opt.isTrain or index == 0

        transform_I = get_transform(opt, img_params, color_aug=opt.isTrain)
        transform_L = get_transform(
            opt, img_params, method=Image.NEAREST,
            normalize=False) if self.L_is_label else transform_I

        ### read in reference image
        Lr, Ir = self.Lr, self.Ir
        if is_first_frame:
            for idx in ref_indices:
                Li = self.get_image(ref_L_paths[idx],
                                    transform_L,
                                    is_label=self.L_is_label)
                Ii = self.get_image(ref_I_paths[idx], transform_I)
                Lr = self.concat_frame(Lr, Li.unsqueeze(0))
                Ir = self.concat_frame(Ir, Ii.unsqueeze(0))

            if not opt.isTrain:  # keep track of non-changing variables during inference
                self.Lr, self.Ir = Lr, Ir

        ### read in target images
        L, I = self.L, self.I
        for t in range(n_frames_total):
            idx = start_idx + t * t_step
            Lt = self.get_image(L_paths[idx],
                                transform_L,
                                is_label=self.L_is_label)
            It = self.get_image(I_paths[idx], transform_I)
            L = self.concat_frame(L, Lt.unsqueeze(0))
            I = self.concat_frame(I, It.unsqueeze(0))

        if not opt.isTrain:
            self.L, self.I = L, I

        seq = (path.basename(path.dirname(opt.ref_img_path)) + '-' +
               opt.ref_img_id + '_' +
               path.basename(path.dirname(opt.seq_path)))

        return_list = {
            'tgt_label': L,
            'tgt_image': I,
            'ref_label': Lr,
            'ref_image': Ir,
            'path': I_paths[idx],
            'seq': seq
        }
        return return_list
Example #10
    def __getitem__(self, index):
        opt = self.opt
        if opt.isTrain:
            np.random.seed()
            seq_idx = np.random.randint(self.n_of_seqs)

            L_paths = self.L_paths[seq_idx]
            I_paths = self.I_paths[seq_idx]
            ref_L_paths, ref_I_paths = L_paths, I_paths

            # read in videos
            I_videos = cv2.VideoCapture(I_paths)
            ref_I_videos = I_videos

        elif opt.example:
            L_paths = self.L_paths[index]
            I_paths = self.I_paths[index]

            # debug: hard-coded sample paths override the indexed ones
            L_paths = os.path.join(opt.dataroot, 'unzip/test_video',
                                   'id00017/lZf1RB6l5Gs/00152_aligned.npy')
            I_paths = os.path.join(opt.dataroot, 'unzip/test_video',
                                   'id00017/lZf1RB6l5Gs/00152_aligned.mp4')

            ref_L_paths, ref_I_paths = L_paths, I_paths

            # read in videos
            I_videos = cv2.VideoCapture(I_paths)
            ref_I_videos = I_videos

        else:
            L_paths, I_paths = self.L_paths, self.I_paths
            ref_L_paths, ref_I_paths = self.ref_L_paths, self.ref_I_paths

            I_videos = self.I_videos
            ref_I_videos = self.ref_I_videos

        n_frames_total, start_idx, t_step, ref_indices = get_video_params(
            opt, self.n_frames_total,
            int(I_videos.get(cv2.CAP_PROP_FRAME_COUNT)) - 1, index)

        w, h = opt.fineSize, int(opt.fineSize / opt.aspect_ratio)
        img_params = get_img_params(opt, (w, h))
        is_first_frame = opt.isTrain or index == 0 or opt.example

        transform_L = get_transform(opt,
                                    img_params,
                                    method=Image.BILINEAR,
                                    normalize=False)
        transform_I = get_transform(opt, img_params, color_aug=opt.isTrain)

        ### read in reference images
        Lr, Ir = self.Lr, self.Ir
        if is_first_frame:
            # get crop coordinates and stroke width
            tot_points = self.read_data(ref_L_paths, data_type='npy')
            points = tot_points[ref_indices[0]]
            ref_crop_coords = self.get_crop_coords(points, for_ref=True)
            self.bw = max(1, (ref_crop_coords[1] - ref_crop_coords[0]) // 256)

            # get keypoints for all reference frames
            all_keypoints = self.get_all_key_points(tot_points[ref_indices],
                                                    ref_crop_coords,
                                                    is_ref=True)

            # read all reference images
            for i, idx in enumerate(ref_indices):
                keypoints = all_keypoints[i]
                ref_img = self.crop(self.get_specific_frame(idx, ref_I_videos),
                                    ref_crop_coords)
                Li = self.get_face_image(keypoints, transform_L, ref_img.size)
                Ii = transform_I(ref_img)
                Lr = self.concat_frame(Lr, Li.unsqueeze(0), ref=True)
                Ir = self.concat_frame(Ir, Ii.unsqueeze(0), ref=True)
            if not opt.isTrain and not opt.example:
                self.Lr, self.Ir = Lr, Ir

        ### read in target images
        if is_first_frame:
            # get crop coordinates
            tot_points = self.read_data(L_paths, data_type='npy')
            points = tot_points[start_idx]
            crop_coords = self.get_crop_coords(points)
            if not opt.isTrain:
                if self.fix_crop_pos:
                    self.crop_coords = crop_coords
                else:
                    self.crop_size = (crop_coords[1] - crop_coords[0],
                                      crop_coords[3] - crop_coords[2])
            self.bw = max(1, (crop_coords[1] - crop_coords[0]) // 256)

            # get keypoints for all frames
            end_idx = (start_idx + n_frames_total * t_step) if (
                opt.isTrain or opt.example) else (start_idx + opt.how_many)
            L_points = tot_points[start_idx:end_idx:t_step]

            crop_coords = crop_coords if self.fix_crop_pos else None
            all_keypoints = self.get_all_key_points(L_points,
                                                    crop_coords,
                                                    is_ref=False)
            if not opt.isTrain: self.all_keypoints = all_keypoints
        else:
            # use same crop coordinates as previous frames
            if self.fix_crop_pos:
                crop_coords = self.crop_coords
            else:
                tot_points = self.read_data(L_paths, data_type='npy')
                crop_coords = self.get_crop_coords(tot_points[start_idx],
                                                   self.crop_size)
            all_keypoints = self.all_keypoints

        L, I = self.L, self.I
        for t in range(n_frames_total):
            ti = t if (opt.isTrain or opt.example) else start_idx + t
            keypoints = all_keypoints[ti]
            img = self.crop(
                self.get_specific_frame(start_idx + t * t_step, I_videos),
                crop_coords)
            Lt = self.get_face_image(keypoints, transform_L, img.size)
            It = transform_I(img)
            L = self.concat_frame(L, Lt.unsqueeze(0), ref=False)
            I = self.concat_frame(I, It.unsqueeze(0), ref=False)
        if not opt.isTrain and not opt.example:
            self.L, self.I = L, I
        seq = (path.basename(path.dirname(opt.ref_img_path)) + '-' +
               opt.ref_img_id + '_' +
               path.basename(path.dirname(opt.seq_path)))

        return_list = {
            'tgt_label': L,
            'tgt_image': I,
            'ref_label': Lr,
            'ref_image': Ir,
            'path': I_paths,
            'seq': seq
        }
        return return_list
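Unlike the other examples, this one decodes frames straight from a video file through get_specific_frame, which is not shown. A sketch of what such a helper might look like with OpenCV (assumed behavior: seek, decode, and convert BGR to an RGB PIL image):

import cv2
from PIL import Image

def get_specific_frame(idx, video):
    # Seek to frame idx in an opened cv2.VideoCapture and decode it.
    video.set(cv2.CAP_PROP_POS_FRAMES, idx)
    ok, frame = video.read()
    if not ok:
        raise IOError('could not read frame %d' % idx)
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))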