Example 1
    def __init__(
        self,
        root_path="data/BSLCP",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=16,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        gpu_collation=False,
        word_data_pkl=None,
        featurize_mask="",
        featurize_mode=False,
    ):
        self.root_path = root_path
        self.setname = setname  # train, val or test
        self.featurize_mode = featurize_mode
        self.featurize_mask = featurize_mask
        self.gpu_collation = gpu_collation
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.stride = stride

        infofile = os.path.join(root_path, "info/info.pkl")
        self.video_folder = "videos-resized-25fps-256x256-signdict_signbank"

        print(f"Loading {infofile}")
        data = pkl.load(open(infofile, "rb"))

        self.set_video_metadata(data,
                                meta_key="videos",
                                fixed_sz_frames=gpu_collation)
        self.set_class_names(data=data, word_data_pkl=word_data_pkl)

        self.train = list(
            np.where(np.asarray(data["videos"]["split"]) == 0)[0])
        self.valid = list(
            np.where(np.asarray(data["videos"]["split"]) == 2)[0])

        self.videos = [s.strip() for s in data["videos"]["name"]]

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
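The train/valid index lists above are built by matching the integer split codes stored in info.pkl (0 for train, 2 for test). A minimal, self-contained sketch of that selection pattern, using invented split codes in place of the real metadata:

import numpy as np

# Hypothetical split codes: 0 = train, 1 = val, 2 = test (values invented for illustration).
split = [0, 0, 2, 1, 0, 2]

train_ix = list(np.where(np.asarray(split) == 0)[0])  # [0, 1, 4]
valid_ix = list(np.where(np.asarray(split) == 2)[0])  # [2, 5]
print(train_ix, valid_ix)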
Example 2
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2',
        'somethingv1'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']
    if 'somethingv1' in dataset_name:
        formatter = sthv1_image_name_formatter
    elif 'somethingv2' in dataset_name:
        formatter = sthv2_image_name_formatter
    else:
        formatter = image_name_formatter
    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / '{}.hdf5'.format(video_id))

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
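The two video_path_formatter lambdas differ only in whether a '.hdf5' suffix is appended to the video id. A stand-alone illustration with pathlib, using an invented root directory:

from pathlib import Path

jpg_formatter = lambda root_path, label, video_id: root_path / label / video_id
hdf5_formatter = lambda root_path, label, video_id: root_path / label / '{}.hdf5'.format(video_id)

root = Path('data/kinetics')  # hypothetical root directory
print(jpg_formatter(root, 'archery', 'abc123'))   # data/kinetics/archery/abc123
print(hdf5_formatter(root, 'archery', 'abc123'))  # data/kinetics/archery/abc123.hdf5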
Example 3
def get_validation_data(video_path,
                        annotation_path,
                        dataset_name,
                        input_type,
                        file_type,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        validation_data = VideoDataset(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)
    else:
        validation_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)

    return validation_data, collate_fn
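get_validation_data also returns a module-level collate_fn (defined elsewhere in the same file), which is needed because VideoDatasetMultiClips yields several clips per video. A plausible sketch of such a collate function, assuming each dataset item is a (clips, targets) pair of equal-length lists; the actual implementation may differ:

import torch

def multi_clip_collate(batch):
    # batch: list of (clips, targets) pairs, where clips is a list of
    # [C, T, H, W] tensors and targets has one entry per clip (assumed structure).
    clips, targets = zip(*batch)
    flat_clips = [c for video_clips in clips for c in video_clips]
    flat_targets = [t for video_targets in targets for t in video_targets]
    return torch.stack(flat_clips, 0), flat_targets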
Example 4
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
Example 5
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')
        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        loader = VideoLoaderHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
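The jpg branch above relies on a frame-name formatter, lambda x: f'image_{x:05d}.jpg', which maps a frame index to a zero-padded file name:

formatter = lambda x: f'image_{x:05d}.jpg'
print(formatter(1))    # image_00001.jpg
print(formatter(123))  # image_00123.jpg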
Example 6
    def __init__(
        self,
        info_pkl_json="misc/bsl1k/info-pkls.json",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=16,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        mouthing_prob_thres=0.9,
        gpu_collation=False,
        num_last_frames=20,
        featurize_mode=False,
        featurize_mask="",
        word_data_pkl=None,
        input_type="rgb",
        pose_keys=["body", "face", "lhnd", "rhnd"],
        mask_rgb=None,
        mask_type=None,
        bsl1k_pose_subset=False,
        bsl1k_anno_key="original-mouthings",
    ):
        self.setname = setname  # train, val or test
        self.featurize_mode = featurize_mode
        self.featurize_mask = featurize_mask
        self.gpu_collation = gpu_collation
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.stride = stride
        self.input_type = input_type
        self.pose_keys = pose_keys
        self.mask_rgb = mask_rgb
        self.mask_type = mask_type

        assert self.num_in_frames == 16
        self.num_last_frames = num_last_frames
        print(f"Using only {self.num_last_frames} last frames of videos")

        with open(info_pkl_json, "r") as f:
            pkls = json.load(f)[bsl1k_anno_key]
        infofile = pkls["info"]

        self.video_folder = pkls["videos"]

        print(f"Loading {infofile}")
        data = pkl.load(open(infofile, "rb"))
        if self.input_type == "pose":
            pose_pkl = pkls["pose"]
            print(f"Loading {pose_pkl}")
            self.pose_data = pkl.load(open(pose_pkl, "rb"))
        if self.mask_rgb:
            assert bsl1k_pose_subset
            assert mask_type
        if self.mask_rgb == "face":
            face_pkl = pkls["face_bbox"]
            print(f"Loading {face_pkl}")
            self.face_data = pkl.load(open(face_pkl, "rb"))

        if bsl1k_pose_subset:  # self.mask_rgb:
            mouth_pkl = pkls["mouth_bbox"]
            print(f"Loading {mouth_pkl}")
            self.mouth_data = pkl.load(open(mouth_pkl, "rb"))

        self.set_video_metadata(data,
                                meta_key="videos",
                                fixed_sz_frames=gpu_collation)
        subset_ix = self.set_class_names(data=data,
                                         word_data_pkl=word_data_pkl)

        self.train = list(
            np.where(np.asarray(data["videos"]["split"]) == 0)[0])  # train
        self.valid = list(
            np.where(np.asarray(data["videos"]["split"]) == 2)[0])  # test
        self.videos = [s.strip() for s in data["videos"]["name"]]

        # Take subsets based on 'mouthing_prob'
        confident_mouthing = np.where(
            np.asarray(data["videos"]["mouthing_prob"]) >= mouthing_prob_thres
        )[0]
        msg = (
            f"Keeping {len(confident_mouthing)}/{len(data['videos']['mouthing_prob'])} "
            f"videos with more than {mouthing_prob_thres} mouthing confidence."
        )
        print(msg)
        self.train = [i for i in self.train if i in confident_mouthing]
        self.valid = [i for i in self.valid if i in confident_mouthing]

        print("Taking subsets according to word vocab")
        self.train = list(set(self.train).intersection(set(subset_ix)))
        self.valid = list(set(self.valid).intersection(set(subset_ix)))

        if self.input_type == "pose":
            valid_pose_ix = np.where(
                np.array([i is not None for i in self.pose_data["pose"]]))[0]
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")
            print("Taking subsets according to having pose or not")
            self.train = list(set(self.train).intersection(set(valid_pose_ix)))
            self.valid = list(set(self.valid).intersection(set(valid_pose_ix)))
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")

        if bsl1k_pose_subset:  # self.mask_rgb:
            # Valid mouth ix should be equivalent to valid face ix, so leaving this bit.
            valid_mouth_ix = np.where(
                np.array([i is not None for i in self.mouth_data]))[0]
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")
            print("Taking subsets according to having pose or not")
            self.train = list(
                set(self.train).intersection(set(valid_mouth_ix)))
            self.valid = list(
                set(self.valid).intersection(set(valid_mouth_ix)))
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")

        # Take a subset for validation if too large
        if self.setname == "val" and len(self.valid) > 1300:
            self.valid = self.valid[::int(len(self.valid) / 1300)]

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
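When evaluate_video is set, _slide_windows (defined in the shared VideoDataset base class, not shown here) expands each validation video into overlapping fixed-length clips controlled by num_in_frames and stride. A rough stand-alone sketch of that windowing logic, assuming stride is a fraction of the clip length; the real method may handle edge cases differently:

def clip_start_frames(num_frames, num_in_frames=16, stride=0.5):
    # Step between consecutive clip starts, e.g. 16 * 0.5 = 8 frames.
    step = max(1, int(num_in_frames * stride))
    last_start = max(num_frames - num_in_frames, 0)
    return list(range(0, last_start + 1, step))

print(clip_start_frames(40))  # [0, 8, 16, 24] for a 40-frame video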
Example 7
    def __init__(
        self,
        root_path="data/PHOENIX-2014-T-release-v3/PHOENIX-2014-T",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=16,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        gpu_collation=False,
        assign_labels="auto",
    ):
        self.root_path = root_path
        self.setname = setname  # train, val or test
        self.gpu_collation = gpu_collation
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.stride = stride
        self.assign_labels = assign_labels
        infofile = os.path.join(root_path, "info", "info.pkl")
        print(f"Loading {infofile}")
        data = pkl.load(open(infofile, "rb"))
        self.videos = [s.strip() for s in data["videos"]["name"]]

        other_class_ix = 1232
        self.classes = data["videos"]["gloss_ids"]
        replace_cnt = 0
        for i, seq in enumerate(self.classes):
            for j, gid in enumerate(seq):
                if gid == -1:
                    replace_cnt += 1
                    self.classes[i][j] = other_class_ix
        print(f"Replaced {replace_cnt} -1s with {other_class_ix}")
        with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f:
            self.class_names = f.read().splitlines()

        self.class_names.append("1232 __OTHER__")

        self.video_folder = "videos"
        meta_key = self.video_folder
        if gpu_collation:
            # GPU collation requires all inputs to share the same spatial input size
            self.video_folder = "videos-resized-256fps-256x256"
        self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation)

        self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0])
        if self.setname == "val":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0])
        elif self.setname == "test":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0])

        if self.assign_labels == "auto":
            self.frame_level_glosses = data["videos"]["alignments"]["gloss_id"]

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
Example 8
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None,
                      sample_t_stride=1):
    assert dataset_name in [
        'kinetics', 'mini_kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit',
        'breakfast', 'mini_breakfast', 'movingmnist',
        'movingmnist_blackframes', 'movingmnist_longterm',
        'movingmnist_motiondiff', 'movingmnist_motionsame',
        'movingmnist_frequencies', 'movingmnist_frequencies_complex',
        'something', 'movingmnist_static'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5', None]

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if 'movingmnist' in dataset_name:
            image_name_formatter = mnist_image_name_formatter
        elif 'something' in dataset_name:
            image_name_formatter = something_image_name_formatter
        else:
            image_name_formatter = usual_image_name_formatter

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)

        if 'movingmnist' in dataset_name or 'something' in dataset_name:
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / video_id)

    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()

        if dataset_name in ['kinetics', 'mini_kinetics']:
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}')
        else:
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}.hdf5')
    print("video_path_formatter", video_path_formatter)

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)

    elif dataset_name in ['kinetics', 'mini_kinetics']:
        training_data = VideoDataset(Path(
            os.path.join(video_path, "h5_train_frames")),
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    else:
        print("Building VideoDataset for", dataset_name)
        #print(spatial_transform)
        #print(temporal_transform)
        #print(loader)
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
Example 9
parser.add_argument('-evd', '--extra_validation_dataset', default=None, help='extra validation dataset')
parser.add_argument('-tdl', '--training_dataset_length', default=default_training_dataset_length, help='training dataset length', type=int)
parser.add_argument('-vdl', '--validation_dataset_length', default=default_validation_dataset_length, help='validation dataset length', type=int)
parser.add_argument('-b', '--batch', help='batch size', default=default_batch_size, type=int)
parser.add_argument('-lr', '--learning_rate', help='learning rate', default=default_learning_rate, type=float)
parser.add_argument('-ep', '--epochs', help='number of epochs', default=default_number_epochs, type=int)
parser.add_argument('-sr','--safe_reset', help="if output file exists, use that as input file to prevent lost work", action="store_true")
parser.add_argument('-su','--supervised', help="train supervised", action="store_true")
parser.add_argument('-n','--net', default="original", help="choose net name")
parser.add_argument('-m','--map', default="homographymap", help="choose map name")
parser.add_argument('-t', '--type', default="training", help="type of dataset: training or video")

args = parser.parse_args()
if args.type == "video":
    print "Video dataset"
    dataset = VideoDataset("datasets/%s" % args.training_dataset, args.training_dataset_length)
    val_dataset = VideoDataset("datasets/%s" % args.validation_dataset, args.validation_dataset_length)
else:
    print "Training dataset"
    dataset = TrainingDataset("datasets/%s" % args.training_dataset, args.training_dataset_length)
    val_dataset = TrainingDataset("datasets/%s" % args.validation_dataset, args.validation_dataset_length)

if args.extra_validation_dataset is not None:

    extra_val_dataset = VideoDataset("datasets/%s" % args.extra_validation_dataset)
else:
    extra_val_dataset = None

input_checkpoint = None if args.input is None else "checkpoints/%s" % args.input
output_checkpoint = None if args.output is None else "checkpoints/%s" % args.output
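For quick experimentation, the parser above can be exercised without a shell by passing an argv list to parse_args. A self-contained toy parser mirroring two of the options (option names copied from the script, defaults invented):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-b', '--batch', help='batch size', default=32, type=int)  # default invented
parser.add_argument('-t', '--type', default='training', help='type of dataset: training or video')

args = parser.parse_args(['-t', 'video', '-b', '16'])
print(args.type, args.batch)  # video 16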
Example 10
    def __init__(
        self,
        root_path="data/wlasl",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=64,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        ram_data=True,
        gpu_collation=False,
        use_bbox=True,
        monolithic_pkl_path="data/pickled-videos/wlasl-compressed-quality-90-resized-256x256.pkl",
        input_type="rgb",
        pose_keys=["body", "face", "lhnd", "rhnd"],
        mask_rgb=None,
        mask_type=None,
        mask_prob=1.0,
    ):
        self.root_path = root_path
        self.setname = setname  # train, val or test
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.gpu_collation = gpu_collation
        self.stride = stride
        self.use_bbox = use_bbox
        self.input_type = input_type
        self.pose_keys = pose_keys
        self.mask_rgb = mask_rgb
        self.mask_type = mask_type

        self.video_folder = "videos_360h_25fps"
        if Path(monolithic_pkl_path).exists() and ram_data:
            print(f"Loading monolithic pickle from {monolithic_pkl_path}")
            self.video_data_dict = memcache(monolithic_pkl_path)
        else:
            self.video_data_dict = None

        infofile = os.path.join(root_path, "info", "info.pkl")
        print(f"Loading {infofile}")
        data = pkl.load(open(infofile, "rb"))

        if self.input_type == "pose":
            pose_pkl = os.path.join(root_path, "info", "pose.pkl")
            print(f"Loading {pose_pkl}")
            self.pose_data = pkl.load(open(pose_pkl, "rb"))
        if self.mask_rgb:
            assert mask_type
        if self.mask_rgb == "face":
            face_pkl = os.path.join(root_path, "info", "face_bbox.pkl")
            print(f"Loading {face_pkl}")
            self.face_data = pkl.load(open(face_pkl, "rb"))

        # Use this to take subset
        if self.input_type == "pose" or self.mask_rgb:
            mouth_pkl = os.path.join(root_path, "info", "mouth_bbox.pkl")
            print(f"Loading {mouth_pkl}")
            self.mouth_data = pkl.load(open(mouth_pkl, "rb"))

        self.videos = [s.strip() for s in data["videos"]["name"]]
        self.videos = np.asarray(self.videos)

        self.classes = data["videos"]["word_id"]
        with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f:
            self.class_names = f.read().splitlines()

        meta_key = self.video_folder
        if gpu_collation and not self.video_data_dict:
            # GPU collation requires all inputs to share the same spatial input size
            self.video_folder = "videos-resized-256fps-256x256"
        self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation)

        bboxes_orig = [s for s in np.asarray(data["videos"]["box"])]
        self.bboxes = []
        for i, bb in enumerate(bboxes_orig):
            ht = data["videos"]["videos_original"]["H"][i]
            wt = data["videos"]["videos_original"]["W"][i]
            xmin, ymin, xmax, ymax = bb
            bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt]
            self.bboxes.append(bb_norm)

        self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0])
        if self.setname == "val":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0])
        elif self.setname == "test":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0])

        if self.input_type == "pose" or self.mask_rgb:
            # Valid mouth ix should be equivalent to valid face ix, valid pose ix etc
            valid_mouth_ix = np.where(
                np.array([i is not None for i in self.mouth_data])
            )[0]
            if self.setname == "val" or self.setname == "test":
                print(f"{len(self.train)} train, {len(self.valid)} val samples.")
            print("Taking subsets according to having pose or not")
            self.train = list(set(self.train).intersection(set(valid_mouth_ix)))
            if self.setname == "val" or self.setname == "test":
                self.valid = list(set(self.valid).intersection(set(valid_mouth_ix)))
                print(f"{len(self.train)} train, {len(self.valid)} val samples.")

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
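The bounding-box loop above converts pixel coordinates (xmin, ymin, xmax, ymax) into frame-relative (ymin, xmin, ymax, xmax) fractions. A small numeric illustration with invented values:

# Hypothetical 360x640 frame and a pixel-space box (values invented for illustration).
ht, wt = 360, 640
xmin, ymin, xmax, ymax = 100, 50, 500, 300

bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt]
print(bb_norm)  # [0.1389, 0.1562, 0.8333, 0.7812] (rounded)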