def default_transforms(self) -> Dict[str, Callable]:
    if self.training:
        post_tensor_transform = [
            RandomShortSideScale(min_size=256, max_size=320),
            RandomCrop(244),
            RandomHorizontalFlip(p=0.5),
        ]
    else:
        post_tensor_transform = [
            ShortSideScale(256),
        ]

    return {
        "post_tensor_transform": Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([UniformTemporalSubsample(8)] + post_tensor_transform),
            ),
        ]),
        "per_batch_transform_on_device": Compose([
            ApplyTransformToKey(
                key="video",
                transform=K.VideoSequential(
                    K.Normalize(torch.tensor([0.45, 0.45, 0.45]), torch.tensor([0.225, 0.225, 0.225])),
                    data_format="BCTHW",
                    same_on_frame=False,
                ),
            ),
        ]),
    }
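# A standalone sketch of the "per_batch_transform_on_device" stage above, assuming
# `K` is `kornia.augmentation`. VideoSequential applies per-frame image augmentations
# to an already-batched clip tensor, which is why it takes data_format="BCTHW".
import torch
import kornia.augmentation as K

aug = K.VideoSequential(
    K.Normalize(torch.tensor([0.45, 0.45, 0.45]), torch.tensor([0.225, 0.225, 0.225])),
    data_format="BCTHW",
    same_on_frame=False,
)
batch = torch.rand(2, 3, 8, 224, 224)  # dummy batch: 2 clips, 3 channels, 8 frames
out = aug(batch)                       # same shape; normalized per channel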
def _video_transform(self, mode: str):
    """
    This function contains example transforms using both PyTorchVideo and
    TorchVision in the same Callable. For 'train' mode, we use augmentations
    (prepended with 'Random'); for 'val' mode we use the respective
    deterministic function.
    """
    args = self.args
    return ApplyTransformToKey(
        key="video",
        transform=Compose(
            [
                UniformTemporalSubsample(args.video_num_subsampled),
                Normalize(args.video_means, args.video_stds),
            ]
            + (
                [
                    RandomShortSideScale(
                        min_size=args.video_min_short_side_scale,
                        max_size=args.video_max_short_side_scale,
                    ),
                    RandomCrop(args.video_crop_size),
                    RandomHorizontalFlip(p=args.video_horizontal_flip_p),
                ]
                if mode == "train"
                else [
                    ShortSideScale(args.video_min_short_side_scale),
                    CenterCrop(args.video_crop_size),
                ]
            )
        ),
    )
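# Hypothetical usage of the Callable returned above (`module` stands in for the
# class instance that defines `_video_transform`; the path and times are placeholders).
# The transform only touches the "video" entry of the clip dict and leaves other
# keys such as "audio" or "label" untouched.
from pytorchvideo.data.encoded_video import EncodedVideo

video = EncodedVideo.from_path("example.mp4")   # placeholder path
clip = video.get_clip(start_sec=0, end_sec=2)   # {"video": (C, T, H, W) tensor, ...}
train_transform = module._video_transform(mode="train")
clip = train_transform(clip)                    # "video" is now subsampled, scaled, cropped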
def test_compose_with_video_transforms(self):
    video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(dtype=torch.float32)
    test_clip = {"video": video, "label": 0}

    # Compose using torchvision and pytorchvideo transforms to ensure they
    # interact correctly.
    num_subsample = 10
    transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(num_subsample),
                NormalizeVideo([video.mean()] * 3, [video.std()] * 3),
                RandomShortSideScale(min_size=15, max_size=25),
                RandomCropVideo(10),
                RandomHorizontalFlipVideo(p=0.5),
            ]),
        )
    ])

    actual = transform(test_clip)
    c, t, h, w = actual["video"].shape
    self.assertEqual(c, 3)
    self.assertEqual(t, num_subsample)
    self.assertEqual(h, 10)
    self.assertEqual(w, 10)
def get_tfms(self):
    tfms_list = [
        UniformTemporalSubsample(self.transform_params["num_frames"]),
        Lambda(lambda x: x / 255.0),
        Normalize(self.mean, self.std),
    ]
    if self.resize:
        tfms_list += [
            ShortSideScale(size=self.transform_params["side_size"]),
            CenterCropVideo(crop_size=(self.transform_params["crop_size"],
                                       self.transform_params["crop_size"])),
        ]

    # Note that this transform is specific to the x3d model.
    tfms = ApplyTransformToKey(
        key="video",
        transform=Compose(tfms_list),
    )

    # The duration of the input clip is also specific to the model.
    clip_duration = (
        self.transform_params["num_frames"] * self.transform_params["sampling_rate"]
    ) / self.frames_per_second

    return tfms, clip_duration
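# Illustrative arithmetic only (values assumed, not taken from self.transform_params):
# with x3d_m-style settings of num_frames=16 and sampling_rate=5 on 30 fps source
# video, the clip duration returned above evaluates to 16 * 5 / 30 ≈ 2.67 seconds.
example_clip_duration = (16 * 5) / 30  # ~2.67 s of source video per sampled clip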
def make_transform(
    post_tensor_transform: List[Callable] = post_tensor_transform,
    per_batch_transform_on_device: List[Callable] = per_batch_transform_on_device,
):
    return {
        "post_tensor_transform": Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose(post_tensor_transform),
            ),
        ]),
        "per_batch_transform_on_device": Compose([
            ApplyTransformToKey(
                key="video",
                transform=K.VideoSequential(
                    *per_batch_transform_on_device,
                    data_format="BCTHW",
                    same_on_frame=False,
                ),
            ),
        ]),
    }
def __init__(self, train_paths, val_paths, clip_duration: int = 2,
             batch_size: int = 4, num_workers: int = 2, **kwargs):
    super().__init__()
    self.train_paths = train_paths
    self.val_paths = val_paths
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.clip_duration = clip_duration
    self.num_labels = len({path[1] for path in train_paths._paths_and_labels})
    for k, v in kwargs.items():
        setattr(self, k, v)

    self.train_transforms = ApplyTransformToKey(
        key='video',
        transform=Compose([
            UniformTemporalSubsample(8),
            Lambda(lambda x: x / 255.0),
            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
            RandomShortSideScale(min_size=256, max_size=320),
            RandomCrop(224),
            RandomHorizontalFlip(p=0.5),
        ]))

    self.val_transforms = ApplyTransformToKey(
        key='video',
        transform=Compose([
            UniformTemporalSubsample(8),
            Lambda(lambda x: x / 255.0),
            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
            ShortSideScale(256),
            CenterCrop(224),
        ]))
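# A hypothetical companion method for the DataModule above (not part of the original
# snippet), assuming `self.train_paths` is a pytorchvideo LabeledVideoPaths object.
# It pairs the train transforms with a random clip sampler of `self.clip_duration`.
import torch
from torch.utils.data import DataLoader
from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler

def train_dataloader(self):
    dataset = LabeledVideoDataset(
        labeled_video_paths=self.train_paths,
        clip_sampler=make_clip_sampler("random", self.clip_duration),
        video_sampler=torch.utils.data.RandomSampler,
        transform=self.train_transforms,
        decode_audio=False,
    )
    # LabeledVideoDataset is an IterableDataset, so no sampler is passed to DataLoader.
    return DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers)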
def get_transform():
    transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(8),
                # Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                RandomShortSideScale(min_size=256, max_size=320),
                RandomCrop(244),
                RandomHorizontalFlip(p=0.5),
            ]),
        ),
    ])
    return transform
def _audio_transform(self):
    """
    This function contains example transforms using both PyTorchVideo and
    TorchAudio in the same Callable.
    """
    args = self.args
    n_fft = int(
        float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size
    )
    hop_length = int(
        float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size
    )
    eps = 1e-10
    return ApplyTransformToKey(
        key="audio",
        transform=Compose(
            [
                Resample(
                    orig_freq=args.audio_raw_sample_rate,
                    new_freq=args.audio_resampled_rate,
                ),
                MelSpectrogram(
                    sample_rate=args.audio_resampled_rate,
                    n_fft=n_fft,
                    hop_length=hop_length,
                    n_mels=args.audio_num_mels,
                    center=False,
                ),
                Lambda(lambda x: x.clamp(min=eps)),
                Lambda(torch.log),
                UniformTemporalSubsample(args.audio_mel_num_subsample),
                Lambda(lambda x: x.transpose(1, 0)),  # (F, T) -> (T, F)
                Lambda(
                    lambda x: x.view(1, x.size(0), 1, x.size(1))
                ),  # (T, F) -> (1, T, 1, F)
                Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)),
            ]
        ),
    )
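# Illustrative arithmetic only (placeholder values, not the script's actual args):
# with a 16 kHz resampled rate, a 32 ms mel window and a 16 ms step, the derived
# STFT parameters above become:
example_n_fft = int(16000 / 1000 * 32)       # 512 samples per window
example_hop_length = int(16000 / 1000 * 16)  # 256 samples between windows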
def test_video_classifier_finetune_fiftyone(tmpdir):
    with mock_encoded_video_dataset_folder(tmpdir) as (
        dir_name,
        total_duration,
    ):
        half_duration = total_duration / 2 - 1e-9

        train_dataset = fo.Dataset.from_dir(
            dir_name,
            dataset_type=fo.types.VideoClassificationDirectoryTree,
        )

        datamodule = VideoClassificationData.from_fiftyone(
            train_dataset=train_dataset,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
        )

        for sample in datamodule.train_dataset.data:
            expected_t_shape = 5
            assert sample["video"].shape[1] == expected_t_shape

        assert len(VideoClassifier.available_backbones()) > 5

        train_transform = {
            "post_tensor_transform": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=Compose([
                        UniformTemporalSubsample(8),
                        RandomShortSideScale(min_size=256, max_size=320),
                        RandomCrop(244),
                        RandomHorizontalFlip(p=0.5),
                    ]),
                ),
            ]),
            "per_batch_transform_on_device": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=K.VideoSequential(
                        K.Normalize(torch.tensor([0.45, 0.45, 0.45]), torch.tensor([0.225, 0.225, 0.225])),
                        K.augmentation.ColorJitter(0.1, 0.1, 0.1, 0.1, p=1.0),
                        data_format="BCTHW",
                        same_on_frame=False,
                    ),
                ),
            ]),
        }

        datamodule = VideoClassificationData.from_fiftyone(
            train_dataset=train_dataset,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
            train_transform=train_transform,
        )

        model = VideoClassifier(num_classes=datamodule.num_classes, pretrained=False)
        trainer = flash.Trainer(fast_dev_run=True)
        trainer.finetune(model, datamodule=datamodule)
def Ptvssv2(cfg, mode):
    """
    Construct the PyTorchVideo Something-Something v2 (SSv2) video loader.
    Load SSv2 data (frame paths, labels, etc.) into an SSv2 Dataset object.
    The dataset can be downloaded from the official Something-Something website
    (https://20bn.com/datasets/something-something).
    Please see datasets/DATASET.md for more information about the data format.
    For training and validation, a single clip is randomly sampled from every
    video with random cropping and scaling. For testing, multiple clips are
    uniformly sampled from every video with uniform cropping. For uniform
    cropping, we take the left, center, and right crop if the width is larger
    than the height, or the top, center, and bottom crop if the height is
    larger than the width.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options include `train`, `val`, or `test` mode.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)
    logger.info("Constructing Ptvssv2 {}...".format(mode))

    if mode in ["train", "val"]:
        num_clips = 1
        num_crops = 1

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    RandomShortSideScale(
                        min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                        max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                    ),
                    RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                    Lambda(rgb2bgr),
                ] + ([RandomHorizontalFlipVideo(p=0.5)] if cfg.DATA.RANDOM_FLIP else [])
                  + [PackPathway(cfg)]),
            ),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            1,  # Put arbitrary duration as ssv2 always needs full video clip.
            num_clips,
            num_crops,
        )

        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            video_sampler = (RandomSampler if mode == "train" else SequentialSampler)
    else:
        assert cfg.TEST.NUM_ENSEMBLE_VIEWS == 1
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    ShortSideScale(size=cfg.DATA.TEST_CROP_SIZE),
                ]),
            ),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [Lambda(rgb2bgr), PackPathway(cfg)],
                ),
            ),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            1,  # Put arbitrary duration as ssv2 always needs full video clip.
            num_clips,
            num_crops,
        )
        video_sampler = (DistributedSampler if cfg.NUM_GPUS > 1 else SequentialSampler)

    label_name_file = os.path.join(cfg.DATA.PATH_TO_DATA_DIR,
                                   "something-something-v2-labels.json")
    video_label_file = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR,
        "something-something-v2-{}.json".format(
            "train" if mode == "train" else "validation"),
    )
    data_path = os.path.join(
        cfg.DATA.PATH_TO_DATA_DIR,
        "{}.csv".format("train" if mode == "train" else "val"),
    )

    dataset = SSv2(
        label_name_file=label_name_file,
        video_label_file=video_label_file,
        video_path_label_file=data_path,
        clip_sampler=clip_sampler,
        video_sampler=video_sampler,
        transform=transform,
        video_path_prefix=cfg.DATA.PATH_PREFIX,
        frames_per_clip=cfg.DATA.NUM_FRAMES,
        rand_sample_frames=mode == "train",
    )

    logger.info("Constructing ssv2 dataloader (size: {}) from {}".format(
        len(dataset._path_to_videos), data_path))

    return PTVDatasetWrapper(
        num_videos=len(dataset._path_to_videos),
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=dataset,
    )
def Ptvcharades(cfg, mode):
    """
    Construct the PyTorchVideo Charades video loader.
    Load Charades data (frame paths, labels, etc.) into a Charades Dataset
    object. The dataset can be downloaded from the official Charades website
    (https://allenai.org/plato/charades/).
    Please see datasets/DATASET.md for more information about the data format.
    For `train` and `val` mode, a single clip is randomly sampled from every
    video with random cropping, scaling, and flipping. For `test` mode,
    multiple clips are uniformly sampled from every video with center cropping.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options include `train`, `val`, or `test` mode.
            For the train and val mode, the data loader will take data
            from the train or val set, and sample one clip per video.
            For the test mode, the data loader will take data from the test
            set, and sample multiple clips per video.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)
    logger.info("Constructing Ptvcharades {}...".format(mode))

    clip_duration = (
        (cfg.DATA.NUM_FRAMES - 1) * cfg.DATA.SAMPLING_RATE + 1
    ) / cfg.DATA.TARGET_FPS

    if mode in ["train", "val"]:
        num_clips = 1
        num_crops = 1

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    RandomShortSideScale(
                        min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                        max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                    ),
                    RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                    Lambda(rgb2bgr),
                ] + ([RandomHorizontalFlipVideo(p=0.5)] if cfg.DATA.RANDOM_FLIP else [])
                  + [PackPathway(cfg)]),
            ),
            Lambda(
                functools.partial(
                    process_charades_label,
                    mode=mode,
                    num_classes=cfg.MODEL.NUM_CLASSES,
                )),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler("random", clip_duration)

        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            video_sampler = (RandomSampler if mode == "train" else SequentialSampler)
    else:
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    ShortSideScale(size=cfg.DATA.TEST_CROP_SIZE),
                ]),
            ),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            Lambda(
                functools.partial(
                    process_charades_label,
                    mode=mode,
                    num_classes=cfg.MODEL.NUM_CLASSES,
                )),
            ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [Lambda(rgb2bgr), PackPathway(cfg)],
                ),
            ),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            num_clips,
            num_crops,
        )
        video_sampler = (DistributedSampler if cfg.NUM_GPUS > 1 else SequentialSampler)

    data_path = os.path.join(cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(mode))

    dataset = Charades(
        data_path=data_path,
        clip_sampler=clip_sampler,
        video_sampler=video_sampler,
        transform=transform,
        video_path_prefix=cfg.DATA.PATH_PREFIX,
        frames_per_clip=cfg.DATA.NUM_FRAMES,
    )

    logger.info("Constructing charades dataloader (size: {}) from {}".format(
        len(dataset._path_to_videos), data_path))

    return PTVDatasetWrapper(
        num_videos=len(dataset._path_to_videos),
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=dataset,
    )
def Ptvkinetics(cfg, mode):
    """
    Construct the Kinetics video loader with a given csv file. The format of
    the csv file is:
    ```
    path_to_video_1 label_1
    path_to_video_2 label_2
    ...
    path_to_video_N label_N
    ```
    For `train` and `val` mode, a single clip is randomly sampled from every
    video with random cropping, scaling, and flipping. For `test` mode,
    multiple clips are uniformly sampled from every video with center cropping.
    Args:
        cfg (CfgNode): configs.
        mode (string): Options include `train`, `val`, or `test` mode.
            For the train and val mode, the data loader will take data
            from the train or val set, and sample one clip per video.
            For the test mode, the data loader will take data from the test
            set, and sample multiple clips per video.
    """
    # Only support train, val, and test mode.
    assert mode in [
        "train",
        "val",
        "test",
    ], "Split '{}' not supported".format(mode)
    logger.info("Constructing Ptvkinetics {}...".format(mode))

    clip_duration = (cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE / cfg.DATA.TARGET_FPS)

    path_to_file = os.path.join(cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(mode))
    labeled_video_paths = LabeledVideoPaths.from_path(path_to_file)
    num_videos = len(labeled_video_paths)
    labeled_video_paths.path_prefix = cfg.DATA.PATH_PREFIX
    logger.info("Constructing kinetics dataloader (size: {}) from {}".format(
        num_videos, path_to_file))

    if mode in ["train", "val"]:
        num_clips = 1
        num_crops = 1

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    RandomShortSideScale(
                        min_size=cfg.DATA.TRAIN_JITTER_SCALES[0],
                        max_size=cfg.DATA.TRAIN_JITTER_SCALES[1],
                    ),
                    RandomCropVideo(cfg.DATA.TRAIN_CROP_SIZE),
                ] + ([RandomHorizontalFlipVideo(p=0.5)] if cfg.DATA.RANDOM_FLIP else [])
                  + [PackPathway(cfg)]),
            ),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler("random", clip_duration)

        if cfg.NUM_GPUS > 1:
            video_sampler = DistributedSampler
        else:
            video_sampler = (RandomSampler if mode == "train" else SequentialSampler)
    else:
        num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS
        num_crops = cfg.TEST.NUM_SPATIAL_CROPS

        transform = Compose([
            ApplyTransformToKey(
                key="video",
                transform=Compose([
                    UniformTemporalSubsample(cfg.DATA.NUM_FRAMES),
                    Lambda(div255),
                    NormalizeVideo(cfg.DATA.MEAN, cfg.DATA.STD),
                    ShortSideScale(size=cfg.DATA.TRAIN_JITTER_SCALES[0]),
                ]),
            ),
            UniformCropVideo(size=cfg.DATA.TEST_CROP_SIZE),
            ApplyTransformToKey(key="video", transform=PackPathway(cfg)),
            DictToTuple(num_clips, num_crops),
        ])

        clip_sampler = make_clip_sampler(
            "constant_clips_per_video",
            clip_duration,
            num_clips,
            num_crops,
        )
        video_sampler = (DistributedSampler if cfg.NUM_GPUS > 1 else SequentialSampler)

    return PTVDatasetWrapper(
        num_videos=num_videos,
        clips_per_video=num_clips,
        crops_per_clip=num_crops,
        dataset=LabeledVideoDataset(
            labeled_video_paths=labeled_video_paths,
            clip_sampler=clip_sampler,
            video_sampler=video_sampler,
            transform=transform,
            decode_audio=False,
        ),
    )
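# Illustrative comparison of the two clip-duration formulas, with placeholder values
# (NUM_FRAMES=8, SAMPLING_RATE=8, TARGET_FPS=30, not taken from any real config):
kinetics_clip_duration = 8 * 8 / 30              # ~2.13 s, as computed in Ptvkinetics
charades_clip_duration = ((8 - 1) * 8 + 1) / 30  # 1.9 s, the exact span of 8 frames
                                                 # sampled every 8 frames (Ptvcharades)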
# Input Transform
# Note: these parameters are specific to the slow_r50 model!
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 8
sampling_rate = 8
frames_per_second = 30

transform = ApplyTransformToKey(
    key='video',
    transform=Compose([
        UniformTemporalSubsample(num_frames),
        Lambda(lambda x: x / 255.0),
        NormalizeVideo(mean, std),
        ShortSideScale(size=side_size),
        CenterCropVideo(crop_size=(crop_size, crop_size)),
    ]))

clip_duration = (num_frames * sampling_rate) / frames_per_second

# Load Video
video_path = 'archery.mp4'
start_sec = 0
end_sec = start_sec + clip_duration
video = EncodedVideo.from_path(video_path)
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
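# A minimal inference sketch continuing the snippet above, assuming the slow_r50
# model from torch.hub (the top-k handling is illustrative, not from the original).
import torch

model = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True)
model = model.eval()

inputs = transform(video_data)["video"]  # (C, T, H, W) after subsample/scale/crop
with torch.no_grad():
    preds = model(inputs[None, ...])     # add batch dim -> (1, num_classes) logits
top5_class_ids = preds.topk(k=5).indices[0]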
"crop_size": 256, "num_frames": 16, "sampling_rate": 5, } } # Get transform parameters based on model transform_params = model_transform_params[model_name] # Note that this transform is specific to the slow_R50 model. transform = ApplyTransformToKey( key="video", transform=Compose([ UniformTemporalSubsample(transform_params["num_frames"]), Lambda(lambda x: x / 255.0), NormalizeVideo(mean, std), ShortSideScale(size=transform_params["side_size"]), CenterCropVideo(crop_size=(transform_params["crop_size"], transform_params["crop_size"])) ]), ) # The duration of the input clip is also specific to the model. clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"]) / frames_per_second def x3dpred(video): # Select the duration of the clip to load by specifying the start and end duration # The start_sec should correspond to where the action occurs in the video