def test_video_classifier_finetune_from_csv(tmpdir):
    # `mock_video_csv_file` is a helper from the surrounding test module.
    with mock_video_csv_file(tmpdir) as (mock_csv, total_duration):
        half_duration = total_duration / 2 - 1e-9

        datamodule = VideoClassificationData.from_csv(
            "file",
            "target",
            train_file=mock_csv,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
            batch_size=1,
        )

        for sample in datamodule.train_dataset.data:
            expected_t_shape = 5
            assert sample["video"].shape[1] == expected_t_shape

        model = VideoClassifier(num_classes=datamodule.num_classes, pretrained=False, backbone="slow_r50")
        trainer = flash.Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=torch.cuda.device_count())
        trainer.finetune(model, datamodule=datamodule)
def test_video_classifier_finetune_fiftyone(tmpdir):
    # `mock_encoded_video_dataset_folder` is a helper from the surrounding test module.
    with mock_encoded_video_dataset_folder(tmpdir) as (
        dir_name,
        total_duration,
    ):
        half_duration = total_duration / 2 - 1e-9

        train_dataset = fo.Dataset.from_dir(
            dir_name,
            dataset_type=fo.types.VideoClassificationDirectoryTree,
        )

        datamodule = VideoClassificationData.from_fiftyone(
            train_dataset=train_dataset,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
            batch_size=1,
        )

        for sample in datamodule.train_dataset.data:
            expected_t_shape = 5
            assert sample["video"].shape[1] == expected_t_shape

        model = VideoClassifier(num_classes=datamodule.num_classes, pretrained=False, backbone="slow_r50")
        trainer = flash.Trainer(fast_dev_run=True, gpus=torch.cuda.device_count())
        trainer.finetune(model, datamodule=datamodule)
def from_kinetics(
    clip_sampler: str = "uniform",
    clip_duration: int = 1,
    decode_audio: bool = False,
    batch_size=1,
    **data_module_kwargs,
) -> VideoClassificationData:
    """Downloads and loads the Kinetics data set."""
    download_data("https://pl-flash-data.s3.amazonaws.com/kinetics.zip", "./data")
    return VideoClassificationData.from_folders(
        train_folder=os.path.join(os.getcwd(), "data/kinetics/train"),
        val_folder=os.path.join(os.getcwd(), "data/kinetics/val"),
        clip_sampler=clip_sampler,
        clip_duration=clip_duration,
        decode_audio=decode_audio,
        batch_size=batch_size,
        **data_module_kwargs,
    )
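# A minimal usage sketch for the helper above. The argument values are
# illustrative assumptions, not prescribed by the original code:
datamodule = from_kinetics(clip_sampler="uniform", clip_duration=1, batch_size=8)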
def test_video_classifier_finetune_fiftyone(tmpdir):
    with mock_encoded_video_dataset_folder(tmpdir) as (
        dir_name,
        total_duration,
    ):
        half_duration = total_duration / 2 - 1e-9

        train_dataset = fo.Dataset.from_dir(
            dir_name,
            dataset_type=fo.types.VideoClassificationDirectoryTree,
        )

        datamodule = VideoClassificationData.from_fiftyone(
            train_dataset=train_dataset,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
        )

        for sample in datamodule.train_dataset.data:
            expected_t_shape = 5
            assert sample["video"].shape[1] == expected_t_shape

        assert len(VideoClassifier.available_backbones()) > 5

        train_transform = {
            # Applied per sample on CPU after decoding to tensors.
            "post_tensor_transform": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=Compose([
                        UniformTemporalSubsample(8),
                        RandomShortSideScale(min_size=256, max_size=320),
                        RandomCrop(244),
                        RandomHorizontalFlip(p=0.5),
                    ]),
                ),
            ]),
            # Applied per batch on the training device.
            "per_batch_transform_on_device": Compose([
                ApplyTransformToKey(
                    key="video",
                    transform=K.VideoSequential(
                        K.Normalize(torch.tensor([0.45, 0.45, 0.45]), torch.tensor([0.225, 0.225, 0.225])),
                        K.augmentation.ColorJitter(0.1, 0.1, 0.1, 0.1, p=1.0),
                        data_format="BCTHW",
                        same_on_frame=False,
                    ),
                ),
            ]),
        }

        datamodule = VideoClassificationData.from_fiftyone(
            train_dataset=train_dataset,
            clip_sampler="uniform",
            clip_duration=half_duration,
            video_sampler=SequentialSampler,
            decode_audio=False,
            train_transform=train_transform,
        )

        model = VideoClassifier(num_classes=datamodule.num_classes, pretrained=False)
        trainer = flash.Trainer(fast_dev_run=True)
        trainer.finetune(model, datamodule=datamodule)
            ApplyTransformToKey(
                key="video",
                transform=K.VideoSequential(
                    *per_batch_transform_on_device,
                    data_format="BCTHW",
                    same_on_frame=False,
                ),
            ),
        ]),
    }

# 3. Load the data from directories.
datamodule = VideoClassificationData.from_folders(
    train_folder=os.path.join(flash.PROJECT_ROOT, "data/kinetics/train"),
    val_folder=os.path.join(flash.PROJECT_ROOT, "data/kinetics/val"),
    predict_folder=os.path.join(flash.PROJECT_ROOT, "data/kinetics/predict"),
    train_transform=make_transform(train_post_tensor_transform),
    val_transform=make_transform(val_post_tensor_transform),
    predict_transform=make_transform(val_post_tensor_transform),
    batch_size=8,
    clip_sampler="uniform",
    clip_duration=1,
    video_sampler=RandomSampler,
    decode_audio=False,
    num_workers=8,
)

# 4. List the available models
print(VideoClassifier.available_backbones())
# out: ['efficient_x3d_s', 'efficient_x3d_xs', ..., 'slowfast_r50', 'x3d_m', 'x3d_s', 'x3d_xs']
print(VideoClassifier.get_backbone_details("x3d_xs"))

# 5. Build the VideoClassifier with a PyTorchVideo backbone.
model = VideoClassifier(
    backbone="x3d_xs",
    num_classes=datamodule.num_classes,
)
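# A plausible continuation of the example above, sketched after the other
# Kinetics examples in this section. The Trainer settings are assumptions,
# not taken from the original:
trainer = flash.Trainer(max_epochs=3, gpus=torch.cuda.device_count())
trainer.finetune(model, datamodule=datamodule, strategy="freeze")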
import os

import torch

import flash
from flash.core.data.utils import download_data
from flash.video import VideoClassificationData, VideoClassifier

# 1. Create the DataModule
# Find more datasets at https://pytorchvideo.readthedocs.io/en/latest/data.html
download_data("https://pl-flash-data.s3.amazonaws.com/kinetics.zip", "./data")

datamodule = VideoClassificationData.from_folders(
    train_folder=os.path.join(os.getcwd(), "data/kinetics/train"),
    val_folder=os.path.join(os.getcwd(), "data/kinetics/val"),
    clip_sampler="uniform",
    clip_duration=1,
    decode_audio=False,
)

# 2. Build the task
model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes, pretrained=False)

# 3. Create the trainer and finetune the model
trainer = flash.Trainer(max_epochs=3, gpus=torch.cuda.device_count())
trainer.finetune(model, datamodule=datamodule, strategy="freeze")

# 4. Make a prediction
predictions = model.predict(os.path.join(os.getcwd(), "data/kinetics/predict"))
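# A possible wrap-up, mirroring other Flash example scripts; the checkpoint
# filename is an assumption:
print(predictions)
trainer.save_checkpoint("video_classification_model.pt")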
import flash
from flash.core.data.utils import download_data
from flash.core.integrations.labelstudio.visualizer import launch_app
from flash.video import VideoClassificationData, VideoClassifier

# 1. Download data
download_data("https://label-studio-testdata.s3.us-east-2.amazonaws.com/lightning-flash/video_data.zip")

# 2. Load export data
datamodule = VideoClassificationData.from_labelstudio(
    export_json="data/project.json",
    data_folder="data/upload/",
    val_split=0.2,
    clip_sampler="uniform",
    clip_duration=1,
    decode_audio=False,
)

# 3. Build the task
model = VideoClassifier(
    backbone="slow_r50",
    num_classes=datamodule.num_classes,
)

# 4. Create the trainer and finetune the model
trainer = flash.Trainer(max_epochs=3)
trainer.finetune(model, datamodule=datamodule, strategy="freeze")

# 5. Make a prediction
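# Step 5 above is empty in the excerpt. A hedged sketch of what it could look
# like: predict on the uploaded videos and view the results through the Label
# Studio visualizer via `launch_app` (imported above). The predict path and the
# `show_predictions` call are assumptions:
predictions = model.predict("data/upload/")
app = launch_app(datamodule)
print(app.show_predictions(predictions))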
import torch

import flash
from flash.core.data.utils import download_data
from flash.video import VideoClassificationData, VideoClassifier

# 1. Create the DataModule
# Find more datasets at https://pytorchvideo.readthedocs.io/en/latest/data.html
download_data("https://pl-flash-data.s3.amazonaws.com/kinetics.zip", "./data")

datamodule = VideoClassificationData.from_folders(
    train_folder="data/kinetics/train",
    val_folder="data/kinetics/val",
    clip_sampler="uniform",
    clip_duration=1,
    decode_audio=False,
    batch_size=1,
)

# 2. Build the task
model = VideoClassifier(backbone="x3d_xs", labels=datamodule.labels, pretrained=False)

# 3. Create the trainer and finetune the model
trainer = flash.Trainer(
    max_epochs=1,
    gpus=torch.cuda.device_count(),
    strategy="ddp" if torch.cuda.device_count() > 1 else None,
)
trainer.finetune(model, datamodule=datamodule, strategy="freeze")

# 4. Make a prediction
datamodule = VideoClassificationData.from_folders(predict_folder="data/kinetics/predict", batch_size=1)
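# The excerpt stops before the prediction itself. In this newer API the
# prediction runs through the Trainer; a sketch, where the `output` value and
# checkpoint filename are assumptions:
predictions = trainer.predict(model, datamodule=datamodule, output="labels")
print(predictions)
trainer.save_checkpoint("video_classification_model.pt")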
            ApplyTransformToKey(
                key="video",
                transform=K.VideoSequential(
                    *per_batch_transform_on_device,
                    data_format="BCTHW",
                    same_on_frame=False,
                ),
            ),
        ]),
    }

# 3. Load the data from directories.
datamodule = VideoClassificationData.from_paths(
    train_data_path=os.path.join(_PATH_ROOT, "data/kinetics/train"),
    val_data_path=os.path.join(_PATH_ROOT, "data/kinetics/val"),
    predict_data_path=os.path.join(_PATH_ROOT, "data/kinetics/predict"),
    clip_sampler="uniform",
    clip_duration=2,
    video_sampler=RandomSampler,
    decode_audio=False,
    train_transform=make_transform(train_post_tensor_transform),
    val_transform=make_transform(val_post_tensor_transform),
    predict_transform=make_transform(val_post_tensor_transform),
    num_workers=8,
    batch_size=8,
)

# 4. List the available models
print(VideoClassifier.available_models())
# out: ['efficient_x3d_s', 'efficient_x3d_xs', ..., 'slowfast_r50', 'x3d_m', 'x3d_s', 'x3d_xs']
print(VideoClassifier.get_model_details("x3d_xs"))

# 5. Build the model - `x3d_xs` comes with `nn.Softmax` by default for its `head_activation`.
model = VideoClassifier(model="x3d_xs", num_classes=datamodule.num_classes)
model.serializer = Labels()
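# With the Labels serializer attached, predictions come back as class names
# rather than raw logits. A hedged usage sketch for this older API, assuming
# the predict path mirrors the directories configured above:
predictions = model.predict(os.path.join(_PATH_ROOT, "data/kinetics/predict"))
print(predictions)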