def main(local_rank, c10d_backend, rdzv_init_url, max_world_size, classy_args):
    torch.manual_seed(0)
    set_video_backend(classy_args.video_backend)

    # Loads config, sets up task
    config = load_json(classy_args.config_file)
    task = build_task(config)

    # Load checkpoint, if available
    checkpoint = load_checkpoint(classy_args.checkpoint_folder)
    task.set_checkpoint(checkpoint)

    pretrained_checkpoint = load_checkpoint(classy_args.pretrained_checkpoint_folder)
    if pretrained_checkpoint is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_checkpoint)

    hooks = [
        LossLrMeterLoggingHook(classy_args.log_freq),
        ModelComplexityHook(),
        TimeMetricsHook(),
    ]
    if classy_args.checkpoint_folder != "":
        args_dict = vars(classy_args)
        args_dict["config"] = config
        hooks.append(
            CheckpointHook(
                classy_args.checkpoint_folder,
                args_dict,
                checkpoint_period=classy_args.checkpoint_period,
            )
        )
    if classy_args.profiler:
        hooks.append(ProfilerHook())

    task.set_hooks(hooks)

    assert c10d_backend == Backend.NCCL or c10d_backend == Backend.GLOO
    if c10d_backend == torch.distributed.Backend.NCCL:
        # needed to enable NCCL error handling
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

    coordinator = CoordinatorP2P(
        c10d_backend=c10d_backend,
        init_method=rdzv_init_url,
        max_num_trainers=max_world_size,
        process_group_timeout=60000,
    )
    trainer = ElasticTrainer(
        use_gpu=classy_args.device == "gpu",
        num_dataloader_workers=classy_args.num_workers,
        local_rank=local_rank,
        elastic_coordinator=coordinator,
        input_args={},
    )
    trainer.train(task)
def test_video_dataset_from_folder(self):
    self.create_dataset()
    # iterate through different backends
    for backend in ['pyav', 'video_reader']:
        torchvision.set_video_backend(backend)
        # create dataset
        dataset = VideoDataset(self.input_dir, extensions=self.extensions)
        # __len__
        self.assertEqual(len(dataset), self.n_frames_per_video * self.n_videos)
        # __getitem__
        for i in range(len(dataset)):
            frame, label = dataset[i]
            self.assertIsInstance(frame, PIL.Image.Image)
            self.assertEqual(label, i // self.n_frames_per_video)
        # get_filename
        for i in range(len(dataset)):
            frame, label = dataset[i]
            filename = dataset.get_filename(i)
            print(filename)
            self.assertTrue(
                filename.endswith(
                    f"-{(i % self.n_frames_per_video):02d}-avi.png"))
    shutil.rmtree(self.input_dir)
def test_video_similar_timestamps_for_different_backends(self):
    if not VIDEO_DATASET_AVAILABLE:
        warnings.warn(
            'Did not test video dataset because of missing requirements')
        return
    self.create_dataset()
    timestamps = []
    offsets = []
    backends = []
    # iterate through different backends
    for backend in ['pyav', 'video_reader']:
        torchvision.set_video_backend(backend)
        _, video_timestamps, video_offsets, _ = \
            _make_dataset(self.input_dir, extensions=self.extensions)
        timestamps.append(video_timestamps)
        offsets.append(video_offsets)
        backends.append(backend)
    # make sure backends don't match (sanity check)
    self.assertNotEqual(backends[0], backends[1])
    # we expect the same timestamps and offsets
    self.assertEqual(timestamps[0], timestamps[1])
    self.assertEqual(offsets[0], offsets[1])
    shutil.rmtree(self.input_dir)
def test_video_similar_timestamps_for_different_backends(self):
    self.create_dataset()
    timestamps = []
    offsets = []
    backends = []
    # iterate through different backends
    for backend in ['pyav', 'video_reader']:
        torchvision.set_video_backend(backend)
        _, video_timestamps, video_offsets, _ = \
            _make_dataset(self.input_dir, extensions=self.extensions)
        timestamps.append(video_timestamps)
        offsets.append(video_offsets)
        backends.append(backend)
    # make sure backends don't match (sanity check)
    self.assertNotEqual(backends[0], backends[1])
    # we expect the same timestamps and offsets
    self.assertEqual(timestamps[0], timestamps[1])
    self.assertEqual(offsets[0], offsets[1])
    shutil.rmtree(self.input_dir)
def test_invalid_file(self):
    set_video_backend("video_reader")
    with pytest.raises(RuntimeError):
        io.read_video("foo.mp4")

    set_video_backend("pyav")
    with pytest.raises(RuntimeError):
        io.read_video("foo.mp4")
def test_invalid_file(self):
    set_video_backend('video_reader')
    with self.assertRaises(RuntimeError):
        io.read_video('foo.mp4')

    set_video_backend('pyav')
    with self.assertRaises(RuntimeError):
        io.read_video('foo.mp4')
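# A minimal illustration of why both variants above call set_video_backend
# before each read: the backend choice is process-global state in torchvision
# and persists across subsequent io.read_video calls until changed.
# get_video_backend is the real accessor for that state; nothing else here is
# assumed beyond the public torchvision API.
import torchvision
from torchvision import set_video_backend

set_video_backend('pyav')
assert torchvision.get_video_backend() == 'pyav'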
def _worker_init_fn(self, worker_id):
    # we need to set video backend in the worker process explicitly
    # because the global variable `_video_backend` in TorchVision will
    # always start with the default value `pyav` when multiprocessing
    # context other than `fork` is used, and it won't inherit the value of
    # `_video_backend` in the main process
    from torchvision import set_video_backend
    set_video_backend(self.video_backend)
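# A minimal sketch of how such a hook might be wired into a DataLoader.
# BackendAwareLoaderFactory is a hypothetical name; DataLoader's
# worker_init_fn parameter and torchvision's get/set_video_backend are
# real APIs.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision import get_video_backend, set_video_backend


class BackendAwareLoaderFactory:
    """Hypothetical wrapper that pins the video backend in each worker."""

    def __init__(self, video_backend):
        self.video_backend = video_backend

    def _worker_init_fn(self, worker_id):
        # Runs once inside every freshly started worker process; under the
        # `spawn` start method a worker begins with the default 'pyav'
        # backend, so the parent's choice is re-applied here.
        set_video_backend(self.video_backend)


factory = BackendAwareLoaderFactory(video_backend=get_video_backend())
dataset = TensorDataset(torch.arange(8))  # stand-in dataset
loader = DataLoader(dataset, num_workers=2,
                    worker_init_fn=factory._worker_init_fn)
for (batch,) in loader:  # each worker runs _worker_init_fn before producing
    pass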
def test_audio_present(self):
    """Test if audio frames are returned with video_reader backend."""
    set_video_backend('video_reader')
    for test_video, _ in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        container = av.open(full_path)
        if container.streams.audio:
            _, audio, _ = io.read_video(full_path)
            self.assertGreaterEqual(audio.shape[0], 1)
            self.assertGreaterEqual(audio.shape[1], 1)
def test_audio_present_sec(self, test_video, backend, start_offset, end_offset):
    """Test if audio frames are returned with sec unit."""
    full_path = os.path.join(VIDEO_DIR, test_video)
    container = av.open(full_path)
    if container.streams.audio:
        set_video_backend(backend)
        _, audio, _ = io.read_video(full_path, start_offset, end_offset,
                                    pts_unit="sec")
        assert all([dimension > 0 for dimension in audio.shape[:2]])
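# This variant receives its inputs as arguments rather than looping over them,
# which suggests pytest parametrization. A sketch of stacked
# @pytest.mark.parametrize decorators that could drive it; the parameter grid
# is borrowed from the loop-based variant of this test further down, and
# test_videos stands in for the module-level dict used by these tests.
import pytest

test_videos = {"example.mp4": None}  # placeholder


@pytest.mark.parametrize("test_video", test_videos.keys())
@pytest.mark.parametrize("backend", ["video_reader", "pyav"])
@pytest.mark.parametrize("start_offset", [0, 0.1])
@pytest.mark.parametrize("end_offset", [0.3, None])
def test_audio_present_sec(test_video, backend, start_offset, end_offset):
    ...  # body as above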
def main(args, config):
    # Global flags
    torch.manual_seed(0)
    set_image_backend(args.image_backend)
    set_video_backend(args.video_backend)

    task = build_task(config)

    # Load checkpoint, if available.
    checkpoint = load_checkpoint(args.checkpoint_load_path)
    task.set_checkpoint(checkpoint)

    # Load a checkpoint containing a pre-trained model. This is how we
    # implement fine-tuning of existing models.
    pretrained_checkpoint = load_checkpoint(args.pretrained_checkpoint_path)
    if pretrained_checkpoint is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_checkpoint)

    # Configure hooks to do tensorboard logging, checkpoints and so on
    task.set_hooks(configure_hooks(args, config))

    use_gpu = None
    if args.device is not None:
        use_gpu = args.device == "gpu"
        assert torch.cuda.is_available() or not use_gpu, "CUDA is unavailable"

    # LocalTrainer is used for a single node. DistributedTrainer will setup
    # training to use PyTorch's DistributedDataParallel.
    trainer_class = {
        "none": LocalTrainer,
        "ddp": DistributedTrainer,
    }[args.distributed_backend]
    trainer = trainer_class(use_gpu=use_gpu,
                            num_dataloader_workers=args.num_workers)

    logging.info(f"Starting training on rank {get_rank()} worker. "
                 f"World size is {get_world_size()}")

    # That's it! When this call returns, training is done.
    trainer.train(task)

    output_folder = Path(args.checkpoint_folder).resolve()
    logging.info("Training successful!")
    logging.info(
        f'Results of this training run are available at: "{output_folder}"')
def test_metadata(self):
    """
    Test that the metadata returned via pyav corresponds to the one
    returned by the new video decoder API
    """
    torchvision.set_video_backend("pyav")
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        reader = Video(full_path, "video")
        reader_md = reader.get_metadata()
        self.assertAlmostEqual(
            config.video_fps, reader_md["video"]["fps"][0], delta=0.0001
        )
        self.assertAlmostEqual(
            config.duration, reader_md["video"]["duration"][0], delta=0.5
        )
def test_audio_present_sec(self):
    """Test if audio frames are returned with sec unit."""
    backends = ["video_reader", "pyav"]
    start_offsets = [0, 0.1]
    end_offsets = [0.3, None]
    for test_video, _ in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        container = av.open(full_path)
        if container.streams.audio:
            for backend, start_offset, end_offset in itertools.product(
                    backends, start_offsets, end_offsets):
                set_video_backend(backend)
                _, audio, _ = io.read_video(full_path, start_offset,
                                            end_offset, pts_unit="sec")
                assert all(
                    [dimension > 0 for dimension in audio.shape[:2]])
def main(args, config):
    # Global flags
    torch.manual_seed(0)
    set_image_backend(args.image_backend)
    set_video_backend(args.video_backend)

    task = build_task(config)

    # Load checkpoint, if available.
    if args.checkpoint_load_path:
        task.set_checkpoint(args.checkpoint_load_path)

    # Load a checkpoint containing a pre-trained model. This is how we
    # implement fine-tuning of existing models.
    if args.pretrained_checkpoint_path:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(args.pretrained_checkpoint_path)

    # Configure hooks to do tensorboard logging, checkpoints and so on.
    # `configure_hooks` adds default hooks, while extra hooks can be specified
    # in the config file and stored in `task.hooks`. Here, we merge them when
    # we set the final hooks of the task.
    task.set_hooks(configure_hooks(args, config) + task.hooks)

    # LocalTrainer is used for a single replica. DistributedTrainer will setup
    # training to use PyTorch's DistributedDataParallel.
    trainer_class = {
        "none": LocalTrainer,
        "ddp": DistributedTrainer,
    }[args.distributed_backend]
    trainer = trainer_class()

    logging.info(f"Starting training on rank {get_rank()} worker. "
                 f"World size is {get_world_size()}")

    # That's it! When this call returns, training is done.
    trainer.train(task)

    output_folder = Path(args.checkpoint_folder).resolve()
    logging.info("Training successful!")
    logging.info(
        f'Results of this training run are available at: "{output_folder}"')
def test_read_video_tensor(self):
    """
    Check if reading the video using the `next` based API yields the
    same sized tensors as the pyav alternative.
    """
    torchvision.set_video_backend("pyav")
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        # pass 1: decode all frames using existing TV decoder
        tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
        tv_result = tv_result.permute(0, 3, 1, 2)
        # pass 2: decode all frames using new api
        reader = VideoReader(full_path, "video")
        frames = []
        for frame in reader:
            frames.append(frame['data'])
        new_api = torch.stack(frames, 0)
        self.assertEqual(tv_result.size(), new_api.size())
from fractions import Fraction

from PIL import Image

import torchvision
from torchvision import datasets
from torchvision import io

try:
    import av
    AV_AVAILABLE = True
except ImportError:
    AV_AVAILABLE = False

if io._HAS_VIDEO_OPT:
    torchvision.set_video_backend('video_reader')


class VideoLoader():
    """Implementation of VideoLoader.

    The VideoLoader is a wrapper around the torchvision video interface. With
    the VideoLoader you can read specific frames or the next frames of a
    video. It automatically switches to the `video_reader` backend if
    available. Reading sequential frames is significantly faster since it uses
    the VideoReader class from torchvision. The video loader automatically
    detects if you read out subsequent frames and will use the fast read
    method if possible.

    Attributes:
import argparse
import timeit
import os

import pandas as pd
import itertools

import torchvision

parser = argparse.ArgumentParser(description="Benchmark video reading")
parser.add_argument("n", type=int, help="Number of trials to run")
args = parser.parse_args()

setup_tvvr = """\
import torch
import torchvision
torchvision.set_video_backend("video_reader")
"""


def measure_reading_video(path):
    vframes, _, _ = torchvision.io.read_video(path)


loaders = []
times_per_video = []
times_random_seek = []
video = []
num_frames = []
lib_version = []

for i in range(args.n):
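# The snippet above cuts off before the trial loop, so the use of setup_tvvr
# is not shown. A plausible sketch with the standard timeit API; the statement
# string and "example.mp4" path are placeholders, not the original benchmark.
import timeit

stmt = 'torchvision.io.read_video("example.mp4")'  # placeholder path
# setup_tvvr selects the video_reader backend in the timed namespace before
# the single measured full-video decode.
elapsed = timeit.timeit(stmt=stmt, setup=setup_tvvr, number=1)
print(f"full read with video_reader backend: {elapsed:.3f}s")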
import unittest

from torchvision import set_video_backend

import test_datasets_video_utils

set_video_backend('video_reader')

if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromModule(test_datasets_video_utils)
    unittest.TextTestRunner(verbosity=1).run(suite)
import torch.utils.data as data
from PIL import Image
import traceback
import logging
import torchvision
from utils.sample_speedup import *
from transforms import *
# from memory_profiler import profile
import gc

## For Dataset
WIDTH = 256
HEIGHT = 340

from torchvision.io.video import read_video

torchvision.set_video_backend('pyav')


class FullDecodeDataSet(data.Dataset):
    def __init__(self, data_root, video_list, num_segments, is_train):
        self._data_root = data_root
        self._num_segments = num_segments
        self._is_train = is_train

        self._iframe_scales = [1, .875, .75]
        self._mv_scales = [1, .875, .75, .66]
        self._input_size = 224
        self._scale_size = self._input_size * 256 // 224
        self._iframe_transform = torchvision.transforms.Compose([
            GroupMultiScaleCrop(self._input_size, self._iframe_scales),
            GroupRandomHorizontalFlip(is_mv=False)
def train_main(args):
    torchvision.set_video_backend("video_reader")
    if args.apex:
        if sys.version_info < (3, 0):
            raise RuntimeError(
                "Apex currently only supports Python 3. Aborting.")
        if amp is None:
            raise RuntimeError("Failed to import apex. Please install apex "
                               "from https://www.github.com/nvidia/apex "
                               "to enable mixed-precision training.")

    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)
    print("torch version: ", torch.__version__)
    print("torchvision version: ", torchvision.__version__)

    device = torch.device(args.device)

    torch.backends.cudnn.benchmark = True

    writer = setup_tbx(args.output_dir)

    # Data loading code
    print("Loading data")
    print("\t Loading datasets")
    st = time.time()

    if not args.eval_only:
        print("\t Loading train data")
        transform_train = torchvision.transforms.Compose([
            T.ToTensorVideo(),
            T.Resize((args.scale_h, args.scale_w)),
            T.RandomHorizontalFlipVideo(),
            T.NormalizeVideo(mean=(0.43216, 0.394666, 0.37645),
                             std=(0.22803, 0.22145, 0.216989)),
            T.RandomCropVideo((args.crop_size, args.crop_size)),
        ])
        dataset = get_dataset(args, transform_train)
        dataset.video_clips.compute_clips(args.num_frames, 1, frame_rate=15)
        train_sampler = RandomClipSampler(dataset.video_clips,
                                          args.train_bs_multiplier)
        if args.distributed:
            train_sampler = DistributedSampler(train_sampler)
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.batch_size,
            sampler=train_sampler,
            num_workers=args.workers,
        )

    print("\t Loading validation data")
    transform_test = torchvision.transforms.Compose([
        T.ToTensorVideo(),
        T.Resize((args.scale_h, args.scale_w)),
        T.NormalizeVideo(mean=(0.43216, 0.394666, 0.37645),
                         std=(0.22803, 0.22145, 0.216989)),
        T.CenterCropVideo((args.crop_size, args.crop_size)),
    ])
    dataset_test = get_dataset(args, transform_test, split="val")
    dataset_test.video_clips.compute_clips(args.num_frames, 1, frame_rate=15)
    test_sampler = UniformClipSampler(dataset_test.video_clips,
                                      args.val_clips_per_video)
    if args.distributed:
        test_sampler = DistributedSampler(test_sampler)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=args.batch_size,
        sampler=test_sampler,
        num_workers=args.workers,
    )

    criterion = nn.CrossEntropyLoss()

    print("Creating model")
    # TODO: model only from our models
    available_models = {**models.__dict__}
    model = available_models[args.model](pretraining=args.pretrained)
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    if args.resume_from_model and not args.resume:
        checkpoint = torch.load(args.resume_from_model, map_location="cpu")
        if "model" in checkpoint.keys():
            model.load_state_dict(checkpoint["model"])
        else:
            model.load_state_dict(checkpoint)

    if args.finetune:
        assert args.resume_from_model is not None or args.pretrained
        model.fc = nn.Linear(model.fc.in_features, args.num_finetune_classes)

    lr = args.lr * args.world_size
    if args.finetune:
        params = [
            {"params": model.stem.parameters(), "lr": 0},
            {"params": model.layer1.parameters(),
             "lr": args.l1_lr * args.world_size},
            {"params": model.layer2.parameters(),
             "lr": args.l2_lr * args.world_size},
            {"params": model.layer3.parameters(),
             "lr": args.l3_lr * args.world_size},
            {"params": model.layer4.parameters(),
             "lr": args.l4_lr * args.world_size},
            {"params": model.fc.parameters(),
             "lr": args.fc_lr * args.world_size},
        ]
    else:
        params = model.parameters()
    print(params)

    optimizer = torch.optim.SGD(
        params,
        lr=lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    if args.apex:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.apex_opt_level)

    # convert scheduler to be per iteration,
    # not per epoch, for warmup that lasts
    # between different epochs
    if not args.eval_only:
        warmup_iters = args.lr_warmup_epochs * len(data_loader)
        lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
        lr_scheduler = WarmupMultiStepLR(
            optimizer,
            milestones=lr_milestones,
            gamma=args.lr_gamma,
            warmup_iters=warmup_iters,
            warmup_factor=1e-5,
        )

    if os.path.isfile(os.path.join(args.output_dir, "checkpoint.pth")):
        args.resume = os.path.join(args.output_dir, "checkpoint.pth")

    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        args.start_epoch = checkpoint["epoch"] + 1

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.eval_only:
        print("Starting test_only")
        metric_logger = MetricLogger(delimiter=" ",
                                     writer=writer,
                                     stat_set="val")
        evaluate(model, criterion, data_loader_test, device, metric_logger)
        return

    # Get training metric logger
    stat_loggers = get_default_loggers(writer, args.start_epoch)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(
            model,
            criterion,
            optimizer,
            lr_scheduler,
            data_loader,
            device,
            epoch,
            args.print_freq,
            stat_loggers["train"],
            args.apex,
        )
        evaluate(model, criterion, data_loader_test, device,
                 stat_loggers["val"])
        if args.output_dir:
            checkpoint = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": lr_scheduler.state_dict(),
                "epoch": epoch,
                "args": args,
            }
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
            utils.save_on_master(
                checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))

        # reset all meters in the metric logger
        for log in stat_loggers:
            stat_loggers[log].reset_meters()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("Training time {}".format(total_time_str))
if __name__ == "__main__":
    from vmz.func.opts import parse_args

    import torchvision

    torchvision.set_video_backend("video_reader")
    args = parse_args()
    train_main(args)
    exit()