def __init__(
        self,
        crop_size: Union[int, List[int]] = VideoConstants.CROP_SIZE,
        size_range: List[int] = VideoConstants.SIZE_RANGE,
        mean: List[float] = VideoConstants.MEAN,
        std: List[float] = VideoConstants.STD,
    ):
        """The constructor method of VideoDefaultAugmentTransform class.

        Args:
            crop_size: expected output crop size (height, width)
            size_range: a 2-tuple denoting the minimum and maximum size
            mean: a 3-tuple denoting the pixel RGB mean
            std: a 3-tuple denoting the pixel RGB standard deviation

        """

        self._transform = transforms.Compose(
            [
                transforms_video.ToTensorVideo(),
                # TODO(zyan3): migrate VideoClipRandomResizeCrop to TorchVision
                VideoClipRandomResizeCrop(crop_size, size_range),
                transforms_video.RandomHorizontalFlipVideo(),
                transforms_video.NormalizeVideo(mean=mean, std=std),
            ]
        )
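
A hedged usage sketch for the transform built above: the pipeline expects a raw uint8 clip in (T, H, W, C) layout, and VideoClipRandomResizeCrop is assumed to behave like a resize-then-random-crop that yields a crop_size output.

import torch

transform = VideoDefaultAugmentTransform()  # VideoConstants defaults
clip = torch.randint(0, 256, (8, 240, 320, 3), dtype=torch.uint8)  # (T, H, W, C) uint8
out = transform._transform(clip)  # apply the Compose built in __init__
# expected: float tensor in (C, T, H, W) layout, spatial size crop_size x crop_size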
Example #2
    def test_center_crop_video(self):
        numFrames = random.randint(4, 128)
        height = random.randint(10, 32) * 2
        width = random.randint(10, 32) * 2
        oheight = random.randint(5, (height - 2) // 2) * 2
        owidth = random.randint(5, (width - 2) // 2) * 2

        clip = torch.ones(
            (numFrames, height, width, 3), dtype=torch.uint8) * 255
        oh1 = (height - oheight) // 2
        ow1 = (width - owidth) // 2
        clipNarrow = clip[:, oh1:oh1 + oheight, ow1:ow1 + owidth, :]
        clipNarrow.fill_(0)
        result = Compose([
            transforms.ToTensorVideo(),
            transforms.CenterCropVideo((oheight, owidth)),
        ])(clip)

        msg = "height: " + str(height) + " width: " \
            + str(width) + " oheight: " + str(oheight) + " owidth: " + str(owidth)
        self.assertEqual(result.sum().item(), 0, msg)

        oheight += 1
        owidth += 1
        result = Compose([
            transforms.ToTensorVideo(),
            transforms.CenterCropVideo((oheight, owidth)),
        ])(clip)
        sum1 = result.sum()

        msg = "height: " + str(height) + " width: " \
            + str(width) + " oheight: " + str(oheight) + " owidth: " + str(owidth)
        self.assertTrue(sum1.item() > 1, msg)

        oheight += 1
        owidth += 1
        result = Compose([
            transforms.ToTensorVideo(),
            transforms.CenterCropVideo((oheight, owidth)),
        ])(clip)
        sum2 = result.sum()

        msg = "height: " + str(height) + " width: " \
            + str(width) + " oheight: " + str(oheight) + " owidth: " + str(owidth)
        self.assertTrue(sum2.item() > 1, msg)
        self.assertTrue(sum2.item() > sum1.item(), msg)
Example #3
def val_transform(s):
    return transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizeVideo(s),
        transforms_video.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645],
                                        std=[0.22803, 0.22145, 0.216989]),
        transforms_video.CenterCropVideo(s)
    ])
Example #4
def train_transform(s):
    return transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomHorizontalFlipVideo(),
        transforms_video.RandomResizeVideo((s, round(s * 1.5))),
        transforms_video.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645],
                                        std=[0.22803, 0.22145, 0.216989]),
        transforms_video.RandomCropVideo(s)
    ])
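
A rough usage sketch for the two factories above, assuming RandomResizeVideo follows the same (T, H, W, C) uint8 input convention as the stock torchvision video transforms and that the resized clip stays at least s pixels on each side:

import torch

clip = torch.randint(0, 256, (16, 240, 320, 3), dtype=torch.uint8)  # (T, H, W, C)
train_clip = train_transform(112)(clip)  # random crop -> (3, 16, 112, 112)
val_clip = val_transform(112)(clip)      # center crop -> (3, 16, 112, 112)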
Example #5
def default_transformation_3D(split, size=224):
    return {
        "train": transforms.Compose([
            transforms_video.ToTensorVideo(),
            transforms_video.RandomResizedCropVideo(size),
            RandomVerticalFlipVideo(),
            transforms_video.RandomHorizontalFlipVideo(),
            transforms_video.NormalizeVideo(kinetics400_transform_dict["mean"], kinetics400_transform_dict["std"]),
        ]),
        "valid": transforms.Compose([
            transforms_video.ToTensorVideo(),
            VideoClipResize(size),  # not square
            transforms_video.CenterCropVideo(size),
            transforms_video.NormalizeVideo(kinetics400_transform_dict["mean"], kinetics400_transform_dict["std"]),
        ]),
        "test": transforms.Compose([
            transforms_video.ToTensorVideo(),
            VideoClipResize(size),  # not square
            transforms_video.CenterCropVideo(size),
            transforms_video.NormalizeVideo(kinetics400_transform_dict["mean"], kinetics400_transform_dict["std"]),
        ]),
    }[split]
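
For illustration, the split-keyed factory above could be used like this (a sketch; RandomVerticalFlipVideo, VideoClipResize, and kinetics400_transform_dict come from the example's own codebase):

import torch

train_tfm = default_transformation_3D("train", size=224)
clip = torch.randint(0, 256, (16, 256, 340, 3), dtype=torch.uint8)  # (T, H, W, C)
out = train_tfm(clip)  # expected (3, 16, 224, 224) after RandomResizedCropVideo(224)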
Example #6
    def test_random_resized_crop_video(self):
        numFrames = random.randint(4, 128)
        height = random.randint(10, 32) * 2
        width = random.randint(10, 32) * 2
        oheight = random.randint(5, (height - 2) // 2) * 2
        owidth = random.randint(5, (width - 2) // 2) * 2
        clip = torch.randint(0,
                             256, (numFrames, height, width, 3),
                             dtype=torch.uint8)
        result = Compose([
            transforms.ToTensorVideo(),
            transforms.RandomResizedCropVideo((oheight, owidth)),
        ])(clip)
        self.assertEqual(result.size(2), oheight)
        self.assertEqual(result.size(3), owidth)

        transforms.RandomResizedCropVideo((oheight, owidth)).__repr__()
Example #7
    def test_to_tensor_video(self):
        numFrames, height, width = 64, 4, 4
        trans = transforms.ToTensorVideo()

        with self.assertRaises(TypeError):
            trans(np.random.rand(numFrames, height, width, 1).tolist())
        with self.assertRaises(TypeError):
            trans(torch.rand((numFrames, height, width, 1), dtype=torch.float))

        with self.assertRaises(ValueError):
            trans(torch.ones((3, numFrames, height, width, 3), dtype=torch.uint8))
        with self.assertRaises(ValueError):
            trans(torch.ones((height, width, 3), dtype=torch.uint8))
        with self.assertRaises(ValueError):
            trans(torch.ones((width, 3), dtype=torch.uint8))
        with self.assertRaises(ValueError):
            trans(torch.ones((3,), dtype=torch.uint8))

        trans.__repr__()
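
For reference, ToTensorVideo takes a 4-D uint8 clip in (T, H, W, C) order and returns a float tensor in (C, T, H, W) order scaled to [0, 1]; a quick self-contained check using the private _transforms_video module that these examples rely on:

import torch
from torchvision.transforms import _transforms_video as transforms_video

clip = torch.randint(0, 256, (8, 32, 32, 3), dtype=torch.uint8)  # (T, H, W, C)
out = transforms_video.ToTensorVideo()(clip)
assert out.shape == (3, 8, 32, 32) and out.dtype == torch.float32
assert 0.0 <= out.min().item() and out.max().item() <= 1.0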
Example #8
    def __init__(
        self,
        size: int = VideoConstants.SIZE_RANGE[0],
        mean: List[float] = VideoConstants.MEAN,
        std: List[float] = VideoConstants.STD,
    ):
        """The constructor method of VideoDefaultNoAugmentTransform class.

        Args:
            size: the short edge of the rescaled video clip
            mean: a 3-tuple denoting the pixel RGB mean
            std: a 3-tuple denoting the pixel RGB standard deviation

        """
        self._transform = transforms.Compose(
            # At testing stage, central cropping is not used because we
            # conduct fully convolutional-style testing
            [
                transforms_video.ToTensorVideo(),
                # TODO(zyan3): migrate VideoClipResize to TorchVision
                VideoClipResize(size),
                transforms_video.NormalizeVideo(mean=mean, std=std),
            ])
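
As the comment notes, this no-augment path only rescales the short edge (no center crop), so the evaluated clip is generally not square; a sketch of the expected effect, assuming VideoClipResize scales the short edge to `size` while preserving the aspect ratio:

import torch

transform = VideoDefaultNoAugmentTransform(size=128)
clip = torch.randint(0, 256, (8, 240, 320, 3), dtype=torch.uint8)  # (T, H, W, C)
out = transform._transform(clip)
# short edge -> 128, long edge scaled with it: roughly (3, 8, 128, 170)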
Example #9
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as v_transform
import torch

TRAIN_BATCH_SIZE = 128
TEST_BATCH_SIZE = 128
FRAME_LENGTH = 16

transform = transforms.Compose([
    v_transform.ToTensorVideo(),
    v_transform.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645],
                               std=[0.22803, 0.22145, 0.216989]),
    v_transform.RandomHorizontalFlipVideo(),
    v_transform.RandomCropVideo(112),
])


def custom_collate(batch):
    filtered_batch = []
    for video, _, label in batch:
        filtered_batch.append((video, label))
    return torch.utils.data.dataloader.default_collate(filtered_batch)
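
custom_collate drops the audio stream that torchvision's UCF101 dataset returns with each (video, audio, label) sample; a hedged sketch of how it would typically be wired into a DataLoader (it assumes the `trainset` construction below is completed with the transform above):

# Illustrative only: `trainset` must be a fully constructed UCF101 dataset.
train_loader = DataLoader(trainset,
                          batch_size=TRAIN_BATCH_SIZE,
                          shuffle=True,
                          collate_fn=custom_collate)
for videos, labels in train_loader:  # audio has already been filtered out
    pass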


trainset = datasets.UCF101(
    root='data/UCF101/UCF-101',
    annotation_path=
    'data/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist',
    frames_per_clip=FRAME_LENGTH,
Example #10
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    netG = moco.builder.MaskGenerator()
    netD = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                             args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(netG)
    print(netD)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            netG.cuda(args.gpu)
            netD.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            netG = torch.nn.parallel.DistributedDataParallel(
                netG, device_ids=[args.gpu], find_unused_parameters=True)
            netD = torch.nn.parallel.DistributedDataParallel(
                netD, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            netG.cuda()
            netD.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            netG = torch.nn.parallel.DistributedDataParallel(netG)
            netD = torch.nn.parallel.DistributedDataParallel(netD)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        netG = netG.cuda(args.gpu)
        netD = netD.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        pass  # raise NotImplementedError("Only DistributedDataParallel is supported.") for debug on cpu
    # torch.cuda.synchronize()
    optimizer_g = torch.optim.SGD(netG.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    optimizer_d = torch.optim.SGD(netD.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    G_criterion = nn.L1Loss().cuda(args.gpu)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netD.load_state_dict(checkpoint['state_dict'])
            #optimizer_d.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

        if os.path.isfile(args.resumeG):
            print("=> loading checkpoint '{}'".format(args.resumeG))
            if args.gpu is None:
                checkpoint = torch.load(args.resumeG)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resumeG, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netG.load_state_dict(checkpoint['state_dict'])
            #optimizer_g.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resumeG, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resumeG))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    video_augmentation = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size, (0.2, 1)),
    ])
    audio_augmentation = moco.loader.DummyAudioTransform()
    augmentation = {'video': video_augmentation, 'audio': audio_augmentation}

    augmentation_gpu = moco.loader.MoCoAugmentV2(
        args.crop_size) if args.aug_plus else moco.loader.MoCoAugment(
            args.crop_size)

    train_dataset = Kinetics400(traindir,
                                args.frame_per_clip,
                                args.step_between_clips,
                                extensions='mp4',
                                transform=augmentation,
                                num_workers=4)

    train_sampler = RandomClipSampler(train_dataset.video_clips, 1)

    if args.distributed:
        # train_sampler = torch.utils.data.distributed.DistributedSampler(train_sampler)
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               multiprocessing_context="fork")
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer_d, epoch, args)
        adjust_learning_rate(optimizer_g, epoch, args)

        # train for one epoch
        train(train_loader, augmentation_gpu, criterion, G_criterion, netG,
              netD, optimizer_g, optimizer_d, epoch, args, writer)

        if (epoch + 1) % 10 == 0 and (not args.multiprocessing_distributed or
                                      (args.multiprocessing_distributed
                                       and args.rank % ngpus_per_node == 0)):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netG.state_dict(),
            },
                            ckp_dir + '/netG',
                            max_save=20,
                            is_best=False)

            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netD.state_dict(),
            },
                            ckp_dir + '/netD',
                            max_save=20,
                            is_best=False)
Example #11
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)
    # freeze all layers but the last fc
    #     for name, param in model.named_parameters():
    #         if name not in ['fc.weight', 'fc.bias']:
    #             param.requires_grad = False
    # init the fc layer
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q'
                                ) and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)  #.cuda() for debug on cpu
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    # assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    normalize_video = transforms_video.NormalizeVideo(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {
        'video': video_augmentation_train,
        'audio': audio_augmentation
    }
    val_augmentation = {
        'video': video_augmentation_val,
        'audio': audio_augmentation
    }

    train_dataset = UCF101(data_dir,
                           anno_dir,
                           args.frame_per_clip,
                           args.step_between_clips,
                           fold=1,
                           train=True,
                           transform=train_augmentation,
                           num_workers=16)
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               multiprocessing_context="fork")

    val_dataset = UCF101(data_dir,
                         anno_dir,
                         args.frame_per_clip,
                         args.step_between_clips,
                         fold=1,
                         train=False,
                         transform=val_augmentation,
                         num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing iteration process
    val_sampler = UniformClipSampler(val_dataset.video_clips,
                                     args.clip_per_video)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.clip_per_video,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             multiprocessing_context="fork")

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        val_loss, acc1, acc5 = validate(val_loader, model, criterion, args)
        if writer is not None:
            writer.add_scalar('lincls_val/loss', val_loss, epoch)
            writer.add_scalar('lincls_val/acc1', acc1, epoch)
            writer.add_scalar('lincls_val/acc5', acc5, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
                            ckp_dir,
                            max_save=1,
                            is_best=is_best)
Example #12
    def test_build_field_transform_default_video(self):
        dataset = self.get_test_video_dataset()

        # transform config is not provided. Use default transforms
        config = None
        # default training data transform
        sample = dataset[0]

        transform = build_video_field_transform_default(config, "train")
        output_clip = transform(sample)["input"]["video"]
        self.assertEqual(
            output_clip.size(),
            torch.Size((
                3,
                self.frames_per_clip,
                VideoConstants.CROP_SIZE,
                VideoConstants.CROP_SIZE,
            )),
        )
        # default testing data transform
        sample = dataset[1]
        sample_copy = copy.deepcopy(sample)

        expected_output_clip = transforms_video.ToTensorVideo()(
            sample["input"]["video"])
        expected_output_clip = transforms_video.CenterCropVideo(
            VideoConstants.CROP_SIZE)(expected_output_clip)
        expected_output_clip = transforms_video.NormalizeVideo(
            mean=VideoConstants.MEAN,
            std=VideoConstants.STD)(expected_output_clip)

        transform = build_video_field_transform_default(config, "test")
        output_clip = transform(sample_copy)["input"]["video"]

        rescaled_width = int(VideoConstants.SIZE_RANGE[0] * self.video_width /
                             self.video_height)
        self.assertEqual(
            output_clip.size(),
            torch.Size((3, self.frames_per_clip, VideoConstants.SIZE_RANGE[0],
                        rescaled_width)),
        )
        # transform config is provided. Simulate training config
        sample = dataset[2]
        config = {
            "video": [
                {
                    "name": "ToTensorVideo"
                },
                {
                    "name": "video_clip_random_resize_crop",
                    "crop_size": 64,
                    "size_range": [256, 320],
                },
                {
                    "name": "RandomHorizontalFlipVideo"
                },
                {
                    "name": "NormalizeVideo",
                    "mean": [0.485, 0.456, 0.406],
                    "std": [0.229, 0.224, 0.225],
                },
            ]
        }
        transform = build_video_field_transform_default(config, "train")
        output_clip = transform(sample)["input"]["video"]
        self.assertEqual(output_clip.size(),
                         torch.Size((3, self.frames_per_clip, 64, 64)))
        self.assertEqual(output_clip.dtype, torch.float)

        # transform config is provided. Simulate testing config
        sample = dataset[3]
        config = {
            "video": [
                {
                    "name": "ToTensorVideo"
                },
                {
                    "name": "video_clip_resize",
                    "size": 64
                },
                {
                    "name": "NormalizeVideo",
                    "mean": [0.485, 0.456, 0.406],
                    "std": [0.229, 0.224, 0.225],
                },
            ]
        }
        transform = build_video_field_transform_default(config, "train")
        output_clip = transform(sample)["input"]["video"]

        rescaled_width = int(64 * self.video_width / self.video_height)
        self.assertEqual(
            output_clip.size(),
            torch.Size((3, self.frames_per_clip, 64, rescaled_width)),
        )
        self.assertEqual(output_clip.dtype, torch.float)