def test_normalize_video(self):
    def samples_from_standard_normal(tensor):
        p_value = stats.kstest(list(tensor.view(-1)), 'norm', args=(0, 1)).pvalue
        return p_value > 0.0001

    random_state = random.getstate()
    random.seed(42)
    for channels in [1, 3]:
        numFrames = random.randint(4, 128)
        height = random.randint(32, 256)
        width = random.randint(32, 256)
        mean = random.random()
        std = random.random()
        clip = torch.normal(mean, std, size=(channels, numFrames, height, width))
        mean = [clip[c].mean().item() for c in range(channels)]
        std = [clip[c].std().item() for c in range(channels)]
        normalized = transforms.NormalizeVideo(mean, std)(clip)
        self.assertTrue(samples_from_standard_normal(normalized))
    random.setstate(random_state)

    # Checking the optional in-place behaviour
    tensor = torch.rand((3, 128, 16, 16))
    tensor_inplace = transforms.NormalizeVideo(
        (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)(tensor)
    self.assertTrue(torch.equal(tensor, tensor_inplace))

    transforms.NormalizeVideo(
        (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True).__repr__()

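# A minimal sketch (not part of the test suite above) of the per-channel
# semantics the test relies on: NormalizeVideo subtracts the per-channel mean
# and divides by the per-channel std over a (C, T, H, W) clip. Assumes
# torchvision's private `_transforms_video` module, which may change between
# releases.
import torch
import torchvision.transforms._transforms_video as transforms_video

clip = torch.randn(3, 8, 32, 32)  # (channels, frames, height, width)
mean = [clip[c].mean().item() for c in range(3)]
std = [clip[c].std().item() for c in range(3)]
normalized = transforms_video.NormalizeVideo(mean, std)(clip)

# Manual per-channel normalization should match the transform's output
manual = (clip - torch.tensor(mean).view(3, 1, 1, 1)) / torch.tensor(std).view(3, 1, 1, 1)
assert torch.allclose(normalized, manual, atol=1e-6)
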
def __init__(
    self,
    crop_size: Union[int, List[int]] = VideoConstants.CROP_SIZE,
    size_range: List[int] = VideoConstants.SIZE_RANGE,
    mean: List[float] = VideoConstants.MEAN,
    std: List[float] = VideoConstants.STD,
):
    """The constructor method of VideoDefaultAugmentTransform class.

    Args:
        crop_size: expected output crop_size (height, width)
        size_range: a 2-tuple denoting the min and max size
        mean: a 3-tuple denoting the pixel RGB mean
        std: a 3-tuple denoting the pixel RGB standard deviation
    """
    self._transform = transforms.Compose(
        [
            transforms_video.ToTensorVideo(),
            # TODO(zyan3): migrate VideoClipRandomResizeCrop to TorchVision
            VideoClipRandomResizeCrop(crop_size, size_range),
            transforms_video.RandomHorizontalFlipVideo(),
            transforms_video.NormalizeVideo(mean=mean, std=std),
        ]
    )

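# Hedged usage sketch for the transform composed above. Assumes the enclosing
# class is ClassyVision's VideoDefaultAugmentTransform and that it forwards
# __call__ to self._transform; the import path below is an assumption.
import torch
from classy_vision.dataset.transforms.util_video import VideoDefaultAugmentTransform  # assumed path

raw_clip = torch.randint(0, 256, (16, 128, 171, 3), dtype=torch.uint8)  # (T, H, W, C) uint8
augment = VideoDefaultAugmentTransform()
out = augment(raw_clip)  # float (C, T, H, W) tensor after crop, flip, normalize
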
def val_transform(s):
    return transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizeVideo(s),
        transforms_video.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645],
                                        std=[0.22803, 0.22145, 0.216989]),
        transforms_video.CenterCropVideo(s),
    ])

def train_transform(s):
    return transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomHorizontalFlipVideo(),
        transforms_video.RandomResizeVideo((s, round(s * 1.5))),
        transforms_video.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645],
                                        std=[0.22803, 0.22145, 0.216989]),
        transforms_video.RandomCropVideo(s),
    ])

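# A sketch of how the two factories above might be used, assuming the custom
# RandomResizeVideo rescales clips so both spatial dims stay >= the crop size.
# ToTensorVideo expects a uint8 (T, H, W, C) clip and emits float (C, T, H, W).
import torch

clip = torch.randint(0, 256, (16, 240, 320, 3), dtype=torch.uint8)
train_out = train_transform(112)(clip)  # expected shape (3, 16, 112, 112)
val_out = val_transform(112)(clip)      # expected shape (3, 16, 112, 112)
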
def default_transformation_3D(split, size=224):
    return {
        "train": transforms.Compose([
            transforms_video.ToTensorVideo(),
            transforms_video.RandomResizedCropVideo(size),
            RandomVerticalFlipVideo(),
            transforms_video.RandomHorizontalFlipVideo(),
            transforms_video.NormalizeVideo(kinetics400_transform_dict["mean"],
                                            kinetics400_transform_dict["std"]),
        ]),
        "valid": transforms.Compose([
            transforms_video.ToTensorVideo(),
            VideoClipResize(size),  # not square
            transforms_video.CenterCropVideo(size),
            transforms_video.NormalizeVideo(kinetics400_transform_dict["mean"],
                                            kinetics400_transform_dict["std"]),
        ]),
        "test": transforms.Compose([
            transforms_video.ToTensorVideo(),
            VideoClipResize(size),  # not square
            transforms_video.CenterCropVideo(size),
            transforms_video.NormalizeVideo(kinetics400_transform_dict["mean"],
                                            kinetics400_transform_dict["std"]),
        ]),
    }[split]

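# Possible call sites for the factory above (a sketch): pipelines are selected
# by split name; "valid" and "test" share the same deterministic
# resize -> center-crop -> normalize path.
train_tf = default_transformation_3D("train", size=224)
eval_tf = default_transformation_3D("valid", size=224)
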
def __init__(self, segments, segment_labels, segment_logits, frame_stride=1,
             n_test_segments=25):
    super(TestDataset, self).__init__(segments, segment_labels, segment_logits,
                                      frame_stride)
    self.n_test_segments = n_test_segments
    self.transforms = Compose([
        ToTensorVideo(),
        ResizeVideo(INPUT_SIZE),
        transforms.CenterCropVideo(crop_size=INPUT_SIZE),
        ToZeroOneVideo(),
        transforms.NormalizeVideo(NORM_MEAN, NORM_STD),
    ])

def __init__(self, segments, segment_labels, segment_logits, frame_stride=1):
    super(TrainDataset, self).__init__()
    self.segments = segments
    self.segment_labels = segment_labels
    self.segment_logits = segment_logits
    self.frame_stride = int(frame_stride)
    self.transforms = Compose([
        ToTensorVideo(),
        ResizeVideo(INPUT_SIZE),
        transforms.CenterCropVideo(crop_size=INPUT_SIZE),
        # transforms.RandomHorizontalFlipVideo(),
        ToZeroOneVideo(),
        transforms.NormalizeVideo(NORM_MEAN, NORM_STD),
    ])

def __init__(self, segments, segment_labels, segment_logits, segment_length,
             input_size, frame_stride=1, n_test_segments=25):
    super(TestDataset, self).__init__(segments, segment_labels, segment_logits,
                                      segment_length, input_size, frame_stride)
    self.n_test_segments = n_test_segments
    self.transforms = Compose([
        ToTensorVideo(),
        ResizeVideo(input_size),
        transforms.CenterCropVideo(crop_size=input_size),
        ToZeroOneVideo(),
        transforms.NormalizeVideo(breakfast.TENSOR_MEAN, breakfast.TENSOR_STD),
    ])

def __init__(
    self,
    size: int = VideoConstants.SIZE_RANGE[0],
    mean: List[float] = VideoConstants.MEAN,
    std: List[float] = VideoConstants.STD,
):
    """The constructor method of VideoDefaultNoAugmentTransform class.

    Args:
        size: the short edge of the rescaled video clip
        mean: a 3-tuple denoting the pixel RGB mean
        std: a 3-tuple denoting the pixel RGB standard deviation
    """
    self._transform = transforms.Compose(
        # At testing stage, central cropping is not used because we
        # conduct fully convolutional-style testing
        [
            transforms_video.ToTensorVideo(),
            # TODO(zyan3): migrate VideoClipResize to TorchVision
            VideoClipResize(size),
            transforms_video.NormalizeVideo(mean=mean, std=std),
        ]
    )

def __init__(self, segments, segment_labels, segment_logits, segment_length,
             input_size, frame_stride=1):
    super(TrainDataset, self).__init__()
    self.segments = segments
    self.segment_labels = segment_labels
    self.segment_logits = segment_logits
    self.segment_length = int(segment_length)
    self.input_size = int(input_size)
    self.frame_stride = int(frame_stride)
    self.transforms = Compose([
        ToTensorVideo(),
        ResizeVideo(input_size),
        transforms.RandomResizedCropVideo(size=input_size),
        transforms.RandomHorizontalFlipVideo(),
        ToZeroOneVideo(),
        transforms.NormalizeVideo(breakfast.TENSOR_MEAN, breakfast.TENSOR_STD),
    ])

import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as v_transform
import torch

TRAIN_BATCH_SIZE = 128
TEST_BATCH_SIZE = 128
FRAME_LENGTH = 16

transform = transforms.Compose([
    v_transform.ToTensorVideo(),
    v_transform.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645],
                               std=[0.22803, 0.22145, 0.216989]),
    v_transform.RandomHorizontalFlipVideo(),
    v_transform.RandomCropVideo(112),
])


def custom_collate(batch):
    # UCF101 samples are (video, audio, label); keep only (video, label)
    filtered_batch = []
    for video, _, label in batch:
        filtered_batch.append((video, label))
    return torch.utils.data.dataloader.default_collate(filtered_batch)


trainset = datasets.UCF101(
    root='data/UCF101/UCF-101',
    annotation_path='data/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist',
    frames_per_clip=FRAME_LENGTH,
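# The UCF101 constructor above is cut off in the source; assuming it is
# completed, a hedged sketch of wiring in the collate function: custom_collate
# drops the audio stream so default_collate can stack (video, label) batches.
# Keyword names below follow the standard torch DataLoader API.
train_loader = DataLoader(trainset,
                          batch_size=TRAIN_BATCH_SIZE,
                          shuffle=True,
                          collate_fn=custom_collate)
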
def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    self.mean = mean
    self.std = std
    self.operation_torch = transforms_video.NormalizeVideo(mean, std)

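# A hedged sketch of invoking this wrapper's stored transform. Assumes the
# enclosing class calls self.operation_torch on float clips in the
# (C, T, H, W) layout that NormalizeVideo expects.
import torch
import torchvision.transforms._transforms_video as transforms_video

clip = torch.rand(3, 16, 112, 112)  # values already scaled to [0, 1]
normalizer = transforms_video.NormalizeVideo([0.485, 0.456, 0.406],
                                             [0.229, 0.224, 0.225])
normalized = normalizer(clip)  # per-channel (clip - mean) / std
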
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)

    # freeze all layers but the last fc
    # for name, param in model.named_parameters():
    #     if name not in ['fc.weight', 'fc.bias']:
    #         param.requires_grad = False

    # init the fc layer
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)  # .cuda() for debug on cpu

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    # assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    normalize_video = transforms_video.NormalizeVideo(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {
        'video': video_augmentation_train,
        'audio': audio_augmentation
    }
    val_augmentation = {
        'video': video_augmentation_val,
        'audio': audio_augmentation
    }

    train_dataset = UCF101(data_dir,
                           anno_dir,
                           args.frame_per_clip,
                           args.step_between_clips,
                           fold=1,
                           train=True,
                           transform=train_augmentation,
                           num_workers=16)
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               multiprocessing_context="fork")

    val_dataset = UCF101(data_dir,
                         anno_dir,
                         args.frame_per_clip,
                         args.step_between_clips,
                         fold=1,
                         train=False,
                         transform=val_augmentation,
                         num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing iteration process
    val_sampler = UniformClipSampler(val_dataset.video_clips, args.clip_per_video)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.clip_per_video,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             multiprocessing_context="fork")

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        val_loss, acc1, acc5 = validate(val_loader, model, criterion, args)
        if writer is not None:
            writer.add_scalar('lincls_val/loss', val_loss, epoch)
            writer.add_scalar('lincls_val/acc1', acc1, epoch)
            writer.add_scalar('lincls_val/acc5', acc5, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
                            ckp_dir,
                            max_save=1,
                            is_best=is_best)

def test_build_field_transform_default_video(self):
    dataset = self.get_test_video_dataset()

    # transform config is not provided. Use default transforms
    config = None

    # default training data transform
    sample = dataset[0]
    transform = build_video_field_transform_default(config, "train")
    output_clip = transform(sample)["input"]["video"]
    self.assertEqual(
        output_clip.size(),
        torch.Size((
            3,
            self.frames_per_clip,
            VideoConstants.CROP_SIZE,
            VideoConstants.CROP_SIZE,
        )),
    )

    # default testing data transform
    sample = dataset[1]
    sample_copy = copy.deepcopy(sample)

    expected_output_clip = transforms_video.ToTensorVideo()(
        sample["input"]["video"])
    expected_output_clip = transforms_video.CenterCropVideo(
        VideoConstants.CROP_SIZE)(expected_output_clip)
    expected_output_clip = transforms_video.NormalizeVideo(
        mean=VideoConstants.MEAN, std=VideoConstants.STD)(expected_output_clip)

    transform = build_video_field_transform_default(config, "test")
    output_clip = transform(sample_copy)["input"]["video"]
    rescaled_width = int(VideoConstants.SIZE_RANGE[0] * self.video_width /
                         self.video_height)
    self.assertEqual(
        output_clip.size(),
        torch.Size((3, self.frames_per_clip, VideoConstants.SIZE_RANGE[0],
                    rescaled_width)),
    )

    # transform config is provided. Simulate training config
    sample = dataset[2]
    config = {
        "video": [
            {"name": "ToTensorVideo"},
            {
                "name": "video_clip_random_resize_crop",
                "crop_size": 64,
                "size_range": [256, 320],
            },
            {"name": "RandomHorizontalFlipVideo"},
            {
                "name": "NormalizeVideo",
                "mean": [0.485, 0.456, 0.406],
                "std": [0.229, 0.224, 0.225],
            },
        ]
    }
    transform = build_video_field_transform_default(config, "train")
    output_clip = transform(sample)["input"]["video"]
    self.assertEqual(output_clip.size(),
                     torch.Size((3, self.frames_per_clip, 64, 64)))
    self.assertTrue(output_clip.dtype == torch.float)

    # transform config is provided. Simulate testing config
    sample = dataset[3]
    config = {
        "video": [
            {"name": "ToTensorVideo"},
            {"name": "video_clip_resize", "size": 64},
            {
                "name": "NormalizeVideo",
                "mean": [0.485, 0.456, 0.406],
                "std": [0.229, 0.224, 0.225],
            },
        ]
    }
    transform = build_video_field_transform_default(config, "train")
    output_clip = transform(sample)["input"]["video"]
    rescaled_width = int(64 * self.video_width / self.video_height)
    self.assertEqual(
        output_clip.size(),
        torch.Size((3, self.frames_per_clip, 64, rescaled_width)),
    )
    self.assertTrue(output_clip.dtype == torch.float)