def test_random_resized_crop_video(self):
    """Check that RandomResizedCropVideo yields the requested output size.

    A random uint8 clip laid out as (frames, height, width, 3) is pushed
    through ``ToTensorVideo`` followed by ``RandomResizedCropVideo``; the
    test asserts that dimensions 2 and 3 of the result equal the requested
    crop height and width, then smoke-tests the transform's ``repr``.
    """
    num_frames = random.randint(4, 128)
    height = random.randint(10, 32) * 2
    width = random.randint(10, 32) * 2
    # Floor division keeps the upper bound an int: random.randint no longer
    # accepts float bounds on current Python versions.
    oheight = random.randint(5, (height - 2) // 2) * 2
    owidth = random.randint(5, (width - 2) // 2) * 2

    clip = torch.randint(
        0, 256, (num_frames, height, width, 3), dtype=torch.uint8)
    crop = transforms.RandomResizedCropVideo((oheight, owidth))
    result = Compose([
        transforms.ToTensorVideo(),
        crop,
    ])(clip)

    self.assertEqual(result.size(2), oheight)
    self.assertEqual(result.size(3), owidth)

    # Only verifies that building the repr does not raise.
    repr(crop)
def __init__(self, size, interpolation="bilinear", consistent=True, p=1.0):
    """Store crop settings and build the tensor-based crop operation.

    Args:
        size: Target crop size; stored as-is and forwarded to
            ``transforms_video.RandomResizedCropVideo``.
        interpolation: Interpolation mode name. ``"bilinear"`` maps to
            ``Image.BILINEAR`` for ``self.interpolation``; any other value
            stores ``None``. The raw name is forwarded to the torch
            transform as ``interpolation_mode``.
        consistent: Stored unchanged on ``self.consistent``.
        p: Stored on ``self.threshold``.
    """
    if interpolation == "bilinear":
        pil_mode = Image.BILINEAR
    else:
        pil_mode = None

    self.size = size
    self.interpolation = pil_mode
    self.consistent = consistent
    self.threshold = p
    self.operation_torch = transforms_video.RandomResizedCropVideo(
        size,
        interpolation_mode=interpolation,
    )
def default_transformation_3D(split, size=224):
    """Return the default video transform pipeline for ``split``.

    Only the pipeline for the requested split is constructed; the "valid"
    and "test" splits share one definition.

    Args:
        split: One of ``"train"``, ``"valid"`` or ``"test"``.
        size: Size forwarded to the crop / resize transforms.

    Returns:
        A ``transforms.Compose`` for the requested split.

    Raises:
        KeyError: If ``split`` is not one of the supported names.
    """

    def _normalize():
        # Normalization uses the statistics in kinetics400_transform_dict.
        return transforms_video.NormalizeVideo(
            kinetics400_transform_dict["mean"],
            kinetics400_transform_dict["std"],
        )

    def _train_pipeline():
        return transforms.Compose([
            transforms_video.ToTensorVideo(),
            transforms_video.RandomResizedCropVideo(size),
            RandomVerticalFlipVideo(),
            transforms_video.RandomHorizontalFlipVideo(),
            _normalize(),
        ])

    def _eval_pipeline():
        return transforms.Compose([
            transforms_video.ToTensorVideo(),
            VideoClipResize(size),  # not square (original author's note)
            transforms_video.CenterCropVideo(size),
            _normalize(),
        ])

    builders = {
        "train": _train_pipeline,
        "valid": _eval_pipeline,
        "test": _eval_pipeline,
    }
    return builders[split]()
def __init__(self, segments, segment_labels, segment_logits, segment_length,
             input_size, frame_stride=1):
    """Store the segment data and build the training transform pipeline.

    Args:
        segments: Stored unchanged on ``self.segments``.
        segment_labels: Stored unchanged on ``self.segment_labels``.
        segment_logits: Stored unchanged on ``self.segment_logits``.
        segment_length: Coerced to ``int`` and stored.
        input_size: Coerced to ``int``; the coerced value is stored and is
            also what the resize / crop transforms receive.
        frame_stride: Coerced to ``int`` and stored (default 1).
    """
    super(TrainDataset, self).__init__()
    self.segments = segments
    self.segment_labels = segment_labels
    self.segment_logits = segment_logits
    self.segment_length = int(segment_length)
    self.input_size = int(input_size)
    self.frame_stride = int(frame_stride)

    # Use the coerced size so the transforms agree with self.input_size even
    # when the caller passes a float or a numeric string.
    self.transforms = Compose([
        ToTensorVideo(),
        ResizeVideo(self.input_size),
        transforms.RandomResizedCropVideo(size=self.input_size),
        transforms.RandomHorizontalFlipVideo(),
        ToZeroOneVideo(),
        transforms.NormalizeVideo(breakfast.TENSOR_MEAN, breakfast.TENSOR_STD),
    ])
def main_worker(gpu, ngpus_per_node, args):
    """Per-process training entry point for the netG / netD pair.

    Sets up the (optionally distributed) process, builds ``netG`` and
    ``netD``, their SGD optimizers and losses, optionally restores model
    weights from checkpoints, builds the Kinetics400 training loader and
    runs the epoch loop, saving both networks every 10 epochs.

    Args:
        gpu: Device index for this process (may be ``None``); stored on
            ``args.gpu``.
        ngpus_per_node: Number of GPUs per node; used to derive the global
            rank and to split batch size / workers across processes.
        args: Parsed options namespace. This function mutates ``args.gpu``,
            ``args.rank``, ``args.batch_size``, ``args.workers`` and
            ``args.start_epoch``.

    Notes:
        ``args.resume`` (netD) and ``args.resumeG`` (netG) are each honoured
        independently and are skipped when empty. Optimizer state is not
        restored from checkpoints.
    """
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*_args, **_kwargs):
            # Accept keyword arguments too (e.g. print(..., end="")) and
            # avoid shadowing the outer ``args``.
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.arch))
    netG = moco.builder.MaskGenerator()
    netD = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                             args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(netG)
    print(netD)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            netG.cuda(args.gpu)
            netD.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            netG = torch.nn.parallel.DistributedDataParallel(
                netG, device_ids=[args.gpu], find_unused_parameters=True)
            netD = torch.nn.parallel.DistributedDataParallel(
                netD, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            netG.cuda()
            netD.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set
            netG = torch.nn.parallel.DistributedDataParallel(netG)
            netD = torch.nn.parallel.DistributedDataParallel(netD)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        netG = netG.cuda(args.gpu)
        netD = netD.cuda(args.gpu)
    # else: the AllGather implementation (batch shuffle, queue update, etc.)
    # only supports DistributedDataParallel; the NotImplementedError that
    # would normally be raised here is disabled to allow debugging on CPU.

    optimizer_g = torch.optim.SGD(netG.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    optimizer_d = torch.optim.SGD(netD.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    G_criterion = nn.L1Loss().cuda(args.gpu)

    def _resume(net, path):
        """Load model weights for ``net`` from ``path`` if the file exists."""
        if not os.path.isfile(path):
            print("=> no checkpoint found at '{}'".format(path))
            return
        print("=> loading checkpoint '{}'".format(path))
        if args.gpu is None:
            checkpoint = torch.load(path)
        else:
            # Map model to be loaded to specified single gpu.
            checkpoint = torch.load(
                path, map_location='cuda:{}'.format(args.gpu))
        args.start_epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            path, checkpoint['epoch']))

    def _tagged(prefix):
        """Append the hyper-parameter suffix used for log / checkpoint dirs."""
        return "{}_bs={}_lr={}_cs={}_fpc={}".format(
            prefix, args.batch_size, args.lr, args.crop_size,
            args.frame_per_clip)

    # optionally resume from checkpoints (netD first, then netG; the last one
    # loaded determines args.start_epoch)
    if args.resume:
        _resume(netD, args.resume)
    if args.resumeG:
        _resume(netG, args.resumeG)

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    video_augmentation = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size, (0.2, 1)),
    ])
    audio_augmentation = moco.loader.DummyAudioTransform()
    augmentation = {'video': video_augmentation, 'audio': audio_augmentation}
    if args.aug_plus:
        augmentation_gpu = moco.loader.MoCoAugmentV2(args.crop_size)
    else:
        augmentation_gpu = moco.loader.MoCoAugment(args.crop_size)

    train_dataset = Kinetics400(traindir,
                                args.frame_per_clip,
                                args.step_between_clips,
                                extensions='mp4',
                                transform=augmentation,
                                num_workers=4)
    train_sampler = RandomClipSampler(train_dataset.video_clips, 1)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler,
        drop_last=True,
        multiprocessing_context="fork")

    # Only the process on GPU 0 of a multiprocessing-distributed run logs.
    if args.multiprocessing_distributed and args.gpu == 0:
        writer = SummaryWriter(_tagged(args.log_dir))
    else:
        writer = None

    try:
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer_d, epoch, args)
            adjust_learning_rate(optimizer_g, epoch, args)

            # train for one epoch
            train(train_loader, augmentation_gpu, criterion, G_criterion,
                  netG, netD, optimizer_g, optimizer_d, epoch, args, writer)

            # Save every 10th epoch, from one process per node only.
            is_saver = (not args.multiprocessing_distributed
                        or args.rank % ngpus_per_node == 0)
            if (epoch + 1) % 10 == 0 and is_saver:
                ckp_dir = _tagged(args.ckp_dir)
                save_checkpoint(epoch, {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': netG.state_dict(),
                },
                                ckp_dir + '/netG',
                                max_save=20,
                                is_best=False)
                save_checkpoint(epoch, {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': netD.state_dict(),
                },
                                ckp_dir + '/netD',
                                max_save=20,
                                is_best=False)
    finally:
        # Flush and release the event file even if training raises.
        if writer is not None:
            writer.close()
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for supervised training / evaluation on UCF101.

    Builds the backbone named by ``args.arch``, replaces its ``fc`` layer,
    optionally loads MoCo pre-trained encoder weights, wraps the model for
    (distributed) data-parallel execution, optionally resumes from a
    checkpoint, builds the UCF101 train / val loaders and then either runs a
    single validation pass (``args.evaluate``) or the full train / validate
    loop with checkpointing after every epoch.

    Args:
        gpu: Device index for this process (may be ``None``); stored on
            ``args.gpu``.
        ngpus_per_node: Number of GPUs per node; used to derive the global
            rank and to split batch size / workers across processes.
        args: Parsed options namespace. This function mutates ``args.gpu``,
            ``args.rank``, ``args.batch_size``, ``args.workers`` and
            ``args.start_epoch``.

    Side effects:
        Updates the module-level ``best_acc1``.
    """
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*_args, **_kwargs):
            # Accept keyword arguments too (e.g. print(..., end="")) and
            # avoid shadowing the outer ``args``.
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)

    # NOTE: freezing of all layers except the final fc is disabled here, so
    # every parameter with requires_grad=True is optimized below.

    # init the fc layer
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            encoder_prefix = "module.encoder_q."
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if (k.startswith('module.encoder_q')
                        and not k.startswith('module.encoder_q.fc')):
                    # remove prefix
                    state_dict[k[len(encoder_prefix):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            # Only the freshly initialised classifier may be missing.
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available
        # GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            # The trailing .cuda() is deliberately omitted for debugging on
            # CPU.
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize every parameter that still requires gradients
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU; it may
            # also be a plain number, which has no .to() method.
            if args.gpu is not None and torch.is_tensor(best_acc1):
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    normalize_video = transforms_video.NormalizeVideo(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {
        'video': video_augmentation_train,
        'audio': audio_augmentation
    }
    val_augmentation = {
        'video': video_augmentation_val,
        'audio': audio_augmentation
    }

    train_dataset = UCF101(data_dir,
                           anno_dir,
                           args.frame_per_clip,
                           args.step_between_clips,
                           fold=1,
                           train=True,
                           transform=train_augmentation,
                           num_workers=16)
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler,
        multiprocessing_context="fork")

    val_dataset = UCF101(data_dir,
                         anno_dir,
                         args.frame_per_clip,
                         args.step_between_clips,
                         fold=1,
                         train=False,
                         transform=val_augmentation,
                         num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing
    # iteration process
    val_sampler = UniformClipSampler(val_dataset.video_clips,
                                     args.clip_per_video)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.clip_per_video,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
        sampler=val_sampler,
        multiprocessing_context="fork")

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    def _tagged(prefix):
        """Append the hyper-parameter suffix used for log / checkpoint dirs."""
        return "{}_bs={}_lr={}_cs={}_fpc={}".format(
            prefix, args.batch_size, args.lr, args.crop_size,
            args.frame_per_clip)

    # Only the process on GPU 0 of a multiprocessing-distributed run logs.
    if args.multiprocessing_distributed and args.gpu == 0:
        writer = SummaryWriter(_tagged(args.log_dir))
    else:
        writer = None

    try:
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch, args)

            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, args,
                  writer)

            # evaluate on validation set
            val_loss, acc1, acc5 = validate(val_loader, model, criterion,
                                            args)
            if writer is not None:
                writer.add_scalar('lincls_val/loss', val_loss, epoch)
                writer.add_scalar('lincls_val/acc1', acc1, epoch)
                writer.add_scalar('lincls_val/acc5', acc5, epoch)

            # remember best acc@1 and save checkpoint
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)

            # Save from one process per node only.
            if (not args.multiprocessing_distributed
                    or args.rank % ngpus_per_node == 0):
                ckp_dir = _tagged(args.ckp_dir)
                save_checkpoint(epoch, {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                },
                                ckp_dir,
                                max_save=1,
                                is_best=is_best)
    finally:
        # Flush and release the event file even if training raises.
        if writer is not None:
            writer.close()