# Training entry point (TimeCycle variant of the torchvision video-classification
# reference script). Project-local helpers (utils, transforms `T`, Kinetics400,
# VideoList, the clip samplers, WarmupMultiStepLR, train_one_epoch, evaluate,
# collate_fn, _get_cache_path) are assumed importable from the surrounding repo.
import datetime
import os
import sys
import time

import torch
import torch.nn as nn
import torchvision

try:
    from apex import amp
except ImportError:
    amp = None


def main(args):
    if args.apex:
        if sys.version_info < (3, 0):
            raise RuntimeError(
                "Apex currently only supports Python 3. Aborting.")
        if amp is None:
            raise RuntimeError(
                "Failed to import apex. Please install apex from "
                "https://www.github.com/nvidia/apex "
                "to enable mixed-precision training.")

    if args.output_dir:
        utils.mkdir(args.output_dir)

    vis = utils.Visualize(args)

    utils.init_distributed_mode(args)
    print(args)
    print("torch version: ", torch.__version__)
    print("torchvision version: ", torchvision.__version__)

    device = torch.device(args.device)
    torch.backends.cudnn.benchmark = True

    # Data loading code
    print("Loading data")
    traindir = os.path.join(
        args.data_path, 'train_256' if not args.fast_test else 'val_256_bob')
    valdir = os.path.join(args.data_path, 'val_256_bob')
    normalize = T.Normalize(mean=[0.43216, 0.394666, 0.37645],
                            std=[0.22803, 0.22145, 0.216989])

    print("Loading training data")
    st = time.time()
    cache_path = _get_cache_path(traindir)

    frame_transform_train = utils.make_frame_transform(args.frame_transforms)

    transform_train = torchvision.transforms.Compose([
        frame_transform_train,
        T.ToFloatTensorInZeroOne(),
        T.Resize((256, 256)),
        normalize,
    ])

    def make_dataset(is_train):
        # NOTE: transform_test is defined further below; the closure only
        # evaluates it when make_dataset is called with is_train=False,
        # which happens after the definition.
        _transform = transform_train if is_train else transform_test

        if 'kinetics' in args.data_path.lower():
            return Kinetics400(
                traindir if is_train else valdir,
                frames_per_clip=args.clip_len,
                step_between_clips=1,
                transform=_transform,
                extensions=('mp4',),
                frame_rate=args.frame_skip)
        else:
            return VideoList(
                args, is_train,
                frame_gap=args.frame_skip,
                transform=_transform,
            )

    if args.cache_dataset and os.path.exists(cache_path):
        print("Loading dataset_train from {}".format(cache_path))
        dataset, _ = torch.load(cache_path)
        dataset.transform = transform_train
    else:
        if args.distributed:
            print("It is recommended to pre-compute the dataset cache "
                  "on a single-gpu first, as it will be faster")
        dataset = make_dataset(is_train=True)
        if args.cache_dataset:
            print("Saving dataset_train to {}".format(cache_path))
            utils.mkdir(os.path.dirname(cache_path))
            utils.save_on_master((dataset, traindir), cache_path)

    if hasattr(dataset, 'video_clips'):
        dataset.video_clips.compute_clips(args.clip_len, 1, frame_rate=15)

    print("Took", time.time() - st)

    print("Loading validation data")
    cache_path = _get_cache_path(valdir)

    transform_test = torchvision.transforms.Compose([
        T.ToFloatTensorInZeroOne(),
        T.Resize((256, 256)),
        normalize
    ])

    if args.cache_dataset and os.path.exists(cache_path):
        print("Loading dataset_test from {}".format(cache_path))
        dataset_test, _ = torch.load(cache_path)
        dataset_test.transform = transform_test
    else:
        if args.distributed:
            print("It is recommended to pre-compute the dataset cache "
                  "on a single-gpu first, as it will be faster")
        dataset_test = make_dataset(is_train=False)
        if args.cache_dataset:
            print("Saving dataset_test to {}".format(cache_path))
            utils.mkdir(os.path.dirname(cache_path))
            utils.save_on_master((dataset_test, valdir), cache_path)

    if hasattr(dataset_test, 'video_clips'):
        dataset_test.video_clips.compute_clips(args.clip_len, 1, frame_rate=15)

    def make_data_sampler(is_train, dataset):
        if hasattr(dataset, 'video_clips'):
            _sampler = RandomClipSampler if is_train else UniformClipSampler
            return _sampler(dataset.video_clips, args.clips_per_video)
        else:
            return torch.utils.data.sampler.RandomSampler(dataset) \
                if is_train else None

    print("Creating data loaders")
    train_sampler = make_data_sampler(True, dataset)
    test_sampler = make_data_sampler(False, dataset_test)

    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)
        test_sampler = DistributedSampler(test_sampler)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=args.batch_size,
        sampler=train_sampler, num_workers=args.workers,
        pin_memory=True, collate_fn=collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=args.batch_size,
        sampler=test_sampler, num_workers=args.workers,
        pin_memory=True, collate_fn=collate_fn)

    print("Creating model")

    import timecycle as tc

    model = tc.TimeCycle(args)
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    criterion = nn.CrossEntropyLoss()

    lr = args.lr * args.world_size  # scaled SGD rate; unused by the Adam optimizer below
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

    if args.apex:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.apex_opt_level)

    # convert scheduler to be per iteration, not per epoch, for warmup that lasts
    # between different epochs
    warmup_iters = args.lr_warmup_epochs * len(data_loader)
    lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
    lr_scheduler = WarmupMultiStepLR(
        optimizer, milestones=lr_milestones, gamma=args.lr_gamma,
        warmup_iters=warmup_iters, warmup_factor=1e-5)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    if args.data_parallel:
        model = torch.nn.parallel.DataParallel(model)
        model_without_ddp = model.module

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, criterion, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader,
                        device, epoch, args.print_freq, args.apex, vis=vis)
        # evaluate(model, criterion, data_loader_test, device=device)
        if args.output_dir:
            checkpoint = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args}
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'checkpoint.pth'))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
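# NOTE: `_get_cache_path` is used by both training scripts in this section but
# not defined here. A minimal sketch, mirroring the torchvision
# video-classification reference (the cache location under ~/.torch is an
# assumption in this context):
def _get_cache_path(filepath):
    import hashlib
    # Hash the dataset directory so each dataset gets a stable cache file.
    h = hashlib.sha1(filepath.encode()).hexdigest()
    cache_path = os.path.join(
        "~", ".torch", "vision", "datasets", "kinetics", h[:10] + ".pt")
    return os.path.expanduser(cache_path)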
# Reference supervised training script (torchvision video classification).
def main(args):
    if args.apex and amp is None:
        raise RuntimeError(
            "Failed to import apex. Please install apex from "
            "https://www.github.com/nvidia/apex "
            "to enable mixed-precision training.")

    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)
    print("torch version: ", torch.__version__)
    print("torchvision version: ", torchvision.__version__)

    device = torch.device(args.device)
    torch.backends.cudnn.benchmark = True

    # Data loading code
    print("Loading data")
    traindir = os.path.join(args.data_path, args.train_dir)
    valdir = os.path.join(args.data_path, args.val_dir)
    normalize = T.Normalize(mean=[0.43216, 0.394666, 0.37645],
                            std=[0.22803, 0.22145, 0.216989])

    print("Loading training data")
    st = time.time()
    cache_path = _get_cache_path(traindir)
    transform_train = torchvision.transforms.Compose([
        T.ToFloatTensorInZeroOne(),
        T.Resize((128, 171)),
        T.RandomHorizontalFlip(),
        normalize,
        T.RandomCrop((112, 112))
    ])

    if args.cache_dataset and os.path.exists(cache_path):
        print("Loading dataset_train from {}".format(cache_path))
        dataset, _ = torch.load(cache_path)
        dataset.transform = transform_train
    else:
        if args.distributed:
            print("It is recommended to pre-compute the dataset cache "
                  "on a single-gpu first, as it will be faster")
        dataset = torchvision.datasets.Kinetics400(
            traindir,
            frames_per_clip=args.clip_len,
            step_between_clips=1,
            transform=transform_train,
            frame_rate=15)
        if args.cache_dataset:
            print("Saving dataset_train to {}".format(cache_path))
            utils.mkdir(os.path.dirname(cache_path))
            utils.save_on_master((dataset, traindir), cache_path)

    print("Took", time.time() - st)

    print("Loading validation data")
    cache_path = _get_cache_path(valdir)
    transform_test = torchvision.transforms.Compose([
        T.ToFloatTensorInZeroOne(),
        T.Resize((128, 171)),
        normalize,
        T.CenterCrop((112, 112))
    ])

    if args.cache_dataset and os.path.exists(cache_path):
        print("Loading dataset_test from {}".format(cache_path))
        dataset_test, _ = torch.load(cache_path)
        dataset_test.transform = transform_test
    else:
        if args.distributed:
            print("It is recommended to pre-compute the dataset cache "
                  "on a single-gpu first, as it will be faster")
        dataset_test = torchvision.datasets.Kinetics400(
            valdir,
            frames_per_clip=args.clip_len,
            step_between_clips=1,
            transform=transform_test,
            frame_rate=15)
        if args.cache_dataset:
            print("Saving dataset_test to {}".format(cache_path))
            utils.mkdir(os.path.dirname(cache_path))
            utils.save_on_master((dataset_test, valdir), cache_path)

    print("Creating data loaders")
    train_sampler = RandomClipSampler(dataset.video_clips,
                                      args.clips_per_video)
    test_sampler = UniformClipSampler(dataset_test.video_clips,
                                      args.clips_per_video)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)
        test_sampler = DistributedSampler(test_sampler)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=args.batch_size,
        sampler=train_sampler, num_workers=args.workers,
        pin_memory=True, collate_fn=collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=args.batch_size,
        sampler=test_sampler, num_workers=args.workers,
        pin_memory=True, collate_fn=collate_fn)

    print("Creating model")
    model = torchvision.models.video.__dict__[args.model](
        pretrained=args.pretrained)
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    criterion = nn.CrossEntropyLoss()

    lr = args.lr * args.world_size
    optimizer = torch.optim.SGD(
        model.parameters(), lr=lr, momentum=args.momentum,
        weight_decay=args.weight_decay)

    if args.apex:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.apex_opt_level)

    # convert scheduler to be per iteration, not per epoch, for warmup that lasts
    # between different epochs
    warmup_iters = args.lr_warmup_epochs * len(data_loader)
    lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
    lr_scheduler = WarmupMultiStepLR(
        optimizer, milestones=lr_milestones, gamma=args.lr_gamma,
        warmup_iters=warmup_iters, warmup_factor=1e-5)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, criterion, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader,
                        device, epoch, args.print_freq, args.apex)
        evaluate(model, criterion, data_loader_test, device=device)
        if args.output_dir:
            checkpoint = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args}
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'checkpoint.pth'))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
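# NOTE: `collate_fn` is passed to the DataLoaders above but defined elsewhere.
# A minimal sketch, assuming each sample is a (video, audio, label) triple as
# returned by Kinetics400, with the audio stream dropped before batching:
from torch.utils.data.dataloader import default_collate


def collate_fn(batch):
    # Keep only (video, label) from each (video, audio, label) sample.
    batch = [(d[0], d[2]) for d in batch]
    return default_collate(batch)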
# I3D feature extraction (TensorFlow 1.x). Requires the `i3d` package from the
# kinetics-i3d repo and the module-level constants sketched below.
import numpy as np
import tensorflow as tf

import i3d


def feature_extractor():
    # Build the RGB I3D network and expose the average-pooled features.
    with tf.variable_scope('RGB'):
        net = i3d.InceptionI3d(400, spatial_squeeze=True,
                               final_endpoint='Logits')
        rgb_input = tf.placeholder(
            tf.float32,
            shape=(batch_size, _SAMPLE_VIDEO_FRAMES,
                   _IMAGE_SIZE, _IMAGE_SIZE, 3))
        _, end_points = net(rgb_input, is_training=False,
                            dropout_keep_prob=1.0)
        end_feature = end_points['avg_pool3d']
    sess = tf.Session()

    transform = torchvision.transforms.Compose([
        T.ToFloatTensorInZeroOne(),
        T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        T.Resize((224, 224)),
        T.CenterCrop((224, 224))
    ])

    # Restore the ImageNet-pretrained RGB checkpoint.
    rgb_variable_map = {}
    for variable in tf.global_variables():
        if variable.name.split('/')[0] == 'RGB':
            rgb_variable_map[variable.name.replace(':0', '')] = variable
    rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)
    rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])

    video_list = [name.strip() for name in open(VIDEO_PATH_FILE).readlines()]
    print('video_list', video_list)

    if not os.path.isdir(OUTPUT_FEAT_DIR):
        os.mkdir(OUTPUT_FEAT_DIR)

    print('Total number of videos: %d' % len(video_list))

    for cnt, video_name in enumerate(video_list):
        video_path = os.path.join(VIDEO_DIR, video_name + '.avi')
        feat_path = os.path.join(OUTPUT_FEAT_DIR, video_name + '.npy')

        if os.path.exists(feat_path):
            print('Feature file for video %s already exists.' % video_name)
            continue

        print('video_path', video_path)

        # Decode the video and resample it to 24 fps before preprocessing.
        vframes, _, info = torchvision.io.read_video(
            video_path, start_pts=0, end_pts=None, pts_unit='sec')
        vframes = T.frame_temporal_sampling(
            vframes, start_idx=0, end_idx=None,
            num_samples=int(round(len(vframes) / info['video_fps'] * 24)))
        # (C, T, H, W) -> (T, H, W, C) for TensorFlow.
        vframes = transform(vframes).permute(1, 2, 3, 0).numpy()

        n_frame = vframes.shape[0]
        print('Total frames: %d' % n_frame)

        features = []
        n_feat = int(n_frame // 8)  # one I3D feature per 8 input frames
        n_batch = n_feat // batch_size + 1
        print('n_frame: %d; n_feat: %d' % (n_frame, n_feat))
        print('n_batch: %d' % n_batch)

        for i in range(n_batch):
            input_blobs = []
            for j in range(batch_size):
                # Overlap successive snippets by 8 frames after the first batch.
                start_idx = i * batch_size * L + j * L
                if i != 0:
                    start_idx -= 8
                end_idx = min(n_frame, start_idx + L)
                input_blob = vframes[start_idx:end_idx].reshape(
                    -1, resize_w, resize_h, 3)
                input_blobs.append(input_blob)
            input_blobs = np.array(input_blobs, dtype='float32')

            clip_feature = sess.run(end_feature,
                                    feed_dict={rgb_input: input_blobs})
            clip_feature = np.reshape(clip_feature,
                                      (-1, clip_feature.shape[-1]))
            features.append(clip_feature)

        features = np.concatenate(features, axis=0)
        # features = features[:n_feat:2]  # 16 frames per feature (since a
        # 64-frame snippet corresponds to 8 features in I3D)

        print('Saving features and probs for video: %s ...' % video_name)
        np.save(feat_path, features)
        print('%d: %s has been processed...' % (cnt, video_name))
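# NOTE: `feature_extractor` relies on module-level constants defined elsewhere.
# A plausible configuration sketch; every value below is an assumption and the
# paths are hypothetical placeholders:
_IMAGE_SIZE = 224                    # spatial input size expected by I3D
_SAMPLE_VIDEO_FRAMES = 64            # frames per I3D input snippet (assumed)
L = _SAMPLE_VIDEO_FRAMES             # snippet length used when chunking a video
batch_size = 1                       # snippets fed per sess.run call (assumed)
resize_w = resize_h = _IMAGE_SIZE    # must match the transform's output size
VIDEO_DIR = '/path/to/videos'                    # hypothetical
VIDEO_PATH_FILE = '/path/to/video_list.txt'      # hypothetical
OUTPUT_FEAT_DIR = '/path/to/output_features'     # hypothetical
_CHECKPOINT_PATHS = {
    'rgb_imagenet': '/path/to/checkpoints/rgb_imagenet/model.ckpt',  # hypothetical
}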
# Ordinal-regression target construction and spatial transforms for a
# four-class task.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

categories = [0, 1, 2, 3]

if args.ordinal:
    # Convert each category to a cumulative multi-hot target.
    label_enc = LabelEncoder()
    label_enc.fit(categories)
    action_category = label_enc.transform(categories).reshape(-1, 1)
    enc = OneHotEncoder()
    enc.fit(action_category)

    train_label = train_label.reshape(-1, 1)
    train_label = np.cumsum(enc.transform(train_label).toarray(),
                            axis=1)[:, 0:3]
    test_label = test_label.reshape(-1, 1)
    test_label = np.cumsum(enc.transform(test_label).toarray(),
                           axis=1)[:, 0:3]

spatial_transform_train = torchvision.transforms.Compose([
    T.ToFloatTensorInZeroOne(),
    T.Resize((image_height, image_width)),
    T.RandomHorizontalFlip(),
    # Normalization is done after the data is loaded
])
spatial_transform_test = torchvision.transforms.Compose([
    T.ToFloatTensorInZeroOne(),
    T.Resize((image_height, image_width)),
    # Normalization is done after the data is loaded
])

print("============== Loading Data ==============")
print("Train {} videos".format(len(train_list)))
print("Test {} videos".format(len(test_list)))
train_set = MyVideoDataset('./new_video_data_clip', train_list, train_label,
                           n_frames=n_frames, fps=fps,
                           spatial_transform=spatial_transform_train,
                           random_slice_size=random_slice_size)
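# A quick worked example of the ordinal encoding above: one-hot encode,
# cumulative-sum along the class axis, then keep the first three columns, so
# that target entry j answers "is the label <= class j?":
#
#   >>> enc.transform([[2]]).toarray()
#   array([[0., 0., 1., 0.]])
#   >>> np.cumsum(enc.transform([[2]]).toarray(), axis=1)[:, 0:3]
#   array([[0., 0., 1.]])
#
# Label 0 -> [1, 1, 1], label 1 -> [0, 1, 1], label 2 -> [0, 0, 1],
# label 3 -> [0, 0, 0]: three sigmoid outputs cover four ordered levels.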