# Dataset-construction helpers. The functions below are variants collected
# from different entry points of this codebase; the imports below follow the
# usual repo layout (datasets/loader.py, datasets/videodataset.py, ...) and
# are assumed. Variant-specific names (e.g. AudioVideoDataset,
# VideoLoaderHDF5_kinetics, mnist_image_name_formatter) are expected to be
# importable from their corresponding modules.
import json
import os
from pathlib import Path

import torch
import torch.nn.functional as F
from torchvision import get_image_backend

from datasets.activitynet import ActivityNet
from datasets.loader import (VideoLoader, VideoLoaderHDF5,
                             VideoLoaderFlowHDF5)
from datasets.videodataset import VideoDataset
from datasets.videodataset_multiclips import (VideoDatasetMultiClips,
                                              collate_fn)


def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2',
        'somethingv1'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    # Something-Something uses its own frame-naming schemes.
    if 'somethingv1' in dataset_name:
        formatter = sthv1_image_name_formatter
    elif 'somethingv2' in dataset_name:
        formatter = sthv2_image_name_formatter
    else:
        formatter = image_name_formatter

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data

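# Minimal usage sketch (not part of the original file; paths and DataLoader
# settings are illustrative assumptions). The returned object is a standard
# torch Dataset, so it drops straight into a DataLoader.
def _example_build_train_loader():
    import torch.utils.data

    training_data = get_training_data(Path('/data/ucf101/jpg'),
                                      Path('/data/ucf101/ucf101_01.json'),
                                      'ucf101', 'rgb', 'jpg')
    return torch.utils.data.DataLoader(training_data,
                                       batch_size=32,
                                       shuffle=True,
                                       num_workers=4)
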
def get_validation_data(video_path,
                        annotation_path,
                        dataset_name,
                        input_type,
                        file_type,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
        # ActivityNet jpg folders are stored flat under the root with a
        # 'v_' prefix instead of per-label subdirectories.
        if dataset_name == 'activitynet':
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / f'v_{video_id}')
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        validation_data = ActivityNet(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)
    else:
        validation_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)

    return validation_data, collate_fn

def get_inference_data(video_path,
                       annotation_path,
                       dataset_name,
                       input_type,
                       file_type,
                       inference_subset,
                       spatial_transform=None,
                       temporal_transform=None,
                       target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'vggsound'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']
    assert inference_subset in ['train', 'val', 'test']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if inference_subset == 'train':
        subset = 'training'
    elif inference_subset == 'val':
        subset = 'validation'
    else:  # 'test'
        subset = 'testing'

    inference_data = VideoDatasetMultiClips(
        video_path,
        annotation_path,
        subset,
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        target_transform=target_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter,
        target_type=['video_id', 'segment'])

    return inference_data, collate_fn

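# Usage sketch (illustrative assumptions only): the multi-clip datasets yield
# several clips per video, so the collate_fn returned alongside the dataset
# must be handed to the DataLoader to batch them correctly.
def _example_build_inference_loader():
    import torch.utils.data

    inference_data, collate = get_inference_data(
        Path('/data/kinetics/hdf5'), Path('/data/kinetics/kinetics.json'),
        'kinetics', 'rgb', 'hdf5', 'val')
    return torch.utils.data.DataLoader(inference_data,
                                       batch_size=1,
                                       shuffle=False,
                                       collate_fn=collate)
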
def get_training_av_data(video_path,
                         audio_path,
                         annotation_path,
                         dataset_name,
                         input_type,
                         file_type,
                         spatial_transform=None,
                         temporal_transform=None,
                         target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'vggsound'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        # Kinetics HDF5 files use a dedicated loader.
        if input_type == 'rgb' and dataset_name != 'kinetics':
            loader = VideoLoaderHDF5()
        elif input_type == 'rgb' and dataset_name == 'kinetics':
            loader = VideoLoaderHDF5_kinetics()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    # Audio features are expected as one .npy file per video.
    audio_path_formatter = (lambda root_path, label, video_id:
                            root_path / label / f'{video_id}.npy')

    training_data = AudioVideoDataset(
        video_path,
        audio_path,
        annotation_path,
        'training',
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        target_transform=target_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter,
        audio_path_formatter=audio_path_formatter)

    return training_data

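# Illustration of the on-disk layout the two formatters above imply. The root
# paths and ids here are made up; only the formatter logic mirrors the code.
def _example_av_layout():
    video_fmt = lambda root, label, vid: root / label / f'{vid}.hdf5'
    audio_fmt = lambda root, label, vid: root / label / f'{vid}.npy'
    assert str(video_fmt(Path('/data/vggsound'), 'dog_barking', 'abc123')) == \
        '/data/vggsound/dog_barking/abc123.hdf5'
    assert str(audio_fmt(Path('/data/vggsound_audio'), 'dog_barking', 'abc123')) == \
        '/data/vggsound_audio/dog_barking/abc123.npy'
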
def get_validation_data(video_path,
                        annotation_path,
                        sample_duration,
                        use_alternative_label,
                        video_path_formatter,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')

    # TODO: allow a more general way to specify validation vs. test.
    # 'validation' here actually means the test set; if training uses a real
    # validation split, this breaks, because the data should then be loaded
    # with subset 'testing'.
    validation_data = VideoDatasetMultiClips(
        video_path,
        annotation_path,
        'validation',
        sample_duration,
        use_alternative_label,
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        target_transform=target_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter)

    return validation_data, collate_fn

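# Usage sketch (assumed layout): unlike the other variants, this one lets the
# caller supply the path formatter, e.g. a flat layout without label folders.
def _example_validation_with_custom_formatter():
    fmt = lambda root_path, label, video_id: root_path / video_id
    validation_data, collate = get_validation_data(
        Path('/data/frames'),
        Path('/data/annotation.json'),
        sample_duration=16,
        use_alternative_label=False,
        video_path_formatter=fmt)
    return validation_data, collate
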
def get_inference_data(video_path,
                       annotation_path,
                       dataset_name,
                       file_type,
                       inference_subset,
                       spatial_transform=None,
                       temporal_transform=None,
                       target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert file_type in ['jpg', 'hdf5']
    assert inference_subset in ['train', 'val', 'test']

    if file_type == 'jpg':
        loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')
        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        loader = VideoLoaderHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if inference_subset == 'train':
        subset = 'training'
    elif inference_subset == 'val':
        subset = 'validation'
    else:  # 'test'
        subset = 'testing'

    if dataset_name == 'activitynet':
        inference_data = ActivityNet(video_path,
                                     annotation_path,
                                     subset,
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter,
                                     is_untrimmed_setting=True)
    else:
        inference_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            subset,
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter,
            target_type=['video_id', 'segment'])

    return inference_data, collate_fn

def get_inference_data(video_path,
                       input_type,
                       file_type,
                       spatial_transform=None,
                       temporal_transform=None):
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, video_name: root_path / video_name)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        # NOTE: unlike the jpg branch, this formatter takes
        # (root_path, label, video_id); callers must match the signature.
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    inference_data = VideoDatasetMultiClips(
        video_path,
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter)

    return inference_data, collate_fn

def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'
        loader = VideoLoader(image_name_formatter)
        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data

def get_validation_data(video_path,
                        annotation_path,
                        dataset_name,
                        file_type,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')
        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        loader = VideoLoaderHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        validation_data = ActivityNet(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)
    else:
        validation_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)

    return validation_data, collate_fn

def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None,
                      sample_t_stride=1):
    # NOTE: sample_t_stride is accepted for interface compatibility but is
    # not used here; temporal subsampling is handled by temporal_transform.
    assert dataset_name in [
        'kinetics', 'mini_kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit',
        'breakfast', 'mini_breakfast', 'movingmnist',
        'movingmnist_blackframes', 'movingmnist_longterm',
        'movingmnist_motiondiff', 'movingmnist_motionsame',
        'movingmnist_frequencies', 'movingmnist_frequencies_complex',
        'something', 'movingmnist_static'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5', None]

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        # Pick the frame-naming scheme for the dataset family.
        if 'movingmnist' in dataset_name:
            image_name_formatter = mnist_image_name_formatter
        elif 'something' in dataset_name:
            image_name_formatter = something_image_name_formatter
        else:
            image_name_formatter = usual_image_name_formatter

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
        # MovingMNIST and Something-Something store videos flat, without
        # per-label subdirectories.
        if 'movingmnist' in dataset_name or 'something' in dataset_name:
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        if dataset_name in ['kinetics', 'mini_kinetics']:
            # Kinetics HDF5 entries carry no .hdf5 extension here.
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}')
        else:
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    elif dataset_name in ['kinetics', 'mini_kinetics']:
        # Kinetics training frames live under an 'h5_train_frames' subfolder.
        training_data = VideoDataset(
            Path(os.path.join(video_path, 'h5_train_frames')),
            annotation_path,
            'training',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data

def get_inference_data(video_path,
                       annotation_path,
                       dataset_name,
                       input_type,
                       file_type,
                       inference_subset,
                       spatial_transform=None,
                       temporal_transform=None,
                       target_transform=None,
                       sample_t_stride=1):
    assert dataset_name in [
        'kinetics', 'mini_kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit',
        'breakfast', 'mini_breakfast', 'movingmnist',
        'movingmnist_blackframes', 'movingmnist_longterm',
        'movingmnist_motiondiff', 'movingmnist_motionsame',
        'movingmnist_frequencies', 'movingmnist_frequencies_complex',
        'something', 'movingmnist_static'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5', None]
    assert inference_subset in ['train', 'val', 'test']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when file type is hdf5.'

        if 'movingmnist' in dataset_name:
            image_name_formatter = mnist_image_name_formatter
        elif 'something' in dataset_name:
            image_name_formatter = something_image_name_formatter
        else:
            image_name_formatter = usual_image_name_formatter

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
        if dataset_name in [
                'movingmnist', 'movingmnist_blackframes',
                'movingmnist_longterm', 'something'
        ]:
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if inference_subset == 'train':
        subset = 'training'
    elif inference_subset == 'val':
        subset = 'validation'
    else:  # 'test'
        subset = 'testing'

    if dataset_name == 'activitynet':
        inference_data = ActivityNet(video_path,
                                     annotation_path,
                                     subset,
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter,
                                     is_untrimmed_setting=True)
    else:
        inference_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            subset,
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter,
            target_type=['label', 'video_id', 'segment'])

    return inference_data, collate_fn

def score(self):
    normalize = get_normalize_method(self.opt.mean, self.opt.std,
                                     self.opt.no_mean_norm,
                                     self.opt.no_std_norm)
    spatial_transform = Compose([
        Resize(self.opt.sample_size),
        CenterCrop(self.opt.sample_size),
        ToTensor(),
        ScaleValue(self.opt.value_scale),
        normalize
    ])

    temporal_transform = []
    if self.opt.sample_t_stride > 1:
        temporal_transform.append(
            TemporalSubsampling(self.opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(self.opt.sample_duration, self.opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    # Split the video's frame indices into evenly spaced clips.
    frame_count = get_n_frames(self.opt.video_jpgs_dir_path)
    frame_indices = temporal_transform(list(range(frame_count)))

    spatial_transform.randomize_parameters()

    loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')

    model = generate_model(self.opt)
    model = load_pretrained_model(model, self.opt.pretrain_path,
                                  self.opt.model,
                                  self.opt.n_finetune_classes)
    model.eval()  # inference mode; the original omitted this call

    # Run the model on each clip and collect per-clip softmax scores.
    video_outputs = []
    for i, clip_frame_indices in enumerate(frame_indices):
        print(f'{i} indices: {clip_frame_indices}')
        clip = loader(self.opt.video_jpgs_dir_path, clip_frame_indices)
        clip = [spatial_transform(img) for img in clip]
        clip = torch.stack(clip, 0).permute(1, 0, 2, 3)  # (C, T, H, W)

        with torch.no_grad():
            output = model(torch.unsqueeze(clip, 0))
            output = F.softmax(output, dim=1).cpu()
        video_outputs.append(output[0])
        del clip

    # Average clip scores into a single video-level score vector.
    video_outputs = torch.stack(video_outputs)
    average_scores = torch.mean(video_outputs, dim=0)

    # Map class indices back to names using the annotation file.
    with self.opt.annotation_path.open('r') as f:
        data = json.load(f)
    class_to_idx = get_class_labels(data)
    idx_to_class = {label: name for name, label in class_to_idx.items()}

    inference_result = inference.get_video_results(average_scores,
                                                   idx_to_class,
                                                   self.opt.output_topk)
    print(inference_result)

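# Self-contained sketch of the scoring core used above: per-clip softmax
# scores are averaged into one video-level vector, then the top-k classes are
# read off. The clip and class counts (10 and 400) are arbitrary stand-ins.
def _example_topk_from_clip_scores():
    clip_scores = torch.softmax(torch.rand(10, 400), dim=1)  # 10 clips
    average_scores = clip_scores.mean(dim=0)                 # video-level
    values, indices = torch.topk(average_scores, k=5)
    return values, indices
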