Beispiel #1
0
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2',
        'somethingv1'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']
    if 'somethingv1' in dataset_name:
        formatter = sthv1_image_name_formatter
    elif 'somethingv2' in dataset_name:
        formatter = sthv2_image_name_formatter
    else:
        formatter = image_name_formatter
    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / '{}.hdf5'.format(video_id))

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
def get_validation_data(video_path,
                        annotation_path,
                        dataset_name,
                        input_type,
                        file_type,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
        if dataset_name == 'activitynet':
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / f'v_{video_id}')
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        validation_data = ActivityNet(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)
    else:
        validation_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)

    return validation_data, collate_fn
Beispiel #3
0
def get_inference_data(video_path,
                       annotation_path,
                       dataset_name,
                       input_type,
                       file_type,
                       inference_subset,
                       spatial_transform=None,
                       temporal_transform=None,
                       target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'vggsound'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']
    assert inference_subset in ['train', 'val', 'test']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if inference_subset == 'train':
        subset = 'training'
    elif inference_subset == 'val':
        subset = 'validation'
    elif inference_subset == 'test':
        subset = 'testing'

    inference_data = VideoDatasetMultiClips(
        video_path,
        annotation_path,
        subset,
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        target_transform=target_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter,
        target_type=['video_id', 'segment'])

    return inference_data, collate_fn
Beispiel #4
0
def get_training_av_data(video_path,
                         audio_path,
                         annotation_path,
                         dataset_name,
                         input_type,
                         file_type,
                         spatial_transform=None,
                         temporal_transform=None,
                         target_transform=None):

    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'vggsound'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb' and dataset_name != 'kinetics':
            loader = VideoLoaderHDF5()
        elif input_type == 'rgb' and dataset_name == 'kinetics':
            loader = VideoLoaderHDF5_kinetics()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')
        audio_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.npy')

    training_data = AudioVideoDataset(
        video_path,
        audio_path,
        annotation_path,
        'training',
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        target_transform=target_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter,
        audio_path_formatter=audio_path_formatter)
    return training_data
Beispiel #5
0
def get_validation_data(video_path,
                        annotation_path,
                        sample_duration,
                        use_alternative_label,
                        video_path_formatter,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')

    # TODO allow a more general way to specify validation or test
    # Right now, with validation we mean 'test set', so if in training you have
    # a 'real' validation set, this won't work, since you should load data with subset 'testing'
    validation_data = VideoDatasetMultiClips(
        video_path,
        annotation_path,
        'validation',
        sample_duration,
        use_alternative_label,
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        target_transform=target_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter)

    return validation_data, collate_fn
Beispiel #6
0
def get_inference_data(video_path,
                       annotation_path,
                       dataset_name,
                       file_type,
                       inference_subset,
                       spatial_transform=None,
                       temporal_transform=None,
                       target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert file_type in ['jpg', 'hdf5']
    assert inference_subset in ['train', 'val', 'test']

    if file_type == 'jpg':
        loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')
        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        loader = VideoLoaderHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if inference_subset == 'train':
        subset = 'training'
    elif inference_subset == 'val':
        subset = 'validation'
    elif inference_subset == 'test':
        subset = 'testing'
    if dataset_name == 'activitynet':
        inference_data = ActivityNet(video_path,
                                     annotation_path,
                                     subset,
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter,
                                     is_untrimmed_setting=True)
    else:
        inference_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            subset,
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter,
            target_type=['video_id', 'segment'])

    return inference_data, collate_fn
def get_inference_data(video_path,
                       input_type,
                       file_type,
                       spatial_transform=None,
                       temporal_transform=None):
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, video_name: root_path / video_name)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

#     inference_data = VideoDataset(video_path,
#                                   spatial_transform=spatial_transform,
#                                   video_loader=loader,
#                                   video_path_formatter=video_path_formatter)
    inference_data = VideoDatasetMultiClips(
        video_path,
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        video_loader=loader,
        video_path_formatter=video_path_formatter)

    return inference_data, collate_fn
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
Beispiel #9
0
def get_validation_data(video_path,
                        annotation_path,
                        dataset_name,
                        file_type,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')
        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        loader = VideoLoaderHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        validation_data = ActivityNet(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)
    else:
        validation_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)

    return validation_data, collate_fn
Beispiel #10
0
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None,
                      sample_t_stride=1):
    assert dataset_name in [
        'kinetics', 'mini_kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit',
        'breakfast', 'mini_breakfast', 'movingmnist',
        'movingmnist_blackframes', 'movingmnist_longterm',
        'movingmnist_motiondiff', 'movingmnist_motionsame',
        'movingmnist_frequencies', 'movingmnist_frequencies_complex',
        'something', 'movingmnist_static'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5', None]

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if 'movingmnist' in dataset_name:
            image_name_formatter = mnist_image_name_formatter
        elif 'something' in dataset_name:
            image_name_formatter = something_image_name_formatter
        else:
            image_name_formatter = usual_image_name_formatter

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)

        if 'movingmnist' in dataset_name or 'something' in dataset_name:
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / video_id)

    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()

        if dataset_name in ['kinetics', 'mini_kinetics']:
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}')
        else:
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}.hdf5')
    print("video_path_formatter", video_path_formatter)

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)

    elif dataset_name in ['kinetics', 'mini_kinetics']:
        training_data = VideoDataset(Path(
            os.path.join(video_path, "h5_train_frames")),
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    else:
        print("Building VideoDataset for", dataset_name)
        #print(spatial_transform)
        #print(temporal_transform)
        #print(loader)
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
Beispiel #11
0
def get_inference_data(video_path,
                       annotation_path,
                       dataset_name,
                       input_type,
                       file_type,
                       inference_subset,
                       spatial_transform=None,
                       temporal_transform=None,
                       target_transform=None,
                       sample_t_stride=1):
    assert dataset_name in [
        'kinetics', 'mini_kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit',
        'breakfast', 'mini_breakfast', 'movingmnist',
        'movingmnist_blackframes', 'movingmnist_longterm',
        'movingmnist_motiondiff', 'movingmnist_motionsame',
        'movingmnist_frequencies', 'movingmnist_frequencies_complex',
        'something', 'movingmnist_static'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5', None]
    assert inference_subset in ['train', 'val', 'test']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if 'movingmnist' in dataset_name:
            image_name_formatter = mnist_image_name_formatter
        elif 'something' in dataset_name:
            image_name_formatter = something_image_name_formatter
        else:
            image_name_formatter = usual_image_name_formatter

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)

        if dataset_name in [
                'movingmnist', 'movingmnist_blackframes',
                'movingmnist_longterm', 'something'
        ]:
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id: root_path /
                                label / f'{video_id}.hdf5')

    if inference_subset == 'train':
        subset = 'training'
    elif inference_subset == 'val':
        subset = 'validation'
    elif inference_subset == 'test':
        subset = 'testing'
    if dataset_name == 'activitynet':
        inference_data = ActivityNet(video_path,
                                     annotation_path,
                                     subset,
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter,
                                     is_untrimmed_setting=True)

    else:
        inference_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            subset,
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter,
            target_type=['label', 'video_id', 'segment'])

    return inference_data, collate_fn
    def score(self):

        normalize = get_normalize_method(self.opt.mean, self.opt.std, self.opt.no_mean_norm,
                                         self.opt.no_std_norm)
        spatial_transform = [
            Resize(self.opt.sample_size),
            CenterCrop(self.opt.sample_size),
            ToTensor()
        ]

        spatial_transform.extend([ScaleValue(self.opt.value_scale), normalize])
        spatial_transform = Compose(spatial_transform)

        temporal_transform = []
        if self.opt.sample_t_stride > 1:
            temporal_transform.append(TemporalSubsampling(self.opt.sample_t_stride))
        temporal_transform.append(
            TemporalEvenCrop(self.opt.sample_duration, self.opt.n_val_samples))
        temporal_transform = TemporalCompose(temporal_transform)


        frame_count = get_n_frames(self.opt.video_jpgs_dir_path)

        frame_indices = list(range(0, frame_count))

        frame_indices = temporal_transform(frame_indices)

        spatial_transform.randomize_parameters()

        image_name_formatter = lambda x: f'image_{x:05d}.jpg'

        loader = VideoLoader(image_name_formatter)

        print('frame_indices', frame_indices)

        #clips = []
        video_outputs = []
        model = generate_model(self.opt)


        model = load_pretrained_model(model, self.opt.pretrain_path, self.opt.model,
                                      self.opt.n_finetune_classes)

        i =0
        for frame_indice in frame_indices:
            print("%d indice: %s" % (i, str(frame_indice)))
            i+=1

            clip = loader(self.opt.video_jpgs_dir_path, frame_indice)



            clip = [spatial_transform(img) for img in clip]
            clip = torch.stack(clip, 0).permute(1, 0, 2, 3)





            #parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)


            #print('clips:', clips)


            #for clip in clips:
            with torch.no_grad():

                print(clip.shape)
                output = model(torch.unsqueeze(clip, 0))
                output = F.softmax(output, dim=1).cpu()

                #print(output)
                video_outputs.append(output[0])

            del clip

        video_outputs = torch.stack(video_outputs)
        average_scores = torch.mean(video_outputs, dim=0)

        #inference_loader, inference_class_names = main.get_inference_utils(self.opt)
        with self.opt.annotation_path.open('r') as f:
            data = json.load(f)

        class_to_idx = get_class_labels(data)
        idx_to_class = {}
        for name, label in class_to_idx.items():
            idx_to_class[label] = name
        print(idx_to_class)

        inference_result = inference.get_video_results(
            average_scores, idx_to_class, self.opt.output_topk)

        print(inference_result)