Code example #1
# Imports needed to run this snippet. import_labels, generate_output and
# to_categorical are helpers from the original project and are assumed to be
# available in scope.
import json
import os
import random

import h5py
import numpy as np
from progressbar import ProgressBar


def create_stateful_dataset(video_features_file,
                            videos_info,
                            labels,
                            output_path,
                            batch_size,
                            timesteps,
                            subset=None):
    features_size = 4096
    output_size = 201

    f_video_features = h5py.File(video_features_file, 'r')
    output_file = os.path.join(output_path, 'dataset_stateful.hdf5')
    f_dataset = h5py.File(output_file, 'w')

    if not subset:
        subsets = ['training', 'validation']
    else:
        subsets = [subset]

    with open(labels, 'r') as f:
        labels = import_labels(f)

    with open(videos_info, 'r') as f:
        videos_data = json.load(f)

    for subset in subsets:
        videos = [
            k for k in videos_data.keys() if videos_data[k]['subset'] == subset
        ]
        videos = list(set(videos) & set(f_video_features.keys()))
        random.shuffle(videos)

        nb_videos = len(videos)
        print('Number of videos for {} subset: {}'.format(subset, nb_videos))

        # Greedily distribute the videos across the batch_size parallel
        # sequences: each video is appended to the sequence that currently
        # holds the fewest clips, keeping the sequence lengths balanced.
        sequence_stack = [[] for _ in range(batch_size)]
        nb_clips_stack = np.zeros(batch_size, dtype=np.int64)
        accumulative_clips_stack = [[] for _ in range(batch_size)]

        for video_id in videos:
            min_pos = np.argmin(nb_clips_stack)
            sequence_stack[min_pos].append(video_id)
            nb_clips_stack[min_pos] += f_video_features[video_id].shape[0]
            accumulative_clips_stack[min_pos].append(nb_clips_stack[min_pos])

        min_sequence = np.min(nb_clips_stack)
        max_sequence = np.max(nb_clips_stack)
        nb_batches_long = max_sequence // timesteps + 1
        nb_batches = min_sequence // timesteps
        print('Number of batches: {}'.format(nb_batches))

        video_features = np.zeros(
            (nb_batches_long * batch_size * timesteps, features_size))
        output = np.zeros(
            (nb_batches_long * batch_size * timesteps, output_size))
        index = np.arange(nb_batches_long * batch_size * timesteps)

        progbar = ProgressBar(max_value=batch_size)
        print('Creating stateful dataset for {} subset'.format(subset))

        for i in range(batch_size):
            # Select the positions that fall into row i of each stateful batch
            batch_index = (index // timesteps) % batch_size == i
            progbar.update(i)

            pos = 0
            for video_id in sequence_stack[i]:
                # Video features
                vid_features = f_video_features[video_id][...]
                assert vid_features.shape[1] == features_size
                nb_instances = vid_features.shape[0]

                # Output
                output_classes = generate_output(videos_data[video_id], labels)
                assert nb_instances == len(output_classes)

                video_index = index[batch_index][pos:pos + nb_instances]
                video_features[video_index, :] = vid_features
                output[video_index] = to_categorical(output_classes,
                                                     nb_classes=output_size)

                pos += nb_instances

        progbar.finish()

        video_features = video_features[:nb_batches * batch_size *
                                        timesteps, :]
        assert np.all(np.any(video_features, axis=1))
        video_features = video_features.reshape(
            (nb_batches * batch_size, timesteps, features_size))

        output = output[:nb_batches * batch_size * timesteps, :]
        assert np.all(np.any(output, axis=1))
        output = output.reshape(
            (nb_batches * batch_size, timesteps, output_size))

        if subset == 'training':
            # Down-weight background clips (class 0) so the abundant
            # background class does not dominate the training loss
            background_weight = 0.6
            sample_weights = np.ones(output.shape[:2])
            sample_weights[output[:, :, 0] == 1] = background_weight
        f_dataset_subset = f_dataset.create_group(subset)

        f_dataset_subset.create_dataset('vid_features',
                                        data=video_features,
                                        chunks=(4, timesteps, features_size),
                                        dtype='float32')
        f_dataset_subset.create_dataset('output',
                                        data=output,
                                        chunks=(batch_size, timesteps,
                                                output_size),
                                        dtype='float32')
        if subset == 'training':
            f_dataset_subset.create_dataset('sample_weight',
                                            data=sample_weights,
                                            chunks=(batch_size, timesteps),
                                            dtype='float32')

    f_dataset.close()
    f_video_features.close()
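
For context, a minimal sketch of how this function might be invoked follows. The paths reuse file names that appear in the other snippets; the batch and timestep values are placeholders, not values taken from the original project.

# Hypothetical driver; the paths and hyperparameters are placeholders.
create_stateful_dataset(
    video_features_file='data/video_features.hdf5',  # per-video C3D features
    videos_info='dataset/videos.json',               # subset metadata per video
    labels='dataset/labels.txt',                     # one class label per line
    output_path='data/',
    batch_size=256,
    timesteps=20,
)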
Code example #2
# Imports needed to run this snippet. import_labels, video_to_array,
# get_num_frames, get_duration, C3D_conv_features,
# temporal_localization_network, get_classification, smoothing and
# activity_localization are helpers from the original project and are assumed
# to be available in scope.
import numpy as np


def run_all_pipeline(input_video, smoothing_k, activity_threshold):
    input_size = (112, 112)  # spatial resolution expected by C3D
    length = 16              # frames per C3D clip

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('Reading Video...')
    video_array = video_to_array(input_video, resize=input_size)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    nb_clips = nb_frames // length
    # (channels, frames, h, w) -> (frames, channels, h, w)
    video_array = video_array.transpose(1, 0, 2, 3)
    # Drop the trailing frames that do not fill a complete 16-frame clip
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    # (clips, frames, channels, h, w) -> (clips, channels, frames, h, w)
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Load C3D model and mean
    print('Loading C3D network...')
    model = C3D_conv_features(True)
    model.compile(optimizer='sgd', loss='mse')
    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
    mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

    # Extract features
    print('Extracting features...')
    X = video_array - mean
    Y = model.predict(X, batch_size=1, verbose=1)

    # Load the temporal localization network
    print('Loading temporal localization network...')
    model_localization = temporal_localization_network(True)
    model_localization.compile(optimizer='rmsprop',
                               loss='categorical_crossentropy')

    # Predict with the temporal localization network
    print('Predicting...')
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)

    # Post-process the predicted output
    print('Post-processing output...')
    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold)

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end,
                                                       label))
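
A hypothetical invocation of the full pipeline is sketched below. The video path is a placeholder; the smoothing and threshold values match the ones hard-coded in code example #4.

# Hypothetical call; the video path is a placeholder. smoothing_k=5 and
# activity_threshold=0.2 are the values hard-coded in code example #4.
run_all_pipeline('data/videos/example.mp4',
                 smoothing_k=5,
                 activity_threshold=0.2)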
Code example #3
# Imports needed to run this snippet. import_labels, get_classification,
# smoothing and activity_localization are helpers from the original project
# and are assumed to be available in scope.
import copy
import json
import os

import h5py
from progressbar import ProgressBar


def process_prediction(experiment_id, predictions_path, output_path,
                       smoothing_k, activity_threshold, subset=None):
    clip_length = 16.  # frames per clip, as a float for the time conversion

    if subset is None:
        subsets = ['validation', 'testing']
    else:
        subsets = [subset]

    predictions_file = os.path.join(
        predictions_path,
        'predictions_{experiment_id}.hdf5'.format(experiment_id=experiment_id)
    )

    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)
    with open('dataset/videos.json', 'r') as f:
        videos_info = json.load(f)

    f_predictions = h5py.File(predictions_file, 'r')
    for subset in subsets:
        print('Generating results for {} subset...'.format(subset))
        subset_predictions = f_predictions[subset]

        progbar = ProgressBar(max_value=len(subset_predictions.keys()))
        with open('dataset/templates/results_{}.json'.format(subset), 'r') as f:
            results_classification = json.load(f)
        results_detection = copy.deepcopy(results_classification)

        count = 0
        progbar.update(0)
        for video_id in subset_predictions.keys():
            prediction = subset_predictions[video_id][...]
            video_info = videos_info[video_id]
            fps = float(video_info['num_frames']) / video_info['duration']
            nb_clips = prediction.shape[0]

            # Post-processing to obtain the classification
            labels_idx, scores = get_classification(prediction, k=5)
            result_classification = []
            for idx, score in zip(labels_idx, scores):
                label = labels[idx]
                if score > 0:
                    result_classification.append({
                        'score': score,
                        'label': label
                    })
            results_classification['results'][video_id] = result_classification

            # Post-processing to obtain the detection
            prediction_smoothed = smoothing(prediction, k=smoothing_k)
            activities_idx, startings, endings, scores = activity_localization(
                prediction_smoothed,
                activity_threshold
            )
            result_detection = []
            for idx, s, e, score in zip(activities_idx, startings, endings, scores):
                label = labels[idx]
                result_detection.append({
                    'score': score,
                    'segment': [
                        s * clip_length / fps,
                        e * clip_length / fps
                    ],
                    'label': label
                })
            results_detection['results'][video_id] = result_detection

            count += 1
            progbar.update(count)
        progbar.finish()

        classification_output_file = os.path.join(
            output_path,
            'results_classification_{}_{}.json'.format(experiment_id, subset)
        )
        detection_output_file = os.path.join(
            output_path,
            'results_detection_{}_{}.json'.format(experiment_id, subset)
        )
        with open(classification_output_file, 'w') as f:
            json.dump(results_classification, f)
        with open(detection_output_file, 'w') as f:
            json.dump(results_detection, f)

    f_predictions.close()
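
The shape of the JSON this function writes can be read off the code above; a sketch of a detection entry follows. The values are illustrative, and the top-level fields other than 'results' come from the template files and are omitted.

# Sketch of the detection output structure, inferred from the code above.
example_results_detection = {
    'results': {
        'some_video_id': [
            {'score': 0.87,
             'segment': [12.3, 45.6],  # [start, end] in seconds
             'label': 'some_activity'},
        ],
    },
}
# Classification entries are identical, minus the 'segment' field.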
Code example #4
# Imports needed to run this snippet. import_labels, video_to_array,
# get_num_frames, get_duration, get_classification, smoothing and
# activity_localization are helpers from the original project and are assumed
# to be available in scope.
import time
from collections import defaultdict

# Module-level store for the timing results. A defaultdict of lists keeps the
# snippet self-contained; the original project may define this differently.
runtime_measures = defaultdict(list)


def run_runtime_tests(input_video, model_features, c3d_mean, model_localization):
    input_size = (112, 112)
    length = 16

    # Set up the post-processing parameters
    smoothing_k = 5
    activity_threshold = 0.2

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('')
    print('#' * 50)
    print(input_video)
    print('Reading Video...')
    t_s = time.time()
    video_array = video_to_array(input_video, resize=input_size)
    t_e = time.time()
    print('Loading Video: {:.2f}s'.format(t_e - t_s))
    runtime_measures['load_video'].append(t_e - t_s)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    runtime_measures['video_duration'].append(duration)
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    nb_clips = nb_frames // length
    # (channels, frames, h, w) -> (frames, channels, h, w)
    video_array = video_array.transpose(1, 0, 2, 3)
    # Drop the trailing frames that do not fill a complete 16-frame clip
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    # (clips, frames, channels, h, w) -> (clips, channels, frames, h, w)
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Extract features
    print('Extracting features...')
    t_s = time.time()
    X = video_array - c3d_mean
    Y = model_features.predict(X, batch_size=1, verbose=1)
    t_e = time.time()
    print('Extracting C3D features: {:.2f}s'.format(t_e - t_s))
    runtime_measures['extract_features_c3d'].append(t_e - t_s)

    # Predict with the temporal localization network
    print('Predicting...')
    t_s = time.time()
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)
    t_e = time.time()
    print('Predicting temporal activities: {:.2f}s'.format(t_e - t_s))
    runtime_measures['temporal_localization_network'].append(t_e - t_s)

    # Post-process the predicted output
    print('Post-processing output...')
    t_s = time.time()

    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed,
        activity_threshold
    )
    t_e = time.time()
    runtime_measures['post-processing'].append(t_e - t_s)
    print('Post-processing runtime: {:.2f}s'.format(t_e - t_s))

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end, label))
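
A hypothetical driver around run_runtime_tests is sketched below. The model and mean construction mirrors code example #2; the video paths and the way the timings are summarized are assumptions.

# Hypothetical driver; the video paths are placeholders. Building the models
# and the channel mean mirrors code example #2.
import numpy as np

model_features = C3D_conv_features(True)
model_features.compile(optimizer='sgd', loss='mse')
mean_total = np.load('data/models/c3d-sports1M_mean.npy')
c3d_mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

model_localization = temporal_localization_network(True)
model_localization.compile(optimizer='rmsprop', loss='categorical_crossentropy')

for video_path in ['data/videos/a.mp4', 'data/videos/b.mp4']:
    run_runtime_tests(video_path, model_features, c3d_mean, model_localization)

# Average runtime per pipeline stage across all processed videos
for stage, times in runtime_measures.items():
    print('{}: {:.2f}s on average'.format(stage, sum(times) / len(times)))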