def run_all_pipeline(input_video, smoothing_k, activity_threshold):
    """Classify and temporally localize activities in one video, printing results.

    The video is read at 112x112, cut into consecutive 16-frame clips
    (frames that do not fill a final clip are dropped), passed through the
    C3D feature network and then the temporal localization network. The
    top-5 classification and the detected activity intervals are printed
    to stdout; nothing is returned.

    Args:
        input_video: Video reference handed to ``video_to_array``,
            ``get_num_frames`` and ``get_duration`` (presumably a file
            path -- confirm against those helpers).
        smoothing_k: Forwarded as ``k`` to ``smoothing``.
        activity_threshold: Forwarded to ``activity_localization``.

    Raises:
        Exception: If ``video_to_array`` returns ``None``.
    """
    frame_size = (112, 112)
    clip_len = 16

    # Labels are looked up by the class indices returned by post-processing.
    with open('dataset/labels.txt', 'r') as labels_file:
        labels = import_labels(labels_file)

    print('Reading Video...')
    video_array = video_to_array(input_video, resize=frame_size)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Bring axis 1 to the front (assumed to be the frame axis -- TODO
    # confirm with video_to_array), keep only whole clips, then regroup
    # into (clip, 3, frame-in-clip, 112, 112).
    nb_clips = nb_frames // clip_len
    usable_frames = nb_clips * clip_len
    video_array = video_array.transpose(1, 0, 2, 3)[:usable_frames]
    video_array = video_array.reshape(nb_clips, clip_len, 3, 112, 112)
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Feature network plus the stored mean, averaged over every axis
    # except axis 1 so it broadcasts against the clip batch.
    print('Loading C3D network...')
    feature_model = C3D_conv_features(True)
    feature_model.compile(optimizer='sgd', loss='mse')
    stored_mean = np.load('data/models/c3d-sports1M_mean.npy')
    mean = np.mean(stored_mean, axis=(0, 2, 3, 4), keepdims=True)

    print('Extracting features...')
    features = feature_model.predict(
        video_array - mean, batch_size=1, verbose=1)

    print('Loading temporal localization network...')
    localization_model = temporal_localization_network(True)
    localization_model.compile(
        optimizer='rmsprop', loss='categorical_crossentropy')

    # One 4096-wide feature vector per clip in, one 201-wide output row
    # per clip out.
    print('Predicting...')
    prediction = localization_model.predict(
        features.reshape(nb_clips, 1, 4096), batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)

    print('Post-processing output...')
    top_idx, top_scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for class_idx, class_score in zip(top_idx, top_scores):
        print('{:.4f}\t{}'.format(class_score, labels[class_idx]))

    smoothed = smoothing(prediction, k=smoothing_k)
    det_idx, det_starts, det_ends, det_scores = activity_localization(
        smoothed, activity_threshold)

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    clip_len_f = float(clip_len)
    for class_idx, first, last, det_score in zip(
            det_idx, det_starts, det_ends, det_scores):
        # Clip indices -> seconds: index * frames-per-clip / fps.
        start = first * clip_len_f / fps
        end = last * clip_len_f / fps
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(
            det_score, start, end, labels[class_idx]))
def run_all_pipeline(input_video, smoothing_k, activity_threshold):
    """Run the end-to-end activity recognition pipeline on a single video.

    Stages, in order: load labels, read the video resized to 112x112,
    split it into 16-frame clips (an incomplete trailing clip is dropped),
    extract C3D features, run the temporal localization network, then
    print the top-5 classification and the detected activity intervals.
    The function only prints; it returns ``None``.

    Args:
        input_video: Passed unchanged to ``video_to_array``,
            ``get_num_frames`` and ``get_duration`` (presumably a path to
            the video -- verify against those helpers).
        smoothing_k: Value given to ``smoothing`` as its ``k`` argument.
        activity_threshold: Second argument of ``activity_localization``.

    Raises:
        Exception: When ``video_to_array`` yields ``None``.
    """
    input_size = (112, 112)
    length = 16

    def clips_to_seconds(clip_index):
        # Same expression order as the inline computation it replaces.
        return clip_index * float(length) / fps

    with open('dataset/labels.txt', 'r') as handle:
        labels = import_labels(handle)

    print('Reading Video...')
    video_array = video_to_array(input_video, resize=input_size)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Reorder so the first axis can be truncated to a whole number of
    # clips, regroup into clips, then swap axes 1 and 2 of the clip batch.
    nb_clips = nb_frames // length
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length]
    video_array = video_array.reshape(
        (nb_clips, length, 3, 112, 112)
    ).transpose(0, 2, 1, 3, 4)

    print('Loading C3D network...')
    c3d = C3D_conv_features(True)
    c3d.compile(optimizer='sgd', loss='mse')
    # Collapse the stored mean over all axes but axis 1 (keepdims keeps it
    # broadcastable against the clip batch).
    mean = np.mean(
        np.load('data/models/c3d-sports1M_mean.npy'),
        axis=(0, 2, 3, 4),
        keepdims=True,
    )

    print('Extracting features...')
    centered = video_array - mean
    c3d_out = c3d.predict(centered, batch_size=1, verbose=1)

    print('Loading temporal localization network...')
    localizer = temporal_localization_network(True)
    localizer.compile(optimizer='rmsprop', loss='categorical_crossentropy')

    print('Predicting...')
    c3d_out = c3d_out.reshape(nb_clips, 1, 4096)
    prediction = localizer.predict(c3d_out, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)

    print('Post-processing output...')
    ranked_idx, ranked_scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for i, sc in zip(ranked_idx, ranked_scores):
        print('{:.4f}\t{}'.format(sc, labels[i]))

    detections = activity_localization(
        smoothing(prediction, k=smoothing_k),
        activity_threshold
    )
    activities_idx, startings, endings, scores = detections

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    for i, s, e, sc in zip(activities_idx, startings, endings, scores):
        start = clips_to_seconds(s)
        end = clips_to_seconds(e)
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(
            sc, start, end, labels[i]))
def process_prediction(experiment_id, predictions_path, output_path, smoothing_k, activity_threshold, subset=None):
    """Convert stored per-video predictions into classification/detection JSON.

    Reads ``predictions_<experiment_id>.hdf5`` from ``predictions_path`` and,
    for every requested subset, fills a copy of the template
    ``dataset/templates/results_<subset>.json`` with per-video results.
    Two files are written to ``output_path`` per subset:
    ``results_classification_<experiment_id>_<subset>.json`` and
    ``results_detection_<experiment_id>_<subset>.json``.

    Args:
        experiment_id: Identifier used in the input and output file names.
        predictions_path: Directory containing the predictions HDF5 file.
        output_path: Directory where the result JSON files are written.
        smoothing_k: Forwarded as ``k`` to ``smoothing``.
        activity_threshold: Forwarded to ``activity_localization``.
        subset: Name of a single subset to process; when ``None`` both
            ``'validation'`` and ``'testing'`` are processed.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Frames per clip, kept as a float so segment times are floats.
    clip_length = 16.

    subsets = ['validation', 'testing'] if subset is None else [subset]

    predictions_file = os.path.join(
        predictions_path,
        'predictions_{experiment_id}.hdf5'.format(experiment_id=experiment_id)
    )

    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)
    with open('dataset/videos.json', 'r') as f:
        videos_info = json.load(f)

    # The context manager closes the HDF5 file even if processing a video
    # or writing an output file raises.
    with h5py.File(predictions_file, 'r') as f_predictions:
        for subset_name in subsets:
            print('Generating results for {} subset...'.format(subset_name))
            subset_predictions = f_predictions[subset_name]
            video_ids = list(subset_predictions.keys())

            progbar = ProgressBar(max_value=len(video_ids))
            template_file = 'dataset/templates/results_{}.json'.format(
                subset_name)
            with open(template_file, 'r') as f:
                results_classification = json.load(f)
            # Detection results start from the same template, independently.
            results_detection = copy.deepcopy(results_classification)

            progbar.update(0)
            for count, video_id in enumerate(video_ids, 1):
                prediction = subset_predictions[video_id][...]
                video_info = videos_info[video_id]
                fps = float(video_info['num_frames']) / video_info['duration']

                # Post processing to obtain the classification; only
                # strictly positive scores are reported.
                labels_idx, scores = get_classification(prediction, k=5)
                result_classification = []
                for idx, score in zip(labels_idx, scores):
                    label = labels[idx]
                    if score > 0:
                        result_classification.append({
                            'score': score,
                            'label': label
                        })
                results_classification['results'][video_id] = (
                    result_classification)

                # Post processing to obtain the detection
                prediction_smoothed = smoothing(prediction, k=smoothing_k)
                (activities_idx, startings,
                 endings, scores) = activity_localization(
                    prediction_smoothed,
                    activity_threshold
                )
                result_detection = []
                for idx, s, e, score in zip(
                        activities_idx, startings, endings, scores):
                    result_detection.append({
                        'score': score,
                        # Clip indices -> seconds: index * 16 / fps.
                        'segment': [
                            s * clip_length / fps,
                            e * clip_length / fps
                        ],
                        'label': labels[idx]
                    })
                results_detection['results'][video_id] = result_detection

                progbar.update(count)
            progbar.finish()

            classification_output_file = os.path.join(
                output_path,
                'results_classification_{}_{}.json'.format(
                    experiment_id, subset_name)
            )
            detection_output_file = os.path.join(
                output_path,
                'results_detection_{}_{}.json'.format(
                    experiment_id, subset_name)
            )
            with open(classification_output_file, 'w') as f:
                json.dump(results_classification, f)
            with open(detection_output_file, 'w') as f:
                json.dump(results_detection, f)
# Example #4
# 0
def run_runtime_tests(input_video, model_features, c3d_mean, model_localization):
    """Run the pipeline on one video with preloaded models, timing each stage.

    Elapsed wall-clock times are printed and appended to the module-level
    ``runtime_measures`` mapping under the keys ``'load_video'``,
    ``'video_duration'``, ``'extract_features_c3d'``,
    ``'temporal_localization_network'`` and ``'post-processing'``.
    Post-processing uses a fixed smoothing ``k`` of 5 and an activity
    threshold of 0.2. The classification and detections are printed;
    nothing is returned.

    Args:
        input_video: Video reference handed to ``video_to_array``,
            ``get_num_frames`` and ``get_duration`` (presumably a file
            path -- confirm against those helpers).
        model_features: Model whose ``predict`` produces the clip features.
        c3d_mean: Value subtracted from the clip batch before feature
            extraction.
        model_localization: Model whose ``predict`` produces the per-clip
            output rows.

    Raises:
        Exception: If ``video_to_array`` returns ``None``.
    """
    frame_size = (112, 112)
    clip_len = 16

    # Fixed post-processing settings for the runtime benchmark.
    smoothing_k = 5
    activity_threshold = .2

    with open('dataset/labels.txt', 'r') as labels_file:
        labels = import_labels(labels_file)

    print('')
    print('#'*50)
    print(input_video)
    print('Reading Video...')
    started = time.time()
    video_array = video_to_array(input_video, resize=frame_size)
    elapsed = time.time() - started
    print('Loading Video: {:.2f}s'.format(elapsed))
    runtime_measures['load_video'].append(elapsed)
    # The load time is recorded before the failure check.
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    runtime_measures['video_duration'].append(duration)
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Keep only whole clips and regroup into (clip, 3, frame-in-clip, 112, 112).
    nb_clips = nb_frames // clip_len
    video_array = video_array.transpose(1, 0, 2, 3)[:nb_clips * clip_len]
    video_array = video_array.reshape(nb_clips, clip_len, 3, 112, 112)
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Mean subtraction is included in the feature-extraction timing.
    print('Extracting features...')
    started = time.time()
    features = model_features.predict(
        video_array - c3d_mean, batch_size=1, verbose=1)
    elapsed = time.time() - started
    print('Extracting C3D features: {:.2f}s'.format(elapsed))
    runtime_measures['extract_features_c3d'].append(elapsed)

    print('Predicting...')
    started = time.time()
    prediction = model_localization.predict(
        features.reshape(nb_clips, 1, 4096), batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)
    elapsed = time.time() - started
    print('Prediction temporal activities: {:.2f}s'.format(elapsed))
    runtime_measures['temporal_localization_network'].append(elapsed)

    # The post-processing timing covers the classification printout too.
    print('Post-processing output...')
    started = time.time()

    top_idx, top_scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for class_idx, class_score in zip(top_idx, top_scores):
        print('{:.4f}\t{}'.format(class_score, labels[class_idx]))

    smoothed = smoothing(prediction, k=smoothing_k)
    det_idx, det_starts, det_ends, det_scores = activity_localization(
        smoothed,
        activity_threshold
    )
    elapsed = time.time() - started
    runtime_measures['post-processing'].append(elapsed)
    print('Post-processing runtime: {:.2f}s'.format(elapsed))

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    clip_len_f = float(clip_len)
    for class_idx, first, last, det_score in zip(
            det_idx, det_starts, det_ends, det_scores):
        # Clip indices -> seconds: index * frames-per-clip / fps.
        start = first * clip_len_f / fps
        end = last * clip_len_f / fps
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(
            det_score, start, end, labels[class_idx]))