import copy
import json
import os
import time

import h5py
import numpy as np
from progressbar import ProgressBar  # progressbar2 package

# Helper functions such as import_labels, video_to_array, get_num_frames,
# get_duration, C3D_conv_features, temporal_localization_network,
# get_classification, smoothing and activity_localization are provided by this
# repository's own modules; their import paths are omitted here.


def run_all_pipeline(input_video, smoothing_k, activity_threshold):
    input_size = (112, 112)
    length = 16

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('Reading Video...')
    video_array = video_to_array(input_video, resize=input_size)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Split the video into non-overlapping 16-frame clips shaped for C3D:
    # (nb_clips, 3, length, 112, 112)
    nb_clips = nb_frames // length
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Load C3D model and mean
    print('Loading C3D network...')
    model = C3D_conv_features(True)
    model.compile(optimizer='sgd', loss='mse')
    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
    mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

    # Extract features
    print('Extracting features...')
    X = video_array - mean
    Y = model.predict(X, batch_size=1, verbose=1)

    # Load the temporal localization network
    print('Loading temporal localization network...')
    model_localization = temporal_localization_network(True)
    model_localization.compile(optimizer='rmsprop', loss='categorical_crossentropy')

    # Predict with the temporal localization network
    print('Predicting...')
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)

    # Post-process the predicted output
    print('Post-processing output...')
    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold
    )

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end, label))
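# Hypothetical command-line entry point for run_all_pipeline. This is a minimal
# sketch, not part of the original script: the flag names (--video,
# --smoothing-k, --activity-threshold) and their defaults are assumptions
# chosen to mirror the function's parameters.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Run the full classification/detection pipeline on a video')
    parser.add_argument('-i', '--video', dest='input_video', required=True,
                        help='path to the input video file')
    parser.add_argument('--smoothing-k', type=int, default=5,
                        help='window size used to smooth the clip predictions')
    parser.add_argument('--activity-threshold', type=float, default=0.2,
                        help='minimum score for a clip to count as an activity')
    args = parser.parse_args()

    run_all_pipeline(args.input_video, args.smoothing_k, args.activity_threshold)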
def process_prediction(experiment_id, predictions_path, output_path,
                       smoothing_k, activity_threshold, subset=None):
    clip_length = 16.

    if subset is None:
        subsets = ['validation', 'testing']
    else:
        subsets = [subset]

    predictions_file = os.path.join(
        predictions_path,
        'predictions_{experiment_id}.hdf5'.format(experiment_id=experiment_id)
    )

    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)
    with open('dataset/videos.json', 'r') as f:
        videos_info = json.load(f)

    f_predictions = h5py.File(predictions_file, 'r')
    for subset in subsets:
        print('Generating results for {} subset...'.format(subset))
        subset_predictions = f_predictions[subset]
        progbar = ProgressBar(max_value=len(subset_predictions.keys()))

        with open('dataset/templates/results_{}.json'.format(subset), 'r') as f:
            results_classification = json.load(f)
        results_detection = copy.deepcopy(results_classification)

        count = 0
        progbar.update(0)
        for video_id in subset_predictions.keys():
            prediction = subset_predictions[video_id][...]
            video_info = videos_info[video_id]
            fps = float(video_info['num_frames']) / video_info['duration']
            nb_clips = prediction.shape[0]

            # Post-processing to obtain the classification
            labels_idx, scores = get_classification(prediction, k=5)
            result_classification = []
            for idx, score in zip(labels_idx, scores):
                label = labels[idx]
                if score > 0:
                    result_classification.append({
                        'score': score,
                        'label': label
                    })
            results_classification['results'][video_id] = result_classification

            # Post-processing to obtain the detection
            prediction_smoothed = smoothing(prediction, k=smoothing_k)
            activities_idx, startings, endings, scores = activity_localization(
                prediction_smoothed, activity_threshold
            )
            result_detection = []
            for idx, s, e, score in zip(activities_idx, startings, endings, scores):
                label = labels[idx]
                result_detection.append({
                    'score': score,
                    'segment': [
                        s * clip_length / fps,
                        e * clip_length / fps
                    ],
                    'label': label
                })
            results_detection['results'][video_id] = result_detection

            count += 1
            progbar.update(count)

        progbar.finish()

        classification_output_file = os.path.join(
            output_path,
            'results_classification_{}_{}.json'.format(experiment_id, subset)
        )
        detection_output_file = os.path.join(
            output_path,
            'results_detection_{}_{}.json'.format(experiment_id, subset)
        )

        with open(classification_output_file, 'w') as f:
            json.dump(results_classification, f)
        with open(detection_output_file, 'w') as f:
            json.dump(results_detection, f)

    f_predictions.close()
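# Example invocation of process_prediction (commented out so it does not run on
# import). The experiment id and paths are placeholders: predictions are read
# from <predictions_path>/predictions_<experiment_id>.hdf5 and one
# classification and one detection JSON file per subset are written to
# <output_path>, as in the function above.
#
# process_prediction(
#     experiment_id='my_experiment',
#     predictions_path='path/to/predictions',
#     output_path='path/to/results',
#     smoothing_k=5,
#     activity_threshold=0.2,
#     subset='validation',
# )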
def run_runtime_tests(input_video, model_features, c3d_mean, model_localization):
    # Timings for each stage are appended to the module-level `runtime_measures`
    # dict (stage name -> list of seconds), which is assumed to be set up by the
    # caller (see the driver sketch below).
    input_size = (112, 112)
    length = 16

    # Post-processing parameters
    smoothing_k = 5
    activity_threshold = 0.2

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('')
    print('#' * 50)
    print(input_video)

    print('Reading Video...')
    t_s = time.time()
    video_array = video_to_array(input_video, resize=input_size)
    t_e = time.time()
    print('Loading Video: {:.2f}s'.format(t_e - t_s))
    runtime_measures['load_video'].append(t_e - t_s)

    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    runtime_measures['video_duration'].append(duration)
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    nb_clips = nb_frames // length
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Extract features
    print('Extracting features...')
    t_s = time.time()
    X = video_array - c3d_mean
    Y = model_features.predict(X, batch_size=1, verbose=1)
    t_e = time.time()
    print('Extracting C3D features: {:.2f}s'.format(t_e - t_s))
    runtime_measures['extract_features_c3d'].append(t_e - t_s)

    # Predict with the temporal localization network
    print('Predicting...')
    t_s = time.time()
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)
    t_e = time.time()
    print('Prediction temporal activities: {:.2f}s'.format(t_e - t_s))
    runtime_measures['temporal_localization_network'].append(t_e - t_s)

    # Post-process the predicted output
    print('Post-processing output...')
    t_s = time.time()
    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold
    )
    t_e = time.time()
    runtime_measures['post-processing'].append(t_e - t_s)
    print('Post-processing runtime: {:.2f}s'.format(t_e - t_s))

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end, label))
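# Minimal driver sketch for the runtime tests, assuming the same model and mean
# files used by run_all_pipeline above. run_runtime_benchmark, video_paths and
# the summary printing are hypothetical additions; the stage names match the
# keys that run_runtime_tests appends to.
from collections import defaultdict

# Module-level accumulator read by run_runtime_tests (stage name -> list of seconds).
runtime_measures = defaultdict(list)


def run_runtime_benchmark(video_paths):
    # Load the C3D feature extractor and its mean once, as in run_all_pipeline.
    model_features = C3D_conv_features(True)
    model_features.compile(optimizer='sgd', loss='mse')
    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
    c3d_mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

    # Load the temporal localization network once.
    model_localization = temporal_localization_network(True)
    model_localization.compile(optimizer='rmsprop', loss='categorical_crossentropy')

    for video_path in video_paths:
        run_runtime_tests(video_path, model_features, c3d_mean, model_localization)

    # Report the mean runtime of each measured stage.
    print('\nMean runtimes over {} videos:'.format(len(video_paths)))
    for stage in ('load_video', 'extract_features_c3d',
                  'temporal_localization_network', 'post-processing'):
        values = runtime_measures[stage]
        if values:
            print('{}: {:.2f}s'.format(stage, sum(values) / len(values)))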