import copy
import json
import os
import random
import time

import h5py
import numpy as np
from keras.utils.np_utils import to_categorical  # Keras 1 API (nb_classes kwarg)
from progressbar import ProgressBar

# Helpers defined elsewhere in this repository (exact import paths omitted):
# import_labels, generate_output, video_to_array, get_num_frames,
# get_duration, C3D_conv_features, temporal_localization_network,
# get_classification, smoothing, activity_localization.


def create_stateful_dataset(video_features_file, videos_info, labels,
                            output_path, batch_size, timesteps, subset=None):
    features_size = 4096
    output_size = 201

    f_video_features = h5py.File(video_features_file, 'r')
    output_file = os.path.join(output_path, 'dataset_stateful.hdf5')
    f_dataset = h5py.File(output_file, 'w')

    if not subset:
        subsets = ['training', 'validation']
    else:
        subsets = [subset]

    with open(labels, 'r') as f:
        labels = import_labels(f)
    with open(videos_info, 'r') as f:
        videos_data = json.load(f)

    for subset in subsets:
        videos = [
            k for k in videos_data.keys()
            if videos_data[k]['subset'] == subset
        ]
        videos = list(set(videos) & set(f_video_features.keys()))
        random.shuffle(videos)
        nb_videos = len(videos)
        print('Number of videos for {} subset: {}'.format(subset, nb_videos))

        # Check how the videos are going to be placed: greedily assign each
        # video to the batch position with the fewest clips so far, so the
        # batch_size sequences stay balanced in length.
        sequence_stack = []
        for _ in range(batch_size):
            sequence_stack.append([])
        nb_clips_stack = np.zeros(batch_size).astype(np.int64)
        accumulative_clips_stack = []
        for _ in range(batch_size):
            accumulative_clips_stack.append([])
        for video_id in videos:
            min_pos = np.argmin(nb_clips_stack)
            sequence_stack[min_pos].append(video_id)
            nb_clips_stack[min_pos] += f_video_features[video_id].shape[0]
            accumulative_clips_stack[min_pos].append(nb_clips_stack[min_pos])

        min_sequence = np.min(nb_clips_stack)
        max_sequence = np.max(nb_clips_stack)
        nb_batches_long = max_sequence // timesteps + 1
        nb_batches = min_sequence // timesteps
        print('Number of batches: {}'.format(nb_batches))

        video_features = np.zeros(
            (nb_batches_long * batch_size * timesteps, features_size))
        output = np.zeros(
            (nb_batches_long * batch_size * timesteps, output_size))
        index = np.arange(nb_batches_long * batch_size * timesteps)

        progbar = ProgressBar(max_value=batch_size)
        print('Creating stateful dataset for {} subset'.format(subset))
        for i in range(batch_size):
            # Flat positions belonging to batch slot i, one timesteps-sized
            # chunk per batch, laid out round-robin across slots.
            batch_index = (index // timesteps) % batch_size == i
            progbar.update(i)
            pos = 0
            for video_id in sequence_stack[i]:
                # Video features
                vid_features = f_video_features[video_id][...]
                assert vid_features.shape[1] == features_size
                nb_instances = vid_features.shape[0]
                # Output
                output_classes = generate_output(videos_data[video_id], labels)
                assert nb_instances == len(output_classes)

                video_index = index[batch_index][pos:pos + nb_instances]
                video_features[video_index, :] = vid_features
                output[video_index] = to_categorical(
                    output_classes, nb_classes=output_size)
                pos += nb_instances
        progbar.finish()

        # Truncate to the shortest sequence so every batch is fully populated.
        video_features = video_features[:nb_batches * batch_size * timesteps, :]
        assert np.all(np.any(video_features, axis=1))
        video_features = video_features.reshape(
            (nb_batches * batch_size, timesteps, features_size))
        output = output[:nb_batches * batch_size * timesteps, :]
        assert np.all(np.any(output, axis=1))
        output = output.reshape(
            (nb_batches * batch_size, timesteps, output_size))

        if subset == 'training':
            # Down-weight the background class (index 0) during training.
            background_weight = 0.6
            sample_weights = np.ones(output.shape[:2])
            sample_weights[output[:, :, 0] == 1] = background_weight

        f_dataset_subset = f_dataset.create_group(subset)
        f_dataset_subset.create_dataset(
            'vid_features',
            data=video_features,
            chunks=(4, timesteps, features_size),
            dtype='float32')
        f_dataset_subset.create_dataset(
            'output',
            data=output,
            chunks=(batch_size, timesteps, output_size),
            dtype='float32')
        if subset == 'training':
            f_dataset_subset.create_dataset(
                'sample_weight',
                data=sample_weights,
                chunks=(batch_size, timesteps),
                dtype='float32')

    f_dataset.close()
    f_video_features.close()
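
# Usage sketch for create_stateful_dataset. The file locations and the
# batch_size/timesteps values below are assumptions (they must match the
# stateful model you plan to train); adjust them to your setup.
def example_build_stateful_dataset():
    create_stateful_dataset(
        video_features_file='data/video_features.hdf5',  # assumed path
        videos_info='dataset/videos.json',
        labels='dataset/labels.txt',
        output_path='data',
        batch_size=256,  # assumed; must equal the model's stateful batch size
        timesteps=20,    # assumed; must equal the model's sequence length
    )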

def run_all_pipeline(input_video, smoothing_k, activity_threshold):
    input_size = (112, 112)
    length = 16

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('Reading Video...')
    video_array = video_to_array(input_video, resize=input_size)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Split the video into 16-frame clips shaped (nb_clips, 3, length, 112, 112)
    nb_clips = nb_frames // length
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Load C3D model and mean
    print('Loading C3D network...')
    model = C3D_conv_features(True)
    model.compile(optimizer='sgd', loss='mse')
    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
    mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

    # Extract features
    print('Extracting features...')
    X = video_array - mean
    Y = model.predict(X, batch_size=1, verbose=1)

    # Load the temporal localization network
    print('Loading temporal localization network...')
    model_localization = temporal_localization_network(True)
    model_localization.compile(optimizer='rmsprop',
                               loss='categorical_crossentropy')

    # Predict with the temporal localization network
    print('Predicting...')
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)

    # Post-process the predicted output
    print('Post-processing output...')
    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold
    )

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(
            score, start, end, label))
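
# Example invocation of the end-to-end pipeline. The video path is
# hypothetical; smoothing_k and activity_threshold mirror the defaults used
# in run_runtime_tests below.
def example_run_pipeline():
    run_all_pipeline(
        input_video='data/videos/example.mp4',  # hypothetical path
        smoothing_k=5,
        activity_threshold=0.2,
    )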

def process_prediction(experiment_id, predictions_path, output_path,
                       smoothing_k, activity_threshold, subset=None):
    clip_length = 16.

    if subset is None:
        subsets = ['validation', 'testing']
    else:
        subsets = [subset]

    predictions_file = os.path.join(
        predictions_path,
        'predictions_{experiment_id}.hdf5'.format(experiment_id=experiment_id)
    )
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)
    with open('dataset/videos.json', 'r') as f:
        videos_info = json.load(f)

    f_predictions = h5py.File(predictions_file, 'r')
    for subset in subsets:
        print('Generating results for {} subset...'.format(subset))
        subset_predictions = f_predictions[subset]

        progbar = ProgressBar(max_value=len(subset_predictions.keys()))
        with open('dataset/templates/results_{}.json'.format(subset), 'r') as f:
            results_classification = json.load(f)
        results_detection = copy.deepcopy(results_classification)

        count = 0
        progbar.update(0)
        for video_id in subset_predictions.keys():
            prediction = subset_predictions[video_id][...]
            video_info = videos_info[video_id]
            fps = float(video_info['num_frames']) / video_info['duration']
            nb_clips = prediction.shape[0]

            # Post-processing to obtain the classification
            labels_idx, scores = get_classification(prediction, k=5)
            result_classification = []
            for idx, score in zip(labels_idx, scores):
                label = labels[idx]
                if score > 0:
                    result_classification.append({
                        'score': score,
                        'label': label
                    })
            results_classification['results'][video_id] = result_classification

            # Post-processing to obtain the detection
            prediction_smoothed = smoothing(prediction, k=smoothing_k)
            activities_idx, startings, endings, scores = activity_localization(
                prediction_smoothed, activity_threshold
            )
            result_detection = []
            for idx, s, e, score in zip(activities_idx, startings, endings,
                                        scores):
                label = labels[idx]
                result_detection.append({
                    'score': score,
                    'segment': [
                        s * clip_length / fps,
                        e * clip_length / fps
                    ],
                    'label': label
                })
            results_detection['results'][video_id] = result_detection

            count += 1
            progbar.update(count)
        progbar.finish()

        classification_output_file = os.path.join(
            output_path,
            'results_classification_{}_{}.json'.format(experiment_id, subset)
        )
        detection_output_file = os.path.join(
            output_path,
            'results_detection_{}_{}.json'.format(experiment_id, subset)
        )
        with open(classification_output_file, 'w') as f:
            json.dump(results_classification, f)
        with open(detection_output_file, 'w') as f:
            json.dump(results_detection, f)

    f_predictions.close()
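
# Usage sketch for process_prediction. The experiment id and directories are
# hypothetical; the predictions file is expected at
# <predictions_path>/predictions_<experiment_id>.hdf5.
def example_process_predictions():
    process_prediction(
        experiment_id='exp01',                # hypothetical id
        predictions_path='data/predictions',  # hypothetical directory
        output_path='data/results',           # hypothetical directory
        smoothing_k=5,
        activity_threshold=0.2,
        subset='validation',
    )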

# run_runtime_tests appends its timings to a module-level dict; the
# definition below is reconstructed from the keys the function uses.
runtime_measures = {
    'load_video': [],
    'video_duration': [],
    'extract_features_c3d': [],
    'temporal_localization_network': [],
    'post-processing': [],
}


def run_runtime_tests(input_video, model_features, c3d_mean,
                      model_localization):
    input_size = (112, 112)
    length = 16

    # Set up post-processing variables
    smoothing_k = 5
    activity_threshold = .2

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('')
    print('#' * 50)
    print(input_video)

    print('Reading Video...')
    t_s = time.time()
    video_array = video_to_array(input_video, resize=input_size)
    t_e = time.time()
    print('Loading Video: {:.2f}s'.format(t_e - t_s))
    runtime_measures['load_video'].append(t_e - t_s)
    if video_array is None:
        raise Exception('The video could not be read')

    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    runtime_measures['video_duration'].append(duration)
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    nb_clips = nb_frames // length
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Extract features
    print('Extracting features...')
    t_s = time.time()
    X = video_array - c3d_mean
    Y = model_features.predict(X, batch_size=1, verbose=1)
    t_e = time.time()
    print('Extracting C3D features: {:.2f}s'.format(t_e - t_s))
    runtime_measures['extract_features_c3d'].append(t_e - t_s)

    # Predict with the temporal localization network
    print('Predicting...')
    t_s = time.time()
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)
    t_e = time.time()
    print('Prediction temporal activities: {:.2f}s'.format(t_e - t_s))
    runtime_measures['temporal_localization_network'].append(t_e - t_s)

    # Post-process the predicted output
    print('Post-processing output...')
    t_s = time.time()
    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    print('Classification:')
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold
    )
    t_e = time.time()
    runtime_measures['post-processing'].append(t_e - t_s)
    print('Post-processing runtime: {:.2f}s'.format(t_e - t_s))

    print('\nDetection:')
    print('Score\tInterval\t\tActivity')
    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(
            score, start, end, label))
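
# Benchmark driver sketch: loads both networks and the C3D channel mean once
# (mirroring the loading code in run_all_pipeline) and reuses them across
# videos, so runtime_measures accumulates one measurement per stage per
# video. The video list is hypothetical.
def example_runtime_benchmark(videos):
    model_features = C3D_conv_features(True)
    model_features.compile(optimizer='sgd', loss='mse')
    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
    c3d_mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

    model_localization = temporal_localization_network(True)
    model_localization.compile(optimizer='rmsprop',
                               loss='categorical_crossentropy')

    for input_video in videos:
        run_runtime_tests(input_video, model_features, c3d_mean,
                          model_localization)

    # Report the mean of each recorded measure.
    for stage, values in runtime_measures.items():
        if values:
            print('{}: {:.2f}s (mean over {} videos)'.format(
                stage, float(sum(values)) / len(values), len(values)))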