import itertools
import os
import random
import shutil
from datetime import datetime

import numpy as np

# Project-local dependencies (exact import paths depend on the repository
# layout): AudioVisualDataset, AudioDataset, data_processor and
# VideoToSpeechNet are assumed to be importable in this module.
def list_data(
    dataset_dir,
    speaker_ids,
    noise_dirs,
    max_files=None,
    shuffle=True,
    augmentation_factor=1,
    oversample_noise=True,
):
    speech_dataset = AudioVisualDataset(dataset_dir)
    speech_subset = speech_dataset.subset(speaker_ids, max_files, shuffle)

    noise_dataset = AudioDataset(noise_dirs)
    noise_subset = noise_dataset.subset(max_files, shuffle)

    if not oversample_noise:
        # Truncate both lists to the shorter one so they pair up 1:1.
        n_files = min(len(speech_subset), len(noise_subset))
        speech_entries = speech_subset[:n_files]
        noise_entries = noise_subset[:n_files]
    else:
        # Cycle the (typically shorter) noise list so that every speech entry
        # gets a noise partner.
        speech_and_noise_entries = list(zip(speech_subset, itertools.cycle(noise_subset)))
        speech_entries, noise_entries = [list(x) for x in zip(*speech_and_noise_entries)]

    # Copy before extending: without the copies, += would mutate the very
    # lists being appended, doubling the data on every iteration instead of
    # adding exactly one repetition per iteration.
    all_speech_entries = list(speech_entries)
    all_noise_file_paths = list(noise_entries)

    for _ in range(augmentation_factor - 1):
        all_speech_entries += speech_entries
        all_noise_file_paths += random.sample(noise_entries, len(noise_entries))

    return all_speech_entries, all_noise_file_paths
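# A minimal, self-contained sketch (toy data, not part of the pipeline) of the
# two behaviors above: itertools.cycle wraps the shorter noise list around so
# every speech entry gets a partner, and augmentation appends one shuffled
# copy per extra factor.
def _list_data_sketch():
    speech = ["s1", "s2", "s3", "s4"]
    noise = ["n1", "n2"]

    pairs = list(zip(speech, itertools.cycle(noise)))
    # -> [('s1', 'n1'), ('s2', 'n2'), ('s3', 'n1'), ('s4', 'n2')]

    augmentation_factor = 2
    all_speech = list(speech)
    all_noise = list(noise)
    for _ in range(augmentation_factor - 1):
        all_speech += speech
        all_noise += random.sample(noise, len(noise))
    # len(all_speech) == augmentation_factor * len(speech)

    return pairs, all_speech, all_noise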
def list_source_pairs(dataset_dir, speakers):
    dataset = AudioVisualDataset(dataset_dir)

    # One shuffled subset of up to 20 files per speaker.
    subsets = [
        dataset.subset([speaker_id], max_files=20, shuffle=True)
        for speaker_id in speakers
    ]

    # Transpose: the i-th tuple holds the i-th audio path of every speaker.
    return zip(*[subset.audio_paths() for subset in subsets])
# Noise-mixing variant: pairs one speaker's clips with noise files (note that
# it shadows the multi-speaker variant above when both live in one module).
def list_source_pairs(dataset_dir, speaker_id, noise_dir):
    dataset = AudioVisualDataset(dataset_dir)
    speaker_file_paths = dataset.subset([speaker_id], max_files=20, shuffle=True).audio_paths()
    noise_file_paths = [os.path.join(noise_dir, f) for f in os.listdir(noise_dir)]

    random.shuffle(speaker_file_paths)
    random.shuffle(noise_file_paths)

    # zip truncates to the shorter list, so unmatched files are dropped.
    return zip(speaker_file_paths, noise_file_paths)
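# Toy illustration (not part of the pipeline) of the pairing logic in the two
# list_source_pairs variants: zip transposes equal-length per-speaker lists,
# and truncates speech/noise pairs to the shorter list.
def _source_pairs_sketch():
    speaker_a = ["a1.wav", "a2.wav", "a3.wav"]
    speaker_b = ["b1.wav", "b2.wav", "b3.wav"]
    noise = ["n1.wav", "n2.wav"]

    multi_speaker_pairs = list(zip(speaker_a, speaker_b))
    # -> [('a1.wav', 'b1.wav'), ('a2.wav', 'b2.wav'), ('a3.wav', 'b3.wav')]

    speaker_noise_pairs = list(zip(speaker_a, noise))
    # -> [('a1.wav', 'n1.wav'), ('a2.wav', 'n2.wav')]  (third clip dropped)

    return multi_speaker_pairs, speaker_noise_pairs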
def list_speakers(args):
    if args.speakers is None:
        dataset = AudioVisualDataset(args.dataset_dir)
        speaker_ids = dataset.list_speakers()
    else:
        speaker_ids = args.speakers

    if args.ignored_speakers is not None:
        # Filter instead of list.remove: remove raises ValueError when an id
        # is absent, and would also mutate args.speakers in place.
        speaker_ids = [s for s in speaker_ids if s not in args.ignored_speakers]

    return speaker_ids
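# Quick check of the filtering above, using a stand-in for the argparse
# namespace (attribute names taken from the functions in this module).
def _list_speakers_sketch():
    from types import SimpleNamespace

    args = SimpleNamespace(
        speakers=["s1", "s2", "s3"],
        ignored_speakers=["s2", "s9"],
        dataset_dir=None,  # unused when speakers are given explicitly
    )
    return list_speakers(args)  # -> ['s1', 's3']; absent 's9' is skipped safely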
def preprocess(args):
    speaker_ids = list_speakers(args)

    dataset = AudioVisualDataset(args.dataset_dir)
    for speaker_id in speaker_ids:
        data_subset = dataset.subset([speaker_id], shuffle=True)
        video_samples, audio_samples = data_processor.preprocess_data(data_subset)

        # np.savez appends ".npz", producing <preprocessed_dir>/<speaker_id>.npz.
        preprocessed_speaker_path = os.path.join(args.preprocessed_dir, speaker_id)
        np.savez(preprocessed_speaker_path,
                 video_samples=video_samples, audio_samples=audio_samples)
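# Loading side of the cache written by preprocess: np.savez appends ".npz" and
# np.load returns a dict-like NpzFile keyed by the keyword argument names. The
# load_preprocessed_samples helper referenced (commented out) in predict below
# presumably does something along these lines; this is a sketch, not its
# actual implementation.
def _load_preprocessed_sketch(preprocessed_dir, speaker_id):
    data = np.load(os.path.join(preprocessed_dir, speaker_id + ".npz"))
    return data["video_samples"], data["audio_samples"]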
def predict(args):
    speaker_ids = list_speakers(args)
    dataset = AudioVisualDataset(args.dataset_dir)

    prediction_output_dir = os.path.join(
        args.prediction_output_dir, '{:%Y-%m-%d_%H-%M-%S}'.format(datetime.now()))
    os.mkdir(prediction_output_dir)

    for speaker_id in speaker_ids:
        # Per-speaker fine-tuning, currently disabled (hence the model is
        # reloaded fresh for each speaker):
        # video_samples, audio_samples = load_preprocessed_samples(
        #     args.preprocessed_dir, [speaker_id], max_speaker_samples=800)
        # data_processor.apply_normalization(video_samples, args.normalization_cache)

        network = VideoToSpeechNet.load(args.model_cache, args.weights_cache)
        # network.fine_tune(video_samples, audio_samples)

        speaker_prediction_dir = os.path.join(prediction_output_dir, speaker_id)
        os.mkdir(speaker_prediction_dir)

        video_file_paths = dataset.subset([speaker_id], max_files=10).video_paths()
        for video_file_path in video_file_paths:
            try:
                video_sample = data_processor.preprocess_video_sample(video_file_path)
                data_processor.apply_normalization(video_sample, args.normalization_cache)

                predicted_audio_sample = network.predict(video_sample)

                sample_name = os.path.splitext(os.path.basename(video_file_path))[0]
                reconstructed_signal = data_processor.reconstruct_audio_signal(
                    predicted_audio_sample, sample_rate=44100)
                reconstructed_signal.save_to_wav_file(
                    os.path.join(speaker_prediction_dir, "%s.wav" % sample_name))

                # Keep the source video next to its reconstructed audio.
                shutil.copy(video_file_path, speaker_prediction_dir)

            except Exception as e:
                print("failed to preprocess %s (%s). skipping" % (video_file_path, e))
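# Hypothetical CLI wiring, inferred purely from the attributes these functions
# read off `args`; the repository's real entry point may differ.
def _build_arg_parser_sketch():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_dir", required=True)
    parser.add_argument("--speakers", nargs="+", default=None)
    parser.add_argument("--ignored_speakers", nargs="+", default=None)
    parser.add_argument("--preprocessed_dir")
    parser.add_argument("--prediction_output_dir")
    parser.add_argument("--model_cache")
    parser.add_argument("--weights_cache")
    parser.add_argument("--normalization_cache")
    return parser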