import itertools
import os
import random
import shutil
from datetime import datetime

import numpy as np

# Project-local modules (AudioVisualDataset, AudioDataset, data_processor,
# VideoToSpeechNet) are assumed to be imported elsewhere in this listing.


def list_data(dataset_dir,
              speaker_ids,
              noise_dirs,
              max_files=None,
              shuffle=True,
              augmentation_factor=1):
    speech_dataset = AudioVisualDataset(dataset_dir)
    speech_subset = speech_dataset.subset(speaker_ids, max_files, shuffle)

    noise_dataset = AudioDataset(noise_dirs)
    noise_file_paths = noise_dataset.subset(max_files, shuffle)

    n_files = min(len(speech_subset), len(noise_file_paths))

    speech_entries = speech_subset[:n_files]
    noise_file_paths = noise_file_paths[:n_files]

    # Copy before extending: "+=" mutates in place, so aliasing speech_entries
    # here would make each pass re-append an ever-growing list.
    all_speech_entries = list(speech_entries)
    all_noise_file_paths = list(noise_file_paths)

    for _ in range(augmentation_factor - 1):
        # Repeat the speech entries verbatim, but re-shuffle the noise pairing
        # on every pass.
        all_speech_entries += speech_entries
        all_noise_file_paths += random.sample(noise_file_paths,
                                              len(noise_file_paths))

    return all_speech_entries, all_noise_file_paths
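
# A minimal, self-contained sketch of the augmentation loop above, using toy
# lists in place of real dataset entries (the file names are illustrative).
# Copying before the loop is what keeps each pass from re-appending an
# ever-growing list to itself.
def _augmentation_demo():
    speech_entries = ["s1.wav", "s2.wav", "s3.wav"]
    noise_file_paths = ["n1.wav", "n2.wav", "n3.wav"]
    augmentation_factor = 3

    all_speech = list(speech_entries)
    all_noise = list(noise_file_paths)
    for _ in range(augmentation_factor - 1):
        # Speech repeats verbatim; noise is re-shuffled each pass, so the same
        # utterance gets mixed with different noise files.
        all_speech += speech_entries
        all_noise += random.sample(noise_file_paths, len(noise_file_paths))

    assert len(all_speech) == len(all_noise) == 9
    return list(zip(all_speech, all_noise))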
# Example 2
def list_data(
    dataset_dir,
    speaker_ids,
    noise_dirs,
    max_files=None,
    shuffle=True,
    augmentation_factor=1,
    oversample_noise=True,
):
    speech_dataset = AudioVisualDataset(dataset_dir)
    speech_subset = speech_dataset.subset(speaker_ids, max_files, shuffle)

    noise_dataset = AudioDataset(noise_dirs)
    noise_subset = noise_dataset.subset(max_files, shuffle)

    if not oversample_noise:
        n_files = min(len(speech_subset), len(noise_subset))
        speech_entries = speech_subset[:n_files]
        noise_entries = noise_subset[:n_files]
    else:
        # Cycle the noise files so that every speech entry gets a pairing.
        speech_and_noise_entries = list(
            zip(speech_subset, itertools.cycle(noise_subset)))
        speech_entries, noise_entries = [
            list(x) for x in zip(*speech_and_noise_entries)
        ]

    # As above, copy before extending so the loop does not re-append an
    # ever-growing aliased list.
    all_speech_entries = list(speech_entries)
    all_noise_file_paths = list(noise_entries)

    for _ in range(augmentation_factor - 1):
        all_speech_entries += speech_entries
        all_noise_file_paths += random.sample(noise_entries, len(noise_entries))

    return all_speech_entries, all_noise_file_paths
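
# A small, self-contained illustration of the oversample_noise pairing above:
# itertools.cycle repeats the noise list so that every speech entry gets a
# partner even when noise files are scarce. Toy values, illustrative only.
def _oversample_demo():
    speech = ["s1", "s2", "s3", "s4", "s5"]
    noise = ["n1", "n2"]
    # zip stops when the finite speech list is exhausted; noise wraps around.
    pairs = list(zip(speech, itertools.cycle(noise)))
    assert pairs == [("s1", "n1"), ("s2", "n2"), ("s3", "n1"),
                     ("s4", "n2"), ("s5", "n1")]
    return pairs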
def list_source_pairs(dataset_dir, speakers):
    dataset = AudioVisualDataset(dataset_dir)
    subsets = [
        dataset.subset([speaker_id], max_files=20, shuffle=True)
        for speaker_id in speakers
    ]

    return zip(*[subset.audio_paths() for subset in subsets])
def list_source_pairs(dataset_dir, speaker_id, noise_dir):
    dataset = AudioVisualDataset(dataset_dir)
    speaker_file_paths = dataset.subset([speaker_id],
                                        max_files=20,
                                        shuffle=True).audio_paths()
    noise_file_paths = [os.path.join(noise_dir, f) for f in os.listdir(noise_dir)]

    random.shuffle(speaker_file_paths)
    random.shuffle(noise_file_paths)

    return zip(speaker_file_paths, noise_file_paths)
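
# Note: the zip above pairs files only up to the shorter of the two shuffled
# lists; surplus files on either side are silently dropped. A tiny check:
def _pair_truncation_demo():
    pairs = list(zip(["a1", "a2", "a3"], ["n1", "n2"]))
    assert pairs == [("a1", "n1"), ("a2", "n2")]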
def list_speakers(args):
    if args.speakers is None:
        dataset = AudioVisualDataset(args.dataset_dir)
        speaker_ids = dataset.list_speakers()
    else:
        speaker_ids = args.speakers

    if args.ignored_speakers is not None:
        # Filter instead of list.remove(), which would raise ValueError for an
        # ignored speaker that is not in the list (and mutate args.speakers).
        speaker_ids = [s for s in speaker_ids if s not in args.ignored_speakers]

    return speaker_ids
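
# A hedged usage sketch for list_speakers, with argparse.Namespace standing in
# for parsed CLI arguments; the speaker ids are hypothetical. With
# args.speakers set, no dataset access is needed.
def _list_speakers_demo():
    from argparse import Namespace
    args = Namespace(dataset_dir=None,
                     speakers=["s1", "s2", "s3"],
                     ignored_speakers=["s2"])
    assert list_speakers(args) == ["s1", "s3"]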
def preprocess(args):
    speaker_ids = list_speakers(args)
    dataset = AudioVisualDataset(args.dataset_dir)

    for speaker_id in speaker_ids:
        data_subset = dataset.subset([speaker_id], shuffle=True)

        video_samples, audio_samples = data_processor.preprocess_data(
            data_subset)

        preprocessed_speaker_path = os.path.join(args.preprocessed_dir,
                                                 speaker_id)
        np.savez(preprocessed_speaker_path,
                 video_samples=video_samples,
                 audio_samples=audio_samples)
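
# The inverse of the np.savez call above: a sketch of how the per-speaker
# arrays could be read back. This helper is hypothetical (the original code
# references a load_preprocessed_samples, commented out in predict below).
def load_speaker_samples(preprocessed_dir, speaker_id):
    # np.savez appends the .npz extension to the given path.
    path = os.path.join(preprocessed_dir, speaker_id) + ".npz"
    with np.load(path) as data:
        return data["video_samples"], data["audio_samples"]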
def predict(args):
    speaker_ids = list_speakers(args)
    dataset = AudioVisualDataset(args.dataset_dir)

    prediction_output_dir = os.path.join(
        args.prediction_output_dir,
        '{:%Y-%m-%d_%H-%M-%S}'.format(datetime.now()))
    os.mkdir(prediction_output_dir)

    for speaker_id in speaker_ids:
        #video_samples, audio_samples = load_preprocessed_samples(
        #	args.preprocessed_dir, [speaker_id], max_speaker_samples=800
        #)

        #data_processor.apply_normalization(video_samples, args.normalization_cache)

        network = VideoToSpeechNet.load(args.model_cache, args.weights_cache)
        #network.fine_tune(video_samples, audio_samples)

        speaker_prediction_dir = os.path.join(prediction_output_dir,
                                              speaker_id)
        os.mkdir(speaker_prediction_dir)

        video_file_paths = dataset.subset([speaker_id],
                                          max_files=10).video_paths()
        for video_file_path in video_file_paths:
            try:
                video_sample = data_processor.preprocess_video_sample(
                    video_file_path)
                data_processor.apply_normalization(video_sample,
                                                   args.normalization_cache)

                predicted_audio_sample = network.predict(video_sample)

                sample_name = os.path.splitext(
                    os.path.basename(video_file_path))[0]

                reconstructed_signal = data_processor.reconstruct_audio_signal(
                    predicted_audio_sample, sample_rate=44100)
                reconstructed_signal.save_to_wav_file(
                    os.path.join(speaker_prediction_dir,
                                 "%s.wav" % sample_name))

                shutil.copy(video_file_path, speaker_prediction_dir)

            except Exception as e:
                # The try block covers preprocessing, prediction and saving,
                # so report a generic processing failure.
                print("failed to process %s (%s). skipping" %
                      (video_file_path, e))
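
# A hedged sketch of CLI wiring consistent with the args attributes read above
# (dataset_dir, speakers, ignored_speakers, preprocessed_dir, ...); the real
# entry point is not part of this listing, so all flag names are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command", required=True)

    preprocess_parser = subparsers.add_parser("preprocess")
    preprocess_parser.add_argument("--dataset_dir", required=True)
    preprocess_parser.add_argument("--preprocessed_dir", required=True)
    preprocess_parser.add_argument("--speakers", nargs="+", default=None)
    preprocess_parser.add_argument("--ignored_speakers", nargs="+", default=None)
    preprocess_parser.set_defaults(func=preprocess)

    predict_parser = subparsers.add_parser("predict")
    predict_parser.add_argument("--dataset_dir", required=True)
    predict_parser.add_argument("--prediction_output_dir", required=True)
    predict_parser.add_argument("--model_cache", required=True)
    predict_parser.add_argument("--weights_cache", required=True)
    predict_parser.add_argument("--normalization_cache", required=True)
    predict_parser.add_argument("--speakers", nargs="+", default=None)
    predict_parser.add_argument("--ignored_speakers", nargs="+", default=None)
    predict_parser.set_defaults(func=predict)

    args = parser.parse_args()
    args.func(args)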