Example 1
def compute_scaler(workspace, data_type, snr):
    """Compute and write out scaler of data.
    """

    # Load data.
    begin_time = time.time()

    data_dir = os.path.join(workspace, "packed_features", "spectrogram", data_type, "{}db".format(int(snr)))
    hdf5_path = os.path.join(data_dir, "data.h5")
    with h5py.File(hdf5_path, 'r') as hf:
        x = np.array(hf.get('x')) # (n_segs, n_concat, n_freq)

    # Compute scaler.
    (n_segs, n_concat, n_freq) = x.shape
    x2d = x.reshape((n_segs * n_concat, n_freq))
    scaler = StandardScaler(with_mean=True, with_std=True).fit(x2d)

    # Write out scaler.
    scaler_path = os.path.join(data_dir, "scaler.pickle")
    create_directory(data_dir)

    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f, protocol=pickle.HIGHEST_PROTOCOL)

    print()
    print("Building scaler time: {}".format(time.time() - begin_time))
    print()
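
The create_directory helper used here (and in the examples below) is not shown on this page; a minimal sketch of what it presumably does:

import os

def create_directory(path):
    # Create the directory, including any missing parents, if it does not exist.
    if not os.path.isdir(path):
        os.makedirs(path)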
Example 2
def pack_features(workspace, data_type, snr, n_concat, n_hop):
    """Load all features, apply log and conver to 3D tensor, write out to .h5 file.

    Args:
      workspace: str, path of workspace.
      data_type: str, 'train' | 'test'.
      snr: float, signal to noise ratio to be mixed.
      n_concat: int, number of frames to be concatenated.
      n_hop: int, hop frames.
    """

    inputs = []  # (n_segs, n_concat, n_freq)
    outputs = []  # (n_segs, n_freq)

    time_begin = time.time()

    # Load all features.
    features_dir = os.path.join(workspace, "features", "spectrogram", data_type, "{}db".format(int(snr)))
    for i, features_path in enumerate(glob.glob(features_dir + "/*.pickle")):
        # Load feature.
        with open(features_path, "rb") as f:
            data = pickle.load(f)
        mixed_audio_complex_spectrogram, speech_spectrogram, noise_spectrogram, alpha, rule_name = data

        # Keep only the magnitude of the mixture spectrogram.
        mixed_audio_spectrogram = np.abs(mixed_audio_complex_spectrogram)

        # Pad the start and end of each spectrogram with border values.
        n_pad = (n_concat - 1) // 2
        mixed_audio_spectrogram = pad_with_border(mixed_audio_spectrogram, n_pad)
        speech_spectrogram = pad_with_border(speech_spectrogram, n_pad)

        # Cut the input spectrogram into 3D segments of n_concat frames.
        mixed_audio_spectrogram_3d = mat_2d_to_3d(mixed_audio_spectrogram, agg_num=n_concat, hop=n_hop)
        inputs.append(mixed_audio_spectrogram_3d)

        # Cut target spectrogram and take the center frame of each 3D segment.
        speech_spectrogram_3d = mat_2d_to_3d(speech_spectrogram, agg_num=n_concat, hop=n_hop)
        y = speech_spectrogram_3d[:, (n_concat - 1) // 2, :]
        outputs.append(y)

        if (i + 1) % 100 == 0:
            print("Iteration # {}".format(i + 1))

    inputs = np.concatenate(inputs, axis=0)
    outputs = np.concatenate(outputs, axis=0)

    inputs = log_sp(inputs).astype(np.float32)
    outputs = log_sp(outputs).astype(np.float32)

    # Write out data to .h5 file.
    features_filename = "data.h5"
    features_dir = os.path.join(workspace, "packed_features", "spectrogram", data_type, "{}db".format(int(snr)))
    create_directory(features_dir)

    with h5py.File(os.path.join(features_dir, features_filename), "w") as hf:
        hf.create_dataset('x', data=inputs)
        hf.create_dataset('y', data=outputs)

    print()
    print("Packing features time: {}".format(time.time() - time_begin))
    print()
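
The pad_with_border, mat_2d_to_3d and log_sp helpers are assumed above but not shown. Minimal sketches of plausible implementations (the originals may differ):

import numpy as np

def pad_with_border(x, n_pad):
    # Repeat the first and last frame n_pad times along the time axis.
    return np.concatenate([x[0:1]] * n_pad + [x] + [x[-1:]] * n_pad, axis=0)

def mat_2d_to_3d(x, agg_num, hop):
    # Cut a (n_frames, n_freq) matrix into overlapping (n_segs, agg_num, n_freq)
    # segments, advancing hop frames between segments.
    n_frames, n_freq = x.shape
    if n_frames < agg_num:
        x = np.concatenate([x, np.zeros((agg_num - n_frames, n_freq))], axis=0)
    segments = []
    i = 0
    while i + agg_num <= x.shape[0]:
        segments.append(x[i:i + agg_num])
        i += hop
    return np.array(segments)

def log_sp(x):
    # Log spectrogram with a small floor to avoid log(0).
    return np.log(x + 1e-8)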
Example 3
def create_rules_for_mixing_speech_with_noises(workspace, speech_dir, noise_dir, data_type, magnification):
    """Create csv containing mixture information.

    Each row of the .csv file contains [speech_file_name, noise_file_name, noise_begin, noise_end].

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      magnification: int, only used when data_type='train'; number of noises
          randomly selected to mix with each speech utterance. E.g., with
          magnification=3, 4620 speech utterances yield 4620*3 mixtures.
          magnification should not be larger than the number of distinct noises.
    """
    time_start = time.time()

    random_state = np.random.RandomState(42)

    rules_dir = os.path.join(workspace, "mixing_rules")
    create_directory(rules_dir)

    rules_filename = os.path.join(rules_dir, "{}.csv".format(data_type))
    with open(rules_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["speech_file_name", "noise_file_name", "noise_begin", "noise_end"])

        noise_paths = glob.glob(os.path.join(noise_dir, "*.wav"))
        speech_paths = glob.glob(os.path.join(speech_dir, "*.wav"))

        for speech_path in speech_paths:
            (speech_audio, _) = read_audio(speech_path)

            # For training data, mix each speech with #magnification randomly picked noises.
            # For test data, mix each speech with all noises.
            if data_type == "train":
                selected_noise_paths = random_state.choice(noise_paths, size=magnification, replace=False)
            else:
                selected_noise_paths = noise_paths

            for noise_path in selected_noise_paths:
                (noise_audio, _) = read_audio(noise_path)

                if noise_audio.shape[0] <= speech_audio.shape[0]:
                    noise_begin = 0
                    noise_end = noise_audio.shape[0]
                else:
                    # If noise longer than speech then randomly select a segment of noise.
                    noise_begin = random_state.randint(0, noise_audio.shape[0] - speech_audio.shape[0], size=1)[0]
                    noise_end = noise_begin + speech_audio.shape[0]

                writer.writerow([os.path.basename(speech_path), os.path.basename(noise_path), noise_begin, noise_end])

    print()
    print("Mixing clean {} speech with noises time: {}".format(data_type, time.time() - time_start))
    print()
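
A hypothetical call (the directory paths are illustrative); it writes workspace/mixing_rules/train.csv with the header speech_file_name,noise_file_name,noise_begin,noise_end:

create_rules_for_mixing_speech_with_noises(workspace="workspace",
                                           speech_dir="timit/train",
                                           noise_dir="noises/train",
                                           data_type="train",
                                           magnification=2)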
Example 4
def extract_features(workspace, speech_to_enhance_dir, snr):
    """Calculate complex spectrograms of the audio files to enhance and write
    them out to disk as pickled features.
    """
    time_start = time.time()

    sample_rate = cfg.sample_rate

    audios_to_enhance_dir = os.path.join(workspace, speech_to_enhance_dir)
    for audio_id, audio_path in enumerate(
            glob.glob(audios_to_enhance_dir + "/*.wav")):
        speech_audio = read_audio(audio_path, target_fs=sample_rate)[0]

        speech_audio_complex_spectrogram = calculate_spectrogram(
            speech_audio,
            mode="complex",
            window_size=cfg.n_window,
            n_overlap=cfg.n_overlap)

        # Save features.
        audio_name = os.path.basename(audio_path).split(".")[0]
        features = [speech_audio_complex_spectrogram, audio_name]

        features_filename = "{}.pickle".format(audio_name)
        features_dir = os.path.join(workspace, "data", "speech_to_enhance",
                                    "features", "spectrogram",
                                    "{}db".format(int(snr)))
        create_directory(features_dir)

        features_path = os.path.join(features_dir, features_filename)
        with open(features_path, "wb") as f:
            pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)

    print()
    print("Extracting features time: %s" % (time.time() - time_start))
    print()
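
A hypothetical call (the workspace layout is illustrative; speech_to_enhance_dir is resolved relative to workspace):

extract_features(workspace="workspace",
                 speech_to_enhance_dir=os.path.join("data", "speech_to_enhance", "audios"),
                 snr=0)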
Example 5
def calculate_mixture_features(workspace, speech_dir, noise_dir, data_type, snr):
    """Calculate spectrogram for mixed, speech and noise audio. Then write the
    features to disk.

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      snr: float, signal to noise ratio to be mixed.
    """
    time_start = time.time()

    fs = cfg.sample_rate

    # Open mixture csv.
    rules_filename = os.path.join(workspace, "mixing_rules", "{}.csv".format(data_type))
    with open(rules_filename, "r", encoding="utf-8") as f:
        rules_reader = csv.reader(f)
        next(rules_reader, None)  # skip the headers

        for i, rule in enumerate(rules_reader):
            [speech_filename, noise_filename, noise_begin, noise_end] = rule

            speech_path = os.path.join(speech_dir, speech_filename)
            speech_audio = read_audio(speech_path, target_fs=fs)[0]

            noise_path = os.path.join(noise_dir, noise_filename)
            noise_audio = read_audio(noise_path, target_fs=fs)[0]

            # Repeat noise n_repeat times to cover entire clean speech sample.
            if noise_audio.shape[0] < speech_audio.shape[0]:
                n_repeat = int(np.ceil(speech_audio.shape[0] / noise_audio.shape[0]))
                noise_audio = np.tile(noise_audio, n_repeat)[:speech_audio.shape[0]]
            # Truncate noise to the same length as speech.
            else:
                noise_audio = noise_audio[int(noise_begin):int(noise_end)]

            # Scale speech to given SNR.
            scaler = get_amplitude_scaling_factor(speech_audio, noise_audio, snr=snr)
            speech_audio *= scaler

            # Get normalized mixture, speech, noise.
            mixed_audio, speech_audio, noise_audio, alpha = additive_mixing(speech_audio, noise_audio)

            rule_name = "{}.{}".format(speech_filename.split(".")[0], noise_filename.split(".")[0])

            # Save mixed audio.
            mixed_audio_filename = "{}.wav".format(rule_name)
            mixed_audio_dir = os.path.join(workspace, "mixed_audios", "spectrogram", data_type, "{}db".format(int(snr)))
            create_directory(mixed_audio_dir)

            write_audio(os.path.join(mixed_audio_dir, mixed_audio_filename), mixed_audio, fs)

            # Extract spectrograms.
            mixed_audio_complex_spectrogram = calculate_spectrogram(mixed_audio, mode='complex',
                                                                    window_size=cfg.n_window, n_overlap=cfg.n_overlap)
            speech_spectrogram = calculate_spectrogram(speech_audio, mode='magnitude',
                                                       window_size=cfg.n_window, n_overlap=cfg.n_overlap)
            noise_spectrogram = calculate_spectrogram(noise_audio, mode='magnitude',
                                                      window_size=cfg.n_window, n_overlap=cfg.n_overlap)

            # Save features.
            features_filename = "{}.{}.pickle".format(speech_filename.split(".")[0], noise_filename.split(".")[0])
            features_dir = os.path.join(workspace, "features", "spectrogram", data_type, "{}db".format(int(snr)))
            create_directory(features_dir)

            features = [mixed_audio_complex_spectrogram, speech_spectrogram, noise_spectrogram, alpha, rule_name]
            feature_path = os.path.join(features_dir, features_filename)
            with open(feature_path, "wb") as f:
                pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)

            if (i + 1) % 100 == 0:
                print("Iteration # {}".format(i + 1))

    print()
    print("Extracting features time: %s" % (time.time() - time_start))
    print()
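
The get_amplitude_scaling_factor and additive_mixing helpers are assumed above. A sketch of how such helpers are commonly implemented, using an RMS-based SNR scaling and a peak-normalized sum (the originals may differ):

import numpy as np

def rms(y):
    # Root-mean-square amplitude of a signal.
    return np.sqrt(np.mean(np.abs(y) ** 2))

def get_amplitude_scaling_factor(speech, noise, snr):
    # Factor by which to scale speech so the speech-to-noise RMS ratio
    # corresponds to the target SNR in dB.
    original_ratio = rms(speech) / rms(noise)
    target_ratio = 10. ** (float(snr) / 20.)
    return target_ratio / original_ratio

def additive_mixing(speech, noise):
    # Sum the signals, then normalize the mixture (and both sources, so they
    # stay consistent with it) by the mixture's peak amplitude.
    mixed_audio = speech + noise
    alpha = 1. / np.max(np.abs(mixed_audio))
    return mixed_audio * alpha, speech * alpha, noise * alpha, alpha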
Example 6
def enhance_audio(workspace, speech_to_enhance_dir, train_snr, test_snr,
                  n_concat, iteration):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      train_snr: float, training SNR.
      test_snr: float, testing SNR.
      n_concat: int, number of frames to concatenta, should equal to n_concat
          in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """

    begin_time = time.time()

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models",
                              "{}db".format(int(train_snr)),
                              "md_{}iters.h5".format(iteration))
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "{}db".format(int(train_snr)),
                               "scaler.pickle")
    with open(scaler_path, "rb") as f:
        scaler = pickle.load(f)

    # Load test data.
    features_dir = os.path.join(workspace, "data", "speech_to_enhance",
                                "features", "spectrogram",
                                "{}db".format(int(test_snr)))

    for sample_id, feature_filename in enumerate(os.listdir(features_dir)):
        # Load feature.
        feature_path = os.path.join(features_dir, feature_filename)
        with open(feature_path, "rb") as f:
            feature_data = pickle.load(f)

        mixed_audio_complex_spectrogram, audio_name = feature_data
        mixed_audio_spectrogram = np.abs(mixed_audio_complex_spectrogram)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_audio_spectrogram = audio_utils.pad_with_border(
            mixed_audio_spectrogram, n_pad)
        mixed_audio_spectrogram = audio_utils.log_sp(mixed_audio_spectrogram)

        # Scale data.
        if scale:
            mixed_audio_spectrogram = audio_utils.scale_on_2d(
                mixed_audio_spectrogram, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_audio_spectrogram_3d = audio_utils.mat_2d_to_3d(
            mixed_audio_spectrogram, agg_num=n_concat, hop=1)

        # Predict.
        prediction = model.predict(mixed_audio_spectrogram_3d)
        print("Sample id: {}. sample name: {}".format(sample_id, audio_name))

        # Inverse scale.
        if scale:
            prediction = audio_utils.inverse_scale_on_2d(prediction, scaler)

        # Recover enhanced wav.
        prediction_spectrogram = np.exp(prediction)
        recovered_wave = recover_wav(prediction_spectrogram,
                                     mixed_audio_complex_spectrogram,
                                     n_overlap, np.hamming)

        # Scale to compensate for the amplitude change introduced by the STFT window and inverse FFT.
        recovered_wave *= np.sqrt((np.hamming(n_window)**2).sum())

        # Write out enhanced wav.
        enhanced_audio_filename = "{}.enh.wav".format(audio_name)
        enhanced_audio_dir = os.path.join(workspace, "data",
                                          "speech_to_enhance", "enhanced_wavs",
                                          "{}db".format(int(test_snr)))

        create_directory(enhanced_audio_dir)
        audio_utils.write_audio(
            os.path.join(enhanced_audio_dir, enhanced_audio_filename),
            recovered_wave, fs)

    print()
    print("Inference time: {}".format(time.time() - begin_time))
    print()
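
A hypothetical invocation (the hyper-parameter values are illustrative, not prescribed by the code):

enhance_audio(workspace="workspace",
              speech_to_enhance_dir=os.path.join("data", "speech_to_enhance", "audios"),
              train_snr=0, test_snr=0, n_concat=7, iteration=10000)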