def _process_utterance(audio_dir, label_dir, index, wav_path, text_path, args):
    """
    Preprocesses a single utterance wav/text_jamo pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text_jamo: text_jamo spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text_jamo)
    """
    try:
        # Load the audio as numpy array
        # wav = audio.load_wav(wav_path, sr=args.sample_rate)
        with open(wav_path, 'rb') as pcmfile:
            buf = pcmfile.read()
            wav = np.frombuffer(buf, dtype='int16')
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if args.rescale:
        wav = wav / np.abs(wav).max() * args.rescaling_max

    # M-AILABS extra silence specific
    if args.trim_silence:
        wav = audio.trim_silence(wav, args)

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, args).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustement between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, args.n_fft, audio.get_hop_size(args))

    # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')
    assert len(out) >= mel_frames * audio.get_hop_size(args)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(args)]
    assert len(out) % audio.get_hop_size(args) == 0
    time_steps = len(out)

    # text_jamo sequence
    with open(text_path, 'r', encoding='CP949') as f:
        line = f.readline()

    # ETRI transcription rule
    line = sentence_filter(line).upper()
    label_sequence = normalize(line)
    print(label_sequence)

    # Write the spectrogram and audio to disk
    mel_filename = 'mel-{}.npy'.format(index)
    label_filename = 'label-{}.txt'.format(index)
    np.save(os.path.join(audio_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    with open(os.path.join(label_dir, label_filename), 'w', encoding='utf-8') as f_out:
        f_out.write(label_sequence)

    # Return a tuple describing this training example
    return (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
Example #2
0
def _process_utterance(mel_dir, wav_dir, index, wav_path, speaker_id, hparams):
    """
    Preprocesses a single utterance wav, optionally tagged with a speaker

    this writes the processed audio and the mel scale spectrogram to disk and
    returns a tuple to write to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - wav_dir: the directory to write the preprocessed audio into
        - index: the index to use in the output filenames (number or string)
        - wav_path: path to the audio file containing the speech input
        - speaker_id: key into hparams.speakers; only used when hparams.gin_channels > 0
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames)
          where both filenames are full paths, mel_filename appears twice, and
          speaker_id is '<no_g>' unless hparams.gin_channels > 0.
          None when the wav is missing, or when the utterance has more than
          hparams.max_mel_frames frames and hparams.clip_mels_length is set.

    Raises:
        - RuntimeError: if rescaled audio falls outside [-1, 1]
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize (the mel spectrogram is computed from this copy)
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        for signal in (wav, preem_wav):
            if (signal > 1.).any() or (signal < -1.).any():
                raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the pre-emphasized wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l_pad, r_pad = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams))

    # Both branches pad with the constant "silence" value of the chosen
    # input type (this is a constant pad, not a reflect pad).
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    use_global_condition = hparams.gin_channels > 0
    if use_global_condition:
        speaker_name = hparams.speakers[speaker_id]
        # format() instead of '+' so that a numeric index does not raise TypeError
        index = '{}_{}'.format(speaker_name, index)
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)

    # global condition features: the caller's speaker_id is passed through
    # unchanged when conditioning is enabled, otherwise a placeholder is used
    if not use_global_condition:
        speaker_id = '<no_g>'

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, mel_filename, speaker_id, time_steps,
            mel_frames)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    """
    Turn one wav/text pair into training features.

    The processed audio, the mel spectrogram and the linear spectrogram are
    each saved as a .npy file (spectrograms are stored transposed).

    Args:
        - mel_dir: directory receiving 'mel-<index>.npy'
        - linear_dir: directory receiving 'linear-<index>.npy'
        - wav_dir: directory receiving 'audio-<index>.npy'
        - index: value inserted into the three filenames
        - wav_path: path of the input audio file
        - text: transcript, passed through untouched into the result
        - hparams: hyper parameters

    Returns:
        - (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text),
          or None when the wav file is missing, or when the utterance has more
          than hparams.max_mel_frames frames and hparams.clip_mels_length is set
    """
    try:
        signal = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # the metadata lists a file that is not on disk: skip this example
        missing = 'file {} present in txt metadata is not present in wav folder. skipping!'
        print(missing.format(wav_path))
        return None

    if hparams.rescale:
        # peak-normalise to rescaling_max
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    if hparams.trim_silence:
        # M-AILABS recordings carry extra silence
        signal = audio.trim_silence(signal, hparams)

    # Choose the representation of the saved audio together with the value
    # used when it is constant-padded.
    if is_mulaw_quantize(hparams.input_type):
        # integer classes in [0, quantize_channels)
        encoded = mulaw_quantize(signal, hparams.quantize_channels)

        first, last = audio.start_and_end_indices(encoded,
                                                  hparams.silence_threshold)
        signal = signal[first:last]
        encoded = encoded[first:last]

        pad_value = mulaw_quantize(0, hparams.quantize_channels)
        save_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # companded floats in [-1, 1]
        encoded = mulaw(signal, hparams.quantize_channels)
        pad_value = mulaw(0., hparams.quantize_channels)
        save_dtype = np.float32
    else:
        # raw floats in [-1, 1]
        encoded = signal
        pad_value = 0.
        save_dtype = np.float32

    mel = audio.melspectrogram(signal, hparams).astype(np.float32)
    mel_frames = mel.shape[1]

    # drop over-long utterances when clipping is requested
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    linear = audio.linearspectrogram(signal, hparams).astype(np.float32)
    linear_frames = linear.shape[1]

    # both spectrograms must agree on the frame count
    assert linear_frames == mel_frames

    # Pad the audio so that it covers every spectrogram frame.
    if hparams.use_lws:
        window = hparams.n_fft if hparams.win_size is None else hparams.win_size
        left, right = audio.pad_lr(signal, window,
                                   audio.get_hop_size(hparams))
        encoded = np.pad(encoded, (left, right),
                         mode='constant',
                         constant_values=pad_value)
    else:
        amount = audio.librosa_pad_lr(signal, hparams.n_fft,
                                      audio.get_hop_size(hparams))
        # reflect padding, as librosa does, to avoid frame inconsistency
        encoded = np.pad(encoded, amount, mode='reflect')

    assert len(encoded) >= mel_frames * audio.get_hop_size(hparams)

    # Cut to a whole number of hops so that transposed convolutions can
    # upsample the frames back to audio length.
    encoded = encoded[:mel_frames * audio.get_hop_size(hparams)]
    assert len(encoded) % audio.get_hop_size(hparams) == 0
    time_steps = len(encoded)

    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)

    audio_target = os.path.join(wav_dir, audio_filename)
    np.save(audio_target, encoded.astype(save_dtype), allow_pickle=False)
    mel_target = os.path.join(mel_dir, mel_filename)
    np.save(mel_target, mel.T, allow_pickle=False)
    linear_target = os.path.join(linear_dir, linear_filename)
    np.save(linear_target, linear.T, allow_pickle=False)

    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
Example #4
0
def _process_utterance(dataset, mel_dir, linear_dir, audio_dir, spk_emb_dir,
                       index, audio_path, text, emt_label, spk_label, sex,
                       hparams):
    """
    Turn one labelled audio/text pair into a training record.

    NOTE: only the mel spectrogram is written to disk here. The audio, linear
    and speaker-embedding filenames are still generated and returned, but
    nothing is saved under them, and linear_dir, audio_dir and spk_emb_dir are
    not used by this function.

    Args:
        - dataset: passed through untouched into the result
        - mel_dir: directory receiving 'mel-<index>.npy'
        - linear_dir, audio_dir, spk_emb_dir: accepted but unused here
        - index: value inserted into the generated filenames
        - audio_path: path of the input audio file
        - text, emt_label, spk_label, sex: passed through untouched into the result
        - hparams: hyper parameters

    Returns:
        - (dataset, audio_filename, mel_filename, linear_filename, spk_emb_filename,
          time_steps, mel_frames, text, emt_label, spk_label, basename, sex)
          where basename is os.path.basename(audio_path); None when the audio
          file is missing

    Raises:
        - RuntimeError: if rescaled audio falls outside [-1, 1]
    """
    try:
        signal = audio.load_audio(audio_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # the metadata lists a file that is not on disk: skip this example
        missing = 'file {} present in csv metadata is not present in wav folder. skipping!'
        print(missing.format(audio_path))
        return None

    # strip leading/trailing silence
    if hparams.trim_silence:
        signal = audio.trim_silence(signal, hparams)

    # the mel spectrogram is computed from this pre-emphasized copy
    emphasized = audio.preemphasis(signal, hparams.preemphasis,
                                   hparams.preemphasize)

    if hparams.rescale:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max
        emphasized = emphasized / np.abs(emphasized).max() * hparams.rescaling_max

        # after rescaling, both signals must stay inside [-1, 1]
        for candidate in (signal, emphasized):
            if (candidate > 1.).any() or (candidate < -1.).any():
                raise RuntimeError(
                    'audio has invalid value: {}'.format(audio_path))

    # Choose the representation of the target audio together with the value
    # used when it is constant-padded.
    if is_mulaw_quantize(hparams.input_type):
        # integer classes in [0, quantize_channels)
        encoded = mulaw_quantize(signal, hparams.quantize_channels)

        first, last = audio.start_and_end_indices(encoded,
                                                  hparams.silence_threshold)
        signal = signal[first:last]
        emphasized = emphasized[first:last]
        encoded = encoded[first:last]

        pad_value = mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        # companded floats in [-1, 1]
        encoded = mulaw(signal, hparams.quantize_channels)
        pad_value = mulaw(0., hparams.quantize_channels)
    else:
        # raw floats in [-1, 1]
        encoded = signal
        pad_value = 0.

    mel = audio.melspectrogram(emphasized, hparams).astype(np.float32)
    mel_frames = mel.shape[1]
    # (no max_mel_frames clipping and no linear spectrogram in this variant)

    # Work out how much padding makes the audio cover every mel frame.
    if hparams.use_lws:
        window = hparams.n_fft if hparams.win_size is None else hparams.win_size
        left, right = audio.pad_lr(signal, window,
                                   audio.get_hop_size(hparams))
    else:
        left, right = audio.librosa_pad_lr(signal, hparams.n_fft,
                                           audio.get_hop_size(hparams),
                                           hparams.wavenet_pad_sides)

    # either way the padding is a constant pad with the "silence" value
    encoded = np.pad(encoded, (left, right),
                     mode='constant',
                     constant_values=pad_value)

    assert len(encoded) >= mel_frames * audio.get_hop_size(hparams)

    # Cut to a whole number of hops so that transposed convolutions can
    # upsample the frames back to audio length.
    encoded = encoded[:mel_frames * audio.get_hop_size(hparams)]
    assert len(encoded) % audio.get_hop_size(hparams) == 0
    time_steps = len(encoded)

    names = {
        kind: template.format(index)
        for kind, template in (('audio', 'audio-{}.npy'),
                               ('mel', 'mel-{}.npy'),
                               ('linear', 'linear-{}.npy'),
                               ('spk_emb', 'spkemb-{}.npy'))
    }

    # the mel spectrogram (transposed) is the only array saved
    np.save(os.path.join(mel_dir, names['mel']), mel.T, allow_pickle=False)

    basename = os.path.basename(audio_path)
    return (dataset, names['audio'], names['mel'], names['linear'],
            names['spk_emb'], time_steps, mel_frames, text, emt_label,
            spk_label, basename, sex)
Example #5
0
def _process_utterance(out_dir, index, wav_path, pinyin, hparams):
    '''Turn one audio/pinyin pair into training features.

  The processed audio, the mel spectrogram and the linear spectrogram are saved
  as .npy files under out_dir + "/audio", "/mels" and "/linear" (spectrograms
  are stored transposed). Progress is reported with debug prints.

  Args:
    out_dir: Parent of the "mels", "linear" and "audio" output directories
    index: Value inserted into the three output filenames
    wav_path: Path to the audio file containing the speech input
    pinyin: Pinyin transcript, passed through untouched into the result
    hparams: Hyper parameters

  Returns:
    An (audio_filename, mel_filename, linear_filename, time_steps, mel_frames,
    pinyin) tuple to write to train.txt, or None when the utterance has more
    than hparams.max_mel_frames frames and hparams.clip_mels_length is set
  '''
    targets = {
        'mel': out_dir + "/mels",
        'linear': out_dir + "/linear",
        'audio': out_dir + "/audio",
    }

    signal = audio.load_wav(wav_path, sr=hparams.sample_rate)
    print("debug wav_path:", wav_path)

    if hparams.rescale:
        # peak-normalise to rescaling_max
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    if hparams.trim_silence:
        # M-AILABS recordings carry extra silence
        signal = audio.trim_silence(signal, hparams)

    # Choose the representation of the saved audio together with the value
    # used when it is constant-padded.
    if is_mulaw_quantize(hparams.input_type):
        # integer classes in [0, quantize_channels)
        encoded = mulaw_quantize(signal, hparams.quantize_channels)

        first, last = audio.start_and_end_indices(encoded,
                                                  hparams.silence_threshold)
        signal = signal[first:last]
        encoded = encoded[first:last]

        pad_value = mulaw_quantize(0, hparams.quantize_channels)
        save_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # companded floats in [-1, 1]
        encoded = mulaw(signal, hparams.quantize_channels)
        pad_value = mulaw(0., hparams.quantize_channels)
        save_dtype = np.float32
    else:
        # raw floats in [-1, 1]
        encoded = signal
        pad_value = 0.
        save_dtype = np.float32

    mel = audio.melspectrogram(signal, hparams).astype(np.float32)
    mel_frames = mel.shape[1]

    # drop over-long utterances when clipping is requested
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        print("debug --- drop wav_path:", wav_path, "mel_frames:", mel_frames)
        return None

    linear = audio.linearspectrogram(signal, hparams).astype(np.float32)
    linear_frames = linear.shape[1]

    # both spectrograms must agree on the frame count
    assert linear_frames == mel_frames

    # Pad the audio so that it covers every spectrogram frame.
    if hparams.use_lws:
        window = hparams.n_fft if hparams.win_size is None else hparams.win_size
        left, right = audio.pad_lr(signal, window,
                                   audio.get_hop_size(hparams))
        encoded = np.pad(encoded, (left, right),
                         mode='constant',
                         constant_values=pad_value)
    else:
        amount = audio.librosa_pad_lr(signal, hparams.n_fft,
                                      audio.get_hop_size(hparams))
        # reflect padding, as librosa does, to avoid frame inconsistency
        encoded = np.pad(encoded, amount, mode='reflect')

    assert len(encoded) >= mel_frames * audio.get_hop_size(hparams)

    # Cut to a whole number of hops so that transposed convolutions can
    # upsample the frames back to audio length.
    encoded = encoded[:mel_frames * audio.get_hop_size(hparams)]
    assert len(encoded) % audio.get_hop_size(hparams) == 0
    time_steps = len(encoded)

    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)

    audio_target = os.path.join(targets['audio'], audio_filename)
    np.save(audio_target, encoded.astype(save_dtype), allow_pickle=False)
    np.save(os.path.join(targets['mel'], mel_filename),
            mel.T,
            allow_pickle=False)
    np.save(os.path.join(targets['linear'], linear_filename),
            linear.T,
            allow_pickle=False)
    print("debug save wav file:", os.path.join(targets['audio'], audio_filename))

    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, pinyin)
Example #6
0
def re_save_all(wav_path, audio_filename, mel_filename, linear_filename):
    """
    Recompute and save the audio, mel and linear arrays for one wav file.

    `hparams` is not a parameter: it is looked up in the enclosing scope.

    Args:
        - wav_path: path of the input audio file
        - audio_filename: path the processed float32 audio is saved to
        - mel_filename: path the transposed mel spectrogram is saved to
        - linear_filename: path the transposed linear spectrogram is saved to

    Returns:
        - None in all cases; when wav_path is missing a message is printed and
          nothing is written

    Raises:
        - RuntimeError: if rescaled audio falls outside [-1, 1]
    """
    try:
        signal = audio.load_audio(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # the metadata lists a file that is not on disk: skip it
        missing = 'file {} present in csv metadata is not present in wav folder. skipping!'
        print(missing.format(wav_path))
        return None

    # strip leading/trailing silence
    if hparams.trim_silence:
        signal = audio.trim_silence(signal, hparams)

    # both spectrograms are computed from this pre-emphasized copy
    emphasized = audio.preemphasis(signal, hparams.preemphasis,
                                   hparams.preemphasize)

    if hparams.rescale:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max
        emphasized = emphasized / np.abs(emphasized).max() * hparams.rescaling_max

        # after rescaling, both signals must stay inside [-1, 1]
        for candidate in (signal, emphasized):
            if (candidate > 1.).any() or (candidate < -1.).any():
                raise RuntimeError('audio has invalid value: {}'.format(wav_path))

    mel = audio.melspectrogram(emphasized, hparams).astype(np.float32)
    mel_frames = mel.shape[1]

    linear = audio.linearspectrogram(emphasized, hparams).astype(np.float32)
    linear_frames = linear.shape[1]

    # both spectrograms must agree on the frame count
    assert linear_frames == mel_frames

    # Zero-pad the raw audio so that it covers every spectrogram frame.
    left, right = audio.librosa_pad_lr(signal, hparams.n_fft,
                                       audio.get_hop_size(hparams),
                                       hparams.wavenet_pad_sides)
    padded = np.pad(signal, (left, right), mode='constant', constant_values=0.)

    assert len(padded) >= mel_frames * audio.get_hop_size(hparams)

    # Cut to a whole number of hops so that transposed convolutions can
    # upsample the frames back to audio length.
    padded = padded[:mel_frames * audio.get_hop_size(hparams)]
    assert len(padded) % audio.get_hop_size(hparams) == 0

    # the three target paths are used exactly as given by the caller
    np.save(audio_filename, padded.astype(np.float32), allow_pickle=False)
    np.save(mel_filename, mel.T, allow_pickle=False)
    np.save(linear_filename, linear.T, allow_pickle=False)
Example #7
0
        # sanity check
        assert linear_frames == mel_frames
>>>>>>> f33090dba9ba4bc52db8367abdc48841d13c48f8

    if hparams.use_lws:
        # Ensure time resolution adjustement between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
<<<<<<< HEAD
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustement between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
=======
        out = np.pad(out, (l, r), mode='constant',
                     constant_values=constant_values)
    else:
        # Ensure time resolution adjustement between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(
            wav, hparams.n_fft, audio.get_hop_size(hparams))
>>>>>>> f33090dba9ba4bc52db8367abdc48841d13c48f8

        # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustement
Example #8
0
def _process_utterance(mel_dir, index, wav_path, start, end, hparams):
    """
    Build the mel spectrogram of one wav and rescale its speech boundaries.

    The transposed mel spectrogram is saved as 'mel-<index>.npy' in mel_dir.
    start and end are first shifted by hparams.sample_rate and then divided by
    the number of audio samples per mel frame and rounded.

    Args:
        - mel_dir: directory the mel spectrogram is written into
        - index: value inserted into the spectrogram filename
        - wav_path: path of the input audio file
        - start, end: start and end points of speech
        - hparams: hyper parameters

    Returns:
        - (wav_path, mel_filename, time_steps, mel_frames, start, end), or None
          when wav_path does not exist
    """
    try:
        signal = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # the metadata lists a file that is not on disk: skip this example
        missing = 'file {} present in csv metadata is not present in wav folder. skipping!'
        print(missing.format(wav_path))
        return None

    # shift both boundaries by 1 * sample_rate
    # (presumably one second of samples -- TODO confirm the unit of start/end)
    start += 1 * hparams.sample_rate
    end += 1 * hparams.sample_rate

    if hparams.rescale:
        # peak-normalise to rescaling_max
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    if hparams.trim_silence:
        # M-AILABS recordings carry extra silence
        signal = audio.trim_silence(signal, hparams)

    feature_dtype = np.float32
    mel = audio.melspectrogram(signal, hparams).astype(feature_dtype)
    mel_frames = mel.shape[1]

    # Reflect-pad the right edge so the audio covers every mel frame.
    right = audio.librosa_pad_lr(signal, hparams.n_fft,
                                 audio.get_hop_size(hparams))
    padded = np.pad(signal, (0, right), mode='reflect')
    assert len(padded) >= mel_frames * audio.get_hop_size(hparams)

    # Cut to a whole number of hops so that transposed convolutions can
    # upsample the frames back to audio length.
    padded = padded[:mel_frames * audio.get_hop_size(hparams)]
    assert len(padded) % audio.get_hop_size(hparams) == 0
    time_steps = len(padded)

    # express the boundaries in units of samples-per-frame
    samples_per_frame = int(time_steps / mel_frames)
    start = round(start / samples_per_frame)
    end = round(end / samples_per_frame)

    mel_filename = 'mel-{}.npy'.format(index)
    mel_target = os.path.join(mel_dir, mel_filename)
    np.save(mel_target, mel.T, allow_pickle=False)

    return (wav_path, mel_filename, time_steps, mel_frames, start, end)
Example #9
0
def build_from_path_ispl(hparams, input_dirs, mel_dir, label_dir, tqdm=lambda x: x):
    """
    Preprocesses the speech dataset from a given input path to given output directories

    For every audio file returned by find_files this writes 'mel-<index>.npy' into
    mel_dir and 'label-<index>.npy' into label_dir. Files are processed sequentially.

    Labels come from one of two places, depending on the numeric stem of the file name:
        - stem <= 10: start/end pairs read from a shared 'label.txt'. Each value is
          divided by 1000 and multiplied by the sample rate (i.e. treated as
          milliseconds), turned into a per-sample 0/1 mask, padded/cropped like the
          audio and finally sub-sampled to one value per hop.
        - otherwise: a sibling '<wav stem>.label' file holding one integer per line.

    Files that cannot be loaded, or (for stem <= 10) that have no entry in
    'label.txt', are reported on stdout and skipped; the remaining files are still
    processed.

    Args:
        - hparams: hyper parameters
        - input_dirs: input directories that contain the files to preprocess
        - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
        - label_dir: the directory to write the label into
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuples (wav_path, mel_filename, time_steps, mel_frames, label_filename)
          describing the train examples. this should be written to train.txt
    """
    examples = []
    index = 1
    for input_dir in input_dirs:
        files = find_files(os.path.join(input_dir))
        for wav_path in files:
            # NOTE(review): path handling assumes backslash-separated paths and that
            # label.txt lives in the first path component -- confirm before running
            # on another platform or directory layout.
            file_name = wav_path.split("\\")[-1]
            uses_shared_labels = int(file_name.split('.')[0]) <= 10

            # Reset per file so segments of a previous file can never leak into this one.
            segments = None
            if uses_shared_labels:
                label_path = wav_path.split("\\")[0] + '/label.txt'
                with open(label_path, encoding='utf-8') as f:
                    for line in f:
                        # NOTE(review): substring match, last matching line wins
                        # (e.g. '1.wav' also matches a line for '11.wav').
                        if file_name in line:
                            fields = line.replace('[', '').replace(']', '').split(':')[1].replace(',\n', '').split(',')
                            segments = [(int(fields[i]), int(fields[i + 1]))
                                        for i in range(0, len(fields), 2)]
                if segments is None:
                    print('file {} has no entry in {}. skipping!'.format(wav_path, label_path))
                    continue

            try:
                # Load the audio as numpy array
                wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
            except FileNotFoundError:  # catch missing wav exception
                print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
                # Skip only this file; returning here would throw away every
                # example that has already been written.
                continue

            # rescale wav
            if hparams.rescale:
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:
                wav = audio.trim_silence(wav, hparams)

            # [-1, 1]
            out = wav
            out_dtype = np.float32

            if uses_shared_labels:
                # NOTE(review): offsets are applied to the (possibly silence-trimmed)
                # signal -- confirm they are meant to index the trimmed audio.
                label = np.zeros_like(out)
                for seg_start, seg_end in segments:
                    first = int(seg_start / 1000 * hparams.sample_rate)
                    last = int(seg_end / 1000 * hparams.sample_rate)
                    label[first:last] = 1.
            else:
                label_file = wav_path.split('.')[0] + '.label'
                with open(label_file, encoding='utf-8') as f:
                    label = np.asarray([int(line.strip('\n')) for line in f])

            # Compute the mel scale spectrogram from the wav and keep at most
            # len(label) trailing frames
            mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
            mel_spectrogram = mel_spectrogram[:, -len(label):]
            mel_frames = mel_spectrogram.shape[1]

            # Ensure time resolution adjustement between audio and mel-spectrogram
            hop_size = audio.get_hop_size(hparams)
            pad = audio.librosa_pad_lr(wav, hparams.n_fft, hop_size)

            if uses_shared_labels:
                # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
                out = np.pad(out, (0, pad), mode='reflect')
                label = np.pad(label, (0, pad), mode='reflect')
                assert len(out) >= mel_frames * hop_size

                # time resolution adjustement
                # ensure length of raw audio is multiple of hop size so that we can use
                # transposed convolution to upsample
                out = out[:mel_frames * hop_size]
                label = label[:mel_frames * hop_size]
                assert len(out) % hop_size == 0
                # one label value per hop
                label = label[::hop_size]

            time_steps = len(out)

            # Write the spectrogram and label to disk
            mel_filename = 'mel-{}.npy'.format(index)
            label_filename = 'label-{}.npy'.format(index)
            np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
            np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False)
            examples.append((wav_path, mel_filename, time_steps, mel_frames, label_filename))
            index += 1

    return list(tqdm(examples))
Example #10
0
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    The spectrograms are computed from the pre-emphasized signal, while the audio
    stored alongside them is the non-pre-emphasized wav. Everything is bundled in
    one dict and written to '<index>.npz' inside out_dir through dumps_msgpack.

    Args:
        - out_dir: the directory to write the msgpack into
        - index: the numeric index to use in the output filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (npz_filename, time_steps, mel_frames, text), or None when the wav
          file is missing or when the utterance has more than hparams.max_mel_frames
          mel frames and hparams.clip_mels_length is set

    Raises:
        - RuntimeError: when hparams.rescale is set and a rescaled signal leaves [-1, 1]
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        for signal in (wav, preem_wav):
            if (signal > 1.).any() or (signal < -1.).any():
                raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Compute the mel scale spectrogram from the pre-emphasized wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Drop utterances that are too long when clipping is requested
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the pre-emphasized wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)

    # sanity check
    assert linear_spectrogram.shape[1] == mel_frames

    # Ensure time resolution adjustement between audio and mel-spectrogram
    hop_size = audio.get_hop_size(hparams)
    l_pad, r_pad = audio.librosa_pad_lr(wav, hop_size, hparams.pad_sides)

    # Constant (zero) pad the audio signal on both sides
    padded_audio = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.)

    expected_samples = mel_frames * hop_size
    assert len(padded_audio) >= expected_samples

    # time resolution adjustement
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    padded_audio = padded_audio[:expected_samples]
    assert len(padded_audio) % hop_size == 0
    time_steps = len(padded_audio)

    r = hparams.outputs_per_step
    fill = -hparams.max_abs_value if hparams.symmetric_mels else 0.

    # r frames of padding at both head and tail (+2r frames in total)
    edge_pad = ((r, r), (0, 0))
    mel_spec = np.pad(mel_spectrogram.T, edge_pad, mode='constant', constant_values=fill)
    linear_spec = np.pad(linear_spectrogram.T, edge_pad, mode='constant', constant_values=fill)

    # Extend the tail up to the next multiple of r strictly above the current length
    target_length = len(linear_spec)
    target_frames = (target_length // r + 1) * r
    num_pad = target_frames - target_length
    if num_pad != 0:
        tail_pad = ((0, num_pad), (0, 0))
        linear_spec = np.pad(linear_spec, tail_pad, mode='constant', constant_values=fill)
        mel_spec = np.pad(mel_spec, tail_pad, mode='constant', constant_values=fill)

    # Stop token is 0 everywhere except on the very last frame
    stop_token = np.zeros(target_frames, dtype=np.float32)
    stop_token[-1] = 1.

    npz_filename = '{}.npz'.format(index)
    data = {
        'mel': mel_spec,
        'linear': linear_spec,
        'audio': padded_audio.astype(np.float32),
        'input_data': np.asarray(text_to_sequence(text)),
        'time_steps': time_steps,
        'mel_frames': target_frames,
        'text': text,
        'stop_token': stop_token,
    }
    dumps_msgpack(data, os.path.join(out_dir, npz_filename))

    # Return a tuple describing this training example
    return npz_filename, time_steps, mel_frames, text