Example #1
def main():
    print("trim audio beggin....")
    input = os.path.join("/Users/zhuribing/Documents", "p225_028.wav")
    output = os.path.join("/Users/zhuribing/Documents", "trim_p225_028.wav")
    wav = audio.load_wav(input, sr=16000)
    wav = audio.trim_silence(wav, hparams)
    audio.save_wav(wav, output, 16000)
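All of the snippets on this page lean on a Tacotron-2-style project layout but omit their imports. A minimal, hedged sketch of the module-level context Example #1 assumes (the `datasets.audio` and `hparams` import paths are assumptions, not verified for every snippet):

# Hedged sketch of the module-level context these snippets assume; the exact
# import paths depend on the repository each example was taken from.
import os
import numpy as np

from datasets import audio    # assumed: repo-local audio helpers (load_wav, trim_silence, save_wav, ...)
from hparams import hparams   # assumed: repo-local hyper-parameter object used by trim_silence

if __name__ == '__main__':
    main()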
Example #2
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
	Preprocesses a single utterance wav/text pair.

	This writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file.

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wavs into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wavs exception
        print('file {} present in csv metadata is not present in wavs folder. skipping!'.format(
            wav_path))
        return None

    # rescale wavs
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start: end]
        out = out[start: end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32
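Example #2 is cut off at this point. Judging from the parallel preprocessors further down this page (Examples #15 and #16), the remainder typically computes the mel/linear spectrograms, pads the waveform to a multiple of the hop size, and saves the three arrays. A hedged sketch of that continuation:

    # Hedged sketch of the truncated remainder, modeled on the parallel
    # preprocessors below (Examples #15 and #16); padding details vary between
    # variants (some pad with constant_values instead of reflect padding).
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    assert linear_spectrogram.shape[1] == mel_frames

    # Pad, then trim the waveform to an exact multiple of the hop size.
    pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
    out = np.pad(out, pad, mode='reflect')
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    time_steps = len(out)

    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)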
Example #3
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       speaker_num, lan_num, hparams):
    """
	Preprocesses a single utterance wav/text pair.

	This writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file.

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- speaker_num: speaker index for this utterance
		- lan_num: language index for this utterance
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, speaker_num, lan_num)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Get spectrogram from wav
    ret = audio.wav2spectrograms(wav, hparams)
    if ret is None:
        return None
    out, mel_spectrogram, linear_spectrogram, time_steps, mel_frames = ret

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(np.float32),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, speaker_num, lan_num)
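The tuple returned above is what preprocess scripts usually collect per utterance and then dump to train.txt as pipe-separated rows. A hedged sketch of that final step (`metadata` and `out_dir` are assumed names for the collected rows and the output folder):

# Hedged sketch: writing the collected metadata rows to train.txt.
import os

def write_metadata_sketch(metadata, out_dir):
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for row in metadata:
            f.write('|'.join([str(x) for x in row]) + '\n')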
Example #4
def get_second_part_wave(wav, start_time, end_time, hparams):
    start_time = int(start_time * 1000)
    end_time = int(end_time * 1000)
    sentence = wav[start_time:end_time]
    temp = sentence.export('temp.wav', format="wav")
    sentence = audio.load_wav('temp.wav', sr=hparams.sample_rate)
    return sentence
Example #5
    def _process_wave(self, wav_file, num_frames):
        try:
            wav = audio.load_wav(wav_file, sr=audio_hparams.sample_rate)
        except FileNotFoundError:
            print(
                'file {} present in csv metadata is not present in wav folder. skipping!'
                .format(wav_file))
            return None

        if audio_hparams.trim_silence:
            wav = audio.trim_silence(wav, audio_hparams)

        expect_len = num_frames * audio_hparams.hop_size + audio_hparams.win_size
        if len(wav) < expect_len:
            wav = np.concatenate([wav] * np.math.ceil(expect_len / len(wav)))

        if len(wav) > expect_len:
            sp = random.randint(0, len(wav) - expect_len)
            wav = wav[sp:sp + expect_len]

        wav = audio.preemphasis(wav, audio_hparams.preemphasis,
                                audio_hparams.preemphasize)

        if audio_hparams.rescale:
            wav = wav / np.abs(wav).max() * audio_hparams.rescaling_max

        mels = audio.melspectrogram(wav, audio_hparams).astype(np.float32).T
        return mels
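`_process_wave` forces the signal to exactly `num_frames * hop_size + win_size` samples before computing mels, tiling short files and randomly cropping long ones. A small hedged check of that length arithmetic:

# Hedged numeric check of the expected-length arithmetic used above
# (hop_size / win_size values here are illustrative, not the repo's defaults).
hop_size, win_size, num_frames = 200, 800, 100
expect_len = num_frames * hop_size + win_size
assert expect_len == 20800  # 100 hops of 200 samples plus one 800-sample window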
Example #6
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):

    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv not in folder'.format(wav_path))
        return None

    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    out = mulaw_quantize(wav, hparams.quantize_channels)

    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]

    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    assert linear_frames == mel_frames

    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
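The `mulaw` / `mulaw_quantize` helpers used in these preprocessors implement standard mu-law companding. A hedged, illustrative re-implementation of the underlying transform (the snippets' real helpers come from their own codebase and may use a slightly different rounding convention):

import numpy as np

def mulaw_sketch(x, quantize_channels=256):
    # Compand x in [-1, 1] to [-1, 1] with the mu-law curve.
    x = np.asarray(x, dtype=np.float64)
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize_sketch(x, quantize_channels=256):
    # Compand, then map [-1, 1] to integer bins in [0, quantize_channels).
    mu = quantize_channels - 1
    y = mulaw_sketch(x, quantize_channels)
    return np.floor((y + 1) / 2 * mu).astype(np.int64)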
Example #7
def main():
    process_type = '1'

    input_path = './wav_files/sample.wav'
    output_path = input_path.split(
        '.wav')[0] + '_enhanced_mmse' + process_type + '.wav'
    wav, sample_rate = audio.load_wav(input_path, sr=None)
    wav = wav / np.abs(wav).max() * 0.999

    clean_wav = speech_enhancement.mmse_stsa(wav,
                                             sample_rate=sample_rate,
                                             process_type=process_type)

    # Plot result
    plt.figure()
    plt.subplot(2, 1, 1)
    librosa.display.waveplot(wav, sr=sample_rate)
    plt.title('Noisy Time Signal')
    plt.subplot(2, 1, 2)
    librosa.display.waveplot(clean_wav, sr=sample_rate)
    plt.title('Estimated Clean Time Signal')

    plt.figure()
    plt.subplot(2, 1, 1)
    librosa.display.specshow(librosa.power_to_db(
        librosa.feature.melspectrogram(y=wav,
                                       sr=sample_rate,
                                       n_fft=1024,
                                       hop_length=512),
        ref=np.max),
                             sr=sample_rate,
                             x_axis='time',
                             y_axis='linear')
    plt.title('Noisy Spectrogram')
    plt.colorbar(format='%+2.0f dB', boundaries=np.linspace(-70, 0, 10))
    plt.subplot(2, 1, 2)
    librosa.display.specshow(librosa.power_to_db(
        librosa.feature.melspectrogram(y=clean_wav,
                                       sr=sample_rate,
                                       n_fft=1024,
                                       hop_length=512),
        ref=np.max),
                             sr=sample_rate,
                             x_axis='time',
                             y_axis='linear')
    plt.title('Estimated Clean Spectrogram')
    plt.colorbar(format='%+2.0f dB', boundaries=np.linspace(-70, 0, 10))
    plt.tight_layout()
    plt.show()

    clean_wav *= 32767 / max(0.01, np.max(np.abs(clean_wav)))
    # proposed by @dsmiller
    wavfile.write(output_path, sample_rate, clean_wav.astype(np.int16))
Example #8
def _process_utterance_libri(mel_dir, label_dir, index, wav_path, label, hparams):
    """
    Preprocesses a single utterance wav/label pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - label_dir: the directory to write the labels into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - label: per-frame label array for the input audio
        - hparams: hyper parameters

    Returns:
        - A tuple: (wav_path, mel_filename, time_steps, mel_frames, label_filename)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #[-1, 1]
    out = wav
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
    mel_spectrogram = mel_spectrogram[:, -len(label):]
    mel_frames = mel_spectrogram.shape[1]
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    mel_filename = 'mel-{}.npy'.format(index)
    label_filename = 'label-{}.npy'.format(index)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, mel_filename, time_steps, mel_frames, label_filename)
Example #9
def vad():
    print("trim audio beggin....")
    dataset_root = Path("/Users/zhuribing/Project/AccelerateServerTest/audio")
    files = list(dataset_root.joinpath("org").glob("*"))

    print(len(files))
    cnt = 0
    for input in files:
        output = str(input).replace("org", "trim", 1)
        wav = audio.load_wav(input, sr=16000)
        wav = audio.trim_silence(wav, hparams)
        audio.save_wav(wav, output, 16000)
        cnt = cnt + 1
        if (cnt % 10 == 0):
            print("complete:", cnt)
Example #10
def _process_utterance(feat_dir, index, wav_path, text, hparams):
    """
	Preprocesses a single utterance wav/text pair.

	This writes the extracted features to disk and returns a tuple to write
	to the train.txt file.

	Args:
		- feat_dir: the directory to write the extracted features into
		- index: the numeric index to use in the feature filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (feat_file, n_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, hparams)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # feature extraction
    feature = audio.feature_extract(wav, hparams)
    n_frames = len(feature)
    if n_frames > hparams.max_frame_num or len(text) > hparams.max_text_length:
        return None

    feat_file = '{}.npy'.format(index)
    np.save(os.path.join(feat_dir, feat_file), feature, allow_pickle=False)

    # Return a tuple describing this training example
    return (feat_file, n_frames, text)
Example #11
def run_eval(args, checkpoint_path, output_dir, hparams, synth):
    eval_dir = os.path.join(output_dir, 'eval')  # assumed, mirroring the run_eval variants below

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output path if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)  # assumed

    if args.reference_audio_path is not None:
        print('reference_audio:', args.reference_audio_path)
        ref_wavs = os.listdir(args.reference_audio_path)

    else:
        if hparams.use_style_encoder == True:
            print("*******************************")
            print("TODO: add style weights when there is no reference audio. Now we use random weights, " +
                  "which may generate unintelligible audio sometimes.")
            print("*******************************")
        else:
            # raise ValueError("You must set the reference audio if you don't want to use GSTs.")
            print("233")

    # Set inputs batch wise
    counter=0
    fault_ppgs=np.zeros((1,2,345),dtype=np.float32)
    for ref_wav in ref_wavs:
        speaker = ref_wav.split('_')[0]
        sentence = ref_wav.split('_')[1]
        save_path = output_dir+'/'+speaker+'/'+sentence
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        counter=counter+1
        ref_wav_name = os.path.join(args.reference_audio_path,ref_wav)
        save_name = os.path.join(save_path+'/'+ref_wav.split('.')[0]+'.npy')
        ref_wav = load_wav(ref_wav_name, hparams.sample_rate)
        reference_mel = melspectrogram(ref_wav, hparams).astype(np.float32).T
        style_embedding = synth.synthesize_embedding(fault_ppgs,[reference_mel])[0]
        np.save(save_name,style_embedding)
        print(str(counter)+'/'+str(len(ref_wavs)))
Example #12
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
	eval_dir = os.path.join(output_dir, 'eval')
	log_dir = os.path.join(output_dir, 'logs-eval')

	if args.model == 'Tacotron-2':
		assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

	#Create output path if it doesn't exist
	os.makedirs(eval_dir, exist_ok=True)
	os.makedirs(log_dir, exist_ok=True)
	os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
	os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

	log(hparams_debug_string())
	synth = Synthesizer()
	
	if args.reference_audio is not None:
		ref_wav = audio.load_wav(args.reference_audio,sr=hparams.sample_rate)
		reference_mel = audio.melspectrogram(ref_wav,hparams).astype(np.float32).T
	else:
		#raise ValueError("Evaluation without reference audio. Please provide path to reference audio.")
		reference_mel = None
	synth.load(checkpoint_path, hparams, reference_mel=reference_mel)

	#Set inputs batch wise
	sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

	
	log('Starting Synthesis')
	with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
		for i, texts in enumerate(tqdm(sentences)):
			start = time.time()
			basenames = ['batch_{:03d}_sentence_{:03d}'.format(i, j) for j in range(len(texts))]
			mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None, reference_mel=reference_mel)

			for elems in zip(texts, mel_filenames, speaker_ids):
				file.write('|'.join([str(x) for x in elems]) + '\n')
	log('synthesized mel spectrograms at {}'.format(eval_dir))
	return eval_dir
Example #13
def run_eval(args, checkpoint_path, output_dir, hparams, ppgs, speakers, Lf0s):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, reference_mels=args.reference_audio)

    if args.reference_audio is not None:
        print('reference_audio:', args.reference_audio)
        ref_wav = load_wav(args.reference_audio.strip(), hparams.sample_rate)
        reference_mel = melspectrogram(ref_wav, hparams).astype(np.float32).T
    else:
        if hparams.use_style_encoder == True:
            print("*******************************")
            print(
                "TODO: add style weights when there is no reference audio. Now we use random weights, "
                + "which may generate unintelligible audio sometimes.")
            print("*******************************")
        else:
            #raise ValueError("You must set the reference audio if you don't want to use GSTs.")
            print("233")

    #Set inputs batch wise
    ppgs = [
        ppgs[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(ppgs), hparams.tacotron_synthesis_batch_size)
    ]
    Lf0s = [
        Lf0s[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(Lf0s), hparams.tacotron_synthesis_batch_size)
    ]
    if args.reference_audio is not None:
        reference_mels = [reference_mel] * len(ppgs)

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:

        for i, texts in enumerate(tqdm(ppgs)):
            start = time.time()
            basenames = [
                'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))
            ]
            if args.reference_audio is not None:
                mel_filenames = synth.synthesize(texts, [speakers[i]],
                                                 basenames, eval_dir, log_dir,
                                                 None, [reference_mels[i]],
                                                 Lf0s[i])
            else:
                mel_filenames = synth.synthesize(texts, [speakers[i]],
                                                 basenames, eval_dir, log_dir,
                                                 None, None, Lf0s[i])

            for elems in zip(texts, mel_filenames, [speakers[i]]):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
Example #14
def build_from_path_ispl(hparams, input_dirs, mel_dir, label_dir, tqdm=lambda x: x):
    """
    Preprocesses the speech dataset from given input paths to the given output directories

    Args:
        - hparams: hyper parameters
        - input_dirs: input directories that contain the files to preprocess
        - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
        - label_dir: the directory to write the label into
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuple describing the train examples. this should be written to train.txt
    """

    # The upstream preprocessor uses a ProcessPoolExecutor to parallelize across
    # processes; that is only an optimization and is omitted here, so the loop below runs serially.
    futures = []
    index = 1
    for input_dir in input_dirs:
        files = find_files(os.path.join(input_dir))
        for wav_path in files:
            file_name = os.path.basename(wav_path)
            if int(file_name.split('.')[0]) <= 10:
                label_path = os.path.join(os.path.dirname(wav_path), 'label.txt')
                with open(label_path, encoding='utf-8') as f:
                    lines = f.readlines()
                for line in lines:
                    if file_name in line:
                        labels = line.replace('[', '').replace(']', '').split(':')[1].replace(',\n', '').split(',')
                        start = []
                        end = []
                        for idx in range(0, len(labels), 2):
                            start.append(int(labels[idx]))
                            end.append(int(labels[idx+1]))

            try:
                # Load the audio as numpy array
                wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
            except FileNotFoundError:  # catch missing wav exception
                print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
                continue

            # rescale wav
            if hparams.rescale:
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:
                wav = audio.trim_silence(wav, hparams)

            # [-1, 1]
            out = wav
            out_dtype = np.float32

            if int(file_name.split('.')[0]) <= 10:
                label = np.zeros_like(out)
                for idx in range(len(start)):
                    start[idx] = int(start[idx] / 1000 * hparams.sample_rate)
                    end[idx] = int(end[idx] / 1000 * hparams.sample_rate)
                    label[start[idx]:end[idx]] = 1.
            else:
                label = wav_path.split('.')[0] + '.label'
                with open(label, encoding='utf-8') as f:
                    lines = f.readlines()
                label = np.asarray([int(line.strip('\n')) for line in lines])

            # Compute the mel scale spectrogram from the wav
            mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
            mel_spectrogram = mel_spectrogram[:, -len(label):]
            mel_frames = mel_spectrogram.shape[1]

            # Ensure time resolution adjustment between audio and mel-spectrogram
            pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

            if int(file_name.split('.')[0]) <= 10:
                # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
                out = np.pad(out, (0, pad), mode='reflect')
                label = np.pad(label, (0, pad), mode='reflect')
                assert len(out) >= mel_frames * audio.get_hop_size(hparams)

                # time resolution adjustment
                # ensure length of raw audio is multiple of hop size so that we can use
                # transposed convolution to upsample
                out = out[:mel_frames * audio.get_hop_size(hparams)]
                label = label[:mel_frames * audio.get_hop_size(hparams)]
                assert len(out) % audio.get_hop_size(hparams) == 0
                label = label[::audio.get_hop_size(hparams)]

                time_steps = len(out)
            else:
                time_steps = len(out)

            # Write the spectrogram and audio to disk
            mel_filename = 'mel-{}.npy'.format(index)
            label_filename = 'label-{}.npy'.format(index)
            np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
            np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False)
            futures.append((wav_path, mel_filename, time_steps, mel_frames, label_filename))
            index += 1

    return [future for future in tqdm(futures)]
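The comment near the top of Example #14 mentions a ProcessPoolExecutor, but the loop above runs serially. For reference, a hedged sketch of the executor pattern it alludes to, using a per-utterance worker such as `_process_utterance_libri` from Example #8 (`wav_items`, pairing wav paths with labels, is an assumed input):

# Hedged sketch of the parallel variant alluded to in Example #14's comment.
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path_parallel(hparams, wav_items, mel_dir, label_dir, n_jobs=4, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=n_jobs)
    futures = []
    for index, (wav_path, label) in enumerate(wav_items, start=1):
        futures.append(executor.submit(
            partial(_process_utterance_libri, mel_dir, label_dir, index, wav_path, label, hparams)))
    results = [future.result() for future in tqdm(futures)]
    return [r for r in results if r is not None]  # drop utterances the worker skipped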
Example #15
def _process_utterance(mel_dir, linear_dir, wav_dir, spkid, uttid, wav_path,
                       text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - spkid: the speaker id, used as a subdirectory name
        - uttid: the utterance id to use in the spectrogram filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (spkid, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        #Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        #Pad audio signal with constant values (librosa-style framing, to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad),
                     mode='constant',
                     constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    sub_wav_dir = os.path.join(wav_dir, spkid)
    sub_mel_dir = os.path.join(mel_dir, spkid)
    sub_linear_dir = os.path.join(linear_dir, spkid)

    os.makedirs(sub_wav_dir, exist_ok=True)
    os.makedirs(sub_mel_dir, exist_ok=True)
    os.makedirs(sub_linear_dir, exist_ok=True)

    audio_filename = 'audio-{}.npy'.format(uttid)
    mel_filename = 'mel-{}.npy'.format(uttid)
    linear_filename = 'linear-{}.npy'.format(uttid)
    np.save(os.path.join(sub_wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(sub_mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(sub_linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (spkid, audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
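The padding/trimming block above exists so that `len(out)` ends up exactly `mel_frames * hop_size`, which lets a WaveNet-style vocoder upsample mel frames to audio samples with transposed convolutions. A small hedged check of that invariant (the hop size value is illustrative; the real one comes from hparams):

# Hedged numeric check of the audio/mel alignment invariant enforced above.
hop_size = 256
mel_frames = 812
time_steps = mel_frames * hop_size      # len(out) after the trim above
assert time_steps % hop_size == 0
assert time_steps == 207872             # every mel frame maps to exactly hop_size samples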
Example #16
def _process_utterance(out_dir, index, wav_path, pinyin, hparams):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of the Chinese spoken in the input audio file
    hparams: Hyper parameters

  Returns:
    An (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin) tuple to write to train.txt
  '''

    mel_dir = os.path.join(out_dir, "mels")
    linear_dir = os.path.join(out_dir, "linear")
    wav_dir = os.path.join(out_dir, "audio")

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    print("debug wav_path:", wav_path)
    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the wav:
    #mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        print("debug --- drop wav_path:", wav_path, "mel_frames:", mel_frames)
        return None

    # Compute the linear-scale spectrogram from the wav:
    #spectrogram = audio.spectrogram(wav).astype(np.float32)
    #n_frames = spectrogram.shape[1]
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrograms to disk:
    #spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    #mel_filename = 'thchs30-mel-%05d.npy' % index
    #np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    #np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    print("debug save wav file:", os.path.join(wav_dir, audio_filename))
    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, pinyin)
Example #17
def _process_utterance(mel_dir, index, wav_path, start, end, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - start, end: start and end points of speech
        - hparams: hyper parameters

    Returns:
        - A tuple: (wav_path, mel_filename, time_steps, mel_frames, start, end)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    start += 1 * hparams.sample_rate
    end += 1 * hparams.sample_rate

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #[-1, 1]
    out = wav
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustment between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

    # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    start = round(start/int(time_steps / mel_frames))
    end = round(end/int(time_steps / mel_frames))

    # Write the spectrogram and audio to disk
    mel_filename = 'mel-{}.npy'.format(index)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, mel_filename, time_steps, mel_frames, start, end)
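After the trim above, `time_steps / mel_frames` equals the hop size, so the final division converts sample indices into mel-frame indices. A small hedged example of that conversion (values are illustrative):

# Hedged example of the sample-index to frame-index conversion used above.
sample_rate, hop_size = 16000, 200       # illustrative values
start_sample = int(1.25 * sample_rate)   # speech starts 1.25 s in -> sample 20000
start_frame = round(start_sample / hop_size)
assert start_frame == 100                # 20000 samples / 200-sample hop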
Example #18
def _process_utterance(lf0_dir, mgc_dir, bap_dir, cmp_dir, linear_dir,
                       basename, wav_path, text, hparams):
    """
	Preprocesses a single utterance wav/text pair.

	This extracts vocoder features (lf0, mgc, bap, cmp) and the linear
	spectrogram, writes them to disk, and returns a tuple to write to the
	train.txt file.

	Args:
		- lf0_dir, mgc_dir, bap_dir, cmp_dir: directories for the extracted vocoder features
		- linear_dir: the directory to write the linear spectrograms into
		- basename: base name used for the output files
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (cmp_filename, linear_filename, cmp_frames, text)
	"""

    if hparams.trim_silence:
        tar_wavfile = wav_path[:-4] + "_trim.wav"
        print("raw wav path:%s" % wav_path)
        wav_raw, fs = sf.read(wav_path)
        wav_trim = audio.trim_silence(wav_raw, hparams)
        sf.write(tar_wavfile, wav_trim, fs)

        wav_path = tar_wavfile

    nFFTHalf, alpha, bap_dim = audio.get_config(hparams.sample_rate)

    mcsize = hparams.num_mgc - 1

    filename = basename  #os.path.basename(wav_path).split(".")[0]

    print('extract feats for %s' % wav_path)

    # extract f0,sp,ap
    os.system("analysis %s %s/%s.f0 %s/%s.sp %s/%s.bapd" %
              (wav_path, lf0_dir, filename, mgc_dir, filename, bap_dir,
               filename))  # get float64???

    # interpolate f0
    f0 = np.fromfile("%s/%s.f0" % (lf0_dir, filename), dtype=np.float64)
    continuous_f0 = interp1d(f0, kind="slinear")
    continuous_f0.tofile("%s/%s.f0c" % (lf0_dir, filename))

    # convert f0 to lf0
    os.system("x2x +da %s/%s.f0c > %s/%s.f0a" %
              (lf0_dir, filename, lf0_dir, filename))
    os.system(
        "x2x +af %s/%s.f0a | sopr -magic 0.0 -LN -MAGIC -1.0E+10 > %s/%s.lf0" %
        (lf0_dir, filename, lf0_dir, filename))

    # convert sp to mgc
    os.system("x2x +df %s/%s.sp | sopr -R -m 32768.0 | "
              "mcep -a %f -m %d -l %d -e 1.0E-8 -j 0 -f 0.0 -q 3 "
              "> %s/%s.mgc" %
              (mgc_dir, filename, alpha, mcsize, nFFTHalf, mgc_dir, filename))

    # convert ap to bap
    os.system("x2x +df %s/%s.bapd > %s/%s.bap" %
              (bap_dir, filename, bap_dir, filename))

    # merge mgc,lf0 and bap to cmp
    os.system("merge +f -s 0 -l 1 -L %d %s/%s.mgc < %s/%s.lf0 > %s/%s.ml" % (
        (mcsize + 1), mgc_dir, filename, lf0_dir, filename, cmp_dir, filename))
    os.system("merge +f -s 0 -l %d -L %d %s/%s.ml < %s/%s.bap > %s/%s.cmp" %
              (bap_dim, (mcsize + 2), cmp_dir, filename, bap_dir, filename,
               cmp_dir, filename))

    #if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
    #	return None

    #Compute the linear scale spectrogram from the wav
    wav = audio.load_wav(wav_path, hparams.sample_rate)
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    #assert linear_frames == mel_frames

    lf0 = np.fromfile("%s/%s.lf0" % (lf0_dir, filename), dtype=np.float32)
    mgc = np.fromfile("%s/%s.mgc" % (mgc_dir, filename), dtype=np.float32)
    bap = np.fromfile("%s/%s.bap" % (bap_dir, filename), dtype=np.float32)
    cmp = np.fromfile("%s/%s.cmp" % (cmp_dir, filename), dtype=np.float32)

    cmp_dim = mcsize + 1 + 1 + bap_dim
    cmp_frames = cmp.shape[0] / cmp_dim
    #print(f0[:100])
    #print(continuous_f0[:100])
    print(lf0.shape)
    print(continuous_f0.shape)
    print(mgc.shape)
    print(bap.shape)
    print(cmp_frames)
    print(continuous_f0.dtype)
    print(mgc.dtype)
    print(bap.dtype)
    assert (mgc.shape[0] /
            (mcsize + 1)) == (continuous_f0.shape[0] /
                              1) == (bap.shape[0] / bap_dim) == cmp_frames
    assert cmp_dim == hparams.num_mels
    #assert len(out) >= cmp_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    #out = out[:mel_frames * audio.get_hop_size(hparams)]
    #assert len(out) % audio.get_hop_size(hparams) == 0
    #time_steps = len(out)

    # Write the spectrogram and audio to disk
    #audio_filename = 'audio-{}.npy'.format(index)
    cmp_mat = cmp.reshape(-1, cmp_dim)
    cmp_filename = 'cmp-{}.npy'.format(basename)
    linear_filename = 'linear-{}.npy'.format(basename)
    #np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(cmp_dir, cmp_filename), cmp_mat, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example
    return (cmp_filename, linear_filename, cmp_frames, text)
Example #19
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

        # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        # Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]

            # pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                split_infos[i][1] = \
                    max_target_len  # Not really used but setting it in case for future development maybe?

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        if self.style_transfer and hparams.tacotron_style_reference_audio is not None and\
                hparams.tacotron_style_alignment is None:
            # only support one style reference audio
            if hparams.tacotron_style_reference_audio[-4:] == '.wav':
                wav = audio.load_wav(hparams.tacotron_style_reference_audio,
                                     sr=hparams.sample_rate)
                np_targets = audio.melspectrogram(wav, self._hparams).astype(
                    np.float32).T
            else:
                np_targets = np.load(hparams.tacotron_style_reference_audio)
            target_lengths = len(np_targets)

            # copy
            np_targets = [np_targets for _ in range(len(texts))]
            target_lengths = [target_lengths for _ in range(len(texts))]

            # pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                split_infos[i][1] = \
                    max_target_len  # Not really used but setting it in case for future development maybe?

            feed_dict[self.targets] = target_seqs
            feed_dict[self.target_lengths] = target_lengths
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            # Linearize outputs (1D arrays)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            if not self.gta:
                # Natural batch synthesis
                # Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            # Linearize outputs (1D arrays)
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            # Natural batch synthesis
            # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            # target_lengths = self._get_output_lengths(stop_tokens)
            target_lengths = [9999]

            # Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels.T, hparams)
            audio.save_wav(wav, 'temp.wav',
                           sr=hparams.sample_rate)  # Find a better way

            chunk = 512
            f = wave.open('temp.wav', 'rb')
            p = pyaudio.PyAudio()
            stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                            channels=f.getnchannels(),
                            rate=f.getframerate(),
                            output=True)
            data = f.readframes(chunk)
            while data:
                stream.write(data)
                data = f.readframes(chunk)

            stream.stop_stream()
            stream.close()

            p.terminate()
            return

        saved_mels_paths = []
        speaker_ids = []
        for i, mel in enumerate(mels):
            # Get speaker id for global conditioning (only used with GTA generally)
            if hparams.gin_channels > 0:
                raise RuntimeError(
                    'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
                )
                speaker_id = '<no_g>'  # set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "basenames" variable)
                speaker_ids.append(
                    speaker_id
                )  # finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker)
            else:
                speaker_id = '<no_g>'
                speaker_ids.append(speaker_id)

            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir,
                                        'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # save alignments
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        'plots/alignment-{}.png'.format(
                                            basenames[i])),
                                    title='{}'.format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                # save mel spectrogram plot
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 'plots/mel-{}.png'.format(basenames[i])),
                    title='{}'.format(texts[i]),
                    split_title=True)

                if hparams.predict_linear:
                    # save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       'wavs/wav-{}-linear.wav'.format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    # save linear spectrogram plot
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              'plots/linear-{}.png'.format(
                                                  basenames[i])),
                                          title='{}'.format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths, speaker_ids
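`synthesize` relies on helpers such as `_prepare_inputs` / `_prepare_targets` to right-pad every sequence in a device batch to the batch maximum before concatenation. A hedged sketch of what such a helper typically does (the real implementation lives in the repo's Synthesizer class):

# Hedged sketch of a _prepare_inputs-style helper: right-pad integer
# sequences with zeros so a whole device batch shares one length.
import numpy as np

def prepare_inputs_sketch(sequences, pad_value=0):
    max_len = max(len(seq) for seq in sequences)
    batch = np.stack([
        np.pad(np.asarray(seq), (0, max_len - len(seq)),
               mode='constant', constant_values=pad_value)
        for seq in sequences
    ])
    return batch, max_len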
Example #20
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, ppgs,
                       lf0_path, speaker, refer, hparams):
    """
	Preprocesses a single utterance wav/ppgs pair.

	This writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file.

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- ppgs: ppgs spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, refer, time_steps, mel_frames, ppgs, speaker, lf0_path)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)
    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, refer, time_steps,
            mel_frames, ppgs, speaker, lf0_path)
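In the preprocessing scripts these snippets come from, the tuples returned by _process_utterance are usually gathered by a driver (often with a ProcessPoolExecutor) and written to train.txt. The sketch below shows one plausible driver for the PPG variant above; the shape of items, the pipe-separated metadata format and the output location are assumptions, not the repository's exact code.

import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path_sketch(items, mel_dir, linear_dir, wav_dir, hparams, n_jobs=4):
    """Hypothetical driver: `items` yields (index, wav_path, ppgs, lf0_path, speaker, refer)."""
    executor = ProcessPoolExecutor(max_workers=n_jobs)
    futures = [
        executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir,
                                index, wav_path, ppgs, lf0_path, speaker, refer, hparams))
        for index, wav_path, ppgs, lf0_path, speaker, refer in items
    ]
    metadata = [f.result() for f in futures]
    with open(os.path.join(os.path.dirname(mel_dir), 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            if m is not None:  # skipped utterances return None
                f.write('|'.join(str(x) for x in m) + '\n')
    return metadata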
Example #21
0
def _process_utterance(wav_dir, mel_dir, index, wav_path, text, hparams):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # mu-law encode the raw waveform (encode_mu_law is defined elsewhere; the result is saved as int16 below)
    out = encode_mu_law(wav, mu=512)

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames or len(
            text) > hparams.max_text_length:
        return None

    #Zero pad for quantized signal
    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    r = mel_frames * audio.get_hop_size(hparams) - len(wav)
    out = np.pad(out, (0, r), mode='constant', constant_values=0.)
    assert len(out) == mel_frames * audio.get_hop_size(hparams)
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    filename = '{}.npy'.format(index)
    np.save(os.path.join(wav_dir, filename),
            out.astype(np.int16),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (filename, time_steps, mel_frames, text)
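encode_mu_law is used above but not shown. A common definition, given here only as a hedged sketch (the repository's own helper may differ), compands the waveform with the mu-law curve and quantizes it to integer levels, which is consistent with the int16 cast when the array is saved.

import numpy as np

def encode_mu_law_sketch(x, mu=512):
    """Mu-law compand a [-1, 1] waveform and quantize it to integer levels in [0, mu)."""
    mu = mu - 1
    fx = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)  # compand to [-1, 1]
    return np.floor((fx + 1) / 2 * mu + 0.5).astype(np.int64)  # quantize to [0, mu)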
Example #22
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
	"""
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
	try:
		# Load the audio as numpy array
		wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError: #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	#rescale wav
	if hparams.rescale:
		wav = wav / np.abs(wav).max() * hparams.rescaling_max

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav = audio.trim_silence(wav, hparams)

	#Mu-law quantize
	if is_mulaw_quantize(hparams.input_type):
		#[0, quantize_channels)
		out = mulaw_quantize(wav, hparams.quantize_channels)

		#Trim silences
		start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
		wav = wav[start: end]
		out = out[start: end]

		constant_values = mulaw_quantize(0, hparams.quantize_channels)
		out_dtype = np.int16

	elif is_mulaw(hparams.input_type):
		#[-1, 1]
		out = mulaw(wav, hparams.quantize_channels)
		constant_values = mulaw(0., hparams.quantize_channels)
		out_dtype = np.float32
	
	else:
		#[-1, 1]
		out = wav
		constant_values = 0.
		out_dtype = np.float32

	# Compute the mel scale spectrogram from the wav
	mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
		return None

	#Compute the linear scale spectrogram from the wav
	linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
	linear_frames = linear_spectrogram.shape[1] 

	#sanity check
	assert linear_frames == mel_frames

	#Ensure time resolution adjustment between audio and mel-spectrogram
	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

	#Zero pad for quantized signal
	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

	#time resolution adjustment
	#ensure length of raw audio is multiple of hop size so that we can use
	#transposed convolution to upsample
	out = out[:mel_frames * audio.get_hop_size(hparams)]
	assert len(out) % audio.get_hop_size(hparams) == 0
	time_steps = len(out)

	# Write the spectrogram and audio to disk
	audio_filename = 'speech-audio-{:05d}.npy'.format(index)
	mel_filename = 'speech-mel-{:05d}.npy'.format(index)
	linear_filename = 'speech-linear-{:05d}.npy'.format(index)
	np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
	np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
	np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

	# Return a tuple describing this training example
	return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
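audio.get_hop_size(hparams) appears in every variant in this listing. In Tacotron-2-style codebases it usually either returns an explicit hop_size or derives one from a frame shift in milliseconds; the sketch below assumes that convention and is not the repository's verbatim code.

def get_hop_size_sketch(hparams):
    """Resolve the hop size in samples (assumed convention, see note above)."""
    hop_size = getattr(hparams, 'hop_size', None)
    if hop_size is None:
        # e.g. a 12.5 ms frame shift at 22050 Hz gives 275 samples
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size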
Example #23
0
def _process_utterance_clova(audio_dir, label_dir, index, wav_path, text_path, args):
    """
    Preprocesses a single utterance wav/text_jamo pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text_jamo: text_jamo spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text_jamo)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=args.sample_rate)
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if args.rescale:
        wav = wav / np.abs(wav).max() * args.rescaling_max

    # M-AILABS extra silence specific
    if args.trim_silence:
        wav = audio.trim_silence(wav, args)

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, args).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustment between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, args.n_fft, audio.get_hop_size(args))

    # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')
    assert len(out) >= mel_frames * audio.get_hop_size(args)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(args)]
    assert len(out) % audio.get_hop_size(args) == 0
    time_steps = len(out)

    # text_jamo sequence
    with open(text_path, 'r', encoding='utf-8', newline='') as f:
        rdr = csv.reader(f)
        for x in rdr:
            if os.path.basename(wav_path) == x[0]:
                line = x[1]

    # ETRI transcription rule
    line = sentence_filter(line).upper()
    label_sequence = normalize(line)
    print(label_sequence)

    # Write the spectrogram and audio to disk
    mel_filename = 'mel-{}.npy'.format(index)
    label_filename = 'label-{}.txt'.format(index)
    np.save(os.path.join(audio_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    with open(os.path.join(label_dir, label_filename), 'w', encoding='utf-8') as f_out:
        f_out.write(label_sequence)

    # Return a tuple describing this training example
    return (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
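Example #23 re-reads and scans text_path for every utterance, and `line` stays undefined if no row matches the wav basename, which would raise a NameError a few lines later. A hedged refactor sketch that loads the csv once into a lookup table; the helper name and the two-column layout are assumptions inferred from how the loop above indexes x[0] and x[1].

import csv

def load_transcript_table(text_path):
    """Map wav basename -> raw transcript line, read once up front."""
    table = {}
    with open(text_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                table[row[0]] = row[1]
    return table

# Inside _process_utterance_clova one could then do, for example:
#     line = transcript_table.get(os.path.basename(wav_path))
#     if line is None:
#         return None  # no transcription found; skip instead of failing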
Example #24
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
	"""
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""

	try:
		# Load the audio as numpy array
		wav = audio.load_wav(wav_path)
	except FileNotFoundError:  # catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	if hparams.rescale:
		wav = wav / np.abs(wav).max() * hparams.rescaling_max

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav = audio.trim_silence(wav)

	#[0, quantize_channels)
	out = mulaw_quantize(wav, hparams.quantize_channels)

	#Trim silences
	start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
	wav = wav[start: end]
	out = out[start: end]

	constant_values = mulaw_quantize(0, hparams.quantize_channels)
	out_dtype = np.int16

	# Compute the mel scale spectrogram from the wav
	mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	#Compute the linear scale spectrogram from the wav
	linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
	linear_frames = linear_spectrogram.shape[1] 

	#sanity check
	assert linear_frames == mel_frames

	#Ensure time resolution adjustment between audio and mel-spectrogram
	l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

	#Zero pad for quantized signal
	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
	assert len(out) >= mel_frames * audio.get_hop_size()

	#time resolution adjustment
	#ensure length of raw audio is multiple of hop size so that we can use
	#transposed convolution to upsample
	out = out[:mel_frames * audio.get_hop_size()]
	time_steps = len(out)
	assert time_steps % audio.get_hop_size() == 0

	# Write the spectrogram and audio to disk
	audio_filename = 'speech-audio-{:05d}.npy'.format(index)
	mel_filename = 'speech-mel-{:05d}.npy'.format(index)
	linear_filename = 'speech-linear-{:05d}.npy'.format(index)
	np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
	np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
	np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

	# Return a tuple describing this training example
	return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
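audio.start_and_end_indices, used by the mu-law branch above, is another helper that is not shown. The sketch below is one plausible implementation consistent with how it is called: find the first and last samples of the quantized signal that differ from digital silence by more than silence_threshold. Treating mulaw_quantize(0, mu) as the silence level is my assumption.

import numpy as np

def start_and_end_indices_sketch(quantized, silence_threshold=2, mu=256):
    """Return (start, end) bracketing the non-silent part of a mu-law quantized signal."""
    silence_code = mulaw_quantize(0, mu)  # quantized value of digital silence (assumed)
    nonsilent = np.where(np.abs(quantized.astype(np.int64) - silence_code) > silence_threshold)[0]
    if len(nonsilent) == 0:
        return 0, len(quantized)
    return int(nonsilent[0]), int(nonsilent[-1]) + 1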
Example #25
0
    Args:
            - mel_dir: the directory to write the mel spectograms into
            - linear_dir: the directory to write the linear spectrograms into
            - wav_dir: the directory to write the preprocessed wav into
            - index: the numeric index to use in the spectrogram filename
            - wav_path: path to the audio file containing the speech input
            - text: text spoken in the input audio file
            - hparams: hyperparameters

    Returns:
            - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    eliminated = 0
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    try:
        # rescale wav
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max

        # M-AILABS extra silence specific
        if hparams.trim_silence:
            new_wav = audio.trim_silence(wav, hparams)
            eliminated += (len(wav) - len(new_wav)) / hparams.sample_rate
    except Exception as e:
Example #26
0
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - out_dir: the directory to write the msgpack into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, audio.get_hop_size(hparams), hparams.pad_sides)

    # Constant-pad the audio signal on both sides so samples stay aligned with the spectrogram frames
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)
    npz_filename = '{}.npz'.format(index)
    r = hparams.outputs_per_step
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.
    # +2r for head and tail silence
    mel_spec = np.pad(mel_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    linear_spec = np.pad(linear_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    target_length = len(linear_spec)
    target_frames = (target_length // r + 1) * r
    num_pad = target_frames - target_length
    if num_pad != 0:
        linear_spec = np.pad(linear_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)
        mel_spec = np.pad(mel_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)
    stop_token = np.concatenate(
        [np.zeros(target_frames - 1, dtype=np.float32), np.ones(1, dtype=np.float32)],
        axis=0)
    data = {
        'mel': mel_spec,
        'linear': linear_spec,
        'audio': out.astype(out_dtype),
        'input_data': np.asarray(text_to_sequence(text)),
        'time_steps': time_steps,
        'mel_frames': target_frames,
        'text': text,
        'stop_token': stop_token,
    }
    dumps_msgpack(data, os.path.join(out_dir, npz_filename))
    # Return a tuple describing this training example
    return npz_filename, time_steps, mel_frames, text
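dumps_msgpack is not defined in this snippet. A minimal sketch of what such a helper could look like, assuming the msgpack and msgpack_numpy packages (the project's actual serialization helper may differ):

import msgpack
import msgpack_numpy

msgpack_numpy.patch()  # let msgpack serialize numpy arrays transparently

def dumps_msgpack_sketch(data, path):
    """Serialize a dict of numpy arrays / scalars to `path` as msgpack."""
    with open(path, 'wb') as f:
        f.write(msgpack.packb(data, use_bin_type=True))

def loads_msgpack_sketch(path):
    """Read back a msgpack file written by dumps_msgpack_sketch."""
    with open(path, 'rb') as f:
        return msgpack.unpackb(f.read(), raw=False)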
Example #27
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      mel_dir: The directory to write the mel spectrograms into.
      linear_dir: The directory to write the linear spectrograms into.
      wav_dir: The directory to write the preprocessed audio into.
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input.
      text: The text spoken in the input audio file.
      hparams: Hyperparameters.

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
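Every variant in this listing ends by enforcing the same invariant: after padding and trimming, len(out) == mel_frames * hop_size, so a vocoder can upsample the conditioning frames to samples with a stride-hop_size transposed convolution. A small worked check with illustrative numbers (hop_size=256 and the lengths below are assumptions, not values taken from these hparams):

import numpy as np

hop_size = 256                              # illustrative only
mel_frames = 812                            # e.g. mel_spectrogram.shape[1]
out = np.zeros(mel_frames * hop_size + 73)  # padded audio, slightly longer than needed
out = out[:mel_frames * hop_size]           # the trim performed in every example above
assert len(out) % hop_size == 0
time_steps = len(out)                       # 207872 samples paired with 812 frames
print(time_steps, time_steps // hop_size)   # -> 207872 812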
Example #28
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    wav = _trim_wav(audio.load_wav(wav_path, sr=hparams.sample_rate))
    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    name = os.path.splitext(os.path.basename(wav_path))[0]
    speaker_id = _speaker_re.match(name).group(1)

    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, speaker_id, text)
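Example #28 additionally relies on _trim_wav and _speaker_re, neither of which is shown. The definitions below are plausible stand-ins only: the regex assumes VCTK-style file names such as p225_001.wav, and the trimming threshold is arbitrary.

import re
import librosa

# Hypothetical definitions; the repository's own regex and trim settings may differ.
_speaker_re = re.compile(r'([^_]+)_')  # speaker id = everything before the first underscore

def _trim_wav(wav, top_db=40):
    """Trim leading/trailing silence with librosa's energy-based trimming."""
    trimmed, _ = librosa.effects.trim(wav, top_db=top_db)
    return trimmed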