Example #1
def _process_utterance(out_dir, index, wav_path, text):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- out-dir: the directory to write the spectograms into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file

	Returns:
		- A tuple: (mel_filename, n_frames, text)
	"""

    # Load the audio as numpy array
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav to calculate n_frames
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrogram to disk
    mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (mel_filename, n_frames, text)
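For context, a helper like this is normally driven over a whole corpus; the sketch below is only a minimal, hypothetical driver assuming an LJSpeech-style metadata.csv with "id|...|text" lines and the _process_utterance function above, not the code of any particular repository.

import os

def build_from_path(in_dir, out_dir):
    # Hypothetical driver: walk the metadata file, preprocess each utterance,
    # and collect the (mel_filename, n_frames, text) tuples for train.txt.
    metadata = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, 1):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            metadata.append(_process_utterance(out_dir, index, wav_path, parts[-1]))
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join(str(x) for x in m) + '\n')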
Example #2
def extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args):
    if not os.path.exists(wav_filename):
        print("Wav file {} doesn't exists.".format(wav_filename))
        return None

    wav = audio.load_wav(wav_filename, sr=hparams.sample_rate)
    # Process wav samples
    wav = audio.trim_silence(wav, hparams)
    n_samples = len(wav)

    # Extract mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]
    if n_frames > hparams.max_acoustic_length:
        print(
            "Ignore wav {} because the frame number {} is too long (Max {} frames in hparams.yaml)."
            .format(wav_filename, n_frames, hparams.max_acoustic_length))
        return None

    # Align features
    desired_frames = int(min(n_samples / hparams.hop_size, n_frames))
    wav = wav[:desired_frames * hparams.hop_size]
    mel_spectrogram = mel_spectrogram[:, :desired_frames]
    n_samples = wav.shape[0]
    n_frames = mel_spectrogram.shape[1]
    assert (n_samples / hparams.hop_size == n_frames)

    # Save intermediate acoustic features
    mel_filename = os.path.join(out_dir, key + '.npy')
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    audio.save_wav(wav, out_wav_path, hparams)

    return (wav_filename, mel_filename, n_samples, n_frames)
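The alignment step above trims both signals so that the sample count is an exact multiple of the hop size; a toy check of that invariant with made-up numbers, independent of the audio module:

hop_size = 256                                               # illustrative hop size
n_samples, n_frames = 66059, 259                             # raw lengths before alignment
desired_frames = int(min(n_samples / hop_size, n_frames))    # 258
n_samples = desired_frames * hop_size                        # 66048
assert n_samples / hop_size == desired_frames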
Example #3
def infer(model, src_pth):
    src = load_wav(src_pth, seg=False)
    mel = melspectrogram(src).astype(np.float32)
    mel = mode(torch.Tensor([mel]))
    with torch.no_grad():
        res = model.infer(mel)[0]
    return [src, to_arr(res)]
Example #4
 def wav2spec(self, wav_path):
     wav = audio.load_wav(wav_path)
     spec = audio.melspectrogram(wav).astype(np.float32)
     spec = spec.transpose()
     feat_size = spec.shape[1]
     pad_spec = np.zeros(
         [(len(spec) + self.outputs_per_step - 1) // self.outputs_per_step *
          self.outputs_per_step, feat_size],
         dtype='float32')
     pad_spec[:len(spec)] = spec
     return pad_spec.reshape([-1, self.outputs_per_step * feat_size])
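The padding and reshape in wav2spec pack outputs_per_step consecutive frames into one row; a short shape walk-through with illustrative numbers (nothing here depends on the class above):

import numpy as np

outputs_per_step, feat_size = 5, 80
spec = np.zeros([23, feat_size], dtype='float32')           # 23 mel frames
padded_len = (len(spec) + outputs_per_step - 1) // outputs_per_step * outputs_per_step
pad_spec = np.zeros([padded_len, feat_size], dtype='float32')
pad_spec[:len(spec)] = spec                                  # zero-pad to 25 frames
packed = pad_spec.reshape([-1, outputs_per_step * feat_size])
assert packed.shape == (5, 400)                              # 5 rows of 5 * 80 features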
Example #5
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)
    wav1, wav2, wav3, wav4 = audio.subband(wav)

    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels
    hop_length = int(hparams.frame_shift_ms / 4000 * hparams.sample_rate)
    length_diff_1 = len(spc) * hop_length - len(wav1)
    length_diff_2 = len(spc) * hop_length - len(wav2)
    length_diff_3 = len(spc) * hop_length - len(wav3)
    length_diff_4 = len(spc) * hop_length - len(wav4)
    wav1 = wav1.reshape(-1,1)
    if length_diff_1 > 0:
        wav1 = np.pad(wav1, [[0, length_diff_1], [0, 0]], 'constant')
    elif length_diff_1 < 0:
        wav1 = wav1[: hop_length * spc.shape[0]]
    wav2 = wav2.reshape(-1,1)
    if length_diff_2 > 0:
        wav2 = np.pad(wav2, [[0, length_diff_2], [0, 0]], 'constant')
    elif length_diff_2 < 0:
        wav2 = wav2[: hop_length * spc.shape[0]]
    wav3 = wav3.reshape(-1,1)
    if length_diff_3 > 0:
        wav3 = np.pad(wav3, [[0, length_diff_3], [0, 0]], 'constant')
    elif length_diff_3 < 0:
        wav3 = wav3[: hop_length * spc.shape[0]]
    wav4 = wav4.reshape(-1,1)
    if length_diff_4 > 0:
        wav4 = np.pad(wav4, [[0, length_diff_4], [0, 0]], 'constant')
    elif length_diff_4 < 0:
        wav4 = wav4[: hop_length * spc.shape[0]]
    fid1 = os.path.basename(audio_path).replace('.npy', '_band1.npy')
    fid2 = os.path.basename(audio_path).replace('.npy', '_band2.npy')
    fid3 = os.path.basename(audio_path).replace('.npy', '_band3.npy')
    fid4 = os.path.basename(audio_path).replace('.npy', '_band4.npy')

    fid1 = os.path.join('training_data/audios', fid1)
    
    fid2 = os.path.join('training_data/audios', fid2)
    fid3 = os.path.join('training_data/audios', fid3)
    fid4 = os.path.join('training_data/audios', fid4)
    
    np.save(fid1, wav1)
    np.save(fid2, wav2)
    np.save(fid3, wav3)
    np.save(fid4, wav4)
    np.save(spc_path, spc)
    return (fid1, fid2, fid3, fid4, spc_path, spc.shape[0])
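The four per-band blocks above repeat the same pad-or-trim logic (the band-3 branch originally padded wav1, which looks like a copy-paste slip and is corrected above). A hedged refactor that keeps the same behaviour in a single helper could look like this:

import numpy as np

def _align_band(band, n_frames, hop_length):
    # Pad with zeros or trim so the band has exactly n_frames * hop_length samples.
    band = band.reshape(-1, 1)
    diff = n_frames * hop_length - len(band)
    if diff > 0:
        band = np.pad(band, [[0, diff], [0, 0]], 'constant')
    elif diff < 0:
        band = band[:n_frames * hop_length]
    return band

# wav1, wav2, wav3, wav4 = [_align_band(b, spc.shape[0], hop_length)
#                           for b in (wav1, wav2, wav3, wav4)]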
Example #6
 def __getitem__(self, index):
     if hps.prep:
         wav, mel = self.f_list[index]
         seg_ml = hps.seg_l // hps.frame_shift + 1
         ms = np.random.randint(0, mel.shape[1] -
                                seg_ml) if mel.shape[1] > seg_ml else 0
         ws = hps.frame_shift * ms
         wav = wav[ws:ws + hps.seg_l]
         mel = mel[:, ms:ms + seg_ml]
     else:
         wav = load_wav(self.f_list[index])
         mel = melspectrogram(wav).astype(np.float32)
     return wav, mel
Example #7
def infer(wav_path, text, model):
    sequence = text_to_sequence(text, hps.text_cleaners)
    sequence = to_var(torch.IntTensor(sequence)[None, :]).long()
    mel = melspectrogram(load_wav(wav_path))
    # Trim the mel so its frame count is a multiple of n_frames_per_step;
    # slicing with [:, :-0] would drop every frame, so handle r == 0 explicitly.
    r = mel.shape[1] % hps.n_frames_per_step
    n_keep = mel.shape[1] - r
    mel_in = to_var(torch.Tensor([mel[:, :n_keep]]))
    if mel_in.shape[2] < 1:
        return None
    sequence = torch.cat([sequence, sequence], 0)
    mel_in = torch.cat([mel_in, mel_in], 0)
    _, mel_outputs_postnet, _, _ = model.teacher_infer(sequence, mel_in)
    ret = mel
    ret[:, :n_keep] = to_arr(mel_outputs_postnet[0])
    return ret
Example #8
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    # print(len(spectrogram))
    # print(len(spectrogram[0]))
    # print(type(spectrogram))
    # print(np.shape(spectrogram))
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    # print(np.shape(mel_spectrogram))
    # print()

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #9
def files_to_list(fdir):
    f_list = []
    with open(os.path.join(fdir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(fdir, 'wavs', '%s.wav' % parts[0])
            if hps.prep:
                wav = load_wav(wav_path, False)
                if wav.shape[0] < hps.seg_l:
                    wav = np.pad(wav, (0, hps.seg_l - wav.shape[0]),
                                 'constant',
                                 constant_values=(0, 0))
                mel = melspectrogram(wav).astype(np.float32)
                f_list.append([wav, mel])
            else:
                f_list.append(wav_path)
    if hps.prep and hps.pth is not None:
        with open(hps.pth, 'wb') as w:
            pickle.dump(f_list, w)
    return f_list
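The hps.pth pickle written above can be reloaded later to skip recomputing the mels; a minimal counterpart sketch, assuming hps.pth points at the file written by files_to_list:

import pickle

def load_prepared_list(pth):
    # Reload the [wav, mel] pairs dumped by files_to_list when hps.prep is set.
    with open(pth, 'rb') as r:
        return pickle.load(r)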
Example #10
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)
    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels
    hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    length_diff = len(spc) * hop_length - len(wav)
    wav = wav.reshape(-1, 1)
    if length_diff > 0:
        wav = np.pad(wav, [[0, length_diff], [0, 0]], 'constant')
    elif length_diff < 0:
        wav = wav[:hop_length * spc.shape[0]]

    np.save(audio_path, wav)
    np.save(spc_path, spc)
    return (audio_path, spc_path, spc.shape[0])
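The hop length used for the alignment above is derived from the frame shift in milliseconds; a quick arithmetic check with common, purely illustrative values:

frame_shift_ms, sample_rate = 12.5, 22050     # illustrative hparams
hop_length = int(frame_shift_ms / 1000 * sample_rate)
assert hop_length == 275                      # 275 samples per spectrogram frame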
Example #11
def _process_utterance(out_dir, index, wav_path, text):

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'meta_spec_%05d.npy' % index
    mel_filename = 'meta_mel_%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def main():

    with tf.device('/cpu:0'):  # CPU is faster here; forcing GPU raises an error, and omitting tf.device is slower.
        config = get_arguments()
        started_datestring = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
        logdir = os.path.join(config.logdir, 'generate', started_datestring)
        print('logdir0-------------' + logdir)

        if not os.path.exists(logdir):
            os.makedirs(logdir)

        load_hparams(hparams, config.checkpoint_dir)

        sess = tf.Session()
        scalar_input = hparams.scalar_input
        net = WaveNetModel(
            batch_size=config.batch_size,
            dilations=hparams.dilations,
            filter_width=hparams.filter_width,
            residual_channels=hparams.residual_channels,
            dilation_channels=hparams.dilation_channels,
            quantization_channels=hparams.quantization_channels,
            out_channels=hparams.out_channels,
            skip_channels=hparams.skip_channels,
            use_biases=hparams.use_biases,
            scalar_input=hparams.scalar_input,
            global_condition_channels=hparams.gc_channels,
            global_condition_cardinality=config.gc_cardinality,
            local_condition_channels=hparams.num_mels,
            upsample_factor=hparams.upsample_factor,
            legacy=hparams.legacy,
            residual_legacy=hparams.residual_legacy,
            train_mode=False
        )  # During training, global_condition_cardinality is determined by AudioReader, but here it must be supplied explicitly.

        if scalar_input:
            samples = tf.placeholder(tf.float32, shape=[net.batch_size, None])
        else:
            samples = tf.placeholder(
                tf.int32, shape=[net.batch_size, None]
            )  # samples: mu-law encoded values, before one-hot conversion. Shape (batch_size, length).

        # The local condition should be (N, T, num_mels), but it is fed one step at a time as (N, 1, num_mels); squeezed, that is (N, num_mels).
        upsampled_local_condition = tf.placeholder(
            tf.float32, shape=[net.batch_size, hparams.num_mels])

        next_sample = net.predict_proba_incremental(
            samples, upsampled_local_condition, [config.gc_id] * net.batch_size
        )  # Applies the Fast WaveNet generation algorithm (arXiv:1611.09482).

        # Build the upsampled local condition data that will be fed into the upsampled_local_condition placeholder.
        print('logdir0-------------' + logdir)
        mel_input = np.load(config.mel)
        sample_size = mel_input.shape[0] * hparams.hop_size
        mel_input = np.tile(mel_input, (config.batch_size, 1, 1))
        with tf.variable_scope('wavenet', reuse=tf.AUTO_REUSE):
            upsampled_local_condition_data = net.create_upsample(
                mel_input, upsample_type=hparams.upsample_type)

        var_list = [
            var for var in tf.global_variables() if 'queue' not in var.name
        ]
        saver = tf.train.Saver(var_list)
        print('Restoring model from {}'.format(config.checkpoint_dir))

        load(saver, sess, config.checkpoint_dir)
        init_op = tf.group(tf.initialize_all_variables(),
                           net.queue_initializer)

        sess.run(init_op)  # Without this, the variables would still hold the values restored from the checkpoint.

        quantization_channels = hparams.quantization_channels
        if config.wav_seed:
            # If wav_seed is shorter than the receptive field it arguably should be padded; as is, a short seed is returned unchanged, so a seed that is too short causes an error.
            seed = create_seed(config.wav_seed, hparams.sample_rate,
                               quantization_channels, net.receptive_field,
                               scalar_input)  # mu-law encoded.
            if scalar_input:
                waveform = seed.tolist()
            else:
                waveform = sess.run(
                    seed).tolist()  # [116, 114, 120, 121, 127, ...]

            print('Priming generation...')
            for i, x in enumerate(waveform[-net.receptive_field:-1]
                                  ):  # The very last sample is fed in the first iteration of the generation loop below.
                if i % 100 == 0:
                    print('Priming sample {}/{}'.format(
                        i, net.receptive_field),
                          end='\r')
                sess.run(next_sample,
                         feed_dict={
                             samples:
                             np.array([x] * net.batch_size).reshape(
                                 net.batch_size, 1),
                             upsampled_local_condition:
                             np.zeros([net.batch_size, hparams.num_mels])
                         })
            print('Done.')
            waveform = np.array([waveform[-net.receptive_field:]] *
                                net.batch_size)
        else:
            # Silence with a single random sample at the end.
            if scalar_input:
                waveform = [0.0] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(
                    net.batch_size, -1)
                waveform = np.concatenate(
                    [
                        waveform, 2 * np.random.rand(net.batch_size).reshape(
                            net.batch_size, -1) - 1
                    ],
                    axis=-1)  # Append one random number in [-1, 1] at the end.
                # waveform: shape (batch_size, net.receptive_field)
            else:
                waveform = [quantization_channels / 2] * (
                    net.receptive_field - 1
                )  # Build receptive_field - 1 samples here; one random sample is appended below.
                waveform = np.array(waveform * net.batch_size).reshape(
                    net.batch_size, -1)
                waveform = np.concatenate(
                    [
                        waveform,
                        np.random.randint(quantization_channels,
                                          size=net.batch_size).reshape(
                                              net.batch_size, -1)
                    ],
                    axis=-1)  # Before one-hot conversion. (batch_size, 5117)

        start_time = time.time()
        upsampled_local_condition_data = sess.run(
            upsampled_local_condition_data)
        last_sample_timestamp = datetime.now()
        for step in range(sample_size):  # Loop sample_size times to generate the desired output length.

            window = waveform[:,
                              -1:]  # Feed only the last generated sample into samples.  window: shape (N, 1)

            # Run the WaveNet to predict the next sample.

            # Without fast generation, window would be the full history: [128.0, 128.0, ..., 128.0, 178, 185]
            # With fast generation, window is a single value per batch element.
            prediction = sess.run(
                next_sample,
                feed_dict={
                    samples:
                    window,
                    upsampled_local_condition:
                    upsampled_local_condition_data[:, step, :]
                }
            )  # samples is mu-law encoded; it is converted to one-hot during computation. --> (batch_size, 256)

            if scalar_input:
                sample = prediction  # Sampled from a logistic distribution, so there is randomness.
            else:
                # Scale prediction distribution using temperature.
                # If config.temperature == 1, this just divides each element by the sum; softmax has already been applied, so the sum is 1 and nothing changes.
                # If config.temperature != 1, each element's log is divided by the temperature and the result is rescaled to sum to 1.
                np.seterr(divide='ignore')
                scaled_prediction = np.log(
                    prediction
                ) / config.temperature  # No change when config.temperature == 1.
                scaled_prediction = (
                    scaled_prediction - np.logaddexp.reduce(
                        scaled_prediction, axis=-1, keepdims=True)
                )  # np.log(np.sum(np.exp(scaled_prediction)))
                scaled_prediction = np.exp(scaled_prediction)
                np.seterr(divide='warn')

                # Prediction distribution at temperature=1.0 should be unchanged after
                # scaling.
                if config.temperature == 1.0:
                    np.testing.assert_allclose(
                        prediction,
                        scaled_prediction,
                        atol=1e-5,
                        err_msg=
                        'Prediction scaling at temperature=1.0 is not working as intended.'
                    )

                # Since the sample is not chosen by argmax, the same input can yield different outputs.
                sample = [[
                    np.random.choice(np.arange(quantization_channels), p=p)
                ] for p in scaled_prediction]  # choose one sample per batch

            waveform = np.concatenate([waveform, sample],
                                      axis=-1)  #window.shape: (N,1)

            # Show progress only once per second.
            current_sample_timestamp = datetime.now()
            time_since_print = current_sample_timestamp - last_sample_timestamp
            if time_since_print.total_seconds() > 1.:
                duration = time.time() - start_time
                print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(
                    step + 1, sample_size, duration),
                      end='\r')
                last_sample_timestamp = current_sample_timestamp

        # Introduce a newline to clear the carriage return from the progress.
        print()

        # Save the result as a wav file.
        if hparams.input_type == 'raw':
            out = waveform[:, net.receptive_field:]
        elif hparams.input_type == 'mulaw':
            decode = mu_law_decode(samples,
                                   quantization_channels,
                                   quantization=False)
            out = sess.run(
                decode, feed_dict={samples: waveform[:, net.receptive_field:]})
        else:  # 'mulaw-quantize'
            decode = mu_law_decode(samples,
                                   quantization_channels,
                                   quantization=True)
            out = sess.run(
                decode, feed_dict={samples: waveform[:, net.receptive_field:]})

        # save wav

        for i in range(net.batch_size):
            config.wav_out_path = logdir + '/test-{}.wav'.format(i)
            mel_path = config.wav_out_path.replace(".wav", ".png")

            gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(
                np.float32).T
            audio.save_wav(out[i], config.wav_out_path,
                           hparams.sample_rate)  # save_wav modifies out[i] in place.

            plot.plot_spectrogram(gen_mel_spectrogram,
                                  mel_path,
                                  title='generated mel spectrogram',
                                  target_spectrogram=mel_input[i])
        print('Finished generating.')
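The temperature handling inside the sampling loop is self-contained and can be checked in isolation; the sketch below reproduces just that computation and does not touch the model or session:

import numpy as np

def scale_distribution(p, temperature):
    # Divide log-probabilities by the temperature and renormalise;
    # at temperature == 1.0 the distribution is unchanged.
    logp = np.log(p) / temperature
    logp -= np.logaddexp.reduce(logp, axis=-1, keepdims=True)
    return np.exp(logp)

p = np.array([[0.1, 0.2, 0.7]])
np.testing.assert_allclose(scale_distribution(p, 1.0), p, atol=1e-8)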
Example #13
def extract_audio_mels(audio_path):
    wav = audio.load_wav(audio_path)
    mels = audio.melspectrogram(wav)
    return mels
Example #14
 def get_mel(self, filename):
     wav = load_wav(filename)
     mel = melspectrogram(wav).astype(np.float32)
     return torch.Tensor(mel)
Example #15
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'bznsyp-audio-%05d.npy' % index
    mel_filename = 'bznsyp-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
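A read-back sketch for the pair of files written above; the filenames follow the 'bznsyp-audio-%05d.npy' / 'bznsyp-mel-%05d.npy' pattern used in this example, and the hop size is assumed to match audio.get_hop_size():

import os
import numpy as np

def load_example(out_dir, index, hop_size):
    out = np.load(os.path.join(out_dir, 'bznsyp-audio-%05d.npy' % index))
    mel = np.load(os.path.join(out_dir, 'bznsyp-mel-%05d.npy' % index))
    assert len(out) == mel.shape[0] * hop_size   # invariant enforced at write time
    return out, mel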
Example #16
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance' : So split the wav into chunks

    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:  # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start: chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
        # lws pads zeros internally before performing stft
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,)
        text_idx = '%s - %05d' % (text, chunk_idx,)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
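The chunking above splits a long LibriVox file into fixed 8-second utterances, with the last chunk extended to the end of the file; a small arithmetic sketch with illustrative numbers:

sample_rate = 22050                     # illustrative
n_samples = int(8.0 * sample_rate)      # 176400 samples per 8 s chunk
total_samples = 30 * sample_rate        # a 30 s source file
n_chunks = total_samples // n_samples   # 3 chunks; the third runs to the file end
assert n_chunks == 3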
Example #17
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
Example #18
def get_mel(wav_path):
    wav = load_wav(wav_path)
    return torch.Tensor(melspectrogram(wav).astype(np.float32))
Example #19
def _process_utterance(out_dir, index, wav_path, text, silence_threshold,
                       fft_size):
    '''Preprocesses a single utterance audio/text pair.

    This writes the processed audio and the mel-scale spectrogram to disk and returns
    a tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the outputs into
      index: The numeric index of this utterance
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      silence_threshold: Threshold used to trim silence from the quantized signal
      fft_size: FFT size used when adjusting the audio/mel time resolution

    Returns:
      An (audio_path, mel_path, text, timesteps) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hp.rescaling:
        wav = wav / np.abs(wav).max() * hp.rescaling_max

    if hp.input_type != "raw":
        # Mu-law quantize
        out = P.mulaw_quantize(wav)

        # Trim silences
        start, end = audio.start_and_end_indices(out, silence_threshold)
        out = out[start:end]
        wav = wav[start:end]
        constant_value = P.mulaw_quantize(0, 256)
        out_dtype = np.int16
    else:
        out = wav
        constant_value = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_value)
    mel_len = mel_spectrogram.shape[0]
    assert len(out) >= mel_len * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:mel_len * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    wav_id = wav_path.split('/')[-1].split('.')[0]
    # Write the spectrograms to disk:
    audio_path = os.path.join(out_dir, '{}-audio.npy'.format(wav_id))
    mel_path = os.path.join(out_dir, '{}-mel.npy'.format(wav_id))
    np.save(audio_path, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_path, mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return os.path.abspath(audio_path), os.path.abspath(
        mel_path), text, timesteps
Example #20
 def get_mel(self, audio):
     audio_norm = audio / wavenet_utils.MAX_WAV_VALUE
     melspec = melspectrogram(audio_norm, hparams)
     melspec = melspec.transpose()
     return melspec
Example #21
def eval_step(sess,logdir,step,waveform,upsampled_local_condition_data,speaker_id_data,mel_input_data,samples,speaker_id,upsampled_local_condition,next_sample,temperature=1.0):
    waveform = waveform[:,:1]
    
    sample_size = upsampled_local_condition_data.shape[1]
    last_sample_timestamp = datetime.now()
    start_time = time.time()
    for step2 in range(sample_size):  # Loop sample_size times to generate the desired output length.
        window = waveform[:,-1:]  # Feed only the last generated sample into samples.  window: shape (N, 1)
        

        prediction = sess.run(next_sample, feed_dict={samples: window,upsampled_local_condition: upsampled_local_condition_data[:,step2,:],speaker_id: speaker_id_data })


        if hparams.scalar_input:
            sample = prediction  # Sampled from a logistic distribution, so there is randomness.
        else:
            # Scale prediction distribution using temperature.
            # If temperature == 1, this just divides each element by the sum; softmax has already been applied, so the sum is 1 and nothing changes.
            # If temperature != 1, each element's log is divided by the temperature and the result is rescaled to sum to 1.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / temperature   # No change when temperature == 1.
            scaled_prediction = (scaled_prediction - np.logaddexp.reduce(scaled_prediction,axis=-1,keepdims=True))  # np.log(np.sum(np.exp(scaled_prediction)))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')
    
            # Prediction distribution at temperature=1.0 should be unchanged after
            # scaling.
            if temperature == 1.0:
                np.testing.assert_allclose( prediction, scaled_prediction, atol=1e-5, err_msg='Prediction scaling at temperature=1.0 is not working as intended.')
            
            # Since the sample is not chosen by argmax, the same input can yield different outputs.
            sample = [[np.random.choice(np.arange(hparams.quantization_channels), p=p)] for p in scaled_prediction]  # choose one sample per batch
        
        waveform = np.concatenate([waveform,sample],axis=-1)   #window.shape: (N,1)

        # Show progress only once per second.
        current_sample_timestamp = datetime.now()
        time_since_print = current_sample_timestamp - last_sample_timestamp
        if time_since_print.total_seconds() > 1.:
            duration = time.time() - start_time
            print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(step2 + 1, sample_size, duration), end='\r')
            last_sample_timestamp = current_sample_timestamp
    
    print('\n')
    # Save the result as a wav file.    
    if hparams.input_type == 'raw':
        out = waveform[:,1:]
    elif hparams.input_type == 'mulaw':
        decode = mu_law_decode(samples, hparams.quantization_channels,quantization=False)
        out = sess.run(decode, feed_dict={samples: waveform[:,1:]})
    else:  # 'mulaw-quantize'
        decode = mu_law_decode(samples, hparams.quantization_channels,quantization=True)
        out = sess.run(decode, feed_dict={samples: waveform[:,1:]})          
        
        
    # save wav
    
    for i in range(1):
        wav_out_path= logdir + '/test-{}-{}.wav'.format(step,i)
        mel_path =  wav_out_path.replace(".wav", ".png")
        
        gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
        audio.save_wav(out[i], wav_out_path, hparams.sample_rate)  # save_wav modifies out[i] in place.
        
        plot.plot_spectrogram(gen_mel_spectrogram, mel_path, title='generated mel spectrogram{}'.format(step),target_spectrogram=mel_input_data[i])  
Example #22
class Synthesizer:
    def load(self, checkpoint_path, hparams, model_name='WaveNet'):
        log('Constructing model: {}'.format(model_name))
        self._hparams = hparams
        local_cond, global_cond = self._check_conditions()

        self.local_conditions = tf.placeholder(
            tf.float32,
            shape=(None, None, hparams.num_mfccs),
            name='local_condition_features') if local_cond else None
        self.global_conditions = tf.placeholder(
            tf.int32, shape=(None, 1),
            name='global_condition_features') if global_cond else None
        self.synthesis_length = tf.placeholder(
            tf.int32, shape=(),
            name='synthesis_length') if not local_cond else None
        self.input_lengths = tf.placeholder(
            tf.int32, shape=(1, ),
            name='input_lengths') if hparams.wavenet_synth_debug else None
        self.synth_debug = hparams.wavenet_synth_debug

        with tf.variable_scope('WaveNet_model') as scope:
            self.model = create_model(model_name, hparams)
            self.model.initialize(y=None,
                                  c=self.local_conditions,
                                  g=self.global_conditions,
                                  input_lengths=self.input_lengths,
                                  synthesis_length=self.synthesis_length,
                                  test_inputs=None)

            self._hparams = hparams
            sh_saver = create_shadow_saver(self.model)

            log('Loading checkpoint: {}'.format(checkpoint_path))
            #Memory allocation on the GPU as needed
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            config.allow_soft_placement = True

            self.session = tf.Session(config=config)
            self.session.run(tf.global_variables_initializer())

        load_averaged_model(self.session, sh_saver, checkpoint_path)

    def synthesize(self,
                   mel_spectrograms,
                   speaker_ids,
                   basenames,
                   out_dir,
                   log_dir,
                   embed_dir,
                   embed_only=True):
        hparams = self._hparams
        local_cond, global_cond = self._check_conditions()

        #Switch mels in case of debug
        if self.synth_debug:
            assert len(hparams.wavenet_debug_mels) == len(
                hparams.wavenet_debug_wavs)
            mel_spectrograms = [
                np.load(mel_file) for mel_file in hparams.wavenet_debug_mels
            ]

        #Prepare local condition batch
        maxlen = max([len(x) for x in mel_spectrograms])
        #[-max, max] or [0,max]
        T2_output_range = (
            -self._hparams.max_abs_value,
            self._hparams.max_abs_value) if self._hparams.symmetric_mels else (
                0, self._hparams.max_abs_value)

        if self._hparams.clip_for_wavenet:
            mel_spectrograms = [
                np.clip(x, T2_output_range[0], T2_output_range[1])
                for x in mel_spectrograms
            ]

        c_batch = np.asarray(mel_spectrograms).astype(np.float32)
        print("c batch shape {}".format(c_batch.shape))
        if self._hparams.normalize_for_wavenet:
            #rerange to [0, 1]
            c_batch = _interp(c_batch, T2_output_range).astype(np.float32)

        g = None if speaker_ids is None else np.asarray(
            speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
        print("g shape {}".format(g.shape))
        feed_dict = {}

        if local_cond:
            feed_dict[self.local_conditions] = c_batch
        else:
            feed_dict[self.synthesis_length] = 100

        if global_cond:
            feed_dict[self.global_conditions] = g

        if self.synth_debug:
            debug_wavs = hparams.wavenet_debug_wavs
            assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
            test_wavs = [
                np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs
            ]

            #pad wavs to same length
            max_test_len = max([len(x) for x in test_wavs])
            test_wavs = np.stack([
                _pad_inputs(x, max_test_len) for x in test_wavs
            ]).astype(np.float32)

            assert len(test_wavs) == len(debug_wavs)
            #### GTA False
            feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])

        if not embed_only:

            #Generate wavs and clip extra padding to select Real speech parts
            #### VQVAE Out
            generated_wavs, upsampled_features, vq_embeddings, vq_onehot, vq_w, vq_enc_ind = self.session.run(
                [
                    self.model.tower_y_hat,
                    self.model.tower_synth_upsampled_local_features,
                    self.model.vq_embeddings, self.model.vq_onehot,
                    self.model.vq_w, self.model.vq_enc_ind
                ],
                feed_dict=feed_dict)

            #Linearize outputs (n_gpus -> 1D)
            generated_wavs = [
                wav for gpu_wavs in generated_wavs for wav in gpu_wavs
            ]
            upsampled_features = [
                feat for gpu_feats in upsampled_features for feat in gpu_feats
            ]

            for i, (generated_wav, input_mel, upsampled_feature,
                    vq_embedding) in enumerate(
                        zip(generated_wavs, mel_spectrograms, upsampled_features,
                            vq_embeddings)):
                #Save wav to disk
                audio_filename = os.path.join(
                    out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
                save_wavenet_wav(generated_wav,
                                 audio_filename,
                                 sr=hparams.sample_rate,
                                 inv_preemphasize=hparams.preemphasize,
                                 k=hparams.preemphasis)

                #### Vq embedding save (shape [batch_size, num_frames, embed_dim])
                embed_filename = os.path.join(embed_dir,
                                              'emb-{}.npy'.format(basenames[i]))
                np.save(embed_filename, vq_embedding)

                onehot_filename = os.path.join(
                    embed_dir, 'onehot-{}.npy'.format(basenames[i]))
                np.save(onehot_filename, vq_onehot)

                wmatrix_filename = os.path.join(
                    embed_dir, 'wmatrix-{}.npy'.format(basenames[i]))
                np.save(wmatrix_filename, vq_w)

                idx_filename = os.path.join(embed_dir,
                                            'idx-{}.npy'.format(basenames[i]))
                np.save(idx_filename, vq_enc_ind)

                #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
                #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
                generated_mel = melspectrogram(generated_wav, hparams).T
                util.plot_spectrogram(
                    generated_mel,
                    os.path.join(
                        log_dir,
                        'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
                    title=
                    'Local Condition vs Reconstructed Audio Mel-Spectrogram analysis',
                    target_spectrogram=input_mel)
                #Save upsampled features to visualize checkerboard artifacts.
                util.plot_spectrogram(
                    upsampled_feature.T,
                    os.path.join(
                        log_dir,
                        'wavenet-upsampled_features-{}.png'.format(basenames[i])),
                    title='Upsampled Local Condition features',
                    auto_aspect=True)

                #Save waveplot to disk
                if log_dir is not None:
                    plot_filename = os.path.join(
                        log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
                    util.waveplot(plot_filename,
                                  generated_wav,
                                  None,
                                  hparams,
                                  title='WaveNet generated Waveform.')

        else:
            #Embed-only path: extract the VQ quantities without generating waveforms
            #### VQVAE Out

            vq_embeddings, vq_onehot, vq_w, vq_enc_ind = self.session.run(
                [
                    self.model.vq_embeddings, self.model.vq_onehot,
                    self.model.vq_w, self.model.vq_enc_ind
                ],
                feed_dict=feed_dict)

            for i, vq_embedding in enumerate(vq_embeddings):

                #### Vq embedding save (shape [batch_size, num_frames, embed_dim])
                embed_filename = os.path.join(embed_dir,
                                              'emb-{}.npy'.format(basenames[i]))
                np.save(embed_filename, vq_embedding)

                onehot_filename = os.path.join(
                    embed_dir, 'onehot-{}.npy'.format(basenames[i]))
                np.save(onehot_filename, vq_onehot)

                wmatrix_filename = os.path.join(
                    embed_dir, 'wmatrix-{}.npy'.format(basenames[i]))
                np.save(wmatrix_filename, vq_w)

                idx_filename = os.path.join(embed_dir,
                                            'idx-{}.npy'.format(basenames[i]))
                np.save(idx_filename, vq_enc_ind)

    def _check_conditions(self):
        local_condition = self._hparams.cin_channels > 0
        global_condition = self._hparams.gin_channels > 0
        return local_condition, global_condition
Example #23
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if (mel_frames > hparams.max_mel_frames and hparams.clip_mels_length) or (
            hparams.min_text_tokens > len(text)
            or hparams.min_mel_frames > mel_frames):
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.hop_size,
                                        hparams.pad_sides)

    # Zero-pad the audio signal on the right (to keep frames consistent, as Librosa does)
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * hparams.hop_size

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * hparams.hop_size]
    assert len(out) % hparams.hop_size == 0
    time_steps = len(out)
    npz_filename = '{}.npz'.format(index)
    mel_spectrogram = mel_spectrogram.T
    linear_spectrogram = linear_spectrogram.T

    r = hparams.reduction_factor
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.
    target_length = len(linear_spectrogram)
    mel_spectrogram = np.pad(mel_spectrogram, [[r, r], [0, 0]],
                             "constant",
                             constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, [[r, r], [0, 0]],
                                "constant",
                                constant_values=_pad_value)
    target_length = target_length + 2 * r
    padded_target_length = (target_length // r + 1) * r
    num_pad = padded_target_length - target_length
    stop_token_target = np.pad(np.zeros(padded_target_length - 1,
                                        dtype=np.float32), (0, 1),
                               "constant",
                               constant_values=1)
    mel_spectrogram = np.pad(mel_spectrogram, ((0, num_pad), (0, 0)),
                             "constant",
                             constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, ((0, num_pad), (0, 0)),
                                "constant",
                                constant_values=_pad_value)

    data = {
        'mel': mel_spectrogram,
        'linear': linear_spectrogram,
        'input_data': text_to_sequence(text),  # eos(~)
        'time_steps': time_steps,
        'stop_token_target': stop_token_target,
        'mel_frames': padded_target_length,
        'text': text,
    }
    np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    # Return a tuple describing this training example
    return npz_filename, time_steps, padded_target_length, text
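The reduction-factor padding near the end of _process_utterance pads r frames on each side, rounds the total up to the next multiple of r, and builds a stop-token target that is zero everywhere except its last entry; a worked example with illustrative numbers:

import numpy as np

r, mel_frames = 2, 797                               # illustrative values
target_length = mel_frames + 2 * r                   # 801 after the [r, r] padding
padded_target_length = (target_length // r + 1) * r  # 802
num_pad = padded_target_length - target_length       # 1 extra frame of padding
stop_token_target = np.pad(np.zeros(padded_target_length - 1, dtype=np.float32),
                           (0, 1), 'constant', constant_values=1)
assert stop_token_target.shape[0] == padded_target_length
assert stop_token_target[-1] == 1.0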
def get_mel(filename):
    wav = load_wav(filename)
    mel = melspectrogram(wav).astype(np.float32)
    return mel
Example #25
ctr = 0
for line in f:
    if len(line) > 2:
        ctr += 1
        line = line.split('\n')[0]

        fname = line.split()[0]
        phones = ' '.join(k for k in line.split()[1:])

        if generate_feats_flag:
            wav_fname = wav_dir + '/' + fname + '.wav'
            wav = audio.load_wav(wav_fname)
            max_samples = _max_out_length * 5 / 1000 * 16000
            spectrogram = audio.spectrogram(wav).astype(np.float32)
            n_frames = spectrogram.shape[1]
            mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
            lspec_fname = lspec_dir + '/' + fname + '_lspec.npy'
            mspec_fname = mspec_dir + '/' + fname + '_mspec.npy'
            np.save(lspec_fname, spectrogram.T, allow_pickle=False)
            np.save(mspec_fname, mel_spectrogram.T, allow_pickle=False)

            g = open(data_file, 'a')
            g.write(lspec_fname + '|' + mspec_fname + '|' + str(n_frames) + '| ' + phones + '\n')
            g.close()

            g = open(feats_dir + '/' + fname + '.feats', 'w')
            for phone in phones.split():
                g.write(phone + '\n')
            g.close()

        if ctr % 100 == 1:
Example #26
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)  # returns a 1-D waveform array
        #Load an audio file as a floating point time series.
        #Audio will be automatically resampled to the given rate (default sr=22050).
        #To preserve the native sampling rate of the file, use sr=None. 
        #print('====wav====')
        #print(wav,wav.shape) (240001,)
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    #rescale wav
    if hparams.rescaling:   # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    #We rescale because it is assumed in Wavenet training that wavs are in [-1, 1] when computing the mixture loss. This is mainly coming from PixelCNN implementation.
    #https://github.com/Rayhane-mamah/Tacotron-2/issues/69

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)   # Trim leading and trailing silence

    #Mu-law quantize; the default input_type is 'raw'
    #The quantization noise is from the analog to digital conversion. The mu-law compression actually reduces the noise and increases the dynamic range.
    #If you search a little bit in the code you will find that the input is always mu-law encoded here.
    #scalar_input only determines if the model uses a one-hot encoding for every data point of the input waveform, or just uses floating point values for each sample.
    if hparams.input_type=='mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start: end]
        out = out[start: end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type=='mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    #print('====mel_spectrogram====')
    #print(mel_spectrogram,mel_spectrogram.shape) #(80,797),(80,801) ...
    mel_frames = mel_spectrogram.shape[1]
    #print('===mel frame====')
    #print(mel_frames) 801, 797 ,...
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:   # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    #print('====linear_spectrogram====')
    #print(linear_spectrogram,linear_spectrogram.shape) #(1025,787),(1025,801)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:    # hparams.use_lws = False
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams)) #1024 == 2048//2 == fft_size//2
        #print('===pad===')
        #print(pad) 
        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        #print(out,out.shape) #(240001,)
        out = np.pad(out, pad, mode='reflect')  # shape: (242049,) after reflect padding
        #print(out,out.shape) #(242049,)
        #print('===out====')
        #print(out,out.shape)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]  # trim to mel_frames * hop_size (240300 here)
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)
    #print(audio.get_hop_size(hparams)) : 300
    #print(out,out.shape) #(240300,) = 801*300
    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]  # file name without its extension
    #print('====wav_id====')
    #print(wav_id)
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag=True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,  
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),   # a trailing "1" (the eos token "~") is appended
            'loss_coeff': 1  # For Tacotron
        }
        #print('=====data====')
        #print(data)
        # Save all arrays into a single uncompressed *.npz file (np.savez has no allow_pickle parameter).
        np.savez(os.path.join(out_dir, npz_filename), **data)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    #print('====mel_frames====')
    #print(mel_frames)
    #print('====time_steps====')
    #print(time_steps)
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
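
As an aside on the mulaw branches above: audio.mulaw and audio.mulaw_quantize come from the project's audio module, which is not shown here. The following is only a minimal numpy sketch of what such helpers conventionally compute (the exact scaling in this repository may differ); the underscore-prefixed names are illustrative, not the repo's API.

import numpy as np

def _mulaw(x, quantize_channels=256):
    # Compand a float signal in [-1, 1]; mu = quantize_channels - 1.
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def _mulaw_quantize(x, quantize_channels=256):
    # Map the companded signal from [-1, 1] onto integer bins [0, quantize_channels).
    mu = quantize_channels - 1
    y = _mulaw(x, quantize_channels)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

# In this sketch, silence maps to the middle bin: _mulaw_quantize(0, 256) == 128,
# which is why constant_values above is derived from the encoded zero sample.
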
Beispiel #27
0
        batch = phonemizer.encode(batch, njobs=args.NJOBS, clean=False)
        phonemes.extend(batch)
    audio_data = np.concatenate([np.array(audio_data), np.expand_dims(phonemes, axis=1)], axis=1)
    if args.CACHE_PHON:
        np.save(phon_path, audio_data, allow_pickle=True)

print('\nBuilding dataset and writing files')
np.random.seed(42)
np.random.shuffle(audio_data)
test_metafile = os.path.join(args.TARGET_DIR, 'test_metafile.txt')
train_metafile = os.path.join(args.TARGET_DIR, 'train_metafile.txt')

test_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
              audio_data[:config['n_test']]]
train_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
               audio_data[config['n_test']:-1]]

with open(test_metafile, 'w+', encoding='utf-8') as test_f:
    test_f.writelines(test_lines)
with open(train_metafile, 'w+', encoding='utf-8') as train_f:
    train_f.writelines(train_lines)

for i in tqdm.tqdm(range(len(audio_data))):
    filename, _, _ = audio_data[i]
    wav_path = os.path.join(args.WAV_DIR, filename + '.wav')
    y, sr = librosa.load(wav_path, sr=config['sampling_rate'])
    mel = melspectrogram(y, config)
    mel_path = os.path.join(mel_dir, filename)
    np.save(mel_path, mel.T)
print('\nDone')
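
The script above writes metafile lines in a 'filename|text|phonemes' layout and stores each mel under mel_dir as filename.npy, transposed to (frames, n_mels). As a hedged sketch only — read_metafile and its arguments are illustrative and not part of the original script — a matching reader could look like this:

import os
import numpy as np

def read_metafile(metafile_path, mel_dir):
    # Yield (mel, text, phonemes) triples from a 'filename|text|phonemes' metafile.
    with open(metafile_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            filename, text, phonemes = line.split('|', 2)
            mel = np.load(os.path.join(mel_dir, filename + '.npy'))  # (frames, n_mels)
            yield mel, text, phonemes
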
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel- and linear-scale spectrograms (or a single .npz bundle)
    to disk and returns a tuple to write to the train.txt file.

    Args:
        - out_dir: the directory to write the preprocessed features into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} listed in the csv metadata is not present in the wav folder. Skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav,
                                 hparams)  # Trim leading and trailing silence

    #Mu-law quantize; the default input_type is 'raw'
    if hparams.input_type == 'mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type == 'mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:  # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" (the eos token "~") is appended
            'loss_coeff': 1  # For Tacotron
        }

        # Save all arrays into a single uncompressed *.npz file (np.savez has no allow_pickle parameter).
        np.savez(os.path.join(out_dir, npz_filename), **data)
    else:
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename),
                linear_spectrogram.T,
                allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, npz_filename)
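
Because npz_flag is hard-coded to True, each utterance ends up as a single <wav_id>.npz containing the keys assembled above. The sketch below shows how a data loader might read one back; the function name and the shape comments are illustrative, derived only from the dict written by this preprocessor.

import numpy as np

def load_utterance(npz_path):
    # Keys mirror the dict saved by _process_utterance above.
    data = np.load(npz_path)
    audio_samples = data['audio']   # raw or mu-law samples, len == time_steps
    mel = data['mel']               # (mel_frames, num_mels)
    linear = data['linear']         # (mel_frames, num_freq)
    tokens = data['tokens']         # text_to_sequence(text), ends with the eos id
    return audio_samples, mel, linear, tokens
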
Beispiel #29
0
from utils.audio import melspectrogram, inv_mel_spectrogram, load_wav, save_wav


wav_path = "LJ001-0008.wav"
raw_wav = load_wav(wav_path)
mel_spec = melspectrogram(raw_wav)
inv_wav = inv_mel_spectrogram(mel_spec)
save_wav(inv_wav, "inv.wav")
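
For reference, inv_mel_spectrogram in projects like this usually maps the mel spectrogram back to a linear magnitude spectrogram and estimates phase with Griffin-Lim. A rough librosa-only equivalent of the same round trip is sketched below; the STFT/mel parameters are assumptions and would need to match whatever utils.audio uses for the output to sound comparable.

import librosa
import soundfile as sf

# Assumed analysis settings; align these with the project's hparams.
sr, n_fft, hop_length, n_mels = 22050, 2048, 256, 80

y, _ = librosa.load("LJ001-0008.wav", sr=sr)
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft,
                                     hop_length=hop_length, n_mels=n_mels)
# Approximate inversion: mel -> linear magnitude -> Griffin-Lim phase estimation.
y_inv = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_fft=n_fft,
                                             hop_length=hop_length)
sf.write("inv_librosa.wav", y_inv, sr)
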