import os import unittest from tests import get_tests_input_path, get_tests_output_path, get_tests_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") os.makedirs(OUT_PATH, exist_ok=True) conf = BaseAudioConfig(mel_fmax=8000) # pylint: disable=protected-access class TestAudio(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.ap = AudioProcessor(**conf) def test_audio_synthesis(self): """1. load wav 2. set normalization parameters 3. extract mel-spec 4. invert to wav and save the output """ print(" > Sanity check for the process wav -> mel -> wav") def _test(max_norm, signal_norm, symmetric_norm, clip_norm): self.ap.max_norm = max_norm
meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")) # download dataset if not already present if not os.path.exists(dataset_config.path): print("Downloading dataset") download_thorsten_de( os.path.split(os.path.abspath(dataset_config.path))[0]) audio_config = BaseAudioConfig( sample_rate=22050, do_trim_silence=True, trim_db=60.0, signal_norm=False, mel_fmin=0.0, mel_fmax=8000, spec_gain=1.0, log_func="np.log", ref_level_db=20, preemphasis=0.0, ) config = SpeedySpeechConfig( run_name="speedy_speech_thorsten-de", audio=audio_config, batch_size=32, eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, compute_input_seq_cache=True, run_eval=True,
import torch from tests import get_tests_input_path, get_tests_output_path, get_tests_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") os.makedirs(OUT_PATH, exist_ok=True) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) def test_torch_stft(): torch_stft = TorchSTFT(ap.fft_size, ap.hop_length, ap.win_length) # librosa stft wav = ap.load_wav(WAV_FILE) M_librosa = abs(ap._stft(wav)) # pylint: disable=protected-access # torch stft wav = torch.from_numpy(wav[None, :]).float() M_torch = torch_stft(wav) # check the difference b/w librosa and torch outputs assert (M_librosa - M_torch[0].data.numpy()).max() < 1e-5 def test_stft_loss():