def audio2mel(audiopaths_and_text, melpaths_and_text, args):
    """Precompute mel spectrograms for every clip in a filelist.

    Args:
        audiopaths_and_text: Filelist with audio paths and transcripts.
        melpaths_and_text: Filelist with target mel paths and transcripts;
            entry i is the save location for the mel of audio entry i.
        args: Namespace with the STFT/mel hyper-parameters consumed by
            TextMelLoader.
    """
    melpaths_and_text_list = load_filepaths_and_text(melpaths_and_text)
    audiopaths_and_text_list = load_filepaths_and_text(audiopaths_and_text)
    data_loader = TextMelLoader(audiopaths_and_text, args)
    total = len(melpaths_and_text_list)
    for i, mel_entry in enumerate(melpaths_and_text_list):
        if i % 100 == 0:
            print("done", i, "/", total)
        # First column of each filelist row is the path.
        mel = data_loader.get_mel(audiopaths_and_text_list[i][0])
        torch.save(mel, mel_entry[0])
def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
             n_mel_channels, symbol_set='english_basic', n_speakers=1,
             load_mel_from_disk=True, max_wav_value=None, sampling_rate=None,
             filter_length=None, hop_length=None, win_length=None,
             mel_fmin=None, mel_fmax=None, **ignored):
    """Read the filelist and, when mels must be computed on the fly,
    configure the Tacotron STFT front end.

    Extra keyword arguments are accepted and ignored so a shared config
    dict can be passed straight in.
    """
    multi_speaker = n_speakers > 1
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text, has_speakers=multi_speaker)
    self.load_mel_from_disk = load_mel_from_disk
    if load_mel_from_disk:
        # Mels are read from disk; no audio front end is needed.
        return
    self.max_wav_value = max_wav_value
    self.sampling_rate = sampling_rate
    self.stft = layers.TacotronSTFT(
        filter_length, hop_length, win_length, n_mel_channels,
        sampling_rate, mel_fmin, mel_fmax)
def __init__(self, dataset_path, audiopaths_and_text, args):
    """Load the audio/text filelist, build the STFT front end and shuffle
    the entries with a fixed seed for reproducibility."""
    entries = load_filepaths_and_text(dataset_path, audiopaths_and_text)
    # Deterministic shuffle: every run sees the same ordering.
    random.seed(1234)
    random.shuffle(entries)
    self.audiopaths_and_text = entries
    self.max_wav_value = args.max_wav_value
    self.sampling_rate = args.sampling_rate
    self.segment_length = args.segment_length
    self.stft = layers.TacotronSTFT(
        args.filter_length, args.hop_length, args.win_length,
        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
        args.mel_fmax)
def __init__(self, dataset_path, audiopaths_and_text, args,
             load_mel_from_disk=True):
    """Read the filelist and text cleaners; when mels are not precomputed
    on disk, also set up the Tacotron STFT front end."""
    entries = load_filepaths_and_text(dataset_path, audiopaths_and_text)
    self.audiopaths_and_text = entries
    self.text_cleaners = args.text_cleaners
    self.load_mel_from_disk = load_mel_from_disk
    if load_mel_from_disk:
        # Nothing else to configure: mels come straight from disk.
        return
    self.max_wav_value = args.max_wav_value
    self.sampling_rate = args.sampling_rate
    self.stft = layers.TacotronSTFT(
        args.filter_length, args.hop_length, args.win_length,
        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
        args.mel_fmax)
def __init__(self, audiopaths_and_text, args):
    """Load the audio/text filelist, configure the STFT front end and
    shuffle the entries deterministically."""
    entries = load_filepaths_and_text(audiopaths_and_text)
    # Fixed seed so the shuffled order is reproducible across runs.
    random.seed(1234)
    random.shuffle(entries)
    self.audiopaths_and_text = entries
    self.text_cleaners = args.text_cleaners
    self.max_wav_value = args.max_wav_value
    self.sampling_rate = args.sampling_rate
    self.load_mel_from_disk = args.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        args.filter_length, args.hop_length, args.win_length,
        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
        args.mel_fmax)
def audio2mel(dataset_path: str, audiopaths_and_text: str,
              melpaths_and_text: str, args: ArgumentParser) -> None:
    """Compute a mel spectrogram for each audio file in a filelist and
    save it to the matching path from the mel filelist.

    Args:
        dataset_path (str): Path to dataset
        audiopaths_and_text (str): Path to filelist with audio paths and text
        melpaths_and_text (str): Path to filelist with mel paths and text
        args (ArgumentParser): Namespace with arguments
    """
    mel_entries = load_filepaths_and_text(dataset_path, melpaths_and_text)
    audio_entries = load_filepaths_and_text(dataset_path, audiopaths_and_text)
    loader = TextMelLoader(dataset_path, audiopaths_and_text, args)
    total = len(mel_entries)
    for idx, mel_entry in enumerate(mel_entries):
        if idx % 100 == 0:
            print("done", idx, "/", total)
        # Row i of each filelist refers to the same clip; column 0 is the path.
        spectrogram = loader.get_mel(audio_entries[idx][0])
        torch.save(spectrogram, mel_entry[0])
def audio2mel(dataset_path, audiopaths_and_text, melpaths_and_text, args,
              use_intermed=None):
    """Precompute mel spectrograms for a filelist of audio clips.

    Args:
        dataset_path: Root directory the filelist paths are relative to.
        audiopaths_and_text: Filelist with audio paths and transcripts.
        melpaths_and_text: Filelist with target mel paths and transcripts;
            entry i is the save location for the mel of audio entry i.
        args: Namespace with the STFT/mel hyper-parameters consumed by
            TextMelLoader.
        use_intermed: Unused; kept for interface compatibility.
    """
    melpaths_and_text_list = load_filepaths_and_text(
        dataset_path, melpaths_and_text)
    audiopaths_and_text_list = load_filepaths_and_text(
        dataset_path, audiopaths_and_text)
    data_loader = TextMelLoader(dataset_path, audiopaths_and_text, args)
    total = len(melpaths_and_text_list)
    for i, mel_entry in enumerate(melpaths_and_text_list):
        if i % 100 == 0:
            print("done", i, "/", total)
        # Column 0 of each filelist row is the path.
        mel = data_loader.get_mel(audiopaths_and_text_list[i][0])
        torch.save(mel, mel_entry[0])
def __init__(self, dataset_path, audiopaths_and_text, segment_length,
             n_mel_channels, max_wav_value, sampling_rate, filter_length,
             hop_length, win_length, mel_fmin, mel_fmax, args):
    """Load the filelist, build the STFT front end and shuffle the
    entries with a fixed seed so runs are reproducible."""
    entries = load_filepaths_and_text(dataset_path, audiopaths_and_text)
    random.seed(1234)  # deterministic ordering across runs
    random.shuffle(entries)
    self.audiopaths_and_text = entries
    self.max_wav_value = max_wav_value
    self.sampling_rate = sampling_rate
    self.segment_length = segment_length
    self.stft = layers.TacotronSTFT(
        filter_length, hop_length, win_length, n_mel_channels,
        sampling_rate, mel_fmin, mel_fmax)
def __init__(self, dataset_path, audiopaths_and_text, args,
             speaker_ids=None):
    """Load the filelist, set up the STFT front end, shuffle entries
    deterministically and resolve the speaker-id lookup table."""
    entries = load_filepaths_and_text(dataset_path, audiopaths_and_text)
    random.seed(1234)  # fixed seed -> reproducible shuffle
    random.shuffle(entries)
    self.audiopaths_and_text = entries
    self.text_cleaners = args.text_cleaners
    self.max_wav_value = args.max_wav_value
    self.sampling_rate = args.sampling_rate
    self.load_mel_from_disk = args.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        args.filter_length, args.hop_length, args.win_length,
        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
        args.mel_fmax)
    # Derive the speaker lookup table from the filelist unless the caller
    # supplied one explicitly.
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)
    else:
        self.speaker_ids = speaker_ids
def __init__(self,
             dataset_path,
             audiopaths_and_text,
             text_cleaners,
             n_mel_channels,
             symbol_set='english_basic',
             p_arpabet=1.0,
             n_speakers=1,
             load_mel_from_disk=True,
             load_pitch_from_disk=True,
             pitch_mean=214.72203,  # LJSpeech defaults
             pitch_std=65.72038,
             max_wav_value=None,
             sampling_rate=None,
             filter_length=None,
             hop_length=None,
             win_length=None,
             mel_fmin=None,
             mel_fmax=None,
             prepend_space_to_text=False,
             append_space_to_text=False,
             pitch_online_dir=None,
             betabinomial_online_dir=None,
             use_betabinomial_interpolator=True,
             pitch_online_method='pyin',
             **ignored):
    """Initialize the dataset: load the filelist, configure the text
    processor, pitch handling and (optionally) the STFT front end, and
    validate the filelist column layout.

    Extra keyword arguments are accepted and ignored so a shared config
    dict can be passed straight in.
    """
    # Expect a list of filelists; promote a single path for convenience.
    if isinstance(audiopaths_and_text, str):
        audiopaths_and_text = [audiopaths_and_text]
    self.dataset_path = dataset_path
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text, has_speakers=(n_speakers > 1))
    self.load_mel_from_disk = load_mel_from_disk
    if not load_mel_from_disk:
        # Mels computed on the fly need the raw-audio front end.
        self.max_wav_value = max_wav_value
        self.sampling_rate = sampling_rate
        self.stft = layers.TacotronSTFT(
            filter_length, hop_length, win_length, n_mel_channels,
            sampling_rate, mel_fmin, mel_fmax)
    self.load_pitch_from_disk = load_pitch_from_disk
    self.prepend_space_to_text = prepend_space_to_text
    self.append_space_to_text = append_space_to_text
    assert p_arpabet in (0.0, 1.0), (
        'Only 0.0 and 1.0 p_arpabet is currently supported. '
        'Variable probability breaks caching of betabinomial matrices.')
    self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet)
    self.n_speakers = n_speakers
    self.pitch_tmp_dir = pitch_online_dir
    self.f0_method = pitch_online_method
    self.betabinomial_tmp_dir = betabinomial_online_dir
    self.use_betabinomial_interpolator = use_betabinomial_interpolator
    if use_betabinomial_interpolator:
        self.betabinomial_interpolator = BetaBinomialInterpolator()
    # Filelist layout: <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]
    expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1))
    assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None), (
        'Pitch cannot both be loaded from disk and computed online')
    if len(self.audiopaths_and_text[0]) < expected_columns:
        raise ValueError(
            f'Expected {expected_columns} columns in audiopaths file. '
            'The format is <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]')
    if len(self.audiopaths_and_text[0]) > expected_columns:
        print('WARNING: Audiopaths file has more columns than expected')

    def to_tensor(x):
        # Scalar stats arrive as plain floats; tensors pass through as-is.
        return torch.Tensor([x]) if isinstance(x, float) else x

    self.pitch_mean = to_tensor(pitch_mean)
    self.pitch_std = to_tensor(pitch_std)
def audio2mel2audio(dataset_path, audiopaths_and_text, melpaths_and_text,
                    args, use_intermed=None,
                    data_path="/data/logotypografia_simple/cleaned_wavs/",
                    out_dir="griffin_lim_inv_audio_custom7/"):
    """Round-trip each clip through a linear spectrogram and Griffin-Lim
    inversion, saving the reconstructed waveforms for listening tests.

    Args:
        dataset_path: Root directory the filelist paths are relative to.
        audiopaths_and_text: Filelist with audio paths and transcripts.
        melpaths_and_text: Filelist used only to bound how many clips
            are processed.
        args: Namespace with filter_length, win_length, hop_length, n_iters.
        use_intermed: Unused; kept for interface compatibility.
        data_path: Directory holding the source wavs (previously hard-coded).
        out_dir: Directory the reconstructions are written to (previously
            hard-coded).
    """
    import os  # local import: only needed to create the output directory

    melpaths_and_text_list = load_filepaths_and_text(
        dataset_path, melpaths_and_text)
    audiopaths_and_text_list = load_filepaths_and_text(
        dataset_path, audiopaths_and_text)

    # torchaudio magnitude STFT and Griffin-Lim inversion configured with
    # matching analysis parameters.
    spec = T.Spectrogram(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        power=1,
        normalized=True,
    )
    griffin_lim = T.GriffinLim(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        n_iter=args.n_iters,
        power=1,
        normalized=True,
    )
    print(args)

    os.makedirs(out_dir, exist_ok=True)
    for i in range(len(melpaths_and_text_list)):
        basename = audiopaths_and_text_list[i][0].split("/")[-1]
        audio, sampling_rate = load_wav_to_torch(data_path + basename)
        magnitude = spec(audio)
        inv_waveform = griffin_lim(magnitude)
        inv_wav_name = out_dir + basename
        print(f"Saving reconstructed wav with name {inv_wav_name}")
        # Write at the clip's own sampling rate; the previous hard-coded
        # 16000 pitched/slowed any clip recorded at a different rate.
        write(inv_wav_name, int(sampling_rate),
              inv_waveform.detach().cpu().numpy())