Ejemplo n.º 1
0
def audio2mel(audiopaths_and_text, melpaths_and_text, args):
    """Compute a mel spectrogram for every audio file and save it to disk.

    Args:
        audiopaths_and_text: Path to filelist with audio paths and text.
        melpaths_and_text: Path to filelist with output mel paths and text.
        args: Namespace with the STFT/mel configuration for TextMelLoader.
    """
    melpaths_and_text_list = load_filepaths_and_text(melpaths_and_text)
    audiopaths_and_text_list = load_filepaths_and_text(audiopaths_and_text)

    data_loader = TextMelLoader(audiopaths_and_text, args)

    total = len(melpaths_and_text_list)
    # Entry i of the audio filelist corresponds to entry i of the mel
    # filelist; column 0 of each entry is the file path.
    for i, melpath_and_text in enumerate(melpaths_and_text_list):
        if i % 100 == 0:
            print("done", i, "/", total)
        mel = data_loader.get_mel(audiopaths_and_text_list[i][0])
        torch.save(mel, melpath_and_text[0])
Ejemplo n.º 2
0
 def __init__(self,
              dataset_path,
              audiopaths_and_text,
              text_cleaners,
              n_mel_channels,
              symbol_set='english_basic',
              n_speakers=1,
              load_mel_from_disk=True,
              max_wav_value=None,
              sampling_rate=None,
              filter_length=None,
              hop_length=None,
              win_length=None,
              mel_fmin=None,
              mel_fmax=None,
              **ignored):
     """Load the filelist and, when mels are computed from waveforms,
     set up the STFT module.

     Waveform-related parameters (max_wav_value, sampling_rate, STFT
     sizes) are only consulted when load_mel_from_disk is False.
     """
     multi_speaker = n_speakers > 1
     self.audiopaths_and_text = load_filepaths_and_text(
         dataset_path, audiopaths_and_text, has_speakers=multi_speaker)
     self.load_mel_from_disk = load_mel_from_disk
     if load_mel_from_disk:
         return
     # Mels are extracted on the fly: keep the waveform settings and STFT.
     self.max_wav_value = max_wav_value
     self.sampling_rate = sampling_rate
     self.stft = layers.TacotronSTFT(
         filter_length, hop_length, win_length,
         n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
Ejemplo n.º 3
0
 def __init__(self, dataset_path, audiopaths_and_text, args):
     """Read the filelist, build the STFT from args, and shuffle entries."""
     entries = load_filepaths_and_text(dataset_path, audiopaths_and_text)
     self.audiopaths_and_text = entries
     self.max_wav_value = args.max_wav_value
     self.sampling_rate = args.sampling_rate
     self.stft = layers.TacotronSTFT(
         args.filter_length, args.hop_length, args.win_length,
         args.n_mel_channels, args.sampling_rate, args.mel_fmin,
         args.mel_fmax)
     self.segment_length = args.segment_length
     # Fixed seed keeps the shuffled order reproducible across runs.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
 def __init__(self, dataset_path, audiopaths_and_text, args, load_mel_from_disk=True):
     """Load the filelist; configure waveform/STFT state only when mels
     are computed from audio (load_mel_from_disk=False)."""
     self.audiopaths_and_text = load_filepaths_and_text(
         dataset_path, audiopaths_and_text)
     self.text_cleaners = args.text_cleaners
     self.load_mel_from_disk = load_mel_from_disk
     if load_mel_from_disk:
         return
     # Computing mels on the fly: keep waveform settings and the STFT.
     self.max_wav_value = args.max_wav_value
     self.sampling_rate = args.sampling_rate
     self.stft = layers.TacotronSTFT(
         args.filter_length, args.hop_length, args.win_length,
         args.n_mel_channels, args.sampling_rate, args.mel_fmin,
         args.mel_fmax)
Ejemplo n.º 5
0
 def __init__(self, audiopaths_and_text, args):
     """Load the filelist, copy text/audio settings from args, build the
     STFT, and shuffle the entries deterministically."""
     entries = load_filepaths_and_text(audiopaths_and_text)
     self.audiopaths_and_text = entries
     self.text_cleaners = args.text_cleaners
     self.max_wav_value = args.max_wav_value
     self.sampling_rate = args.sampling_rate
     self.load_mel_from_disk = args.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         args.filter_length, args.hop_length, args.win_length,
         args.n_mel_channels, args.sampling_rate, args.mel_fmin,
         args.mel_fmax)
     # Deterministic shuffle: every run sees the same order.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
def audio2mel(dataset_path: str, audiopaths_and_text: str,
              melpaths_and_text: str, args: ArgumentParser) -> None:
    """Create mel spectrograms on disk from audio files.

    Args:
        dataset_path (str): Path to dataset
        audiopaths_and_text (str): Path to filelist with audio paths and text
        melpaths_and_text (str): Path to filelist with mel paths and text
        args (ArgumentParser): Namespace with arguments
    """
    mel_entries = load_filepaths_and_text(dataset_path, melpaths_and_text)
    audio_entries = load_filepaths_and_text(dataset_path, audiopaths_and_text)
    loader = TextMelLoader(dataset_path, audiopaths_and_text, args)

    n_total = len(mel_entries)
    for idx, mel_entry in enumerate(mel_entries):
        if idx % 100 == 0:
            print("done", idx, "/", n_total)
        # Column 0 of each entry is the file path (audio in, mel out).
        torch.save(loader.get_mel(audio_entries[idx][0]), mel_entry[0])
Ejemplo n.º 7
0
def audio2mel(dataset_path,
              audiopaths_and_text,
              melpaths_and_text,
              args,
              use_intermed=None):
    """Create mel spectrograms on disk for every entry in the filelists.

    Args:
        dataset_path: Path to the dataset root.
        audiopaths_and_text: Filelist with audio paths and text.
        melpaths_and_text: Filelist with output mel paths and text.
        args: Namespace with the STFT/mel configuration for TextMelLoader.
        use_intermed: Unused; kept for backward-compatible interface.
    """
    melpaths_and_text_list = load_filepaths_and_text(
        dataset_path, melpaths_and_text)
    audiopaths_and_text_list = load_filepaths_and_text(
        dataset_path, audiopaths_and_text)

    data_loader = TextMelLoader(dataset_path, audiopaths_and_text, args)

    total = len(melpaths_and_text_list)
    # Entry i of the audio filelist maps to entry i of the mel filelist.
    for i, melpath_and_text in enumerate(melpaths_and_text_list):
        if i % 100 == 0:
            print("done", i, "/", total)
        mel = data_loader.get_mel(audiopaths_and_text_list[i][0])
        torch.save(mel, melpath_and_text[0])
Ejemplo n.º 8
0
 def __init__(self, dataset_path, audiopaths_and_text, segment_length,
              n_mel_channels, max_wav_value, sampling_rate, filter_length,
              hop_length, win_length, mel_fmin, mel_fmax, args):
     """Load the filelist, build the STFT, and shuffle deterministically."""
     self.audiopaths_and_text = load_filepaths_and_text(
         dataset_path, audiopaths_and_text)
     self.max_wav_value = max_wav_value
     self.sampling_rate = sampling_rate
     self.segment_length = segment_length
     self.stft = layers.TacotronSTFT(
         filter_length, hop_length, win_length,
         n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
     # Seeded shuffle gives an identical ordering on every run.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Ejemplo n.º 9
0
    def __init__(self, dataset_path, audiopaths_and_text, args, speaker_ids=None):
        """Load and shuffle the filelist, build the STFT, and resolve
        speaker ids.

        When speaker_ids is None, a lookup table is derived from the
        shuffled filelist via create_speaker_lookup_table.
        """
        self.audiopaths_and_text = load_filepaths_and_text(
            dataset_path, audiopaths_and_text)
        self.text_cleaners = args.text_cleaners
        self.max_wav_value = args.max_wav_value
        self.sampling_rate = args.sampling_rate
        self.load_mel_from_disk = args.load_mel_from_disk
        self.stft = layers.TacotronSTFT(
            args.filter_length, args.hop_length, args.win_length,
            args.n_mel_channels, args.sampling_rate, args.mel_fmin,
            args.mel_fmax)
        # Deterministic shuffle happens before the speaker table is built.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

        if speaker_ids is None:
            speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        self.speaker_ids = speaker_ids
Ejemplo n.º 10
0
    def __init__(
            self,
            dataset_path,
            audiopaths_and_text,
            text_cleaners,
            n_mel_channels,
            symbol_set='english_basic',
            p_arpabet=1.0,
            n_speakers=1,
            load_mel_from_disk=True,
            load_pitch_from_disk=True,
            pitch_mean=214.72203,  # LJSpeech defaults
            pitch_std=65.72038,
            max_wav_value=None,
            sampling_rate=None,
            filter_length=None,
            hop_length=None,
            win_length=None,
            mel_fmin=None,
            mel_fmax=None,
            prepend_space_to_text=False,
            append_space_to_text=False,
            pitch_online_dir=None,
            betabinomial_online_dir=None,
            use_betabinomial_interpolator=True,
            pitch_online_method='pyin',
            **ignored):
        """Load the filelist(s), configure mel/pitch sources, text
        processing, and validate the filelist column layout."""
        # A single filelist path is promoted to a one-element list.
        if type(audiopaths_and_text) is str:
            audiopaths_and_text = [audiopaths_and_text]

        self.dataset_path = dataset_path
        self.audiopaths_and_text = load_filepaths_and_text(
            dataset_path, audiopaths_and_text, has_speakers=(n_speakers > 1))
        self.load_mel_from_disk = load_mel_from_disk
        if not load_mel_from_disk:
            # Mels are computed from waveforms: keep the STFT machinery.
            self.max_wav_value = max_wav_value
            self.sampling_rate = sampling_rate
            self.stft = layers.TacotronSTFT(
                filter_length, hop_length, win_length, n_mel_channels,
                sampling_rate, mel_fmin, mel_fmax)
        self.load_pitch_from_disk = load_pitch_from_disk

        self.prepend_space_to_text = prepend_space_to_text
        self.append_space_to_text = append_space_to_text

        assert p_arpabet == 0.0 or p_arpabet == 1.0, (
            'Only 0.0 and 1.0 p_arpabet is currently supported. '
            'Variable probability breaks caching of betabinomial matrices.')

        self.tp = TextProcessing(symbol_set, text_cleaners,
                                 p_arpabet=p_arpabet)
        self.n_speakers = n_speakers
        self.pitch_tmp_dir = pitch_online_dir
        self.f0_method = pitch_online_method
        self.betabinomial_tmp_dir = betabinomial_online_dir
        self.use_betabinomial_interpolator = use_betabinomial_interpolator
        if use_betabinomial_interpolator:
            self.betabinomial_interpolator = BetaBinomialInterpolator()

        # Columns: <mel_or_wav> and <text>, plus optional pitch and speaker.
        expected_columns = 2 + int(load_pitch_from_disk) + (n_speakers > 1)

        # Pitch from disk and online pitch extraction are mutually exclusive.
        assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None)

        n_columns = len(self.audiopaths_and_text[0])
        if n_columns < expected_columns:
            raise ValueError(
                f'Expected {expected_columns} columns in audiopaths file. '
                'The format is <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]')
        if n_columns > expected_columns:
            print('WARNING: Audiopaths file has more columns than expected')

        def to_tensor(x):
            # Float scalars become 1-element tensors; anything else passes
            # through unchanged (e.g. an already-built tensor).
            return torch.Tensor([x]) if type(x) is float else x

        self.pitch_mean = to_tensor(pitch_mean)
        self.pitch_std = to_tensor(pitch_std)
Ejemplo n.º 11
0
def audio2mel2audio(dataset_path,
                    audiopaths_and_text,
                    melpaths_and_text,
                    args,
                    use_intermed=None):
    """Round-trip audio -> spectrogram -> audio with Griffin-Lim.

    For each filelist entry, loads the wav, computes a magnitude
    spectrogram with torchaudio, inverts it back to a waveform via
    Griffin-Lim, and writes the reconstruction to disk.

    Args:
        dataset_path: Path to the dataset root.
        audiopaths_and_text: Filelist with audio paths and text.
        melpaths_and_text: Filelist with mel paths and text (only its
            length is used, to bound how many entries are processed).
        args: Namespace with filter_length, win_length, hop_length, n_iters.
        use_intermed: Unused; kept for backward-compatible interface.
    """
    melpaths_and_text_list = \
        load_filepaths_and_text(dataset_path, melpaths_and_text)
    audiopaths_and_text_list = \
        load_filepaths_and_text(dataset_path, audiopaths_and_text)

    # Magnitude spectrogram (power=1) and its Griffin-Lim inverse are
    # configured identically so the round trip is consistent.
    spec = T.Spectrogram(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        power=1,
        normalized=True,
    )
    griffin_lim = T.GriffinLim(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        n_iter=args.n_iters,
        power=1,
        normalized=True,
    )

    # TODO(review): hard-coded input/output locations — promote to arguments.
    data_path = "/data/logotypografia_simple/cleaned_wavs/"
    out_dir = "griffin_lim_inv_audio_custom7/"

    # Process as many entries as both filelists cover; only the basename of
    # each audio path is used, relocated under data_path.
    for audio_entry, _ in zip(audiopaths_and_text_list, melpaths_and_text_list):
        basename = audio_entry[0].split("/")[-1]
        audio, sampling_rate = load_wav_to_torch(data_path + basename)

        spectrogram = spec(audio)
        inv_waveform = griffin_lim(spectrogram)

        inv_wav_name = out_dir + basename
        print(f"Saving reconstructed wav with name {inv_wav_name}")
        # Bug fix: write at the file's actual sampling rate instead of a
        # hard-coded 16000, so non-16kHz input is not pitch/speed shifted.
        write(inv_wav_name, sampling_rate, inv_waveform.detach().cpu().numpy())