Example #1
    def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.stft = layers.TacotronSTFT(hparams.filter_length,
                                        hparams.hop_length, hparams.win_length,
                                        hparams.n_mel_channels,
                                        hparams.sampling_rate,
                                        hparams.mel_fmin, hparams.mel_fmax)
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.f0_min = hparams.f0_min
        self.f0_max = hparams.f0_max
        self.harm_thresh = hparams.harm_thresh
        self.p_arpabet = hparams.p_arpabet

        self.cmudict = None
        if hparams.cmudict_path is not None:
            self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

        self.speaker_ids = speaker_ids
        if speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)

        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
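Every example on this page follows the same pattern: build `layers.TacotronSTFT` from seven hyperparameters, then feed it a normalized waveform to get a mel spectrogram. A minimal sketch of that shared pattern, assuming the NVIDIA Tacotron 2 style `layers` module and the `load_wav_to_torch` helper used in the examples; the hyperparameter values and the audio path are illustrative, not taken from any specific example:

import layers  # Tacotron 2 style module providing TacotronSTFT
from utils import load_wav_to_torch  # assumed helper, as used throughout

# Illustrative defaults; real code reads these from hparams.
stft = layers.TacotronSTFT(
    filter_length=1024, hop_length=256, win_length=1024,
    n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)

audio, sampling_rate = load_wav_to_torch('audio/sample.wav')  # placeholder path
assert sampling_rate == stft.sampling_rate
audio_norm = (audio / 32768.0).unsqueeze(0)  # 32768.0 = max_wav_value for 16-bit PCM
mel = stft.mel_spectrogram(audio_norm).squeeze(0)  # shape: (n_mel_channels, n_frames)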
Example #2
    def __init__(self, audiopaths_and_text, hparams, speaker_ids=None, mode='train'):
        if mode == 'train':
            self.audiopaths_and_text = load_filepaths_and_text_train(audiopaths_and_text, split='\t')
            self.mode = True
        else:
            self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text, split='\t')
            self.mode = False
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.f0_min = hparams.f0_min
        self.f0_max = hparams.f0_max
        self.harm_thresh = hparams.harm_thresh
        self.p_arpabet = hparams.p_arpabet

        self.cmudict = None
        if hparams.cmudict_path is not None:
            self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

        self.speaker_ids = speaker_ids

        if self.speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)
Example #3
 def __init__(self, audiopaths_and_text, hparams):
     self.hparams = hparams
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(hparams.seed)
     random.shuffle(self.audiopaths_and_text)
     if hparams.use_cmudict:
         if not os.path.isfile(hparams.cmudict_path):
             raise Exception(
                 'If use_cmudict=True, you must download ' +
                 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                 % hparams.cmudict_path)
         if hparams.p_cmudict == 1.0:
             self._cmudict = cmudict.CMUDict(str(hparams.cmudict_path),
                                             keep_ambiguous=True)
         else:
             self._cmudict = cmudict.CMUDict(str(hparams.cmudict_path),
                                             keep_ambiguous=False)
         print('Loaded CMUDict with %d unambiguous entries' %
               len(self._cmudict))
     else:
         self._cmudict = None
Example #4
    def __init__(self, dataset, experiment, hparams, load_durations):
        self.experiment = experiment
        self.audiopaths_and_text = load_filepaths_and_text(dataset, experiment, hparams)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.hparams = hparams
        self.load_durations = load_durations
        self.durations_dir = os.path.join(experiment.paths["acoustic_features"], "dur")
        if hparams.preprocessing_type == "vocalid":
            # vocalid preprocessing is never on the fly
            self.load_mel_from_disk = True
        else:
            self.stft = layers.TacotronSTFT(
                hparams.filter_length, hparams.hop_length, hparams.win_length,
                hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                hparams.mel_fmax)

        #TODO: will go to preprocessing
        self.textanalyzer = TextAnalyzer(use_phones=hparams.use_phonemes,
                                         g2p_backend=hparams.g2p_backend, language=hparams.language)
        self._phone_cache_dir = os.path.join(experiment.paths["acoustic_features"], "utt")
        self._hparams = hparams
        print(f"Creating new in-memory phone cache")
        self._phoneme_cache = {}
        os.makedirs(self._phone_cache_dir, exist_ok=True)
        # fill phoneme cache first time before multiprocessing clones this data
        for paths in self.audiopaths_and_text:
            self.get_mel_text_pair(paths, dummy_mel=True)
        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
Example #5
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft_80 = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     self.stft_512 = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         512, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Example #6
 def __init__(self, hparams, train_set=True):
     voxceleb1_root = '/hdd/klab/cmtts/data/VoxCeleb1'
     self.audiopaths_and_text = self.get_wav_txt_pairs(voxceleb1_root)
     print('load {} pairs from voxceleb1'.format(
         len(self.audiopaths_and_text)))
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
     train_num = int(0.95 * len(self.audiopaths_and_text))
     if train_set:
         self.audiopaths_and_text = self.audiopaths_and_text[:train_num]
         print('train set using {} pairs'.format(
             len(self.audiopaths_and_text)))
     else:
         self.audiopaths_and_text = self.audiopaths_and_text[train_num:]
         print('val   set using {} pairs'.format(
             len(self.audiopaths_and_text)))
Example #7
def multiprocess_gen_mels(audiopaths_internal):
    # Relies on module-level hparams, np and load_wav_to_torch; layers is
    # imported here so each worker process builds its own TacotronSTFT.
    import layers
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)
    return_string = ""
    total = len(audiopaths_internal)
    for index, path in enumerate(audiopaths_internal):
        try:
            file = path.replace(".npy", ".wav")
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != stft.sampling_rate:
                raise ValueError("{} {} SR doesn't match target {} SR".format(
                    file, sampling_rate, stft.sampling_rate))
            melspec = stft.mel_spectrogram(
                audio.unsqueeze(0)).squeeze(0).cpu().numpy()
            np.save(file.replace('.wav', ''), melspec)
            if not index % 1000:
                print(total - index)
        except Exception as ex:
            return_string += path + " failed to process\nException: " + str(ex) + "\n"
    if not return_string:
        return_string = "No Errors on this process."
    return return_string
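A hedged sketch of how multiprocess_gen_mels might be driven from a pool of workers; the chunking scheme, function name, and process count are illustrative assumptions, not from the source:

import multiprocessing as mp

def gen_mels_parallel(audiopaths, n_procs=4):
    # One interleaved chunk per worker; each call builds its own TacotronSTFT.
    chunks = [audiopaths[i::n_procs] for i in range(n_procs)]
    with mp.Pool(n_procs) as pool:
        for report in pool.map(multiprocess_gen_mels, chunks):
            print(report)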
Example #8
 def __init__(self, audiopaths_and_text, hparams, max_len=40):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
     self._max_len = max_len
     self._epoch = 0
     # self._normalize()
     self._mean = torch.tensor([[-7.0222], [-6.1906], [-5.1736], [-4.2412], [-3.7652], [-3.6533], [-3.6642], [-3.7249], [-3.7714], [-3.7709], [-3.6496], [-3.5707],
             [-3.5742], [-3.6369], [-3.7370], [-3.9888], [-4.1180], [-4.1938], [-4.3030], [-4.4620], [-4.6258], [-4.7973], [-5.0267], [-5.0906],
             [-5.1643], [-5.1518], [-5.2571], [-5.2868], [-5.3991], [-5.4988], [-5.5740], [-5.7033], [-5.7849], [-5.8197], [-5.9224], [-5.8171],
             [-5.7680], [-5.6486], [-5.5940], [-5.5730], [-5.5224], [-5.4793], [-5.5243], [-5.6329], [-5.7697], [-5.8886], [-5.9992], [-6.0405],
             [-6.0295], [-5.9937], [-5.9651], [-5.8888], [-5.8137], [-5.7405], [-5.7429], [-5.8212], [-5.8967], [-5.9552], [-5.9658], [-5.9283],
             [-5.9219], [-5.9360], [-5.9943], [-6.0838], [-6.1482], [-6.2169], [-6.2732], [-6.3252], [-6.4438], [-6.6830], [-6.9697], [-7.1962],
             [-7.3519], [-7.3759], [-7.3302], [-7.1762], [-6.9551], [-6.7458], [-6.6292], [-6.5967]]).float()
     self._std = torch.tensor([[0.9304], [0.7729], [1.0068], [1.5478], [1.8270], [1.7940], [1.6933], [1.7043], [1.8344], [1.8844], [1.8506], [1.7672], [1.7807],
             [1.7977], [1.7882], [1.7599], [1.7680], [1.7909], [1.7831], [1.7588], [1.7445], [1.7822], [1.7940], [1.7761], [1.7961], [1.7989],
             [1.7818], [1.7519], [1.7466], [1.7335], [1.7068], [1.7336], [1.7537], [1.7538], [1.7427], [1.7253], [1.7055], [1.7193], [1.7359],
             [1.7460], [1.7527], [1.7514], [1.7380], [1.7031], [1.6757], [1.6612], [1.6603], [1.6675], [1.7022], [1.7513], [1.7748], [1.7932],
             [1.7957], [1.8250], [1.8481], [1.8137], [1.7564], [1.7130], [1.7024], [1.7243], [1.7348], [1.7485], [1.7810], [1.8169], [1.8318],
             [1.8312], [1.8427], [1.8756], [1.9143], [1.9503], [2.0072], [2.0761], [2.1519], [2.1848], [2.1574], [2.1386], [2.1442], [2.1601],
             [2.1547], [2.1208]]).float()
Example #9
    def __init__(self, audiopaths_and_text, polyphone_dict_file,
                 mask_dict_file, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(hparams.filter_length,
                                        hparams.hop_length, hparams.win_length,
                                        hparams.n_mel_channels,
                                        hparams.sampling_rate,
                                        hparams.mel_fmin, hparams.mel_fmax)

        # with codecs.open(polyphone_dict_file, 'r', 'utf-8') as usernames:
        # self.polyphone_dict = json.load(usernames)
        # with codecs.open(mask_dict_file, 'r', 'utf-8') as usernames:
        #     self.mask_dict = json.load(usernames)
        with codecs.open(hparams.class2idx, 'r', 'utf-8') as usernames:
            self.class2idx = json.load(usernames)
        print("num classes: {}".format(len(self.class2idx)))
        num_classes = len(self.class2idx)
        with codecs.open(hparams.merge_cedict, 'r', 'utf-8') as usernames:
            self.merge_cedict = json.load(usernames)

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
Example #10
def create_mels():
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)

    def save_mel(file):
        audio, sampling_rate = load_wav_to_torch(file)
        if sampling_rate != stft.sampling_rate:
            raise ValueError("{} {} SR doesn't match target {} SR".format(
                file, sampling_rate, stft.sampling_rate))
        audio_norm = audio / hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0).cpu().numpy()
        np.save(file.replace('.wav', ''), melspec)

    import glob
    wavs = glob.glob('/media/cookie/Samsung 860 QVO/ClipperDatasetV2/**/*.wav',
                     recursive=True)
    print(str(len(wavs)) + " files being converted to mels")
    for index, i in tqdm(enumerate(wavs), smoothing=0, total=len(wavs)):
        try:
            save_mel(i)
        except Exception as ex:
            tqdm.write(i + " failed to process\n" + str(ex) + "\n")
    assert 0  # halt the caller here once all mels have been written
Example #11
def _process_utterance(out_dir, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.
  This writes the mel feature to disk and returns a tuple to write
  to the mels.txt file.
  Args:
    out_dir: The directory to write the spectrograms into
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file
  Returns:
    A (mel_path, n_frames, text) tuple to write to mels.txt
  '''

    fid = os.path.splitext(os.path.basename(wav_path))[0]

    # case if mel already exist
    if hparams.mel_data_type == 'numpy':
        mel_path = os.path.join(out_dir, '{}.npy'.format(fid))
        if os.path.isfile(mel_path):
            melspec = torch.from_numpy(np.load(mel_path))
            return (mel_path, melspec.shape[1], text)
    elif hparams.mel_data_type == 'torch':
        mel_path = os.path.join(out_dir, '{}.pt'.format(fid))
        if os.path.isfile(mel_path):
            #melspec = torch.load(mel_path) # pkl is faster than torch here
            with open(mel_path, 'rb') as f:
                melspec = pkl.load(f)
            return (mel_path, melspec.shape[1], text)

    # case if mel has not been generated
    audio, sampling_rate = load_wav_to_torch(wav_path)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{}: {} SR doesn't match target {} SR".format(
            wav_path, sampling_rate, hparams.sampling_rate))
    audio_norm = audio / hparams.max_wav_value  # dim: #samples
    audio_norm = audio_norm.unsqueeze(0)  # dim: 1 X #samples
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    # over-riding win_length/hop_length if win_length_ms/hop_length_ms are specified
    if hasattr(hparams, 'win_length_ms'):
        hparams.win_length = int(hparams.win_length_ms / 1000 *
                                 hparams.sampling_rate)
    if hasattr(hparams, 'hop_length_ms'):
        hparams.hop_length = int(hparams.hop_length_ms / 1000 *
                                 hparams.sampling_rate)
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)

    if hparams.mel_data_type == 'numpy':
        np.save(mel_path, melspec.numpy(), allow_pickle=False)
    elif hparams.mel_data_type == 'torch':
        #torch.save(melspec, mel_path) # pkl is faster than torch here
        with open(mel_path, 'wb') as f:
            pkl.dump(melspec, f, protocol=pkl.HIGHEST_PROTOCOL)

    # Return a tuple describing this training example:
    return (mel_path, melspec.shape[1], text)
Example #12
def _process_utterance(in_dir, out_dir, wav_path, txt_path):
  '''Preprocesses a single utterance audio/text pair.
  This writes the mel feature to disk and returns a tuple to write
  to the mels.txt file.
  Args:
    out_dir: The directory to write the spectrograms into
    wav_path: Path to the audio file containing the speech input
    txt_path: Path to the text file containing the text of speech input
  Returns:
    A (mel_path, n_frames, text) tuple to write to mels.txt
  '''

  # get text
  with open(txt_path, 'r') as f:
    text = f.readline().rstrip()

  # case if mel already exist
  if hparams.mel_data_type == 'numpy':
    mel_path = wav_path.replace('.wav', '.npy')
    mel_path = mel_path.replace(in_dir, out_dir)
    if os.path.isfile(mel_path):
      melspec = torch.from_numpy(np.load(mel_path))
      return (mel_path, melspec.shape[1], text)
  elif hparams.mel_data_type == 'torch':
    mel_path = wav_path.replace('.wav', '.pt')
    mel_path = mel_path.replace(in_dir, out_dir)
    if os.path.isfile(mel_path):
      #melspec = torch.load(mel_path) # pkl is faster than torch here
      with open(mel_path, 'rb') as f:
        melspec = pkl.load(f)
      return (mel_path, melspec.shape[1], text)

  # case if mel has not been generated
  audio, sampling_rate = load_wav_to_torch(wav_path)
  if sampling_rate != hparams.sampling_rate:
    raise ValueError("{}: {} SR doesn't match target {} SR".format(
      wav_path, sampling_rate, hparams.sampling_rate))
  audio_norm = audio / hparams.max_wav_value # dim: #samples
  audio_norm = audio_norm.unsqueeze(0) # dim: 1 X #samples
  audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
  stft = layers.TacotronSTFT(
    hparams.filter_length, hparams.hop_length, hparams.win_length,
    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
    hparams.mel_fmax)
  melspec = stft.mel_spectrogram(audio_norm)
  melspec = torch.squeeze(melspec, 0)

  if hparams.mel_data_type == 'numpy':
    np.save(mel_path, melspec.numpy(), allow_pickle=False)
  elif hparams.mel_data_type == 'torch':
    #torch.save(melspec, mel_path) # pkl is faster than torch here
    with open(mel_path, 'wb') as f:
      pkl.dump(melspec, f, protocol=pkl.HIGHEST_PROTOCOL)

  # Return a tuple describing this training example:
  return (mel_path, melspec.shape[1], text)
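For context, a hedged sketch of the kind of driver that would collect the tuples returned by _process_utterance into mels.txt; the write_metadata name and the (wav_path, txt_path) pairing are assumptions for illustration:

import os

def write_metadata(in_dir, out_dir, wav_txt_pairs):
    # wav_txt_pairs: iterable of (wav_path, txt_path) tuples to preprocess.
    entries = [_process_utterance(in_dir, out_dir, wav, txt)
               for wav, txt in wav_txt_pairs]
    with open(os.path.join(out_dir, 'mels.txt'), 'w', encoding='utf-8') as f:
        for mel_path, n_frames, text in entries:
            f.write('{}|{}|{}\n'.format(mel_path, n_frames, text))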
Example #13
 def __init__(self, audiopaths_and_text, hparams, check_files=True, TBPTT=True, shuffle=False, speaker_ids=None, audio_offset=0, verbose=False):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.speaker_ids = speaker_ids
     self.audio_offset = audio_offset
     self.shuffle = shuffle
     if speaker_ids is None:
         self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)
     
     self.load_torchmoji = hparams.torchMoji_training and hparams.torchMoji_linear
     
     # ---------- CHECK FILES --------------
     self.start_token = hparams.start_token
     self.stop_token = hparams.stop_token
     if check_files:
         self.checkdataset(verbose)
     # -------------- CHECK FILES --------------
     
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     
     self.filter_length = hparams.filter_length
     self.hop_length = hparams.hop_length
     
     # Apply weighting to MLP Datasets
     duplicated_audiopaths = [x for x in self.audiopaths_and_text if "SlicedDialogue" in x[0]]
     for i in range(3):
         self.audiopaths_and_text.extend(duplicated_audiopaths)
     
     # SHUFFLE audiopaths
     random.seed(hparams.seed)
     self.random_seed = hparams.seed
     random.shuffle(self.audiopaths_and_text)
     
     self.batch_size = hparams.batch_size if speaker_ids is None else hparams.val_batch_size
     n_gpus = hparams.n_gpus
     self.rank = hparams.rank
     self.total_batch_size = self.batch_size * n_gpus # number of audio files being processed together
     self.truncated_length = hparams.truncated_length # frames
     
     # -------------- PREDICT LENGTH (TBPTT) --------------
     if hparams.use_TBPTT:
         self.audio_lengths = torch.tensor([self.get_mel(x[0]).shape[1] for x in self.audiopaths_and_text]) # get the length of every file (the long way)
     else:
         self.audio_lengths = torch.tensor([self.truncated_length-1 for x in self.audiopaths_and_text]) # use dummy lengths
     self.update_dataloader_indexes()
Example #14
def get_mel(hparams, filename):

    audio, sampling_rate = load_wav_to_torch(filename)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = layers.TacotronSTFT(hparams).mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    length = melspec.size(1)
    return melspec, length
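Note that this variant passes the whole hparams object to layers.TacotronSTFT instead of the seven positional arguments used elsewhere on this page, so it assumes a fork whose constructor accepts hparams directly. A minimal hedged call site, with a placeholder path:

# hparams would come from the repo's hyperparameter setup; the path is illustrative.
mel, n_frames = get_mel(hparams, 'audio/sample.wav')
print(mel.shape)  # (n_mel_channels, n_frames)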
Example #15
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     # Perform Checks on Dataset
     i = 0
     i_offset = 0
     for i_ in range(len(self.audiopaths_and_text)):
         i = i_ + i_offset
         if i == len(self.audiopaths_and_text): break
         file = self.audiopaths_and_text[i]
         if self.load_mel_from_disk and '.wav' in file[0]:
             print(".wav file", file[0], "\n[warning] in filelist while expecting '.npy'. Being ignored.")
             self.audiopaths_and_text.remove(file)
             i_offset-=1
             continue
         elif not self.load_mel_from_disk and '.npy' in file[0]:
             print(".npy file", file[0], "\n[warning] in filelist while expecting '.wav'. Being ignored.")
             self.audiopaths_and_text.remove(file)
             i_offset-=1
             continue
         if (not os.path.exists(file[0])):
             print("|".join(file), "\n[warning] does not exist and has been ignored")
             self.audiopaths_and_text.remove(file)
             i_offset-=1
             continue
         if not len(file[1]):
             print("|".join(file), "\n[warning] has no text and has been ignored.")
             self.audiopaths_and_text.remove(file)
             i_offset-=1
             continue
         if len(file[1]) < 3:
             print("|".join(file), "\n[info] has no/very little text.")
         if not ((file[1].strip())[-1] in r"!?,.;:␤"):
             print("|".join(file), "\n[info] has no ending punctuation.")
         if self.load_mel_from_disk:
             melspec = torch.from_numpy(np.load(file[0], allow_pickle=True))
             mel_length = melspec.shape[1]
             if mel_length == 0:
                 print("|".join(file), "\n[warning] has 0 duration and has been ignored")
                 self.audiopaths_and_text.remove(file)
                 i_offset-=1
                 continue
     
     # init STFT (not used for load_mel_from_disk)
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Example #16
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
     self.embedding_map = self.load_embedding()
     print('Loaded embeddings for:', list(self.embedding_map.keys()))
Example #17
 def __init__(self, audiopaths_and_text, hparams, shuffle=True):
     self.audiopaths_and_text = load_filepaths_and_text(
         audiopaths_and_text, hparams.sort_by_length)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(1234)
     if shuffle:
         random.shuffle(self.audiopaths_and_text)
Example #18
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     if hparams.ipa_preprocessing:
         convert_to_ipa(self.audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     random.seed(hparams.seed)
     random.shuffle(self.audiopaths_and_text)
Example #19
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.hparams = hparams
        ### A short-time Fourier transform module used to convert the waveform into a mel-spectrogram
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)

        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
Example #20
def get_mel(hparams, path):
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)
    audio, sampling_rate = load_wav_to_torch(path)
    if sampling_rate != stft.sampling_rate:
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            path, sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
Example #21
 def __init__(self, audiopaths_and_text, hparams, shuffle=True):
     self.audiopaths_and_text = load_filepaths_and_text(
         audiopaths_and_text, hparams.sort_by_length)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     random.seed(1234)
     if shuffle:
         random.shuffle(self.audiopaths_and_text)
     self.all_pairs = [self.get_mel_text_pair(self.audiopaths_and_text[index])
                       for index in tqdm.trange(len(self))]
Example #22
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     print(
         f'{len(self.audiopaths_and_text)} samples will take {len(self.audiopaths_and_text)/hparams.batch_size:.1f} steps per epoch at batch size {hparams.batch_size}'
     )
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Example #23
 def __init__(self, lstfile, hparams):
     self.fbs, self.fb_text_dict = load_fbs_and_fb_text_dict(
         lstfile, hparams.lab_path)
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.audio_path = hparams.audio_path
     self.mel_path = hparams.mel_path
     self.MelStd_mel = hparams.MelStd_mel
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.fbs)
Example #24
 def __init__(self, audiopaths_and_text, hparams, warp_set="og"):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.n_speakers = hparams.speaker_num
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     # self.mel_time_warping = hparams.mel_time_warping
     # self.mel_time_length_adjustment = hparams.mel_time_length_adjustment
     # self.mel_time_length_adjustment_double = hparams.mel_time_length_adjustment_double
     # self.mel_time_mask = hparams.mel_time_mask
     # self.mel_freq_mask = hparams.mel_freq_mask
     # self.mel_freq_warping = hparams.mel_freq_warping
     self.value_adjustmet = hparams.value_adjustmet
     self.stft = layers.TacotronSTFT(hparams)
     self.warp_set = warp_set
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Example #25
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_dataset(
         audiopaths_and_text, separator=hparams.data_separator)
     self.speaker_field = hparams.speaker_field
     self.audio_field = hparams.audio_field
     self.text_field = hparams.text_field
     self.speaker_encoder = self.fit_speaker_encoder()
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Example #26
    def __init__(self,
                 audiopaths_and_text,
                 hparams,
                 speaker_ids=None,
                 output_directory=None):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        # add
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.f0_min = hparams.f0_min
        self.f0_max = hparams.f0_max
        self.harm_thresh = hparams.harm_thresh
        self.p_arpabet = hparams.p_arpabet

        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(hparams.filter_length,
                                        hparams.hop_length, hparams.win_length,
                                        hparams.n_mel_channels,
                                        hparams.sampling_rate,
                                        hparams.mel_fmin, hparams.mel_fmax)

        self.cmudict = None
        if hparams.cmudict_path is not None:
            self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

        self.speaker_ids = speaker_ids
        if speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)

        # write the speaker lookup table to disk
        if output_directory is not None and self.speaker_ids is not None:
            speaker_id_path = os.path.join(output_directory, 'speaker_ids.txt')

            with open(speaker_id_path, 'w', encoding='utf-8') as f:
                for key, value in self.speaker_ids.items():
                    f.write('{}: {}\n'.format(key, value))

        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
Example #27
    def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(hparams.filter_length,
                                        hparams.hop_length, hparams.win_length,
                                        hparams.n_mel_channels,
                                        hparams.sampling_rate,
                                        hparams.mel_fmin, hparams.mel_fmax)

        self.speaker_ids = speaker_ids
        if speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)

        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
Example #28
 def __init__(self, audiopaths_and_text, hparams, shuffle=True):
     self.audiopaths_and_text = load_filepaths_and_text(
         audiopaths_and_text, hparams.sort_by_length)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.speaker_encoder = layers.SpeakerEncoder(hparams.num_mel)
     self.speaker_encoder.load_model(hparams.se_checkpoint)
     self.speaker_encoder.eval()
     self.hparams = hparams
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(1234)
     if shuffle:
         random.shuffle(self.audiopaths_and_text)
Example #29
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.mel_data_type = hparams.mel_data_type
     # override win_length/hop_length when they are given in milliseconds
     if hasattr(hparams, 'win_length_ms'):
         hparams.win_length = int(hparams.win_length_ms / 1000 *
                                  hparams.sampling_rate)
     if hasattr(hparams, 'hop_length_ms'):
         hparams.hop_length = int(hparams.hop_length_ms / 1000 *
                                  hparams.sampling_rate)
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(hparams.seed)
     random.shuffle(self.audiopaths_and_text)
Example #30
    def __init__(self, hparams, is_train=True, combine_ratio=0.5):

        # audio data: file_path|text
        data_1 = []
        data_2 = []

        file_name_1 = None
        file_name_2 = None

        if is_train:
            file_name_1 = hparams.training_files_1
            file_name_2 = hparams.training_files_2
        else:
            file_name_1 = hparams.validate_files_1
            file_name_2 = hparams.validate_files_2

        data_1 = load_filepaths_and_text(file_name_1)
        data_2 = load_filepaths_and_text(file_name_2)
        shuffle(data_2)

        # take just enough of data_2 that data_1 makes up combine_ratio of the total
        len_data_1 = len(data_1)
        len_data_2 = int((1 - combine_ratio) * len_data_1 / combine_ratio)
        len_data_2 = min(len(data_2), len_data_2)

        data_2 = data_2[:len_data_2]

        self.audiopaths_and_text = data_1
        self.audiopaths_and_text.extend(data_2)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(hparams.filter_length,
                                        hparams.hop_length, hparams.win_length,
                                        hparams.n_mel_channels,
                                        hparams.sampling_rate,
                                        hparams.mel_fmin, hparams.mel_fmax)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
        # print(*self.audiopaths_and_text[:10], sep="\n")
        self.hparams = hparams