Exemple #1
0
def create_mels(training_filelist, validation_filelist, threads):
    """Convert every unique audio file from the training and validation
    filelists to a mel spectrogram using a pool of worker processes.

    Args:
        training_filelist: path to the training filelist (audio|text rows).
        validation_filelist: path to the validation filelist.
        threads: number of worker processes to use.
    """
    # Fix: removed unused `import glob`.
    audiopaths = []
    # Deduplicate within each filelist; a path present in both filelists may
    # still appear twice (matches the original two-extend behaviour).
    for filelist in (training_filelist, validation_filelist):
        audiopaths.extend(set(x[0] for x in load_filepaths_and_text(filelist)))
    print(str(len(audiopaths)) + " files being converted to mels")
    multiprocess_arr(multiprocess_gen_mels, audiopaths, threads=threads)
Exemple #2
0
def create_mels(training_filelist, validation_filelist, threads):
    """Convert all audio files listed in the training and validation
    filelists to mel spectrograms using a pool of worker processes.

    Duplicate paths in the filelists are kept (no dedup in this variant).

    Args:
        training_filelist: path to the training filelist (audio|text rows).
        validation_filelist: path to the validation filelist.
        threads: number of worker processes to use.
    """
    # Fix: removed unused `import glob` and dead commented-out glob call.
    audiopaths = [x[0] for x in load_filepaths_and_text(training_filelist)]
    audiopaths += [x[0] for x in load_filepaths_and_text(validation_filelist)]
    print(str(len(audiopaths)) + " files being converted to mels")
    multiprocess_arr(multiprocess_gen_mels, audiopaths, threads=threads)
    def __init__(self, audiopaths_and_text, polyphone_dict_file,
                 mask_dict_file, hparams):
        """Load the filelist, STFT front-end, class map and merged CEDict,
        create a Chinese BERT tokenizer, then shuffle deterministically.

        NOTE(review): polyphone_dict_file / mask_dict_file are currently
        unused (their loading code was commented out upstream).
        """
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)

        # Audio/text hyper-parameters copied onto the instance.
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate,
            hparams.mel_fmin, hparams.mel_fmax)

        # Label-string -> integer-id map.
        with codecs.open(hparams.class2idx, 'r', 'utf-8') as fh:
            self.class2idx = json.load(fh)
        print("num classes: {}".format(len(self.class2idx)))
        num_classes = len(self.class2idx)
        # Merged CC-CEDICT pronunciation dictionary.
        with codecs.open(hparams.merge_cedict, 'r', 'utf-8') as fh:
            self.merge_cedict = json.load(fh)

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

        # Seeded shuffle so epochs are reproducible across runs.
        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
Exemple #4
0
    def __init__(self, audiopaths_and_text, hparams, speaker_ids=None, mode='train'):
        """Dataset init.

        Args:
            audiopaths_and_text: path to a tab-separated filelist.
            hparams: hyper-parameter namespace (audio, text and f0 settings).
            speaker_ids: optional precomputed speaker lookup table; built
                from the filelist when None.
            mode: 'train' uses the training filelist loader, anything else
                the plain loader.
        """
        if mode == 'train':
            self.audiopaths_and_text = load_filepaths_and_text_train(audiopaths_and_text, split='\t')
            self.mode = True  # NOTE: boolean flag replaces the 'mode' string
        else:
            self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text, split='\t')
            self.mode = False
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        # Fix: sampling_rate was assigned twice with the same value.
        self.sampling_rate = hparams.sampling_rate
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.f0_min = hparams.f0_min
        self.f0_max = hparams.f0_max
        self.harm_thresh = hparams.harm_thresh
        self.p_arpabet = hparams.p_arpabet

        # CMU pronouncing dictionary is optional.
        self.cmudict = None
        if hparams.cmudict_path is not None:
            self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

        self.speaker_ids = speaker_ids

        if self.speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)
 def __init__(self, audiopaths_and_text, hparams, max_len=40):
     """Dataset init: load (audio, text) pairs, build an STFT front-end, and
     store hard-coded per-channel mel statistics for normalization.

     Args:
         audiopaths_and_text: path to the filelist.
         hparams: hyper-parameter namespace (audio + text settings).
         max_len: maximum length bound stored on the instance.
     """
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     # Fixed-seed shuffle so the order is reproducible across runs.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
     self._max_len = max_len
     # Epoch counter, presumably advanced elsewhere — TODO confirm.
     self._epoch = 0
     # self._normalize()
     # Per-channel mel mean/std, shape (80, 1); presumably measured on the
     # training corpus — TODO confirm provenance.
     self._mean = torch.tensor([[-7.0222], [-6.1906], [-5.1736], [-4.2412], [-3.7652], [-3.6533], [-3.6642], [-3.7249], [-3.7714], [-3.7709], [-3.6496], [-3.5707],
             [-3.5742], [-3.6369], [-3.7370], [-3.9888], [-4.1180], [-4.1938], [-4.3030], [-4.4620], [-4.6258], [-4.7973], [-5.0267], [-5.0906],
             [-5.1643], [-5.1518], [-5.2571], [-5.2868], [-5.3991], [-5.4988], [-5.5740], [-5.7033], [-5.7849], [-5.8197], [-5.9224], [-5.8171],
             [-5.7680], [-5.6486], [-5.5940], [-5.5730], [-5.5224], [-5.4793], [-5.5243], [-5.6329], [-5.7697], [-5.8886], [-5.9992], [-6.0405],
             [-6.0295], [-5.9937], [-5.9651], [-5.8888], [-5.8137], [-5.7405], [-5.7429], [-5.8212], [-5.8967], [-5.9552], [-5.9658], [-5.9283],
             [-5.9219], [-5.9360], [-5.9943], [-6.0838], [-6.1482], [-6.2169], [-6.2732], [-6.3252], [-6.4438], [-6.6830], [-6.9697], [-7.1962],
             [-7.3519], [-7.3759], [-7.3302], [-7.1762], [-6.9551], [-6.7458], [-6.6292], [-6.5967]]).float()
     self._std = torch.tensor([[0.9304], [0.7729], [1.0068], [1.5478], [1.8270], [1.7940], [1.6933], [1.7043], [1.8344], [1.8844], [1.8506], [1.7672], [1.7807],
             [1.7977], [1.7882], [1.7599], [1.7680], [1.7909], [1.7831], [1.7588], [1.7445], [1.7822], [1.7940], [1.7761], [1.7961], [1.7989],
             [1.7818], [1.7519], [1.7466], [1.7335], [1.7068], [1.7336], [1.7537], [1.7538], [1.7427], [1.7253], [1.7055], [1.7193], [1.7359],
             [1.7460], [1.7527], [1.7514], [1.7380], [1.7031], [1.6757], [1.6612], [1.6603], [1.6675], [1.7022], [1.7513], [1.7748], [1.7932],
             [1.7957], [1.8250], [1.8481], [1.8137], [1.7564], [1.7130], [1.7024], [1.7243], [1.7348], [1.7485], [1.7810], [1.8169], [1.8318],
             [1.8312], [1.8427], [1.8756], [1.9143], [1.9503], [2.0072], [2.0761], [2.1519], [2.1848], [2.1574], [2.1386], [2.1442], [2.1601],
             [2.1547], [2.1208]]).float()
Exemple #6
0
def save_checkpoint(model, optimizer, learning_rate, iteration, hparams,
                    best_validation_loss, average_loss, speaker_id_lookup,
                    filepath):
    """Serialize model/optimizer state plus training bookkeeping to `filepath`.

    Also derives a speaker-name -> internal-id map from hparams.speakerlist
    so checkpoints can later be addressed by speaker name.
    """
    from utils import load_filepaths_and_text
    tqdm.write("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))

    # Map speaker name (column 1) -> internal id via external id (column 2),
    # skipping speakers absent from the training lookup table.
    speaker_name_lookup = {}
    for row in load_filepaths_and_text(hparams.speakerlist):
        if row[2] in speaker_id_lookup:
            speaker_name_lookup[row[1]] = speaker_id_lookup[row[2]]

    checkpoint = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
        #'amp': amp.state_dict(),
        'hparams': hparams,
        'speaker_id_lookup': speaker_id_lookup,
        'speaker_name_lookup': speaker_name_lookup,
        'best_validation_loss': best_validation_loss,
        'average_loss': average_loss,
    }
    torch.save(checkpoint, filepath)
    tqdm.write("Saving Complete")
Exemple #7
0
    def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
        """Dataset init.

        Args:
            audiopaths_and_text: path to the filelist to load.
            hparams: hyper-parameter namespace (audio, text and f0 settings).
            speaker_ids: optional precomputed speaker lookup table; built
                from the filelist when None.
        """
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        # Fix: sampling_rate was assigned twice with the same value.
        self.sampling_rate = hparams.sampling_rate
        self.stft = layers.TacotronSTFT(hparams.filter_length,
                                        hparams.hop_length, hparams.win_length,
                                        hparams.n_mel_channels,
                                        hparams.sampling_rate,
                                        hparams.mel_fmin, hparams.mel_fmax)
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.f0_min = hparams.f0_min
        self.f0_max = hparams.f0_max
        self.harm_thresh = hparams.harm_thresh
        self.p_arpabet = hparams.p_arpabet

        # CMU pronouncing dictionary is optional.
        self.cmudict = None
        if hparams.cmudict_path is not None:
            self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

        self.speaker_ids = speaker_ids
        if speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)

        # Fixed-seed shuffle for reproducibility.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
Exemple #8
0
 def __init__(self, audiopaths_and_text, hparams):
     """Dataset init: load filelist, build STFT, shuffle, and optionally
     load the CMU pronouncing dictionary.

     Raises:
         Exception: when use_cmudict is set but the dictionary file is
             missing on disk.
     """
     self.hparams = hparams
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length, hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin, hparams.mel_fmax)
     random.seed(hparams.seed)
     random.shuffle(self.audiopaths_and_text)
     if hparams.use_cmudict:
         # BUG FIX: the original referenced an undefined bare name
         # `cmudict_path` (NameError at runtime); the path lives on hparams.
         if not os.path.isfile(hparams.cmudict_path):
             raise Exception(
                 'If use_cmudict=True, you must download ' +
                 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                 % hparams.cmudict_path)
         # Keep ambiguous entries only when CMUDict is used unconditionally.
         self._cmudict = cmudict.CMUDict(str(hparams.cmudict_path),
                                         keep_ambiguous=(hparams.p_cmudict == 1.0))
         print('Loaded CMUDict with %d unambiguous entries' %
               len(self._cmudict))
     else:
         self._cmudict = None
Exemple #9
0
    def __init__(self, dataset, experiment, hparams, load_durations):
        """Dataset init: load filelist, set up (or skip) the STFT front-end,
        build a text analyzer, warm the phoneme cache, then shuffle.

        Args:
            dataset: filelist source forwarded to load_filepaths_and_text.
            experiment: experiment object providing output paths
                (experiment.paths["acoustic_features"] is read here).
            hparams: hyper-parameter namespace.
            load_durations: whether per-utterance durations are loaded from
                the "dur" directory by other methods.
        """
        self.experiment = experiment
        self.audiopaths_and_text = load_filepaths_and_text(dataset, experiment, hparams)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.hparams = hparams
        self.load_durations = load_durations
        self.durations_dir = os.path.join(experiment.paths["acoustic_features"], "dur")
        if hparams.preprocessing_type == "vocalid":
            # vocalid preprocessing is never on the fly
            self.load_mel_from_disk = True
        else:
            # On-the-fly mel extraction needs an STFT front-end.
            self.stft = layers.TacotronSTFT(
                hparams.filter_length, hparams.hop_length, hparams.win_length,
                hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                hparams.mel_fmax)

        #TODO: will go to preprocessing
        self.textanalyzer = TextAnalyzer(use_phones=hparams.use_phonemes,
                                         g2p_backend=hparams.g2p_backend, language=hparams.language)
        self._phone_cache_dir = os.path.join(experiment.paths["acoustic_features"], "utt")
        self._hparams = hparams
        print(f"Creating new in-memory phone cache")
        self._phoneme_cache = {}
        os.makedirs(self._phone_cache_dir, exist_ok=True)
        # fill phoneme cache first time before multiprocessing clones this data
        # NOTE(review): presumably get_mel_text_pair(dummy_mel=True) populates
        # self._phoneme_cache as a side effect without computing mels — confirm.
        for paths in self.audiopaths_and_text:
            self.get_mel_text_pair(paths, dummy_mel=True)
        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
    def __init__(self, melpaths_and_text, hparams):
        """Load mel-path/text pairs, set up an English BERT tokenizer, and
        shuffle the data deterministically."""
        self.melpaths_and_text = load_filepaths_and_text(melpaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # Seeded shuffle so epochs are reproducible across runs.
        random.seed(hparams.seed)
        random.shuffle(self.melpaths_and_text)
Exemple #11
0
 def __init__(self, audiopaths_and_text, hparams, check_files=True, TBPTT=True, shuffle=False, speaker_ids=None, audio_offset=0, verbose=False):
     """Dataset init: load and validate the filelist, upweight some data,
     shuffle, then precompute per-file lengths for TBPTT batching.

     NOTE(review): the TBPTT parameter is never read in this body — the
     hparams.use_TBPTT flag is used instead; confirm which is authoritative.
     """
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.truncated_length = hparams.truncated_length
     self.batch_size = hparams.batch_size
     self.speaker_ids = speaker_ids
     self.audio_offset = audio_offset
     self.shuffle = shuffle
     if speaker_ids is None:
         self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)

     # torchMoji embeddings are loaded only when both flags are set.
     self.load_torchmoji = hparams.torchMoji_training and hparams.torchMoji_linear

     # ---------- CHECK FILES --------------
     self.start_token = hparams.start_token
     self.stop_token = hparams.stop_token
     if check_files:
         self.checkdataset(verbose)
     # -------------- CHECK FILES --------------

     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)

     self.sampling_rate = hparams.sampling_rate
     self.filter_length = hparams.filter_length
     self.hop_length = hparams.hop_length

     # Apply weighting to MLP Datasets
     # ("SlicedDialogue" entries are duplicated 3 extra times, i.e. 4x weight)
     duplicated_audiopaths = [x for x in self.audiopaths_and_text if "SlicedDialogue" in x[0]]
     for i in range(3):
         self.audiopaths_and_text.extend(duplicated_audiopaths)

     # SHUFFLE audiopaths
     random.seed(hparams.seed)
     self.random_seed = hparams.seed
     random.shuffle(self.audiopaths_and_text)

     # speaker_ids is None implies training; otherwise the validation batch
     # size overrides the one set earlier in this method.
     self.batch_size = hparams.batch_size if speaker_ids is None else hparams.val_batch_size
     n_gpus = hparams.n_gpus
     self.rank = hparams.rank
     self.total_batch_size = self.batch_size * n_gpus # number of audio files being processed together
     self.truncated_length = hparams.truncated_length # frames

     # -------------- PREDICT LENGTH (TBPTT) --------------
     if hparams.use_TBPTT:
         self.audio_lengths = torch.tensor([self.get_mel(x[0]).shape[1] for x in self.audiopaths_and_text]) # get the length of every file (the long way)
     else:
         self.audio_lengths = torch.tensor([self.truncated_length-1 for x in self.audiopaths_and_text]) # use dummy lengths
     self.update_dataloader_indexes()
Exemple #12
0
    def __init__(self, melpaths_and_text, hparams):
        """Load mel/text pairs and the class map, build a Chinese BERT
        tokenizer, then shuffle deterministically."""
        self.melpaths_and_text = load_filepaths_and_text(melpaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        # Label-string -> integer-id map.
        with codecs.open(hparams.class2idx, 'r', 'utf-8') as fin:
            self.class2idx = json.load(fin)
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
        # Seeded shuffle for reproducible epoch order.
        random.seed(hparams.seed)
        random.shuffle(self.melpaths_and_text)
    def __init__(self, hparams, is_train=True, combine_ratio=0.5):
        """Combine two filelists so dataset 1 makes up roughly
        `combine_ratio` of the mix, then shuffle deterministically.

        Args:
            hparams: namespace holding the filelist paths and audio settings.
            is_train: pick training filelists when True, validation otherwise.
            combine_ratio: target fraction contributed by dataset 1.
        """
        # Select the filelist pair for this phase.
        if is_train:
            file_name_1 = hparams.training_files_1
            file_name_2 = hparams.training_files_2
        else:
            file_name_1 = hparams.validate_files_1
            file_name_2 = hparams.validate_files_2

        data_1 = load_filepaths_and_text(file_name_1)
        data_2 = load_filepaths_and_text(file_name_2)
        shuffle(data_2)

        # Cap dataset 2 so data_1 : data_2 ~= ratio : (1 - ratio).
        target_len_2 = int((1 - combine_ratio) * len(data_1) / combine_ratio)
        data_2 = data_2[:min(len(data_2), target_len_2)]

        self.audiopaths_and_text = data_1 + data_2
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate,
            hparams.mel_fmin, hparams.mel_fmax)
        # Fixed-seed shuffle of the combined list.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
        self.hparams = hparams
Exemple #14
0
    def update_tt(self, tacotron_name):
        """Swap the active Tacotron model to `tacotron_name` and refresh the
        speaker lookup tables (optionally from an external speaker file)."""
        model_path = self.conf['TTM']['models'][tacotron_name]['modelpath']
        (self.model, self.ttm_hparams,
         self.ttm_sp_name_lookup,
         self.ttm_sp_id_lookup) = self.load_tacotron2(model_path)
        self.ttm_current = tacotron_name

        # (optional) override: rebuild name -> id from the speaker_ids file.
        if self.conf['TTM']['use_speaker_ids_file_override']:
            lookup = {}
            speaker_file = self.conf['TTM']['speaker_ids_file']
            for _, name, ext_id in load_filepaths_and_text(speaker_file):
                lookup[name] = self.ttm_sp_id_lookup[int(ext_id)]
            self.ttm_sp_name_lookup = lookup
Exemple #15
0
 def __init__(self, audiopaths_and_text, hparams):
     """Load the filelist and STFT front-end, then shuffle the samples."""
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.sampling_rate = hparams.sampling_rate
     # self.max_wav_value = hparams.max_wav_value
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     # Fixed-seed shuffle for reproducibility.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Exemple #16
0
 def __init__(self, audiopaths_and_text, hparams):
     """Dataset init with filelist validation.

     Entries are dropped (with a printed warning) when the file extension
     does not match `load_mel_from_disk`, the file is missing, the text is
     empty, or a stored mel has zero frames.  Informational notices are
     printed for very short text and missing ending punctuation.

     Fix: the original mutated the list while iterating it with a manual
     index-offset plus list.remove() (which removes the first *equal*
     entry, not necessarily the current one); entries are now validated
     into a new list.
     """
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     # Perform Checks on Dataset
     self.audiopaths_and_text = [
         file for file in self.audiopaths_and_text if self._check_entry(file)
     ]

     # init STFT (not used for load_mel_from_disk)
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)

 def _check_entry(self, file):
     """Return True to keep a filelist entry; print the reason and return
     False to drop it.  Check order matches the original loop."""
     if self.load_mel_from_disk and '.wav' in file[0]:
         print(".wav file", file[0], "\n[warning] in filelist while expecting '.npy' . Being Ignored.")
         return False
     if not self.load_mel_from_disk and '.npy' in file[0]:
         print(".npy file", file[0], "\n[warning] in filelist while expecting '.wav' . Being Ignored.")
         return False
     if not os.path.exists(file[0]):
         print("|".join(file), "\n[warning] does not exist and has been ignored")
         return False
     if not len(file[1]):
         print("|".join(file), "\n[warning] has no text and has been ignored.")
         return False
     if len(file[1]) < 3:
         print("|".join(file), "\n[info] has no/very little text.")
     if (file[1].strip())[-1] not in r"!?,.;:␤":
         print("|".join(file), "\n[info] has no ending punctuation.")
     if self.load_mel_from_disk:
         # Zero-frame mels are unusable for training.
         melspec = torch.from_numpy(np.load(file[0], allow_pickle=True))
         if melspec.shape[1] == 0:
             print("|".join(file), "\n[warning] has 0 duration and has been ignored")
             return False
     return True
Exemple #17
0
 def __init__(self, audiopaths_and_text, hparams):
     """Dataset init driven by a dict-style hparams mapping."""
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     # Copy the scalar settings onto the instance.
     for key in ('text_cleaners', 'max_wav_value', 'sampling_rate',
                 'load_mel_from_disk'):
         setattr(self, key, hparams[key])
     self.stft = TacotronSTFT(
         hparams['filter_length'], hparams['hop_length'],
         hparams['win_length'], hparams['n_mel_channels'],
         hparams['sampling_rate'], hparams['mel_fmin'], hparams['mel_fmax'])
     # Shuffle deterministically using the configured seed.
     random.seed(hparams['seed'])
     random.shuffle(self.audiopaths_and_text)
Exemple #18
0
 def __init__(self, audiopaths_and_text, hparams, shuffle=True):
     """Load a (possibly length-sorted) filelist and audio settings.

     Shuffling is optional so validation sets can keep their order.
     """
     self.audiopaths_and_text = load_filepaths_and_text(
         audiopaths_and_text, hparams.sort_by_length)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     # Fixed seed so any shuffle is reproducible.
     random.seed(1234)
     if shuffle:
         random.shuffle(self.audiopaths_and_text)
Exemple #19
0
 def __init__(self, split, hparams):
     """Build the dataset for one split ('train', 'cv' or 'test').

     NOTE(review): the filelist template is read from the module-level `hp`,
     not the `hparams` argument — confirm this is intentional.
     """
     filelist_path = hp.tran_file_format.format(split)  # train, cv & test
     print(filelist_path)
     self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     # Fixed-seed shuffle for reproducibility.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
    def __init__(self, audiopaths, conf, valid=False):
        """Dataset init driven by a nested config dict.

        Args:
            audiopaths: path to the filelist to load.
            conf: full config; audio/batch settings live under conf['data'].
            valid: True when this instance serves the validation split.

        Fixes: collapsed the verbose `x if x else None` and
        `True if ... else False` conditionals, and removed the duplicate
        `self.use_mel` assignment.
        """
        data_conf = conf['data']
        self.is_norm = data_conf['is_norm']
        self.is_valid = valid
        self.data_name = data_conf['data_name']

        # Falsy config values (0, '', None) are normalized to None;
        # `x or None` is equivalent to the original `x if x else None`.
        self.sampling_rate = data_conf['sampling_rate'] or None
        self.n_fft = data_conf['n_fft'] or None
        self.hop_length = data_conf['hop_length'] or None
        self.win_length = data_conf['win_length'] or None
        self.n_mel = data_conf['n_mel'] or None
        self.audio_refdB = data_conf['audio_refdB'] or None
        self.audio_maxdB = data_conf['audio_maxdB'] or None
        self.reduction_factor = data_conf['reduction_factor'] or None
        self.segment_length = data_conf['segment_length'] or None
        self.text_cleaners = data_conf['text_cleaners'] or None

        # Which tensors this dataset yields, from conf['data']['batch'].
        batch = data_conf['batch']
        self.use_audio = 'audio' in batch
        self.use_audio_seg = 'audio_seg' in batch
        self.use_mel_seg = 'mel_seg' in batch
        self.use_coarse_mel = (self.reduction_factor is not None
                               and self.reduction_factor > 1)
        self.use_mel = 'mel' in batch  # was assigned twice; fixed
        self.use_text = 'text' in batch
        self.use_attn_guide = 'attn_guide' in batch
        self.use_attn_mask = 'attn_mask' in batch
        self.use_tvmt = 'tvmt' in batch
        self.use_attn_mask2 = 'attn_mask2' in batch

        self.load_mel_from_disk = conf['load_mel_from_disk']
        self.audiopaths = load_filepaths_and_text(audiopaths)

        # Fixed-seed shuffle for reproducibility.
        random.seed(1234)
        random.shuffle(self.audiopaths)
Exemple #21
0
 def __init__(self, audiopaths_and_text, hparams):
     """Load the filelist (optionally converting transcripts to IPA), copy
     audio settings, build the STFT, then shuffle deterministically."""
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     # Optional in-place IPA conversion of the transcript column.
     if hparams.ipa_preprocessing:
         convert_to_ipa(self.audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length,
                                     hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin,
                                     hparams.mel_fmax)
     # Seeded shuffle for reproducible epoch order.
     random.seed(hparams.seed)
     random.shuffle(self.audiopaths_and_text)
Exemple #22
0
 def __init__(self, audiopaths_and_text, hparams):
     """Load the filelist and audio settings, shuffle, then load the
     speaker embedding map."""
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length,
                                     hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin,
                                     hparams.mel_fmax)
     # Fixed seed so the shuffled order is reproducible.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
     self.embedding_map = self.load_embedding()
     print('Load embedding for:',list(self.embedding_map.keys()))
Exemple #23
0
 def __init__(self, audiopaths_and_text, hparams, return_file_name=None):
     """Dataset init; when `return_file_name` is truthy, items presumably
     also carry their source file name (consumed elsewhere)."""
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.return_file_name = return_file_name
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     # Seeded shuffle for reproducible epoch order.
     random.seed(hparams.seed)
     random.shuffle(self.audiopaths_and_text)
Exemple #24
0
def prepare_mel_meta(hparams, audiopath_and_text):
    """Rewrite a filelist so audio paths point at precomputed mel .npy files.

    Reads `audiopath_and_text` (audio_path|text|speaker_id rows) and writes
    './filelists/metadata_mel10_val.csv' with each audio path replaced by
    '<first 11 chars>/<stem>.npy'.

    Args:
        hparams: unused; kept for signature compatibility.
        audiopath_and_text: path to the input filelist.

    Fixes: csv.writer is created once instead of per row, the
    range(len(...)) index loop is replaced with direct iteration, and the
    useless trailing `pass` is removed.
    """
    audiopath_and_texts = load_filepaths_and_text(audiopath_and_text)

    out_path = os.path.join('./filelists', 'metadata_mel10_val.csv')
    with open(out_path, 'w', encoding='utf-8') as csvfile:
        wr = csv.writer(csvfile, delimiter='|')  # hoisted out of the loop
        for row in audiopath_and_texts:
            audiopath, text, speaker_id = row[0], row[1], row[2]
            # NOTE(review): fixed-offset slicing assumes a layout like
            # 'xxxxxxxxxxx/yyyy.wav' (11-char dir, 4-char extension) — confirm.
            out_dir = audiopath[:11]
            file_name = audiopath[12:-4]
            wr.writerow([os.path.join(out_dir, file_name + '.npy'),
                         text, speaker_id])
Exemple #25
0
 def __init__(self, audiopaths_and_text, hparams, shuffle=True):
     """Load a (possibly length-sorted) filelist, optionally shuffle, and
     eagerly precompute every (text, mel) pair up front."""
     self.audiopaths_and_text = load_filepaths_and_text(
         audiopaths_and_text, hparams.sort_by_length)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(hparams.filter_length,
                                     hparams.hop_length,
                                     hparams.win_length,
                                     hparams.n_mel_channels,
                                     hparams.sampling_rate,
                                     hparams.mel_fmin,
                                     hparams.mel_fmax)
     random.seed(1234)
     if shuffle:
         random.shuffle(self.audiopaths_and_text)
     # Precompute all pairs now (slow startup, fast epochs).
     self.all_pairs = []
     for index in tqdm.trange(self.__len__()):
         self.all_pairs.append(
             self.get_mel_text_pair(self.audiopaths_and_text[index]))
Exemple #26
0
    def __init__(self, audiopaths_and_text, hparams):
        """Load the filelist and audio settings, build the STFT front-end,
        and shuffle deterministically."""
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.hparams = hparams
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        # A short-time Fourier transformer used to convert waveforms into
        # mel-spectrograms.
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        # Seeded shuffle for reproducible epoch order.
        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)
 def __init__(self, audiopaths_and_text, hparams):
     """Load the filelist, report how many steps one epoch will take at the
     configured batch size, and set up audio processing."""
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     print(
         f'samples {len(self.audiopaths_and_text)} will go over {len(self.audiopaths_and_text)/hparams.batch_size} step on batch size {hparams.batch_size}'
     )
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = layers.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     # Fixed-seed shuffle for reproducibility.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Exemple #28
0
 def __init__(
         self,
         audiopaths_and_texts,
         config,
         shuffle=True
     ):
     """Load the filelist, optionally shuffle it, and restore a frozen
     ConvModule used as a ground-truth feature extractor (CPU, eval)."""
     self.audiopaths_and_texts = load_filepaths_and_text(
         audiopaths_and_texts, config['sort_by_length'])
     self.text_cleaners = config['text_cleaners']
     random.seed(1234)
     if shuffle:
         random.shuffle(self.audiopaths_and_texts)
     # Restore pretrained weights onto CPU regardless of save device.
     self.gt_module = ConvModule(config)
     state = torch.load('conv_module.pt',
                        map_location=lambda storage, loc: storage)
     self.gt_module.load_state_dict(state)
     _ = self.gt_module.cpu().eval()
Exemple #29
0
 def __init__(self, audiopaths_and_text, hparams):
     """Dataset init: load filelist, audio settings and optional CMUDict,
     then shuffle deterministically.

     Fix: `self.cmudict` is now always defined.  Previously it was assigned
     only when hparams had a cmudict_path, so later reads could raise
     AttributeError; sibling datasets in this file default it to None.
     """
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.add_noise = hparams.add_noise
     self.add_space = hparams.add_space
     # CMU pronouncing dictionary is optional.
     self.cmudict = None
     if getattr(hparams, "cmudict_path", None) is not None:
       self.cmudict = cmudict.CMUDict(hparams.cmudict_path)
     self.stft = commons.TacotronSTFT(
         hparams.filter_length, hparams.hop_length, hparams.win_length,
         hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
         hparams.mel_fmax)
     # Fixed-seed shuffle for reproducibility.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Exemple #30
0
    def __init__(self, conf):
        """Text-to-speech service init: load the default Tacotron2, WaveGlow
        vocoder, optional torchMoji, speaker lookups, a pronunciation
        dictionary, and the NLTK sentence tokenizer.

        Args:
            conf: nested config dict with 'TTM' (text-to-mel), 'MTW'
                (mel-to-wave) and 'dict_path' sections.
        """
        self.conf = conf

        # load Tacotron2
        self.ttm_current = self.conf['TTM']['default_model']
        assert self.ttm_current in self.conf['TTM']['models'].keys(
        ), "Tacotron default model not found in config models"
        tacotron_path = self.conf['TTM']['models'][self.ttm_current][
            'modelpath']  # get first available Tacotron
        self.tacotron, self.ttm_hparams, self.ttm_sp_name_lookup, self.ttm_sp_id_lookup = self.load_tacotron2(
            tacotron_path)

        # load WaveGlow
        self.MTW_current = self.conf['MTW']['default_model']
        assert self.MTW_current in self.conf['MTW']['models'].keys(
        ), "WaveGlow default model not found in config models"
        vocoder_path = self.conf['MTW']['models'][self.MTW_current][
            'modelpath']  # get first available waveglow
        vocoder_confpath = self.conf['MTW']['models'][
            self.MTW_current]['configpath']
        self.waveglow, self.MTW_denoiser, self.MTW_train_sigma, self.MTW_sp_id_lookup = self.load_waveglow(
            vocoder_path, vocoder_confpath)

        # load torchMoji
        if self.ttm_hparams.torchMoji_linear:  # if Tacotron includes a torchMoji layer
            self.tm_sentence_tokenizer, self.tm_torchmoji = self.load_torchmoji(
            )

        # override since my checkpoints are still missing speaker names
        # (maps speaker name -> internal id via the external id column)
        if self.conf['TTM']['use_speaker_ids_file_override']:
            speaker_ids_fpath = self.conf['TTM']['speaker_ids_file']
            self.ttm_sp_name_lookup = {
                name: self.ttm_sp_id_lookup[int(ext_id)]
                for _, name, ext_id in load_filepaths_and_text(
                    speaker_ids_fpath)
            }

        # load arpabet/pronounciation dictionary
        dict_path = self.conf['dict_path']
        self.load_arpabet_dict(dict_path)

        # download nltk package for splitting text into sentences
        nltk.download('punkt')

        print("T2S Initialized and Ready!")