import json
import os
import random
import re

import numpy as np
import torch
import torch.utils.data

# Project-local imports shared by the Data variants below. The module paths
# are assumptions based on the usual layout of Flowtron-style TTS repos;
# adjust them to match where these helpers actually live.
from audio_processing import TacotronSTFT
from text import text_to_sequence, cmudict, _clean_text, get_arpabet
from utils import (load_filepaths_and_text, load_wav_to_torch,
                   beta_binomial_prior_distribution)


class Data(torch.utils.data.Dataset):
    def __init__(self, filelist_path, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, max_wav_value, p_arpabet,
                 cmudict_path, text_cleaners, speaker_ids=None,
                 use_attn_prior=False, attn_prior_threshold=1e-4,
                 randomize=True, keep_ambiguous=False, seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.attn_prior_threshold = attn_prior_threshold
        self.keep_ambiguous = keep_ambiguous
        # Build the speaker lookup table from the filelist unless an explicit
        # mapping was supplied.
        if speaker_ids is None or speaker_ids == '':
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path,
                                       keep_ambiguous=keep_ambiguous)
        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def compute_attention_prior(self, audiopath, mel_length, text_length):
        attn_prior = beta_binomial_prior_distribution(text_length, mel_length)
        if self.attn_prior_threshold > 0:
            # Zero out vanishingly small prior values.
            attn_prior = attn_prior.masked_fill(
                attn_prior < self.attn_prior_threshold, 0.0)
        return attn_prior

    def create_speaker_lookup_table(self, audiopaths_and_text):
        # Map the raw speaker ids found in the filelist to contiguous
        # indices 0..N-1.
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        print("Number of speakers:", len(d))
        return d

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_speaker_id(self, speaker_id):
        return torch.LongTensor([self.speaker_ids[int(speaker_id)]])

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        # Keep {ARPAbet} annotations attached to their word, then swap each
        # word for its ARPAbet form with probability p_arpabet.
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([get_arpabet(word, self.cmudict)
                         if random.random() < self.p_arpabet else word
                         for word in words])
        text_norm = torch.LongTensor(text_to_sequence(text))
        return text_norm

    def __getitem__(self, index):
        # Read audio and text
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        audio, sampling_rate = load_wav_to_torch(audiopath)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        speaker_id = self.get_speaker_id(speaker_id)
        attn_prior = None
        if self.use_attn_prior:
            attn_prior = self.compute_attention_prior(
                audiopath, mel.shape[1], text_encoded.shape[0])
        return (mel, speaker_id, text_encoded, attn_prior)

    def __len__(self):
        return len(self.audiopaths_and_text)
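# A minimal usage sketch for the class above; the filelist path, STFT
# settings, dictionary path, and cleaner name are illustrative assumptions,
# not values from this repo. Items have variable lengths (and attn_prior may
# be None), so a custom collate function is needed before wiring this into a
# torch DataLoader.
dataset = Data(
    filelist_path="filelists/train_filelist.txt",   # assumed path
    filter_length=1024, hop_length=256, win_length=1024,
    sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0,
    max_wav_value=32768.0, p_arpabet=0.5,
    cmudict_path="data/cmudict_dictionary",         # assumed path
    text_cleaners=["flowtron_cleaners"],            # assumed cleaner name
    use_attn_prior=True)
mel, speaker_id, text_encoded, attn_prior = dataset[0]
print(mel.shape, speaker_id, text_encoded.shape)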
# Variant with on-disk caching of the beta-binomial attention prior.
class Data(torch.utils.data.Dataset):
    def __init__(self, filelist_path, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, max_wav_value, p_arpabet,
                 cmudict_path, text_cleaners, speaker_ids=None,
                 use_attn_prior=False, attn_prior_threshold=1e-4,
                 prior_cache_path="", betab_scaling_factor=1.0,
                 randomize=True, keep_ambiguous=False, seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.betab_scaling_factor = betab_scaling_factor
        self.attn_prior_threshold = attn_prior_threshold
        self.keep_ambiguous = keep_ambiguous
        if speaker_ids is None or speaker_ids == '':
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path,
                                       keep_ambiguous=keep_ambiguous)
        # Caching only makes sense for p_arpabet == 1.0; for any other value
        # the phonemization is stochastic, so text lengths change between
        # epochs and cached priors would be stale.
        self.prior_cache_path = prior_cache_path
        self.caching_enabled = False
        if (self.prior_cache_path is not None and
                self.prior_cache_path != "" and p_arpabet == 1.0):
            self.caching_enabled = True
        # Make sure the cache directory exists.
        if (self.caching_enabled and
                not os.path.exists(self.prior_cache_path)):
            os.makedirs(self.prior_cache_path)
        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def compute_attention_prior(self, audiopath, mel_length, text_length):
        # Cache key: "<parent folder>_<file stem>_prior.pth".
        folder_path = audiopath.split('/')[-2]
        filename = os.path.basename(audiopath).split('.')[0]
        prior_path = os.path.join(self.prior_cache_path,
                                  folder_path + "_" + filename)
        prior_path += "_prior.pth"

        prior_loaded = False
        if self.caching_enabled and os.path.exists(prior_path):
            attn_prior = torch.load(prior_path)
            # Only trust the cached prior if its shape still matches.
            if (attn_prior.shape[1] == text_length and
                    attn_prior.shape[0] == mel_length):
                prior_loaded = True
            else:
                print("Prior size mismatch, recomputing")

        if not prior_loaded:
            attn_prior = beta_binomial_prior_distribution(
                text_length, mel_length, self.betab_scaling_factor)
            if self.caching_enabled:
                torch.save(attn_prior, prior_path)

        if self.attn_prior_threshold > 0:
            attn_prior = attn_prior.masked_fill(
                attn_prior < self.attn_prior_threshold, 0.0)
        return attn_prior

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        print("Number of speakers:", len(d))
        return d

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_speaker_id(self, speaker_id):
        return torch.LongTensor([self.speaker_ids[int(speaker_id)]])

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([get_arpabet(word, self.cmudict)
                         if random.random() < self.p_arpabet else word
                         for word in words])
        text_norm = torch.LongTensor(text_to_sequence(text))
        return text_norm

    def __getitem__(self, index):
        # Read audio and text
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        audio, sampling_rate = load_wav_to_torch(audiopath)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        speaker_id = self.get_speaker_id(speaker_id)
        attn_prior = None
        if self.use_attn_prior:
            attn_prior = self.compute_attention_prior(
                audiopath, mel.shape[1], text_encoded.shape[0])
        return (mel, speaker_id, text_encoded, attn_prior)

    def __len__(self):
        return len(self.audiopaths_and_text)
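# beta_binomial_prior_distribution itself is imported from elsewhere in the
# repo. For reference, a plausible sketch of the beta-binomial alignment
# prior it computes (function name and default scaling_factor here are
# assumptions): row i is the BetaBinomial(text_length - 1, a_i, b_i) pmf over
# text positions, so the result has shape (mel_length, text_length), matching
# the shape checks in compute_attention_prior above.
from scipy.stats import betabinom

def beta_binomial_prior_sketch(text_length, mel_length, scaling_factor=1.0):
    x = np.arange(text_length)
    rows = []
    for i in range(1, mel_length + 1):
        a = scaling_factor * i
        b = scaling_factor * (mel_length + 1 - i)
        rows.append(betabinom(text_length - 1, a, b).pmf(x))
    return torch.tensor(np.array(rows), dtype=torch.float32)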
# Variant for Vietnamese text: encoding goes through a TextEmbedding pipeline
# and the ZaG2P grapheme-to-phoneme model instead of text_to_sequence.
# TextEmbedding is assumed to come from a project-local module.
class Data(torch.utils.data.Dataset):
    def __init__(self, filelist_path, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, max_wav_value, p_arpabet,
                 cmudict_path, text_cleaners, speaker_ids=None,
                 randomize=True, seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=True)
        if speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids
        # Read the embedding config and load the ZaG2P model and Vietnamese
        # dictionary once here, rather than on every get_text call.
        with open('config.json') as f:
            # Key spelling ("embeeding_config") follows the config file.
            embedding_config = json.loads(f.read())["embeeding_config"]
        self.text_embedding = TextEmbedding(embedding_config)
        from ZaG2P.api import load_model  # lazy import: only this variant needs it
        self.g2p_model, self.viet_dict = load_model()
        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        print("Number of speakers:", len(d))
        return d

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_speaker_id(self, speaker_id):
        return torch.LongTensor([self.speaker_ids[int(speaker_id)]])

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([get_arpabet(word, self.cmudict)
                         if random.random() < self.p_arpabet else word
                         for word in words])
        # Normalize the text, run g2s conversion, then encode the result as
        # an integer sequence.
        text_norm = self.text_embedding.text_norm(text)
        text_out = self.text_embedding.g2s(text_norm)
        sequence = self.text_embedding.text2seq(text_out)
        return torch.LongTensor(sequence)

    def __getitem__(self, index):
        # Read audio and text
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        audio, sampling_rate = load_wav_to_torch(audiopath)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        speaker_id = self.get_speaker_id(speaker_id)
        return (mel, speaker_id, text_encoded)

    def __len__(self):
        return len(self.audiopaths_and_text)
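# The variant above reads its text-embedding settings from config.json; the
# expected top-level shape is sketched below, with the (project-specific)
# contents left as a placeholder. Note the key spelling matches the code:
#
#     {"embeeding_config": { ... }}
#
# Usage mirrors the English variant, except items are 3-tuples:
#     mel, speaker_id, text_encoded = dataset[0]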
# Variant that conditions on precomputed speaker embeddings (.npy files next
# to each wav) instead of integer speaker ids.
class Data(torch.utils.data.Dataset):
    def __init__(self, filelist_path, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, max_wav_value, p_arpabet,
                 cmudict_path, text_cleaners, use_attn_prior=False,
                 attn_prior_threshold=1e-4, randomize=True,
                 keep_ambiguous=False, seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.attn_prior_threshold = attn_prior_threshold
        self.keep_ambiguous = keep_ambiguous
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path,
                                       keep_ambiguous=keep_ambiguous)
        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def compute_attention_prior(self, audiopath, mel_length, text_length):
        attn_prior = beta_binomial_prior_distribution(text_length, mel_length)
        if self.attn_prior_threshold > 0:
            attn_prior = attn_prior.masked_fill(
                attn_prior < self.attn_prior_threshold, 0.0)
        return attn_prior

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([get_arpabet(word, self.cmudict)
                         if random.random() < self.p_arpabet else word
                         for word in words])
        text_norm = torch.LongTensor(text_to_sequence(text))
        return text_norm

    def get_embeds(self, embeds):
        # Convert a loaded speaker embedding to a tensor.
        return torch.from_numpy(np.array(embeds))

    def __getitem__(self, index):
        # Read audio, text, and the precomputed speaker embedding; the
        # speaker id column of the filelist is ignored in this variant.
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        audio, sampling_rate = load_wav_to_torch(audiopath)
        embeds = np.load(audiopath.replace(".wav", ".npy"))
        # Sample-rate check disabled in this variant:
        # if sampling_rate != self.sampling_rate:
        #     raise ValueError("{} SR doesn't match target {} SR, in {}".format(
        #         sampling_rate, self.sampling_rate, audiopath))
        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        attn_prior = None
        if self.use_attn_prior:
            attn_prior = self.compute_attention_prior(
                audiopath, mel.shape[1], text_encoded.shape[0])
        return (mel, embeds, text_encoded, attn_prior)

    def __len__(self):
        return len(self.audiopaths_and_text)
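# The variant above expects a speaker embedding cached as a .npy file next to
# each wav. A sketch of producing that layout offline, where
# compute_speaker_embedding is a hypothetical stand-in for whatever speaker
# encoder the project actually uses:
def cache_speaker_embeddings(filelist_path, compute_speaker_embedding):
    for audiopath, _text, _speaker_id in load_filepaths_and_text(filelist_path):
        embed = compute_speaker_embedding(audiopath)  # hypothetical encoder
        np.save(audiopath.replace(".wav", ".npy"), np.asarray(embed))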