Example #1
class Data(torch.utils.data.Dataset):
    def __init__(self, filelist_path, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, max_wav_value, p_arpabet,
                 cmudict_path, text_cleaners, speaker_ids=None,
                 use_attn_prior=False, attn_prior_threshold=1e-4,
                 randomize=True, seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.attn_prior_threshold = attn_prior_threshold

        if speaker_ids is None or speaker_ids == '':
            self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=True)

        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)
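These examples all rely on helpers from their surrounding projects (load_filepaths_and_text, TacotronSTFT, cmudict, get_arpabet, _clean_text, text_to_sequence, load_wav_to_torch) and omit their imports. For reference, a minimal sketch of the common imports and the filelist loader the constructors assume; the pipe-delimited "path|text|speaker" line format is an assumption inferred from how the later examples' __getitem__ unpacks each entry:

import json
import os
import random
import re

import numpy as np
import torch


def load_filepaths_and_text(filename, split='|'):
    # each line: <audio path>|<transcript>|<speaker id>
    with open(filename, encoding='utf-8') as f:
        return [line.strip().split(split) for line in f]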
Example #2
class Data(torch.utils.data.Dataset):
    def __init__(self,
                 filelist_path,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 max_wav_value,
                 p_arpabet,
                 cmudict_path,
                 text_cleaners,
                 speaker_ids=None,
                 use_attn_prior=False,
                 attn_prior_threshold=1e-4,
                 randomize=True,
                 keep_ambiguous=False,
                 seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.attn_prior_threshold = attn_prior_threshold
        self.keep_ambiguous = keep_ambiguous

        if speaker_ids is None or speaker_ids == '':
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path,
                                       keep_ambiguous=keep_ambiguous)

        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def compute_attention_prior(self, audiopath, mel_length, text_length):
        attn_prior = beta_binomial_prior_distribution(text_length, mel_length)

        if self.attn_prior_threshold > 0:
            attn_prior = attn_prior.masked_fill(
                attn_prior < self.attn_prior_threshold, 0.0)

        return attn_prior

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        print("Number of speakers :", len(d))
        return d

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_speaker_id(self, speaker_id):
        return torch.LongTensor([self.speaker_ids[int(speaker_id)]])

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([
            get_arpabet(word, self.cmudict)
            if random.random() < self.p_arpabet else word for word in words
        ])
        text_norm = torch.LongTensor(text_to_sequence(text))
        return text_norm

    def __getitem__(self, index):
        # Read audio and text
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        audio, sampling_rate = load_wav_to_torch(audiopath)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        speaker_id = self.get_speaker_id(speaker_id)
        attn_prior = None
        if self.use_attn_prior:
            attn_prior = self.compute_attention_prior(audiopath, mel.shape[1],
                                                      text_encoded.shape[0])

        return (mel, speaker_id, text_encoded, attn_prior)

    def __len__(self):
        return len(self.audiopaths_and_text)
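compute_attention_prior above calls beta_binomial_prior_distribution, which none of these examples define. A plausible sketch, assuming scipy is available; the signature is inferred from the call sites, and the (mel_length, text_length) output shape matches the masked_fill here and the shape checks in the cached variant below:

from scipy.stats import betabinom


def beta_binomial_prior_distribution(phoneme_count, mel_count,
                                     scaling_factor=1.0):
    # One beta-binomial over phoneme positions per mel frame; the mass
    # slides from the start of the text to its end as frames advance.
    x = np.arange(phoneme_count)
    priors = []
    for i in range(1, mel_count + 1):
        a = scaling_factor * i
        b = scaling_factor * (mel_count + 1 - i)
        priors.append(betabinom(phoneme_count - 1, a, b).pmf(x))
    return torch.tensor(np.stack(priors), dtype=torch.float32)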
Example #3
class Data(torch.utils.data.Dataset):
    def __init__(self,
                 filelist_path,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 max_wav_value,
                 p_arpabet,
                 cmudict_path,
                 text_cleaners,
                 speaker_ids=None,
                 use_attn_prior=False,
                 attn_prior_threshold=1e-4,
                 prior_cache_path="",
                 betab_scaling_factor=1.0,
                 randomize=True,
                 keep_ambiguous=False,
                 seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.betab_scaling_factor = betab_scaling_factor
        self.attn_prior_threshold = attn_prior_threshold
        self.keep_ambiguous = keep_ambiguous

        if speaker_ids is None or speaker_ids == '':
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path,
                                       keep_ambiguous=keep_ambiguous)

        # Caching only makes sense for p_arpabet == 1.0; for other values
        # the phonemized text (and hence its length) changes on every run.
        self.prior_cache_path = prior_cache_path
        self.caching_enabled = False
        if (self.prior_cache_path is not None and self.prior_cache_path != ""
                and p_arpabet == 1.0):
            self.caching_enabled = True
        # make sure caching path exists
        if (self.caching_enabled
                and not os.path.exists(self.prior_cache_path)):
            os.makedirs(self.prior_cache_path)

        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)
Example #4
class Data(torch.utils.data.Dataset):
    def __init__(self,
                 filelist_path,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 max_wav_value,
                 p_arpabet,
                 cmudict_path,
                 text_cleaners,
                 speaker_ids=None,
                 use_attn_prior=False,
                 attn_prior_threshold=1e-4,
                 prior_cache_path="",
                 betab_scaling_factor=1.0,
                 randomize=True,
                 keep_ambiguous=False,
                 seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.betab_scaling_factor = betab_scaling_factor
        self.attn_prior_threshold = attn_prior_threshold
        self.keep_ambiguous = keep_ambiguous

        if speaker_ids is None or speaker_ids == '':
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path,
                                       keep_ambiguous=keep_ambiguous)

        # Caching only makes sense for p_arpabet == 1.0; for other values
        # the phonemized text (and hence its length) changes on every run.
        self.prior_cache_path = prior_cache_path
        self.caching_enabled = False
        if (self.prior_cache_path is not None and self.prior_cache_path != ""
                and p_arpabet == 1.0):
            self.caching_enabled = True
        # make sure caching path exists
        if (self.caching_enabled
                and not os.path.exists(self.prior_cache_path)):
            os.makedirs(self.prior_cache_path)

        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def compute_attention_prior(self, audiopath, mel_length, text_length):
        folder_path = audiopath.split('/')[-2]
        filename = os.path.basename(audiopath).split('.')[0]
        prior_path = os.path.join(self.prior_cache_path,
                                  folder_path + "_" + filename)

        prior_path += "_prior.pth"

        prior_loaded = False
        if self.caching_enabled and os.path.exists(prior_path):
            attn_prior = torch.load(prior_path)
            if (attn_prior.shape[1] == text_length
                    and attn_prior.shape[0] == mel_length):
                prior_loaded = True
            else:
                print("Prior size mismatch, recomputing")

        if not prior_loaded:
            attn_prior = beta_binomial_prior_distribution(
                text_length, mel_length, self.betab_scaling_factor)
            if self.caching_enabled:
                torch.save(attn_prior, prior_path)

        if self.attn_prior_threshold > 0:
            attn_prior = attn_prior.masked_fill(
                attn_prior < self.attn_prior_threshold, 0.0)

        return attn_prior

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        print("Number of speakers :", len(d))
        return d

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_speaker_id(self, speaker_id):
        return torch.LongTensor([self.speaker_ids[int(speaker_id)]])

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([
            get_arpabet(word, self.cmudict)
            if random.random() < self.p_arpabet else word for word in words
        ])
        text_norm = torch.LongTensor(text_to_sequence(text))
        return text_norm

    def __getitem__(self, index):
        # Read audio and text
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        #print("PREPROCESSING AUDIO")
        #print(audiopath)
        audio, sampling_rate = load_wav_to_torch(audiopath)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        speaker_id = self.get_speaker_id(speaker_id)
        attn_prior = None
        if self.use_attn_prior:
            attn_prior = self.compute_attention_prior(audiopath, mel.shape[1],
                                                      text_encoded.shape[0])

        return (mel, speaker_id, text_encoded, attn_prior)

    def __len__(self):
        return len(self.audiopaths_and_text)
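A minimal usage sketch for the variants above. The hyperparameter values are illustrative rather than taken from any of these projects, and batch_size=1 sidesteps the padding collate function a real training loop would need for variable-length mels and text:

from torch.utils.data import DataLoader

dataset = Data(filelist_path='filelists/train.txt',
               filter_length=1024, hop_length=256, win_length=1024,
               sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0,
               max_wav_value=32768.0, p_arpabet=1.0,
               cmudict_path='data/cmudict_dictionary',
               text_cleaners=['english_cleaners'],
               use_attn_prior=True)
loader = DataLoader(dataset, batch_size=1)
mel, speaker_id, text_encoded, attn_prior = next(iter(loader))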
Example #5
class Data(torch.utils.data.Dataset):
    def __init__(self,
                 filelist_path,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 max_wav_value,
                 p_arpabet,
                 cmudict_path,
                 text_cleaners,
                 speaker_ids=None,
                 randomize=True,
                 seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=True)
        if speaker_ids is None:
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
        else:
            self.speaker_ids = speaker_ids

        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        print("Number of speakers :", len(d))
        return d

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_speaker_id(self, speaker_id):
        return torch.LongTensor([self.speaker_ids[int(speaker_id)]])

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([
            get_arpabet(word, self.cmudict)
            if random.random() < self.p_arpabet else word for word in words
        ])

        # NOTE: reading config.json, constructing TextEmbedding, and loading
        # the ZaG2P model all run on every get_text call; hoisting this
        # setup into __init__ would avoid repeating it per utterance.
        with open('config.json') as f:
            embeeding_config = json.loads(f.read())["embeeding_config"]
        text_embedding = TextEmbedding(embeeding_config)
        text_norm = text_embedding.text_norm(text)
        from ZaG2P.api import load_model
        g2p_model, viet_dict = load_model()  # loaded but not used below
        text_out = text_embedding.g2s(text_norm)
        sequence = text_embedding.text2seq(text_out)

        text_norm = torch.LongTensor(sequence)
        return text_norm

    def __getitem__(self, index):
        # Read audio and text
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        audio, sampling_rate = load_wav_to_torch(audiopath)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        speaker_id = self.get_speaker_id(speaker_id)
        return (mel, speaker_id, text_encoded)

    def __len__(self):
        return len(self.audiopaths_and_text)
Example #6
class Data(torch.utils.data.Dataset):
    def __init__(self,
                 filelist_path,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 max_wav_value,
                 p_arpabet,
                 cmudict_path,
                 text_cleaners,
                 use_attn_prior=False,
                 attn_prior_threshold=1e-4,
                 randomize=True,
                 keep_ambiguous=False,
                 seed=1234):
        self.max_wav_value = max_wav_value
        self.audiopaths_and_text = load_filepaths_and_text(filelist_path)
        self.use_attn_prior = use_attn_prior
        self.attn_prior_threshold = attn_prior_threshold
        self.keep_ambiguous = keep_ambiguous

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.sampling_rate = sampling_rate
        self.text_cleaners = text_cleaners
        self.p_arpabet = p_arpabet
        self.cmudict = cmudict.CMUDict(cmudict_path,
                                       keep_ambiguous=keep_ambiguous)

        random.seed(seed)
        if randomize:
            random.shuffle(self.audiopaths_and_text)

    def compute_attention_prior(self, audiopath, mel_length, text_length):
        attn_prior = beta_binomial_prior_distribution(text_length, mel_length)

        if self.attn_prior_threshold > 0:
            attn_prior = attn_prior.masked_fill(
                attn_prior < self.attn_prior_threshold, 0.0)

        return attn_prior

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_text(self, text):
        text = _clean_text(text, self.text_cleaners)
        words = re.findall(r'\S*\{.*?\}\S*|\S+', text)
        text = ' '.join([
            get_arpabet(word, self.cmudict)
            if random.random() < self.p_arpabet else word for word in words
        ])
        text_norm = torch.LongTensor(text_to_sequence(text))
        return text_norm

    def get_embeds(self, embeds):
        return torch.from_numpy(np.array(embeds))

    def __getitem__(self, index):
        # Read audio and text
        audiopath, text, speaker_id = self.audiopaths_and_text[index]
        audio, sampling_rate = load_wav_to_torch(audiopath)
        # per-utterance speaker embedding cached beside the wav; the
        # speaker_id unpacked above is unused in this variant
        embeds = np.load(audiopath.replace(".wav", ".npy"))
        # sampling-rate check disabled in this example:
        # if sampling_rate != self.sampling_rate:
        #     raise ValueError("{} SR doesn't match target {} SR, in {}".format(
        #         sampling_rate, self.sampling_rate, audiopath))
        mel = self.get_mel(audio)
        text_encoded = self.get_text(text)
        attn_prior = None
        if self.use_attn_prior:
            attn_prior = self.compute_attention_prior(audiopath, mel.shape[1],
                                                      text_encoded.shape[0])

        return (mel, embeds, text_encoded, attn_prior)

    def __len__(self):
        return len(self.audiopaths_and_text)
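Example #6 replaces the speaker-ID lookup with a precomputed speaker embedding stored beside each wav as a .npy file. Producing those files is outside the example's scope; a hedged sketch, where encoder is a hypothetical callable (for instance a pretrained speaker-verification model) mapping a wav path to a 1-D vector:

def cache_speaker_embedding(wav_path, encoder):
    # encoder is hypothetical: any callable returning a 1-D embedding
    embed = np.asarray(encoder(wav_path), dtype=np.float32)
    np.save(wav_path.replace('.wav', '.npy'), embed)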