Example #1
 def get_mel(self, filename):
     if not self.load_mel_from_disk:
         audio, sampling_rate, max_value = load_wav_to_torch(filename)
         if self.audio_offset:  # used for extreme GTA'ing
             audio = audio[self.audio_offset:]
         self.max_wav_value = max(
             max_value,
             audio.max().item(), -audio.min().item()
         )  # I'm not sure how, but sometimes the magnitude of audio exceeds the max of the datatype used before casting.
         if sampling_rate != self.stft.sampling_rate:
             raise ValueError("{} {} SR doesn't match target {} SR".format(
                 filename, sampling_rate, self.stft.sampling_rate))
         audio_norm = audio / self.max_wav_value
         audio_norm = audio_norm.unsqueeze(0)
         audio_norm = torch.autograd.Variable(audio_norm,
                                              requires_grad=False)
         melspec = self.stft.mel_spectrogram(audio_norm)
         melspec = torch.squeeze(melspec, 0)
     else:
         melspec = torch.from_numpy(np.load(filename,
                                            allow_pickle=True)).float()
         assert melspec.size(0) == self.stft.n_mel_channels, (
             'Mel dimension mismatch: given {}, expected {}'.format(
                 melspec.size(0), self.stft.n_mel_channels))
     return melspec
Example #2
 def save_mel(file):
     audio, sampling_rate = load_wav_to_torch(file)
     if sampling_rate != stft.sampling_rate:
         raise ValueError("{} {} SR doesn't match target {} SR".format(
             file, sampling_rate, stft.sampling_rate))
     melspec = stft.mel_spectrogram(
         audio.unsqueeze(0)).squeeze(0).cpu().numpy()
     np.save(file.replace('.wav', ''), melspec)
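A hedged driver for the helper above: `TacotronSTFT`, its import path, and the hyperparameter values are assumptions standing in for whatever `stft` object the snippet expects, not part of the original example.

# Hypothetical usage sketch: cache a mel .npy next to every wav in a folder.
import glob
from layers import TacotronSTFT  # assumed import path for the STFT wrapper

stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                    n_mel_channels=80, sampling_rate=22050,
                    mel_fmin=0.0, mel_fmax=8000.0)  # assumed hyperparameters

for wav_path in glob.glob('wavs/**/*.wav', recursive=True):
    save_mel(wav_path)  # np.save appends '.npy' to the stripped filename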
Example #3
def check_file_lengths(sampling_rate, segment_size, training_files_old):
    segment_size_s = segment_size/sampling_rate
    training_files_new = []
    for file in tqdm(training_files_old):
        audio, native_sr = load_wav_to_torch(file, min_sr=sampling_rate*0.9, target_sr=None, return_empty_on_exception=True)
        audio_s = len(audio) / native_sr
        if audio_s > segment_size_s and audio_s < 15.0:
            training_files_new.append(file)
    return training_files_new
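A hedged usage sketch for the filter above; the filelist path, the pipe-delimited line format, and the hyperparameter values are assumptions for illustration only.

# Hypothetical usage: keep only clips longer than one training segment
# and shorter than 15 s before building the dataset.
with open('filelists/train.txt', encoding='utf-8') as f:
    files = [line.split('|')[0].strip() for line in f if line.strip()]

files = check_file_lengths(sampling_rate=22050, segment_size=8192,
                           training_files_old=files)
print(len(files), 'files kept')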
Example #4
 def save_mel(file):
     audio, sampling_rate = load_wav_to_torch(file)
     if sampling_rate != stft.sampling_rate:
         raise ValueError("{} {} SR doesn't match target {} SR".format(
             file, sampling_rate, stft.sampling_rate))
     audio_norm = audio / hparams.max_wav_value
     audio_norm = audio_norm.unsqueeze(0)
     audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
     melspec = stft.mel_spectrogram(audio_norm)
     melspec = torch.squeeze(melspec, 0).cpu().numpy()
     np.save(file.replace('.wav', ''), melspec)
Example #5
 def __getitem__(self, index):
     audiopath = self.audio_files[index]
     
     # load audio
     if not self.fine_tuning:
         audio, sampling_rate = load_wav_to_torch(audiopath, target_sr=self.sampling_rate)
         audio = audio - audio.mean()# remove DC offset
         audio = (audio / audio.abs().max()) * 0.95# and normalize volume        
         if self.trim_non_voiced:# trim out non-voiced segments
             assert len(audio.shape) == 1# [T]
             f0, voiced = self.get_pitch(audio)
             start_indx, end_indx = get_nonzero_indexes(voiced)
             audio = audio[start_indx*self.hop_size:end_indx*self.hop_size]
     else:
         pm_audio_path = os.path.splitext(audiopath)[0]+'.pm_audio.pt'# predicted mel audio
         audio = torch.load(pm_audio_path).float()# [T]
     
     audio = audio.unsqueeze(0)# [T] -> [1, T]
     if not self.fine_tuning:
         if self.split:
             if audio.size(1) >= self.segment_size:
                 max_audio_start = audio.size(1) - self.segment_size
                 audio_start = random.randint(0, max_audio_start)
                 audio = audio[:, audio_start:audio_start+self.segment_size]
             else:
                 audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
         
         gt_mel = self.STFT.get_mel(audio)
         mel = gt_mel# 'mel' is the input to the vocoder, gt_mel is the original mel that will be used as a target by the model.
     else:
         pred_mel_path = os.path.splitext(audiopath)[0]+'.pred_mel.pt'
         mel = torch.load(pred_mel_path).float()
         if len(mel.shape) == 2:
             mel = mel.unsqueeze(0)# [n_mel, mel_T] -> [1, n_mel, mel_T]
         
         if self.split:
             frames_per_seg = math.ceil(self.segment_size / self.hop_size)
             
             if audio.size(1) >= self.segment_size:
                 mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                 mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                 audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
             else:
                 mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                 audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
         gt_mel = self.STFT.get_mel(audio)
         min_mel_len = min(gt_mel.shape[-1], mel.shape[-1])
         mel    = mel[:, :, :min_mel_len]
         gt_mel = gt_mel[:, :, :min_mel_len]
         mel = DTW(mel, gt_mel, 5, 3)
     
     return (mel.squeeze(), audio.squeeze(0), audiopath, gt_mel.squeeze())
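When `split` is enabled, every item from the `__getitem__` above is cropped or padded to `segment_size`, so the default collate can batch items directly. A hedged sketch; `MelAudioDataset` and the loader settings are placeholders, not names from the original.

# Hypothetical usage sketch around the __getitem__ above.
from torch.utils.data import DataLoader

dataset = MelAudioDataset(...)  # placeholder for the Dataset class owning __getitem__
loader = DataLoader(dataset, batch_size=16, shuffle=True,
                    num_workers=4, pin_memory=True, drop_last=True)

for mel, audio, audiopath, gt_mel in loader:
    pass  # mel/gt_mel: [B, n_mel, mel_T], audio: [B, segment_size]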
Example #6
 def get_embed_from_path(self, audiopath):
     audio, sr = load_wav_to_torch(audiopath, target_sr=16000)
     spec = get_spect(audio).float().unsqueeze(0)# [1, mel_T, n_mel]
     spec = spec.to(next(self.parameters()))# match the model's device/dtype
     if spec.shape[1] % 128:# pad mel_T up to a multiple of 128 frames
         spec = torch.nn.functional.pad(spec, (0, 0, 0, 128 - spec.shape[1] % 128))
     embeds = []
     for i in range(0, spec.shape[1], 128):
         embed = self(spec[:, i:i+128])# [1, 128, 80] -> [1, embed]
         embeds.append(embed)
         if i > 5:# NOTE: only the first two 128-frame chunks are ever embedded
             break
     embeds = torch.mean(torch.cat(embeds, dim=0), dim=0)# [n_chunks, embed] -> [embed]
     return embeds.cpu().float()# [embed]
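A hedged usage note for the embedding helper above: comparing two utterances via cosine similarity of their embeddings. `encoder` and the file paths are placeholders for the module that defines `get_embed_from_path`.

# Hypothetical usage: speaker similarity from two wav files.
import torch.nn.functional as F

emb_a = encoder.get_embed_from_path('clips/speaker_a.wav')  # [embed]
emb_b = encoder.get_embed_from_path('clips/speaker_b.wav')  # [embed]
similarity = F.cosine_similarity(emb_a, emb_b, dim=0).item()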
Example #7
    def get_audio_text_pair(self, index):
        audiopath, _, speaker_id, _, duration_path, enc_out_path = self.audiopaths_and_text[
            index]
        encoder_outputs = torch.from_numpy(
            np.load(enc_out_path)).float()  # [enc_T, enc_dim]
        durations = torch.from_numpy(np.load(duration_path)).float()  # [enc_T]
        audio, sampling_rate = load_wav_to_torch(audiopath)  # [T]

        max_audio_start = audio.size(0) - self.segment_length
        audio_start = random.randint(
            0, max_audio_start // self.hop_length) * self.hop_length
        audio_segment = audio[audio_start:audio_start + self.segment_length]
        attention_contexts = self.get_contexts(audio_start, encoder_outputs,
                                               durations)  # [dec_T, enc_dim]
        return (audio_segment, attention_contexts, encoder_outputs, durations)
Example #8
 def get_mel(self, filename):
     if not self.load_mel_from_disk:
         audio, sampling_rate = load_wav_to_torch(filename)
         if self.audio_offset:  # used for extreme GTA'ing
             audio = audio[self.audio_offset:]
         if sampling_rate != self.stft.sampling_rate:
             raise ValueError("{} SR doesn't match target {} SR".format(
                 sampling_rate, self.stft.sampling_rate))
         melspec = self.stft.mel_spectrogram(audio.unsqueeze(0)).squeeze(0)
     else:
         melspec = torch.from_numpy(np.load(filename,
                                            allow_pickle=True)).float()
         assert melspec.size(0) == self.stft.n_mel_channels, (
             'Mel dimension mismatch: given {}, expected {}'.format(
                 melspec.size(0), self.stft.n_mel_channels))
     return melspec