Example 1
def test_MCD_and_f0():
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio_path = 'kakao/1/1_0001.wav'
    mel_path = 'kakao/1/1_0001.mel.npy'
    srcMel = torch.from_numpy(np.load(mel_path)).unsqueeze(0)
    srcMel = torch.clamp(srcMel, -4.0, 4.0)
    # print(srcMel.shape,  srcMel.max(), srcMel.min())
    audio, sr = load_wav_to_torch(audio_path)
    # print(audio.shape, audio.max(), audio.min())
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)

    # print(audio_norm.shape, audio_norm.max(), audio_norm.min())
    dstMel = stft.mel_spectrogram(audio_norm)
    # print(dstMel.shape, dstMel.max(), dstMel.min())
    # mcc = stft.cepstrum_from_audio(audio_norm)
    # print('mcc', mcc.shape, mcc.max(), mcc.min())

    log_MCD = MCD_from_mels(stft, srcMel, dstMel)
    print(log_MCD.data, 'log')

    sqrtDiffF0 = sqDiffF0_from_mels(stft, srcMel, dstMel)
    print(sqrtDiffF0)
    meanSqrtDiffF0 = torch.mean(sqrtDiffF0)
    print(meanSqrtDiffF0.data, '100hz')
Example 2
def load_data(datapath, glob_file_str, scale=True, data_split=[0.8, 0.1]):
    data = defaultdict(list)
    stft = TacotronSTFT(filter_length=1024,
                        hop_length=160,
                        win_length=1024,
                        sampling_rate=16000,
                        n_mel_channels=64,
                        mel_fmin=0,
                        mel_fmax=None,
                        representation='asrgen')

    for folderpath in sorted(glob.glob(os.path.join(datapath, '*/'))):
        label = os.path.basename(os.path.normpath(folderpath))
        filepaths = glob.glob(
            os.path.join(os.path.join(datapath, label), glob_file_str))
        for filepath in filepaths:
            audio = load_wav_to_torch(filepath, stft.sampling_rate)
            audio_norm = audio / MAX_WAV_VALUE
            audio_norm = audio_norm / torch.max(audio_norm.abs())
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            mel_spec = stft.mel_spectrogram(audio_norm)[0]
            mel_spec -= mel_spec.min()
            mel_spec = mel_spec / torch.max(mel_spec)
            mel_spec = (mel_spec * 2) - 1
            train_end = int(mel_spec.size(1) * data_split[0])
            val_end = int(mel_spec.size(1) * (data_split[0] + data_split[1]))
            data['train'].append([mel_spec[:, :train_end], label])
            data['valid'].append([mel_spec[:, train_end:val_end], label])
            data['test'].append([mel_spec[:, val_end:], label])
    return data
Example 3
class TextMelLoader(torch.utils.data.Dataset):
    """
        1) loads audio,text pairs
        2) normalizes text and converts them to sequences of one-hot vectors
        3) computes mel-spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                                 hparams.win_length, hparams.n_mel_channels,
                                 hparams.sampling_rate, hparams.mel_fmin,
                                 hparams.mel_fmax)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel)

    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                raise ValueError("{} {} SR doesn't match target {} SR".format(
                    sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

        return melspec

    def get_text(self, text):
        text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
        return text_norm

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
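
A minimal usage sketch for the loader above (the filelist path, batch size, and the TextMelCollate padding helper are assumptions for illustration, not taken from the example):

from torch.utils.data import DataLoader

hparams = create_hparams()
trainset = TextMelLoader('filelists/train_filelist.txt', hparams)
# Variable-length (text, mel) pairs need a padding collate function;
# Tacotron-style codebases typically provide one such as TextMelCollate.
train_loader = DataLoader(trainset, batch_size=hparams.batch_size,
                          shuffle=True, num_workers=1,
                          collate_fn=TextMelCollate(1))
for batch in train_loader:
    pass  # forward pass / loss computation goes here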
Example 4
def get_mel(filename, hparams):
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio = load_wav_to_torch(filename, hparams.sampling_rate)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
Example 5
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
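
A rough sketch of how this dataset might be consumed (the filelist path and every hyperparameter value below are illustrative assumptions); because each item is cut or padded to the same segment_length, the default collate can batch the (mel, audio) pairs directly:

from torch.utils.data import DataLoader

dataset = Mel2Samp('train_files.txt', segment_length=16000,
                   filter_length=1024, hop_length=256, win_length=1024,
                   sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
for mel, audio in loader:
    pass  # mel: (batch, n_mel_channels, frames), audio: (batch, segment_length)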
Example 6
def load_mel(path):
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cpu()
    return melspec
Example 7
class Synthesizer(object):
    def __init__(self):
        super().__init__()
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 16000
        self.hparams.max_decoder_steps = 600

        self.stft = TacotronSTFT(
            self.hparams.filter_length, self.hparams.hop_length, self.hparams.win_length,
            self.hparams.n_mel_channels, self.hparams.sampling_rate, self.hparams.mel_fmin,
            self.hparams.mel_fmax)

    def load_mel(self, path):
        audio, sampling_rate = load_wav_to_torch(path)
        if sampling_rate != self.hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    # def close(self):
    #     tf.reset_default_graph()
    #     self.sess.close()

    def load(self, checkpoint_path, waveglow_path):
        self.model = load_model(self.hparams)
        self.model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
        _ = self.model.eval()

        self.waveglow = torch.load(waveglow_path)['model']
        self.waveglow.cuda()

        path = './web/static/uploads/koemo_spk_emo_all_test.txt'
        with open(path, encoding='utf-8') as f:
            filepaths_and_text = [line.strip().split("|") for line in f]
        
        base_path = os.path.dirname(checkpoint_path)
        data_path = os.path.basename(checkpoint_path) + '_' + path.rsplit('_', 1)[1].split('.')[0] + '.npz'
        npz_path = os.path.join(base_path, data_path)
        
        if os.path.exists(npz_path):
            d = np.load(npz_path)
            zs = d['zs']
            emotions = d['emotions']
        else:
            emotions = []
            zs = []
            for audio_path, _, _, emotion in tqdm(filepaths_and_text):
                melspec = self.load_mel(audio_path)
                _, _, _, z = self.model.vae_gst(melspec)
                zs.append(z.cpu().data)
                emotions.append(int(emotion))
            emotions = np.array(emotions)  # must be an ndarray, not a list
            zs = torch.cat(zs, dim=0).data.numpy()
            d = {'zs':zs, 'emotions':emotions}
            np.savez(npz_path, **d)

        self.neu = np.mean(zs[emotions==0,:], axis=0)
        self.sad = np.mean(zs[emotions==1,:], axis=0)
        self.ang = np.mean(zs[emotions==2,:], axis=0)
        self.hap = np.mean(zs[emotions==3,:], axis=0)

    def synthesize(self, text, path, condition_on_ref, ref_audio, ratios):
        print(ratios)
        sequence = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
        inputs = self.model.parse_input(sequence)
        transcript_embedded_inputs = self.model.transcript_embedding(inputs).transpose(1,2)
        transcript_outputs = self.model.encoder.inference(transcript_embedded_inputs)
        print(condition_on_ref)

        if condition_on_ref:
            #ref_audio = '/data1/jinhan/KoreanEmotionSpeech/wav/hap/hap_00000001.wav'
            ref_audio_mel = self.load_mel(ref_audio)
            latent_vector, _, _, _ = self.model.vae_gst(ref_audio_mel)
            latent_vector = latent_vector.unsqueeze(1).expand_as(transcript_outputs)
        
        else: # condition on emotion ratio
            latent_vector = ratios[0] * self.neu + ratios[1] * self.sad + \
                        ratios[2] * self.hap + ratios[3] * self.ang
            latent_vector = torch.FloatTensor(latent_vector).cuda()
            latent_vector = self.model.vae_gst.fc3(latent_vector)

        encoder_outputs = transcript_outputs + latent_vector

        decoder_input = self.model.decoder.get_go_frame(encoder_outputs)
        self.model.decoder.initialize_decoder_states(encoder_outputs, mask=None)
        mel_outputs, gate_outputs, alignments = [], [], []

        while True:
            decoder_input = self.model.decoder.prenet(decoder_input)
            mel_output, gate_output, alignment = self.model.decoder.decode(decoder_input)

            mel_outputs += [mel_output]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if torch.sigmoid(gate_output.data) > self.hparams.gate_threshold:
                # print(torch.sigmoid(gate_output.data), gate_output.data)
                break
            if len(mel_outputs) == self.hparams.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.model.decoder.parse_decoder_outputs(
                mel_outputs, gate_outputs, alignments)
        mel_outputs_postnet = self.model.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        # print(mel_outputs_postnet.shape)

        with torch.no_grad():
            synth = self.waveglow.infer(mel_outputs, sigma=0.666)
        
        # return synth[0].data.cpu().numpy()
        # path = add_postfix(path, idx)
        # print(path)
        librosa.output.write_wav(path, synth[0].data.cpu().numpy(), 16000)
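
A hedged usage sketch for the class above; the checkpoint paths, output path, and sample text are assumptions, and the ratios follow the neutral/sad/happy/angry order used in synthesize():

synth = Synthesizer()
synth.load('ckpt/tacotron2_vae.pt', 'ckpt/waveglow.pt')
# Condition on an emotion mixture (neutral, sad, happy, angry) instead of a reference clip.
synth.synthesize(text='안녕하세요.', path='out/sample.wav',
                 condition_on_ref=False, ref_audio=None,
                 ratios=[0.2, 0.0, 0.8, 0.0])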
Example 8
class LoadedMellotron:
    def __init__(self, ckpt, wglw, n_speakers=123):
        print("[Loading Model]")
        self.ckpt = ckpt
        self.hparams = create_hparams()
        self.hparams.n_speakers = n_speakers
        self.stft = TacotronSTFT(self.hparams.filter_length,
                                 self.hparams.hop_length,
                                 self.hparams.win_length,
                                 self.hparams.n_mel_channels,
                                 self.hparams.sampling_rate,
                                 self.hparams.mel_fmin, self.hparams.mel_fmax)
        self.mellotron = load_model(self.hparams).cuda().eval()
        self.waveglow = torch.load(wglw)['model'].cuda().eval()
        self.denoiser = Denoiser(self.waveglow).cuda().eval()
        self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
        self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
        print('[Loaded Model]')

    def load_mel(self, path):
        audio, sampling_rate = librosa.core.load(path,
                                                 sr=self.hparams.sampling_rate)
        audio = torch.from_numpy(audio)
        if sampling_rate != self.hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    def run(
        self,
        audio_path,
        text,
        title,
        speaker_id=0,
    ):
        print("[Running]")
        dataloader = TextMelLoader(audio_path, text, self.hparams, speaker_id)
        datacollate = TextMelCollate(1)

        text_encoded = torch.LongTensor(
            text_to_sequence(text, self.hparams.text_cleaners,
                             self.arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader.get_data()[3][None].cuda()
        mel = self.load_mel(audio_path)
        print(audio_path, text)

        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, y = self.mellotron.parse_batch(datacollate([dataloader.get_data()]))

        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = \
                self.mellotron.forward(x)
            rhythm = rhythm.permute(1, 0, 2)

        s_id = torch.LongTensor([speaker_id]).cuda()
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = self.mellotron.inference_noattention(
                (text_encoded, mel, s_id, pitch_contour, rhythm))
            audio = self.denoiser(
                self.waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.02)[:, 0]
        # plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
        #                 mel_outputs_postnet.data.cpu().numpy()[0],
        #                 pitch_contour.data.cpu().numpy()[0, 0],
        #                 rhythm.data.cpu().numpy()[:, 0].T, f"tests/{title}.png")
        write(f"outputs/{title}",
              rate=self.hparams.sampling_rate,
              data=audio[0].data.cpu().numpy())
        print("[END]")
Example 9
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, audio_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(audio_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        #audio = audio + (torch.rand_like(audio) - 0.5) / MAX_WAV_VALUE   # commenting out because why are we adding noise?
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        while True:
            try:
                # Read audio
                filename = self.audio_files[index]
                audio, sampling_rate = load_wav_to_torch(filename)
                if sampling_rate != self.sampling_rate:
                    raise ValueError("{} SR doesn't match target {} SR".format(
                        sampling_rate, self.sampling_rate))

                # Take segment
                if audio.size(0) >= self.segment_length:
                    max_audio_start = audio.size(0) - self.segment_length
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[audio_start:audio_start+self.segment_length]

                    # if the audio sample has a very small standard deviation, it's probably a bad sample
                    audio_std = audio.std()
                    if audio_std < 1e-4:
                        raise ValueError("Sample low std deviation: {}".format(filename))

                    # try to detect silence with pydub
                    # (AudioSegment slicing is in milliseconds, so convert sample offsets)
                    audio_pydub = AudioSegment.from_wav(filename)
                    start_ms = int(audio_start * 1000 / self.sampling_rate)
                    end_ms = int((audio_start + self.segment_length) * 1000 / self.sampling_rate)
                    audio_slice = audio_pydub[start_ms:end_ms]
                    if silence.detect_silence(audio_slice):
                        raise ValueError("Sample too silent: {}".format(filename))

                else:
                    raise ValueError("Sample too short: {}".format(filename))
                    #audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data
                break
            except Exception as e:
                print(e)
            finally:
                index = randrange(0,len(self.audio_files))


        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)