Beispiel #1
0
 def __init__(self, training_files, segment_length, filter_length,
              hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
     """
     Collect the audio file list and build the STFT front end.

     Args:
         training_files: Path of a listing file handed to ``files_to_list``.
             When it is not an existing file the file list is empty.
         segment_length: Stored on the instance as ``segment_length``.
         filter_length, hop_length, win_length, mel_fmin, mel_fmax:
             Forwarded unchanged to ``TacotronSTFT``.
         sampling_rate: Stored on the instance and forwarded to
             ``TacotronSTFT``.
     """
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate

     # A missing listing is tolerated: the file list simply stays empty.
     listing_found = os.path.isfile(str(training_files))
     file_list = files_to_list(training_files) if listing_found else []

     # Fixed seed so the shuffled order is identical on every run. Note that
     # this reseeds the module-level ``random`` generator.
     random.seed(1234)
     random.shuffle(file_list)
     self.audio_files = file_list

     stft_options = dict(
         filter_length=filter_length,
         hop_length=hop_length,
         win_length=win_length,
         sampling_rate=sampling_rate,
         mel_fmin=mel_fmin,
         mel_fmax=mel_fmax,
     )
     self.stft = TacotronSTFT(**stft_options)
Beispiel #2
0
class Mel2Samp(torch.utils.data.Dataset):
    """
    Dataset that computes the mel-spectrogram of a fixed-length audio segment
    and returns the (spectrogram, audio) pair.
    """

    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        """
        Collect the audio file list and build the STFT front end.

        Args:
            training_files: Path of a listing file handed to
                ``files_to_list``. When it is not an existing file the
                dataset starts empty.
            segment_length: Number of samples in every returned audio segment.
            filter_length, hop_length, win_length, mel_fmin, mel_fmax:
                Forwarded unchanged to ``TacotronSTFT``.
            sampling_rate: Sample rate every loaded file must report; also
                forwarded to ``TacotronSTFT``.
        """
        if os.path.isfile(str(training_files)):
            self.audio_files = files_to_list(training_files)
        else:
            # A missing listing is tolerated: the dataset is simply empty.
            self.audio_files = []
        # Fixed seed so the shuffled order is identical on every run. This
        # reseeds the module-level ``random`` generator, which ``__getitem__``
        # also draws from when picking a segment start.
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        """
        Compute the mel-spectrogram of ``audio``.

        Args:
            audio: Audio tensor, used as-is (no amplitude rescaling is
                applied here). Assumed to be 1-D -- TODO confirm with callers.

        Returns:
            The result of ``self.stft.mel_spectrogram`` on the audio with a
            leading batch dimension added, with dimension 0 squeezed away
            again when it has size 1.
        """
        # Add a leading batch dimension and cut the tensor out of the autograd
        # graph. ``detach()`` replaces the deprecated
        # ``torch.autograd.Variable(..., requires_grad=False)`` wrapper.
        batch = audio.unsqueeze(0).detach()
        melspec = self.stft.mel_spectrogram(batch)
        return torch.squeeze(melspec, 0)

    def __getitem__(self, index):
        """
        Load one audio file and return a fixed-length training pair.

        Args:
            index: Position in ``self.audio_files``.

        Returns:
            Tuple ``(mel, audio)`` where ``audio`` has exactly
            ``self.segment_length`` entries along dimension 0 and ``mel`` is
            ``self.get_mel(audio)``.

        Raises:
            ValueError: If the file's sampling rate differs from
                ``self.sampling_rate``.
        """
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        n_samples = audio.size(0)
        if n_samples >= self.segment_length:
            # Random crop; ``randint`` is inclusive on both ends, so the last
            # valid start position can be chosen as well.
            audio_start = random.randint(0, n_samples - self.segment_length)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            # Too short: right-pad with zeros up to the segment length.
            # ``detach()`` replaces the legacy ``.data`` accessor.
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - n_samples),
                'constant').detach()

        mel = self.get_mel(audio)

        return (mel, audio)

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.audio_files)
Beispiel #3
0
import unidecode
import yaml
import librosa

from waveglow import inference as waveglow
from melgan import inference as melgan
from mellotron import inference as mellotron
from utils.argutils import locals2dict

from mellotron.layers import TacotronSTFT
from mellotron.hparams import create_hparams

# Use the Griffin-Lim vocoder.
_hparams = create_hparams()
_stft = TacotronSTFT(
    _hparams.filter_length,
    _hparams.hop_length,
    _hparams.win_length,
    _hparams.n_mel_channels,
    _hparams.sampling_rate,
    _hparams.mel_fmin,
    _hparams.mel_fmax,
)

# NOTE(review): presumably switches the waveglow vocoder on/off -- confirm
# against the code that reads this flag.
_use_waveglow = 0

# Use the GPU when torch reports one is available, otherwise the CPU.
if torch.cuda.is_available():
    _device = 'cuda'
else:
    _device = 'cpu'

# Matches runs of whitespace, backslash, slash, colon, asterisk, question
# mark, double quote, angle brackets, pipe and single quote.
filename_formatter_re = re.compile(r'[\s\\/:*?"<>|\']+')


def plot_mel_alignment_gate_audio(mel,
                                  alignment,
                                  gate,
                                  audio,
                                  figsize=(16, 16)):
    """Set up a four-row figure for mel, alignment, gate and audio plots.

    NOTE(review): only the figure setup is visible here; the code that would
    draw ``mel``, ``alignment``, ``gate`` and ``audio`` onto the axes is not
    shown, so the expected types of those arguments are unconfirmed.

    Args:
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # Four rows, one column (presumably one row per input -- confirm).
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    # Flatten the returned axes container for simple linear indexing.
    axes = axes.flatten()
Beispiel #4
0
import torch
import aukit
import tqdm
import requests

from waveglow import inference as waveglow
from mellotron import inference as mellotron
from mellotron.layers import TacotronSTFT
from mellotron.hparams import create_hparams

# Directory containing this module; the resource paths below hang off it.
_home_dir = os.path.dirname(os.path.abspath(__file__))

# Use the Griffin-Lim vocoder.
_hparams = create_hparams()
_stft = TacotronSTFT(
    _hparams.filter_length,
    _hparams.hop_length,
    _hparams.win_length,
    _hparams.n_mel_channels,
    _hparams.sampling_rate,
    _hparams.mel_fmin,
    _hparams.mel_fmax,
)

# NOTE(review): presumably switches the waveglow vocoder on/off -- confirm
# against the code that reads this flag.
_use_waveglow = 0

# Use the GPU when torch reports one is available, otherwise the CPU.
if torch.cuda.is_available():
    _device = 'cuda'
else:
    _device = 'cpu'

# Files located under <home>/resource/model.
_mellotron_path = os.path.join(
    _home_dir, 'resource', 'model', 'mellotron.kuangdd-rtvc.pt')
_waveglow_path = os.path.join(
    _home_dir, 'resource', 'model', 'waveglow.kuangdd.pt')
_ge2e_path = os.path.join(
    _home_dir, 'resource', 'model', 'ge2e.kuangdd.pt')
_mellotron_hparams_path = os.path.join(
    _home_dir, 'resource', 'model', 'mellotron_hparams.json')

# Tar archives located directly under <home>/resource.
_reference_audio_tar_path = os.path.join(
    _home_dir, 'resource', 'reference_audio.tar')
_audio_tar_path = os.path.join(
    _home_dir, 'resource', 'audio.tar')