import os
import random

import torch

# TacotronSTFT is imported from the Mellotron layers module used elsewhere in
# this repo; files_to_list and load_wav_to_torch are helper functions assumed
# to be defined alongside this class (as in WaveGlow's mel2samp.py).
from mellotron.layers import TacotronSTFT


class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    (spectrogram, audio) pair.
    """

    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        if os.path.isfile(str(training_files)):
            self.audio_files = files_to_list(training_files)
        else:
            self.audio_files = []
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio  # audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        # torch.autograd.Variable is deprecated; a plain tensor with
        # requires_grad=False (the default) behaves identically.
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take a random fixed-length segment; zero-pad clips that are too short.
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)), 'constant').data

        mel = self.get_mel(audio)
        # audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
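# --- Usage sketch (not from the original source): wiring Mel2Samp into a
# DataLoader for vocoder training. The hyperparameter values below are the
# common WaveGlow/Tacotron2 defaults, and 'train_files.txt' is a hypothetical
# file-list path, not one taken from this repo's config.
if __name__ == '__main__':
    dataset = Mel2Samp(training_files='train_files.txt',
                       segment_length=16000,
                       filter_length=1024,
                       hop_length=256,
                       win_length=1024,
                       sampling_rate=22050,
                       mel_fmin=0.0,
                       mel_fmax=8000.0)
    loader = torch.utils.data.DataLoader(dataset, batch_size=4,
                                         shuffle=True, num_workers=2)
    for mel, audio in loader:
        # mel: (batch, n_mel_channels, frames); audio: (batch, segment_length)
        print(mel.shape, audio.shape)
        break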
import re

import torch
import unidecode
import yaml
import librosa
import matplotlib.pyplot as plt

from waveglow import inference as waveglow
from melgan import inference as melgan
from mellotron import inference as mellotron
from utils.argutils import locals2dict
from mellotron.layers import TacotronSTFT
from mellotron.hparams import create_hparams

# Use the Griffin-Lim vocoder.
_hparams = create_hparams()
_stft = TacotronSTFT(
    _hparams.filter_length, _hparams.hop_length, _hparams.win_length,
    _hparams.n_mel_channels, _hparams.sampling_rate,
    _hparams.mel_fmin, _hparams.mel_fmax)

_use_waveglow = 0
_device = 'cuda' if torch.cuda.is_available() else 'cpu'

filename_formatter_re = re.compile(r'[\s\\/:*?"<>|\']+')


def plot_mel_alignment_gate_audio(mel, alignment, gate, audio, figsize=(16, 16)):
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    axes = axes.flatten()
    # The original body is truncated at this point; the following is a minimal
    # sketch of the four panels implied by the arguments, and the exact styling
    # in the original may differ.
    axes[0].imshow(mel, aspect='auto', origin='lower', interpolation='none')
    axes[0].set_title('mel')
    axes[1].imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    axes[1].set_title('alignment')
    axes[2].scatter(range(len(gate)), gate, alpha=0.5, color='red', marker='.', s=1)
    axes[2].set_xlim(0, len(gate))
    axes[2].set_title('gate')
    axes[3].plot(audio)
    axes[3].set_xlim(0, len(audio))
    axes[3].set_title('audio')
    plt.tight_layout()
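# --- Usage sketch (assumption, not from the original source): computing a mel
# spectrogram with the module-level _stft, mirroring Mel2Samp.get_mel. The
# helper name _wav_to_mel is hypothetical.
def _wav_to_mel(wav_path):
    # librosa.load returns float32 samples in [-1, 1], resampled to the
    # model's sampling rate, which is what TacotronSTFT expects.
    wav, _ = librosa.load(wav_path, sr=_hparams.sampling_rate)
    audio = torch.from_numpy(wav).unsqueeze(0)  # (1, samples)
    mel = _stft.mel_spectrogram(audio)          # (1, n_mel_channels, frames)
    return mel.squeeze(0)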
import os

import torch
import aukit
import tqdm
import requests

from waveglow import inference as waveglow
from mellotron import inference as mellotron
from mellotron.layers import TacotronSTFT
from mellotron.hparams import create_hparams

_home_dir = os.path.dirname(os.path.abspath(__file__))

# Use the Griffin-Lim vocoder.
_hparams = create_hparams()
_stft = TacotronSTFT(
    _hparams.filter_length, _hparams.hop_length, _hparams.win_length,
    _hparams.n_mel_channels, _hparams.sampling_rate,
    _hparams.mel_fmin, _hparams.mel_fmax)

_use_waveglow = 0
_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained model and resource paths, resolved relative to this file.
_mellotron_path = os.path.join(_home_dir, 'resource', 'model', 'mellotron.kuangdd-rtvc.pt')
_waveglow_path = os.path.join(_home_dir, 'resource', 'model', 'waveglow.kuangdd.pt')
_ge2e_path = os.path.join(_home_dir, 'resource', 'model', 'ge2e.kuangdd.pt')
_mellotron_hparams_path = os.path.join(_home_dir, 'resource', 'model', 'mellotron_hparams.json')
_reference_audio_tar_path = os.path.join(_home_dir, 'resource', 'reference_audio.tar')
_audio_tar_path = os.path.join(_home_dir, 'resource', 'audio.tar')
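# --- Sketch (assumption, not from the original source): requests and tqdm are
# imported above, which suggests missing resources are downloaded with a
# progress bar. A minimal helper along those lines; the url argument is
# hypothetical and must point at the actual model host.
def _download_if_missing(url, target_path, chunk_size=8192):
    if os.path.isfile(target_path):
        return target_path
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    total = int(resp.headers.get('content-length', 0))
    with open(target_path, 'wb') as f, tqdm.tqdm(total=total, unit='B', unit_scale=True) as bar:
        for chunk in resp.iter_content(chunk_size=chunk_size):
            f.write(chunk)
            bar.update(len(chunk))
    return target_path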