class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, n_audio_channel, training_files, segment_length,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        # self.audio_files = files_to_list(training_files)
        # random.seed(1234)
        # random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax,
                                 n_group=n_audio_channel)
        # self.segment_length = segment_length
        # self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        # At this step, audio_norm is the same as the torchaudio.load output in our repo.
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, n_audio_channel, training_files, segment_length,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax,
                                 n_group=n_audio_channel)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
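# Illustrative sketch, not part of the original file: one way Mel2Samp is
# typically driven during training, i.e. wrapped in a torch.utils.data.DataLoader
# that yields (mel, audio) batches. The file-list path, hyperparameter values,
# and batch size below are assumptions chosen only for the example.
if __name__ == "__main__":
    trainset = Mel2Samp(n_audio_channel=128,
                        training_files='train_files.txt',  # assumed file list
                        segment_length=16000,
                        filter_length=1024,
                        hop_length=256,
                        win_length=1024,
                        sampling_rate=22050,
                        mel_fmin=0.0,
                        mel_fmax=8000.0)
    loader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True)
    mel, audio = next(iter(loader))
    # mel: (batch, n_mel, frames), audio: (batch, segment_length)
    print(mel.shape, audio.shape)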
import torch
import numpy as np
from scipy.io.wavfile import read
from timeit import default_timer as timer

from TacotronSTFT import TacotronSTFT

# RTF is the real-time factor: how many seconds of speech are generated in
# one second of wall-clock time.
MAX_WAV_VALUE = 32768.0
n_audio_channel = 128
stft = TacotronSTFT(filter_length=1024,
                    hop_length=256,
                    win_length=1024,
                    sampling_rate=22050,
                    mel_fmin=0.0,
                    mel_fmax=8000.0,
                    n_group=n_audio_channel)


def load_wav_to_torch(full_path):
    """
    Loads wav data into a torch array.
    """
    sampling_rate, data = read(full_path)
    return torch.from_numpy(data).float(), sampling_rate


def get_mel(audio):
    audio_norm = audio / MAX_WAV_VALUE
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
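# Illustrative sketch, not part of the original script: how the imported timer
# can be used to estimate a real-time factor, following the definition above
# (seconds of speech per second of wall-clock time, so values above 1.0 are
# faster than real time). The wav path is a placeholder, and get_mel is timed
# here only as an example workload.
if __name__ == "__main__":
    audio, sr = load_wav_to_torch('example.wav')  # placeholder path
    start = timer()
    mel = get_mel(audio)
    elapsed = timer() - start
    rtf = (audio.size(0) / sr) / elapsed
    print('Elapsed: {:.3f}s, RTF: {:.2f}'.format(elapsed, rtf))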
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, n_audio_channel, path_in, split, segment_length,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax, stride, temp_jitter=False,
                 store_in_ram=False, seed=0, split_utterances=True,
                 pc_split_utterances=0.1, split_speakers=False,
                 pc_split_speakers=0.1, frame_energy_thres=0.025,
                 do_audio_load=True, trim=None, select_speaker=None,
                 select_file=None, verbose=True):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax,
                                 n_group=n_audio_channel)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        # temp_jitter may be True when doing data augmentation in training
        self.path_in = path_in
        self.split = split
        self.lchunk = segment_length
        self.stride = stride
        self.temp_jitter = temp_jitter
        self.store_in_ram = store_in_ram
        if trim is None or trim <= 0:
            trim = np.inf

        # Get filenames in folder and subfolders
        self.filenames = []
        for dirpath, dirnames, filenames in os.walk(self.path_in):
            for fn in filenames:
                if not fn.endswith(EXTENSION):
                    continue
                new_fn = os.path.join(dirpath, fn)
                new_fn = os.path.relpath(new_fn, self.path_in)
                self.filenames.append(new_fn)
        self.filenames.sort()
        random.seed(seed)
        random.shuffle(self.filenames)

        # Get speakers & utterances
        self.speakers = {}
        self.utterances = {}
        for fullfn in self.filenames:
            spk, ut = self.filename_split(fullfn)
            if spk not in self.speakers:
                self.speakers[spk] = len(self.speakers)
            if ut not in self.utterances:
                self.utterances[ut] = len(self.utterances)
        self.n_max_speakers = len(self.speakers)

        # Split
        lutterances = list(self.utterances.keys())
        lutterances.sort()
        random.shuffle(lutterances)
        lspeakers = list(self.speakers.keys())
        lspeakers.sort()
        random.shuffle(lspeakers)
        isplit_ut = int(len(lutterances) * pc_split_utterances)
        isplit_spk = int(len(lspeakers) * pc_split_speakers)
        if split == 'train':
            spk_del = lspeakers[-2 * isplit_spk:]
            ut_del = lutterances[-2 * isplit_ut:]
        elif split == 'valid':
            spk_del = lspeakers[:-2 * isplit_spk] + lspeakers[-isplit_spk:]
            ut_del = lutterances[:-2 * isplit_ut] + lutterances[-isplit_ut:]
        elif split == 'train+valid':
            spk_del = lspeakers[-isplit_spk:]
            ut_del = lutterances[-isplit_ut:]
        elif split == 'test':
            spk_del = lspeakers[:-isplit_spk]
            ut_del = lutterances[:-isplit_ut]
        else:
            print('Not implemented split', split)
            sys.exit()
        if split_speakers:
            for spk in spk_del:
                del self.speakers[spk]
        if split_utterances:
            for ut in ut_del:
                del self.utterances[ut]

        # Filter filenames by speaker and utterance
        filenames_new = []
        for filename in self.filenames:
            spk, ut = self.filename_split(filename)
            if spk in self.speakers and ut in self.utterances:
                filenames_new.append(filename)
        self.filenames = filenames_new

        # Select speaker
        if select_speaker is not None:
            select_speaker = select_speaker.split(',')
            filenames_new = []
            for filename in self.filenames:
                spk, ut = self.filename_split(filename)
                if spk in select_speaker and spk in self.speakers:
                    filenames_new.append(filename)
            if len(filenames_new) == 0:
                print('\nERROR: Selected an invalid speaker. Options are:',
                      list(self.speakers.keys()))
                sys.exit()
            self.filenames = filenames_new

        # Select specific file
        if select_file is not None:
            select_file = select_file.split(',')
            filenames_new = []
            for filename in self.filenames:
                _, file = os.path.split(filename[:-len(EXTENSION)])
                if file in select_file:
                    filenames_new.append(filename)
            if len(filenames_new) == 0:
                print('\nERROR: Selected an invalid file. Options are:',
                      self.filenames[:int(np.min([50, len(self.filenames)]))],
                      '... (without folder and without extension)')
                sys.exit()
            self.filenames = filenames_new

        # Indices!
        self.audios = [None] * len(self.filenames)
        self.indices = []
        duration = {}
        start = time.time()
        if do_audio_load:
            for i, filename in enumerate(self.filenames):
                if verbose:
                    if i % 1000 == 0:
                        print('Read {} out of {} files'.format(
                            i + 1, len(self.filenames)))
                    # print('\rRead audio {:5.1f}%'.format(
                    #     100 * (i + 1) / len(self.filenames)), end='')
                # Info
                spk, ut = self.filename_split(filename)
                ispk, iut = self.speakers[spk], self.utterances[ut]
                # Load
                if spk not in duration:
                    duration[spk] = 0
                if duration[spk] >= trim:
                    continue
                x = torch.load(os.path.join(self.path_in, filename))
                if self.store_in_ram:
                    self.audios[i] = x.clone()
                x = x.float()
                # Process
                for j in range(0, len(x), stride):
                    if j + self.lchunk >= len(x):
                        continue
                    else:
                        xx = x[j:j + self.lchunk]
                    if (xx.pow(2).sum() / self.lchunk).sqrt().item() >= frame_energy_thres:
                        info = [i, j, 0, ispk, iut]
                        self.indices.append(torch.LongTensor(info))
                    duration[spk] += stride / sampling_rate
                    if duration[spk] >= trim:
                        break
                self.indices[-1][2] = 1
            if verbose:
                print()
        self.indices = torch.stack(self.indices)
        print("Time elapsed: {}".format(time.time() - start))

        # Print
        if verbose:
            totalduration = 0
            for key in duration.keys():
                totalduration += duration[key]
            print('Loaded {:s}: {:.1f} h, {:d} spk, {:d} ut, {:d} files, '
                  '{:d} frames (fet={:.1e},'.format(
                      split, totalduration / 3600, len(self.speakers),
                      len(self.utterances), len(self.filenames),
                      len(self.indices), frame_energy_thres), end='')
            if trim is None or trim > 1e12:
                print(' no trim)')
            else:
                print(' trim={:.1f}s)'.format(trim))
            if select_speaker is not None:
                print('Selected speaker(s):', select_speaker)
            if select_file is not None:
                print('Selected file(s):', select_file)

    def filename_split(self, fullfn):
        aux = os.path.split(fullfn)[-1][:-len(EXTENSION)].split('_')
        return aux[0], aux[1]

    def get_mel(self, audio):
        audio = audio.unsqueeze(0)
        audio = torch.autograd.Variable(audio, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_whole_audio(self, idx):
        # Load file
        if self.store_in_ram:
            x = self.audios[idx]
        else:
            x = torch.load(os.path.join(self.path_in, self.filenames[idx]))
        assert x is not None
        x = x.float()
        # Info
        spk, ut = self.filename_split(self.filenames[idx])
        ispk, iut = self.speakers[spk], self.utterances[ut]
        y = torch.LongTensor([ispk])
        ichap = torch.LongTensor([iut])
        last = torch.LongTensor([1])
        return x, y, ichap, last

    def __getitem__(self, index):
        if self.split == 'test':
            return self.get_whole_audio(index)
        i, j, last, ispk, ichap = self.indices[index, :]
        # Load file
        if self.store_in_ram:
            tmp = self.audios[i]
        else:
            tmp = torch.load(os.path.join(self.path_in, self.filenames[i]))
        # Temporal jitter
        if self.temp_jitter:
            j = j + np.random.randint(-self.stride // 2, self.stride // 2)
            if j < 0:
                j = 0
            elif j > len(tmp) - self.lchunk:
                j = np.max([0, len(tmp) - self.lchunk])
        # Get frame
        if j + self.lchunk > len(tmp):
            x = tmp[j:].float()
            x = torch.cat([x, torch.zeros(self.lchunk - len(x))])
        else:
            x = tmp[j:j + self.lchunk].float()
        # Get info
        y = torch.LongTensor([ispk])
        # mel = self.get_mel(x)
        return x, y, ichap, last

    def __len__(self):
        if self.split == 'test':
            return len(self.filenames)
        return self.indices.size(0)
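# Illustrative sketch, not part of the original file: constructing train and
# valid datasets from a folder of pre-saved audio tensors and wrapping them in
# DataLoaders. The directory path, chunk/stride sizes, STFT settings, and batch
# size are assumptions for the example; EXTENSION is expected to match the
# stored files, and filenames are expected to follow the speaker_utterance
# naming used by filename_split.
if __name__ == "__main__":
    common = dict(n_audio_channel=128, segment_length=16000,
                  filter_length=1024, hop_length=256, win_length=1024,
                  sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0,
                  stride=16000)
    trainset = Mel2Samp(path_in='data/', split='train', temp_jitter=True,
                        **common)
    validset = Mel2Samp(path_in='data/', split='valid', **common)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=8,
                                               shuffle=True)
    x, y, ichap, last = next(iter(train_loader))
    # x: (batch, segment_length) audio chunks, y: speaker indices
    print(x.shape, y.shape)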
parser.add_argument("-d", "--denoiser_strength", default=0.0, type=float, help='Removes model bias. Start with 0.1 and adjust') args = parser.parse_args() with open(args.config) as f: data = f.read() config = json.loads(data) global data_config data_config = config["data_config"] data_config['split'] = 'train' global squeezewave_config squeezewave_config = config['squeezewave_config'] stft = TacotronSTFT(filter_length=data_config['filter_length'], hop_length=data_config['hop_length'], win_length=data_config['win_length'], sampling_rate=data_config['sampling_rate'], mel_fmin=data_config['mel_fmin'], mel_fmax=data_config['mel_fmax'], n_group=squeezewave_config['n_audio_channel']) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) os.chmod(args.output_dir, 0o775) main(args.squeezewave_path, args.sigma, args.output_dir, args.is_fp16, args.denoiser_strength)