def get_mel(self, filename):
    """Return the mel-spectrogram for ``filename``.

    When ``self.load_mel_from_disk`` is false, ``filename`` is read with
    ``load_wav_to_torch``, divided by ``self.max_wav_value``, given a leading
    dimension and passed through ``self.stft.mel_spectrogram``; the leading
    dimension is squeezed away again before returning.  Otherwise
    ``filename`` is loaded with ``np.load`` as a precomputed spectrogram.

    Raises:
        ValueError: the sampling rate reported for the audio differs from
            ``self.stft.sampling_rate``.
        AssertionError: a spectrogram loaded from disk does not have
            ``self.stft.n_mel_channels`` entries along its first dimension.
    """
    if self.load_mel_from_disk:
        melspec = torch.from_numpy(np.load(filename))
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
        return melspec

    audio, sampling_rate = load_wav_to_torch(filename)
    if sampling_rate != self.stft.sampling_rate:
        # The template has three placeholders; with only two arguments
        # str.format raised IndexError and the ValueError was never seen.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            filename, sampling_rate, self.stft.sampling_rate))

    audio_norm = audio / self.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = self.stft.mel_spectrogram(audio_norm)
    return torch.squeeze(melspec, 0)
def __getitem__(self, index):
    """Return one mu-law encoded audio segment of ``self.segment_length`` samples.

    The entry ``self.audio_files[index]`` is resolved relative to the
    ``'dataset'`` directory and loaded with ``utils.load_wav_to_torch``.
    Audio at least ``self.segment_length`` long is cropped at a random
    offset; shorter audio is zero-padded on the right.  The segment is then
    divided by ``utils.MAX_WAV_VALUE`` and encoded with
    ``utils.mu_law_encode`` using ``self.mu_quantization``.

    Raises:
        ValueError: the file's sampling rate differs from
            ``self.sampling_rate``.
    """
    filename = os.path.join('dataset', self.audio_files[index])
    audio, sampling_rate = utils.load_wav_to_torch(filename)
    if sampling_rate != self.sampling_rate:
        # The offending file and both rates go into the exception itself
        # instead of a separate print, and the "math" typo is corrected.
        raise ValueError(
            "{}: sampling rate {} doesn't match target {}".format(
                filename, sampling_rate, self.sampling_rate))

    n_samples = audio.size(0)
    if n_samples >= self.segment_length:
        audio_start = random.randint(0, n_samples - self.segment_length)
        audio = audio[audio_start:audio_start + self.segment_length]
    else:
        audio = F.pad(
            audio, (0, self.segment_length - n_samples), 'constant').data

    return utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                               self.mu_quantization)
def get_mel(self, filename, stft):
    """Return the mel-spectrogram for ``filename``, using an on-disk cache.

    The cache file sits next to the audio: the last four characters of
    ``filename`` are replaced by ``'_<n_mel_channels>.npy'``.  A readable
    cache whose first dimension equals ``stft.n_mel_channels`` is returned
    directly.  Otherwise the audio is loaded with ``load_wav_to_torch``,
    divided by ``self.max_wav_value``, converted with
    ``stft.mel_spectrogram`` and written back to the cache file.

    Raises:
        ValueError: the audio's sampling rate differs from
            ``stft.sampling_rate``.
    """
    cache_path = filename[:-4] + '_' + str(stft.n_mel_channels) + '.npy'

    try:
        cached = torch.from_numpy(np.load(cache_path))
        # Validate against the same ``stft`` that names the cache file; the
        # original consulted ``self.stft`` here, so a missing or different
        # attribute silently forced a recompute on every call.
        if cached.size(0) == stft.n_mel_channels:
            return cached
    except Exception:
        # Deliberate best effort: a missing, unreadable or malformed cache
        # just means we recompute.  ``Exception`` (not a bare ``except``)
        # lets KeyboardInterrupt / SystemExit propagate.
        pass

    audio, sampling_rate = load_wav_to_torch(filename)
    if sampling_rate != stft.sampling_rate:
        # Three placeholders need three arguments; with two, str.format
        # raised IndexError instead of the intended ValueError.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            filename, sampling_rate, stft.sampling_rate))

    audio_norm = audio / self.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)

    with open(cache_path, 'wb') as f:
        np.save(f, melspec.numpy())
    return melspec
def get_mel(self, filename):
    """Return the mel-spectrogram for ``filename``.

    With ``self.load_mel_from_disk`` set, ``filename`` is loaded with
    ``np.load`` and converted to a float tensor.  Otherwise the audio is
    read with ``load_wav_to_torch`` (which here also returns a maximum
    value), optionally trimmed by ``self.audio_offset`` samples from the
    start, normalised and passed through ``self.stft.mel_spectrogram``.

    Side effect: on the audio path ``self.max_wav_value`` is overwritten
    with the largest of the reported maximum and the observed absolute
    peak of the (trimmed) audio.

    Raises:
        ValueError: the audio's sampling rate differs from
            ``self.stft.sampling_rate``.
        AssertionError: a spectrogram loaded from disk does not have
            ``self.stft.n_mel_channels`` entries along its first dimension.
    """
    if self.load_mel_from_disk:
        # SECURITY NOTE: allow_pickle=True lets np.load unpickle arbitrary
        # objects; only point this at spectrogram files you trust.
        melspec = torch.from_numpy(
            np.load(filename, allow_pickle=True)).float()
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
        return melspec

    audio, sampling_rate, max_value = load_wav_to_torch(filename)
    if self.audio_offset:  # used for extreme GTA'ing
        audio = audio[self.audio_offset:]
    # Sometimes the magnitude of the audio exceeds the max of the datatype
    # it had before casting, so the observed peak is taken into account.
    self.max_wav_value = max(
        max_value, audio.max().item(), -audio.min().item())
    if sampling_rate != self.stft.sampling_rate:
        # Three placeholders need three arguments; with two, str.format
        # raised IndexError instead of the intended ValueError.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            filename, sampling_rate, self.stft.sampling_rate))

    audio_norm = audio / self.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = self.stft.mel_spectrogram(audio_norm)
    return torch.squeeze(melspec, 0)
def get_audio(self, filename):
    """Return ``(spec, audio_norm)`` for ``filename``.

    The audio is loaded with ``load_wav_to_torch``, divided by
    ``self.max_wav_value`` and given a leading dimension.  The spectrogram
    is read from a ``.spec.pt`` file beside the audio when that file
    exists; otherwise it is computed with ``spectrogram_torch``, squeezed
    on dimension 0 and saved there for later calls.

    Raises:
        ValueError: the file's sampling rate differs from
            ``self.sampling_rate``.
    """
    audio, sampling_rate = load_wav_to_torch(filename)
    if sampling_rate != self.sampling_rate:
        # Three placeholders need three arguments; with two, str.format
        # raised IndexError instead of the intended ValueError.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            filename, sampling_rate, self.sampling_rate))

    audio_norm = (audio / self.max_wav_value).unsqueeze(0)

    # Only swap the extension.  ``str.replace`` rewrote every ".wav" in the
    # path and, for other extensions, returned the audio path unchanged, so
    # the cache path collided with the audio file itself.
    root, ext = os.path.splitext(filename)
    spec_filename = (root if ext == ".wav" else filename) + ".spec.pt"

    if os.path.exists(spec_filename):
        spec = torch.load(spec_filename)
    else:
        spec = spectrogram_torch(
            audio_norm, self.filter_length, self.sampling_rate,
            self.hop_length, self.win_length, center=False)
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_filename)
    return spec, audio_norm
def get_mel_and_f0(self, filepath):
    """Return ``(melspec, f0)`` for the audio file at ``filepath``.

    The spectrogram is produced by ``linearspectrogram_torch`` from the
    audio divided by ``self.max_wav_value``; the F0 track comes from
    ``self.get_f0`` on the raw audio and gains a leading dimension.
    """
    audio, sampling_rate = load_wav_to_torch(filepath)

    # Spectrogram generation uses the aukit scheme.  The earlier
    # TacotronSTFT path (sampling-rate check, unsqueeze, mel_spectrogram,
    # squeeze) is disabled.
    melspec = linearspectrogram_torch(audio / self.max_wav_value)

    f0_track = self.get_f0(
        audio.cpu().numpy(),
        sampling_rate,
        self.filter_length,
        self.hop_length,
        self.f0_min,
        self.f0_max,
        self.harm_thresh,
    )
    f0 = torch.from_numpy(f0_track)[None]
    # Disabled alternatives: trimming f0 to melspec.size(1) frames, and
    # replacing F0 with a zero vector of that length.
    return melspec, f0
def get_mel_and_f0(filepath, filter_length, hop_length, win_length,
                   n_mel_channels, sampling_rate, mel_fmin, mel_fmax,
                   f0_min, f0_max, harm_thresh):
    """Return ``(melspec, f0)`` for the audio file at ``filepath``.

    A ``layers.TacotronSTFT`` is built from the given analysis parameters,
    the audio is loaded with ``load_wav_to_torch`` and converted to a
    mel-spectrogram, and ``get_f0`` supplies the F0 track, which is cut to
    the spectrogram's second dimension.

    Raises:
        ValueError: the file's sampling rate differs from
            ``stft.sampling_rate``.
    """
    stft = layers.TacotronSTFT(
        filter_length, hop_length, win_length, n_mel_channels,
        sampling_rate, mel_fmin, mel_fmax)

    audio, wav_rate = load_wav_to_torch(filepath)
    if wav_rate != stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            wav_rate, stft.sampling_rate))

    # No amplitude scaling here: the audio was converted to float32 during
    # preprocessing, so that normalization is unnecessary.
    melspec = torch.squeeze(stft.mel_spectrogram(audio.unsqueeze(0)), 0)

    f0_track = get_f0(audio.cpu().numpy(), wav_rate, filter_length,
                      hop_length, f0_min, f0_max, harm_thresh)
    n_frames = melspec.size(1)
    f0 = torch.from_numpy(f0_track)[None][:, :n_frames]
    return melspec, f0
def __getitem__(self, index):
    """Return ``(mel, audio)`` for the file at ``self.audio_files[index]``.

    A segment of ``self.segment_length`` samples is taken (random crop, or
    right-padding when the file is shorter), ``self.get_mel`` is applied to
    it, and the segment itself is mu-law encoded.

    Raises:
        ValueError: the file's sampling rate differs from
            ``self.sampling_rate``.
    """
    # Load the waveform and check its rate.
    path = self.audio_files[index]
    audio, sampling_rate = utils.load_wav_to_torch(path)
    if sampling_rate != self.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, self.sampling_rate))

    # Bring the waveform to exactly segment_length samples.
    shortfall = self.segment_length - audio.size(0)
    if shortfall > 0:
        audio = torch.nn.functional.pad(
            audio, (0, shortfall), 'constant').data
    else:
        begin = random.randint(0, -shortfall)
        audio = audio[begin:begin + self.segment_length]

    mel = self.get_mel(audio)
    encoded = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                  self.mu_quantization)
    return (mel, encoded)
def get_mel(self, fb):
    """Return the mel-spectrogram for the utterance with base name ``fb``.

    Depending on ``self.load_mel_from_disk`` the spectrogram is either read
    from ``<self.mel_path>/<fb>.npy`` (then standardised with the mean/std
    stored in ``self.MelStd_mel`` and transposed) or computed from
    ``<self.audio_path>/<fb>.wav`` through ``self.stft.mel_spectrogram``.

    Raises:
        AssertionError: the result's first dimension is not
            ``self.stft.n_mel_channels``.
    """
    if not self.load_mel_from_disk:
        wav_path = os.path.join(self.audio_path, fb + '.wav')
        audio = load_wav_to_torch(wav_path, self.sampling_rate)
        scaled = (audio / self.max_wav_value).unsqueeze(0)
        scaled = torch.autograd.Variable(scaled, requires_grad=False)
        melspec = torch.squeeze(self.stft.mel_spectrogram(scaled), 0)
    else:
        stored = np.load(os.path.join(self.mel_path, fb + '.npy'))
        mean, std = np.load(self.MelStd_mel)
        standardised = np.transpose((stored - mean) / std)
        melspec = torch.from_numpy(standardised)

    # Expected layout: [mel_bin, T]
    assert melspec.size(0) == self.stft.n_mel_channels, (
        'Mel dimension mismatch: given {}, expected {}'.format(
            melspec.size(0), self.stft.n_mel_channels))
    return melspec
def prepare_mel_npy(hparams, audiopath_and_text):
    """Compute and save a mel-spectrogram ``.npy`` for every listed audio file.

    ``audiopath_and_text`` is handed to ``load_filepaths_and_text``; the
    first field of each returned entry is taken as the audio path.  Each
    file is loaded, divided by ``hparams.max_wav_value``, converted with a
    ``layers.TacotronSTFT`` built from ``hparams`` and saved with
    ``np.save``.  Progress is printed as ``"<index> / <total>"``.

    NOTE(review): the output location is derived by fixed-position slicing
    of the audio path (first 11 characters as directory, characters 12 up
    to the 4-character extension as file name), so it presumably assumes
    one specific directory layout; confirm before reusing elsewhere.
    """
    audiopath_and_texts = load_filepaths_and_text(audiopath_and_text)
    total = len(audiopath_and_texts)

    # Build the STFT module once; it was previously rebuilt for every file
    # although it depends only on ``hparams``.
    stft = layers.TacotronSTFT(hparams)

    for i, entry in enumerate(audiopath_and_texts):
        # Only the path is needed; text and speaker id were unpacked but
        # never used.
        audiopath = entry[0]
        audio, _ = load_wav_to_torch(audiopath)

        audio_norm = (audio / hparams.max_wav_value).unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = torch.squeeze(stft.mel_spectrogram(audio_norm), 0)

        out_dir = audiopath[:11]
        file_name = audiopath[12:-4]
        np.save(os.path.join(out_dir, file_name), melspec)
        print("{} / {}".format(i, total))
def dnp(run_name, noisy_file, samples_dir, LR=0.001, num_iter=5000,
        save_every=50):
    """Fit a ``Unet`` to map a fixed random signal onto ``noisy_file``.

    A 6-layer ``Unet`` is created on the GPU, the output directory
    ``<samples_dir>/<run_name>`` is created, the audio is loaded and scaled
    by ``utils.MAX_WAV_VALUE``, and ``optimize`` is run with an MSE loss.
    """
    # Set up the model and the output location.
    nlayers = 6
    model = Unet(nlayers=nlayers, nefilters=60).cuda()
    samples_dir = os.path.join(samples_dir, run_name)
    utils.makedirs(samples_dir)

    # Load the target and cut it to a whole multiple of 2**nlayers samples.
    target, sr = utils.load_wav_to_torch(noisy_file)
    block = 2 ** nlayers
    usable = (len(target) // block) * block
    target = target[:usable] / utils.MAX_WAV_VALUE

    # Random input built from torch.rand_like: shifted by 0.5, then doubled.
    noise = (torch.rand_like(target) - 0.5) * 2

    target = target.cuda()
    noise = noise.cuda()

    criterion = torch.nn.MSELoss()
    optimize(model.parameters(), model, criterion, noise, target,
             samples_dir, LR, num_iter, sr, save_every)
help='Directory to put Mel-Spectrogram Tensors') parser.add_argument('-c', '--config', type=str, help='JSON file for configuration') args = parser.parse_args() filepaths = utils.files_to_list(args.audio_list) # Make directory if it doesn't exist if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) os.chmod(args.output_dir, 0o775) # Parse config. Only using data processing with open(args.config) as f: data = f.read() config = json.loads(data) data_config = config["data_config"] mel_factory = Mel2SampOnehot(**data_config) for filepath in filepaths: audio, sampling_rate = utils.load_wav_to_torch(filepath) assert (sampling_rate == mel_factory.sampling_rate) melspectrogram = mel_factory.get_mel(audio) filename = os.path.basename(filepath) new_filepath = args.output_dir + '/' + filename + '.pt' print(new_filepath) torch.save(melspectrogram, new_filepath)
def get_mel(self, filename):
    """Load ``filename`` and return ``self.mel_transform_fn``'s transform of it.

    The audio is loaded with ``load_wav_to_torch`` (passing
    ``self.sampling_rate``) and viewed as a single row before the transform.
    """
    waveform = load_wav_to_torch(filename, self.sampling_rate)
    as_row = waveform.view(1, -1)
    return self.mel_transform_fn.transform(as_row)
# Hyper-parameters and the STFT front end shared by the steps below.
hparams = create_hparams()
stft = layers.TacotronSTFT(
    hparams.filter_length,
    hparams.hop_length,
    hparams.win_length,
    hparams.n_mel_channels,
    hparams.sampling_rate,
    hparams.mel_fmin,
    hparams.mel_fmax,
)

if gen_mel:
    # Names are taken from audio/*.wav; the audio itself is read from
    # hparams.audio_path.
    audio_files = sorted(glob('audio/*.wav'))
    out_dir = 'mel'
    SaveMkdir(out_dir)

    for file in tqdm(audio_files):
        tqdm.write(file)
        file_basename = os.path.basename(file).split('.')[0]
        audio_path = os.path.join(hparams.audio_path, file_basename + '.wav')
        audio, sampling_rate = load_wav_to_torch(
            audio_path, hparams.sampling_rate)

        audio_norm = torch.autograd.Variable(
            (audio / hparams.max_wav_value).unsqueeze(0),
            requires_grad=False)

        # Transpose before storing, so rows are frames and columns are
        # features.
        melspec = torch.squeeze(
            stft.mel_spectrogram(audio_norm), 0).numpy().transpose()

        out_file = os.path.join(out_dir, file_basename + '.npy')
        np.save(out_file, melspec)

    # Mean/std statistics over the saved spectrograms, stored one level
    # above the output directory.
    mean_std = cal_MeanStd(out_dir, hparams.n_mel_channels, ref_file=None)
    np.save(os.path.join(out_dir, os.pardir, 'MeanStd_Tacotron_mel.npy'),
            mean_std)
def __getitem__(self, index):
    """Return the first element of ``load_wav_to_torch`` for ``self.audios[index]``."""
    loaded = load_wav_to_torch(self.audios[index])
    return loaded[0]