def main():
    config = ConfigXT()
    load = FileXT(config.audio_path)
    print(colored('Preprocessing audio for ', 'blue', attrs=['bold']) + load.basename)

    data = preprocess.preprocess(load.filename, config.speaker, config, verbose=False)
    if config.infer_acoustic:
        f0, rmse = acoustic_infer(data, config)
        for i in range(len(data)):
            data[i][4] = f0[i]
            data[i][5] = rmse[i]

    mel = durian_infer(data, config)
    if config.vocoder == 'wavernn':
        wave = wavernn_infer(mel, config)
    elif config.vocoder == 'waveglow':
        wave = waveglow_infer(mel, config)

    savename = config.durian_path.replace('.pt', '_') + FileXT(config.vocoder_path).basestem \
        + '_speaker' + str(config.speaker) + '_' + load.basename
    torchaudio.save(savename, wave, config.sample_rate)
    print(colored('Audio generated to ', 'blue', attrs=['bold']) + savename + '\n')
def test_vae(test_fns):
    """Used with a trained model, where MODEL_FN is already a saved .pth file."""
    d = prep(test_fns)
    y_hats = []
    length = len(d['data'])
    img_y_hats = []
    for epoch in range(1, EPOCHS + 1):
        print(f'test epoch: {epoch}')
        train.train_epoch(d, epoch, BATCH_SIZE, device)
        # random.randint is inclusive on both ends, so use length - 1 to stay in range
        apply_idx = random.randint(0, length - 1)
        sample = d['data'][apply_idx].view(BATCH_SIZE, 2, -1)
        y_hat = utilz.gen_apply(d['m'], sample, device).cpu()
        print(f'y_hat.shape: {y_hat.shape}')
        img_vers = y_hat.view(1, 3, 240, 245)  # for 2 seconds
        img_y_hats.append(img_vers)
        y_hats.append(y_hat)
    song = torch.cat(y_hats, dim=1)
    print(song)
    if SAVE_SONG:
        save_wavfn = f'vaeconv_{RUN_TIME}.wav'
        song_path = d['path'] + save_wavfn
        torchaudio.save(song_path, song, d['sr'])
        print(f'audio saved to {song_path}')
    return song  # , video
def test_filter():
    audio, sr = torchaudio.load("tests/classical.00002.wav")
    num_samples = sr * 5
    transform = Compose([HighLowPass(sample_rate=sr)])
    audios = transform(audio)
    torchaudio.save("tests/filter.wav", audios, sample_rate=sr)
    assert audios.shape[1] == audio.shape[1]
def infer_wavenet(args):
    import sys
    sys.path.append('thirdparty/wavenet_vocoder')
    from train import build_model
    from synthesis import wavegen
    from tqdm import tqdm

    target_sample_rate = 22050
    hparams, model = load_model(args.model_name)
    meller = MelSpectrogram()
    files = [item for item in os.listdir(args.folder_in) if item.endswith('wav')]
    for idx, audio in enumerate(files):
        wav_path = os.path.join(args.folder_in, audio)
        wav = load_wav(wav_path, target_sample_rate)
        c = meller(wav)[0]
        if c.shape[1] != hparams.num_mels:
            c = c.transpose(0, 1)
        # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
        # c = np.interp(c, (0, 4), (0, 1))

        # Generate
        waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)

        path = os.path.join(args.folder_out, audio)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        torchaudio.save(path, waveform, hparams.sample_rate)
def main(args):
    # spectrogram = torch.from_numpy(np.load(args.spectrogram_path))
    params = {}
    if args.noise_schedule:
        params['noise_schedule'] = torch.from_numpy(np.load(args.noise_schedule))

    start_time = time.time()
    model = load_model(model_dir=args.model_dir, params=params)
    if not os.path.exists(args.output_path):
        os.mkdir(args.output_path)
    time_consuming = time.time() - start_time
    print(" > Load model, time consuming {}s".format(round(time_consuming, 2)))

    mels = os.listdir(args.spectrogram_path)
    for mel_file in mels:
        if not os.path.isdir(mel_file) and mel_file[-2:] == "pt":
            start_time = time.time()
            print(" > Start inferencing sentence {} . ".format(mel_file))
            spectrogram = torch.tensor(
                torch.load(os.path.join(args.spectrogram_path, mel_file)))
            audio, sr = predict(spectrogram, model)
            wav_name = os.path.join(args.output_path, mel_file[:-7] + "_wg.wav")
            torchaudio.save(wav_name, audio.cpu(), sample_rate=sr)
            time_consuming = time.time() - start_time
            print(" > Complete, time consuming {}s".format(round(time_consuming, 2)))
def inference_fast(mels, real_audio=None, i=0, epoch=0):
    generator.eval()
    val_losses = []
    audio = torch.zeros(mels.shape[0], 1, 1).to(device)
    with torch.no_grad():
        for length in range(1, mels.shape[2] + 1):
            res = generator(audio[:, :, -1:],
                            mels[:, :, length - 1:length],
                            fast_generation=True)
            audio = torch.cat([
                audio,
                mu_decode(torch.argmax(res[:, :, -1:], dim=1)).unsqueeze(1)
            ], dim=2)
    name = "example " + str(i) + "_no_tf"
    # torchaudio.save expects a (channels, time) tensor, not a numpy array
    torchaudio.save("gen.wav", audio[0].detach().cpu(), sample_rate=22050)
    wandb_gen = wandb.Audio(audio[0].squeeze().detach().cpu().numpy(),
                            caption="Inference_1",
                            sample_rate=22050)
    wandb_audios = [wandb_gen]
    if real_audio is not None:
        wandb_real = wandb.Audio(real_audio[0].squeeze().detach().cpu().numpy(),
                                 caption="Real_1",
                                 sample_rate=22050)
        wandb_audios.append(wandb_real)
    wandb.log({name: wandb_audios}, step=epoch)
def resample_folder(in_dir, out_dir, target_fs=16000, regex="*.wav"):
    """Resamples the audio files contained in the in_dir folder and saves them in the out_dir folder.

    Args:
        in_dir (str): path to the audio directory (audio to be resampled)
        out_dir (str): path to the resampled audio directory
        target_fs (int, optional): target sample rate. Defaults to 16000.
        regex (str, optional): glob pattern for the file extension. Defaults to "*.wav".
    """
    compute = True
    files = glob.glob(os.path.join(in_dir, regex))
    if os.path.exists(out_dir):
        out_files = glob.glob(os.path.join(out_dir, regex))
        if len(files) == len(out_files):
            compute = False
    if compute:
        for f in tqdm.tqdm(files):
            audio, orig_fs = torchaudio.load(f)
            audio = resample(audio, orig_fs, target_fs)
            os.makedirs(
                Path(os.path.join(out_dir, Path(f).relative_to(Path(in_dir)))).parent,
                exist_ok=True,
            )
            torchaudio.save(
                os.path.join(out_dir, Path(f).relative_to(Path(in_dir))),
                audio,
                target_fs,
            )
    return compute
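# A standalone sketch of the per-file step in resample_folder above, using
# torchaudio.functional.resample in place of the repo's own `resample` helper
# (which is not shown here); file names are purely illustrative.
import torchaudio
import torchaudio.functional as F

def resample_file(in_path: str, out_path: str, target_fs: int = 16000) -> None:
    audio, orig_fs = torchaudio.load(in_path)                        # (channels, time)
    audio = F.resample(audio, orig_freq=orig_fs, new_freq=target_fs)
    torchaudio.save(out_path, audio, target_fs)

# Example: resample_file("in_dir/example.wav", "out_dir/example.wav")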
def asadsa2():
    """ """
    from draugr.torch_utilities import to_tensor
    from neodroidaudition.data.recognition.libri_speech import LibriSpeech
    from neodroidaudition.noise_generation.gaussian_noise import white_noise
    import torchaudio
    from pathlib import Path

    libri_speech = LibriSpeech(path=Path.home() / "Data" / "Audio" / "Speech" / "LibriSpeech")
    files, sr = zip(*[(v[0].numpy(), v[1]) for _, v in zip(range(1), libri_speech)])
    assert all([sr[0] == s for s in sr[1:]])

    normed = files[0]
    mixed = mix_ratio(normed, normed, 0)
    mixed2 = mix_ratio(mixed, mixed, 0)
    print(normed, mixed)
    print(mixed2, mixed)
    print(root_mean_square(normed))
    print(root_mean_square(mixed))
    print(root_mean_square(mixed2))
    assert numpy.allclose(normed, mixed)
    assert numpy.allclose(mixed2, mixed)
    torchaudio.save(
        str(ensure_existence(Path.cwd() / "exclude") / "mixed_same.wav"),
        to_tensor(mixed),
        int(sr[0]),
    )
def convert(path):
    split_list = []
    with open(join(path, 'hub5e_00.pem'), 'r') as fp:
        for line in fp:
            if line[0] not in ['e', 's']:
                continue
            if line[-1] == '\n':
                line = line[:-1]
            line = line.split(' ')
            name = line[0] + '-' + line[1] + '.wav'
            time_0 = float(line[3])
            i = 4
            while len(line[i]) <= 1:
                i += 1
            time_1 = float(line[i])
            split_list.append((name, time_0, time_1))
    print('From {} found {} segments'.format(join(path, 'hub5e_00.pem'), len(split_list)))
    split_list = sorted(split_list, key=lambda x: x[0])

    file_list = list(Path(path).rglob('*.wav'))
    file_list = sorted(file_list)
    print('From {} found {} wav files.'.format(path, len(file_list)))

    j = 0
    for idx, p in tqdm(enumerate(file_list)):
        waveform, sr = torchaudio.load(str(p))
        count = 0
        while split_list[j][0] == str(p).split('/')[-1]:
            t_0, t_1 = split_list[j][1], split_list[j][2]
            split_wavform = waveform[:, int(t_0 * sr):int(t_1 * sr) + 1]
            torchaudio.save(
                str(p)[:-4] + '-' + str(count).zfill(3) + '.wav',
                split_wavform, sr)
            j += 1
            count += 1
            if j == len(split_list):
                break
    print('Finished splitting {} wav files into {}.'.format(j, len(split_list)))
def save_audio(wav, path, samplerate, bitrate=320, clip='rescale',
               bits_per_sample=16, as_float=False):
    """Save audio file, automatically preventing clipping if necessary
    based on the given `clip` strategy. If the path ends in `.mp3`, this
    will save as mp3 with the given `bitrate`.
    """
    wav = prevent_clip(wav, mode=clip)
    path = Path(path)
    suffix = path.suffix.lower()
    if suffix == ".mp3":
        encode_mp3(wav, path, samplerate, bitrate)
    elif suffix == ".wav":
        if as_float:
            bits_per_sample = 32
            encoding = 'PCM_F'
        else:
            encoding = 'PCM_S'
        ta.save(str(path), wav, sample_rate=samplerate,
                encoding=encoding, bits_per_sample=bits_per_sample)
    else:
        raise ValueError(f"Invalid suffix for path: {suffix}")
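# A minimal sketch of the wav branch above in isolation: with the sox_io/soundfile
# backends, torchaudio.save accepts `encoding` and `bits_per_sample`, so the same
# tensor can be written as 16-bit integer PCM or 32-bit float PCM. The tone and
# file names here are made up for illustration.
import math
import torch
import torchaudio

t = torch.arange(16000) / 16000
wav = 0.5 * torch.sin(2 * math.pi * 440 * t).unsqueeze(0)           # (1, time), 1 s at 16 kHz
torchaudio.save("tone_int16.wav", wav, 16000, encoding="PCM_S", bits_per_sample=16)
torchaudio.save("tone_float32.wav", wav, 16000, encoding="PCM_F", bits_per_sample=32)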
def test_cut_load_custom_recording_pad_left():
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(1, compute_num_samples(duration, sampling_rate)).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]

    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have an "alignment" attribute,
        # and a "load_alignment()" method.
        # We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        cut_pad = cut.pad(duration=60.0, direction="left")
        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        np.testing.assert_almost_equal(0, restored_audio[:, :-audio.shape[1]])
        np.testing.assert_almost_equal(audio, restored_audio[:, -audio.shape[1]:])
def main(cfg: DictConfig):
    spectrogram_path = Path(hydra.utils.to_absolute_path(cfg.inference.spectrogram_path))
    output_path = Path(hydra.utils.to_absolute_path(cfg.inference.inferenced_path))
    output_audio_path = output_path / (spectrogram_path.stem + ".wav")
    output_plot_path = output_path / (spectrogram_path.stem + ".png")

    model = core.model.WaveNet.load_from_checkpoint(
        hydra.utils.to_absolute_path(cfg.inference.checkpoint_path))
    model = model.eval().to(cfg.inference.device)

    spectrogram = torch.load(spectrogram_path).to(model.device)
    if len(spectrogram.shape) < 3:
        spectrogram = spectrogram.unsqueeze(0)
    if cfg.inference.cut_size is not None:
        spectrogram = spectrogram[:, :, :int(cfg.inference.cut_size *
                                             cfg.data.sample_rate /
                                             cfg.preprocessing.hop_length)]

    audio = model.inference(spectrogram).squeeze(0)
    audio = torchaudio.functional.mu_law_decoding(audio, model.n_mu_law)
    torchaudio.save(str(output_audio_path), audio.detach().cpu(),
                    sample_rate=cfg.data.sample_rate)

    plt.plot(audio[0].detach().cpu().numpy())
    plt.savefig(output_plot_path)
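# A small standalone sketch of the mu-law step used above: a waveform in [-1, 1]
# is quantized to 256 integer levels and decoded back. The tone and the channel
# count are illustrative; `model.n_mu_law` above plays the role of
# `quantization_channels`.
import math
import torch
import torchaudio.functional as F

x = torch.sin(2 * math.pi * 220 * torch.arange(16000) / 16000)      # test tone in [-1, 1]
x_mu = F.mu_law_encoding(x, quantization_channels=256)              # integer values in [0, 255]
x_rec = F.mu_law_decoding(x_mu, quantization_channels=256)          # back to roughly [-1, 1]
print((x - x_rec).abs().max())                                      # small quantization error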
def forward(self, input_path: str, output_path: str):
    torchaudio.sox_effects.init_sox_effects()

    # 1. Load audio
    waveform, sample_rate = torchaudio.load(input_path)

    # 2. Add background noise
    alpha = 0.01
    waveform = alpha * torch.randn_like(waveform) + (1 - alpha) * waveform

    # 3. Resample the RIR filter to match the audio sample rate
    rir, _ = torchaudio.sox_effects.apply_effects_tensor(
        self.rir, self.rir_sample_rate, effects=[["rate", str(sample_rate)]])
    rir = rir / torch.norm(rir, p=2)
    rir = torch.flip(rir, [1])

    # 4. Apply RIR filter
    waveform = torch.nn.functional.pad(waveform, (rir.shape[1] - 1, 0))
    waveform = torch.nn.functional.conv1d(waveform[None, ...], rir[None, ...])[0]

    # Save
    torchaudio.save(output_path, waveform, sample_rate)
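# Standalone sketch of steps 3-4 above: conv1d computes a correlation, so the
# impulse response is flipped and the input is left-padded to obtain a causal,
# same-length convolution. The RIR here is a synthetic decaying noise burst used
# purely for illustration.
import torch
import torch.nn.functional as Fnn

waveform = torch.randn(1, 16000)                                    # (channels, time)
rir = torch.randn(1, 800) * torch.exp(-torch.arange(800) / 100.0)   # fake room impulse response
rir = rir / torch.norm(rir, p=2)                                    # unit-energy filter
rir = torch.flip(rir, [1])                                          # flip so conv1d convolves

padded = Fnn.pad(waveform, (rir.shape[1] - 1, 0))
reverbed = Fnn.conv1d(padded[None, ...], rir[None, ...])[0]
print(reverbed.shape)                                               # same length as the input: (1, 16000)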
def _test_1_save_sine(self):
    # save created file
    sinewave_filepath = os.path.join(self.test_dirpath, "assets", "sinewave.wav")
    sr = 16000
    freq = 440
    volume = 0.3
    y = torch.cos(2 * math.pi * torch.arange(0, 4 * sr).float() * freq / sr)
    y.unsqueeze_(0)
    # y is between -1 and 1, so must scale
    y = (y * volume * (2**31)).long()
    torchaudio.save(sinewave_filepath, y, sr)
    self.assertTrue(os.path.isfile(sinewave_filepath))

    # test precision
    new_precision = 32
    new_filepath = os.path.join(self.test_dirpath, "test.wav")
    si, ei = torchaudio.info(sinewave_filepath)
    torchaudio.save(new_filepath, y, sr, new_precision)
    si32, ei32 = torchaudio.info(new_filepath)
    self.assertEqual(si.precision, 16)
    self.assertEqual(si32.precision, new_precision)
    os.unlink(new_filepath)
def generate_background_noise(speech_commands):
    """Split the background noise provided by the dataset into 1-second chunks.

    Parameters:
        speech_commands (torch.utils.data.Dataset): Speech Commands dataset as defined by torchaudio.
    """
    background_noise = glob.glob(
        os.path.join(speech_commands._path, "_background_noise_", "*.wav"))
    os.makedirs(os.path.join(speech_commands._path, "background"), exist_ok=True)
    for file in background_noise:
        waveform, sample_rate = torchaudio.load(file)
        # split_size equal to the sample rate yields 1-second chunks; the last, shorter chunk is dropped
        background_waveforms = torch.split(waveform, sample_rate, dim=1)[:-1]
        for idx, background_waveform in enumerate(background_waveforms):
            torchaudio.save(
                os.path.join(
                    speech_commands._path,
                    "background",
                    f"{hash(waveform)}_nohash_{idx}.wav",
                ),
                background_waveform,
                sample_rate=sample_rate,
            )
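# Quick illustration of the chunking above: torch.split with a split size equal
# to the sample rate returns 1-second pieces, and slicing off the final element
# drops the shorter remainder. The waveform here is synthetic noise.
import torch

sample_rate = 16000
waveform = torch.randn(1, int(2.5 * sample_rate))                   # 2.5 s of noise
chunks = torch.split(waveform, sample_rate, dim=1)[:-1]
print([c.shape for c in chunks])                                    # two chunks of shape (1, 16000)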
def run_inference(text, audio=None):
    generator.eval()
    text = [ord(c) for c in text if ord(c) < 256]
    text = torch.tensor(text).view(1, -1)
    with torch.no_grad():
        text = text.to(device)
        pad_mask = (text != 0).to(device)
        res, before_prenet, stop_token, attn_matrix = generator(text, pad_mask, None, device)

    wandb_gen = wandb.Image(res[0, :, :].detach().cpu().numpy(), caption="Generated")
    wandb_attn = wandb.Image(256 * attn_matrix[0, :, :].detach().cpu().numpy(),
                             caption="Attention")
    wandb_images = [wandb_gen, wandb_attn]

    audio_gen = vocoder.inference(res[:1, :, :].detach().cpu())
    torchaudio.save("gen.wav", audio_gen, sample_rate=22050)
    wandb_audios = [wandb.Audio("gen.wav", caption="Generated", sample_rate=22050)]

    if audio is not None:
        wandb_real = wandb.Image(audio[0, :, :].detach().cpu().numpy(), caption="Real")
        wandb_images.append(wandb_real)
        audio_real = vocoder.inference(audio[:1, :, :])
        torchaudio.save("temp_real.wav", audio_real, sample_rate=22050)
        wandb_audios.append(wandb.Audio("temp_real.wav", caption="Real", sample_rate=22050))

    wandb.log({"mels": wandb_images}, step=0)
    wandb.log({"audios": wandb_audios}, step=0)
    api.flush()
def _test_4_load_partial(self):
    num_frames = 101
    offset = 201

    # load entire mono sinewave wav file, load a partial copy and then compare
    input_sine_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
    x_sine_full, sr_sine = torchaudio.load(input_sine_path)
    x_sine_part, _ = torchaudio.load(input_sine_path, num_frames=num_frames, offset=offset)
    l1_error = x_sine_full[:, offset:(num_frames + offset)].sub(x_sine_part).abs().sum().item()
    # test for the correct number of samples and that the correct portion was loaded
    self.assertEqual(x_sine_part.size(1), num_frames)
    self.assertEqual(l1_error, 0.)

    # create a two channel version of this wavefile
    x_2ch_sine = x_sine_full.repeat(1, 2)
    out_2ch_sine_path = os.path.join(self.test_dirpath, 'assets', '2ch_sinewave.wav')
    torchaudio.save(out_2ch_sine_path, x_2ch_sine, sr_sine)
    x_2ch_sine_load, _ = torchaudio.load(out_2ch_sine_path, num_frames=num_frames, offset=offset)
    os.unlink(out_2ch_sine_path)
    l1_error = x_2ch_sine_load.sub(x_2ch_sine[:, offset:(offset + num_frames)]).abs().sum().item()
    self.assertEqual(l1_error, 0.)

    # test with two channel mp3
    x_2ch_full, sr_2ch = torchaudio.load(self.test_filepath, normalization=True)
    x_2ch_part, _ = torchaudio.load(self.test_filepath, normalization=True,
                                    num_frames=num_frames, offset=offset)
    l1_error = x_2ch_full[:, offset:(offset + num_frames)].sub(x_2ch_part).abs().sum().item()
    self.assertEqual(x_2ch_part.size(1), num_frames)
    self.assertEqual(l1_error, 0.)

    # check behavior if number of samples would exceed file length
    offset_ns = 300
    x_ns, _ = torchaudio.load(input_sine_path, num_frames=100000, offset=offset_ns)
    self.assertEqual(x_ns.size(1), x_sine_full.size(1) - offset_ns)

    # check when offset is beyond the end of the file
    with self.assertRaises(RuntimeError):
        torchaudio.load(input_sine_path, offset=100000)
def extract_wav_files(data_dir, params_list, clip_format, sample_rate, output_dir):
    clip_dir = 'wavs'
    clip_ext = clip_format
    os.makedirs(os.path.join(output_dir, clip_dir), exist_ok=True)
    max_int16 = torch.iinfo(torch.int16).max
    for params in params_list:
        id_ = params['id']
        metadata_file = os.path.join('data', f'{id_}.metadata.txt')
        audio_dir = os.path.join('data', f'{id_}')
        with open(metadata_file, 'rt') as metadata_f:
            current_file = None
            current_audio = None
            for line in metadata_f:
                parts = line.rstrip('\r\n').split('|')
                id_, audio_file, audio_start, audio_end, _, _ = parts
                audio_start, audio_end = int(audio_start), int(audio_end)
                if current_file != audio_file:
                    file = os.path.join(audio_dir, audio_file)
                    print(f'\rReading {file}', end='')
                    y, sr = torchaudio.load(file)
                    assert len(y.shape) == 2 and y.shape[0] == 1
                    assert y.dtype == torch.float32
                    assert sr == sample_rate
                    y = (y * max_int16 / torch.max(torch.abs(y))).to(torch.int16)
                    current_file = audio_file
                    current_audio = y
                output_file = os.path.join(output_dir, clip_dir, f'{id_}.{clip_ext}')
                y = current_audio[:, audio_start:audio_end]
                torchaudio.save(output_file, y, sample_rate)
def predict_from_mel(ctx: Context, squeeze_wave_checkpoint: str, output_dir: str):
    config: Config = ctx.obj["CONFIG"]
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    on_gpu = torch.cuda.is_available()
    squeeze_wave = LitSqueezeWave.load_from_checkpoint(
        squeeze_wave_checkpoint, config=config, on_gpu=False)
    squeeze_wave = SqueezeWave.remove_norms(squeeze_wave.model)
    squeeze_wave = squeeze_wave.eval()

    trump_spec = torch.load('data/preprocessed-tacotron2/mel/speech00_0000.pt')
    lj_spec = torch.load('data/lj-speech-tacotron2/mel/LJ001-0001.pt')
    prefix = str(Path(squeeze_wave_checkpoint).name)
    for spec, suffix in zip([trump_spec, lj_spec], ["trump", "lj"]):
        audio = squeeze_wave.infer(spec)
        audio_path = output_dir / f"{prefix}-{suffix}.wav"
        torchaudio.save(str(audio_path), audio.cpu(),
                        sample_rate=config.dataset.audio_format.sampling_rate)
    print(f"Results saved to {output_dir}")
def _preprocess_files(self) -> None:
    """Segment the waveform files based on :py:attr:`segment_len` and cache the file-segments."""
    if self.segment_len is None:
        return
    processed_audio_dir = self.base_dir / f"segment_len={self.segment_len}"
    processed_audio_dir.mkdir(parents=True, exist_ok=True)

    waveform_paths = self.base_dir / self.metadata["filePath"]  # type: ignore

    segment_filenames: List[Tuple[str, str]] = []
    for path in tqdm(waveform_paths, desc="Preprocessing", colour=self._PBAR_COL):
        waveform_filename = path.stem
        waveform, sr = torchaudio.load(path)  # type: ignore
        audio_len = waveform.size(-1) / sr
        frac_remainder, num_segments = math.modf(audio_len / self.segment_len)
        num_segments = int(num_segments)
        if frac_remainder >= 0.5:
            self.logger.debug(
                f"Length of audio file '{path.resolve()}' is not integer-divisible by "
                f"{self.segment_len}: terminally zero-padding the file along the "
                f"time-axis to compensate.",
            )
            padding = torch.zeros(
                waveform.size(0),
                int((self.segment_len - (frac_remainder * self.segment_len)) * sr),
            )
            waveform = torch.cat((waveform, padding), dim=-1)
            num_segments += 1
        if 0 < frac_remainder < 0.5:
            self.logger.debug(
                f"Length of audio file '{path.resolve()}' is not integer-divisible by "
                f"{self.segment_len} and not of sufficient length to be padded "
                f"(fractional remainder must be greater than 0.5): discarding terminal segment.",
            )
            waveform = waveform[:, :int(num_segments * self.segment_len * sr)]

        waveform_segments = waveform.chunk(chunks=num_segments, dim=-1)
        for seg_idx, segment in enumerate(waveform_segments):
            segment_filename = f"{waveform_filename}_{seg_idx}.wav"
            segment_filepath = processed_audio_dir / segment_filename
            torchaudio.save(  # type: ignore
                filepath=segment_filepath,
                src=segment,
                sample_rate=sr,
            )
            segment_filenames.append(
                (waveform_filename, str(segment_filepath.relative_to(self.base_dir))))

    pd.DataFrame(segment_filenames, columns=["fileName", "filePath"]).to_csv(
        processed_audio_dir / "filepaths.csv", index=False)
def main():
    args = get_args()

    # Load in model from checkpoint
    model = load_model("demucs_quantized").to(args.device)

    # Initialize output path
    out = Path("separated") / "demucs_quantized"
    out.mkdir(parents=True, exist_ok=True)
    print(f"Separated tracks will be stored in {out.resolve()}")

    for track in args.tracks:
        if not track.exists():
            print(f"File {track} does not exist.")
            continue
        print(f"Separating track {track}")

        # Load audio and run the model
        wav, sr = torchaudio.load(str(track))
        wav = wav.to(args.device)
        wav, ref = preprocess_and_normalize_audio(wav, sr, model.audio_channels, model.samplerate)
        sources = run_model_with_splits_and_shifts(model, wav, split=True)
        sources = sources * ref.std() + ref.mean()

        # Save outputs
        track_folder = out / track.name.rsplit(".", 1)[0]
        track_folder.mkdir(exist_ok=True)
        for source, name in zip(sources, model.sources):
            source = source / max(1.01 * source.abs().max(), 1)
            source = source.cpu()
            wavname = str(track_folder / f"{name}.wav")
            torchaudio.save(wavname, source, sample_rate=model.samplerate)
def resample_folder(input_folder, output_folder, fs, regex):
    files = get_all_files(input_folder, match_and=[regex])
    torchaudio.initialize_sox()
    for f in tqdm.tqdm(files):
        # we use sox because torchaudio.Resample uses too much RAM.
        resample = torchaudio.sox_effects.SoxEffectsChain()
        resample.append_effect_to_chain("rate", [fs])
        resample.set_input_file(f)
        audio, fs = resample.sox_build_flow_effects()
        # scale back otherwise you get empty .wav file
        audio = audio / torch.max(torch.abs(audio), dim=-1, keepdim=True)[0]
        os.makedirs(
            Path(os.path.join(output_folder, Path(f).relative_to(Path(input_folder)))).parent,
            exist_ok=True,
        )
        torchaudio.save(
            os.path.join(output_folder, Path(f).relative_to(Path(input_folder))),
            audio,
            fs,
        )
    torchaudio.shutdown_sox()
def compute_objectives(self, predict_wavs, batch, stage):
    """Computes the loss given the predicted and targeted outputs"""
    clean_wavs, lens = batch.clean_sig

    loss = self.hparams.compute_cost(predict_wavs, clean_wavs, lens)
    self.loss_metric.append(
        batch.id, predict_wavs, clean_wavs, lens, reduction="batch"
    )

    if stage != sb.Stage.TRAIN:
        # Evaluate speech quality/intelligibility
        self.stoi_metric.append(
            batch.id, predict_wavs, clean_wavs, lens, reduction="batch"
        )
        self.pesq_metric.append(
            batch.id, predict=predict_wavs, target=clean_wavs, lengths=lens
        )

        # Write wavs to file
        if stage == sb.Stage.TEST:
            lens = lens * clean_wavs.shape[1]
            for name, pred_wav, length in zip(batch.id, predict_wavs, lens):
                name += ".wav"
                enhance_path = os.path.join(self.hparams.enhanced_folder, name)
                pred_wav = pred_wav / torch.max(torch.abs(pred_wav)) * 0.99
                torchaudio.save(
                    enhance_path,
                    torch.unsqueeze(pred_wav[: int(length)].cpu(), 0),
                    16000,
                )
    return loss
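# Standalone sketch of the peak-normalise-then-save step above: scaling to 0.99
# of the absolute peak keeps the signal just inside [-1, 1] so the WAV written by
# torchaudio.save does not clip. The signal and file name are illustrative.
import torch
import torchaudio

pred_wav = 3.7 * torch.randn(16000)                                 # enhanced signal, arbitrary scale
pred_wav = pred_wav / torch.max(torch.abs(pred_wav)) * 0.99
torchaudio.save("enhanced.wav", pred_wav.unsqueeze(0), 16000)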
def write_audio(filepath, audio, samplerate):
    """Write audio to disk. It is basically a wrapper to support saving
    audio signals in the speechbrain format (audio, channels).

    Arguments
    ---------
    filepath: path
        Path where to save the audio file.
    audio : torch.Tensor
        Audio file in the expected speechbrain format (signal, channels).
    samplerate: int
        Sample rate (e.g., 16000).

    Example
    -------
    >>> import os
    >>> tmpfile = os.path.join(str(getfixture('tmpdir')), "wave.wav")
    >>> dummywav = torch.rand(16000, 2)
    >>> write_audio(tmpfile, dummywav, 16000)
    >>> loaded = read_audio(tmpfile)
    >>> loaded.allclose(dummywav, atol=1e-4)  # replace with eq with sox_io backend
    True
    """
    if len(audio.shape) == 2:
        audio = audio.transpose(0, 1)
    elif len(audio.shape) == 1:
        audio = audio.unsqueeze(0)
    torchaudio.save(filepath, audio, samplerate)
def test_audio_caching_disabled_works():
    lhotse.set_caching_enabled(False)  # Disable caching.

    np.random.seed(89)  # Reproducibility.

    # Prepare two different waveforms.
    noise1 = np.random.rand(1, 32000).astype(np.float32)
    noise2 = np.random.rand(1, 32000).astype(np.float32)
    # Sanity check -- the noises are different.
    assert np.abs(noise1 - noise2).sum() != 0

    # Save the first waveform in a file.
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(noise1), sample_rate=16000)
        recording = Recording.from_file(f.name)

        # Read the audio -- should be equal to noise1.
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise1)

        # Save noise2 to the same location.
        torchaudio.save(f.name, torch.from_numpy(noise2), sample_rate=16000)

        # Read the audio -- should be equal to noise2,
        # and the caching is ignored (doesn't happen).
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise2)
def main():
    weights_dir: str = os.path.expanduser("~/git/cherokee-diffwave/models/")
    cd_script_dir()
    model_pt = os.path.join(weights_dir, "weights.pt")

    npy_files: List[str] = list()
    npy_files.extend(sorted(glob.glob("?.npy")))
    npy_files.extend(sorted(glob.glob("??.npy")))
    npy_files.extend(sorted(glob.glob("???.npy")))
    npy_files.extend(sorted(glob.glob("????.npy")))
    npy_files.extend(sorted(glob.glob("?????.npy")))

    bar: ProgressBar = progressbar.ProgressBar(maxval=len(npy_files))
    bar.start()

    npy_wav_files: List[Tuple[str, str]] = list()
    for npy_file in npy_files:
        wav_file = f"wg-{os.path.splitext(npy_file)[0]}.wav"
        npy_wav_files.append((npy_file, wav_file))
        if os.path.isfile(wav_file):
            os.remove(wav_file)

    for npy_file, wav_file in npy_wav_files:
        nd_array = numpy.load(npy_file)
        spectrogram: Tensor = torch.from_numpy(nd_array).float()
        spectrogram = torch.clamp((spectrogram + 100) / 100, 0.0, 1.0)
        audio, sr = diffwave_predict(spectrogram, model_pt, device=torch.device("cuda"))
        torchaudio.save(wav_file, audio.cpu(), sample_rate=sr)
        bar.update(bar.currval + 1)
    bar.finish()
def _save_audio(self, audios, fs, feature_names, save2):
    for feature_name in feature_names:
        audio = audios[feature_name]
        audio = audio.squeeze(0) if len(audio.shape) > 2 else audio
        torchaudio.save(
            pjoin(save2, feature_name) + '.wav', audio, fs['audio_sr'].item())
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('Using device: {}'.format(device))

    data, sample_rates = load(enforce_samplerate=44100)
    print(data)

    # 10 seconds of audio
    seq_len = 60

    print('Loading trained model...')
    model = load_model_from_checkpoint(TRAINED_STATE, device)

    print('Performing inference...')
    prev, _ = get_batch(data, 10, 1, device, segment_size=44100, full=True)

    print('Encoding seed sequence')
    hidden = model.encode(prev)

    print('Producing sequence')
    audio = torch.clamp(model.decode(hidden, seq_len), -1, 1)

    print('Saving result')
    np.save(os.path.join(OUTPUT_DIR, 'prediction.npy'), audio[0].detach().cpu().numpy())
    if not IS_WINDOWS:
        import torchaudio
        torchaudio.save("prediction.mp3", torch.stack((audio[0], audio[0])), sample_rates[0])

    plt.plot(audio[0].detach().cpu().numpy())
    plt.show()
    return audio
def segment_audio(
    audio_path: str,
    channel: int,
    start: int,
    end: int,
    save_path: str,
    sample_rate: int = 16000,
    device: str = "cpu",
):
    """Segment and resample audio."""
    # convert start/end (hundredths of a second) to sample indices at the original 8 kHz rate
    start = int(start / 100 * 8000)
    end = int(end / 100 * 8000)
    num_frames = end - start
    data, _ = torchaudio.load(audio_path, frame_offset=start, num_frames=num_frames)
    resampler = Resample(orig_freq=8000, new_freq=sample_rate).to(device=device)
    data = resampler(data)
    data = torch.unsqueeze(data[channel], 0)
    torchaudio.save(save_path, src=data, sample_rate=sample_rate)
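# A minimal standalone sketch of the resampling step above, using
# torchaudio.transforms.Resample to go from 8 kHz to 16 kHz; the input here is
# synthetic noise rather than a loaded segment.
import torch
from torchaudio.transforms import Resample

data = torch.randn(2, 8000)                                         # 1 s of 2-channel audio at 8 kHz
resampler = Resample(orig_freq=8000, new_freq=16000)
resampled = resampler(data)
print(resampled.shape)                                              # torch.Size([2, 16000])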
def write_wavs(self, batch_id, wavs, score, lens):
    """Write wavs to files, for historical discriminator training

    Arguments
    ---------
    batch_id : list of str
        A list of the utterance ids for the batch
    wavs : torch.Tensor
        The wavs to write to files
    score : torch.Tensor
        The actual scores for the corresponding utterances
    lens : torch.Tensor
        The relative lengths of each utterance
    """
    lens = lens * wavs.shape[1]
    record = {}
    for i, (name, pred_wav, length) in enumerate(zip(batch_id, wavs, lens)):
        path = os.path.join(self.hparams.MetricGAN_folder, name + ".wav")
        data = torch.unsqueeze(pred_wav[: int(length)].cpu(), 0)
        torchaudio.save(path, data, self.hparams.Sample_rate)

        # Make record of path and score for historical training
        score = float(score[i][0])
        record[name] = {
            "enh_wav": path,
            "score": score,
        }

    # Update records for historical training
    self.historical_set.update(record)

    with open(self.hparams.historical_file, "wb") as fp:  # Pickling
        pickle.dump(self.historical_set, fp)
def setUpClass(cls):
    if not os.path.exists(cls._AUDIO_DATA_DIR):
        os.makedirs(cls._AUDIO_DATA_DIR)
    if not os.path.exists(cls._AUDIO_LIST_DIR):
        os.makedirs(cls._AUDIO_LIST_DIR)

    with open(cls._JUNK_FILE, "w") as f:
        f.write("this is some garbage\nShould have no impact.")

    with open(cls._AUDIO_LIST_PATHS_PATH, "w") as f_list_fnames, \
            open(cls._AUDIO_LIST_FNAMES_PATH, "w") as f_list_paths:
        lengths = torch.randint(int(.5e5), int(1.5e6), (cls._N_EXAMPLES,))
        for i in range(cls._N_EXAMPLES):
            # dividing gets the noise in [-1, 1]
            white_noise = torch.randn((cls._N_CHANNELS, lengths[i])) / 10
            f_path = cls._AUDIO_DATA_PATH_FMT.format(i)
            torchaudio.save(f_path, white_noise, cls._SAMPLE_RATE)
            f_name_short = cls._AUDIO_DATA_FMT.format(i)
            f_list_fnames.write(f_name_short + "\n")
            f_list_paths.write(f_path + "\n")
import argparse

import torch
import torchaudio

from data.data_loader import load_audio, NoiseInjection

parser = argparse.ArgumentParser()
parser.add_argument('--input-path', default='input.wav',
                    help='The input audio to inject noise into')
parser.add_argument('--noise-path', default='noise.wav',
                    help='The noise file to mix in')
parser.add_argument('--output-path', default='output.wav',
                    help='Path to save the mixed output audio to')
parser.add_argument('--sample-rate', default=16000,
                    help='Sample rate to save output as')
parser.add_argument('--noise-level', type=float, default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

noise_injector = NoiseInjection()
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)