Example #1
def main():
    config = ConfigXT()
    load = FileXT(config.audio_path)

    print(
        colored('Preprocessing audio for ', 'blue', attrs=['bold']) +
        load.basename)
    data = preprocess.preprocess(load.filename,
                                 config.speaker,
                                 config,
                                 verbose=False)
    if config.infer_acoustic:
        f0, rmse = acoustic_infer(data, config)
        for i in range(len(data)):
            data[i][4] = f0[i]
            data[i][5] = rmse[i]

    mel = durian_infer(data, config)

    if config.vocoder == 'wavernn':
        wave = wavernn_infer(mel, config)
    elif config.vocoder == 'waveglow':
        wave = waveglow_infer(mel, config)

    savename = config.durian_path.replace('.pt', '_') + FileXT(
        config.vocoder_path).basestem + '_speaker' + str(
            config.speaker) + '_' + load.basename
    torchaudio.save(savename, wave, config.sample_rate)

    print(
        colored('Audio generated to ', 'blue', attrs=['bold']) + savename +
        '\n')
Example #2
def test_vae(test_fns):
    """
    Used with a trained model, where MODEL_FN is already a saved .pth file


    """
    d = prep(test_fns)
    y_hats = []
    length = len(d['data'])
    img_y_hats = []
    for epoch in range(1, EPOCHS + 1):
        print(f'test epoch: {epoch}')

        train.train_epoch(d, epoch, BATCH_SIZE, device)
        apply_idx = random.randint(0, length - 1)  # randint is inclusive on both ends
        sample = d['data'][apply_idx].view(BATCH_SIZE, 2, -1)
        y_hat = utilz.gen_apply(d['m'], sample, device).cpu()
        print(f'y_hat.shape: {y_hat.shape}')

        img_vers = y_hat.view(1, 3, 240, 245) # for 2 seconds
        img_y_hats.append(img_vers)

        y_hats.append(y_hat)

    song = torch.cat(y_hats, dim=1)
    print(song)

    if SAVE_SONG:
        save_wavfn = f'vaeconv_{RUN_TIME}.wav'
        song_path = d['path'] + save_wavfn
        torchaudio.save(song_path, song, d['sr'])
        print(f'audio saved to {song_path}')


    return song  # , video
Example #3
def test_filter():
    audio, sr = torchaudio.load("tests/classical.00002.wav")
    num_samples = sr * 5
    transform = Compose([HighLowPass(sample_rate=sr)], )
    audios = transform(audio)
    torchaudio.save("tests/filter.wav", audios, sample_rate=sr)
    assert audios.shape[1] == audio.shape[1]
Example #4
def infer_wavenet(args):
    import sys
    sys.path.append('thirdparty/wavenet_vocoder')

    from train import build_model
    from synthesis import wavegen
    from tqdm import tqdm
    target_sample_rate = 22050

    hparams, model = load_model(args.model_name)
    meller = MelSpectrogram()
    files = [
        item for item in os.listdir(args.folder_in) if item.endswith('wav')
    ]
    for idx, audio in enumerate(files):
        wav_path = os.path.join(args.folder_in, audio)
        wav = load_wav(wav_path, target_sample_rate)
        c = meller(wav)[0]
        if c.shape[1] != hparams.num_mels:
            c = c.transpose(0, 1)
        # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
        # c = np.interp(c, (0, 4), (0, 1))

        # Generate
        waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
        path = os.path.join(args.folder_out, audio)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        torchaudio.save(path, waveform, hparams.sample_rate)
Example #5
def main(args):
    # spectrogram = torch.from_numpy(np.load(args.spectrogram_path))
    params = {}
    if args.noise_schedule:
        params['noise_schedule'] = torch.from_numpy(
            np.load(args.noise_schedule))
    start_time = time.time()
    model = load_model(model_dir=args.model_dir, params=params)
    if not os.path.exists(args.output_path):
        os.mkdir(args.output_path)
    time_consuming = time.time() - start_time
    print(" > Load model, time consuming {}s".format(round(time_consuming, 2)))
    mels = os.listdir(args.spectrogram_path)
    for mel_file in mels:
        if not os.path.isdir(mel_file) and mel_file.endswith(".pt"):
            start_time = time.time()
            print(" > Start inferencing sentence {} . ".format(mel_file))
            spectrogram = torch.tensor(
                torch.load(os.path.join(args.spectrogram_path, mel_file)))
            audio, sr = predict(spectrogram, model)
            wav_name = os.path.join(args.output_path,
                                    mel_file[:-7] + "_wg.wav")
            torchaudio.save(wav_name, audio.cpu(), sample_rate=sr)
            time_consuming = time.time() - start_time
            print(" > Complete, time consuming {}s".format(
                round(time_consuming, 2)))
Example #6
def inference_fast(mels, real_audio=None, i=0, epoch=0):
    generator.eval()
    val_losses = []
    audio = torch.zeros(mels.shape[0], 1, 1).to(device)
    with torch.no_grad():
        for length in range(1, mels.shape[2] + 1):
            res = generator(audio[:, :, -1:],
                            mels[:, :, length - 1:length],
                            fast_generation=True)
            audio = torch.cat([
                audio,
                mu_decode(torch.argmax(res[:, :, -1:], dim=1)).unsqueeze(1)
            ],
                              dim=2)
        name = "example " + str(i) + "_no_tf"
        torchaudio.save("gen.wav",
                        audio[0].squeeze().detach().cpu().numpy(),
                        sample_rate=22050)
        wandb_gen = wandb.Audio(audio[0].squeeze().detach().cpu().numpy(),
                                caption="Inference_1",
                                sample_rate=22050)
        wandb_audios = [wandb_gen]
        if real_audio is not None:
            wandb_real = wandb.Audio(
                real_audio[0].squeeze().detach().cpu().numpy(),
                caption="Real_1",
                sample_rate=22050)
            wandb_audios.append(wandb_real)
        wandb.log({name: wandb_audios}, step=epoch)
Example #7
def resample_folder(in_dir, out_dir, target_fs=16000, regex="*.wav"):
    """
    Resamples the audio files contained in the in_dir folder and saves them in out_dir folder

    Args:
        in_dir (str): path to audio directory (audio to be resampled)
        out_dir (str): path to audio resampled directory
        target_fs (int, optional): target sample rate. Defaults to 16000.
        regex (str, optional): regular expression for extension of file. Defaults to "*.wav".
    """
    compute = True
    files = glob.glob(os.path.join(in_dir, regex))
    if os.path.exists(out_dir):
        out_files = glob.glob(os.path.join(out_dir, regex))
        if len(files) == len(out_files):
            compute = False

    if compute:
        for f in tqdm.tqdm(files):
            audio, orig_fs = torchaudio.load(f)
            audio = resample(audio, orig_fs, target_fs)

            os.makedirs(
                Path(os.path.join(out_dir,
                                  Path(f).relative_to(Path(in_dir)))).parent,
                exist_ok=True,
            )
            torchaudio.save(
                os.path.join(out_dir,
                             Path(f).relative_to(Path(in_dir))),
                audio,
                target_fs,
            )
    return compute
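A minimal usage sketch for resample_folder; the directory names below are placeholders:
# Resample every .wav under data/audio_44k into data/audio_16k at 16 kHz (hypothetical paths).
resample_folder("data/audio_44k", "data/audio_16k", target_fs=16000, regex="*.wav")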
Example #8
    def asadsa2():
        """ """
        from draugr.torch_utilities import to_tensor
        from neodroidaudition.data.recognition.libri_speech import LibriSpeech
        from neodroidaudition.noise_generation.gaussian_noise import white_noise
        import torchaudio
        from pathlib import Path

        libri_speech = LibriSpeech(path=Path.home() / "Data" / "Audio" /
                                   "Speech" / "LibriSpeech")
        files, sr = zip(*[(v[0].numpy(), v[1])
                          for _, v in zip(range(1), libri_speech)])
        assert all([sr[0] == s for s in sr[1:]])

        normed = files[0]
        mixed = mix_ratio(normed, normed, 0)
        mixed2 = mix_ratio(mixed, mixed, 0)
        print(normed, mixed)
        print(mixed2, mixed)
        print(root_mean_square(normed))
        print(root_mean_square(mixed))
        print(root_mean_square(mixed2))
        assert numpy.allclose(normed, mixed)
        assert numpy.allclose(mixed2, mixed)
        torchaudio.save(
            str(ensure_existence(Path.cwd() / "exclude") / "mixed_same.wav"),
            to_tensor(mixed),
            int(sr[0]),
        )
Example #9
def convert(path):
    split_list = []
    with open(join(path, 'hub5e_00.pem'), 'r') as fp:
        for line in fp:
            if line[0] not in ['e', 's']: continue
            if line[-1] == '\n': line = line[:-1]
            line = line.split(' ')
            name = line[0] + '-' + line[1] + '.wav'
            time_0 = float(line[3])
            i = 4
            while len(line[i]) <= 1:
                i += 1
            time_1 = float(line[i])
            split_list.append((name, time_0, time_1))
    print('From {} found {} segments'.format(join(path, 'hub5e_00.pem'),
                                             len(split_list)))
    split_list = sorted(split_list, key=lambda x: x[0])
    file_list = list(Path(path).rglob('*.wav'))
    file_list = sorted(file_list)
    print('From {} found {} wav files.'.format(path, len(file_list)))
    j = 0
    for idx, p in tqdm(enumerate(file_list)):
        waveform, sr = torchaudio.load(str(p))
        count = 0
        while j < len(split_list) and split_list[j][0] == str(p).split('/')[-1]:
            t_0, t_1 = split_list[j][1], split_list[j][2]
            split_wavform = waveform[:, int(t_0 * sr):int(t_1 * sr) + 1]
            torchaudio.save(
                str(p)[:-4] + '-' + str(count).zfill(3) + '.wav',
                split_wavform, sr)
            j += 1
            count += 1
            if j == len(split_list): break
    print('Finished writing {} of {} segments.'.format(j, len(split_list)))
Example #10
def save_audio(wav,
               path,
               samplerate,
               bitrate=320,
               clip='rescale',
               bits_per_sample=16,
               as_float=False):
    """Save audio file, automatically preventing clipping if necessary
    based on the given `clip` strategy. If the path ends in `.mp3`, this
    will save as mp3 with the given `bitrate`.
    """
    wav = prevent_clip(wav, mode=clip)
    path = Path(path)
    suffix = path.suffix.lower()
    if suffix == ".mp3":
        encode_mp3(wav, path, samplerate, bitrate)
    elif suffix == ".wav":
        if as_float:
            bits_per_sample = 32
            encoding = 'PCM_F'
        else:
            encoding = 'PCM_S'
        ta.save(str(path),
                wav,
                sample_rate=samplerate,
                encoding=encoding,
                bits_per_sample=bits_per_sample)
    else:
        raise ValueError(f"Invalid suffix for path: {suffix}")
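A hedged usage sketch for save_audio above; the tensor contents and file names are placeholders for illustration:
import torch

wav = torch.zeros(2, 44100)  # placeholder: one second of stereo silence
save_audio(wav, "mix.wav", samplerate=44100)                       # 16-bit PCM WAV
save_audio(wav, "mix_float.wav", samplerate=44100, as_float=True)  # 32-bit float WAV
save_audio(wav, "mix.mp3", samplerate=44100, bitrate=192)          # MP3 at 192 kbps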
Example #11
def test_cut_load_custom_recording_pad_left():
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(1, compute_num_samples(
        duration, sampling_rate)).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have an "alignment" attribute,
        #       and a "load_alignment()" method.
        #       We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        cut_pad = cut.pad(duration=60.0, direction="left")

        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        np.testing.assert_almost_equal(0, restored_audio[:, :-audio.shape[1]])
        np.testing.assert_almost_equal(audio, restored_audio[:,
                                                             -audio.shape[1]:])
Example #12
def main(cfg: DictConfig):
    spectrogram_path = Path(
        hydra.utils.to_absolute_path(cfg.inference.spectrogram_path))
    output_path = Path(
        hydra.utils.to_absolute_path(cfg.inference.inferenced_path))
    output_audio_path = output_path / (spectrogram_path.stem + ".wav")
    output_plot_path = output_path / (spectrogram_path.stem + ".png")

    model = core.model.WaveNet.load_from_checkpoint(
        hydra.utils.to_absolute_path(cfg.inference.checkpoint_path))
    model = model.eval().to(cfg.inference.device)

    spectrogram = torch.load(spectrogram_path).to(model.device)
    if len(spectrogram.shape) < 3:
        spectrogram = spectrogram.unsqueeze(0)
    if cfg.inference.cut_size is not None:
        spectrogram = spectrogram[:, :, :int(cfg.inference.cut_size *
                                             cfg.data.sample_rate /
                                             cfg.preprocessing.hop_length)]
    audio = model.inference(spectrogram).squeeze(0)
    audio = torchaudio.functional.mu_law_decoding(audio, model.n_mu_law)

    torchaudio.save(str(output_audio_path),
                    audio.detach().cpu(),
                    sample_rate=cfg.data.sample_rate)
    plt.plot(audio[0].detach().cpu().numpy())
    plt.savefig(output_plot_path)
Example #13
    def forward(self, input_path: str, output_path: str):
        torchaudio.sox_effects.init_sox_effects()

        # 1. load audio
        waveform, sample_rate = torchaudio.load(input_path)

        # 2. Add background noise
        alpha = 0.01
        waveform = alpha * torch.randn_like(waveform) + (1 - alpha) * waveform

        # 3. Resample the RIR filter to match the audio sample rate
        rir, _ = torchaudio.sox_effects.apply_effects_tensor(
            self.rir,
            self.rir_sample_rate,
            effects=[["rate", str(sample_rate)]])
        rir = rir / torch.norm(rir, p=2)
        rir = torch.flip(rir, [1])  # conv1d computes cross-correlation, so flip the RIR for true convolution

        # 4. Apply RIR filter
        waveform = torch.nn.functional.pad(waveform, (rir.shape[1] - 1, 0))
        waveform = torch.nn.functional.conv1d(waveform[None, ...], rir[None,
                                                                       ...])[0]

        # Save
        torchaudio.save(output_path, waveform, sample_rate)
Example #14
    def _test_1_save_sine(self):

        # save created file
        sinewave_filepath = os.path.join(self.test_dirpath, "assets",
                                         "sinewave.wav")
        sr = 16000
        freq = 440
        volume = 0.3

        y = (torch.cos(
            2 * math.pi * torch.arange(0, 4 * sr).float() * freq / sr))
        y.unsqueeze_(0)
        # y is between -1 and 1, so must scale
        y = (y * volume * (2**31)).long()
        torchaudio.save(sinewave_filepath, y, sr)
        self.assertTrue(os.path.isfile(sinewave_filepath))

        # test precision
        new_precision = 32
        new_filepath = os.path.join(self.test_dirpath, "test.wav")
        si, ei = torchaudio.info(sinewave_filepath)
        torchaudio.save(new_filepath, y, sr, new_precision)
        si32, ei32 = torchaudio.info(new_filepath)
        self.assertEqual(si.precision, 16)
        self.assertEqual(si32.precision, new_precision)
        os.unlink(new_filepath)
Example #15
def generate_background_noise(speech_commands):
    """Split the background noise provided by the dataset in 1 second chunks.

    Parameters:
        speech_commands (torch.utils.data.Dataset): Speech Command dataset as defined by torchaudio.
    """
    background_noise = glob.glob(
        os.path.join(speech_commands._path, "_background_noise_", "*.wav"))
    os.makedirs(os.path.join(speech_commands._path, "background"),
                exist_ok=True)

    for file in background_noise:
        waveform, sample_rate = torchaudio.load(file)
        background_waveforms = torch.split(waveform, sample_rate, dim=1)[:-1]

        for idx, background_waveform in enumerate(background_waveforms):
            torchaudio.save(
                os.path.join(
                    speech_commands._path,
                    "background",
                    f"{hash(waveform)}_nohash_{idx}.wav",
                ),
                background_waveform,
                sample_rate=sample_rate,
            )
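A usage sketch, assuming the standard torchaudio SPEECHCOMMANDS dataset; the root path and download flag are placeholders:
import torchaudio

speech_commands = torchaudio.datasets.SPEECHCOMMANDS(root="./data", download=True)
generate_background_noise(speech_commands)  # writes 1-second chunks into the dataset's "background" folder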
Example #16
def run_inference(text, audio=None):
    generator.eval()
    text = [ord(c) for c in text if ord(c) < 256]
    text = torch.tensor(text).view(1, -1)
    with torch.no_grad():
        text = text.to(device)
        pad_mask = (text != 0).to(device)
        res, before_prenet, stop_token, attn_matrix = generator(
            text, pad_mask, None, device)
        wandb_gen = wandb.Image(res[0, :, :].detach().cpu().numpy(),
                                caption="Generated")
        wandb_attn = wandb.Image(256 *
                                 attn_matrix[0, :, :].detach().cpu().numpy(),
                                 caption="Attention")
        wandb_images = [wandb_gen, wandb_attn]
        audio_gen = vocoder.inference(res[:1, :, :].detach().cpu())
        torchaudio.save("gen.wav", audio_gen, sample_rate=22050)
        wandb_audios = [
            wandb.Audio("gen.wav", caption="Generated", sample_rate=22050)
        ]
        if audio is not None:
            wandb_real = wandb.Image(audio[0, :, :].detach().cpu().numpy(),
                                     caption="Real")
            wandb_images.append(wandb_real)
            audio_real = vocoder.inference(audio[:1, :, :])
            torchaudio.save("temp_real.wav", audio_real, sample_rate=22050)
            wandb_audios.append(
                wandb.Audio("temp_real.wav", caption="Real",
                            sample_rate=22050))
        wandb.log({"mels": wandb_images}, step=0)
        wandb.log({"audios": wandb_audios}, step=0)
        api.flush()
Example #17
    def _test_4_load_partial(self):
        num_frames = 101
        offset = 201
        # load entire mono sinewave wav file, load a partial copy and then compare
        input_sine_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
        x_sine_full, sr_sine = torchaudio.load(input_sine_path)
        x_sine_part, _ = torchaudio.load(input_sine_path, num_frames=num_frames, offset=offset)
        l1_error = x_sine_full[:, offset:(num_frames + offset)].sub(x_sine_part).abs().sum().item()
        # test for the correct number of samples and that the correct portion was loaded
        self.assertEqual(x_sine_part.size(1), num_frames)
        self.assertEqual(l1_error, 0.)
        # create a two channel version of this wavefile
        x_2ch_sine = x_sine_full.repeat(1, 2)
        out_2ch_sine_path = os.path.join(self.test_dirpath, 'assets', '2ch_sinewave.wav')
        torchaudio.save(out_2ch_sine_path, x_2ch_sine, sr_sine)
        x_2ch_sine_load, _ = torchaudio.load(out_2ch_sine_path, num_frames=num_frames, offset=offset)
        os.unlink(out_2ch_sine_path)
        l1_error = x_2ch_sine_load.sub(x_2ch_sine[:, offset:(offset + num_frames)]).abs().sum().item()
        self.assertEqual(l1_error, 0.)

        # test with two channel mp3
        x_2ch_full, sr_2ch = torchaudio.load(self.test_filepath, normalization=True)
        x_2ch_part, _ = torchaudio.load(self.test_filepath, normalization=True, num_frames=num_frames, offset=offset)
        l1_error = x_2ch_full[:, offset:(offset + num_frames)].sub(x_2ch_part).abs().sum().item()
        self.assertEqual(x_2ch_part.size(1), num_frames)
        self.assertEqual(l1_error, 0.)

        # check behavior if number of samples would exceed file length
        offset_ns = 300
        x_ns, _ = torchaudio.load(input_sine_path, num_frames=100000, offset=offset_ns)
        self.assertEqual(x_ns.size(1), x_sine_full.size(1) - offset_ns)

        # check when offset is beyond the end of the file
        with self.assertRaises(RuntimeError):
            torchaudio.load(input_sine_path, offset=100000)
Example #18
def extract_wav_files(data_dir, params_list, clip_format, sample_rate, output_dir):

    clip_dir = 'wavs'
    clip_ext = clip_format

    os.makedirs(os.path.join(output_dir, clip_dir), exist_ok=True)
    max_int16 = torch.iinfo(torch.int16).max

    for params in params_list:
        id_ = params['id']
        metadata_file = os.path.join('data', f'{id_}.metadata.txt')
        audio_dir = os.path.join('data', f'{id_}')
        with open(metadata_file, 'rt') as metadata_f:
            current_file = None
            current_audio = None
            for line in metadata_f:
                parts = line.rstrip('\r\n').split('|')
                id_, audio_file, audio_start, audio_end, _, _ = parts
                audio_start, audio_end = int(audio_start), int(audio_end)
                if current_file != audio_file:
                    file = os.path.join(audio_dir, audio_file)
                    print(f'\rReading {file}', end='')
                    y, sr = torchaudio.load(file)
                    assert len(y.shape) == 2 and y.shape[0] == 1
                    assert y.dtype == torch.float32
                    assert sr == sample_rate
                    y = (y * max_int16 / torch.max(torch.abs(y))).to(torch.int16) 
                    current_file = audio_file
                    current_audio = y
                output_file = os.path.join(output_dir, clip_dir, f'{id_}.{clip_ext}')
                y = current_audio[:, audio_start:audio_end]
                torchaudio.save(output_file, y, sample_rate)
Example #19
def predict_from_mel(ctx: Context, squeeze_wave_checkpoint: str,
                     output_dir: str):
    config: Config = ctx.obj["CONFIG"]

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    on_gpu = torch.cuda.is_available()
    squeeze_wave = LitSqueezeWave.load_from_checkpoint(squeeze_wave_checkpoint,
                                                       config=config,
                                                       on_gpu=False)
    squeeze_wave = SqueezeWave.remove_norms(squeeze_wave.model)
    squeeze_wave = squeeze_wave.eval()

    trump_spec = torch.load('data/preprocessed-tacotron2/mel/speech00_0000.pt')
    lj_spec = torch.load('data/lj-speech-tacotron2/mel/LJ001-0001.pt')

    prefix = str(Path(squeeze_wave_checkpoint).name)
    for spec, suffix in zip([trump_spec, lj_spec], ["trump", "lj"]):
        audio = squeeze_wave.infer(spec)
        audio_path = output_dir / f"{prefix}-{suffix}.wav"
        torchaudio.save(str(audio_path),
                        audio.cpu(),
                        sample_rate=config.dataset.audio_format.sampling_rate)
    print(f"Results saved to {output_dir}")
Example #20
    def _preprocess_files(self) -> None:
        """
        Segment the waveform files based on :py:attr:`segment_len` and cache the file-segments.
        """
        if self.segment_len is None:
            return
        processed_audio_dir = self.base_dir / f"segment_len={self.segment_len}"
        processed_audio_dir.mkdir(parents=True, exist_ok=True)

        waveform_paths = self.base_dir / self.metadata[
            "filePath"]  # type: ignore
        segment_filenames: List[Tuple[str, str]] = []
        for path in tqdm(waveform_paths,
                         desc="Preprocessing",
                         colour=self._PBAR_COL):
            waveform_filename = path.stem
            waveform, sr = torchaudio.load(path)  # type: ignore
            audio_len = waveform.size(-1) / sr
            frac_remainder, num_segments = math.modf(audio_len /
                                                     self.segment_len)
            num_segments = int(num_segments)

            if frac_remainder >= 0.5:
                self.logger.debug(
                    f"Length of audio file '{path.resolve()}' is not integer-divisible by "
                    f"{self.segment_len}: terminally zero-padding the file along the "
                    f"time-axis to compensate.", )
                padding = torch.zeros(
                    waveform.size(0),
                    int((self.segment_len -
                         (frac_remainder * self.segment_len)) * sr),
                )
                waveform = torch.cat((waveform, padding), dim=-1)
                num_segments += 1
            if 0 < frac_remainder < 0.5:
                self.logger.debug(
                    f"Length of audio file '{path.resolve()}' is not integer-divisible by "
                    f"{self.segment_len} and not of sufficient length to be padded "
                    f"(fractional remainder must be greater than 0.5): discarding terminal segment.",
                )
                waveform = waveform[:, :int(num_segments * self.segment_len *
                                            sr)]

            waveform_segments = waveform.chunk(chunks=num_segments, dim=-1)
            for seg_idx, segment in enumerate(waveform_segments):
                segment_filename = f"{waveform_filename}_{seg_idx}.wav"
                segment_filepath = processed_audio_dir / segment_filename
                torchaudio.save(  # type: ignore
                    filepath=segment_filepath,
                    src=segment,
                    sample_rate=sr,
                )
                segment_filenames.append(
                    (waveform_filename,
                     str(segment_filepath.relative_to(self.base_dir))))

        pd.DataFrame(segment_filenames,
                     columns=["fileName", "filePath"
                              ]).to_csv(processed_audio_dir / "filepaths.csv",
                                        index=False)
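A small worked sketch of the rounding rule used above, with made-up durations: a fractional remainder below half a segment is discarded, while half a segment or more is zero-padded into a full extra segment:
import math

segment_len = 10.0  # seconds (illustrative)
for audio_len in (52.4, 57.0):
    frac, whole = math.modf(audio_len / segment_len)
    num_segments = int(whole) + (1 if frac >= 0.5 else 0)
    print(f"{audio_len}s -> {num_segments} segments")  # 52.4s -> 5, 57.0s -> 6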
Example #21
def main():
    args = get_args()

    # Load in model from checkpoint
    model = load_model("demucs_quantized").to(args.device)

    # Initialize output path
    out = Path("separated") / "demucs_quantized"
    out.mkdir(parents=True, exist_ok=True)
    print(f"Separated tracks will be stored in {out.resolve()}")

    for track in args.tracks:
        if not track.exists():
            print(f"File {track} does not exist.")
            continue

        print(f"Separating track {track}")

        # Load audio and run_model
        wav, sr = torchaudio.load(str(track))
        wav = wav.to(args.device)
        wav, ref = preprocess_and_normalize_audio(wav, sr,
                                                  model.audio_channels,
                                                  model.samplerate)
        sources = run_model_with_splits_and_shifts(model, wav, split=True)
        sources = sources * ref.std() + ref.mean()

        # Save outputs
        track_folder = out / track.name.rsplit(".", 1)[0]
        track_folder.mkdir(exist_ok=True)
        for source, name in zip(sources, model.sources):
            source = source / max(1.01 * source.abs().max(), 1)
            source = source.cpu()
            wavname = str(track_folder / f"{name}.wav")
            torchaudio.save(wavname, source, sample_rate=model.samplerate)
Example #22
def resample_folder(input_folder, output_folder, fs, regex):

    files = get_all_files(input_folder, match_and=[regex])
    torchaudio.initialize_sox()
    for f in tqdm.tqdm(files):

        # we use sox because torchaudio.Resample uses too much RAM.
        resample = torchaudio.sox_effects.SoxEffectsChain()
        resample.append_effect_to_chain("rate", [fs])
        resample.set_input_file(f)

        audio, fs = resample.sox_build_flow_effects()

        audio = (audio / torch.max(torch.abs(audio), dim=-1, keepdim=True)[0]
                 )  # scale back otherwise you get empty .wav file
        os.makedirs(
            Path(
                os.path.join(output_folder,
                             Path(f).relative_to(Path(input_folder)))).parent,
            exist_ok=True,
        )
        torchaudio.save(
            os.path.join(output_folder,
                         Path(f).relative_to(Path(input_folder))),
            audio,
            fs,
        )
    torchaudio.shutdown_sox()
Example #23
    def compute_objectives(self, predict_wavs, batch, stage):
        """Computes the loss given the predicted and targeted outputs"""
        clean_wavs, lens = batch.clean_sig

        loss = self.hparams.compute_cost(predict_wavs, clean_wavs, lens)
        self.loss_metric.append(
            batch.id, predict_wavs, clean_wavs, lens, reduction="batch"
        )

        if stage != sb.Stage.TRAIN:

            # Evaluate speech quality/intelligibility
            self.stoi_metric.append(
                batch.id, predict_wavs, clean_wavs, lens, reduction="batch"
            )
            self.pesq_metric.append(
                batch.id, predict=predict_wavs, target=clean_wavs, lengths=lens
            )

            # Write wavs to file
            if stage == sb.Stage.TEST:
                lens = lens * clean_wavs.shape[1]
                for name, pred_wav, length in zip(batch.id, predict_wavs, lens):
                    name += ".wav"
                    enhance_path = os.path.join(
                        self.hparams.enhanced_folder, name
                    )
                    pred_wav = pred_wav / torch.max(torch.abs(pred_wav)) * 0.99
                    torchaudio.save(
                        enhance_path,
                        torch.unsqueeze(pred_wav[: int(length)].cpu(), 0),
                        16000,
                    )

        return loss
Example #24
def write_audio(filepath, audio, samplerate):
    """write audio on disk. It is basically a wrapper to support saving
    audio signals in the speechbrain format (audio, channels).

    Arguments
    ----------
    filepath: path
        Path where to save the audio file
    audio : torch.Tensor
        Audio file in the expected speechbrain format (signal, channels)
    samplerate: int
        Sample rate (e.g., 16000)


    Example
    -------
    >>> import os
    >>> tmpfile = os.path.join(str(getfixture('tmpdir')),  "wave.wav")
    >>> dummywav = torch.rand(16000, 2)
    >>> write_audio(tmpfile, dummywav, 16000)
    >>> loaded = read_audio(tmpfile)
    >>> loaded.allclose(dummywav,atol=1e-4) # replace with eq with sox_io backend
    True
    """
    if len(audio.shape) == 2:
        audio = audio.transpose(0, 1)
    elif len(audio.shape) == 1:
        audio = audio.unsqueeze(0)

    torchaudio.save(filepath, audio, samplerate)
Example #25
def test_audio_caching_disabled_works():
    lhotse.set_caching_enabled(False)  # Disable caching.

    np.random.seed(89)  # Reproducibility.

    # Prepare two different waveforms.
    noise1 = np.random.rand(1, 32000).astype(np.float32)
    noise2 = np.random.rand(1, 32000).astype(np.float32)
    # Sanity check -- the noises are different
    assert np.abs(noise1 - noise2).sum() != 0

    # Save the first waveform in a file.
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(noise1), sample_rate=16000)
        recording = Recording.from_file(f.name)

        # Read the audio -- should be equal to noise1.
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise1)

        # Save noise2 to the same location.
        torchaudio.save(f.name, torch.from_numpy(noise2), sample_rate=16000)

        # Read the audio -- should be equal to noise2,
        # and the caching is ignored (doesn't happen).
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise2)
Example #26
def main():
    weights_dir: str = os.path.expanduser("~/git/cherokee-diffwave/models/")
    cd_script_dir()
    model_pt = os.path.join(weights_dir, "weights.pt")
    npy_files: List[str] = list()
    npy_files.extend(sorted(glob.glob("?.npy")))
    npy_files.extend(sorted(glob.glob("??.npy")))
    npy_files.extend(sorted(glob.glob("???.npy")))
    npy_files.extend(sorted(glob.glob("????.npy")))
    npy_files.extend(sorted(glob.glob("?????.npy")))
    bar: ProgressBar = progressbar.ProgressBar(maxval=len(npy_files))
    bar.start()
    npy_wav_files: List[Tuple[str, str]] = list()
    for npy_file in npy_files:
        wav_file = f"wg-{os.path.splitext(npy_file)[0]}.wav"
        npy_wav_files.append((npy_file, wav_file))
        if os.path.isfile(wav_file):
            os.remove(wav_file)
    for npy_file, wav_file in npy_wav_files:
        nd_array = numpy.load(npy_file)
        spectrogram: Tensor = torch.from_numpy(nd_array).float()
        spectrogram = torch.clamp((spectrogram + 100) / 100, 0.0, 1.0)
        audio, sr = diffwave_predict(spectrogram,
                                     model_pt,
                                     device=torch.device("cuda"))
        torchaudio.save(wav_file, audio.cpu(), sample_rate=sr)
        bar.update(bar.currval + 1)
    bar.finish()
Example #27
    def _save_audio(self, audios, fs, feature_names, save2):
        for feature_name in feature_names:
            audio = audios[feature_name]
            audio = audio.squeeze(0) if len(audio.shape) > 2 else audio
            torchaudio.save(
                pjoin(save2, feature_name) + '.wav', audio,
                fs['audio_sr'].item())
Example #28
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('Using device: {}'.format(device))

    data, sample_rates = load(enforce_samplerate=44100)
    print(data)
    # 10 seconds of audio
    seq_len = 60

    print('Loading trained model...')
    model = load_model_from_checkpoint(TRAINED_STATE, device)
    print('Performing inference...')

    prev, _ = get_batch(data, 10, 1, device, segment_size=44100, full=True)

    print('Encoding seed sequence')
    hidden = model.encode(prev)

    print('Producing sequence')
    audio = torch.clamp(model.decode(hidden, seq_len), -1, 1)

    print('Saving result')
    np.save(os.path.join(OUTPUT_DIR, 'prediction.npy'),
            audio[0].detach().cpu().numpy())
    if not IS_WINDOWS:
        import torchaudio
        torchaudio.save("prediction.mp3", torch.stack((audio[0], audio[0])),
                        sample_rates[0])

    plt.plot(audio[0].detach().cpu().numpy())
    plt.show()

    return audio
Example #29
def segment_audio(
    audio_path: str,
    channel: int,
    start: int,
    end: int,
    save_path: str,
    sample_rate: int = 16000,
    device: str = "cpu",
):
    """segment and resample audio"""

    start = int(start / 100 * 8000)
    end = int(end / 100 * 8000)
    num_frames = end - start

    data, _ = torchaudio.load(audio_path,
                              frame_offset=start,
                              num_frames=num_frames)

    resampler = Resample(orig_freq=8000,
                         new_freq=sample_rate).to(device=device)

    data = resampler(data)
    data = torch.unsqueeze(data[channel], 0)

    torchaudio.save(save_path, src=data, sample_rate=sample_rate)
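A usage sketch for segment_audio; the arithmetic above suggests start and end are centisecond offsets into an 8 kHz source recording, and the file names here are placeholders:
# Cut 2.5 s starting at 1.0 s (100 to 350 centiseconds) from channel 0,
# resampling the 8 kHz source to 16 kHz on the way out.
segment_audio("call.wav", channel=0, start=100, end=350,
              save_path="call_segment.wav", sample_rate=16000)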
Example #30
    def write_wavs(self, batch_id, wavs, score, lens):
        """Write wavs to files, for historical discriminator training

        Arguments
        ---------
        batch_id : list of str
            A list of the utterance ids for the batch
        wavs : torch.Tensor
            The wavs to write to files
        score : torch.Tensor
            The actual scores for the corresponding utterances
        lens : torch.Tensor
            The relative lengths of each utterance
        """
        lens = lens * wavs.shape[1]
        record = {}
        for i, (name, pred_wav, length) in enumerate(zip(batch_id, wavs, lens)):
            path = os.path.join(self.hparams.MetricGAN_folder, name + ".wav")
            data = torch.unsqueeze(pred_wav[: int(length)].cpu(), 0)
            torchaudio.save(path, data, self.hparams.Sample_rate)

            # Make record of path and score for historical training
            utt_score = float(score[i][0])  # keep the score tensor intact for later iterations
            record[name] = {
                "enh_wav": path,
                "score": utt_score,
            }

        # Update records for historical training
        self.historical_set.update(record)

        with open(self.hparams.historical_file, "wb") as fp:  # Pickling
            pickle.dump(self.historical_set, fp)
Example #31
    def setUpClass(cls):
        if not os.path.exists(cls._AUDIO_DATA_DIR):
            os.makedirs(cls._AUDIO_DATA_DIR)
        if not os.path.exists(cls._AUDIO_LIST_DIR):
            os.makedirs(cls._AUDIO_LIST_DIR)

        with open(cls._JUNK_FILE, "w") as f:
            f.write("this is some garbage\nShould have no impact.")

        with open(cls._AUDIO_LIST_PATHS_PATH, "w") as f_list_fnames, \
                open(cls._AUDIO_LIST_FNAMES_PATH, "w") as f_list_paths:
            lengths = torch.randint(int(.5e5), int(1.5e6), (cls._N_EXAMPLES,))
            for i in range(cls._N_EXAMPLES):
                # dividing gets the noise in [-1, 1]
                white_noise = torch.randn((cls._N_CHANNELS, lengths[i])) / 10
                f_path = cls._AUDIO_DATA_PATH_FMT.format(i)
                torchaudio.save(f_path, white_noise, cls._SAMPLE_RATE)
                f_name_short = cls._AUDIO_DATA_FMT.format(i)
                f_list_fnames.write(f_name_short + "\n")
                f_list_paths.write(f_path + "\n")
Example #32
import argparse

import torch
import torchaudio

from data.data_loader import load_audio, NoiseInjection

parser = argparse.ArgumentParser()
parser.add_argument('--input-path', default='input.wav', help='The input audio to inject noise into')
parser.add_argument('--noise-path', default='noise.wav', help='The noise file to mix in')
parser.add_argument('--output-path', default='output.wav', help='Path to write the noise-injected audio to')
parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate to save output as')
parser.add_argument('--noise-level', type=float, default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

noise_injector = NoiseInjection()
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
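An example invocation of the script above; the script file name is an assumption:
python add_noise.py --input-path clean.wav --noise-path babble.wav \
    --output-path mixed.wav --sample-rate 16000 --noise-level 0.5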