Example #1
def read_audio(fp, downsample=True):
    sig, sr = torchaudio.load(fp)
    if downsample:
        # 48 kHz -> 16 kHz
        if sig.size(0) % 3 == 0:
            sig = sig[::3].contiguous()
        else:
            sig = sig[:-(sig.size(0) % 3):3].contiguous()
    return sig, sr
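
The slicing above decimates the signal without any anti-aliasing filter. A minimal alternative sketch (not part of the original example; the helper name is hypothetical) that resamples properly with torchaudio.transforms.Resample:

import torchaudio


def read_audio_resampled(fp, target_sr=16000):
    # Hypothetical helper: load and resample with an anti-aliasing filter
    # instead of keeping every third sample.
    sig, sr = torchaudio.load(fp)
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        sig = resampler(sig)
    return sig, target_sr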
Example #2
def load_audio(path):
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # multiple channels, average
    return sound
Example #3
def load_audio(path):
    '''Load audio with torchaudio.
    Args:
        path(string)            : path to the audio file
    Returns:
        sound(numpy.ndarray)    : mono audio data; multi-channel audio is averaged (Samples * 1 channel)
    '''
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)
    return sound
Example #4
def load_audio(path):
    '''
    Input:
        path     : string, path to the audio file to load
    Output:
        sound    : numpy.ndarray, mono audio data; multi-channel audio is averaged
    '''
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)
    return sound
Example #5
def load_audio(path):
    """
    Args:
        path     : string, path to the audio file to load
    Returns:
        sound    : numpy.ndarray, mono audio data; multi-channel audio is averaged
    """
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)
    return sound
Example #6
def load_wave(path, normalize=True):
    """
    Args:
        path     : string, path to the audio file to load
    Returns:
        wave     : torch.FloatTensor, mono waveform (mean/std normalized when ``normalize`` is True)
    """
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)
    
    wave = torch.FloatTensor(sound)
    if normalize:
        mean = wave.mean()
        std = wave.std()
        wave.add_(-mean)
        wave.div_(std)
    return wave
Example #7
def download_vctk(destination, tmp_dir=None, device="cpu"):
    """Download dataset and perform resample to 16000 Hz.

    Arguments
    ---------
    destination : str
        Place to put final zipped dataset.
    tmp_dir : str
        Location to store temporary files. Will use `tempfile` if not provided.
    device : str
        Passed directly to pytorch's ``.to()`` method. Used for resampling.
    """
    dataset_name = "noisy-vctk-16k"
    if tmp_dir is None:
        tmp_dir = tempfile.gettempdir()
    final_dir = os.path.join(tmp_dir, dataset_name)

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    if not os.path.isdir(final_dir):
        os.mkdir(final_dir)

    prefix = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/2791/"
    noisy_vctk_urls = [
        prefix + "clean_testset_wav.zip",
        prefix + "noisy_testset_wav.zip",
        prefix + "testset_txt.zip",
        prefix + "clean_trainset_28spk_wav.zip",
        prefix + "noisy_trainset_28spk_wav.zip",
        prefix + "trainset_28spk_txt.zip",
    ]

    zip_files = []
    for url in noisy_vctk_urls:
        filename = os.path.join(tmp_dir, url.split("/")[-1])
        zip_files.append(filename)
        if not os.path.isfile(filename):
            logger.info("Downloading " + url)
            with urllib.request.urlopen(url) as response:
                with open(filename, "wb") as tmp_file:
                    logger.info("... to " + tmp_file.name)
                    shutil.copyfileobj(response, tmp_file)

    # Unzip
    for zip_file in zip_files:
        logger.info("Unzipping " + zip_file)
        shutil.unpack_archive(zip_file, tmp_dir, "zip")
        os.remove(zip_file)

    # Move transcripts to final dir
    shutil.move(os.path.join(tmp_dir, "testset_txt"), final_dir)
    shutil.move(os.path.join(tmp_dir, "trainset_28spk_txt"), final_dir)

    # Downsample
    dirs = [
        "noisy_testset_wav",
        "clean_testset_wav",
        "noisy_trainset_28spk_wav",
        "clean_trainset_28spk_wav",
    ]

    downsampler = Resample(orig_freq=48000, new_freq=16000)

    for directory in dirs:
        logger.info("Resampling " + directory)
        dirname = os.path.join(tmp_dir, directory)

        # Make directory to store downsampled files
        dirname_16k = os.path.join(final_dir, directory + "_16k")
        if not os.path.isdir(dirname_16k):
            os.mkdir(dirname_16k)

        # Load files and downsample
        for filename in get_all_files(dirname, match_and=[".wav"]):
            signal, rate = torchaudio.load(filename)
            downsampled_signal = downsampler(signal.view(1, -1).to(device))

            # Save downsampled file
            torchaudio.save(
                os.path.join(dirname_16k, filename[-12:]),
                downsampled_signal[0].cpu(),
                sample_rate=16000,
                channels_first=False,
            )

            # Remove old file
            os.remove(filename)

        # Remove old directory
        os.rmdir(dirname)

    logger.info("Zipping " + final_dir)
    final_zip = shutil.make_archive(
        base_name=final_dir,
        format="zip",
        root_dir=os.path.dirname(final_dir),
        base_dir=os.path.basename(final_dir),
    )

    logger.info(f"Moving {final_zip} to {destination}")
    shutil.move(final_zip, os.path.join(destination, dataset_name + ".zip"))
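
A hypothetical invocation of the function above (the paths are placeholders, not from the original snippet):

download_vctk("/data/vctk_zips", tmp_dir="/tmp/vctk_prep", device="cpu")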
Example #8
            if char == '<s>':
                prev = ''
                continue
            hypothesis += char
            prev = char
        return hypothesis.replace('|', ' ')


# Load Wav2Vec2 pretrained model from Hugging Face Hub
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# Convert the model to torchaudio format, which supports TorchScript.
model = import_huggingface_model(model)
# Remove weight normalization which is not supported by quantization.
model.encoder.transformer.pos_conv_embed.__prepare_scriptable__()
model = model.eval()
# Attach decoder
model = SpeechRecognizer(model)

# Apply quantization / script / optimize for mobile
quantized_model = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
scripted_model = torch.jit.script(quantized_model)
optimized_model = optimize_for_mobile(scripted_model)

# Sanity check
waveform, _ = torchaudio.load('scent_of_a_woman_future.wav')
print('Result:', optimized_model(waveform))

optimized_model._save_for_lite_interpreter("wav2vec2.ptl")

Example #9
 def test_Vol(self):
     test_filepath = common_utils.get_asset_path(
         'steam-train-whistle-daniel_simon.wav')
     waveform, _ = torchaudio.load(test_filepath)
     self._assert_consistency(T.Vol(1.1), waveform)
Example #10
 def _reload_signal(self):
     data_signal, sample_rate = torchaudio.load(self.path)
     self._sample_rate = sample_rate
     self._data_signal = data_signal
Example #11
    def __call__(
        self, file: AudioFile, sample_offset: int = 0, num_samples: int = None
    ) -> Tensor:
        """

        Parameters
        ----------
        file : AudioFile
            Audio file.
        sample_offset : int, optional
            Start loading at this `sample_offset` sample. Defaults to 0.
        num_samples : int, optional
            Load that many samples. Defaults to load up to the end of the file.

        Returns
        -------
        samples : (time, channel) torch.Tensor
            Samples

        """

        self.is_valid(file)

        original_samples = None

        if isinstance(file, dict):

            # file = {"samples": torch.Tensor, "sample_rate": int, [ "channel": int ]}
            if "samples" in file:
                original_samples = file["samples"]
                original_sample_rate = file["sample_rate"]
                original_total_num_samples = original_samples.shape[1]
                channel = file.get("channel", None)

            # file = {"audio": str or Path, [ "channel": int ]}
            else:
                audio_path = str(file["audio"])
                (
                    original_total_num_samples,
                    original_sample_rate,
                ) = self.get_audio_metadata(audio_path)
                channel = file.get("channel", None)

        #  file = str or Path
        else:
            audio_path = str(file)
            original_total_num_samples, original_sample_rate = self.get_audio_metadata(
                audio_path
            )
            channel = None

        original_sample_offset = round(
            sample_offset * original_sample_rate / self.sample_rate
        )
        if num_samples is None:
            original_num_samples = original_total_num_samples - original_sample_offset
        else:
            original_num_samples = round(
                num_samples * original_sample_rate / self.sample_rate
            )

        if original_sample_offset + original_num_samples > original_total_num_samples:
            raise ValueError()

        if original_samples is None:
            try:
                original_data, _ = torchaudio.load(
                    audio_path,
                    frame_offset=original_sample_offset,
                    num_frames=original_num_samples,
                )
            except TypeError:
                raise Exception(
                    "It looks like you are using an unsupported version of torchaudio."
                    " If you have 0.6 or older, please upgrade to a newer version."
                )

        else:
            original_data = original_samples[
                :, original_sample_offset : original_sample_offset + original_num_samples
            ]

        if channel is not None:
            original_data = original_data[channel - 1 : channel, :]

        result = self.downmix_and_resample(original_data, original_sample_rate)

        if num_samples is not None:
            # If there is an off-by-one error in the length (e.g. due to resampling), fix it.
            if result.shape[-1] > num_samples:
                result = result[:, :num_samples]
            elif result.shape[-1] < num_samples:
                diff = num_samples - result.shape[-1]
                result = torch.nn.functional.pad(result, (0, diff))

        return result
Example #12
def load_wav(path):
    signal, _ = torchaudio.load(path)
    signal = signal.reshape(-1)
    return signal
Example #13
    lines = f.readlines()
    data = [(line.strip().split(',')[0], int(line.strip().split(',')[1]))
            for line in lines]

model.eval()
with torch.no_grad():
    all_labels = []
    all_pred_labels = []
    small_audios_indices = []
    audio_lengths = []
    wrong_sample_rate_indices = []
    clean_indices = []
    total_mag = []
    #for batch_num, (audio, label) in enumerate(loader):
    for i, (path, label) in enumerate(data):
        audio, sample_rate = torchaudio.load('/storage' + path)
        if sample_rate != 8000:
            wrong_sample_rate_indices.append(i)
            continue
        if model.cnn_type == 'vgg':
            min_len = 10100
        else:
            min_len = 12300
        if audio.shape[1] < min_len:
            #print("Audio at index {} is too small to pass through CNN".format(i))
            small_audios_indices.append(i)
            continue
        if audio.shape[1] > 80000:
            audio = audio[:, :80000]
        clean_indices.append(i)
        total_mag.append(
Example #14
import torch
import torchaudio
import pytorch_lightning as pl
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import os

torch.cuda.is_available()
print(os.getcwd())

csv = pd.read_csv('./data/ESC-50-master/meta/esc50.csv')
x, sr = torchaudio.load(f'./data/ESC-50-master/audio/{csv.iloc[0, 0]}')
h = torchaudio.transforms.Resample(orig_freq=sr, new_freq=8000)(x)
print(h.shape)
plt.plot(h[0])  # the resampled waveform is 1-D, so plot it rather than imshow
Example #15
def getDFTFeature(filepath,
                  win_size=1024,
                  win_shift=512,
                  preemphasis=False,
                  channel_first=True,
                  drop_dc=True,
                  cut_len=5160,
                  normalize=False):
    '''
    Compute the log DFT spectrum of an audio file.
    
    Args:
        filepath: path to the audio file
        win_size: window size (in samples)
        win_shift: hop size (in samples)
        preemphasis: whether to apply pre-emphasis, a first-order difference that attenuates low frequencies and boosts high frequencies
        channel_first: whether to put channels in first dimension
        drop_dc: whether to drop DC component in spectrum (frequency==0)
        cut_len: keep a fixed number of points along the time axis
        normalize: the energies turn out to be very small (log energies on the order of -100), so normalizing them is worthwhile
        
    Return:
        (log_power_spectrum, phase_spectrum): a tensor stacking the log power spectrum and the phase spectrum.
        Its original shape is (2C, T, M//2), where C is the number of channels, T the number of frames and M the number of FFT points;
        after transposition it becomes (T, M//2, 2C).
    '''
    waveform, sample_freq = torchaudio.load(filepath)
    m, n = waveform.shape
    # padding to 2^k
    if (n - win_size) % win_shift != 0:
        waveform = torch.cat(
            [waveform,
             torch.zeros(m, win_shift - (n - win_size) % win_shift)],
            dim=1)
        n = waveform.shape[1]

    # split frames into rows
    frame_num = (n - win_size) // win_shift + 1
    strided_input = waveform.as_strided((m, frame_num, win_size),
                                        (n, win_shift, 1))
    strided_input = strided_input - torch.mean(strided_input,
                                               dim=2).unsqueeze(2)

    # pre-emphasis
    preemphasis = 0.97
    offset_strided_input = torch.nn.functional.pad(strided_input, (1, 0),
                                                   mode='replicate')
    strided_input = strided_input - preemphasis * offset_strided_input[:, :, :
                                                                       -1]

    # windowed and FFT
    win_func = torch.hamming_window(win_size, periodic=False)
    windowed_input = strided_input * win_func
    fft = torch.rfft(windowed_input, 1, normalized=False,
                     onesided=True) * 2 / win_size
    if drop_dc:
        fft = fft[:, :, 1:]
    fft = fft[:, :cut_len, :]
    power_spectrum = fft.pow(2).sum(3)
    log_power_spectrum = torch.log10(power_spectrum) * 10
    # normalize the log power spectrum
    mean_vec = log_power_spectrum.mean(axis=1, keepdim=True)
    std_vec = log_power_spectrum.std(axis=1, keepdim=True)
    log_power_spectrum = (log_power_spectrum - mean_vec) / std_vec
    #
    phase_spectrum = fft[:, :, :, 0] / fft.pow(2).sum(3).sqrt()

    phase_spectrum = torch.acos(phase_spectrum)
    phase_spectrum[fft[:, :, :, 0] < 0] = -phase_spectrum[fft[:, :, :, 0] < 0]
    spectrums = torch.cat([log_power_spectrum, phase_spectrum], dim=0)
    if not channel_first:
        spectrums = spectrums.permute(1, 2, 0)
    return spectrums
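
A hypothetical call of the function above (the path is a placeholder; note that the function relies on the old ``torch.rfft`` API, which requires torch < 1.8):

spec = getDFTFeature('example.wav')                          # (2C, T, M//2) with the default channel_first=True
spec_t = getDFTFeature('example.wav', channel_first=False)   # (T, M//2, 2C)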
Example #16
    def __getitem__(self, index):
        np.random.seed()
        sample_name = self.sample_list[index]

        ###############################################################
        # 0. Batch settings
        ###############################################################
        if self.sample_count % self.batch_size == 0:
            self.sample_count = 0
            # random global stretch & compression for the batch
            if self.global_stretch:
                self.frames = np.random.randint(self.min_stretch_frames,
                                                self.max_stretch_frames + 1)
        self.sample_count += 1
        target = np.linspace(
            -1, 1,
            int(self.cfg["sample_duration"] * self.cfg["vid_framerate"]))

        ###############################################################
        # 1. Video data (keypoints)
        ###############################################################
        video_feature = scipy.io.loadmat('{0}/{1}.mat'.format(
            self.cfg["video_feature_root"], sample_name))

        # grab keypoints (discard foot keypoints)
        video_feature = video_feature['lip_list'][0]
        video_feature = video_feature[:, 0:self.cfg["num_lip_keypoints"], :]

        # crop desired length from the video --- 300 frames for 12 seconds
        n_frames = video_feature.shape[0]
        frame_start = np.random.randint(
            0, n_frames -
            self.cfg["sample_duration"] * self.cfg["vid_framerate"] + 1)
        frame_end = frame_start + self.cfg["sample_duration"] * self.cfg[
            "vid_framerate"]
        video_feature = video_feature[frame_start:frame_end, :, :]

        # normalize keypoint positions
        for kp in range(self.cfg["num_lip_keypoints"]):
            scaler = MinMaxScaler(feature_range=(-1, 1))
            video_feature[:,
                          kp, :] = scaler.fit_transform(video_feature[:,
                                                                      kp, :])

        #video_feature = video_feature.reshape(-1, self.cfg["num_lip_keypoints"]*2)

        # video augmentation --- horizontal flipping
        if self.mode == 'train':
            if np.random.rand() < self.cfg["prob_horizontal_flip"]:
                video_feature[:, :, 0] = video_feature[:, :, 0] * -1

        ###############################################################
        # 2. Global Shift
        ###############################################################
        if self.global_shift:
            # shift the audio in feasible range
            left_shift = min(frame_start, self.max_shift_frames)
            right_shift = min(n_frames - frame_end, self.max_shift_frames)
            frame_shift = np.random.randint(-left_shift, right_shift + 1)
            audio_frame_start = frame_start + frame_shift
            # modify the target array
            target += 2 * frame_shift / (self.cfg["vid_framerate"] *
                                         self.cfg["sample_duration"])
        else:
            audio_frame_start = frame_start

        ###############################################################
        # 3. Audio data (Spectrogram)
        ###############################################################
        waveform, audio_sample_rate = torchaudio.load('{0}/{1}.mp3'.format(
            self.cfg["audio_feature_root"], sample_name))
        # crop 12s of audio data
        audio_start = int(audio_frame_start * self.cfg["mel_sr"] /
                          self.cfg["vid_framerate"])
        audio_end = int(audio_start +
                        self.cfg["mel_sr"] * self.cfg["sample_duration"])
        waveform = waveform[0, audio_start:audio_end]

        # compute spectrogram
        audio_feature = self.spec2db(self.calc_spec(waveform))
        audio_feature = torch.squeeze(audio_feature, 0)

        # normalize spectrogram
        audio_feature = (audio_feature - torch.mean(audio_feature)) / (
            torch.max(audio_feature) - torch.min(audio_feature))

        # spectrogram augmentation
        if self.mode == 'train':
            # frequency masking
            if np.random.rand() < self.cfg["prob_freqmask"]:
                dur = np.random.randint(self.cfg["min_freqmask"],
                                        self.cfg["max_freqmask"] + 1)
                st = np.random.randint(0, audio_feature.shape[0] - dur + 1)
                audio_feature[st:st + dur, :] = 0

            # time masking
            if np.random.rand() < self.cfg["prob_timemask"]:
                dur = np.random.randint(self.cfg["min_timemask"],
                                        self.cfg["max_timemask"] + 1)
                st = np.random.randint(0, audio_feature.shape[1] - dur + 1)
                audio_feature[:, st:st + dur] = 0

        ###############################################################
        # 4. Global Stretch & Local Distortion
        ###############################################################
        new_video_feature = np.zeros(
            (self.frames, self.cfg["num_lip_keypoints"], 2))
        new_target = np.zeros(self.frames)

        # Random distortion
        random_position = np.linspace(-1, 1, self.frames)
        if self.local_distortion:
            random_position = np.zeros(self.frames)
            while np.max(
                    np.abs(random_position - np.linspace(-1, 1, self.frames))
            ) > self.max_distortion_ratio:
                resample_len = int(self.frames *
                                   self.cfg["random_resample_rate"])
                random_position = np.random.rand(resample_len)
                random_position = np.sort(
                    (random_position - np.min(random_position)) * 2 /
                    (np.max(random_position) - np.min(random_position)) - 1)
                f = interpolate.interp1d(np.linspace(-1, 1, resample_len),
                                         random_position,
                                         kind='linear')
                random_position = f(np.linspace(-1, 1, self.frames))

        # Distorted & Stretched video feature
        for k in range(self.frames):
            orig_index = (random_position[k] + 1) / 2 * (
                self.cfg["sample_duration"] * self.cfg["vid_framerate"] - 1)
            lower = int(np.floor(orig_index))
            upper = int(np.ceil(orig_index))

            if lower == upper:
                new_video_feature[k, :, :] = video_feature[lower, :, :]
                new_target[k] = target[lower]
            else:
                new_video_feature[k, :, :] = video_feature[lower, :, :] * (
                    upper - orig_index) + video_feature[upper, :, :] * (
                        orig_index - lower)
                new_target[k] = target[lower] * (
                    upper - orig_index) + target[upper] * (orig_index - lower)
        video_feature = new_video_feature
        target = new_target

        # velocity
        video_velo = np.zeros((self.frames, self.cfg["num_lip_keypoints"], 2))
        video_velo[
            1:, :, :] = video_feature[1:, :, :] - video_feature[:-1, :, :]
        video_velo = video_velo / np.amax(np.absolute(video_velo))

        # acceleration
        video_acc = np.zeros((self.frames, self.cfg["num_lip_keypoints"], 2))
        video_acc[1:, :, :] = video_velo[1:, :, :] - video_velo[:-1, :, :]
        video_acc = video_acc / np.amax(np.absolute(video_acc))

        # aggregate
        video_agg = np.zeros(
            (self.frames, self.cfg["num_lip_keypoints"], 2, 2))
        video_agg[:, :, :, 0] = video_velo
        video_agg[:, :, :, 1] = video_acc

        return {"video_feature": torch.from_numpy(video_agg.astype(np.float32)), \
                "audio_feature": audio_feature, \
                "target": torch.from_numpy(target.astype(np.float32)), \
                "sample_name": sample_name}
def extract_length(input_file):
    wav, _ = torchaudio.load(input_file)
    return wav.size(-1)
Example #18
def torchaudio_load_file(file_path, normalization=True):
    data, sr = torchaudio.load(str(file_path))
    return data.float(), sr
Example #19
 def load_audio(self, audiopath):
     self.waveform, self.sample_rate = torchaudio.load(audiopath)
     return self.waveform, self.sample_rate
Example #20
    def download(self):
        """Download the yesno data if it doesn't exist in processed_folder already."""
        from six.moves import urllib
        import tarfile

        if self._check_exists():
            return

        raw_abs_dir = os.path.join(self.root, self.raw_folder)
        processed_abs_dir = os.path.join(self.root, self.processed_folder)
        dset_abs_path = os.path.join(
            self.root, self.raw_folder, self.dset_path)

        # download files
        try:
            os.makedirs(os.path.join(self.root, self.raw_folder))
            os.makedirs(os.path.join(self.root, self.processed_folder))
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise

        url = self.url
        print('Downloading ' + url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        if not os.path.isfile(file_path):
            urllib.request.urlretrieve(url, file_path)
        else:
            print("Tar file already downloaded")
        if not os.path.exists(dset_abs_path):
            with tarfile.open(file_path) as zip_f:
                zip_f.extractall(raw_abs_dir)
        else:
            print("Tar file already extracted")

        if not self.dev_mode:
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')
        shutil.copyfile(
            os.path.join(dset_abs_path, "README"),
            os.path.join(processed_abs_dir, "YESNO_README")
        )
        audios = [x for x in os.listdir(dset_abs_path) if ".wav" in x]
        print("Found {} audio files".format(len(audios)))
        tensors = []
        labels = []
        lengths = []
        for i, f in enumerate(audios):
            full_path = os.path.join(dset_abs_path, f)
            sig, sr = torchaudio.load(full_path)
            tensors.append(sig)
            lengths.append(sig.size(0))
            labels.append(os.path.basename(f).split(".", 1)[0].split("_"))
        # sort sigs/labels: longest -> shortest
        tensors, labels = zip(*[(b, c) for (a, b, c) in sorted(
            zip(lengths, tensors, labels), key=lambda x: x[0], reverse=True)])
        self.max_len = tensors[0].size(0)
        torch.save(
            (tensors, labels),
            os.path.join(
                self.root,
                self.processed_folder,
                self.processed_file
            )
        )
        if not self.dev_mode:
            shutil.rmtree(raw_abs_dir, ignore_errors=True)

        print('Done!')
Example #21
 def _load_wav(self, path):
     wav, _ = torchaudio.load(path_join(self.data_dir, path))
     wav = self.resampler(wav).squeeze(0)
     return wav
Example #22
 def open(self, item, **kwargs):
     if isinstance(item, (Path, PosixPath, str)):
         sig, sr = torchaudio.load(item)
         return AudioItem(sig, sr, path=Path(item))
     if isinstance(item, (tuple, np.ndarray)):
         return AudioItem(item)
Example #23
#
# To load audio data, you can use ``torchaudio.load``.
#
# This function accepts a path-like object or file-like object as input.
#
# The returned value is a tuple of waveform (``Tensor``) and sample rate
# (``int``).
#
# By default, the resulting tensor object has ``dtype=torch.float32`` and
# its value range is normalized within ``[-1.0, 1.0]``.
#
# For the list of supported formats, please refer to `the torchaudio
# documentation <https://pytorch.org/audio>`__.
#

waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)

print_stats(waveform, sample_rate=sample_rate)
plot_waveform(waveform, sample_rate)
plot_specgram(waveform, sample_rate)
play_audio(waveform, sample_rate)

######################################################################
# Loading from file-like object
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# ``torchaudio``\ ’s I/O functions now support file-like objects. This
# allows for fetching and decoding audio data from locations
# within and beyond the local file system.
# The following examples illustrate this.
#
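# A minimal sketch (added here as an illustration, not part of the original
# tutorial text): decode from an in-memory, file-like object. It assumes a
# local file named "sample.wav" exists and that the active backend supports
# file-like input.
import io

with open("sample.wav", "rb") as wav_file:
    buffer_ = io.BytesIO(wav_file.read())

waveform_fileobj, sample_rate_fileobj = torchaudio.load(buffer_)
print(waveform_fileobj.shape, sample_rate_fileobj)
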
Example #24
def default_loader(path):
    audio, sr = torchaudio.load(path)
    return audio, sr
Example #25
def create_gold_file(data_path, sample_rate):
    """
  Create the following files:
      gold_units.json : contains gold_dicts, a list of mappings of
          {"sentence_id" : str,
           "units" : a list of ints representing phoneme id for each feature frame,
           "text" : a list of strs representing phoneme tokens for each feature frame}
     abx_triplets.item : contains ABX triplets in the format
                         line 0 : whatever (not read)
                         line > 0: #file_ID onset offset #phone prev-phone next-phone speaker
                         onset : beginning of the triplet (in s)
                         offset : end of the triplet (in s)
  """
    wav_scp_file = os.path.join(data_path, "mscoco2k_wav.scp")
    split_file = os.path.join(data_path, "mscoco2k_retrieval_split.txt")
    select_idxs = [
        idx for idx, is_test in enumerate(open(split_file, 'r'))
        if int(is_test)
    ]

    phone_info_dict = json.load(
        open(os.path.join(data_path, "mscoco2k_phone_info.json"), "r"))
    phone_to_index = {}
    gold_dicts = []
    triplets = ['#file_ID onset offset #phone prev-phone next-phone speaker']
    # Extract audio file names as sentence ids
    with open(wav_scp_file, 'r') as wav_scp_f:
        filenames = [l.split()[-1] for idx, l in enumerate(wav_scp_f)]

    # Extract utterance duration
    durations = [
        int(torchaudio.load(fn)[0].size(-1) * 1000 // (10 * sample_rate))
        for fn in filenames
    ]

    # Extract phone mapping
    phone_path = os.path.join(data_path, "phone2id.json")
    if os.path.exists(phone_path):
        phone_to_index = json.load(open(phone_path, "r"))
    else:
        phones = set()
        for idx, (_, phone_info) in enumerate(
                sorted(phone_info_dict.items(),
                       key=lambda x: int(x[0].split("_")[-1]))):
            for word_token in phone_info["data_ids"]:
                for phone_token in word_token[2]:
                    token = phone_token[0]
                    phones.update([token])
        phone_to_index = {x: i for i, x in enumerate(sorted(phones))}
        phone_to_index[UNK] = len(phone_to_index)
        json.dump(phone_to_index, open(phone_path, "w"), indent=2)

    # Extract phone units
    phone_to_word_counts = collections.defaultdict(dict)
    global_idx = 0
    for idx, (_, phone_info) in enumerate(
            sorted(phone_info_dict.items(),
                   key=lambda x: int(x[0].split("_")[-1]))):
        if not idx in select_idxs:
            continue

        begin_word = 0
        for word_info, word_token in zip(phone_info["data_ids"],
                                         phone_info["concepts"]):
            dur_word = word_info[2][-1][2] - word_info[2][0][1]
            end_word = begin_word + dur_word
            nframes = int(dur_word // 10)
            gold_dict = {
                "sentence_id": filenames[idx],
                "units": [-1] * nframes,
                "phoneme_text": [UNK] * nframes,
                "word_text": [word_token] * nframes,
                "interval": [begin_word, end_word]
            }
            begin_phone = 0
            prefix = filenames[idx].split('/')[-1]
            example_id = f"{prefix}_{global_idx}"
            global_idx += 1
            for phn_idx, phone_token in enumerate(word_info[2]):
                if not word_token in phone_to_word_counts[phone_token[0]]:
                    phone_to_word_counts[phone_token[0]][word_token] = 1
                else:
                    phone_to_word_counts[phone_token[0]][word_token] += 1

                token, begin, end = phone_token[0], phone_token[
                    1], phone_token[2]

                dur_phone = end - begin
                begin_frame = int(begin_phone // 10)
                end_frame = int((begin_phone + dur_phone) // 10)
                if (begin_word + begin_phone +
                        dur_phone) // 10 > durations[idx]:
                    print(
                        'In {}: end frame exceeds duration of audio, {} > {}'.
                        format(filenames[idx],
                               (begin_word + begin_phone + dur_phone) // 10,
                               durations[idx]))
                    break

                if phn_idx == 0:
                    prev_token = NULL
                else:
                    prev_token = word_info[2][phn_idx - 1][0]

                if phn_idx == len(word_info[2]) - 1:
                    next_token = NULL
                else:
                    next_token = word_info[2][phn_idx + 1][0]
                triplets.append(
                    f'{example_id} {begin_phone / 1000.0:.4f} {(begin_phone + dur_phone)/ 1000.0:.4f} {token} {prev_token} {next_token} 0'
                )

                for t in range(begin_frame, end_frame):
                    gold_dict["units"][t] = phone_to_index[token]
                    gold_dict["phoneme_text"][t] = token
                begin_phone += dur_phone
            if end_frame != nframes:
                gold_dict['phoneme_text'] = gold_dict[
                    'phoneme_text'][:end_frame]
                gold_dict['word_text'] = gold_dict['word_text'][:end_frame]
                print('sentence_id, end_frame, nframes: ', filenames[idx],
                      end_frame, nframes)
            gold_dicts.append(gold_dict)
            begin_word += dur_word

    with open(os.path.join(data_path, 'phone_token_top_10_words.txt'),
              'w') as f:
        f.write('Phone\tWord\tCounts\n')
        for p in phone_to_word_counts:
            for w in sorted(phone_to_word_counts[p],
                            key=lambda x: phone_to_word_counts[p][x],
                            reverse=True):
                f.write('{}\t{}\t{}\n'.format(p, w,
                                              phone_to_word_counts[p][w]))

    with open(os.path.join(data_path, "gold_units.json"), "w") as gold_f:
        json.dump(gold_dicts, gold_f, indent=2)

    with open(os.path.join(data_path, "abx_triplets.item"), "w") as triplet_f:
        triplet_f.write('\n'.join(triplets))
Example #26
 def test_batch_pitch(self):
   waveform, sample_rate = torchaudio.load(self.test_filepath)
   self._test_batch(F.detect_pitch_frequency, waveform, sample_rate)
Example #27
import torch
import torchaudio
import matplotlib.pyplot as plt

torchaudio.set_audio_backend('soundfile')
waveform, sample_rate = torchaudio.load('data/Clover.flac')
print(f'Shape of waveform: {waveform.size()}')
print(f'Sample rate of waveform: {sample_rate}')

plt.figure()
plt.plot(waveform.t().numpy())
plt.show()
Example #28
 def test_jit_pitch(self):
   waveform, sample_rate = torchaudio.load(self.test_filepath)
   _test_torchscript_functional(F.detect_pitch_frequency, waveform, sample_rate)
Example #29
    def test_save(self):
        # load signal
        x, sr = torchaudio.load(self.test_filepath)

        # check save
        new_filepath = os.path.join(self.test_dirpath, "test.wav")
        torchaudio.save(new_filepath, x, sr)
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # check automatic normalization
        x /= 1 << 31
        torchaudio.save(new_filepath, x, sr)
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # test save 1d tensor
        x = x[:, 0]  # get mono signal
        x.squeeze_()  # remove channel dim
        torchaudio.save(new_filepath, x, sr)
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # don't allow invalid sizes as inputs
        with self.assertRaises(ValueError):
            x.unsqueeze_(0)  # N x L not L x N
            torchaudio.save(new_filepath, x, sr)

        with self.assertRaises(ValueError):
            x.squeeze_()
            x.unsqueeze_(1)
            x.unsqueeze_(0)  # 1 x L x 1
            torchaudio.save(new_filepath, x, sr)

        # automatically convert sr from floating point to int
        x.squeeze_(0)
        torchaudio.save(new_filepath, x, float(sr))
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # don't allow uneven integers
        with self.assertRaises(TypeError):
            torchaudio.save(new_filepath, x, float(sr) + 0.5)
            self.assertTrue(os.path.isfile(new_filepath))
            os.unlink(new_filepath)

        # don't save to folders that don't exist
        with self.assertRaises(OSError):
            new_filepath = os.path.join(self.test_dirpath, "no-path",
                                        "test.wav")
            torchaudio.save(new_filepath, x, sr)

        # save created file
        sinewave_filepath = os.path.join(self.test_dirpath, "assets",
                                         "sinewave.wav")
        sr = 16000
        freq = 440
        volume = 0.3

        y = (torch.cos(2 * math.pi * torch.arange(0, 4 * sr).float() * freq /
                       sr))
        y.unsqueeze_(1)
        # y is between -1 and 1, so must scale
        y = (y * volume * 2**31).long()
        torchaudio.save(sinewave_filepath, y, sr)
        self.assertTrue(os.path.isfile(sinewave_filepath))

        # test precision
        new_filepath = os.path.join(self.test_dirpath, "test.wav")
        _, _, _, bp = torchaudio.info(sinewave_filepath)
        torchaudio.save(new_filepath, y, sr, precision=16)
        _, _, _, bp16 = torchaudio.info(new_filepath)
        self.assertEqual(bp, 32)
        self.assertEqual(bp16, 16)
        os.unlink(new_filepath)
Example #30
class TestFunctional(unittest.TestCase):
  data_sizes = [(2, 20), (3, 15), (4, 10)]
  number_of_trials = 100
  specgram = torch.tensor([1., 2., 3., 4.])

  test_dirpath, test_dir = common_utils.create_temp_assets_dir()

  test_filepath = os.path.join(test_dirpath, 'assets',
                               'steam-train-whistle-daniel_simon.mp3')
  waveform_train, sr_train = torchaudio.load(test_filepath)

  def test_torchscript_spectrogram(self):

    tensor = torch.rand((1, 1000))
    n_fft = 400
    ws = 400
    hop = 200
    pad = 0
    window = torch.hann_window(ws)
    power = 2
    normalize = False

    _test_torchscript_functional(
        F.spectrogram, tensor, pad, window, n_fft, hop, ws, power, normalize
        )

  def test_torchscript_griffinlim(self):
    tensor = torch.rand((1, 201, 6))
    n_fft = 400
    ws = 400
    hop = 200
    window = torch.hann_window(ws)
    power = 2
    normalize = False
    momentum = 0.99
    n_iter = 32
    length = 1000
    init = 0

    _test_torchscript_functional(
        F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, 0
        )

  def test_batch_griffinlim(self):

    torch.random.manual_seed(42)
    tensor = torch.rand((1, 201, 6))

    n_fft = 400
    ws = 400
    hop = 200
    window = torch.hann_window(ws)
    power = 2
    normalize = False
    momentum = 0.99
    n_iter = 32
    length = 1000

    self._test_batch(
        F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, 0, atol=5e-5
        )

  def _test_compute_deltas(self, specgram, expected, win_length=3, atol=1e-6, rtol=1e-8):
    computed = F.compute_deltas(specgram, win_length=win_length)
    self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
    torch.testing.assert_allclose(computed, expected, atol=atol, rtol=rtol)

  def test_compute_deltas_onechannel(self):
    specgram = self.specgram.unsqueeze(0).unsqueeze(0)
    expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5]]])
    self._test_compute_deltas(specgram, expected)

  def test_compute_deltas_twochannel(self):
    specgram = self.specgram.repeat(1, 2, 1)
    expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5],
                              [0.5, 1.0, 1.0, 0.5]]])
    self._test_compute_deltas(specgram, expected)

  def test_compute_deltas_randn(self):
    channel = 13
    n_mfcc = channel * 3
    time = 1021
    win_length = 2 * 7 + 1
    specgram = torch.randn(channel, n_mfcc, time)
    computed = F.compute_deltas(specgram, win_length=win_length)

    self.assertTrue(computed.shape == specgram.shape, (computed.shape, specgram.shape))

    _test_torchscript_functional(F.compute_deltas, specgram, win_length=win_length)

  def test_batch_pitch(self):
    waveform, sample_rate = torchaudio.load(self.test_filepath)
    self._test_batch(F.detect_pitch_frequency, waveform, sample_rate)

  def test_jit_pitch(self):
    waveform, sample_rate = torchaudio.load(self.test_filepath)
    _test_torchscript_functional(F.detect_pitch_frequency, waveform, sample_rate)

  def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8):
    # trim sound for case when constructed signal is shorter than original
    sound = sound[..., :estimate.size(-1)]

    self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape))
    self.assertTrue(torch.allclose(sound, estimate, atol=atol, rtol=rtol))

  def _test_istft_is_inverse_of_stft(self, kwargs):
    # generates a random sound signal for each trial and then does the stft/istft
    # operation to check whether we can reconstruct signal
    for data_size in self.data_sizes:
      for i in range(self.number_of_trials):

        sound = common_utils.random_float_tensor(i, data_size)

        stft = torch.stft(sound, **kwargs)
        estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs)

        self._compare_estimate(sound, estimate)

  def test_istft_is_inverse_of_stft1(self):
    # hann_window, centered, normalized, onesided
    kwargs1 = {
        'n_fft':     12,
        'hop_length':4,
        'win_length':12,
        'window':    torch.hann_window(12),
        'center':    True,
        'pad_mode':  'reflect',
        'normalized':True,
        'onesided':  True,
        }

    self._test_istft_is_inverse_of_stft(kwargs1)

  def test_istft_is_inverse_of_stft2(self):
    # hann_window, centered, not normalized, not onesided
    kwargs2 = {
        'n_fft':     12,
        'hop_length':2,
        'win_length':8,
        'window':    torch.hann_window(8),
        'center':    True,
        'pad_mode':  'reflect',
        'normalized':False,
        'onesided':  False,
        }

    self._test_istft_is_inverse_of_stft(kwargs2)

  def test_istft_is_inverse_of_stft3(self):
    # hamming_window, centered, normalized, not onesided
    kwargs3 = {
        'n_fft':     15,
        'hop_length':3,
        'win_length':11,
        'window':    torch.hamming_window(11),
        'center':    True,
        'pad_mode':  'constant',
        'normalized':True,
        'onesided':  False,
        }

    self._test_istft_is_inverse_of_stft(kwargs3)

  def test_istft_is_inverse_of_stft4(self):
    # hamming_window, not centered, not normalized, onesided
    # window same size as n_fft
    kwargs4 = {
        'n_fft':     5,
        'hop_length':2,
        'win_length':5,
        'window':    torch.hamming_window(5),
        'center':    False,
        'pad_mode':  'constant',
        'normalized':False,
        'onesided':  True,
        }

    self._test_istft_is_inverse_of_stft(kwargs4)

  def test_istft_is_inverse_of_stft5(self):
    # hamming_window, not centered, not normalized, not onesided
    # window same size as n_fft
    kwargs5 = {
        'n_fft':     3,
        'hop_length':2,
        'win_length':3,
        'window':    torch.hamming_window(3),
        'center':    False,
        'pad_mode':  'reflect',
        'normalized':False,
        'onesided':  False,
        }

    self._test_istft_is_inverse_of_stft(kwargs5)

  def test_istft_of_ones(self):
    # stft = torch.stft(torch.ones(4), 4)
    stft = torch.tensor([
        [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]],
        [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]],
        [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]]
        ])

    estimate = torchaudio.functional.istft(stft, n_fft=4, length=4)
    self._compare_estimate(torch.ones(4), estimate)

  def test_istft_of_zeros(self):
    # stft = torch.stft(torch.zeros(4), 4)
    stft = torch.zeros((3, 5, 2))

    estimate = torchaudio.functional.istft(stft, n_fft=4, length=4)
    self._compare_estimate(torch.zeros(4), estimate)

  def test_istft_requires_overlap_windows(self):
    # the window is size 1 but it hops 20 so there is a gap which throws an error
    stft = torch.zeros((3, 5, 2))
    self.assertRaises(AssertionError, torchaudio.functional.istft, stft, n_fft=4,
                      hop_length=20, win_length=1, window=torch.ones(1))

  def test_istft_requires_nola(self):
    stft = torch.zeros((3, 5, 2))
    kwargs_ok = {
        'n_fft':     4,
        'win_length':4,
        'window':    torch.ones(4),
        }

    kwargs_not_ok = {
        'n_fft':     4,
        'win_length':4,
        'window':    torch.zeros(4),
        }

    # A window of ones meets NOLA but a window of zeros does not. This should
    # throw an error.
    torchaudio.functional.istft(stft, **kwargs_ok)
    self.assertRaises(AssertionError, torchaudio.functional.istft, stft, **kwargs_not_ok)

  def test_istft_requires_non_empty(self):
    self.assertRaises(AssertionError, torchaudio.functional.istft, torch.zeros((3, 0, 2)), 2)
    self.assertRaises(AssertionError, torchaudio.functional.istft, torch.zeros((0, 3, 2)), 2)

  def _test_istft_of_sine(self, amplitude, L, n):
    # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L
    x = torch.arange(2 * L + 1, dtype=torch.get_default_dtype())
    sound = amplitude * torch.sin(2 * math.pi / L * x * n)
    # stft = torch.stft(sound, L, hop_length=L, win_length=L,
    #                   window=torch.ones(L), center=False, normalized=False)
    stft = torch.zeros((L // 2 + 1, 2, 2))
    stft_largest_val = (amplitude * L) / 2.0
    if n < stft.size(0):
      stft[n, :, 1] = -stft_largest_val

    if 0 <= L - n < stft.size(0):
      # symmetric about L // 2
      stft[L - n, :, 1] = stft_largest_val

    estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L,
                                           window=torch.ones(L), center=False, normalized=False)
    # There is a larger error due to the scaling of amplitude
    self._compare_estimate(sound, estimate, atol=1e-3)

  def test_istft_of_sine(self):
    self._test_istft_of_sine(amplitude=123, L=5, n=1)
    self._test_istft_of_sine(amplitude=150, L=5, n=2)
    self._test_istft_of_sine(amplitude=111, L=5, n=3)
    self._test_istft_of_sine(amplitude=160, L=7, n=4)
    self._test_istft_of_sine(amplitude=145, L=8, n=5)
    self._test_istft_of_sine(amplitude=80, L=9, n=6)
    self._test_istft_of_sine(amplitude=99, L=10, n=7)

  def _test_linearity_of_istft(self, data_size, kwargs, atol=1e-6, rtol=1e-8):
    for i in range(self.number_of_trials):
      tensor1 = common_utils.random_float_tensor(i, data_size)
      tensor2 = common_utils.random_float_tensor(i * 2, data_size)
      a, b = torch.rand(2)
      istft1 = torchaudio.functional.istft(tensor1, **kwargs)
      istft2 = torchaudio.functional.istft(tensor2, **kwargs)
      istft = a * istft1 + b * istft2
      estimate = torchaudio.functional.istft(a * tensor1 + b * tensor2, **kwargs)
      self._compare_estimate(istft, estimate, atol, rtol)

  def test_linearity_of_istft1(self):
    # hann_window, centered, normalized, onesided
    kwargs1 = {
        'n_fft':     12,
        'window':    torch.hann_window(12),
        'center':    True,
        'pad_mode':  'reflect',
        'normalized':True,
        'onesided':  True,
        }
    data_size = (2, 7, 7, 2)
    self._test_linearity_of_istft(data_size, kwargs1)

  def test_linearity_of_istft2(self):
    # hann_window, centered, not normalized, not onesided
    kwargs2 = {
        'n_fft':     12,
        'window':    torch.hann_window(12),
        'center':    True,
        'pad_mode':  'reflect',
        'normalized':False,
        'onesided':  False,
        }
    data_size = (2, 12, 7, 2)
    self._test_linearity_of_istft(data_size, kwargs2)

  def test_linearity_of_istft3(self):
    # hamming_window, centered, normalized, not onesided
    kwargs3 = {
        'n_fft':     12,
        'window':    torch.hamming_window(12),
        'center':    True,
        'pad_mode':  'constant',
        'normalized':True,
        'onesided':  False,
        }
    data_size = (2, 12, 7, 2)
    self._test_linearity_of_istft(data_size, kwargs3)

  def test_linearity_of_istft4(self):
    # hamming_window, not centered, not normalized, onesided
    kwargs4 = {
        'n_fft':     12,
        'window':    torch.hamming_window(12),
        'center':    False,
        'pad_mode':  'constant',
        'normalized':False,
        'onesided':  True,
        }
    data_size = (2, 7, 3, 2)
    self._test_linearity_of_istft(data_size, kwargs4, atol=1e-5, rtol=1e-8)

  def test_batch_istft(self):

    stft = torch.tensor([
        [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]],
        [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]],
        [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]]
        ])

    self._test_batch(F.istft, stft, n_fft=4, length=4)

  def _test_create_fb(self, n_mels=40, sample_rate=22050, n_fft=2048, fmin=0.0, fmax=8000.0):

    librosa_fb = librosa.filters.mel(sr=sample_rate,
                                     n_fft=n_fft,
                                     n_mels=n_mels,
                                     fmax=fmax,
                                     fmin=fmin,
                                     htk=True,
                                     norm=None)
    fb = F.create_fb_matrix(sample_rate=sample_rate,
                            n_mels=n_mels,
                            f_max=fmax,
                            f_min=fmin,
                            n_freqs=(n_fft // 2 + 1))

    for i_mel_bank in range(n_mels):
      assert torch.allclose(fb[:, i_mel_bank], torch.tensor(librosa_fb[i_mel_bank]), atol=1e-4)

  def test_create_fb(self):
    self._test_create_fb()
    self._test_create_fb(n_mels=128, sample_rate=44100)
    self._test_create_fb(n_mels=128, fmin=2000.0, fmax=5000.0)
    self._test_create_fb(n_mels=56, fmin=100.0, fmax=9000.0)
    self._test_create_fb(n_mels=56, fmin=800.0, fmax=900.0)
    self._test_create_fb(n_mels=56, fmin=1900.0, fmax=900.0)
    self._test_create_fb(n_mels=10, fmin=1900.0, fmax=900.0)

  def test_gain(self):
    waveform_gain = F.gain(self.waveform_train, 3)
    self.assertTrue(waveform_gain.abs().max().item(), 1.)

    E = torchaudio.sox_effects.SoxEffectsChain()
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("gain", [3])
    sox_gain_waveform = E.sox_build_flow_effects()[0]

    self.assertTrue(torch.allclose(waveform_gain, sox_gain_waveform, atol=1e-04))

  def test_dither(self):
    waveform_dithered = F.dither(self.waveform_train)
    waveform_dithered_noiseshaped = F.dither(self.waveform_train, noise_shaping=True)

    E = torchaudio.sox_effects.SoxEffectsChain()
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("dither", [])
    sox_dither_waveform = E.sox_build_flow_effects()[0]

    self.assertTrue(torch.allclose(waveform_dithered, sox_dither_waveform, atol=1e-04))
    E.clear_chain()

    E.append_effect_to_chain("dither", ["-s"])
    sox_dither_waveform_ns = E.sox_build_flow_effects()[0]

    self.assertTrue(torch.allclose(waveform_dithered_noiseshaped, sox_dither_waveform_ns, atol=1e-02))

  def test_vctk_transform_pipeline(self):
    test_filepath_vctk = os.path.join(self.test_dirpath, "assets/VCTK-Corpus/wav48/p224/", "p224_002.wav")
    wf_vctk, sr_vctk = torchaudio.load(test_filepath_vctk)

    # rate
    sample = T.Resample(sr_vctk, 16000, resampling_method='sinc_interpolation')
    wf_vctk = sample(wf_vctk)
    # dither
    wf_vctk = F.dither(wf_vctk, noise_shaping=True)

    E = torchaudio.sox_effects.SoxEffectsChain()
    E.set_input_file(test_filepath_vctk)
    E.append_effect_to_chain("gain", ["-h"])
    E.append_effect_to_chain("channels", [1])
    E.append_effect_to_chain("rate", [16000])
    E.append_effect_to_chain("gain", ["-rh"])
    E.append_effect_to_chain("dither", ["-s"])
    wf_vctk_sox = E.sox_build_flow_effects()[0]

    self.assertTrue(torch.allclose(wf_vctk, wf_vctk_sox, rtol=1e-03, atol=1e-03))

  def test_pitch(self):
    test_dirpath, test_dir = common_utils.create_temp_assets_dir()
    test_filepath_100 = os.path.join(test_dirpath, 'assets', "100Hz_44100Hz_16bit_05sec.wav")
    test_filepath_440 = os.path.join(test_dirpath, 'assets', "440Hz_44100Hz_16bit_05sec.wav")

    # Files from https://www.mediacollege.com/audio/tone/download/
    tests = [
        (test_filepath_100, 100),
        (test_filepath_440, 440),
        ]

    for filename, freq_ref in tests:
      waveform, sample_rate = torchaudio.load(filename)

      freq = torchaudio.functional.detect_pitch_frequency(waveform, sample_rate)

      threshold = 1
      s = ((freq - freq_ref).abs() > threshold).sum()
      self.assertFalse(s)

      # Convert to stereo and batch for testing purposes
      self._test_batch(F.detect_pitch_frequency, waveform, sample_rate)

  def _test_batch_shape(self, functional, tensor, *args, **kwargs):

    kwargs_compare = {}
    if 'atol' in kwargs:
      atol = kwargs['atol']
      del kwargs['atol']
      kwargs_compare['atol'] = atol

    if 'rtol' in kwargs:
      rtol = kwargs['rtol']
      del kwargs['rtol']
      kwargs_compare['rtol'] = rtol

    # Single then transform then batch

    torch.random.manual_seed(42)
    expected = functional(tensor.clone(), *args, **kwargs)
    expected = expected.unsqueeze(0).unsqueeze(0)

    # 1-Batch then transform

    tensors = tensor.unsqueeze(0).unsqueeze(0)

    torch.random.manual_seed(42)
    computed = functional(tensors.clone(), *args, **kwargs)

    self._compare_estimate(computed, expected, **kwargs_compare)

    return tensors, expected

  def _test_batch(self, functional, tensor, *args, **kwargs):

    tensors, expected = self._test_batch_shape(functional, tensor, *args, **kwargs)

    kwargs_compare = {}
    if 'atol' in kwargs:
      atol = kwargs['atol']
      del kwargs['atol']
      kwargs_compare['atol'] = atol

    if 'rtol' in kwargs:
      rtol = kwargs['rtol']
      del kwargs['rtol']
      kwargs_compare['rtol'] = rtol

    # 3-Batch then transform

    ind = [3] + [1] * (int(tensors.dim()) - 1)
    tensors = tensor.repeat(*ind)

    ind = [3] + [1] * (int(expected.dim()) - 1)
    expected = expected.repeat(*ind)

    torch.random.manual_seed(42)
    computed = functional(tensors.clone(), *args, **kwargs)
    self._compare_estimate(computed, expected, **kwargs_compare)

  def test_torchscript_create_fb_matrix(self):

    n_stft = 100
    f_min = 0.0
    f_max = 20.0
    n_mels = 10
    sample_rate = 16000

    _test_torchscript_functional(F.create_fb_matrix, n_stft, f_min, f_max, n_mels, sample_rate)

  def test_torchscript_amplitude_to_DB(self):
    spec = torch.rand((6, 201))
    multiplier = 10.0
    amin = 1e-10
    db_multiplier = 0.0
    top_db = 80.0

    _test_torchscript_functional(F.amplitude_to_DB, spec, multiplier, amin, db_multiplier, top_db)

  def test_torchscript_DB_to_amplitude(self):

    x = torch.rand((1, 100))
    ref = 1.
    power = 1.

    _test_torchscript_functional(F.DB_to_amplitude, x, ref, power)

  def test_DB_to_amplitude(self):
    # Make some noise
    x = torch.rand(1000)
    spectrogram = torchaudio.transforms.Spectrogram()
    spec = spectrogram(x)

    amin = 1e-10
    ref = 1.0
    db_multiplier = math.log10(max(amin, ref))

    # Waveform amplitude -> DB -> amplitude
    multiplier = 20.
    power = 0.5

    db = F.amplitude_to_DB(torch.abs(x), multiplier, amin, db_multiplier, top_db=None)
    x2 = F.DB_to_amplitude(db, ref, power)

    self.assertTrue(torch.allclose(torch.abs(x), x2, atol=5e-5))

    # Spectrogram amplitude -> DB -> amplitude
    db = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db=None)
    x2 = F.DB_to_amplitude(db, ref, power)

    self.assertTrue(torch.allclose(spec, x2, atol=5e-5))

    # Waveform power -> DB -> power
    multiplier = 10.
    power = 1.

    db = F.amplitude_to_DB(x, multiplier, amin, db_multiplier, top_db=None)
    x2 = F.DB_to_amplitude(db, ref, power)

    self.assertTrue(torch.allclose(torch.abs(x), x2, atol=5e-5))

    # Spectrogram power -> DB -> power
    db = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db=None)
    x2 = F.DB_to_amplitude(db, ref, power)

    self.assertTrue(torch.allclose(spec, x2, atol=5e-5))

  def test_torchscript_create_dct(self):

    n_mfcc = 40
    n_mels = 128
    norm = "ortho"

    _test_torchscript_functional(F.create_dct, n_mfcc, n_mels, norm)

  def test_torchscript_mu_law_encoding(self):

    tensor = torch.rand((1, 10))
    qc = 256

    _test_torchscript_functional(F.mu_law_encoding, tensor, qc)

  def test_torchscript_mu_law_decoding(self):

    tensor = torch.rand((1, 10))
    qc = 256

    _test_torchscript_functional(F.mu_law_decoding, tensor, qc)

  def test_torchscript_complex_norm(self):

    complex_tensor = torch.randn(1, 2, 1025, 400, 2)
    power = 2

    _test_torchscript_functional(F.complex_norm, complex_tensor, power)

  def test_mask_along_axis(self):

    specgram = torch.randn(2, 1025, 400)
    mask_param = 100
    mask_value = 30.
    axis = 2

    _test_torchscript_functional(F.mask_along_axis, specgram, mask_param, mask_value, axis)

  def test_mask_along_axis_iid(self):

    specgrams = torch.randn(4, 2, 1025, 400)
    mask_param = 100
    mask_value = 30.
    axis = 2

    _test_torchscript_functional(F.mask_along_axis_iid, specgrams, mask_param, mask_value, axis)

  def test_torchscript_gain(self):
    tensor = torch.rand((1, 1000))
    gainDB = 2.0

    _test_torchscript_functional(F.gain, tensor, gainDB)

  def test_torchscript_dither(self):
    tensor = torch.rand((2, 1000))

    _test_torchscript_functional_shape(F.dither, tensor)
    _test_torchscript_functional_shape(F.dither, tensor, "RPDF")
    _test_torchscript_functional_shape(F.dither, tensor, "GPDF")
Exemple #31
0
 def test_Vad(self):
     filepath = common_utils.get_asset_path("vad-go-mono-32000.wav")
     waveform, sample_rate = torchaudio.load(filepath)
     self._assert_consistency(T.Vad(sample_rate=sample_rate), waveform)
Exemple #32
0
 def load_wav(self, file):
     waveform, sample_rate = torchaudio.load(self.wav_folder + file)
     if sample_rate != self.sr:
         raise AssertionError(f"expected sample rate {self.sr}, got {sample_rate}")
     return waveform, sample_rate
Exemple #33
0
def load_audio(path):
    waveform, sample_rate = torchaudio.load(path)
    return waveform
Exemple #34
0
f_info[0] : speaker ID
f_info[1] : session number
f_info[2] : sentence number
f_info[3] : extension
"""

print("===extract features to .npy===")
for gender in ["male", "female"]:
    SUBDIR_PATH = DIR_PATH / gender
    for f in SUBDIR_PATH.rglob('*.wav'):
        f_info = re.split('[_.]', f.name)
        if int(f_info[2]) > 30:
            continue

        waveform, sample_rate = torchaudio.load(SUBDIR_PATH / f_info[0] /
                                                f.name,
                                                normalization=True)

        feature = mfcc(waveform,
                       sample_rate,
                       winfunc=np.hamming,
                       numcep=hyper_params["numcep"],
                       nfilt=26,
                       nfft=512,
                       preemph=0.97)

        speaker = str(f_info[0])

        if speaker in train_speaker_list:
            f_path = FEATURE_PATH / "train" / f.name[:-4]
        elif speaker in val_speaker_list:
Exemple #35
0
from pase.models.frontend import wf_builder
pase = wf_builder('cfg/frontend/PASE+.cfg').eval()
pase.load_pretrained('FE_e199.ckpt', load_last=True, verbose=True)
pase.cuda()

import sys
wav_path = sys.argv[1]
out_path = sys.argv[2]

# Now we can forward waveforms as Torch tensors
import torch
import torchaudio
torchaudio.set_audio_backend('sox')

x, sr = torchaudio.load(wav_path)
x = x.view(-1).cuda()

x = x.view(1, 1, -1)

with torch.no_grad():
    pase.eval()

    # y size will be (1, 256, 625), which are 625 frames of 256 dims each
    y = pase(x)[0].transpose(0, 1)

torch.save(y.detach().cpu(), out_path)
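
# Follow-up sketch: loading the saved features back for downstream use
# (out_path as above; after the transpose each row is one 256-dim frame).
feats = torch.load(out_path)
print(feats.shape)  # e.g. torch.Size([625, 256]) for the example above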

#build dataset
dataset_path = "D:/songs_headphones/stereo-cut-16k"
n_files = 0
fs = 0
pad = nn.ConstantPad1d((0, 1), 0)
start_time = time.time()
for subdir, dirs, files in os.walk(dataset_path):

    for file in files:
        filepath = subdir + os.sep + file

        if filepath.endswith(".wav"):
            if n_files == 0:
                dataset, fs = torchaudio.load(filepath)
                dataset = pad(dataset)

                dataset = dataset / torch.max(dataset)
            elif n_files == 1:
                new_file, fs = torchaudio.load(filepath)
                new_file = pad(new_file)
                new_file = new_file / torch.max(new_file)

                dataset = torch.stack((dataset, new_file))
            else:
                new_file, fs = torchaudio.load(filepath)
                new_file = pad(new_file)
                new_file = new_file / torch.max(new_file)

                dataset = torch.cat((dataset, new_file.unsqueeze(0)))
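
# A more compact variant of the loop above (a sketch, not the original code):
# collect every padded, peak-normalized waveform in a list and stack once at the
# end. Like the original, this assumes all files share the same length after
# padding so they can be batched into a single tensor.
import os
import torch
import torch.nn as nn
import torchaudio

pad = nn.ConstantPad1d((0, 1), 0)
waveforms = []
for subdir, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.endswith(".wav"):
            waveform, _ = torchaudio.load(os.path.join(subdir, file))
            waveform = pad(waveform)
            waveforms.append(waveform / torch.max(waveform))

dataset = torch.stack(waveforms)  # (n_files, channels, samples)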
Exemple #37
0
                              padding=False,
                              num_padding=0)
        wav_file = "/d1/jbaik/ics-asr/temp/conan1-8k.wav"
        audio = transformer(wav_file)
    # test Spectrogram
    elif test == 2:
        import matplotlib
        matplotlib.use('TkAgg')
        matplotlib.interactive(True)
        import matplotlib.pyplot as plt

        nperseg = int(p.SAMPLE_RATE * p.WINDOW_SIZE)
        noverlap = int(p.SAMPLE_RATE * (p.WINDOW_SIZE - p.WINDOW_SHIFT))

        wav_file = Path("../data/aspire/000/fe_03_00047-A-025005-025135.wav")
        audio, _ = torchaudio.load(wav_file)

        # pyplot specgram
        audio = torch.squeeze(audio)
        fig = plt.figure(0)
        plt.specgram(audio,
                     Fs=p.SAMPLE_RATE,
                     NFFT=p.NFFT,
                     noverlap=noverlap,
                     cmap='plasma')

        # implemented transformer - scipy stft
        transformer = Spectrogram(sample_rate=p.SAMPLE_RATE,
                                  window_stride=p.WINDOW_SHIFT,
                                  window_size=p.WINDOW_SIZE,
                                  nfft=p.NFFT)
    def read_audio_file(path, src_dir, side, sample_rate, window_size,
                        window_stride, window, normalize_audio,
                        truncate=None):
        """
        Args:
            path (str): location of a src file containing audio paths.
            src_dir (str): location of source audio files.
            side (str): 'src' or 'tgt'.
            sample_rate (int): sample_rate.
            window_size (float) : window size for spectrogram in seconds.
            window_stride (float): window stride for spectrogram in seconds.
            window (str): window type for spectrogram generation.
            normalize_audio (bool): subtract spectrogram by mean and divide
                by std or not.
            truncate (int): maximum audio length (0 or None for unlimited).

        Yields:
            a dictionary containing audio data for each line.
        """
        assert (src_dir is not None) and os.path.exists(src_dir),\
            "src_dir must be a valid directory if data_type is audio"

        global torchaudio, librosa, np
        import torchaudio
        import librosa
        import numpy as np

        with codecs.open(path, "r", "utf-8") as corpus_file:
            index = 0
            for line in corpus_file:
                audio_path = os.path.join(src_dir, line.strip())
                if not os.path.exists(audio_path):
                    audio_path = line.strip()

                assert os.path.exists(audio_path), \
                    'audio path %s not found' % (line.strip())

                sound, sample_rate_ = torchaudio.load(audio_path)
                if truncate and truncate > 0:
                    if sound.size(0) > truncate:
                        continue

                assert sample_rate_ == sample_rate, \
                    'Sample rate of %s != -sample_rate (%d vs %d)' \
                    % (audio_path, sample_rate_, sample_rate)

                sound = sound.numpy()
                if len(sound.shape) > 1:
                    if sound.shape[1] == 1:
                        sound = sound.squeeze()
                    else:
                        sound = sound.mean(axis=1)  # average multiple channels

                n_fft = int(sample_rate * window_size)
                win_length = n_fft
                hop_length = int(sample_rate * window_stride)
                # STFT
                d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                                 win_length=win_length, window=window)
                spect, _ = librosa.magphase(d)
                spect = np.log1p(spect)
                spect = torch.FloatTensor(spect)
                if normalize_audio:
                    mean = spect.mean()
                    std = spect.std()
                    spect.add_(-mean)
                    spect.div_(std)

                example_dict = {side: spect,
                                side + '_path': line.strip(),
                                'indices': index}
                index += 1

                yield example_dict
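
    # Hypothetical usage sketch (illustrative paths and values only): iterate the
    # generator and collect the normalized log-magnitude spectrograms.
    #
    #     examples = read_audio_file(
    #         path="data/src-train.txt",   # one audio file name per line
    #         src_dir="data/audio",
    #         side="src",
    #         sample_rate=16000,
    #         window_size=0.02,
    #         window_stride=0.01,
    #         window="hamming",
    #         normalize_audio=True,
    #         truncate=None,
    #     )
    #     for ex in examples:
    #         spect = ex["src"]            # (n_fft // 2 + 1, n_frames)
    #         print(ex["src_path"], spect.shape)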