Esempio n. 1
0
 def test_switch(self):
     """Switch to ``self.backend`` and verify torchaudio reflects the change.

     Checks that the reported backend matches ``self.backend`` (``None``
     included) and that ``torchaudio.load``/``save``/``info`` compare equal
     to the same-named attributes of ``self.backend_module``.
     """
     torchaudio.set_audio_backend(self.backend)
     active_backend = torchaudio.get_audio_backend()
     if self.backend is None:
         assert active_backend is None
     else:
         assert active_backend == self.backend
     # The three I/O entry points must be the ones the backend module exposes.
     for func_name in ("load", "save", "info"):
         assert getattr(torchaudio, func_name) == getattr(
             self.backend_module, func_name)
Esempio n. 2
0
def AudioBackendScope(new_backend):
    """Generator that runs its single ``yield`` with ``new_backend`` active.

    The backend reported by ``torchaudio.get_audio_backend()`` on entry is
    remembered and re-applied once the generator is resumed or closed.
    NOTE(review): presumably wrapped by ``contextlib.contextmanager`` where it
    is defined - the decorator is not visible here, confirm.

    Args:
        new_backend: value handed to ``torchaudio.set_audio_backend``.
    """
    saved_backend = torchaudio.get_audio_backend()
    try:
        torchaudio.set_audio_backend(new_backend)
        yield
    finally:
        # Executed whether the suspended body finished normally or raised.
        torchaudio.set_audio_backend(saved_backend)
Esempio n. 3
0
 def __getitem__(self, index):
     """Load the example at flat position ``index``.

     ``self.files`` and ``self.num_examples`` are walked in lockstep; the
     index is reduced by each file's example count until it falls inside a
     file. When ``self.length`` is set, ``self.length`` frames are read
     starting at ``self.stride * index`` and the result is right-padded to
     that length; otherwise the load is requested with no frame limit.

     Returns:
         The loaded tensor, or ``(tensor, file)`` when ``self.with_path`` is
         truthy. Falls through (returning ``None``) when ``index`` lies past
         the last example.

     Raises:
         RuntimeError: if ``self.sample_rate`` is set and differs from the
             sample rate reported for the file.
     """
     for (file, _), examples in zip(self.files, self.num_examples):
         # Not in this file: consume its examples and move on.
         if index >= examples:
             index -= examples
             continue

         if self.length is None:
             offset, num_frames = 0, 0
         else:
             offset, num_frames = self.stride * index, self.length

         # The keyword naming the start frame differs between backends, and
         # these two backends are passed -1 rather than 0 for "no limit".
         if torchaudio.get_audio_backend() in ['soundfile', 'sox_io']:
             load_kwargs = {
                 "frame_offset": offset,
                 "num_frames": num_frames or -1,
             }
         else:
             load_kwargs = {"offset": offset, "num_frames": num_frames}
         out, sr = torchaudio.load(str(file), **load_kwargs)

         if self.sample_rate is not None and sr != self.sample_rate:
             raise RuntimeError(
                 f"Expected {file} to have sample rate of "
                 f"{self.sample_rate}, but got {sr}")

         if num_frames:
             # A read near the end of the file may be short; pad on the right.
             out = F.pad(out, (0, num_frames - out.shape[-1]))

         return (out, file) if self.with_path else out
Esempio n. 4
0
    def _load_audio(self,
                    path: str,
                    start_time: float,
                    end_time: float,
                    sample_rate: int = 16000) -> [Tensor, int]:
        """Load one sentence from a full talk recording.

        This is the default loader; override it to customize how individual
        sentences are cut out of a talk file.

        Args:
            path (str): Path to audio file
            start_time (float): Time in seconds where the sentence starts
            end_time (float): Time in seconds where the sentence finishes
            sample_rate (int, optional): Rate used to convert the two times
                into frame indices (default 16000)

        Returns:
            Whatever ``torchaudio.load`` returns for the requested span
            (annotated as audio tensor and sample rate)
        """
        first_frame = int(float(start_time) * sample_rate)
        last_frame = int(float(end_time) * sample_rate)

        backend = torchaudio.get_audio_backend()
        # "sox" and the legacy soundfile interface name the start position
        # "offset"; every other backend is called with "frame_offset".
        uses_legacy_keyword = backend == "sox" or (
            backend == "soundfile"
            and torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE)
        offset_keyword = "offset" if uses_legacy_keyword else "frame_offset"

        load_kwargs = {
            offset_keyword: first_frame,
            "num_frames": last_frame - first_frame,
        }
        return torchaudio.load(path, **load_kwargs)
Esempio n. 5
0
    def __init__(self, root_dir, sr, duration=None, transform=None):
        """Index every ``.wav`` file found recursively under ``root_dir``.

        Populates ``self.paths`` (sorted), ``self.rates`` (one entry per file,
        taken from ``torchaudio.info``) and ``self.offsets``, a running total
        where ``offsets[i + 1] - offsets[i]`` is the number of items file ``i``
        contributes.

        Args:
            root_dir: Directory searched for ``**/*.wav``.
            sr: Stored as ``self.sr``; not otherwise used here.
            duration: Segment length, in the unit of ``n_frames / rate``
                (presumably seconds - confirm). When ``None`` every file
                counts as exactly one item.
            transform: Stored as ``self.transform``; not otherwise used here.

        Raises:
            RuntimeError: If ``duration`` is set and the active backend is not
                one of ``sox``, ``sox_io`` or ``soundfile``, because the frame
                count cannot be derived from ``si.length`` in that case.
        """
        self.sr = sr
        self.duration = duration
        self.transform = transform
        self.offsets = [0]
        self.rates = []

        self.paths = sorted(pathlib.Path(root_dir).glob('**/*.wav'))

        # The backend does not change while indexing, so look it up once.
        backend = torchaudio.get_audio_backend()

        for p in self.paths:
            si, _ = torchaudio.info(str(p))
            self.rates.append(si.rate)
            if self.duration is None:
                self.offsets.append(self.offsets[-1] + 1)
                continue
            # The sox backends are treated as reporting length across all
            # channels, soundfile as reporting it per channel.
            if backend in ('sox', 'sox_io'):
                n_frames = si.length // si.channels
            elif backend == 'soundfile':
                n_frames = si.length
            else:
                # Previously this fell through with ``n_frames`` unbound (or
                # left over from the previous file), giving a NameError or a
                # silently wrong segment count.
                raise RuntimeError(
                    f"Unsupported torchaudio backend {backend!r}: cannot "
                    f"determine the number of frames of {p}")
            n_segments = math.floor(n_frames / si.rate / self.duration)
            self.offsets.append(self.offsets[-1] + n_segments)
def read_audio(path: str, target_sr: int = 16000):
    """Load an audio file as a single-channel signal at ``target_sr``.

    Requires the ``soundfile`` backend to be active (checked with an assert).
    Inputs with more than one row along dimension 0 are averaged into one,
    the signal is resampled when the file's rate differs from ``target_sr``,
    and dimension 0 is squeezed away before returning.

    Args:
        path: Path handed to ``torchaudio.load``.
        target_sr: Sample rate of the returned signal.

    Returns:
        The loaded waveform after ``squeeze(0)``.
    """
    assert torchaudio.get_audio_backend() == 'soundfile'
    waveform, source_sr = torchaudio.load(path)

    # Down-mix by averaging when dimension 0 holds more than one entry.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if source_sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=source_sr,
                                                   new_freq=target_sr)
        waveform = resampler(waveform)
        source_sr = target_sr

    assert source_sr == target_sr
    return waveform.squeeze(0)
Esempio n. 7
0
def load_info(path: str) -> dict:
    """Load audio metadata

    Backend-independent wrapper around ``torchaudio.info``.

    Args:
        path: Path of filename
    Returns:
        Dict: Metadata with `samplerate`, `samples`, `channels` and
        `duration` (samples divided by samplerate)
    Raises:
        RuntimeError: if the active torchaudio backend is "sox"
    """
    # The deprecated backend is rejected before the file is touched.
    if torchaudio.get_audio_backend() == "sox":
        raise RuntimeError("Deprecated backend is not supported")

    stream_info = torchaudio.info(str(path))
    sample_rate = stream_info.sample_rate
    n_samples = stream_info.num_frames
    n_channels = stream_info.num_channels
    return {
        "samplerate": sample_rate,
        "samples": n_samples,
        "channels": n_channels,
        "duration": n_samples / sample_rate,
    }
Esempio n. 8
0
    def _load_audio(self,
                    path: str,
                    start_time: float,
                    end_time: float,
                    sample_rate: int = 16000) -> [Tensor, int]:
        """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality
        and load individual sentences from a full ted audio talk file.

        Args:
            path (str): Path to audio file
            start_time (float): Time in seconds where the sample sentence starts
            end_time (float): Time in seconds where the sample sentence finishes
            sample_rate (int, optional): Rate used to convert the two times
                into frame indices (default 16000)

        Returns:
            [Tensor, int]: Audio tensor representation and sample rate
        """
        start_time = int(float(start_time) * sample_rate)
        end_time = int(float(end_time) * sample_rate)
        if torchaudio.get_audio_backend() == "sox_io":
            # sox_io can seek, so only the requested span is decoded.
            return torchaudio.load(path,
                                   frame_offset=start_time,
                                   num_frames=end_time - start_time)
        # Other backends: decode the whole file and cut the span afterwards.
        # torchaudio.load returns a (waveform, sample_rate) pair, so the pair
        # has to be unpacked before slicing - indexing the tuple itself with
        # [:, a:b] raises TypeError.
        waveform, file_sample_rate = torchaudio.load(path)
        return waveform[:, start_time:end_time], file_sample_rate
Esempio n. 9
0
    # I/O Parameters
    parser.add_argument(
        "--seq-dur",
        type=float,
        default=5.0,
        help="Duration of <=0.0 will result in the full audio",
    )

    parser.add_argument("--batch-size", type=int, default=16)

    # parse_known_args tolerates unrecognized options; the leftovers are
    # discarded here.
    args, _ = parser.parse_known_args()

    # NOTE(review): `args.audio_backend` (and `args.save` below) are not
    # defined by the arguments visible in this fragment - presumably added to
    # the parser earlier; confirm.
    torchaudio.set_audio_backend(args.audio_backend)

    # load_datasets also returns an `args` object, which replaces the parsed one.
    train_dataset, valid_dataset, args = load_datasets(parser, args)
    print("Audio Backend: ", torchaudio.get_audio_backend())

    # Iterate over training dataset and compute statistics
    total_training_duration = 0
    for k in tqdm.tqdm(range(len(train_dataset))):
        x, y = train_dataset[k]
        # x.shape[1] is used as the sample count, i.e. this assumes x is laid
        # out as (channels, samples) - TODO confirm.
        total_training_duration += x.shape[1] / train_dataset.sample_rate
        if args.save:
            # NOTE(review): writes into a relative "test/" directory that
            # nothing in this fragment creates.
            torchaudio.save("test/" + str(k) + "x.wav", x.T,
                            train_dataset.sample_rate)
            torchaudio.save("test/" + str(k) + "y.wav", y.T,
                            train_dataset.sample_rate)

    # The accumulated duration is divided by 3600 to report hours.
    print("Total training duration (h): ", total_training_duration / 3600)
    print("Number of train samples: ", len(train_dataset))
    print("Number of validation samples: ", len(valid_dataset))
Esempio n. 10
0
def create_csv(orig_tsv_file,
               csv_file,
               data_folder,
               accented_letters=False,
               language="en"):
    """
    Creates the csv file given a list of wav files.

    Each line of the tsv file (header excluded) is turned into one csv row
    ``ID, duration, wav, spk_id, wrd``. Lines whose audio clip is missing or
    whose cleaned transcript has fewer than three words are skipped.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    csv_file : str
        Path of the csv file to write.
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.
    language : str, optional
        Language code selecting the language specific text cleaning
        ("en", "fr", "it", "rw", "ar" and "ga-IE" have dedicated rules).

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``orig_tsv_file`` does not exist.
    """

    # Check if the given files exists
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header. The context manager closes the handle, and
    # the encoding is pinned to match the utf-8 csv written below instead of
    # depending on the platform default.
    with open(orig_tsv_file, "r", encoding="utf-8") as tsv_f:
        loaded_csv = tsv_f.readlines()[1:]
    nb_samples = str(len(loaded_csv))

    msg = "Preparing CSV files for %s samples ..." % (str(nb_samples))
    logger.info(msg)

    # Adding some Prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [["ID", "duration", "wav", "spk_id", "wrd"]]

    # Setting torchaudio backend to sox-io (needed to read mp3 files).
    # The backend does not change while looping, so this is done once.
    if torchaudio.get_audio_backend() != "sox_io":
        logger.warning(
            "This recipe needs the sox-io backend of torchaudio")
        logger.warning("The torchaudio backend is changed to sox_io")
        torchaudio.set_audio_backend("sox_io")

    # Start processing lines
    total_duration = 0.0
    for line in tzip(loaded_csv):

        # tzip over a single iterable yields 1-tuples
        line = line[0]
        fields = line.split("\t")

        # Path is at indice 1 in Common Voice tsv files. And .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + fields[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = fields[0]
        snt_id = file_name

        # Reading the signal (to retrieve duration in seconds)
        if os.path.isfile(mp3_path):
            info = torchaudio.info(mp3_path)
        else:
            # Report the path that could not be found (the length of the
            # file name, which used to be logged, identifies nothing).
            msg = "\tError loading: %s" % (mp3_path)
            logger.info(msg)
            continue

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = fields[2]

        # Unicode Normalization
        words = unicode_normalisation(words)

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.

        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^’'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿéæœâçèàûî]+", " ",
                           words).upper()

        if language == "fr":
            # Replace J'y D'hui etc by J_ D_hui
            words = words.replace("'", " ")
            words = words.replace("’", " ")

        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()
        elif language == "ga-IE":
            # Irish lower() is complicated, but upper() is nondeterministic, so use lowercase
            words = re.sub("[^-A-Za-z'ÁÉÍÓÚáéíóú]+", " ", words)
            words = " ".join(map(_irish_lower, words.split(" ")))

        # Remove accents if specified
        if not accented_letters:
            words = strip_accents(words)
            words = words.replace("'", " ")
            words = words.replace("’", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.strip()

        # Remove too short sentences (or empty):
        if len(words.split(" ")) < 3:
            continue

        # Composition of the csv_line
        csv_line = [snt_id, str(duration), mp3_path, spk_id, str(words)]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines. newline="" lets the csv module control line
    # endings itself (otherwise blank rows appear on some platforms).
    with open(csv_file, mode="w", encoding="utf-8", newline="") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerows(csv_lines)

    # Final prints
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)


def _irish_lower(word):
    """Lowercase an Irish word, hyphenating a t/n prefix before a capital vowel."""
    has_prefixed_capital = (len(word) >= 2 and word[0] in "tn"
                            and word[1] in "AEIOUÁÉÍÓÚ")
    if has_prefixed_capital:
        return word[0] + "-" + word[1:].lower()
    return word.lower()
Esempio n. 11
0
import logging
import os
import sys
from typing import Iterable, List, Tuple

import numpy as np
import torch
import torchaudio
import torchlibrosa
from pathos.threading import ThreadPool
from torch import nn
from torchaudio.transforms import MFCC, Resample

# getLogger() without a name returns the root logger.
logger = logging.getLogger()

# Use sox_io backend if available
# (only switch when it is not already active and torchaudio lists it).
if (
    torchaudio.get_audio_backend() != "sox_io"
    and "sox_io" in torchaudio.list_audio_backends()
):
    torchaudio.set_audio_backend("sox_io")
    logger.debug("Set audio backend to sox_io")

# Required because as of 0.7.2 on OSX, torchaudio links its own OpenMP runtime in addition to pytorch
# This tells OpenMP not to crash when this happens.
# This statement needs the stdlib `sys` and `os` modules imported at the top
# of the module; without them it raises NameError at import time.
if sys.platform == "darwin":
    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


class AudioTooShortError(ValueError):
    """ValueError subclass for audio that is too short.

    Carries no extra state; the sites that raise it are outside this view.
    """

Esempio n. 12
0
 def __init__(self, backend):
     """Remember the requested backend and the one active at construction.

     Args:
         backend: stored as ``self.new_backend``; the value reported by
             ``torchaudio.get_audio_backend()`` at this moment is stored as
             ``self.previous_backend``.
     """
     self.new_backend, self.previous_backend = (
         backend,
         torchaudio.get_audio_backend(),
     )