Example #1
def check_dependencies(segment_words: bool = False):
    if not is_module_available('pandas'):
        raise ImportError(
            "GALE Mandarin data preparation requires the 'pandas' package to be installed. "
            "Please install it with 'pip install pandas' and try again.")

    if segment_words and not is_module_available('jieba'):
        raise ImportError(
            "The '--segment-words' option requires the 'jieba' package to be installed. "
            "Please install it with 'pip install jieba' and try again.")
Example #2
    def __init__(
        self,
        pattern: str,
        maxcount: int = 100000,
        maxsize: float = 3e9,
        post: Optional[Callable] = None,
        start_shard: int = 0,
        **kw,
    ):
        """Create a ShardWriter.

        :param pattern: output file pattern
        :param maxcount: maximum number of records per shard (Default value = 100000)
        :param maxsize: maximum size of each shard (Default value = 3e9)
        :param post: optional callable invoked with the shard filename once a shard is finished
        :param start_shard: index of the first shard (Default value = 0)
        :param kw: other options passed to TarWriter
        """
        if not is_module_available("webdataset"):
            raise ImportError("Please 'pip install webdataset' first.")

        self.verbose = 1
        self.kw = kw
        self.maxcount = maxcount
        self.maxsize = maxsize
        self.post = post

        self.tarstream = None
        self.shard = start_shard
        self.pattern = pattern
        self.total = 0
        self.count = 0
        self.size = 0
        self.fname = None
        self.next_stream()
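A minimal usage sketch for the writer above; the write() and close() calls assume this class keeps the same interface as webdataset's ShardWriter, which it mirrors, and the shard pattern and sample dict are hypothetical:

writer = ShardWriter("shards/shard-%06d.tar", maxcount=1000)
for key, audio_bytes in samples:  # 'samples' is a hypothetical iterable of (str, bytes) pairs
    writer.write({"__key__": key, "flac": audio_bytes})
writer.close()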
Example #3
    def __init__(
        self,
        path_or_url: Pathlike,
        shard_size: Optional[int] = None,
        audio_format: str = "flac",
        load_audio: bool = True,
        load_features: bool = True,
        load_custom: bool = True,
        fault_tolerant: bool = True,
    ) -> None:
        if not is_module_available("webdataset"):
            raise ImportError("Please 'pip install webdataset' first.")

        from webdataset import TarWriter

        self.path_or_url = path_or_url
        self.shard_size = shard_size
        self.audio_format = audio_format
        self.load_audio = load_audio
        self.load_features = load_features
        self.load_custom = load_custom
        self.fault_tolerant = fault_tolerant

        if self.shard_size is not None:
            assert self.shard_size > 0
            # Note: this ShardWriter is not from webdataset, but defined below in this file.
            self.writer_init_fn = partial(
                ShardWriter, self.path_or_url, maxcount=self.shard_size
            )
        else:
            self.writer_init_fn = partial(TarWriter, self.path_or_url)

        self.writer = None
        self.num_shards_written = None
        self.finished = None
Example #4
def get_duration(
    path: Pathlike,
) -> float:
    """
    Read an audio file and return its duration. Supports both regular file paths and Kaldi-style pipe inputs.

    :param path: Path to an audio file or a Kaldi-style pipe.
    :return: float duration of the recording, in seconds.
    """
    path = str(path)
    if path.strip().endswith("|"):
        if not is_module_available("kaldi_native_io"):
            raise ValueError(
                "To read Kaldi's data dir where wav.scp has 'pipe' inputs, "
                "please 'pip install kaldi_native_io' first."
            )
        import kaldi_native_io

        wave = kaldi_native_io.read_wave(path)
        assert wave.data.shape[0] == 1, f"Expect 1 channel. Given {wave.data.shape[0]}"

        return wave.duration
    try:
        # Try to parse the file using pysoundfile first.
        import soundfile

        info = soundfile.info(path)
    except Exception:
        # Try to parse the file using audioread as a fallback.
        info = audioread_info(path)
    return info.duration
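A quick usage sketch (both paths are hypothetical; the trailing '|' marks a Kaldi-style pipe, which requires kaldi_native_io):

print(get_duration("audio/utt1.wav"))           # a regular audio file
print(get_duration("sox utt1.sph -t wav - |"))  # a Kaldi-style pipe input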
Example #5
    def __init__(self, source: Union[Pathlike, Sequence[Pathlike]],
                 **wds_kwargs) -> None:
        if not is_module_available("webdataset"):
            raise ImportError("Please 'pip install webdataset' first.")

        self.source = source
        self.wds_kwargs = wds_kwargs
Example #6
def get_duration(path: Pathlike) -> float:
    """
    Read an audio file and return its duration. Supports both regular file paths and Kaldi-style pipe inputs.

    :param path: Path to an audio file or a Kaldi-style pipe.
    :return: float duration of the recording, in seconds.
    """
    path = str(path)
    if path.strip().endswith("|"):
        if not is_module_available("kaldiio"):
            raise ValueError(
                "To read Kaldi's data dir where wav.scp has 'pipe' inputs, "
                "please 'pip install kaldiio' first.")
        from kaldiio import load_mat

        # Note: kaldiio.load_mat returns
        # (sampling_rate: int, samples: 1-D np.array[int])
        sampling_rate, samples = load_mat(path)
        assert len(samples.shape) == 1
        duration = samples.shape[0] / sampling_rate
        return duration
    try:
        # Try to parse the file using pysoundfile first.
        import soundfile

        info = soundfile.info(path)
    except Exception:
        # Try to parse the file using audioread as a fallback.
        info = audioread_info(path)
    return info.duration
Example #7
    def __init__(self, config: Optional[Any] = None):
        super().__init__(config=config)
        assert is_module_available(
            "opensmile"
        ), 'To use opensmile extractors, please "pip install opensmile" first.'
        import opensmile

        if isinstance(self.config.feature_set, str):
            self.feature_set = opensmile.FeatureSet[self.config.feature_set]
        else:
            self.feature_set = self.config.feature_set
        self.feature_level = opensmile.FeatureLevel(self.config.feature_level)
        self.smileExtractor = opensmile.Smile(
            feature_set=self.feature_set,
            feature_level=self.feature_level,
            sampling_rate=self.config.sampling_rate,
            options=self.config.options,
            loglevel=self.config.loglevel,
            logfile=self.config.logfile,
            channels=self.config.channels,
            mixdown=self.config.mixdown,
            resample=self.config.resample,
            num_workers=self.config.num_workers,
            verbose=self.config.verbose,
        )
Example #8
def make_supervisions(xml_path: str, mer_thresh: Optional[int]) -> List[SupervisionSegment]:
    if not is_module_available("bs4"):
        raise ValueError(
            "To prepare MGB2 data, please 'pip install beautifulsoup4' first."
        )
    from bs4 import BeautifulSoup

    with open(xml_path, "r") as xml_handle:
        soup = BeautifulSoup(xml_handle, "xml")
    return [
        SupervisionSegment(
            id=segment["id"] + "_" + segment["starttime"] + ":" + segment["endtime"],
            recording_id=segment["id"].split("_utt")[0].replace("_", "-"),
            start=float(segment["starttime"]),
            duration=round(
                float(segment["endtime"]) - float(segment["starttime"]), ndigits=8
            ),
            channel=0,
            text=" ".join(
                [
                    element.string
                    for element in segment.find_all("element")
                    if element.string is not None
                ]
            ),
            language="Arabic",
            speaker=int(match(r"\w+speaker(\d+)\w+", segment["who"]).group(1)),
        )
        for segment in soup.find_all("segment")
        if mer_thresh is None or float(segment["WMER"]) <= mer_thresh
    ]
Example #9
    def __init__(self, storage_path: Pathlike, *args, **kwargs):
        if not is_module_available('kaldiio'):
            raise ValueError(
                "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
        import kaldiio

        super().__init__()
        self.storage_path = storage_path
        self.storage = kaldiio.load_scp(str(self.storage_path))
Example #10
def mini_webdataset(
    urls: Union[Pathlike, Sequence[Pathlike]],
    epoch: int = 0,
    shuffle_shards: bool = False,
    split_by_worker: bool = True,
    split_by_node: bool = False,
    ignore_error_shards: bool = True,
):
    """
    Return a pipeline for WebDataset-style data files.

    This is a convenience function for constructing a partial pipeline
    that reads from a set of sharded tar files, extracts the individual
    files, and groups them together into samples (dictionaries).

    You can use all the methods from `Composable` (`then`, `compose`) and
    from `Shorthands` (`batched`, `unbatched`, `decode`, `shuffle`, etc.)
    on the result.

    .. note:: This is a reduced version of the ``webdataset.WebDataset`` function
        that only uses the functionality relevant to Lhotse and makes it
        possible to disable the node/worker splitting.

    :param urls: the source URLs: a string or a list.
    :param epoch: epoch number (used only when ``shuffle_shards`` is enabled).
    :param shuffle_shards: shuffle the shards if True.
        Only takes effect when ``urls`` is a list of shard paths/urls.
    :param split_by_worker: if True, shards are split per DataLoader worker subprocess,
        otherwise each dataloader worker will yield the same data.
        Only takes effect when ``urls`` is a list of shard paths/urls.
    :param split_by_node: if True, shards are split per node in DDP training,
        otherwise on each node we'll yield the same data.
        Only takes effect when ``urls`` is a list of shard paths/urls.
    :param ignore_error_shards: when ``True``, we tell WebDataset to ignore shards that
        failed during loading and emit a warning. When ``False``, we won't catch the exceptions.
    """
    if not is_module_available("webdataset"):
        raise ImportError("Please 'pip install webdataset' first.")

    from webdataset import DataPipeline, SimpleShardList, reraise_exception
    from webdataset import split_by_node as split_by_node_
    from webdataset import split_by_worker as split_by_worker_
    from webdataset import tarfile_to_samples, warn_and_continue

    wds = DataPipeline(SimpleShardList(urls=urls))
    if split_by_node:
        wds.append(split_by_node_)
    if split_by_worker:
        wds.append(split_by_worker_)
    if shuffle_shards:
        wds.append(create_shard_shuffler(epoch=epoch))
    wds.append(
        tarfile_to_samples(
            handler=warn_and_continue if ignore_error_shards else reraise_exception,
        )
    )
    return wds
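A usage sketch for the pipeline above (the shard paths are hypothetical; each yielded sample is a dict grouping the tar members that share a key):

shards = [f"shards/shard-{i:06d}.tar" for i in range(4)]
wds = mini_webdataset(shards, epoch=0, shuffle_shards=True)
for sample in wds:
    print(sample["__key__"])
    break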
Example #11
def prepare_single_commonvoice_tsv(
    lang: str,
    part: str,
    output_dir: Pathlike,
    lang_path: Pathlike,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Prepares part of CommonVoice data from a single TSV file.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param output_dir: path to directory where we will store the manifests.
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-7.0-2021-07-21/pl").
    :return: a tuple of (RecordingSet, SupervisionSet) objects opened in lazy mode,
        as CommonVoice manifests may be fairly large in memory.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    import pandas as pd

    lang_path = Path(lang_path)
    output_dir = Path(output_dir)
    tsv_path = lang_path / f"{part}.tsv"

    # Read the metadata
    df = pd.read_csv(tsv_path, sep="\t")
    # Scan all the audio files
    with RecordingSet.open_writer(
            output_dir / f"cv_recordings_{lang}_{part}.jsonl.gz",
            overwrite=False,
    ) as recs_writer, SupervisionSet.open_writer(
            output_dir / f"cv_supervisions_{lang}_{part}.jsonl.gz",
            overwrite=False,
    ) as sups_writer:
        for idx, row in tqdm(
                df.iterrows(),
                desc="Processing audio files",
                total=len(df),
        ):
            try:
                result = parse_utterance(row, lang_path, lang)
                if result is None:
                    continue
                recording, segment = result
                validate_recordings_and_supervisions(recording, segment)
                recs_writer.write(recording)
                sups_writer.write(segment)
            except Exception as e:
                logging.error(
                    f"Error when processing TSV file: line no. {idx}: '{row}'.\n"
                    f"Original error type: '{type(e)}' and message: {e}")
                continue
    recordings = RecordingSet.from_jsonl_lazy(recs_writer.path)
    supervisions = SupervisionSet.from_jsonl_lazy(sups_writer.path)
    return recordings, supervisions
Example #12
    def featuresets_names():
        """
        Returns a list of strings with the names of pretrained FeatureSets available in opensmile.
        """
        assert is_module_available(
            "opensmile"
        ), 'To use opensmile extractors, please "pip install opensmile" first.'
        import opensmile

        return list(opensmile.FeatureSet.__members__)
Example #13
    def __init__(self, storage_path: Pathlike, *args, **kwargs):
        if not is_module_available("kaldi_native_io"):
            raise ValueError(
                "To read Kaldi feats.scp, please 'pip install kaldi_native_io' first."
            )
        import kaldi_native_io

        super().__init__()
        self.storage_path = storage_path
        self.storage = kaldi_native_io.RandomAccessFloatMatrixReader(
            f"scp:{self.storage_path}")
Example #14
def open_best(path: Pathlike, mode: str = "r"):
    if is_module_available("smart_open"):
        from smart_open import smart_open

        # This will work with JSONL anywhere that smart_open supports, e.g. cloud storage.
        open_fn = smart_open
    else:
        compressed = str(path).endswith(".gz")
        if compressed and "t" not in mode and "b" not in mode:
            # Opening as bytes was not requested explicitly; use "t" so gzip handles unicode.
            mode = mode + "t"
        open_fn = gzip_open_robust if compressed else open

    return open_fn(path, mode)
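For example, reading a possibly gzipped JSONL manifest line by line (the filename and the handle_line helper are hypothetical):

with open_best("cuts.jsonl.gz") as f:
    for line in f:
        handle_line(line)  # hypothetical per-line processing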
Example #15
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available('speechcolab'):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts, output_dir=output_dir, suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = []
            for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False):
                futures.append(ex.submit(parse_utterance, audio, gigaspeech.root_path))

            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segments = result
                recordings.append(recording)
                supervisions += segments

            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }

            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')

    return dict(manifests)
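A usage sketch, assuming the corpus was already downloaded with speechcolab (the root path is hypothetical):

from speechcolab.datasets.gigaspeech import GigaSpeech

gigaspeech = GigaSpeech("/path/to/GigaSpeech")
manifests = prepare_gigaspeech(
    gigaspeech, dataset_parts="auto", output_dir="data/manifests", num_jobs=4
)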
Example #16
    def __init__(
        self,
        storage_path: Pathlike,
        compression_method: Optional[int] = None,
        *args,
        **kwargs,
    ):
        if not is_module_available("kaldiio"):
            raise ValueError(
                "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
        import kaldiio

        super().__init__()
        self.storage_dir = Path(storage_path)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.storage_path_ = str(self.storage_dir / "feats.scp")
        self.storage = kaldiio.WriteHelper(
            f"ark,scp:{self.storage_dir}/feats.ark,{self.storage_dir}/feats.scp",
            compression_method=compression_method,
        )
Example #17
    def __init__(
        self,
        storage_path: Pathlike,
        compression_method: int = 1,
        *args,
        **kwargs,
    ):
        if not is_module_available("kaldi_native_io"):
            raise ValueError(
                "To read Kaldi feats.scp, please 'pip install kaldi_native_io' first."
            )
        import kaldi_native_io

        super().__init__()
        self.storage_dir = Path(storage_path)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.storage_path_ = str(self.storage_dir / "feats.scp")
        self.storage = kaldi_native_io.CompressedMatrixWriter(
            f"ark,scp:{self.storage_dir}/feats.ark,{self.storage_dir}/feats.scp"
        )
        self.compression_method = kaldi_native_io.CompressionMethod(compression_method)
Example #18
def dereverb_wpe_torch(
    audio: torch.Tensor,
    n_fft: int = 512,
    hop_length: int = 128,
    taps: int = 10,
    delay: int = 3,
    iterations: int = 3,
    statistics_mode: str = "full",
) -> torch.Tensor:
    if not is_module_available("nara_wpe"):
        raise ImportError(
            "Please install nara_wpe first using 'pip install git+https://github.com/fgnt/nara_wpe' "
            "(at the time of writing, only GitHub version has a PyTorch implementation)."
        )

    from nara_wpe.torch_wpe import wpe_v6

    assert audio.ndim == 2

    window = torch.blackman_window(n_fft)
    Y = torch.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        return_complex=True,
        window=window,
    )
    Y = Y.permute(1, 0, 2)
    Z = wpe_v6(
        Y,
        taps=taps,
        delay=delay,
        iterations=iterations,
        statistics_mode=statistics_mode,
    )
    z = torch.istft(
        Z.permute(1, 0, 2), n_fft=n_fft, hop_length=hop_length, window=window
    )
    return z
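A minimal sketch of calling it on a dummy signal; per the assert above, the input shape is (channels, samples):

import torch

audio = torch.randn(2, 16000)  # 2 channels, 1 second at 16 kHz
dereverbed = dereverb_wpe_torch(audio, taps=10, delay=3)
print(dereverbed.shape)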
Example #19
def download_gigaspeech(
    password: str,
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
    host: Optional[str] = "tsinghua",
):
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )
    gigaspeech = GigaSpeech(target_dir)

    if dataset_parts == "auto":
        dataset_parts = ("XL", "DEV", "TEST")
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    for part in dataset_parts:
        logging.info(f"Downloading GigaSpeech part: {part}")
        gigaspeech.download(password, "{" + part + "}", host=host)
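Usage sketch; the password below is a placeholder for the credential obtained after signing the GigaSpeech dataset agreement:

download_gigaspeech(
    password="<YOUR_GIGASPEECH_PASSWORD>",
    target_dir="data/GigaSpeech",
    dataset_parts=["DEV", "TEST"],
    host="tsinghua",
)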
Example #20
def _parse_vtt(lines, noise):
    # Import regex for some special unicode handling that re has issues with
    if not is_module_available("regex"):
        raise ImportError(
            "The 'regex' package is not installed. Please 'pip install regex' first."
        )
    import regex as re2

    noise_pattern = re2.compile(r"\([^)]*\)", re2.UNICODE)
    apostrophe_pattern = re2.compile(r"(\w)'(\w)")
    html_tags = re2.compile(r"(&[^ ;]*;)|(</?[iu]>)")

    blocks = lines.split("\n\n")
    for i, b in enumerate(blocks, -1):
        if i > 0 and b.strip() != "":
            b_lines = b.split("\n")
            start, end = _parse_time_segment(b_lines[0])
            line = " ".join(b_lines[1:])
            line_new = line
            if line.strip("- ") != "":
                line_parts = noise_pattern.sub(noise, line_new)
                line_parts = apostrophe_pattern.sub(r"\1\u2019\2", line_parts)
                line_parts = html_tags.sub("", line_parts)
                line_parts_new = []
                for lp in line_parts.split(noise):
                    line_parts_new.append(
                        "".join(
                            [ch for ch in filter(_filter, lp.strip().replace("-", " "))]
                        )
                    )
                joiner = " " + noise + " "
                line_new = joiner.join(line_parts_new)
                line_new = re2.sub(
                    r"\p{Zs}", lambda m: _normalize_space(m.group(0)), line_new
                )
                line_new = re2.sub(r" +", " ", line_new).strip().lower()
            yield start, end, line_new
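A usage sketch for the generator above (the file path and noise token are hypothetical):

with open("subtitles.vtt") as f:
    for start, end, text in _parse_vtt(f.read(), noise="[noise]"):
        print(start, end, text)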
Example #21
def prepare_aishell4(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AISHELL-4 data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    global_spk_id = {}
    for part in ["train_L", "train_M", "train_S", "test"]:
        recordings = []
        supervisions = []
        wav_path = corpus_dir / part / "wav"
        for audio_path in wav_path.rglob("*.flac"):
            idx = audio_path.stem

            try:
                tg = textgrid.TextGrid.fromFile(
                    f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid")
            except ValueError:
                logging.warning(
                    f"{idx} has annotation issues. Skipping this recording.")
                continue

            recording = Recording.from_file(audio_path)
            recordings.append(recording)

            for tier in tg.tiers:
                local_spk_id = tier.name
                key = (idx, local_spk_id)
                if key not in global_spk_id:
                    global_spk_id[key] = f"SPK{len(global_spk_id)+1:04d}"
                spk_id = global_spk_id[key]
                for j, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{idx}-{spk_id}-{j}",
                            recording_id=idx,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part}.jsonl")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
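Usage sketch (the corpus path is hypothetical):

manifests = prepare_aishell4("/path/to/aishell4", output_dir="data/manifests")
train_recordings = manifests["train_L"]["recordings"]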
Example #22
def check_dependencies():
    if not is_module_available("pandas"):
        raise ImportError(
            "Gale Arabic data preparation requires the 'pandas' package to be installed. "
            "Please install it with 'pip install pandas' and try again")
Example #23
def prepare_commonvoice(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    languages: Union[str, Sequence[str]] = "auto",
    splits: Union[str, Sequence[str]] = COMMONVOICE_DEFAULT_SPLITS,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    This function expects the input directory structure of::

        >>> metadata_path = corpus_dir / language_code / "{train,dev,test}.tsv"
        >>> # e.g. pl_train_metadata_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/train.tsv"
        >>> audio_path = corpus_dir / language_code / "clips"
        >>> # e.g. pl_audio_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/clips"

    Returns a dict with 3-level structure (lang -> split -> manifest-type)::

        >>> {'en/fr/pl/...': {'train/dev/test': {'recordings/supervisions': manifest}}}

    :param corpus_dir: Pathlike, the path to the downloaded corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param languages: 'auto' (prepare all discovered data) or a list of language codes.
    :param splits: by default ``['train', 'dev', 'test']``, can also include
        ``'validated'``, ``'invalidated'``, and ``'other'``.
    :param num_jobs: How many concurrent workers to use for scanning of the audio files.
    :return: a dict with manifests for all specified languages and their train/dev/test splits.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    if num_jobs > 1:
        warnings.warn(
            "num_jobs>1 currently not supported for CommonVoice data prep;"
            "setting to 1.")

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert output_dir is not None, (
        "CommonVoice recipe requires to specify the output "
        "manifest directory (output_dir cannot be None).")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if languages == "auto":
        languages = set(COMMONVOICE_LANGS).intersection(
            path.name for path in corpus_dir.glob("*"))
        if not languages:
            raise ValueError(
                f"Could not find any of CommonVoice languages in: {corpus_dir}"
            )
    elif isinstance(languages, str):
        languages = [languages]

    manifests = {}

    for lang in tqdm(languages, desc="Processing CommonVoice languages"):
        logging.info(f"Language: {lang}")
        lang_path = corpus_dir / lang

        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # Pattern: "cv_recordings_en_train.jsonl.gz" / "cv_supervisions_en_train.jsonl.gz"
        lang_manifests = read_cv_manifests_if_cached(output_dir=output_dir,
                                                     language=lang)

        for part in splits:
            logging.info(f"Split: {part}")
            if part in lang_manifests:
                logging.info(
                    f"CommonVoice language: {lang} already prepared - skipping."
                )
                continue
            recording_set, supervision_set = prepare_single_commonvoice_tsv(
                lang=lang,
                part=part,
                output_dir=output_dir,
                lang_path=lang_path,
            )
            lang_manifests[part] = {
                "supervisions": supervision_set,
                "recordings": recording_set,
            }

        manifests[lang] = lang_manifests

    return manifests
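Usage sketch (the corpus path and language choice are hypothetical):

manifests = prepare_commonvoice(
    corpus_dir="/path/to/cv-corpus-7.0-2021-07-21",
    output_dir="data/manifests",
    languages=["pl"],
)
pl_train_recordings = manifests["pl"]["train"]["recordings"]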
Example #24
        arr = cuts[1].load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)


@pytest.mark.parametrize(
    "extractor_type",
    [
        Fbank,
        Mfcc,
        TorchaudioFbank,
        TorchaudioMfcc,
        pytest.param(
            KaldifeatFbank,
            marks=pytest.mark.skipif(
                not is_module_available("kaldifeat"),
                reason="Requires kaldifeat to run.",
            ),
        ),
        pytest.param(
            KaldifeatMfcc,
            marks=pytest.mark.skipif(
                not is_module_available("kaldifeat"),
                reason="Requires kaldifeat to run.",
            ),
        ),
        pytest.param(
            lambda: LibrosaFbank(LibrosaFbankConfig(sampling_rate=16000)),
            marks=[
                pytest.mark.skipif(
                    not is_module_available("librosa"),
Example #25
    assert (stats["norm_stds"] == read_stats["norm_stds"]).all()


@pytest.mark.parametrize(
    "storage_fn",
    [
        lambda: LilcomFilesWriter(TemporaryDirectory().name),
        lambda: LilcomHdf5Writer(NamedTemporaryFile().name),
        lambda: ChunkedLilcomHdf5Writer(NamedTemporaryFile().name),
        lambda: LilcomChunkyWriter(NamedTemporaryFile().name),
        lambda: NumpyFilesWriter(TemporaryDirectory().name),
        lambda: NumpyHdf5Writer(NamedTemporaryFile().name),
        pytest.param(
            lambda: KaldiWriter(TemporaryDirectory().name),
            marks=pytest.mark.skipif(
                not is_module_available("kaldiio"),
                reason="kaldiio must be installed for scp+ark feature writing",
            ),
        ),
    ],
)
def test_feature_set_builder(storage_fn):
    recordings: RecordingSet = RecordingSet.from_json(
        "test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with storage_fn() as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
        )
        feature_set = builder.process_and_store_recordings(
Example #26
    ('yaml', True),
    ('json', False),
    ('json', True),
    ('jsonl', False),
    ('jsonl', True),
])
def test_generic_serialization(manifests, manifest_type, format, compressed):
    manifest = manifests[manifest_type]
    with NamedTemporaryFile(suffix='.' + format +
                            ('.gz' if compressed else '')) as f:
        store_manifest(manifest, f.name)
        restored = load_manifest(f.name)
        assert manifest == restored


@pytest.mark.skipif(not is_module_available('pyarrow'),
                    reason='Requires pyarrow')
@pytest.mark.parametrize('manifest_type',
                         ['recording_set', 'supervision_set', 'cut_set'])
@pytest.mark.parametrize(['format', 'compressed'], [
    ('jsonl', False),
    ('jsonl', True),
])
def test_lazy_jsonl_deserialization(manifests, manifest_type, format,
                                    compressed):
    manifest = manifests[manifest_type]
    with NamedTemporaryFile(suffix='.' + format +
                            ('.gz' if compressed else '')) as f:
        store_manifest(manifest, f.name)
        lazy_manifest = type(manifest).from_jsonl_lazy(f.name)
        # Test iteration
Example #27
def download_librispeech(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "mini_librispeech",
    force_download: bool = False,
    alignments: bool = False,
    base_url: str = "http://www.openslr.org/resources",
    alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL,
) -> Path:
    """
    Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param alignments: whether to download the alignments. The original source is:
        https://github.com/CorentinJ/librispeech-alignments
    :param base_url: str, the url of the OpenSLR resources.
    :param alignments_url: str, the url of LibriSpeech word alignments
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    corpus_dir = target_dir / "LibriSpeech"
    target_dir.mkdir(parents=True, exist_ok=True)

    if dataset_parts == "librispeech":
        dataset_parts = LIBRISPEECH
    elif dataset_parts == "mini_librispeech":
        dataset_parts = MINI_LIBRISPEECH
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    for part in tqdm(dataset_parts, desc="Downloading LibriSpeech parts"):
        logging.info(f"Processing split: {part}")
        # Determine the valid URL for a given split.
        if part in LIBRISPEECH:
            url = f"{base_url}/12"
        elif part in MINI_LIBRISPEECH:
            url = f"{base_url}/31"
        else:
            logging.warning(f"Invalid dataset part name: {part}")
            continue
        # Split directory exists and seems valid? Skip this split.
        part_dir = corpus_dir / part
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skipping {part} because {completed_detector} exists.")
            continue
        # Maybe-download the archive.
        tar_name = f"{part}.tar.gz"
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        # Remove partial unpacked files, if any, and unpack everything.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()

    if alignments:
        completed_detector = target_dir / ".ali_completed"
        if completed_detector.is_file() and not force_download:
            return corpus_dir
        assert is_module_available(
            "gdown"
        ), 'To download LibriSpeech alignments, please run "pip install gdown" first.'
        import gdown

        ali_zip_path = str(target_dir / "LibriSpeech-Alignments.zip")
        gdown.download(alignments_url, output=ali_zip_path)
        with zipfile.ZipFile(ali_zip_path) as f:
            f.extractall(path=target_dir)
            completed_detector.touch()

    return corpus_dir
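Usage sketch; this downloads mini LibriSpeech into a hypothetical target directory and returns the extracted corpus path:

corpus_dir = download_librispeech(
    target_dir="data", dataset_parts="mini_librispeech", alignments=False
)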
Example #28
    :param data: the contents of ``manifest.custom`` field.
    :return: ``custom`` field dict with deserialized manifests (if any),
        or None when input is None.
    """
    if data is None:
        return None

    from lhotse.array import deserialize_array

    # If any of the values in the input are also dicts,
    # it indicates that it might be a serialized array manifest.
    # We'll try to deserialize it, and if there is an error,
    # we'll just leave it as it was.
    for key, value in data.items():
        if isinstance(value, dict):
            try:
                data[key] = deserialize_array(value)
            except Exception:
                pass

    return data


if is_module_available("orjson"):
    import orjson

    decode_json_line = orjson.loads
else:
    decode_json_line = json.loads
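A quick sketch of the resulting dispatch (the JSON line is hypothetical); callers get orjson's speed when it is available and the stdlib json otherwise:

record = decode_json_line('{"id": "utt1", "duration": 2.5}')
assert record["duration"] == 2.5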
Example #29
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet.
    In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [line.strip().split() for line in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]])
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(type='kaldiio',
                         num_frames=mat.shape[0],
                         num_features=mat.shape[1],
                         frame_shift=frame_shift,
                         sampling_rate=sampling_rate,
                         start=0,
                         duration=mat.shape[0] * frame_shift,
                         storage_type=KaldiReader.name,
                         storage_path=str(feats_scp),
                         storage_key=utt_id,
                         recording_id=supervision_set[utt_id].recording_id
                         if supervision_set is not None else utt_id,
                         channels=0)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")

    return recording_set, supervision_set, feature_set
Example #30
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.

    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisionSegment IDs with underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1]
                    if path_or_cmd.endswith("|")
                    else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [sup_string.strip().split() for sup_string in f]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate
                ),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in kaldi_native_io.SequentialMatrixShapeReader(
                    f"scp:{feats_scp}"
                )
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must be not None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set
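Usage sketch (the data dir path is hypothetical; frame_shift is only needed to import feats.scp):

recordings, supervisions, features = load_kaldi_data_dir(
    "data/train", sampling_rate=16000, frame_shift=0.01
)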