Code example #1
File: yesno.py  Project: glynpu/lhotse
def prepare_yesno(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = list(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60

    wave_files.sort()
    train_set = wave_files[::2]
    test_set = wave_files[1::2]

    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in zip(["train", "test"], [train_set, test_set]):
        recordings, supervisions = _prepare_dataset(dataset)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
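A minimal usage sketch for the function above. It assumes prepare_yesno is importable from lhotse.recipes, as in the upstream lhotse package; the corpus path is a hypothetical placeholder.

from lhotse.recipes import prepare_yesno

# Hypothetical location of the yesno corpus (60 wave files named like 0_1_1_0_1_0_1_1.wav).
manifests = prepare_yesno("/data/waves_yesno", output_dir="data/manifests")
# Each split holds 30 recordings and their supervisions.
train_recordings = manifests["train"]["recordings"]
train_supervisions = manifests["train"]["supervisions"]
print(len(train_recordings), len(train_supervisions))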
Code example #2
File: voxceleb.py  Project: glynpu/lhotse
def download_voxceleb1(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download and unzip the VoxCeleb1 data.

    .. note:: A "connection refused" error may occur if you are downloading without a password.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: the path to the directory with the downloaded and extracted data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    zip_name = "vox1_dev_wav.zip"
    zip_path = target_dir / zip_name
    if zip_path.exists() and not force_download:
        logging.info(f"Skipping {zip_name} because file exists.")
    else:
        # Download the data in parts
        for url in VOXCELEB1_PARTS_URL:
            # Save each part into ``target_dir`` so the glob below can find it.
            urlretrieve_progress(
                url,
                filename=target_dir / url.split("/")[-1],
                desc=f"Downloading VoxCeleb1 {url.split('/')[-1]}",
            )
        # Combine the parts for the dev set into a single archive at ``zip_path``.
        with open(zip_path, "wb") as outFile:
            for file in target_dir.glob("vox1_dev_wav_part*"):
                with open(file, "rb") as inFile:
                    shutil.copyfileobj(inFile, outFile)
    logging.info(f"Unzipping dev...")
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(target_dir)
    logging.info(f"Unzipping test...")
    with zipfile.ZipFile(target_dir / "vox1_test_wav.zip") as zf:
        zf.extractall(target_dir)

    return target_dir
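A usage sketch, assuming download_voxceleb1 is importable from lhotse.recipes as in the upstream package. The target directory is a placeholder, and vox1_test_wav.zip must already be present there, since the function only unzips the test archive.

from lhotse.recipes import download_voxceleb1

# Hypothetical storage location; the dev-set parts are downloaded and combined here.
corpus_dir = download_voxceleb1(target_dir="/data/voxceleb1", force_download=False)
print(f"VoxCeleb1 extracted under: {corpus_dir}")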
Code example #3
File: librispeech.py  Project: glynpu/lhotse
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of parallel workers used to parse utterances.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*"))
    elif dataset_parts == "auto":
        dataset_parts = (set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")))
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir)

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(
                    f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(part_path.rglob("*.trans.txt"),
                                   desc="Distributing tasks",
                                   leave=False):
                alignments = {}
                ali_path = trans_path.parent / (trans_path.stem.split(".")[0] +
                                                ".alignment.txt")
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line,
                                      alignments))

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set,
                                                 supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir /
                                        f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
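A usage sketch, assuming prepare_librispeech is importable from lhotse.recipes; the corpus path and the selected parts are placeholders.

from lhotse.recipes import prepare_librispeech

manifests = prepare_librispeech(
    corpus_dir="/data/LibriSpeech",             # hypothetical corpus root
    dataset_parts=["dev-clean", "test-clean"],  # or "auto" to scan the corpus dir
    output_dir="data/manifests",
    num_jobs=4,
)
dev_recordings = manifests["dev-clean"]["recordings"]
dev_supervisions = manifests["dev-clean"]["supervisions"]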
Code example #4
File: mls.py  Project: underdogliu/lhotse
def prepare_mls(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    opus: bool = True,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Prepare Multilingual LibriSpeech corpus.

    Returns a dict structured like the following:

    .. code-block:: python

        {
            'english': {
                'train': {'recordings': RecordingSet(...), 'supervisions': SupervisionSet(...)},
                'dev': ...,
                'test': ...
            },
            'polish': { ... },
            ...
        }

    :param corpus_dir: Path to the corpus root (directories with specific languages should be inside).
    :param output_dir: Optional path where the manifests should be stored.
    :param opus: Should we scan for OPUS files (otherwise we'll look for FLAC files).
    :param num_jobs: How many jobs should be used for creating recording manifests.
    :return: A dict with structure: ``d[language][split] = {recordings, supervisions}``.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir) if output_dir is not None else None
    assert corpus_dir.is_dir()

    languages = {
        d.name.split("_")[1]: d
        for d in corpus_dir.glob("mls_*")
        if d.is_dir() and "_lm_" not in d.name and (
            opus or not d.name.endswith("opus"))
    }
    logging.info(f"Found MLS languages: {list(languages)}")

    manifests = defaultdict(dict)
    for lang, lang_dir in tqdm(languages.items(),
                               desc="Langauges",
                               total=len(languages)):
        logging.info(f"Processing language: {lang}")

        # Read the speaker to gender mapping.
        spk2gender = {}
        for line in (lang_dir / "metainfo.txt").read_text().splitlines():
            spk, gender, *_ = line.split("|")
            spk2gender[spk.strip()] = gender.strip()

        for split in tqdm(["test", "dev", "train"], desc="Splits"):

            # If everything is ready, read it and skip it.
            recordings_path = (None if output_dir is None else output_dir /
                               f"recordings_{lang}_{split}.jsonl.gz")
            supervisions_path = (None if output_dir is None else output_dir /
                                 f"supervisions_{lang}_{split}.jsonl.gz")
            if (recordings_path is not None and recordings_path.is_file()
                    and supervisions_path is not None
                    and supervisions_path.is_file()):
                logging.info(f"Skipping - {lang}/{split} - already exists!")
                recordings = RecordingSet.from_file(recordings_path)
                supervisions = SupervisionSet.from_file(supervisions_path)
                manifests[lang][split] = {
                    "recordings": recordings,
                    "supervisions": supervisions,
                }
                continue

            # Create recordings manifest.
            split_dir = lang_dir / split
            recordings = RecordingSet.from_dir(
                path=split_dir,
                pattern="*.opus" if opus else "*.flac",
                num_jobs=num_jobs,
                force_opus_sampling_rate=16000,
            )

            # Create supervisions manifest.
            supervisions = []
            for line in (split_dir /
                         "transcripts.txt").read_text().splitlines():
                recording_id, text = line.split("\t")
                speaker = recording_id.split("_")[0]
                supervisions.append(
                    SupervisionSegment(
                        id=recording_id,
                        recording_id=recording_id,
                        text=text,
                        speaker=speaker,
                        gender=spk2gender[speaker],
                        start=0.0,
                        duration=recordings.duration(recording_id),
                        language=lang,
                    ))
            supervisions = SupervisionSet.from_segments(supervisions)

            # Fix any missing recordings/supervisions.
            recordings, supervisions = fix_manifests(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            # Save for return.
            manifests[lang][split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            # Optional storage on disk.
            if output_dir is not None:
                output_dir.mkdir(exist_ok=True, parents=True)
                recordings.to_jsonl(recordings_path)
                supervisions.to_jsonl(supervisions_path)

    return dict(manifests)
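A sketch of how the nested dict returned above could be consumed, assuming prepare_mls is importable from lhotse.recipes; the corpus path is a placeholder.

from lhotse.recipes import prepare_mls

manifests = prepare_mls(
    corpus_dir="/data/mls",        # hypothetical root containing the mls_* directories
    output_dir="data/manifests",
    opus=True,
    num_jobs=8,
)
for lang, splits in manifests.items():
    for split, m in splits.items():
        print(lang, split, len(m["recordings"]), len(m["supervisions"]))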
Code example #5
File: librispeech.py  Project: fanlu/lhotse
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of parallel workers used to parse utterances.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    if dataset_parts == 'auto':
        dataset_parts = (set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob('*')))
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                                   output_dir=output_dir)
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc='Dataset parts'):
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(part_path.rglob('*.txt'),
                                   desc='Distributing tasks',
                                   leave=False):
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line))

            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set,
                                                 supervision_set)

            if output_dir is not None:
                supervision_set.to_json(output_dir /
                                        f'supervisions_{part}.json')
                recording_set.to_json(output_dir / f'recordings_{part}.json')

            manifests[part] = {
                'recordings': recording_set,
                'supervisions': supervision_set
            }

    return dict(manifests)  # Convert to normal dict
Code example #6
def prepare_commonvoice(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    languages: Union[str, Sequence[str]] = "auto",
    splits: Union[str, Sequence[str]] = COMMONVOICE_DEFAULT_SPLITS,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    This function expects the input directory structure of::

        >>> metadata_path = corpus_dir / language_code / "{train,dev,test}.tsv"
        >>> # e.g. pl_train_metadata_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/train.tsv"
        >>> audio_path = corpus_dir / language_code / "clips"
        >>> # e.g. pl_audio_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/clips"

    Returns a dict with 3-level structure (lang -> split -> manifest-type)::

        >>> {'en/fr/pl/...': {'train/dev/test': {'recordings/supervisions': manifest}}}

    :param corpus_dir: Pathlike, the path to the downloaded corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param languages: 'auto' (prepare all discovered data) or a list of language codes.
    :param splits: by default ``['train', 'dev', 'test']``, can also include
        ``'validated'``, ``'invalidated'``, and ``'other'``.
    :param num_jobs: How many concurrent workers to use for scanning the audio files.
    :return: a dict with manifests for all specified languages and their train/dev/test splits.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    if num_jobs > 1:
        warnings.warn(
            "num_jobs>1 currently not supported for CommonVoice data prep;"
            "setting to 1.")

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert output_dir is not None, (
        "CommonVoice recipe requires to specify the output "
        "manifest directory (output_dir cannot be None).")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if languages == "auto":
        languages = set(COMMONVOICE_LANGS).intersection(
            path.name for path in corpus_dir.glob("*"))
        if not languages:
            raise ValueError(
                f"Could not find any of CommonVoice languages in: {corpus_dir}"
            )
    elif isinstance(languages, str):
        languages = [languages]

    manifests = {}

    for lang in tqdm(languages, desc="Processing CommonVoice languages"):
        logging.info(f"Language: {lang}")
        lang_path = corpus_dir / lang

        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # Pattern: "cv_recordings_en_train.jsonl.gz" / "cv_supervisions_en_train.jsonl.gz"
        lang_manifests = read_cv_manifests_if_cached(output_dir=output_dir,
                                                     language=lang)

        for part in splits:
            logging.info(f"Split: {part}")
            if part in lang_manifests:
                logging.info(
                    f"CommonVoice language: {lang}, split: {part} already prepared - skipping."
                )
                continue
            recording_set, supervision_set = prepare_single_commonvoice_tsv(
                lang=lang,
                part=part,
                output_dir=output_dir,
                lang_path=lang_path,
            )
            lang_manifests[part] = {
                "supervisions": supervision_set,
                "recordings": recording_set,
            }

        manifests[lang] = lang_manifests

    return manifests
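A usage sketch, assuming prepare_commonvoice is importable from lhotse.recipes; the CommonVoice root and the language list are placeholders.

from lhotse.recipes import prepare_commonvoice

manifests = prepare_commonvoice(
    corpus_dir="/data/cv-corpus-7.0-2021-07-21",  # hypothetical download path
    output_dir="data/manifests",
    languages=["pl", "en"],
    splits=["train", "dev", "test"],
)
pl_train_supervisions = manifests["pl"]["train"]["supervisions"]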
Code example #7
def prepare_hifitts(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the HiFiTTS dataset.

    :param corpus_dir: Path or str, the path to the downloaded corpus main directory.
    :param output_dir: Path or str, the path where to write the manifests.
    :param num_jobs: How many concurrent workers to use for preparing each dataset partition.
    :return: a dict with manifests for all the partitions
        (example query: ``manifests['92_clean_train']['recordings']``).
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = {}

    json_manifests = list(corpus_dir.glob("*.json"))
    dataset_partitions = [to_partition_id(p) for p in json_manifests]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_partitions,
                                             output_dir=output_dir,
                                             prefix="hifitts")

    with ProcessPoolExecutor(num_jobs) as ex:
        futures = []
        partition_ids = []
        for raw_manifest_path in json_manifests:
            speaker_id, _, clean_or_other, part = raw_manifest_path.stem.split(
                "_")
            partition_id = to_partition_id(raw_manifest_path)
            if manifests_exist(part=partition_id,
                               output_dir=output_dir,
                               prefix="hifitts"):
                logging.info(
                    f"HiFiTTS subset: {part} already prepared - skipping.")
                continue
            futures.append(
                ex.submit(
                    prepare_single_partition,
                    raw_manifest_path,
                    corpus_dir,
                    speaker_id,
                    clean_or_other,
                ))
            partition_ids.append(partition_id)

        # Iterate futures in submission order so each result is paired with the right partition_id.
        for future, partition_id in tqdm(
                zip(futures, partition_ids),
                desc="Preparing HiFiTTS parts",
                total=len(futures),
        ):
            recordings, supervisions = future.result()

            if output_dir is not None:
                supervisions.to_json(
                    output_dir / f"hifitts_supervisions_{partition_id}.json")
                recordings.to_json(output_dir /
                                   f"hifitts_recordings_{partition_id}.json")

            manifests[partition_id] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

    return manifests
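A usage sketch, assuming prepare_hifitts is importable from lhotse.recipes; the corpus path is a placeholder and the partition id comes from the docstring example above.

from lhotse.recipes import prepare_hifitts

manifests = prepare_hifitts(
    corpus_dir="/data/hi_fi_tts_v0",  # hypothetical corpus root with the per-speaker *.json manifests
    output_dir="data/manifests",
    num_jobs=2,
)
recordings = manifests["92_clean_train"]["recordings"]
supervisions = manifests["92_clean_train"]["supervisions"]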