Example #1
    def __init__(
        self,
        root: str,
        split: str,
        source_language: str,
        target_language: Optional[str] = None,
        version: int = 2,
    ) -> None:
        assert version in self.VERSIONS and split in self.SPLITS
        assert source_language is not None
        self.no_translation = target_language is None
        if not self.no_translation:
            assert "en" in {source_language, target_language}
            if source_language == "en":
                assert target_language in self.EN_XX_LANGUAGES[version]
            else:
                assert source_language in self.XX_EN_LANGUAGES[version]
        else:
            # Hack here so that we can get "split" column from CoVoST TSV.
            # Note that we use CoVoST train split for ASR which is an extension
            # to Common Voice train split.
            target_language = "de" if source_language == "en" else "en"

        self.root: Path = Path(root)

        cv_tsv_path = self.root / "validated.tsv"
        assert cv_tsv_path.is_file()

        covost_url = self.COVOST_URL_TEMPLATE.format(
            src_lang=source_language, tgt_lang=target_language
        )
        covost_archive = self.root / Path(covost_url).name
        if not covost_archive.is_file():
            download_url(covost_url, self.root.as_posix(), hash_value=None)
        extract_archive(covost_archive.as_posix())

        cv_tsv = load_df_from_tsv(cv_tsv_path)
        covost_tsv = load_df_from_tsv(
            self.root / Path(covost_url).name.replace(".tar.gz", "")
        )
        df = pd.merge(
            left=cv_tsv[["path", "sentence", "client_id"]],
            right=covost_tsv[["path", "translation", "split"]],
            how="inner",
            on="path",
        )
        if split == "train":
            df = df[(df["split"] == split) | (df["split"] == f"{split}_covost")]
        else:
            df = df[df["split"] == split]
        data = df.to_dict(orient="index").items()
        data = [v for k, v in sorted(data, key=lambda x: x[0])]
        self.data = []
        for e in data:
            try:
                path = self.root / "clips" / e["path"]
                _ = torchaudio.info(path.as_posix())
                self.data.append(e)
            except RuntimeError:
                pass
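A minimal usage sketch for the constructor above. The class name CoVoST is an assumption for illustration (the example shows only the method body), and root must already contain validated.tsv and the clips/ folder, since this variant downloads only the CoVoST translation TSV, not the Common Voice audio:

# Hypothetical usage; "CoVoST" is an assumed class name.
dataset = CoVoST(
    root="/data/common_voice/fr",   # must hold validated.tsv and clips/
    split="train",
    source_language="fr",
    target_language="en",           # omit for the ASR-only mode
    version=2,
)
print(len(dataset.data), "utterances with readable audio")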
Example #2
    def __init__(
            self, root, params, url=URL, download=False):

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.split(self._ext_archive)[0]
        base_folder = os.path.join(root, basename)
        
        self._wav_path = os.path.join(base_folder, 'wavs')
        self._mel_path = os.path.join(base_folder, 'mels')
        self._char_path = os.path.join(base_folder, 'chars')
        self._phone_path = os.path.join(base_folder, 'phones')
        self._metadata_path = os.path.join(base_folder, 'metadata.csv')

        if download:
            if not os.path.isdir(self._wav_path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive)

        if not os.path.isdir(self._mel_path):
            precompute_spectrograms(base_folder, params)

        if not os.path.isdir(self._char_path) or not os.path.isdir(self._phone_path):
            precompute_char_phone(base_folder)
            
        with open(self._metadata_path, "r") as metadata:
            walker = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
            self._walker = list(walker)
Example #3
def download(args):
    if args.subset in LANGUAGES:
        languages = [args.subset]
        years = YEARS
    else:
        languages = {
            "100k": LANGUAGES, "10k": LANGUAGES, "asr": ["original"]
        }.get(args.subset, None)
        years = {
            "100k": YEARS, "10k": [2019, 2020], "asr": YEARS
        }.get(args.subset, None)

    url_list = []
    for l in languages:
        for y in years:
            url_list.append(f"{DOWNLOAD_BASE_URL}/audios/{l}_{y}.tar")

    out_root = Path(args.root) / "raw_audios"
    out_root.mkdir(exist_ok=True, parents=True)
    print(f"{len(url_list)} files to download...")
    for url in tqdm(url_list):
        tar_path = out_root / Path(url).name
        download_url(url, out_root, Path(url).name)
        extract_archive(tar_path.as_posix())
        os.remove(tar_path)
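A hypothetical command-line driver for the download helper above; the attribute names mirror what the function reads (args.subset, args.root), and the module-level LANGUAGES, YEARS, and DOWNLOAD_BASE_URL constants are assumed to exist. Note that an unrecognized subset leaves languages and years as None, so the loops above would fail:

import argparse

# Hypothetical CLI wiring for download(); not part of the original example.
parser = argparse.ArgumentParser(description="Fetch raw audio archives")
parser.add_argument("--root", required=True, help="output directory")
parser.add_argument("--subset", default="asr",
                    help="a language code, '100k', '10k', or 'asr'")
download(parser.parse_args())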
Example #4
    def load_from_nemo(cls,
                       *,
                       nemo_filepath: str = None,
                       checkpoint_name: NemoCheckpoint = None):
        if checkpoint_name is not None:
            nemo_filepath = download_checkpoint(checkpoint_name)
        if nemo_filepath is None and checkpoint_name is None:
            raise ValueError(
                "Either nemo_filepath or checkpoint_name must be passed")

        with TemporaryDirectory() as extract_path:
            extract_path = Path(extract_path)
            extract_archive(str(nemo_filepath), extract_path)
            config_path = extract_path / "model_config.yaml"
            encoder_params, initial_vocab, preprocess_params = read_params_from_config(
                config_path)
            module = cls(
                initial_vocab_tokens=initial_vocab,
                **encoder_params,
                **preprocess_params,
                nemo_compat_vocab=True,
            )
            weights_path = extract_path / "model_weights.ckpt"
            load_quartznet_weights(module.encoder, module.decoder,
                                   weights_path)
        # Here we set it in eval mode, so it correctly works during inference
        # Supposing that the majority of applications will be either load a checkpoint
        # and directly run inference, or fine-tuning. Either way this will prevent a silent
        # bug (case 1) or will be ignored (case 2).
        module.eval()
        return module
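A usage sketch, assuming this is a classmethod on the QuartznetModule shown with its docstring in Example #23; NemoCheckpoint.QuartzNet5x5LS_En is the checkpoint name used in Example #20:

# Download a named checkpoint and build the module from it; the returned
# module is already in eval mode, ready for inference.
module = QuartznetModule.load_from_nemo(
    checkpoint_name=NemoCheckpoint.QuartzNet5x5LS_En)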
Example #5
    def __init__(self,
                 root: Union[str, Path],
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False) -> None:

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url,
                                 root,
                                 hash_value=checksum,
                                 hash_type="md5")
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        self._walker = sorted(
            str(p.stem) for p in Path(self._path).glob('*' + self._ext_audio))
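Most constructors on this page repeat the same guard: skip everything if the extracted folder exists, skip the fetch if the archive file exists, then extract. A standalone sketch of that idiom, using the same download_url and extract_archive helpers these examples already call:

import os

def fetch_and_extract(url, root, extracted_dir, checksum=None):
    # Hypothetical helper condensing the recurring download/extract pattern.
    archive = os.path.join(root, os.path.basename(url))
    if not os.path.isdir(extracted_dir):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=checksum, hash_type="md5")
        extract_archive(archive)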
Example #6
    def __init__(self, root, tsv=TSV, url=URL, download=False):

        languages = {
            "tatar": "tt",
            "english": "en",
            "german": "de",
            "french": "fr",
            "welsh": "cy",
            "breton": "br",
            "chuvash": "cv",
            "turkish": "tr",
            "kyrgyz": "ky",
            "irish": "ga-IE",
            "kabyle": "kab",
            "catalan": "ca",
            "taiwanese": "zh-TW",
            "slovenian": "sl",
            "italian": "it",
            "dutch": "nl",
            "hakha chin": "cnh",
            "esperanto": "eo",
            "estonian": "et",
            "persian": "fa",
            "basque": "eu",
            "spanish": "es",
            "chinese": "zh-CN",
            "mongolian": "mn",
            "sakha": "sah",
            "dhivehi": "dv",
            "kinyarwanda": "rw",
            "swedish": "sv-SE",
            "russian": "ru",
        }

        if url in languages:
            ext_archive = ".tar.gz"
            language = languages[url]

            base_url = (
                "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4"
                + ".s3.amazonaws.com/cv-corpus-3/"
            )
            url = base_url + language + ext_archive

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = root

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive)

        self._tsv = os.path.join(root, tsv)

        with open(self._tsv, "r") as tsv:
            walker = unicode_csv_reader(tsv, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)
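The walker above keeps the TSV header separately, so each row can be turned into a name-addressable record. A hypothetical __getitem__ built on it (the clips/ subfolder is an assumption, following the Common Voice layout):

    def __getitem__(self, n):
        # Zip the TSV row with the stored header to access fields by name.
        row = dict(zip(self._header, self._walker[n]))
        filename = os.path.join(self._path, "clips", row["path"])
        waveform, sample_rate = torchaudio.load(filename)
        return waveform, sample_rate, row["sentence"]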
Example #7
    def __init__(self,
                 root,
                 url=URL,
                 folder_in_archive=FOLDER_IN_ARCHIVE,
                 download=False):
        if url in [
                "speech_commands_v0.01",
                "speech_commands_v0.02",
        ]:
            base_url = "https://storage.googleapis.com/download.tensorflow.org/data/"
            ext_archive = ".tar.gz"

            url = os.path.join(base_url, url + ext_archive)

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.rsplit(".", 2)[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive, self._path)

        walker = walk_files(self._path, suffix=".wav", prefix=True)
        walker = filter(lambda w: HASH_DIVIDER in w and EXCEPT_FOLDER not in w,
                        walker)
        self._walker = list(walker)
Example #8
    def __init__(self,
                 root: str,
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False) -> None:

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.split(self._ext_archive)[0]
        folder_in_archive = os.path.join(basename, folder_in_archive)

        self._path = os.path.join(root, folder_in_archive)
        self._metadata_path = os.path.join(root, basename, 'metadata.csv')

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        with open(self._metadata_path, "r", newline='') as metadata:
            walker = unicode_csv_reader(metadata,
                                        delimiter="|",
                                        quoting=csv.QUOTE_NONE)
            self._walker = list(walker)
Example #9
    def __init__(self,
                 root: str,
                 release: str = "release1",
                 subset: str = None,
                 download: bool = False,
                 audio_ext=".sph") -> None:
        self._ext_audio = audio_ext
        if release in _RELEASE_CONFIGS.keys():
            folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"]
            url = _RELEASE_CONFIGS[release]["url"]
            subset = subset if subset else _RELEASE_CONFIGS[release]["subset"]
        else:
            raise RuntimeError(
                "The release {} does not match any of the supported tedlium releases {}"
                .format(
                    release,
                    _RELEASE_CONFIGS.keys(),
                ))
        if subset not in _RELEASE_CONFIGS[release]["supported_subsets"]:
            raise RuntimeError(
                "The subset {} does not match any of the supported tedlium subsets {}"
                .format(
                    subset,
                    _RELEASE_CONFIGS[release]["supported_subsets"],
                ))

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.split(".")[0]

        self._path = os.path.join(root, folder_in_archive,
                                  _RELEASE_CONFIGS[release]["data_path"])
        if subset in ["train", "dev", "test"]:
            self._path = os.path.join(self._path, subset)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _RELEASE_CONFIGS[release]["checksum"]
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        # Create list for all samples
        self._filelist = []
        stm_dir = os.path.join(self._path, "stm")
        for file in sorted(os.listdir(stm_dir)):
            if file.endswith(".stm"):
                stm_path = os.path.join(stm_dir, file)
                with open(stm_path) as f:
                    num_lines = len(f.readlines())
                    file = file.replace(".stm", "")
                    self._filelist.extend((file, line) for line in range(num_lines))
        # Create dict path for later read
        self._dict_path = os.path.join(root, folder_in_archive,
                                       _RELEASE_CONFIGS[release]["dict"])
        self._phoneme_dict = None
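self._phoneme_dict is initialized to None and only its path is stored, which suggests lazy loading. A hedged sketch of such an accessor, assuming the dictionary file holds one word followed by its phonemes per line:

    @property
    def phoneme_dict(self):
        # Parse the pronunciation dictionary on first access only.
        if self._phoneme_dict is None:
            self._phoneme_dict = {}
            with open(self._dict_path, encoding="utf-8") as f:
                for line in f:
                    parts = line.strip().split()
                    if parts:
                        self._phoneme_dict[parts[0]] = tuple(parts[1:])
        return self._phoneme_dict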
Example #10
    def __init__(self,
                 root: Union[str, Path],
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False) -> None:

        if url in [
            "aew",
            "ahw",
            "aup",
            "awb",
            "axb",
            "bdl",
            "clb",
            "eey",
            "fem",
            "gka",
            "jmk",
            "ksp",
            "ljm",
            "lnh",
            "rms",
            "rxr",
            "slp",
            "slt"
        ]:

            url = "cmu_us_" + url + "_arctic"
            ext_archive = ".tar.bz2"
            base_url = "http://www.festvox.org/cmu_arctic/packed/"

            url = os.path.join(base_url, url + ext_archive)

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        basename = os.path.basename(url)
        root = os.path.join(root, folder_in_archive)
        if not os.path.isdir(root):
            os.mkdir(root)
        archive = os.path.join(root, basename)

        basename = basename.split(".")[0]

        self._path = os.path.join(root, basename)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum, hash_type="md5")
                extract_archive(archive)

        self._text = os.path.join(self._path, self._folder_text, self._file_text)

        with open(self._text, "r") as text:
            walker = csv.reader(text, delimiter="\n")
            self._walker = list(walker)
Example #11
def sample_data():
    path = get_default_cache_folder()
    download_url(
        "https://github.com/scart97/lapsbm-backup/archive/refs/tags/lapsbm-ci.tar.gz",
        download_folder=path,
        resume=True,
    )
    extract_archive(path / "lapsbm-backup-lapsbm-ci.tar.gz", path)
    return path / "lapsbm-backup-lapsbm-ci"
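Because the download can resume rather than restart and the extraction simply overwrites the same folder, repeated calls are cheap, which makes this a natural session-scoped test fixture. A hypothetical pytest wrapper:

import pytest

@pytest.fixture(scope="session")
def lapsbm_root():
    # Fixture name is hypothetical; returns the extracted dataset folder.
    return sample_data()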
Example #12
    def download(self) -> None:
        """Download the dataset and extract the archive"""
        if self.check_integrity(self.target_directory):
            print("Dataset already downloaded and verified.")

        else:
            archive_path = os.path.join(self.root, FOLDER_IN_ARCHIVE + ".zip")

            download_url(self.url, self.root)
            extract_archive(archive_path, self.root)
Example #13
    def __init__(
        self,
        root: str,
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
        subset: Any = None,
    ) -> None:

        # super(GTZAN, self).__init__()
        self.root = root
        self.url = url
        self.folder_in_archive = folder_in_archive
        self.download = download
        self.subset = subset

        assert subset is None or subset in [
            "training", "validation", "testing"
        ], ("When `subset` not None, it must take a value from " +
            "{'training', 'validation', 'testing'}.")

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url,
                                 root,
                                 hash_value=checksum,
                                 hash_type="md5")
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        if self.subset is None:
            walker = walk_files(self._path,
                                suffix=self._ext_audio,
                                prefix=False,
                                remove_suffix=True)
            self._walker = list(walker)
        else:
            if self.subset == "training":
                self._walker = filtered_train
            elif self.subset == "validation":
                self._walker = filtered_valid
            elif self.subset == "testing":
                self._walker = filtered_test
Example #14
    def __init__(self,
                 root: str,
                 url: str = URL,
                 download: bool = False,
                 mic_id: str = "mic2") -> None:

        archive = os.path.join(root, "VCTK-Corpus-0.92.zip")

        self._path = os.path.join(root, "VCTK-Corpus-0.92")
        self._txt_dir = os.path.join(self._path, "txt")
        self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed")
        self._mic_id = mic_id

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url,
                                 root,
                                 hash_value=checksum,
                                 hash_type="md5")
                extract_archive(archive, self._path)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        # Extracting speaker IDs from the folder structure
        self._speaker_ids = sorted(os.listdir(self._txt_dir))
        self._sample_ids = []
        """
        Due to some insufficient data complexity in the 0.92 version of this dataset,
        we start traversing the audio folder structure in accordance with the text folder.
        As some of the audio files are missing of either ``mic_1`` or ``mic_2`` but the
        text is present for the same, we first check for the existence of the audio file
        before adding it to the ``sample_ids`` list.

        Once the ``audio_ids`` are loaded into memory we can quickly access the list for
        different parameters required by the user.
        """
        for speaker_id in self._speaker_ids:
            utterance_dir = os.path.join(self._txt_dir, speaker_id)
            for utterance_file in sorted(f for f in os.listdir(utterance_dir)
                                         if f.endswith(".txt")):
                utterance_id = os.path.splitext(utterance_file)[0]
                audio_path_mic = os.path.join(self._audio_dir, speaker_id,
                                              f"{utterance_id}_{mic_id}.flac")
                if speaker_id == "p280" and mic_id == "mic2":
                    break
                if speaker_id == "p362" and not os.path.isfile(audio_path_mic):
                    continue
                self._sample_ids.append(utterance_id.split("_"))
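A usage sketch, assuming the constructor above belongs to a torchaudio-style class named VCTK_092 (the name is not shown in the example):

# Hypothetical instantiation; mic_id selects one of the two microphones, and
# speakers p280 (mic2) and p362 are special-cased as in the loop above.
dataset = VCTK_092("/data", download=True, mic_id="mic2")
print(len(dataset._sample_ids), "(speaker, utterance) pairs")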
Example #15
    def __init__(
            self, root: str, split: str, source_language: str,
            target_language: Optional[str] = None, version: int = 2,
            download: bool = False
    ) -> None:
        assert version in self.VERSIONS and split in self.SPLITS
        assert source_language is not None
        self.no_translation = (target_language is None)
        if not self.no_translation:
            assert 'en' in {source_language, target_language}
            if source_language == 'en':
                assert target_language in self.EN_XX_LANGUAGES[version]
            else:
                assert source_language in self.XX_EN_LANGUAGES[version]
        else:
            # Hack here so that we can get "split" column from CoVoST TSV.
            # Note that we use CoVoST train split for ASR which is an extension
            # to Common Voice train split.
            target_language = 'de' if source_language == 'en' else 'en'

        self.root = os.path.join(root, 'raw')
        os.makedirs(self.root, exist_ok=True)

        cv_url = self.CV_URL_TEMPLATE.format(ver=self.CV_VERSION_ID[version],
                                             lang=source_language)
        cv_archive = os.path.join(self.root, os.path.basename(cv_url))
        if download:
            if not os.path.isfile(cv_archive):
                download_url(cv_url, self.root, hash_value=None)
            extract_archive(cv_archive)

        covost_url = self.COVOST_URL_TEMPLATE.format(src_lang=source_language,
                                                     tgt_lang=target_language)
        covost_archive = os.path.join(self.root, os.path.basename(covost_url))
        if download:
            if not os.path.isfile(covost_archive):
                download_url(covost_url, self.root, hash_value=None)
            extract_archive(covost_archive)

        cv_tsv = self.load_from_tsv(os.path.join(self.root, 'validated.tsv'))
        covost_tsv = self.load_from_tsv(
            os.path.join(self.root,
                         os.path.basename(covost_url).replace('.tar.gz', ''))
        )
        df = pd.merge(left=cv_tsv[['path', 'sentence', 'client_id']],
                      right=covost_tsv[['path', 'translation', 'split']],
                      how='inner', on='path')
        if split == 'train':
            df = df[(df['split'] == split) | (df['split'] == f'{split}_covost')]
        else:
            df = df[df['split'] == split]
        self.data = df.to_dict(orient='index').items()
        self.data = [v for k, v in sorted(self.data, key=lambda x: x[0])]
Example #16
    def __init__(
        self,
        root,
        url=URL,
        folder_in_archive=FOLDER_IN_ARCHIVE,
        download=False,
        transform=None,
        target_transform=None,
        return_dict=False,
    ):

        if not return_dict:
            warnings.warn(
                "In the next version, the item returned will be a dictionary. "
                "Please use `return_dict=True` to enable this behavior now, "
                "and suppress this warning.",
                DeprecationWarning,
            )

        if transform is not None or target_transform is not None:
            warnings.warn(
                "In the next version, transforms will not be part of the dataset. "
                "Please remove the option `transform=True` and "
                "`target_transform=True` to suppress this warning.",
                DeprecationWarning,
            )

        self.transform = transform
        self.target_transform = target_transform
        self.return_dict = return_dict

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        walker = walk_files(self._path,
                            suffix=self._ext_audio,
                            prefix=False,
                            remove_suffix=True)
        self._walker = list(walker)
Example #17
    def __init__(self,
                 root: str,
                 url: str,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False) -> None:

        if url in [
            "dev-clean",
            "dev-other",
            "test-clean",
            "test-other",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        ]:

            ext_archive = ".tar.gz"
            base_url = "http://www.openslr.org/resources/12/"

            url = os.path.join(base_url, url + ext_archive)

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.split(".")[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

            audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)
            for root, dirs, files in os.walk(self._path):
                for file in files:
                    if file.split('.')[-1] == self._ext_wav.split('.')[-1]:
                        file_audio = os.path.join(root, file)
                        waveform, _ = torchaudio.load(file_audio)
                        spec = audio_transforms(waveform)
                        # Save with the mel extension so the walker below finds it;
                        # saving with self._ext_wav would overwrite the original audio.
                        file_spec = os.path.join(root, file.split('.')[0] + self._ext_mel)
                        torch.save(spec, file_spec)

        walker = walk_files(
            self._path, suffix=self._ext_mel, prefix=False, remove_suffix=True
        )
        self._walker = list(walker)
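The one-time precompute above trades disk space for speed: __getitem__ can then torch.load a spectrogram instead of recomputing it. A hedged sketch, assuming LibriSpeech-style fileids of the form "speaker-chapter-utterance" and that self._ext_mel names a torch-serialized tensor:

    def __getitem__(self, n):
        fileid = self._walker[n]
        speaker_id, chapter_id, utterance_id = fileid.split("-")
        # Load the mel spectrogram precomputed during __init__.
        file_spec = os.path.join(self._path, speaker_id, chapter_id,
                                 fileid + self._ext_mel)
        return torch.load(file_spec)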
Example #18
    def __init__(self,
                 root: str,
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False,
                 downsample: bool = False,
                 transform: Any = None,
                 target_transform: Any = None) -> None:

        if downsample:
            warnings.warn(
                "In the next version, transforms will not be part of the dataset. "
                "Please use `downsample=False` to enable this behavior now, "
                "and suppress this warning.")

        if transform is not None or target_transform is not None:
            warnings.warn(
                "In the next version, transforms will not be part of the dataset. "
                "Please remove the option `transform=True` and "
                "`target_transform=True` to suppress this warning.")

        self.downsample = downsample
        self.transform = transform
        self.target_transform = target_transform

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url,
                                 root,
                                 hash_value=checksum,
                                 hash_type="md5")
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        walker = walk_files(self._path,
                            suffix=self._ext_audio,
                            prefix=False,
                            remove_suffix=True)
        walker = filter(lambda w: self._except_folder not in w, walker)
        self._walker = list(walker)
Example #19
    def _download(self) -> None:
        """Download the dataset and extract the archive"""
        archive_path = os.path.join(self.root, self.basename)
        print(self.basename)

        if self._check_integrity(self._path):
            print("Dataset already download and verified")

        else:
            checksum = _CHECKSUMS.get(self.url, None)

            download_url(self.url,
                         self.root,
                         hash_value=checksum,
                         hash_type="md5")
            extract_archive(archive_path, self._path)
Example #20
def test_can_load_weights():
    # Quartznet 5x5 is small (25mb), so it can be downloaded while testing.
    try:
        cfg = download_checkpoint(NemoCheckpoint.QuartzNet5x5LS_En)
        with TemporaryDirectory() as extract_path:
            extract_path = Path(extract_path)
            extract_archive(str(cfg), extract_path)
            config_path = extract_path / "model_config.yaml"
            encoder_params, initial_vocab, _ = read_params_from_config(config_path)
            encoder = Quartznet5(64, **encoder_params)
            decoder = Quartznet_decoder(len(initial_vocab) + 1)
            load_quartznet_weights(
                encoder, decoder, extract_path / "model_weights.ckpt"
            )
    except HTTPError:
        return
Example #21
    def __init__(
        self,
        root: Union[str, Path],
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
    ) -> None:

        if url in [
                "dev-clean",
                "dev-other",
                "test-clean",
                "test-other",
                "train-clean-100",
                "train-clean-360",
                "train-other-500",
        ]:

            ext_archive = ".tar.gz"
            base_url = "http://www.openslr.org/resources/60/"

            url = os.path.join(base_url, url + ext_archive)

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.split(".")[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        walker = walk_files(self._path,
                            suffix=self._ext_audio,
                            prefix=False,
                            remove_suffix=True)
        self._walker = list(walker)
Example #22
    def __init__(self,
                 root,
                 url=URL,
                 folder_in_archive=FOLDER_IN_ARCHIVE,
                 download=False,
                 preprocess=False):

        if url in [
                "dev-clean",
                "dev-other",
                "test-clean",
                "test-other",
                "train-clean-100",
                "train-clean-360",
                "train-other-500",
        ]:

            ext_archive = ".tar.gz"
            base_url = "http://www.openslr.org/resources/12/"

            url = os.path.join(base_url, url + ext_archive)

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.split(".")[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive)

        walker = walk_files(self._path,
                            suffix=self._ext_audio,
                            prefix=False,
                            remove_suffix=True)
        self._walker = list(walker)

        if preprocess:
            self.preprocess_embeddings(self._path, self._ext_audio,
                                       self._ext_embed)
Example #23
    def load_from_nemo(
            cls,
            *,
            nemo_filepath: str = None,
            checkpoint_name: NemoCheckpoint = None) -> "QuartznetModule":
        """Load from the original nemo checkpoint.

        Args:
            nemo_filepath : Path to local .nemo file.
            checkpoint_name : Name of checkpoint to be downloaded locally and loaded.

        Raises:
            ValueError: If neither nemo_filepath nor checkpoint_name is passed.

        Returns:
            The model loaded from the checkpoint
        """
        if checkpoint_name is not None:
            nemo_filepath = download_checkpoint(checkpoint_name)
        if nemo_filepath is None and checkpoint_name is None:
            raise ValueError(
                "Either nemo_filepath or checkpoint_name must be passed")

        with TemporaryDirectory() as extract_path:
            extract_path = Path(extract_path)
            extract_archive(str(nemo_filepath), extract_path)
            config_path = extract_path / "model_config.yaml"
            encoder_params, initial_vocab, preprocess_params = read_params_from_config(
                config_path)
            module = cls(
                initial_vocab_tokens=initial_vocab,
                **encoder_params,
                **preprocess_params,
                nemo_compat_vocab=True,
            )
            weights_path = extract_path / "model_weights.ckpt"
            load_quartznet_weights(module.encoder, module.decoder,
                                   weights_path)
        # Here we set it in eval mode, so it correctly works during inference
        # Supposing that the majority of applications will be either (1) load a checkpoint
        # and directly run inference, or (2) fine-tuning. Either way this will prevent a silent
        # bug (case 1) or will be ignored (case 2).
        module.eval()
        return module
Example #24
    def __init__(self,
                 root: Union[str, Path],
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False,
                 transform: Any = None,
                 target_transform: Any = None) -> None:

        if transform is not None or target_transform is not None:
            warnings.warn(
                "In the next version, transforms will not be part of the dataset. "
                "Please remove the option `transform=True` and "
                "`target_transform=True` to suppress this warning.")

        self.transform = transform
        self.target_transform = target_transform

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url,
                                 root,
                                 hash_value=checksum,
                                 hash_type="md5")
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        walker = walk_files(self._path,
                            suffix=self._ext_audio,
                            prefix=False,
                            remove_suffix=True)
        self._walker = list(walker)
Example #25
    def _parse_filesystem(self, root: str, url: str, folder_in_archive: str,
                          download: bool) -> None:
        root = Path(root)
        archive = os.path.basename(url)
        archive = root / archive

        self._path = root / folder_in_archive
        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _RELEASE_CONFIGS["release1"]["checksum"]
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        self._walker = sorted(
            str(p.stem) for p in Path(self._path).glob("*.wav"))
Example #26
    def __init__(
            self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE, download=False
    ):

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        basename = basename.split(self._ext_archive)[0]
        folder_in_archive = os.path.join(basename, folder_in_archive)

        self._path = os.path.join(root, folder_in_archive)
        self._metadata_path = os.path.join(root, basename, 'metadata.csv')

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive)

        with open(self._metadata_path, "r") as metadata:
            walker = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
            self._walker = list(walker)
Example #27
    def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None:
        root = Path(root)

        basename = os.path.basename(url)
        archive = root / basename

        basename = Path(basename.split(".tar.bz2")[0])
        folder_in_archive = basename / folder_in_archive

        self._path = root / folder_in_archive
        self._metadata_path = root / basename / 'metadata.csv'

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _RELEASE_CONFIGS["release1"]["checksum"]
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        with open(self._metadata_path, "r", newline='') as metadata:
            flist = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
            self._flist = list(flist)
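Each row of LJSpeech's metadata.csv carries three fields: file ID, raw transcript, and normalized transcript. A hypothetical __getitem__ over the _flist built above:

    def __getitem__(self, n):
        fileid, transcript, normalized_transcript = self._flist[n]
        # Audio sits under the wavs/ folder that self._path points at.
        waveform, sample_rate = torchaudio.load(str(self._path / f"{fileid}.wav"))
        return waveform, sample_rate, transcript, normalized_transcript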
Example #28
    def __init__(self,
                 root: str,
                 url: str = URL,
                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
                 download: bool = False,
                 transform: Any = None,
                 target_transform: Any = None) -> None:

        if transform is not None or target_transform is not None:
            warnings.warn(
                "In the next version, transforms will not be part of the dataset. "
                "Please remove the option `transform=True` and "
                "`target_transform=True` to suppress this warning."
            )

        self.transform = transform
        self.target_transform = target_transform

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    download_url(url, root)
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        walker = walk_files(
            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
        )
        self._walker = list(walker)
Example #29
    def __init__(self,
                 root,
                 sample_rate: int,
                 num_noise_to_load: int = 3,
                 noise_to_load: list = None,
                 num_synthetic_noise: int = 0,
                 folder_in_archive=FOLDER_IN_ARCHIVE,
                 url=DEMAND_JSON,
                 download=False,
                 transform=None):

        assert sample_rate in (16000, 48000)
        available_noise = set([
            i['key'].split('_')[0] for i in url['files']
            if (str(sample_rate // 1000) in i['key'])
        ])
        self.available_noise = list(available_noise)
        self.available_noise.sort()
        self.num_noise_to_load = num_noise_to_load

        if noise_to_load is None:
            self.noise_to_load = self.available_noise[:num_noise_to_load]
        else:
            assert all([i in self.available_noise for i in noise_to_load])
            self.noise_to_load = noise_to_load

        self.transform = transform

        urls_to_load = [[
            i['links']['self'] for i in url['files']
            if i['key'] == f'{noise}_{int(sample_rate / 1000)}k.zip'
        ][0] for noise in self.noise_to_load]

        self._path = os.path.join(root, folder_in_archive)
        archive_list = [
            os.path.join(self._path, f'{noise}_{int(sample_rate / 1000)}k.zip')
            for noise in self.noise_to_load
        ]

        if download:
            for archive, url, data_name in zip(archive_list, urls_to_load,
                                               self.noise_to_load):
                if os.path.isdir(os.path.join(self._path, data_name)):
                    continue
                if not os.path.isfile(archive):
                    logging.info(f'Loading {archive}')
                    folder_to_load = os.path.split(archive)[0]
                    os.makedirs(folder_to_load, exist_ok=True)
                    download_url(url, folder_to_load)
                extract_archive(archive)
                os.remove(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError(
                "Dataset not found. Please use `download=True` to download it."
            )

        walker = walk_files(self._path,
                            suffix=self._ext_audio,
                            prefix=True,
                            remove_suffix=True)
        self._walker = list(walker)

        for i in range(num_synthetic_noise):
            self._walker.append(os.path.join(self._path, 'synthetic', str(i)))
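A usage sketch for the DEMAND-style noise loader above. The class name is an assumption, and the noise keys come from the Zenodo file listing passed in as url (entries such as "DKITCHEN_16k.zip" are assumed):

# Hypothetical instantiation; "DemandNoise" is an assumed class name.
dataset = DemandNoise("/data",
                      sample_rate=16000,
                      noise_to_load=["DKITCHEN", "OMEETING"],  # assumed keys
                      download=True)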
Example #30
    def __init__(
        self,
        root: str,
        split: str,
        source_language: str,
        target_language: Optional[str] = None,
        version: int = 2,
        download: bool = False,
    ) -> None:
        assert version in self.VERSIONS and split in self.SPLITS
        assert source_language is not None
        self.no_translation = target_language is None
        if not self.no_translation:
            assert "en" in {source_language, target_language}
            if source_language == "en":
                assert target_language in self.EN_XX_LANGUAGES[version]
            else:
                assert source_language in self.XX_EN_LANGUAGES[version]
        else:
            # Hack here so that we can get "split" column from CoVoST TSV.
            # Note that we use CoVoST train split for ASR which is an extension
            # to Common Voice train split.
            target_language = "de" if source_language == "en" else "en"

        self.root = os.path.join(root, "raw")
        os.makedirs(self.root, exist_ok=True)

        cv_url = self.CV_URL_TEMPLATE.format(
            ver=self.CV_VERSION_ID[version], lang=source_language
        )
        cv_archive = os.path.join(self.root, os.path.basename(cv_url))
        if download:
            if not os.path.isfile(cv_archive):
                download_url(cv_url, self.root, hash_value=None)
            extract_archive(cv_archive)

        covost_url = self.COVOST_URL_TEMPLATE.format(
            src_lang=source_language, tgt_lang=target_language
        )
        covost_archive = os.path.join(self.root, os.path.basename(covost_url))
        if download:
            if not os.path.isfile(covost_archive):
                download_url(covost_url, self.root, hash_value=None)
            extract_archive(covost_archive)

        cv_tsv = self.load_from_tsv(os.path.join(self.root, "validated.tsv"))
        covost_tsv = self.load_from_tsv(
            os.path.join(self.root, os.path.basename(covost_url).replace(".tar.gz", ""))
        )
        df = pd.merge(
            left=cv_tsv[["path", "sentence", "client_id"]],
            right=covost_tsv[["path", "translation", "split"]],
            how="inner",
            on="path",
        )
        if split == "train":
            df = df[(df["split"] == split) | (df["split"] == f"{split}_covost")]
        else:
            df = df[df["split"] == split]
        self.data = df.to_dict(orient="index").items()
        self.data = [v for k, v in sorted(self.data, key=lambda x: x[0])]