def __init__(
    self,
    root: str,
    split: str,
    source_language: str,
    target_language: Optional[str] = None,
    version: int = 2,
) -> None:
    """Index the CoVoST samples whose audio clips are readable under ``root``.

    Args:
        root: Directory that holds ``validated.tsv`` and the ``clips`` folder.
        split: Value of the CoVoST ``split`` column to keep.
        source_language: Language code of the audio.
        target_language: Language code of the translation, or ``None`` when
            no translation target is wanted.
        version: Key into the per-version language tables on the class.
    """
    assert version in self.VERSIONS and split in self.SPLITS
    assert source_language is not None

    self.no_translation = target_language is None
    if self.no_translation:
        # Hack: a CoVoST TSV is still needed to obtain the "split" column.
        # The CoVoST train split is used for ASR; it extends the Common
        # Voice train split.
        target_language = "de" if source_language == "en" else "en"
    else:
        assert "en" in {source_language, target_language}
        if source_language == "en":
            assert target_language in self.EN_XX_LANGUAGES[version]
        else:
            assert source_language in self.XX_EN_LANGUAGES[version]

    self.root: Path = Path(root)
    cv_tsv_path = self.root / "validated.tsv"
    assert cv_tsv_path.is_file()

    covost_url = self.COVOST_URL_TEMPLATE.format(
        src_lang=source_language, tgt_lang=target_language
    )
    archive_name = Path(covost_url).name
    covost_archive = self.root / archive_name
    if not covost_archive.is_file():
        download_url(covost_url, self.root.as_posix(), hash_value=None)
    extract_archive(covost_archive.as_posix())

    cv_tsv = load_df_from_tsv(cv_tsv_path)
    covost_tsv = load_df_from_tsv(self.root / archive_name.replace(".tar.gz", ""))
    df = pd.merge(
        left=cv_tsv[["path", "sentence", "client_id"]],
        right=covost_tsv[["path", "translation", "split"]],
        how="inner",
        on="path",
    )

    if split == "train":
        keep = (df["split"] == split) | (df["split"] == f"{split}_covost")
    else:
        keep = df["split"] == split
    df = df[keep]

    # Rows are ordered by their dataframe index.
    rows = df.to_dict(orient="index")
    ordered = [rows[idx] for idx in sorted(rows)]

    # Keep only entries whose clip can be opened by torchaudio.
    self.data = []
    for entry in ordered:
        clip = self.root / "clips" / entry["path"]
        try:
            torchaudio.info(clip.as_posix())
        except RuntimeError:
            continue
        self.data.append(entry)
def __init__(self, root, params, url=URL, download=False):
    """Locate the dataset folders, optionally fetch them, and read the metadata.

    Args:
        root: Directory containing (or receiving) the archive and its contents.
        params: Passed through to ``precompute_spectrograms``.
        url: Archive URL; its basename determines the dataset folder name.
        download: When true, download/extract the archive if ``wavs`` is missing.
    """
    archive_name = os.path.basename(url)
    archive = os.path.join(root, archive_name)
    base_folder = os.path.join(root, archive_name.split(self._ext_archive)[0])

    self._wav_path = os.path.join(base_folder, 'wavs')
    self._mel_path = os.path.join(base_folder, 'mels')
    self._char_path = os.path.join(base_folder, 'chars')
    self._phone_path = os.path.join(base_folder, 'phones')
    self._metadata_path = os.path.join(base_folder, 'metadata.csv')

    if download and not os.path.isdir(self._wav_path):
        if not os.path.isfile(archive):
            download_url(url, root)
        extract_archive(archive)

    # Derived features are generated on first use.
    if not os.path.isdir(self._mel_path):
        precompute_spectrograms(base_folder, params)
    if not (os.path.isdir(self._char_path) and os.path.isdir(self._phone_path)):
        precompute_char_phone(base_folder)

    with open(self._metadata_path, "r") as metadata:
        rows = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._walker = list(rows)
def download(args):
    """Download and unpack the raw audio tarballs for the requested subset.

    Args:
        args: Object exposing ``subset`` (an entry of ``LANGUAGES`` or one of
            ``"100k"``, ``"10k"``, ``"asr"``) and ``root`` (output directory).

    Raises:
        ValueError: If ``args.subset`` is not a recognised subset. Previously
            this surfaced as an opaque ``TypeError`` from iterating ``None``.
    """
    subset_languages = {"100k": LANGUAGES, "10k": LANGUAGES, "asr": ["original"]}
    subset_years = {"100k": YEARS, "10k": [2019, 2020], "asr": YEARS}

    if args.subset in LANGUAGES:
        languages, years = [args.subset], YEARS
    elif args.subset in subset_languages:
        languages = subset_languages[args.subset]
        years = subset_years[args.subset]
    else:
        raise ValueError(
            f"Unknown subset {args.subset!r}; expected a language code or one of "
            f"{sorted(subset_languages)}"
        )

    url_list = [
        f"{DOWNLOAD_BASE_URL}/audios/{language}_{year}.tar"
        for language in languages
        for year in years
    ]

    out_root = Path(args.root) / "raw_audios"
    out_root.mkdir(exist_ok=True, parents=True)
    print(f"{len(url_list)} files to download...")
    for url in tqdm(url_list):
        tar_name = Path(url).name
        tar_path = out_root / tar_name
        download_url(url, out_root, tar_name)
        extract_archive(tar_path.as_posix())
        # The tarball is no longer needed once extracted.
        os.remove(tar_path)
def load_from_nemo(cls, *, nemo_filepath: str = None, checkpoint_name: NemoCheckpoint = None):
    """Build a module from a ``.nemo`` archive.

    Args:
        nemo_filepath: Path to a local ``.nemo`` file.
        checkpoint_name: Checkpoint to fetch with ``download_checkpoint``;
            when given, it takes precedence over ``nemo_filepath``.

    Raises:
        ValueError: If neither argument is provided.

    Returns:
        The constructed module, switched to eval mode.
    """
    if checkpoint_name is not None:
        nemo_filepath = download_checkpoint(checkpoint_name)
    elif nemo_filepath is None:
        raise ValueError(
            "Either nemo_filepath or checkpoint_name must be passed")

    with TemporaryDirectory() as tmp_dir:
        extracted = Path(tmp_dir)
        extract_archive(str(nemo_filepath), extracted)
        encoder_params, initial_vocab, preprocess_params = read_params_from_config(
            extracted / "model_config.yaml")
        module = cls(
            initial_vocab_tokens=initial_vocab,
            **encoder_params,
            **preprocess_params,
            nemo_compat_vocab=True,
        )
        load_quartznet_weights(
            module.encoder, module.decoder, extracted / "model_weights.ckpt")

    # Eval mode is set so direct inference works correctly; the original
    # rationale is that a fine-tuning workflow will simply override it.
    module.eval()
    return module
def __init__(self,
             root: Union[str, Path],
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:
    """Locate the dataset folder (optionally fetching it) and list its audio stems.

    Args:
        root: Directory containing (or receiving) the archive; str or Path.
        url: Archive URL.
        folder_in_archive: Dataset folder name relative to ``root``.
        download: When true, download/extract if the folder is missing.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    # Accept Path objects by converting to the plain string form.
    root = os.fspath(root)
    archive = os.path.join(root, os.path.basename(url))
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_CHECKSUMS.get(url, None), hash_type="md5")
        extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    stems = [str(p.stem) for p in Path(self._path).glob('*' + self._ext_audio)]
    stems.sort()
    self._walker = stems
def __init__(self, root, tsv=TSV, url=URL, download=False):
    """Locate a Common Voice release, optionally fetch it, and load one TSV.

    Args:
        root: Directory containing (or receiving) the archive and TSV files.
        tsv: Name of the TSV file inside ``root`` to load.
        url: Either a full archive URL or one of the language names in the
            table below, which is expanded to the matching archive URL.
        download: When true, download/extract if ``root`` is not a directory.
    """
    languages = {
        "tatar": "tt",
        "english": "en",
        "german": "de",
        "french": "fr",
        "welsh": "cy",
        "breton": "br",
        "chuvash": "cv",
        "turkish": "tr",
        "kyrgyz": "ky",
        "irish": "ga-IE",
        "kabyle": "kab",
        "catalan": "ca",
        "taiwanese": "zh-TW",
        "slovenian": "sl",
        "italian": "it",
        "dutch": "nl",
        "hakha chin": "cnh",
        "esperanto": "eo",
        "estonian": "et",
        "persian": "fa",
        "basque": "eu",
        "spanish": "es",
        "chinese": "zh-CN",
        "mongolian": "mn",
        "sakha": "sah",
        "dhivehi": "dv",
        "kinyarwanda": "rw",
        "swedish": "sv-SE",
        "russian": "ru",
    }

    # Membership test: the previous identity check (`url is languages`) could
    # never be true for a string, so language names were never expanded.
    if url in languages:
        ext_archive = ".tar.gz"
        language = languages[url]
        base_url = (
            "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4"
            + ".s3.amazonaws.com/cv-corpus-3/"
        )
        url = base_url + language + ext_archive

    archive = os.path.join(root, os.path.basename(url))
    self._path = root

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root)
        extract_archive(archive)

    self._tsv = os.path.join(root, tsv)
    with open(self._tsv, "r") as handle:
        walker = unicode_csv_reader(handle, delimiter="\t")
        # The first row is the column header; the rest are samples.
        self._header = next(walker)
        self._walker = list(walker)
def __init__(self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE, download=False):
    """Locate the Speech Commands folder, optionally fetch it, and list wav files.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Full archive URL, or a known version name that is expanded to one.
        folder_in_archive: Parent folder, relative to ``root``, of the dataset.
        download: When true, download/extract if the dataset folder is missing.
    """
    known_versions = ("speech_commands_v0.01", "speech_commands_v0.02")
    if url in known_versions:
        base_url = "https://storage.googleapis.com/download.tensorflow.org/data/"
        url = os.path.join(base_url, url + ".tar.gz")

    archive_name = os.path.basename(url)
    archive = os.path.join(root, archive_name)

    # Drop the two trailing extensions (e.g. ".tar.gz") to get the folder name.
    dataset_name = archive_name.rsplit(".", 2)[0]
    folder_in_archive = os.path.join(folder_in_archive, dataset_name)
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root)
        extract_archive(archive, self._path)

    candidates = walk_files(self._path, suffix=".wav", prefix=True)
    self._walker = [
        w for w in candidates
        if HASH_DIVIDER in w and EXCEPT_FOLDER not in w
    ]
def __init__(self,
             root: str,
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:
    """Locate the dataset, optionally fetch it, and read ``metadata.csv``.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL; its basename minus ``self._ext_archive`` names the
            extracted top-level folder.
        folder_in_archive: Audio folder inside the extracted folder.
        download: When true, download/extract if the audio folder is missing.
    """
    archive_name = os.path.basename(url)
    archive = os.path.join(root, archive_name)
    dataset_name = archive_name.split(self._ext_archive)[0]

    self._path = os.path.join(root, os.path.join(dataset_name, folder_in_archive))
    self._metadata_path = os.path.join(root, dataset_name, 'metadata.csv')

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_CHECKSUMS.get(url, None))
        extract_archive(archive)

    with open(self._metadata_path, "r", newline='') as metadata:
        rows = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._walker = list(rows)
def __init__(self,
             root: str,
             release: str = "release1",
             subset: str = None,
             download: bool = False,
             audio_ext=".sph") -> None:
    """Resolve a TEDLIUM release/subset, optionally fetch it, and index its lines.

    Args:
        root: Directory containing (or receiving) the archive.
        release: Key into ``_RELEASE_CONFIGS``.
        subset: Subset name; falls back to the release's configured default.
        download: When true, download/extract if the data folder is missing.
        audio_ext: Audio file extension stored on the instance.

    Raises:
        RuntimeError: If ``release`` or ``subset`` is not supported.
    """
    self._ext_audio = audio_ext

    if release not in _RELEASE_CONFIGS:
        raise RuntimeError(
            "The release {} does not match any of the supported tedlium releases{} "
            .format(
                release,
                _RELEASE_CONFIGS.keys(),
            ))
    config = _RELEASE_CONFIGS[release]
    folder_in_archive = config["folder_in_archive"]
    url = config["url"]
    subset = subset or config["subset"]

    if subset not in config["supported_subsets"]:
        raise RuntimeError(
            "The subset {} does not match any of the supported tedlium subsets{} "
            .format(
                subset,
                config["supported_subsets"],
            ))

    archive = os.path.join(root, os.path.basename(url))

    self._path = os.path.join(root, folder_in_archive, config["data_path"])
    if subset in ["train", "dev", "test"]:
        self._path = os.path.join(self._path, subset)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=config["checksum"])
        extract_archive(archive)

    # One (talk id, line number) pair per line of every .stm transcript.
    self._filelist = []
    stm_dir = os.path.join(self._path, "stm")
    for name in sorted(os.listdir(stm_dir)):
        if not name.endswith(".stm"):
            continue
        with open(os.path.join(self._path, "stm", name)) as handle:
            num_lines = len(handle.readlines())
        talk_id = name.replace(".stm", "")
        self._filelist.extend((talk_id, line) for line in range(num_lines))

    # The dictionary is only located here; it is loaded later on demand.
    self._dict_path = os.path.join(root, folder_in_archive, config["dict"])
    self._phoneme_dict = None
def __init__(self,
             root: Union[str, Path],
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:
    """Locate a CMU ARCTIC voice, optionally fetch it, and read its prompt file.

    Args:
        root: Base directory; str or Path. The dataset lives in
            ``root/folder_in_archive``, which is created if missing.
        url: Full archive URL, or a known speaker code that is expanded to one.
        folder_in_archive: Sub-folder of ``root`` that holds the voices.
        download: When true, download/extract if the voice folder is missing.
    """
    speakers = [
        "aew", "ahw", "aup", "awb", "axb", "bdl", "clb", "eey", "fem",
        "gka", "jmk", "ksp", "ljm", "lnh", "rms", "rxr", "slp", "slt",
    ]
    if url in speakers:
        url = "cmu_us_" + url + "_arctic"
        ext_archive = ".tar.bz2"
        base_url = "http://www.festvox.org/cmu_arctic/packed/"
        url = os.path.join(base_url, url + ext_archive)

    # Get string representation of 'root' in case Path object is passed
    root = os.fspath(root)

    basename = os.path.basename(url)
    root = os.path.join(root, folder_in_archive)
    # makedirs also creates a missing parent and tolerates a concurrent
    # creator, both of which made the previous isdir()+mkdir() pair fail.
    os.makedirs(root, exist_ok=True)

    archive = os.path.join(root, basename)
    basename = basename.split(".")[0]
    self._path = os.path.join(root, basename)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            checksum = _CHECKSUMS.get(url, None)
            download_url(url, root, hash_value=checksum, hash_type="md5")
        extract_archive(archive)

    self._text = os.path.join(self._path, self._folder_text, self._file_text)
    with open(self._text, "r") as text:
        # A newline delimiter yields one single-field row per line.
        walker = csv.reader(text, delimiter="\n")
        self._walker = list(walker)
def sample_data():
    """Fetch and unpack the sample archive into the default cache folder.

    Returns:
        Path of the extracted ``lapsbm-backup-lapsbm-ci`` folder.
    """
    cache_dir = get_default_cache_folder()
    download_url(
        "https://github.com/scart97/lapsbm-backup/archive/refs/tags/lapsbm-ci.tar.gz",
        download_folder=cache_dir,
        resume=True,
    )
    archive = cache_dir / "lapsbm-backup-lapsbm-ci.tar.gz"
    extract_archive(archive, cache_dir)
    return cache_dir / "lapsbm-backup-lapsbm-ci"
def download(self) -> None:
    """Download and extract the archive unless the integrity check already passes."""
    if self.check_integrity(self.target_directory):
        print("Dataset already downloaded and verified.")
        return

    archive_path = os.path.join(self.root, FOLDER_IN_ARCHIVE + ".zip")
    download_url(self.url, self.root)
    extract_archive(archive_path, self.root)
def __init__(
    self,
    root: str,
    url: str = URL,
    folder_in_archive: str = FOLDER_IN_ARCHIVE,
    download: bool = False,
    subset: Any = None,
) -> None:
    """Locate the dataset folder, optionally fetch it, and pick the sample list.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL.
        folder_in_archive: Dataset folder name relative to ``root``.
        download: When true, download/extract if the folder is missing.
        subset: ``None`` for every file found on disk, or one of
            ``"training"``, ``"validation"``, ``"testing"`` for a fixed list.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    self.root = root
    self.url = url
    self.folder_in_archive = folder_in_archive
    self.download = download
    self.subset = subset

    valid_subsets = ["training", "validation", "testing"]
    assert subset is None or subset in valid_subsets, (
        "When `subset` not None, it must take a value from "
        "{'training', 'validation', 'testing'}."
    )

    archive = os.path.join(root, os.path.basename(url))
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_CHECKSUMS.get(url, None), hash_type="md5")
        extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    if self.subset is None:
        found = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
        self._walker = list(found)
    elif self.subset == "training":
        self._walker = filtered_train
    elif self.subset == "validation":
        self._walker = filtered_valid
    elif self.subset == "testing":
        self._walker = filtered_test
def __init__(self,
             root: str,
             url: str = URL,
             download: bool = False,
             mic_id: str = "mic2") -> None:
    """Locate VCTK 0.92, optionally fetch it, and collect the usable sample ids.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL.
        download: When true, download/extract if the corpus folder is missing.
        mic_id: Microphone suffix used in audio file names.

    Raises:
        RuntimeError: If the corpus folder does not exist afterwards.
    """
    archive = os.path.join(root, "VCTK-Corpus-0.92.zip")

    self._path = os.path.join(root, "VCTK-Corpus-0.92")
    self._txt_dir = os.path.join(self._path, "txt")
    self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed")
    self._mic_id = mic_id

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_CHECKSUMS.get(url, None), hash_type="md5")
        extract_archive(archive, self._path)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    # Speaker ids come from the folder names under the text directory.
    self._speaker_ids = sorted(os.listdir(self._txt_dir))
    self._sample_ids = []

    # The audio tree is traversed in step with the text tree. Some audio
    # files are missing for one of the microphones even though their text
    # exists, so the audio file is checked before a sample id is recorded.
    for speaker_id in self._speaker_ids:
        utterance_dir = os.path.join(self._txt_dir, speaker_id)
        txt_files = sorted(
            name for name in os.listdir(utterance_dir) if name.endswith(".txt")
        )
        for utterance_file in txt_files:
            utterance_id = os.path.splitext(utterance_file)[0]
            audio_path_mic = os.path.join(
                self._audio_dir, speaker_id, f"{utterance_id}_{mic_id}.flac"
            )
            # Speaker p280 is skipped entirely for mic2.
            if speaker_id == "p280" and mic_id == "mic2":
                break
            # For p362 only utterances with an existing audio file are kept.
            if speaker_id == "p362" and not os.path.isfile(audio_path_mic):
                continue
            self._sample_ids.append(utterance_id.split("_"))
def __init__(
    self,
    root: str,
    split: str,
    source_language: str,
    target_language: Optional[str] = None,
    version: int = 2,
    download: bool = False
) -> None:
    """Prepare Common Voice + CoVoST data under ``root/raw`` and index one split.

    Args:
        root: Base directory; data is kept in its ``raw`` sub-folder.
        split: Value of the CoVoST ``split`` column to keep.
        source_language: Language code of the audio.
        target_language: Language code of the translation, or ``None`` when
            no translation target is wanted.
        version: Key into the per-version tables on the class.
        download: When true, download missing archives and extract them.
    """
    assert version in self.VERSIONS and split in self.SPLITS
    assert source_language is not None

    self.no_translation = (target_language is None)
    if self.no_translation:
        # Hack: a CoVoST TSV is still needed to obtain the "split" column.
        # The CoVoST train split is used for ASR; it extends the Common
        # Voice train split.
        target_language = 'de' if source_language == 'en' else 'en'
    else:
        assert 'en' in {source_language, target_language}
        if source_language == 'en':
            assert target_language in self.EN_XX_LANGUAGES[version]
        else:
            assert source_language in self.XX_EN_LANGUAGES[version]

    self.root = os.path.join(root, 'raw')
    os.makedirs(self.root, exist_ok=True)

    cv_url = self.CV_URL_TEMPLATE.format(ver=self.CV_VERSION_ID[version],
                                         lang=source_language)
    cv_archive = os.path.join(self.root, os.path.basename(cv_url))
    if download:
        if not os.path.isfile(cv_archive):
            download_url(cv_url, self.root, hash_value=None)
        extract_archive(cv_archive)

    covost_url = self.COVOST_URL_TEMPLATE.format(src_lang=source_language,
                                                 tgt_lang=target_language)
    covost_name = os.path.basename(covost_url)
    covost_archive = os.path.join(self.root, covost_name)
    if download:
        if not os.path.isfile(covost_archive):
            download_url(covost_url, self.root, hash_value=None)
        extract_archive(covost_archive)

    cv_tsv = self.load_from_tsv(os.path.join(self.root, 'validated.tsv'))
    covost_tsv = self.load_from_tsv(
        os.path.join(self.root, covost_name.replace('.tar.gz', ''))
    )

    df = pd.merge(left=cv_tsv[['path', 'sentence', 'client_id']],
                  right=covost_tsv[['path', 'translation', 'split']],
                  how='inner', on='path')
    if split == 'train':
        keep = (df['split'] == split) | (df['split'] == f'{split}_covost')
    else:
        keep = df['split'] == split
    df = df[keep]

    # Rows are ordered by their dataframe index.
    rows = df.to_dict(orient='index')
    self.data = [rows[idx] for idx in sorted(rows)]
def __init__(
    self,
    root,
    url=URL,
    folder_in_archive=FOLDER_IN_ARCHIVE,
    download=False,
    transform=None,
    target_transform=None,
    return_dict=False,
):
    """Locate the dataset folder, optionally fetch it, and list its audio files.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL.
        folder_in_archive: Dataset folder name relative to ``root``.
        download: When true, download/extract if the folder is missing.
        transform: Stored on the instance; passing it triggers a deprecation warning.
        target_transform: Stored on the instance; same warning as ``transform``.
        return_dict: Stored on the instance; a deprecation warning is issued
            when it is false.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    if not return_dict:
        warnings.warn(
            "In the next version, the item returned will be a dictionary. "
            "Please use `return_dict=True` to enable this behavior now, "
            "and suppress this warning.",
            DeprecationWarning,
        )

    uses_transforms = not (transform is None and target_transform is None)
    if uses_transforms:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please remove the option `transform=True` and "
            "`target_transform=True` to suppress this warning.",
            DeprecationWarning,
        )

    self.transform = transform
    self.target_transform = target_transform
    self.return_dict = return_dict

    archive = os.path.join(root, os.path.basename(url))
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root)
        extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    found = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(found)
def __init__(self,
             root: str,
             url: str,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:
    """Locate a LibriSpeech subset, precompute mel spectrograms, and list them.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Full archive URL, or a known subset name that is expanded to one.
        folder_in_archive: Parent folder, relative to ``root``, of the subset.
        download: When true, download/extract if the subset folder is missing.
    """
    if url in [
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    ]:
        ext_archive = ".tar.gz"
        base_url = "http://www.openslr.org/resources/12/"
        url = os.path.join(base_url, url + ext_archive)

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.split(".")[0]
    folder_in_archive = os.path.join(folder_in_archive, basename)
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            checksum = _CHECKSUMS.get(url, None)
            download_url(url, root, hash_value=checksum)
        extract_archive(archive)

    audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)
    audio_suffix = self._ext_wav.split('.')[-1]
    # `dirpath` avoids shadowing the `root` argument.
    for dirpath, _dirs, files in os.walk(self._path):
        for file in files:
            if file.split('.')[-1] != audio_suffix:
                continue
            waveform, _ = torchaudio.load(os.path.join(dirpath, file))
            spec = audio_transforms(waveform)
            # Write next to the audio using the mel extension. The previous
            # code reused `_ext_wav`, which overwrote the audio file itself
            # and never produced the `_ext_mel` files the walker looks for.
            file_spec = os.path.join(dirpath, file.split('.')[0] + self._ext_mel)
            torch.save(spec, file_spec)

    walker = walk_files(
        self._path, suffix=self._ext_mel, prefix=False, remove_suffix=True
    )
    self._walker = list(walker)
def __init__(self,
             root: str,
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False,
             downsample: bool = False,
             transform: Any = None,
             target_transform: Any = None) -> None:
    """Locate the dataset folder, optionally fetch it, and list its audio files.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL.
        folder_in_archive: Dataset folder name relative to ``root``.
        download: When true, download/extract if the folder is missing.
        downsample: Stored on the instance; a warning is issued when true.
        transform: Stored on the instance; passing it triggers a warning.
        target_transform: Stored on the instance; same warning as ``transform``.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    if downsample:
        # The message is a single concatenated string. A stray comma used to
        # pass its last fragment as `category`, making warn() raise TypeError.
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please use `downsample=False` to enable this behavior now, "
            "and suppress this warning.")

    if transform is not None or target_transform is not None:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please remove the option `transform=True` and "
            "`target_transform=True` to suppress this warning.")

    self.downsample = downsample
    self.transform = transform
    self.target_transform = target_transform

    archive = os.path.basename(url)
    archive = os.path.join(root, archive)
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            checksum = _CHECKSUMS.get(url, None)
            download_url(url, root, hash_value=checksum, hash_type="md5")
        extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    walker = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    # Entries whose name contains the excluded folder marker are dropped.
    walker = filter(lambda w: self._except_folder not in w, walker)
    self._walker = list(walker)
def _download(self) -> None:
    """Download and extract the archive unless the integrity check already passes."""
    archive_path = os.path.join(self.root, self.basename)
    print(self.basename)

    if self._check_integrity(self._path):
        print("Dataset already download and verified")
        return

    download_url(
        self.url,
        self.root,
        hash_value=_CHECKSUMS.get(self.url, None),
        hash_type="md5",
    )
    extract_archive(archive_path, self._path)
def test_can_load_weights():
    """Check that a downloaded QuartzNet 5x5 checkpoint loads into fresh modules.

    The test returns silently when the download fails with ``HTTPError``.
    """
    # QuartzNet 5x5 is small (25mb), so downloading it during tests is acceptable.
    try:
        checkpoint = download_checkpoint(NemoCheckpoint.QuartzNet5x5LS_En)
        with TemporaryDirectory() as tmp_dir:
            extracted = Path(tmp_dir)
            extract_archive(str(checkpoint), extracted)
            encoder_params, initial_vocab, _ = read_params_from_config(
                extracted / "model_config.yaml"
            )
            encoder = Quartznet5(64, **encoder_params)
            decoder = Quartznet_decoder(len(initial_vocab) + 1)
            load_quartznet_weights(
                encoder, decoder, extracted / "model_weights.ckpt"
            )
    except HTTPError:
        return
def __init__(
    self,
    root: Union[str, Path],
    url: str = URL,
    folder_in_archive: str = FOLDER_IN_ARCHIVE,
    download: bool = False,
) -> None:
    """Locate a subset folder, optionally fetch it, and list its audio files.

    Args:
        root: Directory containing (or receiving) the archive; str or Path.
        url: Full archive URL, or a known subset name that is expanded to one.
        folder_in_archive: Parent folder, relative to ``root``, of the subset.
        download: When true, download/extract if the subset folder is missing.
    """
    subset_names = (
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    )
    if url in subset_names:
        base_url = "http://www.openslr.org/resources/60/"
        url = os.path.join(base_url, url + ".tar.gz")

    # Accept Path objects by converting to the plain string form.
    root = os.fspath(root)

    archive_name = os.path.basename(url)
    archive = os.path.join(root, archive_name)

    # Everything before the first dot names the extracted subset folder.
    subset_folder = archive_name.split(".")[0]
    folder_in_archive = os.path.join(folder_in_archive, subset_folder)
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_CHECKSUMS.get(url, None))
        extract_archive(archive)

    found = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(found)
def __init__(self,
             root,
             url=URL,
             folder_in_archive=FOLDER_IN_ARCHIVE,
             download=False,
             preprocess=False):
    """Locate a subset folder, optionally fetch it, list audio, and maybe preprocess.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Full archive URL, or a known subset name that is expanded to one.
        folder_in_archive: Parent folder, relative to ``root``, of the subset.
        download: When true, download/extract if the subset folder is missing.
        preprocess: When true, call ``self.preprocess_embeddings`` on the folder.
    """
    subset_names = (
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    )
    if url in subset_names:
        base_url = "http://www.openslr.org/resources/12/"
        url = os.path.join(base_url, url + ".tar.gz")

    archive_name = os.path.basename(url)
    archive = os.path.join(root, archive_name)

    # Everything before the first dot names the extracted subset folder.
    subset_folder = archive_name.split(".")[0]
    folder_in_archive = os.path.join(folder_in_archive, subset_folder)
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root)
        extract_archive(archive)

    found = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(found)

    if preprocess:
        self.preprocess_embeddings(self._path, self._ext_audio, self._ext_embed)
def load_from_nemo(
        cls, *, nemo_filepath: str = None, checkpoint_name: NemoCheckpoint = None) -> "QuartznetModule":
    """Load from the original nemo checkpoint.

    Args:
        nemo_filepath : Path to local .nemo file.
        checkpoint_name : Name of checkpoint to be downloaded locally and
            loaded. When given, it takes precedence over ``nemo_filepath``.

    Raises:
        ValueError: Neither of the two parameters was passed.

    Returns:
        The model loaded from the checkpoint, in eval mode.
    """
    if checkpoint_name is not None:
        nemo_filepath = download_checkpoint(checkpoint_name)
    elif nemo_filepath is None:
        raise ValueError(
            "Either nemo_filepath or checkpoint_name must be passed")

    with TemporaryDirectory() as tmp_dir:
        extracted = Path(tmp_dir)
        extract_archive(str(nemo_filepath), extracted)
        encoder_params, initial_vocab, preprocess_params = read_params_from_config(
            extracted / "model_config.yaml")
        module = cls(
            initial_vocab_tokens=initial_vocab,
            **encoder_params,
            **preprocess_params,
            nemo_compat_vocab=True,
        )
        load_quartznet_weights(
            module.encoder, module.decoder, extracted / "model_weights.ckpt")

    # Eval mode is set so that inference works correctly right away. Most
    # uses are expected to be either (1) load a checkpoint and run inference
    # directly, or (2) fine-tuning; this prevents a silent bug in case 1 and
    # is ignored in case 2.
    module.eval()
    return module
def __init__(self,
             root: Union[str, Path],
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False,
             transform: Any = None,
             target_transform: Any = None) -> None:
    """Locate the dataset folder, optionally fetch it, and list its audio files.

    Args:
        root: Directory containing (or receiving) the archive; str or Path.
        url: Archive URL.
        folder_in_archive: Dataset folder name relative to ``root``.
        download: When true, download/extract if the folder is missing.
        transform: Stored on the instance; passing it triggers a warning.
        target_transform: Stored on the instance; same warning as ``transform``.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    uses_transforms = not (transform is None and target_transform is None)
    if uses_transforms:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please remove the option `transform=True` and "
            "`target_transform=True` to suppress this warning.")

    self.transform = transform
    self.target_transform = target_transform

    # Accept Path objects by converting to the plain string form.
    root = os.fspath(root)

    archive = os.path.join(root, os.path.basename(url))
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_CHECKSUMS.get(url, None), hash_type="md5")
        extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    found = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(found)
def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None:
    """Locate the dataset folder, optionally fetch it, and list its wav stems.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL.
        folder_in_archive: Dataset folder name relative to ``root``.
        download: When true, download/extract if the folder is missing.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    root = Path(root)
    archive = root / os.path.basename(url)
    self._path = root / folder_in_archive

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_RELEASE_CONFIGS["release1"]["checksum"])
        extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    stems = [str(p.stem) for p in Path(self._path).glob("*.wav")]
    stems.sort()
    self._walker = stems
def __init__(
    self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE, download=False
):
    """Locate the dataset, optionally fetch it, and read ``metadata.csv``.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL; its basename minus ``self._ext_archive`` names the
            extracted top-level folder.
        folder_in_archive: Audio folder inside the extracted folder.
        download: When true, download/extract if the audio folder is missing.
    """
    archive_name = os.path.basename(url)
    archive = os.path.join(root, archive_name)
    dataset_name = archive_name.split(self._ext_archive)[0]

    self._path = os.path.join(root, os.path.join(dataset_name, folder_in_archive))
    self._metadata_path = os.path.join(root, dataset_name, 'metadata.csv')

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root)
        extract_archive(archive)

    with open(self._metadata_path, "r") as metadata:
        rows = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._walker = list(rows)
def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None:
    """Locate the dataset, optionally fetch it, and read ``metadata.csv``.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL; its basename up to ``.tar.bz2`` names the extracted
            top-level folder.
        folder_in_archive: Audio folder inside the extracted folder.
        download: When true, download/extract if the audio folder is missing.
    """
    root = Path(root)
    archive_name = os.path.basename(url)
    archive = root / archive_name

    dataset_dir = Path(archive_name.split(".tar.bz2")[0])
    self._path = root / (dataset_dir / folder_in_archive)
    self._metadata_path = root / dataset_dir / 'metadata.csv'

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root, hash_value=_RELEASE_CONFIGS["release1"]["checksum"])
        extract_archive(archive)

    with open(self._metadata_path, "r", newline='') as metadata:
        rows = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._flist = list(rows)
def __init__(self,
             root: str,
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False,
             transform: Any = None,
             target_transform: Any = None) -> None:
    """Locate the dataset folder, optionally fetch it, and list its audio files.

    Args:
        root: Directory containing (or receiving) the archive.
        url: Archive URL.
        folder_in_archive: Dataset folder name relative to ``root``.
        download: When true, download/extract if the folder is missing.
        transform: Stored on the instance; passing it triggers a warning.
        target_transform: Stored on the instance; same warning as ``transform``.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    uses_transforms = not (transform is None and target_transform is None)
    if uses_transforms:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please remove the option `transform=True` and "
            "`target_transform=True` to suppress this warning."
        )

    self.transform = transform
    self.target_transform = target_transform

    archive = os.path.join(root, os.path.basename(url))
    self._path = os.path.join(root, folder_in_archive)

    if download and not os.path.isdir(self._path):
        if not os.path.isfile(archive):
            download_url(url, root)
        extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    found = walk_files(
        self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
    )
    self._walker = list(found)
def __init__(self,
             root,
             sample_rate: int,
             num_noise_to_load: int = 3,
             noise_to_load: list = None,
             num_synthetic_noise: int = 0,
             folder_in_archive=FOLDER_IN_ARCHIVE,
             url=DEMAND_JSON,
             download=False,
             transform=None):
    """Select noise recordings, optionally fetch them, and list the audio files.

    Args:
        root: Base directory of the dataset.
        sample_rate: Either 16000 or 48000.
        num_noise_to_load: How many of the available noises to take when
            ``noise_to_load`` is not given.
        noise_to_load: Explicit noise names; each must be available.
        num_synthetic_noise: Number of ``synthetic/<i>`` entries appended to
            the walker.
        folder_in_archive: Dataset folder name relative to ``root``.
        url: Mapping with a ``'files'`` list whose items carry ``'key'`` and
            ``'links'['self']``.
        download: When true, download/extract every selected noise that has
            no folder yet.
        transform: Stored on the instance.

    Raises:
        RuntimeError: If the dataset folder does not exist afterwards.
    """
    assert sample_rate in (16000, 48000)

    # A noise is available when its key mentions the sample rate in kHz.
    khz_tag = str(sample_rate // 1000)
    self.available_noise = sorted({
        entry['key'].split('_')[0]
        for entry in url['files']
        if khz_tag in entry['key']
    })

    self.num_noise_to_load = num_noise_to_load
    if noise_to_load is None:
        self.noise_to_load = self.available_noise[:num_noise_to_load]
    else:
        assert all(name in self.available_noise for name in noise_to_load)
        self.noise_to_load = noise_to_load
    self.transform = transform

    zip_names = [
        f'{noise}_{int(sample_rate / 1000)}k.zip' for noise in self.noise_to_load
    ]
    urls_to_load = [
        [entry['links']['self'] for entry in url['files'] if entry['key'] == zip_name][0]
        for zip_name in zip_names
    ]

    self._path = os.path.join(root, folder_in_archive)
    archive_list = [os.path.join(self._path, zip_name) for zip_name in zip_names]

    if download:
        for archive, link, data_name in zip(archive_list, urls_to_load, self.noise_to_load):
            # A noise that already has its folder is left alone.
            if os.path.isdir(os.path.join(self._path, data_name)):
                continue
            if not os.path.isfile(archive):
                logging.info(f'Loading {archive}')
                folder_to_load = os.path.split(archive)[0]
                os.makedirs(folder_to_load, exist_ok=True)
                download_url(link, folder_to_load)
            extract_archive(archive)
            # The zip is removed once extracted.
            os.remove(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    found = walk_files(self._path, suffix=self._ext_audio, prefix=True, remove_suffix=True)
    self._walker = list(found)
    self._walker.extend(
        os.path.join(self._path, 'synthetic', str(i))
        for i in range(num_synthetic_noise)
    )
def __init__(
    self,
    root: str,
    split: str,
    source_language: str,
    target_language: Optional[str] = None,
    version: int = 2,
    download: bool = False,
) -> None:
    """Prepare Common Voice + CoVoST data under ``root/raw`` and index one split.

    Args:
        root: Base directory; data is kept in its ``raw`` sub-folder.
        split: Value of the CoVoST ``split`` column to keep.
        source_language: Language code of the audio.
        target_language: Language code of the translation, or ``None`` when
            no translation target is wanted.
        version: Key into the per-version tables on the class.
        download: When true, download missing archives and extract them.
    """
    assert version in self.VERSIONS and split in self.SPLITS
    assert source_language is not None

    self.no_translation = target_language is None
    if self.no_translation:
        # Hack: a CoVoST TSV is still needed to obtain the "split" column.
        # The CoVoST train split is used for ASR; it extends the Common
        # Voice train split.
        target_language = "de" if source_language == "en" else "en"
    else:
        assert "en" in {source_language, target_language}
        if source_language == "en":
            assert target_language in self.EN_XX_LANGUAGES[version]
        else:
            assert source_language in self.XX_EN_LANGUAGES[version]

    self.root = os.path.join(root, "raw")
    os.makedirs(self.root, exist_ok=True)

    def _fetch_and_extract(archive_url: str) -> None:
        # Download the archive into self.root if absent, then extract it.
        local_archive = os.path.join(self.root, os.path.basename(archive_url))
        if not os.path.isfile(local_archive):
            download_url(archive_url, self.root, hash_value=None)
        extract_archive(local_archive)

    cv_url = self.CV_URL_TEMPLATE.format(
        ver=self.CV_VERSION_ID[version], lang=source_language
    )
    if download:
        _fetch_and_extract(cv_url)

    covost_url = self.COVOST_URL_TEMPLATE.format(
        src_lang=source_language, tgt_lang=target_language
    )
    if download:
        _fetch_and_extract(covost_url)

    cv_tsv = self.load_from_tsv(os.path.join(self.root, "validated.tsv"))
    covost_tsv_name = os.path.basename(covost_url).replace(".tar.gz", "")
    covost_tsv = self.load_from_tsv(os.path.join(self.root, covost_tsv_name))

    df = pd.merge(
        left=cv_tsv[["path", "sentence", "client_id"]],
        right=covost_tsv[["path", "translation", "split"]],
        how="inner",
        on="path",
    )
    if split == "train":
        keep = (df["split"] == split) | (df["split"] == f"{split}_covost")
    else:
        keep = df["split"] == split
    df = df[keep]

    # Rows are ordered by their dataframe index.
    rows = df.to_dict(orient="index")
    self.data = [rows[idx] for idx in sorted(rows)]