def __init__(
    self,
    root: str,
    split: str,
    source_language: str,
    target_language: Optional[str] = None,
    version: int = 2,
) -> None:
    assert version in self.VERSIONS and split in self.SPLITS
    assert source_language is not None
    self.no_translation = target_language is None
    if not self.no_translation:
        assert "en" in {source_language, target_language}
        if source_language == "en":
            assert target_language in self.EN_XX_LANGUAGES[version]
        else:
            assert source_language in self.XX_EN_LANGUAGES[version]
    else:
        # Hack here so that we can get "split" column from CoVoST TSV.
        # Note that we use CoVoST train split for ASR which is an extension
        # to Common Voice train split.
        target_language = "de" if source_language == "en" else "en"

    self.root: Path = Path(root)

    cv_tsv_path = self.root / "validated.tsv"
    assert cv_tsv_path.is_file()

    covost_url = self.COVOST_URL_TEMPLATE.format(
        src_lang=source_language, tgt_lang=target_language
    )
    covost_archive = self.root / Path(covost_url).name
    if not covost_archive.is_file():
        download_url(covost_url, self.root.as_posix(), hash_value=None)
    extract_archive(covost_archive.as_posix())

    cv_tsv = load_df_from_tsv(cv_tsv_path)
    covost_tsv = load_df_from_tsv(
        self.root / Path(covost_url).name.replace(".tar.gz", "")
    )
    df = pd.merge(
        left=cv_tsv[["path", "sentence", "client_id"]],
        right=covost_tsv[["path", "translation", "split"]],
        how="inner",
        on="path",
    )
    if split == "train":
        df = df[(df["split"] == split) | (df["split"] == f"{split}_covost")]
    else:
        df = df[df["split"] == split]
    data = df.to_dict(orient="index").items()
    data = [v for k, v in sorted(data, key=lambda x: x[0])]
    self.data = []
    for e in data:
        try:
            path = self.root / "clips" / e["path"]
            _ = torchaudio.info(path.as_posix())
            self.data.append(e)
        except RuntimeError:
            pass
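# --- Usage sketch (illustrative addition, not from the original source) ---
# Assumes this __init__ belongs to a dataset class named `CoVoST` (hypothetical
# name) and that `root` already contains a Common Voice download with
# validated.tsv and a clips/ directory.
if __name__ == "__main__":
    dataset = CoVoST(
        root="/data/common_voice/fr",
        split="train",
        source_language="fr",
        target_language="en",
        version=2,
    )
    # Each entry is a merged Common Voice / CoVoST row with at least the
    # "path", "sentence", "client_id" and "translation" keys.
    print(len(dataset.data), dataset.data[0]["translation"])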
def __init__(self,
             root: str,
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.split(self._ext_archive)[0]
    folder_in_archive = os.path.join(basename, folder_in_archive)

    self._path = os.path.join(root, folder_in_archive)
    self._metadata_path = os.path.join(root, basename, 'metadata.csv')

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum)
            extract_archive(archive)

    with open(self._metadata_path, "r", newline='') as metadata:
        walker = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._walker = list(walker)
def __init__(self, root: Union[str, Path], url: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b", url_symbols: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols", download: bool = False, exclude_punctuations: bool = True) -> None: self.exclude_punctuations = exclude_punctuations root = Path(root) if not os.path.isdir(root): os.mkdir(root) if download: if os.path.isdir(root): checksum = _CHECKSUMS.get(url, None) download_url(url, root, hash_value=checksum, hash_type="md5") checksum = _CHECKSUMS.get(url_symbols, None) download_url(url_symbols, root, hash_value=checksum, hash_type="md5") else: RuntimeError("The argument `root` must be a path to directory, " f"but '{root}' is passed in instead.") self._root_path = root basename = os.path.basename(url) basename_symbols = os.path.basename(url_symbols) with open(os.path.join(self._root_path, basename_symbols), "r") as text: self._symbols = [line.strip() for line in text.readlines()] with open(os.path.join(self._root_path, basename), "r", encoding='latin-1') as text: self._dictionary = _parse_dictionary(text.readlines(), exclude_punctuations=self.exclude_punctuations)
def __init__(self,
             root: Union[str, Path],
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:

    # Get string representation of 'root' in case Path object is passed
    root = os.fspath(root)

    archive = os.path.basename(url)
    archive = os.path.join(root, archive)
    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum, hash_type="md5")
            extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*' + self._ext_audio))
def get_metadata(out_root, subset):
    def predicate(id_):
        is_plenary = id_.find("PLENARY") > -1
        if subset in {"10k", "10k_sd"}:
            return is_plenary and 20190101 <= int(id_[:8]) < 20200801
        elif subset in {"100k"}:
            return is_plenary
        elif subset in LANGUAGES:
            return is_plenary and id_.endswith(subset)
        elif subset in LANGUAGES_V2:
            return id_.endswith(subset.split("_")[0])
        return True

    filename = "unlabelled_sd" if subset == "10k_sd" else "unlabelled_v2"
    url = f"{DOWNLOAD_BASE_URL}/annotations/{filename}.tsv.gz"
    tsv_path = out_root / Path(url).name
    if not tsv_path.exists():
        download_url(url, out_root.as_posix(), Path(url).name)
    if subset == '10k_sd':
        with gzip.open(tsv_path, mode="rt") as f:
            rows = [
                (r["session_id"], r["id_"], r["start_time"], r["end_time"])
                for r in csv.DictReader(f, delimiter="|")
                if predicate(r["session_id"])
            ]
    else:
        with gzip.open(tsv_path, mode="rt") as f:
            rows = [
                (r["event_id"], r["segment_no"], r["start"], r["end"])
                for r in csv.DictReader(f, delimiter="\t")
                if predicate(r["event_id"])
            ]
    return rows
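# --- Usage sketch (illustrative addition, not from the original source) ---
# Assumes LANGUAGES, LANGUAGES_V2 and DOWNLOAD_BASE_URL are the module-level
# constants referenced by get_metadata above.
if __name__ == "__main__":
    rows = get_metadata(Path("/data/voxpopuli"), "10k")
    # For the "10k" subset each row is (event_id, segment_no, start, end) for
    # plenary sessions recorded between 2019-01-01 and 2020-07-31.
    print(f"{len(rows):,} unlabelled segments")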
def __init__(self, root, tsv=TSV, url=URL, download=False):

    languages = {
        "tatar": "tt",
        "english": "en",
        "german": "de",
        "french": "fr",
        "welsh": "cy",
        "breton": "br",
        "chuvash": "cv",
        "turkish": "tr",
        "kyrgyz": "ky",
        "irish": "ga-IE",
        "kabyle": "kab",
        "catalan": "ca",
        "taiwanese": "zh-TW",
        "slovenian": "sl",
        "italian": "it",
        "dutch": "nl",
        "hakha chin": "cnh",
        "esperanto": "eo",
        "estonian": "et",
        "persian": "fa",
        "basque": "eu",
        "spanish": "es",
        "chinese": "zh-CN",
        "mongolian": "mn",
        "sakha": "sah",
        "dhivehi": "dv",
        "kinyarwanda": "rw",
        "swedish": "sv-SE",
        "russian": "ru",
    }

    # Membership test, not identity: `url` may be a language name key.
    if url in languages:
        ext_archive = ".tar.gz"
        language = languages[url]
        base_url = (
            "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4"
            + ".s3.amazonaws.com/cv-corpus-3/"
        )
        url = base_url + language + ext_archive

    archive = os.path.basename(url)
    archive = os.path.join(root, archive)
    self._path = root

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                download_url(url, root)
            extract_archive(archive)

    self._tsv = os.path.join(root, tsv)

    with open(self._tsv, "r") as tsv:
        walker = unicode_csv_reader(tsv, delimiter="\t")
        self._header = next(walker)
        self._walker = list(walker)
def __init__(self, root, params, url=URL, download=False):

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.split(self._ext_archive)[0]
    base_folder = os.path.join(root, basename)

    self._wav_path = os.path.join(base_folder, 'wavs')
    self._mel_path = os.path.join(base_folder, 'mels')
    self._char_path = os.path.join(base_folder, 'chars')
    self._phone_path = os.path.join(base_folder, 'phones')
    self._metadata_path = os.path.join(base_folder, 'metadata.csv')

    if download:
        if not os.path.isdir(self._wav_path):
            if not os.path.isfile(archive):
                download_url(url, root)
            extract_archive(archive)

    if not os.path.isdir(self._mel_path):
        precompute_spectrograms(base_folder, params)

    if not os.path.isdir(self._char_path) or not os.path.isdir(self._phone_path):
        precompute_char_phone(base_folder)

    with open(self._metadata_path, "r") as metadata:
        walker = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._walker = list(walker)
def __init__(self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE, download=False):

    if url in [
        "speech_commands_v0.01",
        "speech_commands_v0.02",
    ]:
        base_url = "https://storage.googleapis.com/download.tensorflow.org/data/"
        ext_archive = ".tar.gz"
        url = os.path.join(base_url, url + ext_archive)

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.rsplit(".", 2)[0]
    folder_in_archive = os.path.join(folder_in_archive, basename)

    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                download_url(url, root)
            extract_archive(archive, self._path)

    walker = walk_files(self._path, suffix=".wav", prefix=True)
    walker = filter(lambda w: HASH_DIVIDER in w and EXCEPT_FOLDER not in w, walker)
    self._walker = list(walker)
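# --- Usage sketch (illustrative addition, not from the original source) ---
# Assumes this __init__ belongs to a SPEECHCOMMANDS-style dataset class
# (hypothetical name). Passing the bare version string lets the constructor
# build the full download URL itself.
if __name__ == "__main__":
    dataset = SPEECHCOMMANDS("/data", url="speech_commands_v0.02", download=True)
    # The walker keeps only real utterances: paths containing HASH_DIVIDER
    # ("_nohash_" in Speech Commands filenames) and excluding EXCEPT_FOLDER
    # (the background-noise directory).
    print(len(dataset._walker))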
def download(args):
    if args.subset in LANGUAGES:
        languages = [args.subset]
        years = YEARS
    else:
        languages = {
            "100k": LANGUAGES,
            "10k": LANGUAGES,
            "asr": ["original"],
        }.get(args.subset, None)
        years = {
            "100k": YEARS,
            "10k": [2019, 2020],
            "asr": YEARS,
        }.get(args.subset, None)

    url_list = []
    for l in languages:
        for y in years:
            url_list.append(f"{DOWNLOAD_BASE_URL}/audios/{l}_{y}.tar")

    out_root = Path(args.root) / "raw_audios"
    out_root.mkdir(exist_ok=True, parents=True)
    print(f"{len(url_list)} files to download...")
    for url in tqdm(url_list):
        tar_path = out_root / Path(url).name
        download_url(url, out_root.as_posix(), Path(url).name)
        extract_archive(tar_path.as_posix())
        os.remove(tar_path)
def __init__(self, root: str, release: str = "release1", subset: str = None, download: bool = False, audio_ext=".sph") -> None: self._ext_audio = audio_ext if release in _RELEASE_CONFIGS.keys(): folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] url = _RELEASE_CONFIGS[release]["url"] subset = subset if subset else _RELEASE_CONFIGS[release]["subset"] else: # Raise warning raise RuntimeError( "The release {} does not match any of the supported tedlium releases{} " .format( release, _RELEASE_CONFIGS.keys(), )) if subset not in _RELEASE_CONFIGS[release]["supported_subsets"]: # Raise warning raise RuntimeError( "The subset {} does not match any of the supported tedlium subsets{} " .format( subset, _RELEASE_CONFIGS[release]["supported_subsets"], )) basename = os.path.basename(url) archive = os.path.join(root, basename) basename = basename.split(".")[0] self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) if subset in ["train", "dev", "test"]: self._path = os.path.join(self._path, subset) if download: if not os.path.isdir(self._path): if not os.path.isfile(archive): checksum = _RELEASE_CONFIGS[release]["checksum"] download_url(url, root, hash_value=checksum) extract_archive(archive) # Create list for all samples self._filelist = [] stm_path = os.path.join(self._path, "stm") for file in sorted(os.listdir(stm_path)): if file.endswith(".stm"): stm_path = os.path.join(self._path, "stm", file) with open(stm_path) as f: l = len(f.readlines()) file = file.replace(".stm", "") self._filelist.extend((file, line) for line in range(l)) # Create dict path for later read self._dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) self._phoneme_dict = None
def get(args):
    src_lang, tgt_lang = args.source_lang, args.target_lang
    if args.use_annotated_target:
        assert tgt_lang in S2S_TGT_LANGUAGES_WITH_HUMAN_TRANSCRIPTION
    in_root = Path(args.root) / "raw_audios" / tgt_lang
    asr_root = Path(args.root) / "transcribed_data" / src_lang
    out_root = asr_root / tgt_lang
    out_root.mkdir(exist_ok=True, parents=True)

    # Get metadata TSV
    url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{src_lang}.tsv.gz"
    tsv_path = asr_root / Path(url).name
    if not tsv_path.exists():
        download_url(url, asr_root.as_posix(), Path(url).name)
    with gzip.open(tsv_path, "rt") as f:
        src_metadata = [x for x in csv.DictReader(f, delimiter="|")]
    src_metadata = {
        "{}-{}".format(r["session_id"], r["id_"]): (r["original_text"], r["speaker_id"])
        for r in src_metadata
    }

    ref_sfx = "_ref" if args.use_annotated_target else ""
    url = f"{DOWNLOAD_BASE_URL}/annotations/s2s/s2s_{tgt_lang}{ref_sfx}.tsv.gz"
    tsv_path = out_root / Path(url).name
    if not tsv_path.exists():
        download_url(url, out_root.as_posix(), Path(url).name)
    with gzip.open(tsv_path, "rt") as f:
        tgt_metadata = [x for x in csv.DictReader(f, delimiter="\t")]

    # Get segment into list
    items = defaultdict(list)
    manifest = []
    print("Loading manifest...")
    for r in tqdm(tgt_metadata):
        src_id = r["id"]
        event_id, _src_lang, utt_id = parse_src_id(src_id)
        if _src_lang != src_lang:
            continue
        year = event_id[:4]
        in_path = in_root / year / f"{event_id}_{tgt_lang}.ogg"
        cur_out_root = out_root / year
        cur_out_root.mkdir(exist_ok=True, parents=True)
        tgt_id = f"{event_id}-{tgt_lang}_{utt_id}"
        out_path = cur_out_root / f"{tgt_id}.ogg"
        items[in_path.as_posix()].append(
            (out_path.as_posix(), float(r["start_time"]), float(r["end_time"]))
        )
        src_text, src_speaker_id = src_metadata[src_id]
        tgt_text = r["tgt_text"] if args.use_annotated_target else ""
        manifest.append((src_id, src_text, src_speaker_id, tgt_id, tgt_text))
    items = list(items.items())

    # Segment
    print(f"Segmenting {len(items):,} files...")
    multiprocess_run(items, _segment)

    # Output per-data-split list
    header = ["src_id", "src_text", "src_speaker_id", "tgt_id", "tgt_text"]
    with open(out_root / f"s2s{ref_sfx}.tsv", "w") as f_o:
        f_o.write("\t".join(header) + "\n")
        for cols in manifest:
            f_o.write("\t".join(cols) + "\n")
def __init__(self,
             root: Union[str, Path],
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:

    if url in [
        "aew", "ahw", "aup", "awb", "axb", "bdl",
        "clb", "eey", "fem", "gka", "jmk", "ksp",
        "ljm", "lnh", "rms", "rxr", "slp", "slt",
    ]:
        url = "cmu_us_" + url + "_arctic"
        ext_archive = ".tar.bz2"
        base_url = "http://www.festvox.org/cmu_arctic/packed/"
        url = os.path.join(base_url, url + ext_archive)

    # Get string representation of 'root' in case Path object is passed
    root = os.fspath(root)

    basename = os.path.basename(url)
    root = os.path.join(root, folder_in_archive)
    if not os.path.isdir(root):
        os.mkdir(root)
    archive = os.path.join(root, basename)

    basename = basename.split(".")[0]

    self._path = os.path.join(root, basename)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum, hash_type="md5")
            extract_archive(archive)

    self._text = os.path.join(self._path, self._folder_text, self._file_text)

    with open(self._text, "r") as text:
        walker = csv.reader(text, delimiter="\n")
        self._walker = list(walker)
def sample_data():
    path = get_default_cache_folder()
    download_url(
        "https://github.com/scart97/lapsbm-backup/archive/refs/tags/lapsbm-ci.tar.gz",
        download_folder=path,
        resume=True,
    )
    extract_archive(path / "lapsbm-backup-lapsbm-ci.tar.gz", path)
    return path / "lapsbm-backup-lapsbm-ci"
def download(self) -> None:
    """Download the dataset and extract the archive."""
    if self.check_integrity(self.target_directory):
        print("Dataset already downloaded and verified.")
    else:
        archive_path = os.path.join(self.root, FOLDER_IN_ARCHIVE + ".zip")

        download_url(self.url, self.root)
        extract_archive(archive_path, self.root)
def __init__(
    self,
    root: str,
    url: str = URL,
    folder_in_archive: str = FOLDER_IN_ARCHIVE,
    download: bool = False,
    subset: Any = None,
) -> None:
    self.root = root
    self.url = url
    self.folder_in_archive = folder_in_archive
    self.download = download
    self.subset = subset

    assert subset is None or subset in ["training", "validation", "testing"], (
        "When `subset` is not None, it must take a value from "
        "{'training', 'validation', 'testing'}."
    )

    archive = os.path.basename(url)
    archive = os.path.join(root, archive)
    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum, hash_type="md5")
            extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    if self.subset is None:
        walker = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
        self._walker = list(walker)
    else:
        # `filtered_train`, `filtered_valid` and `filtered_test` are the
        # split filename lists defined elsewhere in the module.
        if self.subset == "training":
            self._walker = filtered_train
        elif self.subset == "validation":
            self._walker = filtered_valid
        elif self.subset == "testing":
            self._walker = filtered_test
def __init__(
    self,
    root: str,
    split: str,
    source_language: str,
    target_language: Optional[str] = None,
    version: int = 2,
    download: bool = False,
) -> None:
    assert version in self.VERSIONS and split in self.SPLITS
    assert source_language is not None
    self.no_translation = (target_language is None)
    if not self.no_translation:
        assert 'en' in {source_language, target_language}
        if source_language == 'en':
            assert target_language in self.EN_XX_LANGUAGES[version]
        else:
            assert source_language in self.XX_EN_LANGUAGES[version]
    else:
        # Hack here so that we can get "split" column from CoVoST TSV.
        # Note that we use CoVoST train split for ASR which is an extension
        # to Common Voice train split.
        target_language = 'de' if source_language == 'en' else 'en'

    self.root = os.path.join(root, 'raw')
    os.makedirs(self.root, exist_ok=True)

    cv_url = self.CV_URL_TEMPLATE.format(ver=self.CV_VERSION_ID[version],
                                         lang=source_language)
    cv_archive = os.path.join(self.root, os.path.basename(cv_url))
    if download:
        if not os.path.isfile(cv_archive):
            download_url(cv_url, self.root, hash_value=None)
        extract_archive(cv_archive)

    covost_url = self.COVOST_URL_TEMPLATE.format(src_lang=source_language,
                                                 tgt_lang=target_language)
    covost_archive = os.path.join(self.root, os.path.basename(covost_url))
    if download:
        if not os.path.isfile(covost_archive):
            download_url(covost_url, self.root, hash_value=None)
        extract_archive(covost_archive)

    cv_tsv = self.load_from_tsv(os.path.join(self.root, 'validated.tsv'))
    covost_tsv = self.load_from_tsv(
        os.path.join(self.root, os.path.basename(covost_url).replace('.tar.gz', ''))
    )
    df = pd.merge(left=cv_tsv[['path', 'sentence', 'client_id']],
                  right=covost_tsv[['path', 'translation', 'split']],
                  how='inner',
                  on='path')
    if split == 'train':
        df = df[(df['split'] == split) | (df['split'] == f'{split}_covost')]
    else:
        df = df[df['split'] == split]
    self.data = df.to_dict(orient='index').items()
    self.data = [v for k, v in sorted(self.data, key=lambda x: x[0])]
def __init__(self,
             root: str,
             url: str = URL,
             download: bool = False,
             mic_id: str = "mic2") -> None:
    archive = os.path.join(root, os.path.basename("VCTK-Corpus-0.92.zip"))
    self._path = os.path.join(root, "VCTK-Corpus-0.92")
    self._txt_dir = os.path.join(self._path, "txt")
    self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed")
    self._mic_id = mic_id

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum, hash_type="md5")
            extract_archive(archive, self._path)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    # Extracting speaker IDs from the folder structure
    self._speaker_ids = sorted(os.listdir(self._txt_dir))
    self._sample_ids = []

    # In version 0.92 of this dataset, some audio files are missing for
    # ``mic_1`` or ``mic_2`` even though the corresponding text exists.
    # We therefore traverse the audio folders following the text folder
    # structure and check that an audio file exists before adding its ID
    # to ``self._sample_ids``. Once loaded, the list can be indexed
    # directly for whatever parameters the user requires.
    for speaker_id in self._speaker_ids:
        utterance_dir = os.path.join(self._txt_dir, speaker_id)
        for utterance_file in sorted(
                f for f in os.listdir(utterance_dir) if f.endswith(".txt")):
            utterance_id = os.path.splitext(utterance_file)[0]
            audio_path_mic = os.path.join(
                self._audio_dir, speaker_id, f"{utterance_id}_{mic_id}.flac")
            # Speaker p280 has no mic2 audio; p362 has scattered gaps.
            if speaker_id == "p280" and mic_id == "mic2":
                break
            if speaker_id == "p362" and not os.path.isfile(audio_path_mic):
                continue
            self._sample_ids.append(utterance_id.split("_"))
def __init__(self,
             root: str,
             url: str,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False) -> None:

    if url in [
        "dev-clean", "dev-other", "test-clean", "test-other",
        "train-clean-100", "train-clean-360", "train-other-500",
    ]:
        ext_archive = ".tar.gz"
        base_url = "http://www.openslr.org/resources/12/"
        url = os.path.join(base_url, url + ext_archive)

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.split(".")[0]
    folder_in_archive = os.path.join(folder_in_archive, basename)

    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum)
            extract_archive(archive)

    audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)
    for root_dir, dirs, files in os.walk(self._path):
        for file in files:
            if file.split('.')[-1] == self._ext_wav.split('.')[-1]:
                file_audio = os.path.join(root_dir, file)
                waveform, _ = torchaudio.load(file_audio)
                spec = audio_transforms(waveform)
                # Save under the mel extension so the walker below picks up
                # the precomputed spectrograms (saving under the wav
                # extension would overwrite the source audio).
                file_spec = os.path.join(root_dir, file.split('.')[0] + self._ext_mel)
                torch.save(spec, file_spec)

    walker = walk_files(
        self._path, suffix=self._ext_mel, prefix=False, remove_suffix=True
    )
    self._walker = list(walker)
def __init__(
    self,
    root,
    url=URL,
    folder_in_archive=FOLDER_IN_ARCHIVE,
    download=False,
    transform=None,
    target_transform=None,
    return_dict=False,
):
    if not return_dict:
        warnings.warn(
            "In the next version, the item returned will be a dictionary. "
            "Please use `return_dict=True` to enable this behavior now, "
            "and suppress this warning.",
            DeprecationWarning,
        )
    if transform is not None or target_transform is not None:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please remove the options `transform` and `target_transform` "
            "to suppress this warning.",
            DeprecationWarning,
        )

    self.transform = transform
    self.target_transform = target_transform
    self.return_dict = return_dict

    archive = os.path.basename(url)
    archive = os.path.join(root, archive)
    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                download_url(url, root)
            extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    walker = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(walker)
def __init__(self,
             root: str,
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False,
             downsample: bool = False,
             transform: Any = None,
             target_transform: Any = None) -> None:

    if downsample:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please use `downsample=False` to enable this behavior now, "
            "and suppress this warning."
        )

    if transform is not None or target_transform is not None:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please remove the options `transform` and `target_transform` "
            "to suppress this warning."
        )

    self.downsample = downsample
    self.transform = transform
    self.target_transform = target_transform

    archive = os.path.basename(url)
    archive = os.path.join(root, archive)
    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum, hash_type="md5")
            extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    walker = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    walker = filter(lambda w: self._except_folder not in w, walker)
    self._walker = list(walker)
def _download(self) -> None:
    """Download the dataset and extract the archive."""
    archive_path = os.path.join(self.root, self.basename)
    if self._check_integrity(self._path):
        print("Dataset already downloaded and verified.")
    else:
        checksum = _CHECKSUMS.get(self.url, None)

        download_url(self.url, self.root, hash_value=checksum, hash_type="md5")
        extract_archive(archive_path, self._path)
def __init__(
    self,
    root: Union[str, Path],
    url: str = URL,
    folder_in_archive: str = FOLDER_IN_ARCHIVE,
    download: bool = False,
) -> None:

    if url in [
        "dev-clean", "dev-other", "test-clean", "test-other",
        "train-clean-100", "train-clean-360", "train-other-500",
    ]:
        ext_archive = ".tar.gz"
        base_url = "http://www.openslr.org/resources/60/"
        url = os.path.join(base_url, url + ext_archive)

    # Get string representation of 'root' in case Path object is passed
    root = os.fspath(root)

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.split(".")[0]
    folder_in_archive = os.path.join(folder_in_archive, basename)

    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum)
            extract_archive(archive)

    walker = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(walker)
def __init__(
    self,
    root: Union[str, Path],
    exclude_punctuations: bool = True,
    *,
    download: bool = False,
    url: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b",
    url_symbols: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols",
) -> None:
    self.exclude_punctuations = exclude_punctuations

    self._root_path = Path(root)
    if not os.path.isdir(self._root_path):
        raise RuntimeError(f'The root directory does not exist; {root}')

    dict_file = self._root_path / os.path.basename(url)
    symbol_file = self._root_path / os.path.basename(url_symbols)
    if not os.path.exists(dict_file):
        if not download:
            raise RuntimeError(
                'The dictionary file is not found in the following location. '
                f'Set `download=True` to download it. {dict_file}')
        checksum = _CHECKSUMS.get(url, None)
        download_url(url, root, hash_value=checksum, hash_type="md5")
    if not os.path.exists(symbol_file):
        if not download:
            raise RuntimeError(
                'The symbol file is not found in the following location. '
                f'Set `download=True` to download it. {symbol_file}')
        checksum = _CHECKSUMS.get(url_symbols, None)
        download_url(url_symbols, root, hash_value=checksum, hash_type="md5")

    with open(symbol_file, "r") as text:
        self._symbols = [line.strip() for line in text.readlines()]

    with open(dict_file, "r", encoding='latin-1') as text:
        self._dictionary = _parse_dictionary(
            text.readlines(), exclude_punctuations=self.exclude_punctuations)
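# --- Usage sketch (illustrative addition, not from the original source) ---
# Assumes this __init__ belongs to a CMUDict-style class (hypothetical name).
# The keyword-only `download` flag keeps the call site explicit.
if __name__ == "__main__":
    cmudict = CMUDict("/data/cmudict", download=True)
    # _symbols holds the phone inventory; _dictionary maps words to phones.
    print(len(cmudict._symbols), len(cmudict._dictionary))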
def __init__(self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE,
             download=False, preprocess=False):

    if url in [
        "dev-clean", "dev-other", "test-clean", "test-other",
        "train-clean-100", "train-clean-360", "train-other-500",
    ]:
        ext_archive = ".tar.gz"
        base_url = "http://www.openslr.org/resources/12/"
        url = os.path.join(base_url, url + ext_archive)

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.split(".")[0]
    folder_in_archive = os.path.join(folder_in_archive, basename)

    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                download_url(url, root)
            extract_archive(archive)

    walker = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(walker)

    if preprocess:
        self.preprocess_embeddings(self._path, self._ext_audio, self._ext_embed)
def _download_dataset(self):
    if not osp.isdir(self._dataset_root):
        os.mkdir(self._dataset_root)

    if self._verbose >= 1:
        print('Download files for the dataset...')

    infos = FILES_INFOS[self._subset]

    # Download archive files
    for name, info in infos.items():
        filename, url, hash_ = info['filename'], info['url'], info['hash']
        filepath = osp.join(self._dataset_root, filename)

        if not osp.isfile(filepath):
            if self._verbose >= 1:
                print(f'Download file "{filename}" from url "{url}"...')

            if osp.exists(filepath):
                raise RuntimeError(
                    f'Object "{filepath}" already exists but it\'s not a file.')
            download_url(url, self._dataset_root, filename,
                         hash_value=hash_, hash_type='md5')

    # Extract audio files from archives
    for name, info in infos.items():
        filename = info['filename']
        filepath = osp.join(self._dataset_root, filename)
        extension = filename.split('.')[-1]

        if extension == '7z':
            extracted_path = osp.join(self._dataset_root, self._subset)

            if not osp.isdir(extracted_path):
                if self._verbose >= 1:
                    print(f'Extract archive file "{filename}"...')

                archive_file = SevenZipFile(filepath)
                archive_file.extractall(self._dataset_root)
                archive_file.close()
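# --- Design note (illustrative addition, not from the original source) ---
# py7zr's SevenZipFile also works as a context manager, which guarantees the
# archive handle is closed even if extraction fails. A sketch of the same
# extraction step written that way (hypothetical helper; reuses the module's
# `osp` alias for os.path):
def _extract_7z(dataset_root: str, filename: str) -> None:
    from py7zr import SevenZipFile
    with SevenZipFile(osp.join(dataset_root, filename)) as archive_file:
        archive_file.extractall(dataset_root)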
def __init__(self,
             root: Union[str, Path],
             url: str = URL,
             folder_in_archive: str = FOLDER_IN_ARCHIVE,
             download: bool = False,
             transform: Any = None,
             target_transform: Any = None) -> None:

    if transform is not None or target_transform is not None:
        warnings.warn(
            "In the next version, transforms will not be part of the dataset. "
            "Please remove the options `transform` and `target_transform` "
            "to suppress this warning."
        )

    self.transform = transform
    self.target_transform = target_transform

    # Get string representation of 'root' in case Path object is passed
    root = os.fspath(root)

    archive = os.path.basename(url)
    archive = os.path.join(root, archive)
    self._path = os.path.join(root, folder_in_archive)

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _CHECKSUMS.get(url, None)
                download_url(url, root, hash_value=checksum, hash_type="md5")
            extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    walker = walk_files(self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True)
    self._walker = list(walker)
def test_expected_prediction_from_pretrained_model():
    # Loading the sample file
    try:
        folder = get_default_cache_folder()
        download_url(
            "https://github.com/fastaudio/10_Speakers_Sample/raw/76f365de2f4d282ec44450d68f5b88de37b8b7ad/train/f0001_us_f0001_00001.wav",
            download_folder=str(folder),
            filename="f0001_us_f0001_00001.wav",
            resume=True,
        )

        # Preparing data and model
        module = QuartznetModule.load_from_nemo(
            checkpoint_name=NemoCheckpoint.QuartzNet5x5LS_En
        )
        audio, sr = torchaudio.load(folder / "f0001_us_f0001_00001.wav")
        assert sr == 16000

        output = module.predict(audio)

        expected = "the world needs opportunities for new leaders and new ideas"
        assert output[0].strip() == expected
    except HTTPError:
        return
def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None:
    root = Path(root)
    archive = os.path.basename(url)
    archive = root / archive

    self._path = root / folder_in_archive
    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _RELEASE_CONFIGS["release1"]["checksum"]
                download_url(url, root, hash_value=checksum)
            extract_archive(archive)

    if not os.path.isdir(self._path):
        raise RuntimeError(
            "Dataset not found. Please use `download=True` to download it."
        )

    self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*.wav"))
def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None:
    root = Path(root)

    basename = os.path.basename(url)
    archive = root / basename

    basename = Path(basename.split(".tar.bz2")[0])
    folder_in_archive = basename / folder_in_archive

    self._path = root / folder_in_archive
    self._metadata_path = root / basename / 'metadata.csv'

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                checksum = _RELEASE_CONFIGS["release1"]["checksum"]
                download_url(url, root, hash_value=checksum)
            extract_archive(archive)

    with open(self._metadata_path, "r", newline='') as metadata:
        flist = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._flist = list(flist)
def __init__(self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE, download=False):

    basename = os.path.basename(url)
    archive = os.path.join(root, basename)

    basename = basename.split(self._ext_archive)[0]
    folder_in_archive = os.path.join(basename, folder_in_archive)

    self._path = os.path.join(root, folder_in_archive)
    self._metadata_path = os.path.join(root, basename, 'metadata.csv')

    if download:
        if not os.path.isdir(self._path):
            if not os.path.isfile(archive):
                download_url(url, root)
            extract_archive(archive)

    with open(self._metadata_path, "r") as metadata:
        walker = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
        self._walker = list(walker)