def tatoeba_loader(keep_archive):
    """Download and extract the Tatoeba dataset, then generate its `train.txt` file.

    Besides the main archive, the user ratings CSV file
    (`users_sentences.csv`) is downloaded into the extracted source folder.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        The value returned by `generate_txt` for the `train` target
        (presumably the path of the written TXT file).

    Raises:
        ValueError: If the extracted source path is not a directory, or if the
            user ratings CSV file does not exist after the download.
    """
    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Download user ratings CSV file.
    csv_url = 'http://downloads.tatoeba.org/exports/users_sentences.csv'
    csv_path = os.path.join(__SOURCE_PATH, 'users_sentences.csv')
    download.download_with_progress(csv_url, csv_path)
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with `-O`, which would let a failed download pass unnoticed.
    if not os.path.exists(csv_path):
        raise ValueError('"{}" does not exist.'.format(csv_path))

    target = 'train'
    # Generate the WAV and a string for the `<target>.txt` file.
    output = __tatoeba_loader(target)
    # Generate the `<target>.txt` file.
    txt_path = generate_txt(__NAME, target, output)

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return txt_path
def timit_loader():
    """Generate the TXT files for the `train` and `test` targets.

    For each target the private `__timit_loader` builds the output, which is
    then handed to `generate_txt`.

    Returns:
        Tuple: One `generate_txt` result per target, in the order
        (`train`, `test`) — presumably the paths of the written TXT files.
    """
    return tuple(
        generate_txt(__NAME, subset, __timit_loader(subset))
        for subset in ('train', 'test')
    )
def tedlium_loader(keep_archive):
    """Download and extract the dataset, then generate one TXT file per target.

    Requires lots of disk space, since the original format (SPH) is converted to
    WAV and then split up into parts.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        Tuple: One `generate_txt` result per target, in the order
        (`train`, `test`, `dev`) — presumably the paths of the written TXT files.

    Raises:
        ValueError: If the extracted source path is not a directory.
        OSError: If a target directory cannot be created (e.g. the path exists
            but is not a directory).
    """
    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Folders for each target.
    targets = [
        {'name': 'train', 'folder': 'train'},
        {'name': 'test', 'folder': 'test'},
        {'name': 'dev', 'folder': 'dev'},
    ]

    txt_paths = []
    for target in targets:
        # Create target folder if necessary. `exist_ok=True` replaces the former
        # check-then-create sequence, which could fail if the folder appeared
        # between the existence check and the creation.
        target_directory = os.path.join(__TARGET_PATH, target['folder'], 'sph')
        os.makedirs(target_directory, exist_ok=True)

        # Generate the WAV and a string for the `<target>.txt` file.
        source_directory = os.path.join(__SOURCE_PATH, target['folder'])
        output = __tedlium_loader(source_directory)
        # Generate the `<target>.txt` file.
        txt_paths.append(generate_txt(__NAME, target['name'], output))

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return tuple(txt_paths)
def common_voice_loader(keep_archive):
    """Fetch the common voice archive and generate one TXT file per target.

    Only the valid datasets are used. Additional constraints are:

    * Downvotes must be at maximum 1/4 of upvotes.
    * Valid accents are: 'us', 'england', 'canada', 'australia'.
    * Accepting samples with only 1 upvote at the moment.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        Tuple: One `generate_txt` result per target, in the order
        (`train`, `test`, `dev`) — presumably the paths of the written TXT files.

    Raises:
        ValueError: If the extracted source path is not a directory.
    """
    # Make sure the dataset is downloaded and extracted.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    source_is_directory = os.path.isdir(__SOURCE_PATH)
    if not source_is_directory:
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # (target name, source folders) pairs, processed in this order.
    folders_per_target = (
        ('train', ['cv-valid-train']),
        ('test', ['cv-valid-test']),
        ('dev', ['cv-valid-dev']),
    )

    results = []
    for name, folders in folders_per_target:
        # Build the WAV files and the content for `<name>.txt` ...
        content = __common_voice_loader(folders)
        # ... then write the `<name>.txt` file.
        results.append(generate_txt(__NAME, name, content))

    # Remove the extracted folder again.
    download.cleanup_cache(__FOLDER_NAME)

    return tuple(results)
def libri_speech_loader(keep_archive):
    """Fetch the archives as a batch and generate one TXT file per target.

    The `train` target combines the `train-clean-100` and `train-clean-360`
    folders; `test` and `dev` use `test-clean` and `dev-clean` respectively.

    L8ER: Can this be parallelized?

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        Tuple: One `generate_txt` result per target, in the order
        (`train`, `test`, `dev`) — presumably the paths of the written TXT files.

    Raises:
        ValueError: If the extracted source path is not a directory.
    """
    # Make sure all archives are downloaded and extracted.
    download.maybe_download_batch(__URLs, md5s=__MD5s, cache_archives=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Target names paired with their source folders, in processing order.
    target_names = ('train', 'test', 'dev')
    target_folders = (
        ['train-clean-100', 'train-clean-360'],
        ['test-clean'],
        ['dev-clean'],
    )

    # For each target: build the WAV files and the TXT content, then write
    # the `<target>.txt` file.
    written = [
        generate_txt(__NAME, name, __libri_speech_loader(folders))
        for name, folders in zip(target_names, target_folders)
    ]

    # Remove the extracted folder again.
    download.cleanup_cache(__FOLDER_NAME)

    return tuple(written)