Exemple #1
0
def tatoeba_loader(keep_archive):
    """Download, extract and convert the Tatoeba archive.

    Then build the CSV file for the `train` target (e.g. `<dataset_name>_train.csv`); this is
    the only target this loader generates.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        str: Path of the created CSV file, as returned by `generate_csv`.

    Raises:
        ValueError: If the extracted source path is not a directory, or if the user ratings
            CSV file does not exist after the download.
    """

    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Download user ratings CSV file into the extracted source folder.
    ratings_csv_path = os.path.join(__SOURCE_PATH, 'users_sentences.csv')
    download.download_with_progress(
        'http://downloads.tatoeba.org/exports/users_sentences.csv', ratings_csv_path)
    # Explicit check instead of `assert`: assertions are removed when Python runs with `-O`,
    # which would let a failed download pass unnoticed.
    if not os.path.exists(ratings_csv_path):
        raise ValueError('"{}" does not exist.'.format(ratings_csv_path))

    target = 'train'
    # Process the target's samples; the result is the content for the `<target>.csv` file.
    output = __tatoeba_loader(target)
    # Generate the `<target>.csv` file.
    csv_path = generate_csv(__NAME, target, output)

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return csv_path
def tedlium_loader(keep_archive):
    """Fetch the TEDLIUM archive, unpack it and convert it into CSV files.

    One CSV file is generated per target (`train`, `test` and `dev`), e.g.
    `<dataset_name>_train.csv`.

    Plenty of disk space is required, because the original format (SPH) is converted to WAV
    and then split up into parts.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        Tuple[str]: Paths of the created CSV files, in the order train, test, dev.

    Raises:
        ValueError: If the extracted source path is not a directory.
    """

    # Fetch and unpack the archive, unless that has already happened.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Every target together with the folder holding its data.
    targets = [
        {'name': 'train', 'folder': 'train'},
        {'name': 'test', 'folder': 'test'},
        {'name': 'dev', 'folder': 'dev'},
    ]

    created_paths = []
    for entry in targets:
        folder = entry['folder']
        source_directory = os.path.join(__SOURCE_PATH, folder)

        # Make sure the `sph` folder of this target exists below the target path.
        sph_directory = os.path.join(__TARGET_PATH, folder, 'sph')
        if not os.path.exists(sph_directory):
            os.makedirs(sph_directory)

        # Convert the target's source folder, then write its CSV file.
        loader_output = __tedlium_loader(source_directory)
        created_paths.append(generate_csv(__NAME, entry['name'], loader_output))

    # Remove the extracted folder again.
    download.cleanup_cache(__FOLDER_NAME)

    return tuple(created_paths)
Exemple #3
0
def cv_loader(keep_archive):
    """Fetch the Common Voice archive, unpack it and convert it into CSV files.

    One CSV file is generated per target (`train`, `test` and `dev`), e.g.
    `<dataset_name>_train.csv`.

    Only the valid datasets (`cv-valid-*` folders) are used. Additional constraints, as
    documented for this loader:
    * Downvotes must be at maximum 1/4 of upvotes.
    * Valid accents are: 'us', 'england', 'canada', 'australia'.
    * Accepting samples with only 1 upvote at the moment.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        List[str]: Paths of the created CSV files, in the order train, test, dev.

    Raises:
        ValueError: If the extracted source path is not a directory.
    """

    # Fetch and unpack the archive, unless that has already happened.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Every target together with the folders that feed it.
    targets = [
        {'name': 'train', 'folders': ['cv-valid-train']},
        {'name': 'test', 'folders': ['cv-valid-test']},
        {'name': 'dev', 'folders': ['cv-valid-dev']},
    ]

    # For each target: collect path and label data, then write the `<target>.csv` file.
    csv_paths = [
        generate_csv(__NAME, split['name'], __common_voice_loader(split['folders']))
        for split in targets
    ]

    # Remove the extracted folder again.
    download.cleanup_cache(__FOLDER_NAME)

    return csv_paths
Exemple #4
0
def libri_loader(keep_archive):
    """Fetch the Libri Speech archives, unpack them and convert them into CSV files.

    One CSV file is generated per target (`train`, `test` and `dev`), e.g.
    `<dataset_name>_train.csv`. The `train` target combines the `train-clean-100` and
    `train-clean-360` folders.

    Args:
        keep_archive (bool): Keep or delete the downloaded archives afterwards.

    Returns:
        List[str]: Paths of the created CSV files, in the order train, test, dev.

    Raises:
        ValueError: If the extracted source path is not a directory.
    """

    # Fetch and unpack all archives, unless that has already happened.
    download.maybe_download_batch(__URLS, md5s=__MD5S, cache_archives=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Every target together with the folders that feed it.
    targets = [
        {'name': 'train', 'folders': ['train-clean-100', 'train-clean-360']},
        {'name': 'test', 'folders': ['test-clean']},
        {'name': 'dev', 'folders': ['dev-clean']},
    ]

    created_paths = []
    for split in targets:
        # Convert the folders of this target ...
        converted = __libri_speech_loader(split['folders'])
        # ... and write the matching `<target>.csv` file.
        created_path = generate_csv(__NAME, split['name'], converted)
        created_paths.append(created_path)

    # Remove the extracted folder again.
    download.cleanup_cache(__FOLDER_NAME)

    return created_paths
Exemple #5
0
def cv_loader(keep_archive):
    """Download, extract and convert the Common Voice archive.

    Then build the CSV file for the `train` target (e.g. `<dataset_name>_train.csv`) from the
    `validated.tsv` file of the extracted archive.

    Uses only the valid dataset, additional constraints are:
    * Downvotes must be at maximum 1/4 of upvotes.
    * Valid accents are: 'us', 'england', 'canada', 'australia'.
    * Accepting samples with only 1 upvote at the moment.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        str: String containing the created CSV file path.

    Raises:
        ValueError: If the extracted source path is not a directory, or if the `validated.tsv`
            file is missing.
    """

    print(
        'Please read and accept the Mozilla Common Voice terms before downloading! '
        'Visit: https://voice.mozilla.org/en/datasets')

    # Download and extract the dataset if necessary.
    download.maybe_download(__URL,
                            md5=__MD5,
                            cache_archive=keep_archive,
                            target_subdir='cvv2')
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    tsv_path = os.path.join(CACHE_DIR, 'cvv2', 'validated.tsv')
    # Explicit check instead of `assert`: assertions are removed when Python runs with `-O`,
    # which would defer the failure to somewhere inside the loader helper.
    if not os.path.exists(tsv_path):
        raise ValueError('.TSV file not found: {}'.format(tsv_path))

    # Generate the path and label for the `<target>.csv` file.
    output = __common_voice_loader(tsv_path)
    # Generate the `<target>.csv` file.
    csv_path = generate_csv(__NAME, 'train', output)

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return csv_path