Python generate_txt Exemples, python.dataset.txt_files.generate_txt Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : tatoeba_loader.py Projet : yweweler/ctc-asr

def tatoeba_loader(keep_archive):
    """Download and extract the Tatoeba dataset, then generate its `train.txt` file.

    Besides the dataset archive, the user ratings file `users_sentences.csv` is
    downloaded into the extracted source directory before the data is processed.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.
            Forwarded to `download.maybe_download` as `cache_archive`.

    Returns:
        The value returned by `generate_txt` for the `train` target (presumably
        the path of the generated TXT file -- confirm against `generate_txt`).

    Raises:
        ValueError: If the extracted source directory does not exist.
        AssertionError: If the user ratings CSV file is missing after the download.
    """

    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Download user ratings CSV file.
    csv_path = os.path.join(__SOURCE_PATH, 'users_sentences.csv')
    download.download_with_progress('http://downloads.tatoeba.org/exports/users_sentences.csv',
                                    csv_path)
    # Raise explicitly instead of using an `assert` statement: asserts are removed when
    # Python runs with `-O`, which would let a failed download pass unnoticed. The
    # exception type is kept so existing callers observe the same error.
    if not os.path.exists(csv_path):
        raise AssertionError('"{}" does not exist after download.'.format(csv_path))

    target = 'train'
    # Generate the WAV and a string for the `<target>.txt` file.
    output = __tatoeba_loader(target)
    # Generate the `<target>.txt` file.
    txt_path = generate_txt(__NAME, target, output)

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return txt_path

Exemple #2

0

Afficher le fichier

def timit_loader():
    """Generate the TXT files for the TIMIT `train` and `test` targets.

    For each target the private `__timit_loader` helper builds the output string,
    which is then handed to `generate_txt` together with the dataset name.

    Returns:
        Tuple: The `generate_txt` results, ordered as (`train`, `test`).
    """

    def _write_target(split):
        # Build the content first, then let `generate_txt` write it out.
        content = __timit_loader(split)
        return generate_txt(__NAME, split, content)

    return tuple(map(_write_target, ('train', 'test')))

Exemple #3

0

Afficher le fichier

def tedlium_loader(keep_archive):
    """Download and extract the TED-LIUM dataset, then generate its TXT files.

    Requires lots of disk space, since the original format (SPH) is converted to WAV and then split
    up into parts.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.
            Forwarded to `download.maybe_download` as `cache_archive`.

    Returns:
        Tuple: The `generate_txt` results for the `train`, `test` and `dev` targets, in
        that order (presumably the paths of the generated TXT files -- confirm against
        `generate_txt`).

    Raises:
        ValueError: If the extracted source directory does not exist.
    """

    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Folders for each target.
    targets = [{
        'name': 'train',
        'folder': 'train'
    }, {
        'name': 'test',
        'folder': 'test'
    }, {
        'name': 'dev',
        'folder': 'dev'
    }]

    txt_paths = []
    for target in targets:
        # Create target folder if necessary. `exist_ok=True` replaces the former
        # exists()-then-makedirs() sequence, which could fail if the directory was
        # created between the check and the call.
        target_directory = os.path.join(__TARGET_PATH, target['folder'], 'sph')
        os.makedirs(target_directory, exist_ok=True)

        # Generate the WAV and a string for the `<target>.txt` file.
        source_directory = os.path.join(__SOURCE_PATH, target['folder'])
        output = __tedlium_loader(source_directory)
        # Generate the `<target>.txt` file.
        txt_paths.append(generate_txt(__NAME, target['name'], output))

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return tuple(txt_paths)

Exemple #4

0

Afficher le fichier

def common_voice_loader(keep_archive):
    """Fetch and unpack the Common Voice archive, then generate its TXT files.

    Only the valid datasets are used. The selection constraints noted for this
    loader (they are not applied in this function itself) are:
    * Downvotes must be at maximum 1/4 of upvotes.
    * Valid accents are: 'us', 'england', 'canada', 'australia'.
    * Accepting samples with only 1 upvote at the moment.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        Tuple: The `generate_txt` results for the `train`, `test` and `dev`
        targets, in that order.

    Raises:
        ValueError: If the extracted source directory does not exist.
    """

    # Make sure the dataset is available locally before processing it.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # (target name, source folders) pairs, processed in this order.
    splits = (
        ('train', ['cv-valid-train']),
        ('test', ['cv-valid-test']),
        ('dev', ['cv-valid-dev']),
    )

    generated = []
    for split_name, split_folders in splits:
        # Build the WAV files and the content of the `<target>.txt` file ...
        content = __common_voice_loader(split_folders)
        # ... then write the `<target>.txt` file itself.
        generated.append(generate_txt(__NAME, split_name, content))

    # Remove the extracted folder once every target has been processed.
    download.cleanup_cache(__FOLDER_NAME)

    return tuple(generated)

Exemple #5

0

Afficher le fichier

Fichier : libri_speech_loeader.py Projet : yweweler/ctc-asr

def libri_speech_loader(keep_archive):
    """Fetch and unpack the LibriSpeech archives, then generate the TXT files.

    TODO: Can this be parallelized?

    Args:
        keep_archive (bool): Keep or delete the downloaded archives afterwards.
            Forwarded to `download.maybe_download_batch` as `cache_archives`.

    Returns:
        Tuple: The `generate_txt` results for the `train`, `test` and `dev`
        targets, in that order.

    Raises:
        ValueError: If the extracted source directory does not exist.
    """

    # Make sure all archives are downloaded and extracted.
    download.maybe_download_batch(__URLs, md5s=__MD5s, cache_archives=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Source folders that feed each target; `train` merges two subsets.
    folders_by_target = (
        ('train', ['train-clean-100', 'train-clean-360']),
        ('test', ['test-clean']),
        ('dev', ['dev-clean']),
    )

    def _build_txt(name, folders):
        # Generate the WAV files and the content for `<name>.txt`, then write the file.
        content = __libri_speech_loader(folders)
        return generate_txt(__NAME, name, content)

    results = tuple(_build_txt(name, folders) for name, folders in folders_by_target)

    # Remove the extracted folder once every target has been processed.
    download.cleanup_cache(__FOLDER_NAME)

    return results