Example #1
def _write_part_to_wav(wav_data, path, start, end, sr=16000):
    assert 0. <= start < (len(wav_data) / sr)
    assert start < end <= (len(wav_data) / sr)

    # print('Saving {:12,d}({:6.2f}s) to {:12,d}({:6.2f}s) at: {}'
    #       .format(seconds_to_sample(start), start, seconds_to_sample(end), end, path))

    delete_file_if_exists(path)
    wavfile.write(
        path, sr,
        wav_data[_seconds_to_sample(start, True):_seconds_to_sample(end, False)])
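
A minimal usage sketch for this helper, assuming the surrounding module loads audio with `scipy.io.wavfile` at 16 kHz; the file names below are purely illustrative:

# Hypothetical usage: cut the segment between 1.5s and 4.0s out of a recording.
from scipy.io import wavfile

sr, wav_data = wavfile.read('recording.wav')  # assumed 16 kHz mono input
_write_part_to_wav(wav_data, 'segment.wav', start=1.5, end=4.0, sr=sr)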
Example #2
def maybe_download(url, md5=None, cache_archive=True):
    """Downloads a archive file if it's not cached. The archive gets extracted afterwards.
    It is advised to call `cleanup_cache()` after pre-processing to remove the cached extracted
    folder.
    Currently only TAR and ZIP files are supported.

    Args:
        url (str):
            URL for dataset download.
        md5 (str):
            Checksum for optional integrity check or `None`.
        cache_archive (bool):
            `True` if the downloaded archive should be kept, `False` if it should be deleted.

    Returns:
        Nothing.
    """
    file_name = os.path.basename(urlparse(url).path)
    storage_path = os.path.join(CACHE_DIR, file_name)

    # Download archive if necessary.
    if not os.path.isfile(storage_path):
        download_with_progress(url, storage_path)
    else:
        print('Using cached archive: {}'.format(storage_path))

    # Optional md5 integrity check.
    if md5:
        md5sum = storage.md5(storage_path)
        assert md5 == md5sum, 'Checksum does not match.'

    # Extract archive to cache directory.
    print('Starting extraction of: {}'.format(storage_path))
    if tarfile.is_tarfile(storage_path):
        storage.tar_extract_all(storage_path, CACHE_DIR)
    elif zipfile.is_zipfile(storage_path):
        with zipfile.ZipFile(storage_path, 'r') as zip_:
            zip_.extractall(CACHE_DIR)
    else:
        raise ValueError('Compression method not supported: {}'.format(storage_path))
    print('Completed extraction of: {}'.format(storage_path))

    # Delete cached archive if requested.
    if not cache_archive:
        storage.delete_file_if_exists(storage_path)
        print('Cache file "{}" deleted.'.format(storage_path))
Example #3
def __tatoeba_loader_helper(sample):
    path = sample['path']
    text = sample['text']
    mp3_path = '{}.mp3'.format(path)
    wav_path = '{}.wav'.format(path)
    wav_path = os.path.join(__TARGET_PATH, os.path.relpath(wav_path, __SOURCE_PATH))

    # Check if the MP3 audio file exists.
    if not os.path.isfile(mp3_path):
        # print('WARN: Audio file missing: {}'.format(mp3_path))
        return None

    # Skip files that are too small to contain usable audio.
    try:
        if os.path.getsize(mp3_path) <= 4048:
            return None
    except OSError:
        return None

    delete_file_if_exists(wav_path)

    # Convert MP3 file into WAV file, reduce volume to 0.95, downsample to 16kHz mono sound.
    ret = subprocess.call(['sox', '-v', '0.95', mp3_path, '-r', '16k', wav_path, 'remix', '1'])
    if not os.path.isfile(wav_path):
        raise RuntimeError('Failed to create WAV file with error code={}: {}'.format(ret, wav_path))

    # Validate that the example length is within boundaries.
    for i in range(5):
        try:
            (sr, y) = wavfile.read(wav_path)
            length_sec = len(y) / sr
            if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
                return None
            break
        except ValueError:
            print('WARN: Could not load ({}/5) wavfile: {}'.format(i + 1, wav_path))
            if i == 4:
                raise
            time.sleep(1)

    # TODO: Copy used files to corpus dir
    wav_path = os.path.relpath(wav_path, CORPUS_DIR)
    return '{} {}\n'.format(wav_path, text.strip())
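
This helper is presumably mapped over a list of sample dictionaries by a worker pool; a sketch of such a driver follows, where the sample entries and the output file name are illustrative assumptions:

from multiprocessing import Pool, cpu_count

# Illustrative sample list; real entries come from the Tatoeba metadata.
samples = [{'path': '/data/tatoeba/audio/12345', 'text': 'An example sentence.'}]

with Pool(processes=cpu_count()) as pool:
    lines = [line for line in pool.imap_unordered(__tatoeba_loader_helper, samples) if line]

with open('tatoeba_train.txt', 'w') as f:  # hypothetical output file
    f.writelines(lines)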
Example #4
def generate_txt(dataset_name, target, output):
    """Generate *.txt files containing the audio path and the corresponding sentence.
    Generated files are being stored at `TXT_TARGET_PATH`.

    Return additional data set information, see below.

    Args:
        dataset_name (str):
            Name of the dataset, e.g. 'libri_speech'.
        target (str):
            Target name, e.g. 'train', 'test', 'dev'
        output (list of str):
            Lines of the form '<wav path> <label>\n' for the `<dataset_name>_<target>.txt` file.

    Returns:
        str: Path to the created TXT file.
    """

    target_txt_path = os.path.join(TXT_DIR,
                                   '{}_{}.txt'.format(dataset_name, target))
    print('Starting to generate: {}'.format(os.path.basename(target_txt_path)))

    # Remove illegal characters from labels.
    output = _remove_illegal_characters(output)

    # Filter out labels that are shorter than 2 characters.
    output = list(
        filter(lambda x: len((x.split(' ', 1)[-1]).strip()) >= 2, output))

    # Write list to .txt file.
    print('> Writing {} {} lines to {}'.format(len(output), target, target_txt_path))
    # Delete the old file if it exists.
    storage.delete_file_if_exists(target_txt_path)

    # Write data to the file.
    with open(target_txt_path, 'w') as f:
        f.writelines(output)

    return target_txt_path
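
A small usage sketch, assuming a loader produced a list of '<wav path> <label>' lines; the example line is made up:

# Hypothetical call: write the collected lines for the LibriSpeech training split.
buffer = ['train-clean-100/1034/sample_0001.wav this is an example sentence\n']
txt_path = generate_txt('libri_speech', 'train', buffer)
print('Created: {}'.format(txt_path))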
Example #5
def __common_voice_loader_helper(line):
    # Helper method for thread pool.

    # Cleanup label text.
    text = line[1].strip().replace('  ', ' ')
    # Enforce min label length.
    if len(text) > 1:
        # Require at least one upvote and a downvote ratio of at most 25%.
        if int(line[2]) >= 1 and int(line[3]) / int(line[2]) <= 1 / 4:
            # Check if speaker accent is valid.
            if line[6] in __VALID_ACCENTS:
                mp3_path = os.path.join(__SOURCE_PATH, line[0])
                assert os.path.isfile(mp3_path)
                wav_path = os.path.relpath('{}.wav'.format(mp3_path[:-4]),
                                           __SOURCE_PATH)
                wav_path = os.path.join(__TARGET_PATH, wav_path)

                delete_file_if_exists(wav_path)
                # Convert MP3 to WAV, reduce volume to 0.95, downsample to 16kHz and mono sound.
                subprocess.call([
                    'sox', '-v', '0.95', mp3_path, '-r', '16k', wav_path,
                    'remix', '1'
                ])
                assert os.path.isfile(wav_path)

                # Validate that the example length is within boundaries.
                (sr, y) = wavfile.read(wav_path)
                length_sec = len(y) / sr
                if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
                    return None

                # Add the path (relative to the corpus directory) and the label to the TXT buffer.
                wav_path = os.path.relpath(wav_path, CORPUS_DIR)
                return '{} {}\n'.format(wav_path, text)

    return None
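
Like the Tatoeba helper, this function is presumably fed CSV rows from the Common Voice metadata file through a worker pool; a sketch under that assumption (the CSV file name and location are guesses):

import csv
from multiprocessing import Pool, cpu_count

# Hypothetical driver: parse the metadata CSV and convert valid entries in parallel.
with open(os.path.join(__SOURCE_PATH, 'cv-valid-train.csv'), 'r') as f:
    rows = list(csv.reader(f))[1:]  # skip the header row

with Pool(processes=cpu_count()) as pool:
    lines = [line for line in pool.imap_unordered(__common_voice_loader_helper, rows) if line]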
Example #6
def sort_txt_by_seq_len(txt_path, num_buckets=64, max_length=1700):
    """Sort a train.txt-like file by the sequence length of its audio files.
    Additionally, entries longer than `max_length` are discarded from the given TXT file.
    It also prints suggested bucket boundaries after computation.

    Args:
        txt_path (str): Path to the `train.txt`.
        num_buckets (int): Number of buckets to split the input into.
        max_length (int): Positive integer. Max length for a feature vector to keep.
            Set to `0` to keep everything.

    Returns:
        Tuple[List[int], float]: A tuple containing the boundary array and the total corpus length
        in seconds.
    """
    # Read train.txt file.
    with open(txt_path, 'r') as f:
        lines = f.readlines()

        # Set up the worker pool. Results are consumed in the calling thread only,
        # so no lock is needed around the output buffer.
        buffer = []  # Output buffer.

        with Pool(processes=cpu_count()) as pool:
            for result in tqdm(pool.imap_unordered(_feature_length,
                                                   lines,
                                                   chunksize=4),
                               desc='Reading audio samples',
                               total=len(lines),
                               file=sys.stdout,
                               unit='samples',
                               dynamic_ncols=True):
                buffer.append(result)

        # Sort by sequence length.
        buffer = sorted(buffer, key=lambda x: x[0])

        # Remove samples longer than `max_length` points.
        if max_length > 0:
            original_length = len(buffer)
            buffer = [s for s in buffer if s[0] < max_length]
            print(
                'Removed {:,d} samples from training.'.format(original_length -
                                                              len(buffer)))

        # Calculate optimal bucket sizes.
        lengths = [l[0] for l in buffer]
        step = len(lengths) // num_buckets
        buckets = set()
        for i in range(step, len(lengths), step):
            buckets.add(lengths[i])
        buckets = list(buckets)
        buckets.sort()
        print('Suggested buckets: ', buckets)

        # Plot histogram of feature vector length distribution.
        _plot_sequence_lengths(lengths)

        # Determine total corpus length in seconds.
        total_length = sum(map(lambda x: x[0], buffer)) / 0.1

        # Remove sequence length.
        buffer = ['{} {}'.format(p, l) for _, p, l in buffer]

    # Write back to file.
    storage.delete_file_if_exists(txt_path)
    with open(txt_path, 'w') as f:
        f.writelines(buffer)

    with open(txt_path, 'r') as f:
        print('Successfully sorted {} lines of {}'.format(
            len(f.readlines()), txt_path))

    return buckets[:-1], total_length
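
A closing usage sketch (the path is illustrative); the returned boundaries can then be passed to a bucketing input pipeline:

# Hypothetical call: sort train.txt in place and collect bucket boundaries.
boundaries, total_seconds = sort_txt_by_seq_len('train.txt', num_buckets=64, max_length=1700)
print('Bucket boundaries:', boundaries)
print('Total corpus length: {:.1f} hours'.format(total_seconds / 3600))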