Example #1
def __common_voice_loader_helper(csv_line, target_dir):
    # Helper method for thread pool.
    audio_file_hash = csv_line[1]
    label = csv_line[2].strip().replace('  ', ' ').replace('"', '')
    upvotes = int(csv_line[3])
    downvotes = int(csv_line[4])
    # age = line[5]
    # gender = line[6]
    accent = csv_line[7]

    # Source and target paths.
    mp3_path = os.path.join(__SOURCE_PATH, 'clips',
                            '{}.mp3'.format(audio_file_hash))
    wav_path = os.path.join(target_dir, '{}.wav'.format(audio_file_hash))

    # Enforce min label length.
    if len(label) < 3:
        # print('WARN: Label "{}" too short: {}'.format(label, mp3_path))
        return None

    # Check upvotes vs. downvotes relation: discard examples with fewer than
    # four upvotes per downvote (i.e. downvotes / upvotes > 1 / 4).
    if downvotes >= 1 and upvotes / downvotes < 4:
        # print('WARN: Too many down votes ({}/{}): {}'.format(upvotes, downvotes, mp3_path))
        return None

    # Check if speaker accent is valid.
    if accent not in __VALID_ACCENTS:
        # print('WARN: Invalid accent "{}": {}'.format(accent, mp3_path))
        return None

    # Make sure the file exists and is not empty.
    if not os.path.exists(mp3_path):
        print('WARN: MP3 file not found: {}'.format(mp3_path))
        return None
    if os.path.getsize(mp3_path) < 1024:
        print('WARN: MP3 file appears to be empty: {}'.format(mp3_path))
        return None

    delete_file_if_exists(wav_path)
    # Convert MP3 to WAV, reduce volume to 0.95, downsample to 16kHz and mono sound.
    subprocess.call(sox_commandline(mp3_path, wav_path))
    assert os.path.isfile(wav_path), 'Created WAV file not found: {}'.format(
        wav_path)

    # Validate that the example length is within boundaries.
    (sampling_rate, audio_data) = wavfile.read(wav_path)
    length_sec = len(audio_data) / sampling_rate
    if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
        return None

    # Add dataset relative to dataset path, label to CSV file buffer.
    wav_path = os.path.relpath(wav_path, CORPUS_DIR)

    return {
        CSV_HEADER_PATH: wav_path,
        CSV_HEADER_LENGTH: length_sec,
        CSV_HEADER_LABEL: label
    }
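The `sox_commandline` helper used above is not part of this snippet. A minimal sketch of what it could look like, assuming SoX is installed and that it performs exactly the conversion the comment describes (volume 0.95, 16 kHz, mono):

def sox_commandline(mp3_path, wav_path):
    # Hypothetical sketch: build the SoX argument list for converting an MP3 file
    # to a 16 kHz mono WAV file with the volume scaled to 0.95.
    return ['sox', '-v', '0.95', mp3_path, '-r', '16000', '-c', '1', wav_path]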
Example #2
def __write_part_to_wav(wav_data, path, start, end, sampling_rate=16000):
    # Write the slice of `wav_data` between `start` and `end` (in seconds) to a WAV file at `path`.
    assert 0. <= start < (len(wav_data) / sampling_rate)
    assert start < end <= (len(wav_data) / sampling_rate)

    # print('Saving {:12,d}({:6.2f}s) to {:12,d}({:6.2f}s) at: {}'
    #       .format(seconds_to_sample(start), start, seconds_to_sample(end), end, path))

    delete_file_if_exists(path)
    wavfile.write(path, sampling_rate, wav_data[__seconds_to_sample(start, True):
                                                __seconds_to_sample(end, False)])
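The `__seconds_to_sample` helper is not included in this snippet. A plausible sketch, assuming a fixed 16 kHz sampling rate and that the boolean selects rounding down for slice starts and up for slice ends:

import math

def __seconds_to_sample(seconds, start=True, sampling_rate=16000):
    # Hypothetical sketch: convert a time offset in seconds into a sample index,
    # rounding down for slice starts and up for slice ends.
    if start:
        return int(math.floor(seconds * sampling_rate))
    return int(math.ceil(seconds * sampling_rate))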
Example #3
def maybe_download(url, md5=None, cache_archive=True, target_subdir=''):
    """Downloads a archive file if it's not cached. The archive gets extracted afterwards.

    It is advised to call `cleanup_cache()` after pre-processing to remove the cached extracted
    folder.
    Currently only TAR and ZIP files are supported.

    Args:
        url (str):
            URL for dataset download.
        md5 (str): Optional.
            Checksum for optional integrity check or `None`.
        cache_archive (bool): Optional.
            `True` if the downloaded archive should be kept, `False` if it should be deleted.
        target_subdir (str): Optional.
            Subdirectory within the cache folder, to where the archive should be extracted.

    Returns:
        Nothing.
    """
    file_name = os.path.basename(urlparse(url).path)
    archive_path = os.path.join(CACHE_DIR, file_name)

    # Download archive if necessary.
    if not os.path.isfile(archive_path):
        download_with_progress(url, archive_path)
    else:
        print('Using cached archive: {}'.format(archive_path))

    # Optional md5 integrity check.
    if md5:
        md5sum = storage.md5(archive_path)
        assert md5 == md5sum, 'Checksum does not match.'

    # Create target subdirectory if needed.
    if len(target_subdir) > 0:
        storage.makedirs([os.path.join(CACHE_DIR, target_subdir)])

    # Extract archive to cache directory.
    print('Starting extraction of: {}'.format(archive_path))
    if tarfile.is_tarfile(archive_path):
        storage.tar_extract_all(archive_path,
                                os.path.join(CACHE_DIR, target_subdir))
    elif zipfile.is_zipfile(archive_path):
        with zipfile.ZipFile(archive_path, 'r') as zip_:
            zip_.extractall(os.path.join(CACHE_DIR, target_subdir))
    else:
        raise ValueError(
            'Compression method not supported for: {}'.format(archive_path))
    print('Completed extraction of: {}'.format(archive_path))

    # Delete cached archive if requested.
    if not cache_archive:
        storage.delete_file_if_exists(archive_path)
        print('Archive "{}" removed.'.format(archive_path))
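The `storage.md5` checksum helper used for the integrity check is not shown. A straightforward sketch based on `hashlib`, reading the archive in chunks so large downloads are not loaded into memory at once:

import hashlib

def md5(file_path, chunk_size=1024 * 1024):
    # Compute the MD5 checksum of a file by hashing it chunk by chunk.
    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as file_handle:
        for chunk in iter(lambda: file_handle.read(chunk_size), b''):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()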
Example #4
def generate_csv(dataset_name, target, csv_data):
    """Generate CSV files containing the audio path and the corresponding sentence.

    Generated files are being stored at `CSV_TARGET_PATH`.
    Examples with labels consisting of one or two characters are omitted.

    Return additional data set information, see below.

    Args:
        dataset_name (str):
            Name of the dataset, e.g. 'libri_speech'.

        target (str):
            Target name, e.g. 'train', 'test', 'dev'

        csv_data (List[Dict]):
            List containing the csv content for the `<dataset_name>_<target>.csv` file.

    Returns:
        str: Path to the created CSV file.
    """

    target_csv_path = os.path.join(DATA_DIR,
                                   '{}_{}.csv'.format(dataset_name, target))
    print('Starting to generate: {}'.format(os.path.basename(target_csv_path)))

    # Remove illegal characters from labels.
    for csv_entry in csv_data:
        # Apply label whitelist filter.
        csv_entry[CSV_HEADER_LABEL] = re.sub(
            LABEL_WHITELIST_PATTERN, '', csv_entry[CSV_HEADER_LABEL].lower())
        # Remove double spaces.
        csv_entry[CSV_HEADER_LABEL] = csv_entry[CSV_HEADER_LABEL].strip(
        ).replace('  ', ' ')

    # Filter out labels that are shorter than 2 characters.
    csv_data = list(filter(lambda x: len(x[CSV_HEADER_LABEL]) >= 2, csv_data))

    # Write list to CSV file.
    print('> Writing {} lines of {} files to {}'.format(
        len(csv_data), target, target_csv_path))
    # Delete the old file if it exists.
    delete_file_if_exists(target_csv_path)

    # Write data to the file.
    with open(target_csv_path, 'w', encoding='utf-8') as file_handle:
        writer = csv.DictWriter(file_handle,
                                delimiter=CSV_DELIMITER,
                                fieldnames=CSV_FIELDNAMES)
        writer.writeheader()

        writer.writerows(csv_data)

    return target_csv_path
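`LABEL_WHITELIST_PATTERN` is defined elsewhere in the module. Since the labels are lower-cased before the filter is applied, a hypothetical definition could keep only lowercase letters, spaces, and apostrophes; the actual character set may differ:

import re

# Hypothetical whitelist pattern: remove every character that is not a
# lowercase letter, a space, or an apostrophe.
LABEL_WHITELIST_PATTERN = re.compile(r"[^a-z ']+")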
Example #5
def __tatoeba_loader_helper(sample):
    path = sample['path']
    text = sample['text']
    mp3_path = '{}.mp3'.format(path)
    wav_path = '{}.wav'.format(path)
    wav_path = os.path.join(__TARGET_PATH,
                            os.path.relpath(wav_path, __SOURCE_PATH))

    # Check if audio file MP3 exists.
    if not os.path.isfile(mp3_path):
        # print('WARN: Audio file missing: {}'.format(mp3_path))
        return None

    # Make sure the file isn't empty or suspiciously small.
    try:
        if os.path.getsize(mp3_path) <= 4048:
            return None
    except OSError:
        return None

    # If a WAV file with the desired name already exists, delete it.
    delete_file_if_exists(wav_path)

    # Convert MP3 file into WAV file, reduce volume to 0.95, downsample to 16kHz mono sound.
    # Note that this call produces the WAV files in the `data/corpus` directory.
    ret = subprocess.call(sox_commandline(mp3_path, wav_path))
    if not os.path.isfile(wav_path):
        raise RuntimeError(
            'Failed to create WAV file with error code={}: {}'.format(
                ret, wav_path))

    # Validate that the example length is within boundaries.
    length_sec = None
    for i in range(5):
        try:
            (sampling_rate, audio_data) = wavfile.read(wav_path)
            length_sec = len(audio_data) / sampling_rate
            if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
                return None
            break
        except ValueError:
            print('WARN: Could not load ({}/5) wavfile: {}'.format(
                i + 1, wav_path))
            if i == 4:
                raise
            time.sleep(1)

    wav_path = os.path.relpath(wav_path, CORPUS_DIR)

    return {
        CSV_HEADER_PATH: wav_path,
        CSV_HEADER_LABEL: text.strip(),
        CSV_HEADER_LENGTH: length_sec
    }
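Like the Common Voice helper, this function is written to be mapped over all samples by a pool of workers. A hypothetical driver, assuming `samples` is a list of dicts with `path` and `text` keys and that rejected entries (returned as `None`) are dropped afterwards:

from concurrent.futures import ThreadPoolExecutor

def __tatoeba_loader(samples):
    # Hypothetical driver: map the helper over all samples with a thread pool
    # and keep only the entries that were not rejected.
    with ThreadPoolExecutor() as executor:
        results = executor.map(__tatoeba_loader_helper, samples)
    return [entry for entry in results if entry is not None]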
Example #6
def sort_by_seq_len(csv_path, max_length=17.0):
    """Sort a train.csv like file by it's audio files sequence length.

    Additionally outputs longer than `max_length` are being discarded from the given CSV file.
    Also it prints out optimal bucket sizes after computation.

    Args:
        csv_path (str):
            Path to the `train.csv`.

        max_length (float):
            Positive float. Maximum length in seconds for a feature vector to keep.
            Set to `0.` to keep everything.

    Returns:
        Nothing.
    """
    assert os.path.exists(csv_path) and os.path.isfile(csv_path)

    # Read train.csv file.
    with open(csv_path, 'r', encoding='utf-8') as file_handle:
        reader = csv.DictReader(file_handle,
                                delimiter=CSV_DELIMITER,
                                fieldnames=CSV_FIELDNAMES)

        # Read all lines into memory and remove CSV header.
        csv_data = [csv_entry for csv_entry in reader][1:]

    # Sort entries by sequence length.
    csv_data = sorted(csv_data, key=lambda x: float(x[CSV_HEADER_LENGTH]))

    # Remove samples longer than `max_length` seconds.
    if max_length > 0:
        number_of_entries = len(csv_data)
        csv_data = [
            d for d in csv_data if float(d[CSV_HEADER_LENGTH]) < max_length
        ]
        print('Removed {:,d} examples because they are too long.'.format(
            number_of_entries - len(csv_data)))

    # Write CSV data back to file.
    delete_file_if_exists(csv_path)
    with open(csv_path, 'w', encoding='utf-8') as file_handle:
        writer = csv.DictWriter(file_handle,
                                delimiter=CSV_DELIMITER,
                                fieldnames=CSV_FIELDNAMES)
        writer.writeheader()

        writer.writerows(csv_data)

    with open(csv_path, 'r', encoding='utf-8') as file_handle:
        print('Successfully sorted {} lines of {}'.format(
            len(file_handle.readlines()), csv_path))
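The docstring mentions optimal bucket sizes. One way to derive them from the already sorted data, assuming equal-count buckets, would be the sketch below; this is an illustration, not the function's actual implementation:

def bucket_boundaries(sorted_lengths, num_buckets=64):
    # Sketch: split the sorted sequence lengths into `num_buckets` buckets that
    # each contain roughly the same number of examples and return the boundaries.
    step = max(1, len(sorted_lengths) // num_buckets)
    return [sorted_lengths[i] for i in range(step - 1, len(sorted_lengths), step)]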
Example #7
def __common_voice_loader_helper(line):
    # Helper method for thread pool.

    # Cleanup label text.
    text = line[1].strip().replace('  ', ' ')
    # Enforce min label length.
    if len(text) > 1:
        # Check upvotes vs downvotes.
        if int(line[2]) >= 1 and int(line[3]) / int(line[2]) <= 1 / 4:
            # Check if speaker accent is valid.
            if line[6] in __VALID_ACCENTS:
                mp3_path = os.path.join(__SOURCE_PATH, line[0])
                assert os.path.isfile(mp3_path)
                wav_path = os.path.relpath('{}.wav'.format(mp3_path[:-4]),
                                           __SOURCE_PATH)
                wav_path = os.path.join(__TARGET_PATH, wav_path)

                delete_file_if_exists(wav_path)
                # Convert MP3 to WAV, reduce volume to 0.95, downsample to 16kHz and mono sound.
                subprocess.call(sox_commandline(mp3_path, wav_path))
                assert os.path.isfile(wav_path)

                # Validate that the example length is within boundaries.
                (sampling_rate, audio_data) = wavfile.read(wav_path)
                length_sec = len(audio_data) / sampling_rate
                if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
                    return None

                # Add dataset relative to dataset path, label to CSV file buffer.
                wav_path = os.path.relpath(wav_path, CORPUS_DIR)

                return {
                    CSV_HEADER_PATH: wav_path,
                    CSV_HEADER_LABEL: text,
                    CSV_HEADER_LENGTH: length_sec
                }

    return None
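Every example above calls `delete_file_if_exists` before writing a new WAV or CSV file. The helper is not included here; it presumably amounts to something like:

import os

def delete_file_if_exists(path):
    # Remove the file at `path` if it exists; do nothing otherwise.
    if os.path.isfile(path):
        os.remove(path)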