def __common_voice_loader_helper(csv_line, target_dir):
    """Convert a single Common Voice CSV row into a WAV file and a CSV entry.

    Helper method for the thread pool. The row is skipped (`None` is returned) if
    the label is shorter than 3 characters, the clip has too many down votes, the
    accent is not contained in `__VALID_ACCENTS`, the MP3 file is missing or smaller
    than 1024 bytes, or the converted audio is not between `MIN_EXAMPLE_LENGTH` and
    `MAX_EXAMPLE_LENGTH` seconds long.

    Args:
        csv_line (List[str]): Row of the source CSV file. Index 1 is read as the audio
            file hash, 2 as the label, 3 and 4 as up and down votes and 7 as the
            accent. Indices 5 and 6 are unused (presumably age and gender).
        target_dir (str): Directory the converted WAV file is written to.

    Returns:
        Dict or None: Dictionary with the WAV path relative to `CORPUS_DIR`, the
        example length in seconds and the cleaned label; `None` if the row is skipped.

    Raises:
        AssertionError: If no WAV file exists after the conversion.
    """
    audio_file_hash = csv_line[1]
    # Drop quotation marks, trim the label and collapse runs of spaces into one.
    label = csv_line[2].replace('"', '').strip()
    label = ' '.join(filter(None, label.split(' ')))
    upvotes = int(csv_line[3])
    downvotes = int(csv_line[4])
    accent = csv_line[7]

    # Source and target paths.
    mp3_path = os.path.join(__SOURCE_PATH, 'clips', '{}.mp3'.format(audio_file_hash))
    wav_path = os.path.join(target_dir, '{}.wav'.format(audio_file_hash))

    # Enforce min label length.
    if len(label) < 3:
        return None

    # Reject clips whose down votes exceed a quarter of their up votes. Clips without
    # any down vote are always kept. Integer arithmetic avoids a division by zero for
    # clips that have down votes but no up votes.
    if downvotes >= 1 and 4 * downvotes > upvotes:
        return None

    # Check if speaker accent is valid.
    if accent not in __VALID_ACCENTS:
        return None

    # Make sure the file exists and is not empty.
    if not os.path.exists(mp3_path):
        print('WARN: MP3 file not found: {}'.format(mp3_path))
        return None

    if os.path.getsize(mp3_path) < 1024:
        print('WARN: MP3 file appears to be empty: {}'.format(mp3_path))
        return None

    # Remove a stale WAV file from a previous run before converting.
    delete_file_if_exists(wav_path)

    # Convert MP3 to WAV with the command built by `sox_commandline`.
    subprocess.call(sox_commandline(mp3_path, wav_path))
    assert os.path.isfile(wav_path), 'Created WAV file not found: {}'.format(wav_path)

    # Validate that the example length is within boundaries.
    sampling_rate, audio_data = wavfile.read(wav_path)
    length_sec = len(audio_data) / sampling_rate
    if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
        return None

    # Store the path relative to the corpus directory.
    wav_path = os.path.relpath(wav_path, CORPUS_DIR)

    return {
        CSV_HEADER_PATH: wav_path,
        CSV_HEADER_LENGTH: length_sec,
        CSV_HEADER_LABEL: label
    }
def __write_part_to_wav(wav_data, path, start, end, sampling_rate=16000):
    """Write the section between `start` and `end` of `wav_data` to a WAV file.

    An existing file at `path` is deleted before the new file is written.

    Args:
        wav_data: Audio samples; must support `len()` and slicing.
        path (str): Destination of the WAV file.
        start (float): Start of the section in seconds.
        end (float): End of the section in seconds.
        sampling_rate (int): Optional. Sampling rate used to convert seconds to
            samples and passed on to `wavfile.write`.

    Returns:
        Nothing.

    Raises:
        AssertionError: If `0 <= start < end <= duration` does not hold, where
            duration is `len(wav_data) / sampling_rate`.
    """
    duration_sec = len(wav_data) / sampling_rate
    assert 0. <= start < duration_sec
    assert start < end <= duration_sec

    delete_file_if_exists(path)

    # The boolean flag is forwarded unchanged to `__seconds_to_sample`; its meaning
    # (presumably the rounding direction) is defined there.
    first_sample = __seconds_to_sample(start, True)
    last_sample = __seconds_to_sample(end, False)
    wavfile.write(path, sampling_rate, wav_data[first_sample:last_sample])
def maybe_download(url, md5=None, cache_archive=True, target_subdir=''):
    """Download an archive file if it is not cached and extract it afterwards.

    The archive is stored in and extracted to `CACHE_DIR`. It is advised to call
    `cleanup_cache()` after pre-processing to remove the cached extracted folder.
    Currently only TAR and ZIP files are supported.

    Args:
        url (str): URL for dataset download.
        md5 (str): Optional. Checksum for optional integrity check or `None`.
        cache_archive (bool): Optional. `True` if the downloaded archive should be
            kept, `False` if it should be deleted.
        target_subdir (str): Optional. Subdirectory within the cache folder, to where
            the archive should be extracted.

    Returns:
        Nothing.

    Raises:
        AssertionError: If `md5` is given and does not match the archive's checksum.
        ValueError: If the archive is neither a TAR nor a ZIP file.
    """
    file_name = os.path.basename(urlparse(url).path)
    archive_path = os.path.join(CACHE_DIR, file_name)
    extract_dir = os.path.join(CACHE_DIR, target_subdir)

    # Download archive if necessary.
    if os.path.isfile(archive_path):
        print('Using cached archive: {}'.format(archive_path))
    else:
        download_with_progress(url, archive_path)

    # Optional md5 integrity check. Raised explicitly instead of using `assert`, so
    # the check is not stripped when Python runs with `-O`; the exception type is
    # kept for existing callers.
    if md5:
        md5sum = storage.md5(archive_path)
        if md5 != md5sum:
            raise AssertionError('Checksum does not match.')

    # Create target subdirectory if needed.
    if target_subdir:
        storage.makedirs([extract_dir])

    # Extract archive to cache directory.
    print('Starting extraction of: {}'.format(archive_path))
    if tarfile.is_tarfile(archive_path):
        storage.tar_extract_all(archive_path, extract_dir)
    elif zipfile.is_zipfile(archive_path):
        with zipfile.ZipFile(archive_path, 'r') as zip_:
            zip_.extractall(extract_dir)
    else:
        raise ValueError(
            'Compression method not supported for: {}'.format(archive_path))
    print('Completed extraction of: {}'.format(archive_path))

    # Delete cached archive if requested.
    if not cache_archive:
        storage.delete_file_if_exists(archive_path)
        print('Archive "{}" removed.'.format(archive_path))
def generate_csv(dataset_name, target, csv_data):
    """Generate a CSV file containing the audio path and the corresponding sentence.

    The file is stored in `DATA_DIR` as `<dataset_name>_<target>.csv`; an existing
    file with that name is replaced. Labels are lower-cased, filtered with
    `LABEL_WHITELIST_PATTERN`, stripped and have runs of spaces collapsed. Examples
    whose cleaned label is shorter than 2 characters are omitted.

    Note that the label clean-up modifies the dictionaries in `csv_data` in place.

    Args:
        dataset_name (str): Name of the dataset, e.g. 'libri_speech'.
        target (str): Target name, e.g. 'train', 'test', 'dev'
        csv_data (List[Dict]): List containing the csv content for the
            `<dataset_name>_<target>.csv` file.

    Returns:
        str: Path to the created CSV file.
    """
    target_csv_path = os.path.join(DATA_DIR, '{}_{}.csv'.format(dataset_name, target))
    print('Starting to generate: {}'.format(os.path.basename(target_csv_path)))

    # Remove illegal characters from labels.
    for csv_entry in csv_data:
        # Apply label whitelist filter.
        label = re.sub(LABEL_WHITELIST_PATTERN, '', csv_entry[CSV_HEADER_LABEL].lower())
        # Collapse runs of spaces (removed characters can leave several in a row).
        csv_entry[CSV_HEADER_LABEL] = re.sub(' {2,}', ' ', label).strip()

    # Filter out labels that are shorter than 2 characters.
    csv_data = [entry for entry in csv_data if len(entry[CSV_HEADER_LABEL]) >= 2]

    # Write list to CSV file.
    print('> Writing {} lines of {} files to {}'.format(
        len(csv_data), target, target_csv_path))
    # Delete the old file if it exists.
    delete_file_if_exists(target_csv_path)

    # Write data to the file. `newline=''` lets the csv module control line endings.
    with open(target_csv_path, 'w', encoding='utf-8', newline='') as file_handle:
        writer = csv.DictWriter(file_handle, delimiter=CSV_DELIMITER,
                                fieldnames=CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(csv_data)

    return target_csv_path
def __tatoeba_loader_helper(sample):
    """Convert one sample's MP3 file to WAV and build its CSV entry.

    The sample is skipped (`None` is returned) if the MP3 file is missing, its size
    is at most 4048 bytes or cannot be determined, or the converted audio is not
    between `MIN_EXAMPLE_LENGTH` and `MAX_EXAMPLE_LENGTH` seconds long.

    Args:
        sample (Dict): Dictionary with the keys 'path' (audio file path without
            file extension) and 'text' (the label).

    Returns:
        Dict or None: Dictionary with the WAV path relative to `CORPUS_DIR`, the
        stripped label and the example length in seconds; `None` if skipped.

    Raises:
        RuntimeError: If no WAV file exists after the conversion.
        ValueError: If the WAV file could not be read in any of the 5 attempts.
    """
    source_stem = sample['path']
    label = sample['text']

    mp3_path = '{}.mp3'.format(source_stem)
    # Mirror the location below `__SOURCE_PATH` into `__TARGET_PATH`.
    relative_wav = os.path.relpath('{}.wav'.format(source_stem), __SOURCE_PATH)
    wav_path = os.path.join(__TARGET_PATH, relative_wav)

    # Skip samples without an MP3 file.
    if not os.path.isfile(mp3_path):
        return None

    # Skip samples whose MP3 file is (nearly) empty or not accessible.
    try:
        mp3_size = os.path.getsize(mp3_path)
    except OSError:
        return None
    if mp3_size <= 4048:
        return None

    # Remove a WAV file left over from a previous run.
    delete_file_if_exists(wav_path)

    # Convert the MP3 file into a WAV file with the command from `sox_commandline`.
    exit_code = subprocess.call(sox_commandline(mp3_path, wav_path))
    if not os.path.isfile(wav_path):
        raise RuntimeError(
            'Failed to create WAV file with error code={}: {}'.format(
                exit_code, wav_path))

    # Read the WAV file, retrying up to 5 times with a one second pause in between,
    # and validate that the example length is within boundaries.
    length_sec = None
    for attempt in range(5):
        try:
            sampling_rate, audio_data = wavfile.read(wav_path)
            length_sec = len(audio_data) / sampling_rate
            if not (MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH):
                return None
            break
        except ValueError:
            print('WARN: Could not load ({}/5) wavfile: {}'.format(
                attempt, wav_path))
            if attempt == 4:
                # Last attempt failed as well, give up.
                raise
            time.sleep(1)

    relative_to_corpus = os.path.relpath(wav_path, CORPUS_DIR)

    return {
        CSV_HEADER_PATH: relative_to_corpus,
        CSV_HEADER_LABEL: label.strip(),
        CSV_HEADER_LENGTH: length_sec
    }
def sort_by_seq_len(csv_path, max_length=17.0):
    """Sort a train.csv like file by its audio files sequence length.

    The file at `csv_path` is rewritten in place, sorted ascending by the
    `CSV_HEADER_LENGTH` column. Additionally, entries with a length of `max_length`
    seconds or more are discarded from the given CSV file.

    Args:
        csv_path (str): Path to the `train.csv`.
        max_length (float): Positive float. Only entries shorter than this length
            in seconds are kept. Set to `0.` to keep everything.

    Returns:
        Nothing.

    Raises:
        AssertionError: If `csv_path` is not an existing file.
    """
    assert os.path.isfile(csv_path), 'CSV file not found: {}'.format(csv_path)

    # Read train.csv file. `newline=''` lets the csv module handle line endings.
    with open(csv_path, 'r', encoding='utf-8', newline='') as file_handle:
        reader = csv.DictReader(file_handle, delimiter=CSV_DELIMITER,
                                fieldnames=CSV_FIELDNAMES)
        # Read all lines into memory and remove CSV header. The header is read as a
        # regular row, because the field names are passed explicitly.
        csv_data = list(reader)[1:]

    # Sort entries by sequence length.
    csv_data.sort(key=lambda entry: float(entry[CSV_HEADER_LENGTH]))

    # Remove samples that are `max_length` seconds or longer.
    if max_length > 0:
        number_of_entries = len(csv_data)
        csv_data = [
            entry for entry in csv_data
            if float(entry[CSV_HEADER_LENGTH]) < max_length
        ]
        print('Removed {:,d} examples because they are too long.'.format(
            number_of_entries - len(csv_data)))

    # Write CSV data back to file.
    delete_file_if_exists(csv_path)
    with open(csv_path, 'w', encoding='utf-8', newline='') as file_handle:
        writer = csv.DictWriter(file_handle, delimiter=CSV_DELIMITER,
                                fieldnames=CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(csv_data)

    # Report the number of lines in the written file (this includes the header).
    with open(csv_path, 'r', encoding='utf-8') as file_handle:
        print('Successfully sorted {} lines of {}'.format(
            len(file_handle.readlines()), csv_path))
def __common_voice_loader_helper(line):
    """Convert a single Common Voice CSV row into a WAV file and a CSV entry.

    Helper method for the thread pool. The row is skipped (`None` is returned) if
    the label has fewer than 2 characters, the vote check fails, the accent is not
    contained in `__VALID_ACCENTS`, or the converted audio is not between
    `MIN_EXAMPLE_LENGTH` and `MAX_EXAMPLE_LENGTH` seconds long.

    Args:
        line (List[str]): Row of the source CSV file. Index 0 is read as the MP3 path
            relative to `__SOURCE_PATH`, 1 as the label, 2 and 3 as vote counts
            (presumably up and down votes, in that order) and 6 as the accent.

    Returns:
        Dict or None: Dictionary with the WAV path relative to `CORPUS_DIR`, the
        cleaned label and the example length in seconds; `None` if the row is skipped.

    Raises:
        AssertionError: If the MP3 file does not exist or no WAV file exists after
            the conversion.
    """
    # Cleanup label text: trim it and collapse runs of spaces into one.
    text = ' '.join(filter(None, line[1].strip().split(' ')))

    # Enforce min label length.
    if len(text) <= 1:
        return None

    # Check upvotes vs downvotes: at least one up vote is required and the down
    # votes must not exceed a quarter of the up votes.
    upvotes = int(line[2])
    downvotes = int(line[3])
    if upvotes < 1 or downvotes / upvotes > 1 / 4:
        return None

    # Check if speaker accent is valid.
    if line[6] not in __VALID_ACCENTS:
        return None

    mp3_path = os.path.join(__SOURCE_PATH, line[0])
    assert os.path.isfile(mp3_path), 'MP3 file not found: {}'.format(mp3_path)

    # Mirror the location below `__SOURCE_PATH` into `__TARGET_PATH`, replacing the
    # file extension with `.wav`.
    wav_path = os.path.relpath(
        '{}.wav'.format(os.path.splitext(mp3_path)[0]), __SOURCE_PATH)
    wav_path = os.path.join(__TARGET_PATH, wav_path)

    # Remove a stale WAV file from a previous run before converting.
    delete_file_if_exists(wav_path)

    # Convert MP3 to WAV with the command built by `sox_commandline`.
    subprocess.call(sox_commandline(mp3_path, wav_path))
    assert os.path.isfile(wav_path), 'Created WAV file not found: {}'.format(wav_path)

    # Validate that the example length is within boundaries.
    sampling_rate, audio_data = wavfile.read(wav_path)
    length_sec = len(audio_data) / sampling_rate
    if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
        return None

    # Store the path relative to the corpus directory.
    wav_path = os.path.relpath(wav_path, CORPUS_DIR)

    return {
        CSV_HEADER_PATH: wav_path,
        CSV_HEADER_LABEL: text,
        CSV_HEADER_LENGTH: length_sec
    }