def test_bytesio(self, ext, compression):
    """Saving to a ``BytesIO`` object yields the same audio as saving to a path.

    The same waveform is written once directly to a file path and once into
    an in-memory buffer whose bytes are then dumped to a second file. Both
    files are loaded back and compared.
    """
    sample_rate = 16000
    waveform = get_wav_data('float32', 2, num_frames=16000)
    ref_path = self.get_temp_path(f'reference.{ext}')
    res_path = self.get_temp_path(f'test.{ext}')
    # Options shared by the path-based and the buffer-based save.
    save_options = {
        'channels_first': True,
        'sample_rate': sample_rate,
        'compression': compression,
    }

    # Reference: save straight to a file path.
    sox_io_backend.save(ref_path, waveform, **save_options)

    # Candidate: save into memory (no extension available, so the format
    # is passed explicitly), then persist the raw bytes.
    buffer = io.BytesIO()
    sox_io_backend.save(buffer, waveform, format=ext, **save_options)
    buffer.seek(0)
    with open(res_path, 'wb') as out_file:
        out_file.write(buffer.read())

    expected, _ = sox_io_backend.load(ref_path)
    found, sr = sox_io_backend.load(res_path)
    assert sample_rate == sr
    self.assertEqual(expected, found)
def test_load_fail(self):
    """Loading a non-existing file raises an error naming the file path.

    The expected message is matched as a full-line regular expression, so
    the path is escaped first; otherwise the ``.`` in the file name would
    match any character and the check would be looser than intended.
    """
    import re

    path = "non_existing_audio.wav"
    expected_message = (
        "^Error loading audio file: failed to open file {0}$".format(
            re.escape(path)))
    with self.assertRaisesRegex(RuntimeError, expected_message):
        sox_io_backend.load(path)
def assert_flac(self, sample_rate, num_channels, compression_level, duration):
    """Check that `sox_io_backend.load` decodes flac like the reference path.

    Same strategy as the mp3 check: the flac file is converted to wav with
    sox and that wav, read by `load_wav`, serves as the reference.
    """
    flac_path = self.get_temp_path('1.original.flac')
    wav_path = self.get_temp_path('2.reference.wav')

    # 1. Generate flac with sox
    sox_utils.gen_audio_file(
        flac_path,
        sample_rate,
        num_channels,
        compression=compression_level,
        bit_depth=16,
        duration=duration,
    )
    # 2. Convert to wav with sox
    sox_utils.convert_audio_file(flac_path, wav_path)
    # 3. Load flac with torchaudio
    found, sr = sox_io_backend.load(flac_path)
    # 4. Load the reference wav
    expected = load_wav(wav_path)[0]
    # 5. Compare
    assert sr == sample_rate
    self.assertEqual(found, expected, atol=4e-05, rtol=1.3e-06)
def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype='float32'):
    """Run save -> info -> load on a file path and check basic metadata.

    Only the sample rate and channel count are verified; sample values are
    not compared.
    """
    duration = 1
    audio_path = self.get_temp_path(f'test.{ext}')
    waveform = get_wav_data(
        dtype, num_channels, normalize=False,
        num_frames=sample_rate * duration)

    # 1. run save
    sox_io_backend.save(audio_path, waveform, sample_rate, compression=compression)

    # 2. run info
    metadata = sox_io_backend.info(audio_path)
    assert metadata.sample_rate == sample_rate
    assert metadata.num_channels == num_channels

    # 3. run load
    loaded, sr = sox_io_backend.load(audio_path, normalize=False)
    assert sr == sample_rate
    assert loaded.shape[0] == num_channels
def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype='float32'):
    """Run save -> info -> load through an in-memory file object.

    The buffer is rewound before every read-side call, and the format is
    passed explicitly because a ``BytesIO`` carries no file extension.
    Only the sample rate and channel count are verified.
    """
    duration = 1
    waveform = get_wav_data(
        dtype, num_channels, normalize=False,
        num_frames=sample_rate * duration)
    buffer = io.BytesIO()

    # 1. run save
    sox_io_backend.save(buffer, waveform, sample_rate, compression=compression, format=ext)

    # 2. run info
    buffer.seek(0)
    metadata = sox_io_backend.info(buffer, format=ext)
    assert metadata.sample_rate == sample_rate
    assert metadata.num_channels == num_channels

    # 3. run load
    buffer.seek(0)
    loaded, sr = sox_io_backend.load(buffer, normalize=False, format=ext)
    assert sr == sample_rate
    assert loaded.shape[0] == num_channels
def test_requests(self, ext, compression):
    """Loading from a streamed HTTP response matches loading from the path."""
    sample_rate = 16000
    # Only mp3 gets an explicit format; other formats are loaded with None.
    format_ = ext if ext in ['mp3'] else None
    file_name = f'test.{ext}'
    local_path = self.get_temp_path(file_name)

    sox_utils.gen_audio_file(
        local_path, sample_rate, num_channels=2, compression=compression)
    expected, _ = sox_io_backend.load(local_path)

    with requests.get(self.get_url(file_name), stream=True) as response:
        found, sr = sox_io_backend.load(response.raw, format=format_)

    assert sr == sample_rate
    self.assertEqual(expected, found)
def assert_vorbis(self, sample_rate, num_channels, quality_level, duration):
    """Check that `sox_io_backend.load` decodes vorbis like the reference path.

    Same strategy as the mp3 check: the vorbis file is converted to wav with
    sox and that wav, read by `load_wav`, serves as the reference.
    """
    file_name = f'{sample_rate}_{num_channels}_{quality_level}_{duration}.vorbis'
    vorbis_path = self.get_temp_path(file_name)
    wav_path = f'{vorbis_path}.wav'

    # 1. Generate vorbis with sox
    sox_utils.gen_audio_file(
        vorbis_path,
        sample_rate,
        num_channels,
        compression=quality_level,
        bit_depth=16,
        duration=duration,
    )
    # 2. Convert to wav with sox
    sox_utils.convert_audio_file(vorbis_path, wav_path)
    # 3. Load vorbis with torchaudio
    found, sr = sox_io_backend.load(vorbis_path)
    # 4. Load the reference wav
    expected = load_wav(wav_path)[0]
    # 5. Compare
    assert sr == sample_rate
    self.assertEqual(found, expected, atol=4e-05, rtol=1.3e-06)
def assert_amb(self, dtype, sample_rate, num_channels, normalize, duration):
    """Check that `sox_io_backend.load` decodes amb like the reference path.

    Same strategy as the mp3 check: the amb file is converted to wav with
    sox and that wav, read by `load_wav`, serves as the reference. The
    encoding and bit depth of the generated file are derived from `dtype`.
    """
    amb_path = self.get_temp_path('1.original.amb')
    wav_path = self.get_temp_path('2.reference.wav')

    # 1. Generate amb with sox
    sox_utils.gen_audio_file(
        amb_path,
        sample_rate,
        num_channels,
        encoding=sox_utils.get_encoding(dtype),
        bit_depth=sox_utils.get_bit_depth(dtype),
        duration=duration,
    )
    # 2. Convert to wav with sox
    sox_utils.convert_audio_file(amb_path, wav_path)
    # 3. Load amb with torchaudio
    found, sr = sox_io_backend.load(amb_path, normalize=normalize)
    # 4. Load the reference wav
    expected = load_wav(wav_path, normalize=normalize)[0]
    # 5. Compare
    assert sr == sample_rate
    self.assertEqual(found, expected, atol=4e-05, rtol=1.3e-06)
def test_fileobj(self, ext, compression):
    """Loading through an open binary file matches loading from its path."""
    sample_rate = 16000
    # Only mp3 gets an explicit format; other formats are loaded with None.
    format_ = ext if ext in ['mp3'] else None
    audio_path = self.get_temp_path(f'test.{ext}')

    sox_utils.gen_audio_file(
        audio_path, sample_rate, num_channels=2, compression=compression)
    expected, _ = sox_io_backend.load(audio_path)

    with open(audio_path, 'rb') as stream:
        found, sr = sox_io_backend.load(stream, format=format_)

    assert sr == sample_rate
    self.assertEqual(expected, found)
def test_channels_first(self, channels_first):
    """`channels_first=False` returns the data with its two axes swapped."""
    found, _ = sox_io_backend.load(self.path, channels_first=channels_first)
    if channels_first:
        expected = self.original
    else:
        expected = self.original.transpose(1, 0)
    self.assertEqual(found, expected)
def _make_file(self, format_):
    """Create an audio file of `format_` and strip its extension.

    The file is generated with its extension, loaded once to record the
    reference data in ``self.original``, and then renamed to the same path
    without the extension, which is stored in ``self.path``.
    """
    sample_rate = 8000
    path_with_ext = self.get_temp_path(f'test.{format_}')
    sox_utils.gen_audio_file(str(path_with_ext), sample_rate, num_channels=2)
    self.original = sox_io_backend.load(path_with_ext)[0]

    extensionless_path, _ = os.path.splitext(path_with_ext)
    self.path = extensionless_path
    os.rename(path_with_ext, self.path)
def test_wav(self, dtype, sample_rate, num_channels):
    """Repeated save/load round trips must not degrade wav data.

    The output of each round trip is fed into the next one, and every
    intermediate result is compared against the original tensor.
    """
    original = get_wav_data(dtype, num_channels, normalize=False)
    current = original
    for round_index in range(10):
        wav_path = self.get_temp_path(f'{round_index}.wav')
        sox_io_backend.save(wav_path, current, sample_rate)
        current, sr = sox_io_backend.load(wav_path, normalize=False)
        assert sr == sample_rate
        self.assertEqual(original, current)
def test_bytesio_clogged(self, ext, compression):
    """Loading through a clogged file object matches loading from the path.

    This validates the case where the file object returns fewer bytes
    than requested.
    """
    sample_rate = 16000
    # Only mp3 gets an explicit format; other formats are loaded with None.
    format_ = ext if ext in ['mp3'] else None
    audio_path = self.get_temp_path(f'test.{ext}')

    sox_utils.gen_audio_file(
        audio_path, sample_rate, num_channels=2, compression=compression)
    expected, _ = sox_io_backend.load(audio_path)

    with open(audio_path, 'rb') as source:
        clogged = CloggedFileObj(io.BytesIO(source.read()))
        found, sr = sox_io_backend.load(clogged, format=format_)

    assert sr == sample_rate
    self.assertEqual(expected, found)
def test_opus(self, bitrate, num_channels, compression_level):
    """`sox_io_backend.load` decodes an opus asset like its wav conversion.

    The opus asset is converted to wav with sox; the wav read by `load_wav`
    is the reference for both the data and the sample rate.
    """
    stem = f'{bitrate}_{compression_level}_{num_channels}ch.opus'
    ops_path = get_asset_path('io', stem)
    wav_path = self.get_temp_path(f'{stem}.wav')

    sox_utils.convert_audio_file(ops_path, wav_path)
    expected, sample_rate = load_wav(wav_path)
    found, sr = sox_io_backend.load(ops_path)

    assert sample_rate == sr
    self.assertEqual(expected, found)
def test_tarfile(self, ext, compression):
    """Loading a tar archive member as a file-like object matches the path."""
    sample_rate = 16000
    # Only mp3 gets an explicit format; other formats are loaded with None.
    format_ = ext if ext in ['mp3'] else None
    member_name = f'test.{ext}'
    audio_path = self.get_temp_path(member_name)
    archive_path = self.get_temp_path('archive.tar.gz')

    sox_utils.gen_audio_file(
        audio_path, sample_rate, num_channels=2, compression=compression)
    expected, _ = sox_io_backend.load(audio_path)

    # Pack the generated file into the archive under its bare file name.
    with tarfile.TarFile(archive_path, 'w') as archive:
        archive.add(audio_path, arcname=member_name)

    # Read it back straight from the archive, without extracting to disk.
    with tarfile.TarFile(archive_path, 'r') as archive:
        member = archive.extractfile(member_name)
        found, sr = sox_io_backend.load(member, format=format_)

    assert sr == sample_rate
    self.assertEqual(expected, found)
def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
    """Check that `sox_io_backend.load` reads wav like the reference loader.

    Test data is written with `save_wav`; the tensor returned by `load_wav`
    for that file is the expected value.
    """
    wav_path = self.get_temp_path('reference.wav')
    source = get_wav_data(
        dtype,
        num_channels,
        normalize=normalize,
        num_frames=duration * sample_rate,
    )
    save_wav(wav_path, source, sample_rate)

    expected = load_wav(wav_path, normalize=normalize)[0]
    found, sr = sox_io_backend.load(wav_path, normalize=normalize)

    assert sr == sample_rate
    self.assertEqual(found, expected)
def test_mp3(self):
    """An mp3 file without extension loads when `format` is given.

    libsox does not check the header for mp3, see
    https://github.com/pytorch/audio/issues/1040

    The asset was generated with the following command:

        ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
    """
    asset_path = get_asset_path("mp3_without_ext")
    loaded = sox_io_backend.load(asset_path, format="mp3")
    sr = loaded[1]
    assert sr == 16000
def test_flac(self, sample_rate, num_channels, compression_level):
    """Repeated save/load round trips must not degrade flac data.

    The output of each round trip is fed into the next one, and every
    intermediate result is compared against the original tensor.
    """
    original = get_wav_data('float32', num_channels)
    current = original
    for round_index in range(10):
        flac_path = self.get_temp_path(f'{round_index}.flac')
        sox_io_backend.save(
            flac_path, current, sample_rate, compression=compression_level)
        current, sr = sox_io_backend.load(flac_path)
        assert sr == sample_rate
        self.assertEqual(original, current)
def __init__(self, path, streams, img_size):
    """Set up video and audio readers for the media file at `path`.

    Args:
        path: Path of the source media file (a string). A wav copy of its
            audio is written next to it as ``path + '.wav'``.
        streams: One of ``'audioaudio'``, ``'videovideo'``,
            ``'audiovideo'`` or ``'videoaudio'``; selects which streamer
            method is bound to ``self.streams``. Any other value leaves
            ``self.streams`` unset.
        img_size: Target size handed to ``torchvision.transforms.Resize``.
    """
    import subprocess

    # Having this helper index makes everything go incredibly fast
    self.curr_index = -1

    wav_path = path + '.wav'

    # To ensure playback and whatnot we reencode to just audio using the
    # native ffmpeg. The command is passed as an argument list rather than
    # a shell string so paths with spaces or shell metacharacters are
    # handled safely. `-n` makes ffmpeg refuse to overwrite an existing
    # wav; the exit status is ignored, as before, so an existing file is
    # simply reused.
    subprocess.run(
        ['ffmpeg', '-n', '-i', path,
         '-acodec', 'pcm_s16le', '-ar', '44100', wav_path],
        check=False,
    )

    # Video loading is very simple
    self.video_reader = torchvision.io.VideoReader(path, 'video')
    self.visual_info = self.video_reader.get_metadata()['video']

    # Since in general, video fps <<< audio fps, we want to sync the audio
    # to the video. So the audio initialization is more complicated.
    # Recall that samples/second * (second/frame) = samples/frame
    wav_info = sox_io.info(wav_path)  # query the metadata once
    self.audio_info = {
        'sample_rate': wav_info.sample_rate,
        'num_frames': wav_info.num_frames,
        'num_channels': wav_info.num_channels,
    }
    self.a_v_ratio = int(
        self.audio_info['sample_rate'] / self.visual_info['fps'][0])
    # Loads the audio samples that belong to video frame `index`.
    self.audio_reader = lambda index: sox_io.load(
        wav_path, index * self.a_v_ratio, self.a_v_ratio, normalize=True)

    self.resize = torchvision.transforms.Resize(img_size)

    # Wrapper to make the iteration much more simple
    # TODO: Put this in a function
    if streams == 'audioaudio':
        self.streams = self.audio_streamer
    elif streams == 'videovideo':
        self.streams = self.video_streamer
    elif streams == 'audiovideo':
        self.streams = self.audio_video_streamer
    elif streams == 'videoaudio':
        self.streams = self.video_audio_streamer
def test_frame(self, frame_offset, num_frames):
    """`frame_offset` and `num_frames` select the right region over HTTP.

    The expected tensor is the matching slice of the original data along
    the second axis; ``num_frames == -1`` means "until the end".
    """
    sample_rate = 8000
    file_name = 'test.wav'
    local_path = self.get_temp_path(file_name)
    original = get_wav_data('float32', num_channels=2)
    save_wav(local_path, original, sample_rate)

    if num_frames == -1:
        frame_end = None
    else:
        frame_end = frame_offset + num_frames
    expected = original[:, frame_offset:frame_end]

    with requests.get(self.get_url(file_name), stream=True) as response:
        found, sr = sox_io_backend.load(response.raw, frame_offset, num_frames)

    assert sr == sample_rate
    self.assertEqual(expected, found)
def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
    """Check that `sox_io_backend.load` decodes mp3 like the reference path.

    mp3 encoding introduces delay and boundary effects, so the reference
    wav file is created from the mp3 itself:

     x
     |
     |    1. Generate mp3 with Sox
     |
     v    2. Convert to wav with Sox
    mp3 ------------------------------> wav
     |                                   |
     | 3. Load with torchaudio           | 4. Load with scipy
     |                                   |
     v                                   v
    tensor ----------> x <----------- tensor
                   5. Compare

    Underlying assumptions are:
    i. Conversion of mp3 to wav with Sox preserves data.
    ii. Loading wav file with scipy is correct.

    By combining i & ii, steps 2 and 4 allow loading reference mp3 data
    without using torchaudio.
    """
    file_name = f'{sample_rate}_{num_channels}_{bit_rate}_{duration}.mp3'
    mp3_path = self.get_temp_path(file_name)
    wav_path = f'{mp3_path}.wav'

    # 1. Generate mp3 with sox
    sox_utils.gen_audio_file(
        mp3_path,
        sample_rate,
        num_channels,
        compression=bit_rate,
        duration=duration,
    )
    # 2. Convert to wav with sox
    sox_utils.convert_audio_file(mp3_path, wav_path)
    # 3. Load mp3 with torchaudio
    found, sr = sox_io_backend.load(mp3_path)
    # 4. Load the reference wav
    expected = load_wav(wav_path)[0]
    # 5. Compare (looser absolute tolerance than the lossless formats)
    assert sr == sample_rate
    self.assertEqual(found, expected, atol=3e-03, rtol=1.3e-06)
def assert_24bit_wav(self, sample_rate, num_channels, normalize, duration):
    """Check that `sox_io_backend.load` reads 24-bit signed PCM wav.

    Since torch does not support the ``int24`` dtype, the resulting tensor
    is implicitly cast to the ``int32`` dtype.

    The `assert_wav` helper cannot be used here because `get_wav_data` does
    not support the 'int24' dtype (again because torch has no ``int24``).
    Hence the following workaround:

     x
     |
     |    1. Generate 24-bit wav with Sox.
     |
     v    2. Convert 24-bit wav to 32-bit wav with Sox.
    wav(24-bit) ----------------------> wav(32-bit)
     |                                   |
     | 3. Load 24-bit wav with torchaudio| 4. Load 32-bit wav with scipy
     |                                   |
     v                                   v
    tensor ----------> x <----------- tensor
                   5. Compare

    Underlying assumptions are:
    i. Sox properly converts from 24-bit to 32-bit.
    ii. Loading 32-bit wav file with scipy is correct.
    """
    wav24_path = self.get_temp_path('1.original.wav')
    wav32_path = self.get_temp_path('2.reference.wav')

    # 1. Generate 24-bit signed wav with Sox
    sox_utils.gen_audio_file(
        wav24_path,
        sample_rate,
        num_channels,
        bit_depth=24,
        duration=duration,
    )
    # 2. Convert from 24-bit wav to 32-bit wav with sox
    sox_utils.convert_audio_file(wav24_path, wav32_path, bit_depth=32)
    # 3. Load 24-bit wav with torchaudio
    found, sr = sox_io_backend.load(wav24_path, normalize=normalize)
    # 4. Load the 32-bit reference wav
    expected = load_wav(wav32_path, normalize=normalize)[0]
    # 5. Compare
    assert sr == sample_rate
    self.assertEqual(found, expected, atol=3e-03, rtol=1.3e-06)
def write_tar_file(data_list, no_segments, tar_file, resample=16000, index=0, total=1):
    """Pack transcripts and audio from `data_list` into the tar file `tar_file`.

    For every item two members are added: ``<key>.txt`` holding the UTF-8
    encoded transcript and ``<key>.<suffix>`` holding the audio bytes.

    Args:
        data_list: Iterable of ``(key, txt, wav)`` tuples when `no_segments`
            is true, otherwise ``(key, txt, wav, start, end)`` tuples where
            `start`/`end` are multiplied by the sample rate to get sample
            indices.
        no_segments: If true, each audio file is copied into the archive
            byte-for-byte. Otherwise the ``[start, end)`` region of the
            first channel is cut out, resampled to `resample` if needed,
            and stored as 16-bit wav.
        tar_file: Path of the tar archive to create.
        resample: Sample rate used when saving segments.
        index: Position of this archive, used only in the log message.
        total: Total number of archives, used only in the log message.
    """
    logging.info('Processing {} {}/{}'.format(tar_file, index, total))
    read_time = 0.0
    save_time = 0.0
    write_time = 0.0
    with tarfile.open(tar_file, "w") as tar:
        # Path of the most recently decoded file; consecutive segments of
        # the same file reuse the already-loaded waveform.
        prev_wav = None
        for item in data_list:
            if no_segments:
                key, txt, wav = item
            else:
                key, txt, wav, start, end = item

            suffix = wav.split('.')[-1]
            assert suffix in AUDIO_FORMAT_SETS

            if no_segments:
                # Whole file: copy the raw bytes untouched.
                tic = time.time()
                with open(wav, 'rb') as fin:
                    data = fin.read()
                read_time += (time.time() - tic)
            else:
                if wav != prev_wav:
                    tic = time.time()
                    waveforms, sample_rate = sox.load(wav, normalize=False)
                    read_time += (time.time() - tic)
                    prev_wav = wav
                start = int(start * sample_rate)
                end = int(end * sample_rate)
                # Keep only the first channel of the requested region.
                audio = waveforms[:1, start:end]

                # resample
                if sample_rate != resample:
                    resampler = torchaudio.transforms.Resample(
                        sample_rate, resample)
                    if audio.is_floating_point():
                        audio = resampler(audio)
                    else:
                        # normalize the audio before resample
                        # because resample can't process int audio
                        audio = resampler(audio / (1 << 15))
                        audio = (audio * (1 << 15)).short()

                tic = time.time()
                buffer = io.BytesIO()
                sox.save(buffer, audio, resample, format="wav",
                         bits_per_sample=16)
                # Save to wav for segments file
                suffix = "wav"
                data = buffer.getvalue()
                save_time += (time.time() - tic)

            assert isinstance(txt, str)
            tic = time.time()

            # Transcript member.
            txt = txt.encode('utf8')
            txt_info = tarfile.TarInfo(key + '.txt')
            txt_info.size = len(txt)
            tar.addfile(txt_info, io.BytesIO(txt))

            # Audio member.
            wav_info = tarfile.TarInfo(key + '.' + suffix)
            wav_info.size = len(data)
            tar.addfile(wav_info, io.BytesIO(data))

            write_time += (time.time() - tic)
        logging.info('read {} save {} write {}'.format(read_time, save_time,
                                                       write_time))
def assert_format(
    self,
    format: str,
    sample_rate: float,
    num_channels: int,
    compression: float = None,
    bit_depth: int = None,
    duration: float = 1,
    normalize: bool = True,
    encoding: str = None,
    atol: float = 4e-05,
    rtol: float = 1.3e-06,
):
    """Check that `sox_io_backend.load` decodes the given format correctly.

    File encodings introduce delay and boundary effects, so the reference
    wav file is created from the encoded file itself:

     x
     |
     |    1. Generate given format with Sox
     |
     v    2. Convert to wav with Sox
    given format ----------------------> wav
     |                                   |
     | 3. Load with torchaudio           | 4. Load with scipy
     |                                   |
     v                                   v
    tensor ----------> x <----------- tensor
                   5. Compare

    Underlying assumptions are:
    i. Conversion of given format to wav with Sox preserves data.
    ii. Loading wav file with scipy is correct.

    By combining i & ii, steps 2 and 4 allow loading reference data of the
    given format without using torchaudio.
    """
    encoded_path = self.get_temp_path(f'1.original.{format}')
    wav_path = self.get_temp_path('2.reference.wav')

    # 1. Generate the given format with sox
    sox_utils.gen_audio_file(
        encoded_path,
        sample_rate,
        num_channels,
        encoding=encoding,
        compression=compression,
        bit_depth=bit_depth,
        duration=duration,
    )

    # 2. Convert to wav with sox; a 24-bit source is widened to a 32-bit
    #    reference wav, anything else keeps the converter's default.
    if bit_depth == 24:
        wav_bit_depth = 32
    else:
        wav_bit_depth = None
    sox_utils.convert_audio_file(encoded_path, wav_path, bit_depth=wav_bit_depth)

    # 3. Load the given format with torchaudio
    found, sr = sox_io_backend.load(encoded_path, normalize=normalize)
    # 4. Load the reference wav
    expected = load_wav(wav_path, normalize=normalize)[0]
    # 5. Compare
    assert sr == sample_rate
    self.assertEqual(found, expected, atol=atol, rtol=rtol)
def test_format(self, format_):
    """A file of `format_` stripped of its extension loads to the same data.

    NOTE(review): `format_` is used only to build the file; no `format`
    argument is passed to `load` here — confirm that this is intended.
    """
    self._make_file(format_)
    loaded = sox_io_backend.load(self.path)
    found = loaded[0]
    self.assertEqual(found, self.original)
def py_load_func(filepath: str, normalize: bool, channels_first: bool):
    """Forward to `sox_io_backend.load` with the given keyword options."""
    result = sox_io_backend.load(
        filepath,
        normalize=normalize,
        channels_first=channels_first,
    )
    return result
def test_frame(self, frame_offset, num_frames):
    """`frame_offset` and `num_frames` select the right region of the data.

    The expected tensor is the matching slice of ``self.original`` along
    the second axis; ``num_frames == -1`` means "until the end".
    """
    found, _ = sox_io_backend.load(self.path, frame_offset, num_frames)
    if num_frames == -1:
        frame_end = None
    else:
        frame_end = frame_offset + num_frames
    expected = self.original[:, frame_offset:frame_end]
    self.assertEqual(found, expected)