def assert_amb(self, dtype, sample_rate, num_channels, duration):
    """`sox_io_backend.save` can save amb format.

    Follows the same strategy as the mp3 check: one source wav is encoded to
    amb twice (once through torchaudio, once through the sox converter), both
    amb files are converted back to wav, and the loaded results are compared.
    """
    reference_wav = self.get_temp_path('1.reference.wav')
    torchaudio_amb = self.get_temp_path('2.1.torchaudio.amb')
    torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_amb = self.get_temp_path('3.1.sox.amb')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # Step 1: generate the reference wav (un-normalized samples of `dtype`).
    num_frames = duration * sample_rate
    source = get_wav_data(dtype, num_channels, normalize=False, num_frames=num_frames)
    save_wav(reference_wav, source, sample_rate)

    # Step 2.1: reference wav -> amb, written by torchaudio.
    waveform = load_wav(reference_wav, normalize=False)[0]
    sox_io_backend.save(torchaudio_amb, waveform, sample_rate)
    # Step 2.2: amb -> wav, converted by sox.
    sox_utils.convert_audio_file(torchaudio_amb, torchaudio_wav)
    # Step 2.3: load the torchaudio-side result.
    found = load_wav(torchaudio_wav)[0]

    # Step 3.1: reference wav -> amb, converted by sox.
    sox_utils.convert_audio_file(reference_wav, sox_amb)
    # Step 3.2: amb -> wav, converted by sox.
    sox_utils.convert_audio_file(sox_amb, sox_wav)
    # Step 3.3: load the sox-side result.
    expected = load_wav(sox_wav)[0]

    self.assertEqual(found, expected)
def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
    """`sox_io_backend.save` can save mp3 format.

    mp3 encoding introduces delay and boundary effects, so the mp3 files are
    not compared directly. Instead each mp3 is converted back to wav and the
    comparison happens on the decoded data:

                      |
                      | 1. Generate original wav file with SciPy
                      |
                      v
      -------------- wav ----------------
     |                                   |
     | 2.1. load with scipy              | 3.1. Convert to mp3 with Sox
     | then save with torchaudio         |
     v                                   v
    mp3                                 mp3
     |                                   |
     | 2.2. Convert to wav with Sox      | 3.2. Convert to wav with Sox
     |                                   |
     v                                   v
    wav                                 wav
     |                                   |
     | 2.3. load with scipy              | 3.3. load with scipy
     |                                   |
     v                                   v
    tensor -------> compare <--------- tensor
    """
    reference_wav = self.get_temp_path('1.reference.wav')
    torchaudio_mp3 = self.get_temp_path('2.1.torchaudio.mp3')
    torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_mp3 = self.get_temp_path('3.1.sox.mp3')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # Step 1: generate the reference wav (normalized float32 samples).
    num_frames = duration * sample_rate
    source = get_wav_data('float32', num_channels, normalize=True, num_frames=num_frames)
    save_wav(reference_wav, source, sample_rate)

    # Step 2.1: reference wav -> mp3, written by torchaudio at `bit_rate`.
    waveform = load_wav(reference_wav)[0]
    sox_io_backend.save(torchaudio_mp3, waveform, sample_rate, compression=bit_rate)
    # Step 2.2: mp3 -> wav, converted by sox.
    sox_utils.convert_audio_file(torchaudio_mp3, torchaudio_wav)
    # Step 2.3: load the torchaudio-side result.
    found = load_wav(torchaudio_wav)[0]

    # Step 3.1: reference wav -> mp3, converted by sox at the same `bit_rate`.
    sox_utils.convert_audio_file(reference_wav, sox_mp3, compression=bit_rate)
    # Step 3.2: mp3 -> wav, converted by sox.
    sox_utils.convert_audio_file(sox_mp3, sox_wav)
    # Step 3.3: load the sox-side result.
    expected = load_wav(sox_wav)[0]

    self.assertEqual(found, expected)
def assert_sphere(self, sample_rate, num_channels, duration):
    """`sox_io_backend.save` can save sph format.

    Follows the same strategy as the mp3 check: the source wav is encoded to
    sph by torchaudio and by sox, both sph files are converted back to wav,
    and the loaded results are compared.
    """
    reference_wav = self.get_temp_path('1.reference.wav')
    torchaudio_sph = self.get_temp_path('2.1.torchaudio.sph')
    torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_sph = self.get_temp_path('3.1.sox.sph')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # Step 1: generate the reference wav (normalized float32 samples).
    num_frames = duration * sample_rate
    source = get_wav_data('float32', num_channels, normalize=True, num_frames=num_frames)
    save_wav(reference_wav, source, sample_rate)

    # Step 2.1: reference wav -> sph, written by torchaudio.
    waveform = load_wav(reference_wav)[0]
    sox_io_backend.save(torchaudio_sph, waveform, sample_rate)
    # Step 2.2: sph -> wav, converted by sox.
    # Converted to 32 bit because the sph file has 24 bit depth, which scipy
    # cannot handle (as noted by the original authors).
    sox_utils.convert_audio_file(torchaudio_sph, torchaudio_wav, bit_depth=32)
    # Step 2.3: load the torchaudio-side result.
    found = load_wav(torchaudio_wav)[0]

    # Step 3.1: reference wav -> sph, converted by sox.
    sox_utils.convert_audio_file(reference_wav, sox_sph)
    # Step 3.2: sph -> wav, converted by sox (32 bit for the same reason as 2.2).
    sox_utils.convert_audio_file(sox_sph, sox_wav, bit_depth=32)
    # Step 3.3: load the sox-side result.
    expected = load_wav(sox_wav)[0]

    self.assertEqual(found, expected)
def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype='float32'):
    """Run save -> info -> load on an in-memory buffer and check basic metadata.

    Only the sample rate and the channel count are verified; the audio
    content itself is not compared.
    """
    seconds = 1
    frame_count = sample_rate * seconds
    waveform = get_wav_data(dtype, num_channels, normalize=False, num_frames=frame_count)

    buffer = io.BytesIO()

    # 1. save into the buffer; `format` is passed explicitly as `ext`.
    sox_io_backend.save(buffer, waveform, sample_rate, compression=compression, format=ext)

    # 2. rewind, then query metadata.
    buffer.seek(0)
    metadata = sox_io_backend.info(buffer, format=ext)
    assert metadata.sample_rate == sample_rate
    assert metadata.num_channels == num_channels

    # 3. rewind again, then load the audio back.
    buffer.seek(0)
    loaded, sr = sox_io_backend.load(buffer, normalize=False, format=ext)
    assert sr == sample_rate
    assert loaded.shape[0] == num_channels
def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype='float32'):
    """Run save -> info -> load on a temporary file and check basic metadata.

    The file is named `test.<ext>`. Only the sample rate and the channel
    count are verified; the audio content itself is not compared.
    """
    seconds = 1
    frame_count = sample_rate * seconds
    target = self.get_temp_path(f'test.{ext}')
    waveform = get_wav_data(dtype, num_channels, normalize=False, num_frames=frame_count)

    # 1. save to the temporary path.
    sox_io_backend.save(target, waveform, sample_rate, compression=compression)

    # 2. query metadata from the saved file.
    metadata = sox_io_backend.info(target)
    assert metadata.sample_rate == sample_rate
    assert metadata.num_channels == num_channels

    # 3. load the audio back.
    loaded, sr = sox_io_backend.load(target, normalize=False)
    assert sr == sample_rate
    assert loaded.shape[0] == num_channels
def test_bytesio(self, ext, compression):
    """Saving audio to BytesIO object returns the same result as via file path."""
    sample_rate = 16000
    dtype = 'float32'
    num_channels = 2
    num_frames = 16000
    channels_first = True

    waveform = get_wav_data(dtype, num_channels, num_frames=num_frames)

    ref_path = self.get_temp_path(f'reference.{ext}')
    res_path = self.get_temp_path(f'test.{ext}')

    # Reference: save straight to a file path.
    sox_io_backend.save(
        ref_path,
        waveform,
        channels_first=channels_first,
        sample_rate=sample_rate,
        compression=compression,
    )

    # Under test: save into an in-memory buffer, then dump the buffer to disk
    # so that both results can be loaded the same way.
    buffer = io.BytesIO()
    sox_io_backend.save(
        buffer,
        waveform,
        channels_first=channels_first,
        sample_rate=sample_rate,
        compression=compression,
        format=ext,
    )
    buffer.seek(0)
    with open(res_path, 'wb') as out_file:
        out_file.write(buffer.read())

    expected_data, _ = sox_io_backend.load(ref_path)
    loaded_data, sr = sox_io_backend.load(res_path)

    assert sample_rate == sr
    self.assertEqual(expected_data, loaded_data)
def test_noncontiguous(self, dtype):
    """Noncontiguous tensors are saved correctly"""
    wav_path = self.get_temp_path('data.wav')

    # Striding over both axes yields a non-contiguous view of the data.
    strided = get_wav_data(dtype, 4)[::2, ::2]
    assert not strided.is_contiguous()

    sox_io_backend.save(wav_path, strided, 8000)

    reloaded = load_wav(wav_path)[0]
    self.assertEqual(reloaded, strided)
def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
    """`sox_io_backend.save` can save wav format.

    Saves generated data with torchaudio, reloads it with `load_wav`, and
    checks both the sample rate and the sample values.
    """
    wav_path = self.get_temp_path('data.wav')
    reference = get_wav_data(dtype, num_channels, num_frames=num_frames)

    sox_io_backend.save(wav_path, reference, sample_rate)

    reloaded, sr = load_wav(wav_path)
    assert sample_rate == sr
    self.assertEqual(reloaded, reference)
def test_dtype_conversion(self, dtype, expected):
    """`save` performs dtype conversion on float32 src tensors only."""
    wav_path = self.get_temp_path("data.wav")

    # Float32 column of five samples covering the [-1.0, 1.0] range.
    samples = torch.tensor([-1.0, -0.5, 0, 0.5, 1.0])
    waveform = samples.to(torch.float32).view(-1, 1)

    sox_io_backend.save(wav_path, waveform, 8000, dtype=dtype)

    # Load without normalization so the stored values are compared as-is.
    reloaded = load_wav(wav_path, normalize=False)[0]
    self.assertEqual(reloaded, expected.view(-1, 1))
def test_channels_first(self, channels_first):
    """channels_first swaps axes"""
    wav_path = self.get_temp_path('data.wav')
    waveform = get_wav_data('int32', 2, channels_first=channels_first)

    sox_io_backend.save(wav_path, waveform, 8000, channels_first=channels_first)

    reloaded = load_wav(wav_path)[0]

    # The reloaded tensor is compared against the data with its two axes
    # swapped whenever the data was generated with channels_first=False.
    if channels_first:
        reference = waveform
    else:
        reference = waveform.transpose(1, 0)
    self.assertEqual(reloaded, reference)
def test_tensor_preserve(self, dtype):
    """save function should not alter Tensor"""
    wav_path = self.get_temp_path('data.wav')

    # Keep the pristine view and hand an independent clone to `save`, so any
    # in-place modification made by `save` shows up in the comparison below.
    pristine = get_wav_data(dtype, 4)[::2, ::2]
    passed_in = pristine.clone()

    sox_io_backend.save(wav_path, passed_in, 8000)

    self.assertEqual(passed_in, pristine)
def test_save_noncontiguous(self, dtype):
    """Noncontiguous tensors are saved correctly"""
    wav_path = self.get_temp_path('data.wav')
    encoding, bits_per_sample = get_enc_params(dtype)

    # Striding over both axes yields a non-contiguous view of the data.
    strided = get_wav_data(dtype, 4, normalize=False)[::2, ::2]
    assert not strided.is_contiguous()

    sox_io_backend.save(
        wav_path,
        strided,
        8000,
        encoding=encoding,
        bits_per_sample=bits_per_sample,
    )

    reloaded = load_wav(wav_path, normalize=False)[0]
    self.assertEqual(reloaded, strided)
def test_save_fail(self):
    """Saving into a non-existing directory must fail and name the file path.

    The expected `RuntimeError` message is matched as a whole
    (anchored with ``^`` and ``$``) and must contain the literal path.
    """
    # Imported locally so this block does not rely on a module-level import.
    import re

    path = os.path.join("non_existing_directory", "foo.wav")

    # The path is interpolated into a regular expression, so it has to be
    # escaped: otherwise "." matches any character and a backslash path
    # separator would be interpreted as a regex escape sequence.
    pattern = "^Error saving audio file: failed to open file {0}$".format(
        re.escape(path))

    with self.assertRaisesRegex(RuntimeError, pattern):
        sox_io_backend.save(path, torch.zeros(1, 1), 8000)
def test_wav(self, dtype, sample_rate, num_channels):
    """save/load round trip should not degrade data for wav formats"""
    original = get_wav_data(dtype, num_channels, normalize=False)

    # Feed the output of each round trip into the next one; every generation
    # must still equal the original data.
    current = original
    for generation in range(10):
        wav_path = self.get_temp_path(f'{generation}.wav')
        sox_io_backend.save(wav_path, current, sample_rate)
        current, sr = sox_io_backend.load(wav_path, normalize=False)
        assert sr == sample_rate
        self.assertEqual(original, current)
def test_flac(self, sample_rate, num_channels, compression_level):
    """save/load round trip should not degrade data for flac formats"""
    original = get_wav_data('float32', num_channels)

    # Feed the output of each round trip into the next one; every generation
    # must still equal the original data.
    current = original
    for generation in range(10):
        flac_path = self.get_temp_path(f'{generation}.flac')
        sox_io_backend.save(flac_path, current, sample_rate, compression=compression_level)
        current, sr = sox_io_backend.load(flac_path)
        assert sr == sample_rate
        self.assertEqual(original, current)
def _assert_vorbis(self, sample_rate, num_channels, quality_level, duration):
    """`sox_io_backend.save` can save vorbis format.

    Follows the same strategy as the mp3 check, except that the final
    comparison tolerates a small fraction of samples falling outside the
    absolute tolerance (see the comment near the end).
    """
    reference_wav = self.get_temp_path('1.reference.wav')
    torchaudio_vbs = self.get_temp_path('2.1.torchaudio.vorbis')
    torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_vbs = self.get_temp_path('3.1.sox.vorbis')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # Step 1: generate the reference wav (un-normalized int16 samples).
    num_frames = duration * sample_rate
    source = get_wav_data('int16', num_channels, normalize=False, num_frames=num_frames)
    save_wav(reference_wav, source, sample_rate)

    # Step 2.1: reference wav -> vorbis, written by torchaudio.
    waveform = load_wav(reference_wav)[0]
    sox_io_backend.save(
        torchaudio_vbs,
        waveform,
        sample_rate,
        compression=quality_level,
        dtype=None,
    )
    # Step 2.2: vorbis -> wav, converted by sox.
    sox_utils.convert_audio_file(torchaudio_vbs, torchaudio_wav)
    # Step 2.3: load the torchaudio-side result.
    found = load_wav(torchaudio_wav)[0]

    # Step 3.1: reference wav -> vorbis, converted by sox.
    sox_utils.convert_audio_file(reference_wav, sox_vbs, compression=quality_level)
    # Step 3.2: vorbis -> wav, converted by sox.
    sox_utils.convert_audio_file(sox_vbs, sox_wav)
    # Step 3.3: load the sox-side result.
    expected = load_wav(sox_wav)[0]

    # sox's vorbis encoding has some random boundary effect, which causes a
    # small number of samples to show a higher discrepancy than the others.
    # So a small portion of the data is allowed to be outside of the absolute
    # tolerance. Make sure to pass a somewhat long duration.
    atol = 1.0e-4
    max_failure_allowed = 0.01  # fraction of samples allowed outside of atol
    outlier_count = ((found - expected).abs() > atol).sum().item()
    failure_ratio = outlier_count / found.numel()
    if failure_ratio <= max_failure_allowed:
        return

    # Too many outliers: run the strict comparison, which fails with a more
    # informative error message.
    self.assertEqual(found, expected, atol=atol, rtol=1.3e-6)
def write_tar_file(data_list, no_segments, tar_file, resample=16000, index=0, total=1):
    """Pack transcripts and audio into one tar archive.

    For every item two members are written: ``<key>.txt`` holding the UTF-8
    encoded transcript, and ``<key>.<suffix>`` holding the audio bytes.

    Args:
        data_list: iterable of ``(key, txt, wav)`` tuples when ``no_segments``
            is true, otherwise ``(key, txt, wav, start, end)`` tuples.
            ``start`` / ``end`` are multiplied by the sample rate to get
            frame offsets (presumably seconds; confirm against the caller).
        no_segments: when true the audio file is copied byte-for-byte;
            otherwise the ``[start, end)`` slice is cut out, resampled to
            ``resample`` if needed, and stored as 16-bit wav.
        tar_file: path of the archive to create (opened in ``"w"`` mode).
        resample: sample rate passed to ``sox.save`` for segment audio.
        index: only used in the progress log message.
        total: only used in the progress log message.
    """
    logging.info('Processing {} {}/{}'.format(tar_file, index, total))
    read_time = 0.0
    save_time = 0.0
    write_time = 0.0

    with tarfile.open(tar_file, "w") as tar:

        def _add_member(member_name, payload):
            # Add `payload` (bytes) to the archive under `member_name`.
            info = tarfile.TarInfo(member_name)
            info.size = len(payload)
            tar.addfile(info, io.BytesIO(payload))

        prev_wav = None
        for item in data_list:
            if no_segments:
                key, txt, wav = item
            else:
                key, txt, wav, start, end = item

            suffix = wav.rsplit('.', 1)[-1]
            assert suffix in AUDIO_FORMAT_SETS

            if no_segments:
                # Whole-file mode: store the raw file content unchanged.
                tick = time.time()
                with open(wav, 'rb') as fin:
                    data = fin.read()
                read_time += (time.time() - tick)
            else:
                # Consecutive segments of the same file reuse the waveform
                # loaded on a previous iteration.
                if wav != prev_wav:
                    tick = time.time()
                    waveforms, sample_rate = sox.load(wav, normalize=False)
                    read_time += (time.time() - tick)
                    prev_wav = wav

                first_frame = int(start * sample_rate)
                last_frame = int(end * sample_rate)
                # Keep only the first row (presumably the first channel).
                audio = waveforms[:1, first_frame:last_frame]

                if sample_rate != resample:
                    resampler = torchaudio.transforms.Resample(sample_rate, resample)
                    if audio.is_floating_point():
                        audio = resampler(audio)
                    else:
                        # Resample can't process int audio: scale by 2**15
                        # before resampling, then scale back and cast to
                        # 16-bit integers.
                        audio = resampler(audio / (1 << 15))
                        audio = (audio * (1 << 15)).short()

                tick = time.time()
                buffer = io.BytesIO()
                sox.save(buffer, audio, resample, format="wav", bits_per_sample=16)
                # Segment audio is always stored as wav.
                suffix = "wav"
                buffer.seek(0)
                data = buffer.read()
                save_time += (time.time() - tick)

            assert isinstance(txt, str)

            tick = time.time()
            _add_member(key + '.txt', txt.encode('utf8'))
            _add_member(key + '.' + suffix, data)
            write_time += (time.time() - tick)

        logging.info('read {} save {} write {}'.format(read_time, save_time, write_time))
def assert_save_consistency(
        self,
        format: str,
        *,
        compression: float = None,
        encoding: str = None,
        bits_per_sample: int = None,
        sample_rate: float = 8000,
        num_channels: int = 2,
        num_frames: float = 3 * 8000,
        src_dtype: str = 'int32',
        test_mode: str = "path",
):
    """`save` function produces file that is comparable with `sox` command

    To compare the file produced by the `save` function against the file
    produced by the equivalent `sox` command, both files need to be loaded.
    But there are many formats that cannot be opened with common Python
    modules (like SciPy). So the `sox` command is used to prepare the
    original data and to convert the saved files into a format that SciPy can
    read (PCM wav). The following diagram illustrates this process. The
    difference is 2.1. and 3.1.

    This assumes that
    - loading data with SciPy preserves the data well.
    - converting the resulting files into WAV format with `sox` preserves the
      data well.

                      x
                      | 1. Generate source wav file with SciPy
                      |
                      v
      -------------- wav ----------------
     |                                   |
     | 2.1. load with scipy              | 3.1. Convert to the target
     | then save it into the target      | format depth with sox
     | format with torchaudio            |
     v                                   v
    target format                       target format
     |                                   |
     | 2.2. Convert to wav with sox      | 3.2. Convert to wav with sox
     |                                   |
     v                                   v
    wav                                 wav
     |                                   |
     | 2.3. load with scipy              | 3.3. load with scipy
     |                                   |
     v                                   v
    tensor -------> compare <--------- tensor

    `test_mode` selects how torchaudio writes the file in step 2.1:
    "path" (file path), "fileobj" (opened file) or "bytesio" (in-memory
    buffer that is then dumped to disk). Any other value raises ValueError.
    """
    # Both branches are converted to this common wav flavour for comparison.
    cmp_encoding = 'floating-point'
    cmp_bit_depth = 32

    src_path = self.get_temp_path('1.source.wav')
    tgt_path = self.get_temp_path(f'2.1.torchaudio.{format}')
    tst_path = self.get_temp_path('2.2.result.wav')
    sox_path = self.get_temp_path(f'3.1.sox.{format}')
    ref_path = self.get_temp_path('3.2.ref.wav')

    # 1. Generate original wav
    generated = get_wav_data(src_dtype, num_channels, normalize=False, num_frames=num_frames)
    save_wav(src_path, generated, sample_rate)

    # 2.1. Convert the original wav to target format with torchaudio
    waveform = load_wav(src_path, normalize=False)[0]
    save_options = dict(
        compression=compression,
        encoding=encoding,
        bits_per_sample=bits_per_sample,
    )
    if test_mode == "path":
        sox_io_backend.save(tgt_path, waveform, sample_rate, **save_options)
    elif test_mode == "fileobj":
        with open(tgt_path, 'bw') as file_:
            sox_io_backend.save(file_, waveform, sample_rate, format=format, **save_options)
    elif test_mode == "bytesio":
        buffer = io.BytesIO()
        sox_io_backend.save(buffer, waveform, sample_rate, format=format, **save_options)
        buffer.seek(0)
        with open(tgt_path, 'bw') as out_file:
            out_file.write(buffer.read())
    else:
        raise ValueError(f"Unexpected test mode: {test_mode}")

    # 2.2. Convert the target format to wav with sox
    sox_utils.convert_audio_file(
        tgt_path, tst_path, encoding=cmp_encoding, bit_depth=cmp_bit_depth)
    # 2.3. Load with SciPy
    found = load_wav(tst_path, normalize=False)[0]

    # 3.1. Convert the original wav to target format with sox
    sox_encoding = _get_sox_encoding(encoding)
    sox_utils.convert_audio_file(
        src_path,
        sox_path,
        compression=compression,
        encoding=sox_encoding,
        bit_depth=bits_per_sample,
    )
    # 3.2. Convert the target format to wav with sox
    sox_utils.convert_audio_file(
        sox_path, ref_path, encoding=cmp_encoding, bit_depth=cmp_bit_depth)
    # 3.3. Load with SciPy
    expected = load_wav(ref_path, normalize=False)[0]

    self.assertEqual(found, expected)