def __init__(self, method='post', mono='mean', spectra_type=None, device=None, para=None, spec_aug=False):
    self.method = method
    self.mono = mono
    self.spectra_type = spectra_type
    self.device = device
    self.fs = para['fs']
    self.time = para['time']
    self.n_fft = para['n_fft']
    self.n_mels = para['n_mels']
    self.win_length = para['win_length']
    self.hop_length = para['hop_length']
    if self.spectra_type == 'mel_spectrum':
        self.spectrum = self.trans_melspectrogram()
    else:
        self.spectrum = self.trans_spectrogram()
    self.am_to_db = self.trans_am_to_db()
    self.au_to_img = self.trans_autoimg()
    # SpecAugment
    self.spec_aug = spec_aug
    torchaudio.set_audio_backend(backend="sox_io")
    self.spectrum = self.spectrum.to(self.device)
    self.am_to_db = self.am_to_db.to(self.device)
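# A minimal usage sketch for the constructor above; the class name
# `AudioFrontend` and every parameter value here are illustrative
# assumptions, not taken from the original code.
para = {
    'fs': 16000,       # sampling rate in Hz
    'time': 4,         # clip length in seconds
    'n_fft': 1024,
    'n_mels': 128,
    'win_length': 1024,
    'hop_length': 512,
}
frontend = AudioFrontend(spectra_type='mel_spectrum', device='cpu', para=para)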
def test_3_load_and_save_is_identity(self):
    for backend in BACKENDS:
        if backend == 'sox_io':
            continue
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_3_load_and_save_is_identity()
@contextlib.contextmanager  # requires `import contextlib`; without this decorator the function is just a generator
def AudioBackendScope(new_backend):
    # Temporarily switch the torchaudio backend, restoring the previous
    # one on exit even if the body raises.
    previous_backend = torchaudio.get_audio_backend()
    try:
        torchaudio.set_audio_backend(new_backend)
        yield
    finally:
        torchaudio.set_audio_backend(previous_backend)
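# Usage sketch for AudioBackendScope (an assumption, not from the original
# file): switch backends for a single load and restore afterwards.
with AudioBackendScope("soundfile"):
    waveform, sample_rate = torchaudio.load("example.wav")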
def test_2_load_nonormalization(self):
    for backend in BACKENDS_MP3:
        if backend == 'sox_io':
            continue
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_2_load_nonormalization(self.test_filepath, 278756)
def info(request, torch_backend):
    torchaudio.set_audio_backend(torch_backend)
    if request.param:
        return data.load_info(audio_path)
    else:
        return None
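# The torch_backend fixture consumed by several tests here is not shown in
# the original; a plausible pytest definition (an assumption) would be:
import pytest
import torchaudio

@pytest.fixture(params=torchaudio.list_audio_backends())
def torch_backend(request):
    return request.param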
def test_1_save_sine(self):
    for backend in BACKENDS:
        if backend == 'sox_io':
            continue
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_1_save_sine()
def test_4_load_partial(self):
    for backend in BACKENDS_MP3:
        if backend == 'sox_io':
            continue
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_4_load_partial()
def _decode_example_with_torchaudio(self, value):
    try:
        import torchaudio
        import torchaudio.transforms as T
    except ImportError as err:
        raise ImportError("To support decoding 'mp3' audio files, please install 'torchaudio'.") from err
    try:
        torchaudio.set_audio_backend("sox_io")
    except RuntimeError as err:
        raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err

    array, sampling_rate = torchaudio.load(value)
    if self.sampling_rate and self.sampling_rate != sampling_rate:
        if not hasattr(self, "_resampler"):
            self._resampler = T.Resample(sampling_rate, self.sampling_rate)
        array = self._resampler(array)
        sampling_rate = self.sampling_rate
    array = array.numpy()
    if self.mono:
        array = array.mean(axis=0)
    return array, sampling_rate
def supports_mp3(backend):
    torchaudio.set_audio_backend(backend)
    try:
        torchaudio.load(test_filepath)
        return True
    except (RuntimeError, ImportError):
        return False
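# BACKENDS_MP3, referenced by the mp3-specific tests above, is plausibly
# derived from BACKENDS with this predicate (an assumption, not from the
# original file):
BACKENDS = torchaudio.list_audio_backends()
BACKENDS_MP3 = [b for b in BACKENDS if supports_mp3(b)]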
def __init__(self, manifest_path: list, labels, max_duration=16.7, mask=False, win_len=0.02, sr=16000):
    torchaudio.set_audio_backend("sox_io")
    self.datasets = []
    self.labels = labels
    self.mask = mask
    for item in manifest_path:
        total_count = 0
        total_duration = 0.
        with open(item, encoding='utf-8') as f:
            for line in f.readlines():
                data = json.loads(line)  # the `encoding` kwarg was removed from json.loads in Python 3.9
                if data['duration'] > max_duration:
                    total_count += 1
                    total_duration += data['duration']
                    continue
                self.datasets.append(data)
        total_duration = total_duration / 60
        logging.info("Filtered audio clips: {:d}".format(total_count))
        logging.info("Filtered audio duration: {:.2f} minutes".format(total_duration))
    self.index2char = dict([(i, labels[i]) for i in range(len(labels))])
    self.char2index = dict([(labels[i], i) for i in range(len(labels))])
    self.audio_parser = AudioParser(win_len=win_len, sr=sr)
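# A hypothetical manifest line for the loader above; only the 'duration'
# key is required by the filtering logic, the other keys are assumptions.
import json

line = '{"audio_filepath": "wavs/0001.wav", "duration": 3.42, "text": "hello"}'
record = json.loads(line)
assert record['duration'] <= 16.7  # the default max_duration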
def conversion(wav_file):
    torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False
    torchaudio.set_audio_backend("soundfile")
    data, samplerate = librosa.load(wav_file, sr=44100, mono=True)  # `sr` is keyword-only in recent librosa
    sound = torch.from_numpy(data)
    # Fix all wav files to 176400 samples; the zero padding accounts
    # for audio clips that are too short.
    padded_data = torch.zeros(176400)
    if sound.numel() < 176400:
        padded_data[:sound.numel()] = sound[:]
    else:
        padded_data[:] = sound[:176400]
    # Naive decimation to 32000 samples by copying every fifth sample.
    final_data = torch.zeros(32000)
    every_n = 176400 // 32000
    count = 0
    for i in range(32000):
        final_data[i] = padded_data[count]
        count += every_n
    return final_data
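# The per-sample copy loop above can be replaced by an equivalent strided
# slice (a sketch, not from the original); every_n == 176400 // 32000 == 5,
# so both forms pick indices 0, 5, 10, ..., 159995:
final_data = padded_data[::every_n][:32000].clone()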
def test_5_get_info(self):
    for backend in BACKENDS:
        if backend == 'sox_io':
            continue
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_5_get_info()
def test_loadwav(dur, info, torch_backend):
    torchaudio.set_audio_backend(torch_backend)
    audio, _ = data.load_audio(audio_path, dur=dur, info=info)
    rate = 8000.0
    if dur:
        assert audio.shape[-1] == int(dur * rate)
    else:
        assert audio.shape[-1] == rate * 3
def change_samplerate():
    y, sr = torchaudio.load('audio.wav')
    y = y.mean(dim=0)  # if there are multiple channels, average them to a single channel
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        y_resampled = resampler(y)
    else:
        y_resampled = y  # without this branch, y_resampled is undefined when sr is already 16000
    torchaudio.set_audio_backend(backend='sox')
    # save() expects a 2-D (channels, time) tensor, so restore the channel dim
    torchaudio.save(src=y_resampled.unsqueeze(0), sample_rate=16000, filepath='sampled_audio.wav')
def test_switch(self):
    torchaudio.set_audio_backend(self.backend)
    if self.backend is None:
        assert torchaudio.get_audio_backend() is None
    else:
        assert torchaudio.get_audio_backend() == self.backend
    assert torchaudio.load == self.backend_module.load
    assert torchaudio.save == self.backend_module.save
    assert torchaudio.info == self.backend_module.info
def main(ARGS):
    model_name = "oliverguhr/wav2vec2-large-xlsr-53-german-cv9"
    wave_buffer = BehaviorSubject(np.array([]))
    wave2vec_asr = Wave2Vec2Inference(model_name)
    wave_buffer.subscribe(on_next=lambda x: asr_output_formatter(wave2vec_asr, x))

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Load silero VAD
    torchaudio.set_audio_backend("soundfile")
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model=ARGS.silaro_model_name,
                                  force_reload=ARGS.reload,
                                  onnx=True)
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

    # Stream from microphone to Wav2Vec 2.0 using VAD
    print("audio length\tinference time\ttext")
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')

    wav_data = bytearray()
    try:
        for frame in frames:
            if frame is not None:
                if spinner:
                    spinner.start()
                wav_data.extend(frame)
            else:
                if spinner:
                    spinner.stop()
                # print("webRTC has detected a possible speech")
                newsound = np.frombuffer(wav_data, np.int16)
                audio_float32 = Int2FloatSimple(newsound)
                time_stamps = get_speech_timestamps(audio_float32, model, sampling_rate=ARGS.rate)
                if len(time_stamps) > 0:
                    # print("silero VAD has detected a possible speech")
                    wave_buffer.on_next(audio_float32.numpy())
                else:
                    print("VAD detected noise")
                wav_data = bytearray()
    except KeyboardInterrupt:
        exit()
def __init__(self, csv_dir, sample_rate=44100, segment=2):
    self.csv_dir = csv_dir
    self.segment = segment
    self.sample_rate = sample_rate
    self.mix_csv_path = os.path.join(self.csv_dir, 'metadata.csv')
    self.df_mix = pd.read_csv(self.mix_csv_path, engine='python', delimiter=';')
    torchaudio.set_audio_backend(backend='soundfile')
def main():
    torch.multiprocessing.set_sharing_strategy('file_system')
    torchaudio.set_audio_backend('sox_io')
    hack_isinstance()

    # Get config and arguments
    mode, args1, config1, args2, config2 = get_ttest_args()

    # Fix seed and make backends deterministic
    random.seed(args1.seed)
    np.random.seed(args1.seed)
    torch.manual_seed(args1.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args1.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    tester1 = Tester(args1, config1)
    records1 = eval(f'tester1.{args1.mode}')()
    average1, sample_metric1 = process_records(records1, args1.evaluate_metric)

    tester2 = Tester(args2, config2)
    records2 = eval(f'tester2.{args2.mode}')()
    average2, sample_metric2 = process_records(records2, args2.evaluate_metric)

    if mode == 'ttest':
        statistic, p_value = stats.ttest_rel(sample_metric1, sample_metric2)
    elif mode == 'fisher':
        correct1 = sample_metric1.count(True)
        correct2 = sample_metric2.count(True)
        contingency_table = [[correct1, correct2],
                             [len(sample_metric1) - correct1, len(sample_metric2) - correct2]]
        statistic, p_value = stats.fisher_exact(contingency_table)
    elif mode == 'mcnemar':
        correct1 = sample_metric1.count(True)
        correct2 = sample_metric2.count(True)
        contingency_table = [[correct1, correct2],
                             [len(sample_metric1) - correct1, len(sample_metric2) - correct2]]
        b = mcnemar(contingency_table, exact=True)
        statistic, p_value = b.statistic, b.pvalue
    else:
        raise NotImplementedError

    print(f'[Runner] - The testing scores of the two ckpts are {average1} and {average2}, respectively.')
    print(f'[Runner] - The statistic of the significance test of the two ckpts is {statistic}')
    print(f'[Runner] - The p-value of the significance test of the two ckpts is {p_value}')
def check_torchaudio_backend():
    """Check the torchaudio backend and set it to soundfile if Windows is detected."""
    current_system = platform.system()
    if current_system == "Windows":
        logger.warning(  # logger.warn is a deprecated alias of logger.warning
            "The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows."
        )
        torchaudio.set_audio_backend("soundfile")
def test_1_save(self):
    for backend in BACKENDS_MP3:
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_1_save(self.test_filepath, False)

    for backend in BACKENDS:
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_1_save(self.test_filepath_wav, True)
def test_2_load(self):
    for backend in BACKENDS_MP3:
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_2_load(self.test_filepath, 278756)

    for backend in BACKENDS:
        with self.subTest():
            torchaudio.set_audio_backend(backend)
            self._test_2_load(self.test_filepath_wav, 276858)
def __init__(self, csv_dir, sample_rate=44100, segment=3):
    self.segment = segment
    self.sample_rate = sample_rate
    self.paths = [
        os.path.join(csv_dir, f) for f in os.listdir(csv_dir)
        if os.path.isfile(os.path.join(csv_dir, f)) and '.wav' in f
    ]
    self.paths = sorted(self.paths,
                        key=lambda i: int(os.path.splitext(os.path.basename(i))[0]))
    torchaudio.set_audio_backend(backend='soundfile')
def test_trackfolder_var(torch_backend):
    torchaudio.set_audio_backend(torch_backend)
    train_dataset = data.VariableSourcesTrackFolderDataset(
        split="train",
        seq_duration=1.0,
        root="TrackfolderDataset",
        sample_rate=8000.0,
        target_file="1.wav",
    )
    for x, y in train_dataset:
        assert x.shape[-1] == 8000
def test_trackfolder_fix(torch_backend):
    torchaudio.set_audio_backend(torch_backend)
    train_dataset = data.FixedSourcesTrackFolderDataset(
        split="train",
        seq_duration=1.0,
        root="TrackfolderDataset",
        sample_rate=8000.0,
        target_file="1.wav",
        interferer_files=["2.wav", "3.wav", "4.wav"],
    )
    for x, y in train_dataset:
        assert x.shape[-1] == 8000
def set_audio_backend(backend):
    """Allow additional backend value, 'default'"""
    if backend == 'default':
        if 'sox' in BACKENDS:
            be = 'sox'
        elif 'soundfile' in BACKENDS:
            be = 'soundfile'
        else:
            raise unittest.SkipTest('No default backend available')
    else:
        be = backend

    torchaudio.set_audio_backend(be)
def set_audio_backend(backend):
    """Allow additional backend value, 'default'"""
    backends = torchaudio.list_audio_backends()
    if backend == 'default':
        if 'sox_io' in backends:
            be = 'sox_io'
        elif 'soundfile' in backends:
            be = 'soundfile'
        else:
            raise unittest.SkipTest('No default backend available')
    else:
        be = backend

    torchaudio.set_audio_backend(be)
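# Usage sketch for the helper above (an assumption, not from the original
# file): a test can request 'default' and skip itself when no backend exists.
import unittest

class BackendTest(unittest.TestCase):
    def test_load_with_default_backend(self):
        set_audio_backend('default')  # raises unittest.SkipTest if unavailable
        waveform, sample_rate = torchaudio.load('assets/sinewave.wav')
        self.assertEqual(waveform.dim(), 2)  # (channels, time)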
def main():
    torch.multiprocessing.set_sharing_strategy('file_system')
    torchaudio.set_audio_backend('sox_io')
    hack_isinstance()

    # Get config and arguments
    args, config, backup_files = get_downstream_args()
    if args.cache_dir is not None:
        torch.hub.set_dir(args.cache_dir)

    # When torch.distributed.launch is used
    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(args.backend)

    if args.mode == 'train' and args.past_exp:
        ckpt = torch.load(args.init_ckpt, map_location='cpu')

        now_use_ddp = is_initialized()
        original_use_ddp = ckpt['Args'].local_rank is not None
        assert now_use_ddp == original_use_ddp, f'{now_use_ddp} != {original_use_ddp}'

        if now_use_ddp:
            now_world = get_world_size()
            original_world = ckpt['WorldSize']
            assert now_world == original_world, f'{now_world} != {original_world}'

    # Save command
    if is_leader_process():
        with open(os.path.join(args.expdir, f'args_{get_time_tag()}.yaml'), 'w') as file:
            yaml.dump(vars(args), file)
        with open(os.path.join(args.expdir, f'config_{get_time_tag()}.yaml'), 'w') as file:
            yaml.dump(config, file)
        for file in backup_files:
            backup(file, args.expdir)

    # Fix seed and make backends deterministic
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    runner = Runner(args, config)
    eval(f'runner.{args.mode}')()
def _test_3_load_and_save_is_identity_across_backend(self, backend1, backend2):
    torchaudio.set_audio_backend(backend1)
    input_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
    tensor1, sample_rate1 = torchaudio.load(input_path)

    output_path = os.path.join(self.test_dirpath, 'test.wav')
    torchaudio.save(output_path, tensor1, sample_rate1)

    torchaudio.set_audio_backend(backend2)
    tensor2, sample_rate2 = torchaudio.load(output_path)

    self.assertTrue(tensor1.allclose(tensor2))
    self.assertEqual(sample_rate1, sample_rate2)
    os.unlink(output_path)
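# A plausible driver for the cross-backend helper above (an assumption,
# not from the original file): exercise every ordered pair of distinct
# backends so that save/load round-trips are checked in both directions.
import itertools

def test_3_load_and_save_is_identity_across_backend(self):
    for backend1, backend2 in itertools.permutations(BACKENDS, 2):
        with self.subTest(backend1=backend1, backend2=backend2):
            self._test_3_load_and_save_is_identity_across_backend(backend1, backend2)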
def main(ARGS):
    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Load silero VAD
    torchaudio.set_audio_backend("soundfile")
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model=ARGS.silaro_model_name,
                                  force_reload=ARGS.reload)
    (get_speech_ts, _, _, _, _, _, _) = utils

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')

    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner:
                spinner.start()
            wav_data.extend(frame)
        else:
            if spinner:
                spinner.stop()
            print("webRTC has detected a possible speech")
            newsound = np.frombuffer(wav_data, np.int16)
            audio_float32 = Int2Float(newsound)
            time_stamps = get_speech_ts(audio_float32, model,
                                        num_steps=ARGS.num_steps,
                                        trig_sum=ARGS.trig_sum,
                                        neg_trig_sum=ARGS.neg_trig_sum,
                                        num_samples_per_window=ARGS.num_samples_per_window,
                                        min_speech_samples=ARGS.min_speech_samples,
                                        min_silence_samples=ARGS.min_silence_samples)
            if len(time_stamps) > 0:
                print("silero VAD has detected a possible speech")
            else:
                print("silero VAD has detected a noise")
            print()
            wav_data = bytearray()
def __init__(self, mode="fbank", num_mel_bins=80, decode_wav=False, apply_cmvn=True, **kwargs):
    super(FeatureExtractor, self).__init__()
    # ToDo: Other surface representation
    assert mode == "fbank", "Only Mel-spectrogram implemented"
    self.mode = mode
    self.extract_fn = kaldi.fbank
    self.apply_cmvn = apply_cmvn
    if self.apply_cmvn:
        self.cmvn = CMVN()
    self.num_mel_bins = num_mel_bins
    self.kwargs = kwargs
    self.decode_wav = decode_wav
    if self.decode_wav:
        # HACK: sox cannot deal with wav with incorrect file length
        torchaudio.set_audio_backend('soundfile')
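# A minimal sketch of the extraction this class wraps; the waveform source
# and the fbank kwargs are assumptions, not taken from the original code.
import torchaudio
import torchaudio.compliance.kaldi as kaldi

waveform, sample_rate = torchaudio.load('example.wav')  # (channels, time)
feat = kaldi.fbank(waveform, num_mel_bins=80, sample_frequency=sample_rate)
# feat has shape (frames, num_mel_bins); CMVN then normalizes each bin.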