Example #1
0
    def __init__(self,
                 method='post',
                 mono='mean',
                 spectra_type=None,
                 device=None,
                 para=None,
                 spec_aug=False):
        """Build the spectrum / dB / image transforms from a parameter mapping.

        Args:
            method: Stored as ``self.method``; not interpreted here.
            mono: Stored as ``self.mono``; not interpreted here.
            spectra_type: ``'mel_spectrum'`` selects
                ``self.trans_melspectrogram()``; any other value selects
                ``self.trans_spectrogram()``.
            device: Passed to ``.to()`` for the spectrum and dB transforms.
            para: Mapping providing the keys ``'fs'``, ``'time'``,
                ``'n_fft'``, ``'n_mels'``, ``'win_length'`` and
                ``'hop_length'``. Required even though it defaults to None.
            spec_aug: Stored as ``self.spec_aug`` (SpecAugment switch).

        Raises:
            TypeError: If ``para`` is None.
        """
        # Fail with an explicit message instead of the opaque
        # "'NoneType' object is not subscriptable" raised by para['fs'].
        if para is None:
            raise TypeError(
                "para is required: expected a mapping with the keys 'fs', "
                "'time', 'n_fft', 'n_mels', 'win_length' and 'hop_length'")

        self.method = method
        self.mono = mono
        self.spectra_type = spectra_type
        self.device = device
        self.fs = para['fs']
        self.time = para['time']
        self.n_fft = para['n_fft']
        self.n_mels = para['n_mels']
        self.win_length = para['win_length']
        self.hop_length = para['hop_length']

        # The trans_* factories run only after the parameters above are set.
        if self.spectra_type == 'mel_spectrum':
            self.spectrum = self.trans_melspectrogram()
        else:
            self.spectrum = self.trans_spectrogram()
        self.am_to_db = self.trans_am_to_db()
        self.au_to_img = self.trans_autoimg()

        # SpecAugment
        self.spec_aug = spec_aug

        torch_audio.set_audio_backend(backend="sox_io")
        self.spectrum = self.spectrum.to(self.device)
        self.am_to_db = self.am_to_db.to(self.device)
Example #2
0
 def test_3_load_and_save_is_identity(self):
     """Run ``_test_3_load_and_save_is_identity`` once per non-sox_io backend.

     Each backend runs in its own labelled sub-test so a failure report
     names the backend that caused it.
     """
     for backend in BACKENDS:
         if backend == 'sox_io':
             continue
         with self.subTest(backend=backend):
             torchaudio.set_audio_backend(backend)
             self._test_3_load_and_save_is_identity()
Example #3
0
def AudioBackendScope(new_backend):
    """Yield once with ``new_backend`` active, then restore the prior backend.

    The backend that was active on entry is re-applied in ``finally``, so it
    is restored even if switching to ``new_backend`` or the body raises.

    NOTE(review): presumably wrapped with ``contextlib.contextmanager`` just
    above this definition -- the decorator is not visible here; confirm.
    """
    saved_backend = torchaudio.get_audio_backend()
    try:
        torchaudio.set_audio_backend(new_backend)
        yield
    finally:
        torchaudio.set_audio_backend(saved_backend)
Example #4
0
 def test_2_load_nonormalization(self):
     """Run ``_test_2_load_nonormalization`` once per non-sox_io MP3 backend.

     Each backend runs in its own labelled sub-test so a failure report
     names the backend that caused it.
     """
     for backend in BACKENDS_MP3:
         if backend == 'sox_io':
             continue
         with self.subTest(backend=backend):
             torchaudio.set_audio_backend(backend)
             self._test_2_load_nonormalization(self.test_filepath, 278756)
def info(request, torch_backend):
    """Select ``torch_backend`` and return audio info when requested.

    Returns ``data.load_info(audio_path)`` if ``request.param`` is truthy,
    otherwise ``None``.
    """
    torchaudio.set_audio_backend(torch_backend)
    return data.load_info(audio_path) if request.param else None
Example #6
0
 def test_1_save_sine(self):
     """Run ``_test_1_save_sine`` once per backend other than 'sox_io'.

     Each backend runs in its own labelled sub-test so a failure report
     names the backend that caused it.
     """
     for backend in BACKENDS:
         if backend == 'sox_io':
             continue
         with self.subTest(backend=backend):
             torchaudio.set_audio_backend(backend)
             self._test_1_save_sine()
Example #7
0
 def test_4_load_partial(self):
     """Run ``_test_4_load_partial`` once per non-sox_io MP3 backend.

     Each backend runs in its own labelled sub-test so a failure report
     names the backend that caused it.
     """
     for backend in BACKENDS_MP3:
         if backend == 'sox_io':
             continue
         with self.subTest(backend=backend):
             torchaudio.set_audio_backend(backend)
             self._test_4_load_partial()
Example #8
0
    def _decode_example_with_torchaudio(self, value):
        """Load ``value`` with torchaudio and return ``(array, sampling_rate)``.

        The audio is resampled to ``self.sampling_rate`` when that attribute
        is set and differs from the file's own rate, and averaged over axis 0
        when ``self.mono`` is truthy.

        Args:
            value: Anything accepted by ``torchaudio.load``.

        Returns:
            Tuple of the decoded NumPy array and its sampling rate.

        Raises:
            ImportError: If torchaudio cannot be imported, or if selecting the
                ``sox_io`` backend raises ``RuntimeError``.
        """
        try:
            import torchaudio
            import torchaudio.transforms as T
        except ImportError as err:
            raise ImportError(
                "To support decoding 'mp3' audio files, please install 'torchaudio'."
            ) from err
        try:
            torchaudio.set_audio_backend("sox_io")
        except RuntimeError as err:
            raise ImportError(
                "To support decoding 'mp3' audio files, please install 'sox'."
            ) from err

        array, sampling_rate = torchaudio.load(value)
        if self.sampling_rate and self.sampling_rate != sampling_rate:
            # A cached resampler is only valid for the source rate it was
            # built for; rebuild it when this file's rate differs, otherwise
            # the audio would be resampled with the wrong ratio.
            resampler = getattr(self, "_resampler", None)
            if (resampler is None
                    or getattr(resampler, "orig_freq", None) != sampling_rate):
                resampler = T.Resample(sampling_rate, self.sampling_rate)
                self._resampler = resampler
            array = resampler(array)
            sampling_rate = self.sampling_rate
        array = array.numpy()
        if self.mono:
            array = array.mean(axis=0)
        return array, sampling_rate
Example #9
0
 def supports_mp3(backend):
     """Report whether ``backend`` can load the module-level ``test_filepath``.

     The backend is selected first (errors from that call propagate); only a
     ``RuntimeError`` or ``ImportError`` raised by the load yields ``False``.
     """
     torchaudio.set_audio_backend(backend)
     try:
         torchaudio.load(test_filepath)
     except (RuntimeError, ImportError):
         return False
     else:
         return True
 def __init__(self,
              manifest_path: list,
              labels,
              max_duration=16.7,
              mask=False,
              win_len=0.02,
              sr=16000):
     """Load JSON-lines manifests, dropping entries longer than ``max_duration``.

     Args:
         manifest_path: Iterable of manifest file paths. Each non-blank line
             must be a JSON object containing a ``'duration'`` value.
         labels: Indexable collection of label symbols; used to build the
             ``index2char`` / ``char2index`` lookup tables.
         max_duration: Entries whose ``'duration'`` exceeds this are filtered
             out and counted in the per-manifest log output.
         mask: Stored as ``self.mask``.
         win_len: Forwarded to ``AudioParser``.
         sr: Forwarded to ``AudioParser``.
     """
     torchaudio.set_audio_backend("sox_io")
     self.datasets = []
     self.labels = labels
     self.mask = mask
     for item in manifest_path:
         total_count = 0
         total_duration = 0.
         with open(item, encoding='utf-8') as f:
             for line in f:
                 # Tolerate blank lines (e.g. a trailing newline).
                 if not line.strip():
                     continue
                 # json.loads() no longer accepts ``encoding`` (removed in
                 # Python 3.9); the file is already decoded as UTF-8.
                 data = json.loads(line)
                 if data['duration'] > max_duration:
                     total_count += 1
                     total_duration += data['duration']
                     continue
                 self.datasets.append(data)
         total_duration = total_duration / 60
         # Log messages: number of filtered clips / filtered duration in minutes.
         logging.info("过滤音频条数:{:d}条".format(total_count))
         logging.info("过滤音频时长:{:.2f}分钟".format(total_duration))
     self.index2char = dict(enumerate(labels))
     self.char2index = {char: index for index, char in enumerate(labels)}
     self.audio_parser = AudioParser(win_len=win_len, sr=sr)
Example #11
0
def conversion(wav_file):
    """Load ``wav_file`` as mono 44.1 kHz audio and reduce it to 32000 samples.

    The signal is zero-padded or truncated to 176400 samples (4 s at
    44.1 kHz), then every ``176400 // 32000`` (= 5th) sample is kept, starting
    at index 0, until 32000 samples are collected. Only the first 160000
    padded samples are therefore visited.

    Args:
        wav_file: Path (or other source accepted by ``librosa.load``).

    Returns:
        A 1-D float tensor with 32000 elements.
    """
    torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False
    torchaudio.set_audio_backend("soundfile")

    # ``sr`` must be passed by keyword: positional use is rejected by
    # librosa >= 0.10.
    data, _ = librosa.load(wav_file, sr=44100, mono=True)
    sound = torch.from_numpy(data)

    fixed_len = 176400
    out_len = 32000

    # Fix all wav files to 176400 samples; short clips stay zero-padded.
    padded_data = torch.zeros(fixed_len)
    n_copy = min(sound.numel(), fixed_len)
    padded_data[:n_copy] = sound[:n_copy]

    # Strided slice replaces the per-sample Python loop; clone() returns an
    # independent tensor like the original freshly allocated output.
    every_n = fixed_len // out_len
    return padded_data[:out_len * every_n:every_n].clone()
Example #12
0
 def test_5_get_info(self):
     """Run ``_test_5_get_info`` once per backend other than 'sox_io'.

     Each backend runs in its own labelled sub-test so a failure report
     names the backend that caused it.
     """
     for backend in BACKENDS:
         if backend == 'sox_io':
             continue
         with self.subTest(backend=backend):
             torchaudio.set_audio_backend(backend)
             self._test_5_get_info()
def test_loadwav(dur, info, torch_backend):
    """Check the length of audio returned by ``data.load_audio``.

    With a truthy ``dur`` the last dimension must equal ``int(dur * 8000.0)``;
    otherwise it must equal ``8000.0 * 3``.
    """
    torchaudio.set_audio_backend(torch_backend)
    waveform, _ = data.load_audio(audio_path, dur=dur, info=info)
    rate = 8000.0
    expected_len = int(dur * rate) if dur else rate * 3
    assert waveform.shape[-1] == expected_len
Example #14
0
def change_samplerate():
    """Convert ``audio.wav`` to a channel-averaged 16 kHz ``sampled_audio.wav``.

    The input is loaded with the currently active backend, averaged over its
    first dimension, resampled to 16 kHz when its rate differs, and saved
    after switching to the ``'sox'`` backend.
    """
    y, sr = torchaudio.load('audio.wav')
    y = y.mean(dim=0)  # if there are multiple channels, average them to single channel
    if sr != 16000:
        y_resampled = torchaudio.transforms.Resample(sr, 16000)(y)
    else:
        # Already at the target rate; previously this left y_resampled
        # unbound and the save below failed.
        y_resampled = y
    torchaudio.set_audio_backend(backend='sox')
    torchaudio.save(src=y_resampled, sample_rate=16000, filepath='sampled_audio.wav')
Example #15
0
 def test_switch(self):
     """Check that switching to ``self.backend`` rebinds the torchaudio I/O API.

     After the switch, ``get_audio_backend()`` must report ``self.backend``
     (``None`` when no backend is configured) and ``load`` / ``save`` /
     ``info`` must be the functions of ``self.backend_module``.
     """
     torchaudio.set_audio_backend(self.backend)
     active = torchaudio.get_audio_backend()
     if self.backend is not None:
         assert active == self.backend
     else:
         assert active is None
     for func_name in ('load', 'save', 'info'):
         assert getattr(torchaudio, func_name) == getattr(self.backend_module, func_name)
Example #16
0
def main(ARGS):
    """Stream microphone audio through two VAD stages into Wav2Vec2 inference.

    Frames from ``VADAudio.vad_collector()`` are accumulated until a ``None``
    frame arrives; the buffered audio is then checked with silero VAD and,
    when speech timestamps are found, pushed to ``wave_buffer`` whose
    subscriber runs ``asr_output_formatter``.

    Args:
        ARGS: Namespace providing ``webRTC_aggressiveness``, ``device``,
            ``rate``, ``silaro_model_name``, ``reload`` and ``nospinner``.

    Raises:
        SystemExit: When interrupted with Ctrl-C.
    """
    model_name = "oliverguhr/wav2vec2-large-xlsr-53-german-cv9"

    wave_buffer = BehaviorSubject(np.array([]))
    wave2vec_asr = Wave2Vec2Inference(model_name)
    wave_buffer.subscribe(
        on_next=lambda x: asr_output_formatter(wave2vec_asr, x))

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)

    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # load silero VAD
    torchaudio.set_audio_backend("soundfile")
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model=ARGS.silaro_model_name,
                                  force_reload=ARGS.reload,
                                  onnx=True)
    # Only the first helper is used; indexing avoids breaking when the hub
    # entry point returns a different number of helpers.
    get_speech_timestamps = utils[0]

    # Stream from microphone to Wav2Vec 2.0 using VAD
    print("audio length\tinference time\ttext")
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    wav_data = bytearray()
    try:
        for frame in frames:
            if frame is not None:
                if spinner:
                    spinner.start()
                wav_data.extend(frame)
                continue

            # A None frame closes the current segment.
            if spinner:
                spinner.stop()

            newsound = np.frombuffer(wav_data, np.int16)
            audio_float32 = Int2FloatSimple(newsound)
            time_stamps = get_speech_timestamps(audio_float32,
                                                model,
                                                sampling_rate=ARGS.rate)

            if len(time_stamps) > 0:
                wave_buffer.on_next(audio_float32.numpy())
            else:
                print("VAD detected noise")
            wav_data = bytearray()
    except KeyboardInterrupt:
        # Leave the terminal clean, then exit without relying on the
        # site-provided ``exit`` helper (absent under ``python -S``).
        if spinner:
            spinner.stop()
        raise SystemExit
Example #17
0
 def __init__(self, csv_dir, sample_rate=44100, segment=2):
     """Read ``metadata.csv`` from ``csv_dir`` and select the soundfile backend.

     Args:
         csv_dir: Directory containing a ';'-delimited ``metadata.csv``.
         sample_rate: Stored as ``self.sample_rate``.
         segment: Stored as ``self.segment``.
     """
     self.csv_dir = csv_dir
     self.segment, self.sample_rate = segment, sample_rate
     metadata_path = os.path.join(csv_dir, 'metadata.csv')
     self.mix_csv_path = metadata_path
     self.df_mix = pd.read_csv(metadata_path, engine='python', delimiter=';')
     torchaudio.set_audio_backend(backend='soundfile')
Example #18
0
def main():
    """Evaluate two checkpoints and print a significance test of their scores.

    Both testers are run in the mode named by their own ``args.mode``; the
    per-sample metrics are then compared with the test selected by ``mode``
    (``'ttest'``, ``'fisher'`` or ``'mcnemar'``).

    Raises:
        NotImplementedError: If ``mode`` is none of the supported tests.
    """
    torch.multiprocessing.set_sharing_strategy('file_system')
    torchaudio.set_audio_backend('sox_io')
    hack_isinstance()

    # get config and arguments
    mode, args1, config1, args2, config2 = get_ttest_args()

    # Fix seed and make backends deterministic
    random.seed(args1.seed)
    np.random.seed(args1.seed)
    torch.manual_seed(args1.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args1.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # getattr dispatches to the named method without evaluating arbitrary
    # code built from a command-line string.
    tester1 = Tester(args1, config1)
    records1 = getattr(tester1, args1.mode)()
    average1, sample_metric1 = process_records(records1, args1.evaluate_metric)

    tester2 = Tester(args2, config2)
    records2 = getattr(tester2, args2.mode)()
    average2, sample_metric2 = process_records(records2, args2.evaluate_metric)

    if mode == 'ttest':
        statistic, p_value = stats.ttest_rel(sample_metric1, sample_metric2)
    elif mode in ('fisher', 'mcnemar'):
        # Rows: correct / incorrect counts; columns: checkpoint 1 / 2.
        correct1 = sample_metric1.count(True)
        correct2 = sample_metric2.count(True)
        contingency_table = [[correct1, correct2],
                             [
                                 len(sample_metric1) - correct1,
                                 len(sample_metric2) - correct2
                             ]]
        if mode == 'fisher':
            statistic, p_value = stats.fisher_exact(contingency_table)
        else:
            # NOTE(review): McNemar's test is normally fed the paired
            # agreement/disagreement table, not per-model marginal counts --
            # confirm this table is what is intended.
            result = mcnemar(contingency_table, exact=True)
            statistic, p_value = result.statistic, result.pvalue
    else:
        raise NotImplementedError

    print(
        f'[Runner] - The testing scores of the two ckpts are {average1} and {average2}, respectively.'
    )
    print(
        f'[Runner] - The statistic of the significant test of the two ckpts is {statistic}'
    )
    print(
        f'[Runner] - The P value of significant test of the two ckpts is {p_value}'
    )
Example #19
0
def check_torchaudio_backend():
    """Switch torchaudio to the 'soundfile' backend when running on Windows.

    On any other platform nothing is changed. A warning is logged before the
    switch.
    """
    if platform.system() == "Windows":
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(
            "The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows."
        )
        torchaudio.set_audio_backend("soundfile")
Example #20
0
    def test_1_save(self):
        """Run ``_test_1_save`` for each MP3 backend, then for each backend.

        The first pass uses ``self.test_filepath`` with ``False``; the second
        uses ``self.test_filepath_wav`` with ``True``. Sub-tests are labelled
        with the backend and file so failures can be told apart.
        """
        for backend in BACKENDS_MP3:
            with self.subTest(backend=backend, filepath=self.test_filepath):
                torchaudio.set_audio_backend(backend)
                self._test_1_save(self.test_filepath, False)

        for backend in BACKENDS:
            with self.subTest(backend=backend, filepath=self.test_filepath_wav):
                torchaudio.set_audio_backend(backend)
                self._test_1_save(self.test_filepath_wav, True)
Example #21
0
    def test_2_load(self):
        """Run ``_test_2_load`` for each MP3 backend, then for each backend.

        The first pass uses ``self.test_filepath`` with 278756; the second
        uses ``self.test_filepath_wav`` with 276858. Sub-tests are labelled
        with the backend and file so failures can be told apart.
        """
        for backend in BACKENDS_MP3:
            with self.subTest(backend=backend, filepath=self.test_filepath):
                torchaudio.set_audio_backend(backend)
                self._test_2_load(self.test_filepath, 278756)

        for backend in BACKENDS:
            with self.subTest(backend=backend, filepath=self.test_filepath_wav):
                torchaudio.set_audio_backend(backend)
                self._test_2_load(self.test_filepath_wav, 276858)
Example #22
0
 def __init__(self, csv_dir, sample_rate=44100, segment=3):
     """Collect the numerically named ``.wav`` files found in ``csv_dir``.

     Args:
         csv_dir: Directory to scan (not recursive). File stems must parse
             as integers because they are used as the sort key.
         sample_rate: Stored as ``self.sample_rate``.
         segment: Stored as ``self.segment``.

     Raises:
         ValueError: If a ``.wav`` file's stem is not an integer.
     """
     self.segment = segment
     self.sample_rate = sample_rate
     # Match on the suffix: a substring test would also accept names such
     # as "1.wav.bak", which then break the integer sort key below.
     candidates = (os.path.join(csv_dir, name)
                   for name in os.listdir(csv_dir)
                   if name.endswith('.wav'))
     self.paths = sorted(
         (path for path in candidates if os.path.isfile(path)),
         key=lambda path: int(os.path.splitext(os.path.basename(path))[0]))
     torchaudio.set_audio_backend(backend='soundfile')
Example #23
0
def test_trackfolder_var(torch_backend):
    """Every item of the variable-sources dataset must have 8000 samples.

    The dataset is built for 1.0 s sequences at a sample rate of 8000.0, and
    the last dimension of each item's first element is checked.
    """
    torchaudio.set_audio_backend(torch_backend)

    dataset_kwargs = dict(
        split="train",
        seq_duration=1.0,
        root="TrackfolderDataset",
        sample_rate=8000.0,
        target_file="1.wav",
    )
    train_dataset = data.VariableSourcesTrackFolderDataset(**dataset_kwargs)
    for sample, _ in train_dataset:
        assert sample.shape[-1] == 8000
Example #24
0
def test_trackfolder_fix(torch_backend):
    """Every item of the fixed-sources dataset must have 8000 samples.

    The dataset is built for 1.0 s sequences at a sample rate of 8000.0, and
    the last dimension of each item's first element is checked.
    """
    torchaudio.set_audio_backend(torch_backend)

    dataset_kwargs = dict(
        split="train",
        seq_duration=1.0,
        root="TrackfolderDataset",
        sample_rate=8000.0,
        target_file="1.wav",
        interferer_files=["2.wav", "3.wav", "4.wav"],
    )
    train_dataset = data.FixedSourcesTrackFolderDataset(**dataset_kwargs)
    for sample, _ in train_dataset:
        assert sample.shape[-1] == 8000
Example #25
0
def set_audio_backend(backend):
    """Set the torchaudio backend, resolving the extra value 'default'.

    'default' picks 'sox' when present in ``BACKENDS``, else 'soundfile';
    if neither is present the test is skipped. Any other value is passed
    through unchanged.
    """
    if backend != 'default':
        resolved = backend
    elif 'sox' in BACKENDS:
        resolved = 'sox'
    elif 'soundfile' in BACKENDS:
        resolved = 'soundfile'
    else:
        raise unittest.SkipTest('No default backend available')

    torchaudio.set_audio_backend(resolved)
Example #26
0
def set_audio_backend(backend):
    """Set the torchaudio backend, resolving the extra value 'default'.

    'default' picks 'sox_io' when torchaudio lists it, else 'soundfile';
    if neither is listed the test is skipped. Any other value is passed
    through unchanged.
    """
    available = torchaudio.list_audio_backends()
    chosen = backend
    if backend == 'default':
        # First preferred backend that is actually available, if any.
        chosen = next(
            (name for name in ('sox_io', 'soundfile') if name in available),
            None)
        if chosen is None:
            raise unittest.SkipTest('No default backend available')

    torchaudio.set_audio_backend(chosen)
Example #27
0
def main():
    """Entry point: set up the run, back up its configuration, start the runner.

    Steps: select the audio backend, parse arguments, optionally initialise
    the distributed process group, verify that a resumed training run uses
    the same DDP setup as its checkpoint, dump args/config into ``expdir``
    (leader process only), seed the RNGs, then call the ``Runner`` method
    named by ``args.mode``.
    """
    torch.multiprocessing.set_sharing_strategy('file_system')
    torchaudio.set_audio_backend('sox_io')
    hack_isinstance()

    # get config and arguments
    args, config, backup_files = get_downstream_args()
    if args.cache_dir is not None:
        torch.hub.set_dir(args.cache_dir)

    # When torch.distributed.launch is used
    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(args.backend)

    if args.mode == 'train' and args.past_exp:
        ckpt = torch.load(args.init_ckpt, map_location='cpu')

        # A resumed run must use the same DDP configuration as the run
        # that produced the checkpoint.
        now_use_ddp = is_initialized()
        original_use_ddp = ckpt['Args'].local_rank is not None
        assert now_use_ddp == original_use_ddp, f'{now_use_ddp} != {original_use_ddp}'

        if now_use_ddp:
            now_world = get_world_size()
            original_world = ckpt['WorldSize']
            assert now_world == original_world, f'{now_world} != {original_world}'

    # Save command
    if is_leader_process():
        # One tag for both dumps so the two files always share a suffix.
        time_tag = get_time_tag()
        with open(os.path.join(args.expdir, f'args_{time_tag}.yaml'),
                  'w') as args_file:
            yaml.dump(vars(args), args_file)

        with open(os.path.join(args.expdir, f'config_{time_tag}.yaml'),
                  'w') as config_file:
            yaml.dump(config, config_file)

        for backup_path in backup_files:
            backup(backup_path, args.expdir)

    # Fix seed and make backends deterministic
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    runner = Runner(args, config)
    # getattr dispatches to the named method without evaluating arbitrary
    # code built from a command-line string.
    getattr(runner, args.mode)()
Example #28
0
    def _test_3_load_and_save_is_identity_across_backend(self, backend1, backend2):
        """Save with ``backend1``, reload with ``backend2``, expect equal audio.

        ``assets/sinewave.wav`` is loaded and written to ``test.wav`` under
        ``backend1``; the copy is then read back under ``backend2`` and
        compared on samples and sample rate. The temporary file is removed
        even when the comparison fails.
        """
        torchaudio.set_audio_backend(backend1)
        input_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
        tensor1, sample_rate1 = torchaudio.load(input_path)

        output_path = os.path.join(self.test_dirpath, 'test.wav')
        torchaudio.save(output_path, tensor1, sample_rate1)
        try:
            torchaudio.set_audio_backend(backend2)
            tensor2, sample_rate2 = torchaudio.load(output_path)

            self.assertTrue(tensor1.allclose(tensor2))
            self.assertEqual(sample_rate1, sample_rate2)
        finally:
            # Do not leave test.wav behind when the load or an assertion fails.
            os.unlink(output_path)
Example #29
0
def main(ARGS):
    """Run WebRTC VAD on microphone audio and re-check segments with silero.

    Frames from ``VADAudio.vad_collector()`` are buffered until a ``None``
    frame arrives; the buffered audio is then passed to the silero
    ``get_speech_ts`` helper and the verdict is printed.

    Args:
        ARGS: Namespace providing the VAD, device and silero tuning options
            read below.
    """
    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)

    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # load silero VAD
    torchaudio.set_audio_backend("soundfile")
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model=ARGS.silaro_model_name,
                                  force_reload=ARGS.reload)
    get_speech_ts, _, _, _, _, _, _ = utils

    # Stream from microphone to DeepSpeech using VAD
    spinner = None if ARGS.nospinner else Halo(spinner='line')
    pcm_buffer = bytearray()
    for frame in frames:
        if frame is not None:
            # Still inside a segment: keep buffering.
            if spinner:
                spinner.start()
            pcm_buffer.extend(frame)
            continue

        if spinner:
            spinner.stop()
        print("webRTC has detected a possible speech")

        samples = Int2Float(np.frombuffer(pcm_buffer, np.int16))
        vad_options = dict(
            num_steps=ARGS.num_steps,
            trig_sum=ARGS.trig_sum,
            neg_trig_sum=ARGS.neg_trig_sum,
            num_samples_per_window=ARGS.num_samples_per_window,
            min_speech_samples=ARGS.min_speech_samples,
            min_silence_samples=ARGS.min_silence_samples,
        )
        time_stamps = get_speech_ts(samples, model, **vad_options)

        print("silero VAD has detected a possible speech"
              if len(time_stamps) > 0 else
              "silero VAD has detected a noise")
        print()
        pcm_buffer = bytearray()
Example #30
0
 def __init__(self, mode="fbank", num_mel_bins=80, decode_wav=False,
              apply_cmvn=True, **kwargs):
     """Configure fbank feature extraction.

     Args:
         mode: Must be ``"fbank"``; enforced with an assertion.
         num_mel_bins: Stored as ``self.num_mel_bins``.
         decode_wav: When truthy, the torchaudio backend is switched to
             'soundfile'.
         apply_cmvn: When truthy, a ``CMVN()`` instance is created as
             ``self.cmvn``.
         **kwargs: Stored unchanged as ``self.kwargs``.
     """
     super(FeatureExtractor, self).__init__()
     # ToDo: Other surface representation
     assert mode == "fbank", "Only Mel-spectrogram implemented"

     self.mode = mode
     self.num_mel_bins = num_mel_bins
     self.decode_wav = decode_wav
     self.apply_cmvn = apply_cmvn
     self.kwargs = kwargs
     self.extract_fn = kaldi.fbank

     if apply_cmvn:
         self.cmvn = CMVN()
     if decode_wav:
         # HACK: sox cannot deal with wav with incorrect file length
         torchaudio.set_audio_backend('soundfile')