Example #1
 def test_sph_files(self, file, fails):
     # Some SPHERE files can be read with soundfile, but not all.
     path = get_file_path(file)
     if fails:
         with pytest.raises(RuntimeError):
             load_audio(path)
     else:
         load_audio(path)
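The parametrization is not part of this excerpt. A minimal sketch of how such a test is typically parametrized with pytest, written as a plain function (the file names and failure flags below are hypothetical, not taken from the original test suite):

import pytest

@pytest.mark.parametrize('file, fails', [
    ('readable.sph', False),   # hypothetical SPHERE file soundfile can read
    ('shortened.sph', True),   # hypothetical file expected to raise
])
def test_sph_files(file, fails):
    ...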
Example #2
 def test_dtype(
     self,
     array_dtype,
     dump_type,
     dumped_type,
     load_type,
     loaded_dtype,
 ):
     a = np.array([1, 2, -4, 4], dtype=array_dtype)
     dump_audio(a, path, dtype=dump_type, normalize=False)
     assert get_audio_type(path) == dumped_type
     b = load_audio(path, dtype=load_type)
     assert b.dtype == loaded_dtype
     content = io.BytesIO(dumps_audio(a, dtype=dump_type, normalize=False))
     c = load_audio(content, dtype=load_type)
     # The in-memory round trip should yield the same dtype as the file-based one.
     assert c.dtype == loaded_dtype
Example #3
 def test_default_wo_normalize(self):
     a = np.array([1, 2, -4, 4], dtype=np.int16)
     dump_audio(a, path, normalize=False)
     assert get_audio_type(path) == "PCM_16"
     b = load_audio(path)
     assert b.dtype == np.float64
     np.testing.assert_allclose(b, a / 2**15)
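The scale factor in the final assertion follows the usual PCM convention: 16-bit samples are mapped to floating point by dividing by 2**15 = 32768, so the int16 range lands in [-1.0, 1.0). A minimal standalone sketch of that conversion, independent of load_audio:

import numpy as np

a = np.array([1, 2, -4, 4], dtype=np.int16)
scaled = a.astype(np.float64) / 2**15  # full scale of int16 is 2**15
# scaled is approximately [3.05e-05, 6.10e-05, -1.22e-04, 1.22e-04]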
Example #4
def pre_batch_transform(inputs):
    return {
        's': np.ascontiguousarray(
            [load_audio(p) for p in inputs['audio_path']['speech_source']],
            np.float32),
        'y': np.ascontiguousarray(
            load_audio(inputs['audio_path']['observation']), np.float32),
        'num_samples': inputs['num_samples'],
        'example_id': inputs['example_id'],
        'audio_path': inputs['audio_path'],
    }
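A transform like this is applied per example before batching. A hedged usage sketch, assuming a lazy_dataset-style dataset of example dictionaries with the keys accessed above (the dataset construction is hypothetical):

dataset = get_dataset()  # hypothetical helper returning a lazy_dataset
dataset = dataset.map(pre_batch_transform)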
Example #5
    def setUp(self):
        path = get_file_path("sample.wav")

        self.time_signal = load_audio(path)
        # self.time_signal = np.random.randn(5, 3, 5324)
        self.torch_signal = torch.from_numpy(self.time_signal)
        self.stft = STFT(size=self.size,
                         shift=self.shift,
                         window_length=self.window_length,
                         fading=self.fading,
                         complex_representation='concat',
                         window=self.window)
        self.fbins = self.stft.size // 2 + 1
Example #6
def eval_estimator(db_json,
                   scenario,
                   ref_node_id,
                   vad_threshold,
                   activity_threshold):
    msg = ('scenario must be "Scenario-1", "Scenario-2", '
           '"Scenario-3" or "Scenario-4"')
    scenarios = ['Scenario-1', 'Scenario-2', 'Scenario-3', 'Scenario-4']
    assert scenario in scenarios, msg

    if scenario == 'Scenario-1':
        db = AsyncWASN(db_json).get_data_set_scenario_1()
    elif scenario == 'Scenario-2':
        db = AsyncWASN(db_json).get_data_set_scenario_2()
    elif scenario == 'Scenario-3':
        db = AsyncWASN(db_json).get_data_set_scenario_3()
    elif scenario == 'Scenario-4':
        db = AsyncWASN(db_json).get_data_set_scenario_4()

    sro_estimator = DynamicWACD()
    voice_activity_detector = VoiceActivityDetector(vad_threshold)
    num_examples = 3 * len(db)
    errors = np.zeros(num_examples)
    for ex_id, example in enumerate(db):
        print(f'Process example {example["example_id"].split("_")[-1]}')
        all_dists = get_distances(example)
        ref_sig = load_audio(example['audio_path'][f'node_{ref_node_id}'])
        other_nodes = [i for i in range(4) if i != ref_node_id]
        for cnt, node_id in enumerate(other_nodes):
            sig = load_audio(example['audio_path'][f'node_{node_id}'])

            # Align the signals coarsely
            sig_sync, ref_sig_sync, offset = \
                coarse_sync(sig, ref_sig, len_sync=320000)

            # Estimate the sampling rate offset (SRO)
            activity_sig = voice_activity_detector(sig_sync)
            activity_ref_sig = voice_activity_detector(ref_sig_sync)
            sro_est = sro_estimator(
                sig_sync, ref_sig_sync, activity_sig, activity_ref_sig
            )

            # Compensate for the SRO
            sig_sync = compensate_sro(sig_sync, sro_est)
            ref_sig_sync = ref_sig_sync[:len(sig_sync)]

            # Estimate the time shifts and distances
            sig_shifts = est_time_shift(sig_sync, ref_sig_sync, 16384, 2048)
            if offset > 0:
                dists = all_dists[int(np.round(offset)):, node_id]
                dists_ref = all_dists[:, ref_node_id]
            else:
                dists = all_dists[:, node_id]
                dists_ref = all_dists[int(np.round(-offset)):, ref_node_id]
            frame_ids = \
                8192 + np.asarray([i*2048 for i in range(len(sig_shifts))])
            dists = dists[frame_ids]
            dists_ref = dists_ref[frame_ids]

            # Discard estimates corresponding to periods in time
            # without source activity
            activity_ref_sig = voice_activity_detector(ref_sig_sync)
            activity_ref_sig = \
                (segment_axis(activity_ref_sig, 16384, 2048).sum(-1)
                 > activity_threshold)
            activity_sig = voice_activity_detector(sig_sync)
            activity_sig = (segment_axis(activity_sig, 16384, 2048).sum(-1)
                            > activity_threshold)
            activity_mask = np.logical_and(activity_sig, activity_ref_sig)
            sig_shifts = sig_shifts[activity_mask]
            dists = dists[activity_mask]
            dists_ref = dists_ref[activity_mask]

            # Estimate the sampling time offset (STO)
            sto_est = est_sto(sig_shifts, dists, dists_ref) - offset

            # Calculate the estimation error
            sto = (example['sto'][f'node_{node_id}']
                   - example['sto'][f'node_{ref_node_id}'])
            errors[3*ex_id+cnt] = sto - sto_est
            print(f'node {node_id}: error = '
                  f'{np.round(errors[3*ex_id+cnt], 2)} samples')
    print(f'\nRMSE = {np.round(np.sqrt(np.mean(errors**2)), 2)} samples')
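A hedged call sketch for this evaluation (the database path and both threshold values are hypothetical placeholders, not values from the original code):

eval_estimator(
    db_json='async_wasn.json',  # hypothetical database description file
    scenario='Scenario-1',
    ref_node_id=0,
    vad_threshold=0.01,         # hypothetical VAD threshold
    activity_threshold=8192,    # hypothetical activity count per segment
)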
Example #7
def generate_audio(example,
                   node_id,
                   std_sensor_noise,
                   sig_len=None,
                   single_channel=False,
                   max_sro=400):
    """
    Generates the audio signal recorded by a sensor node using the given
    simulation description. This function is typically used as a map function
    in combination with the lazy_dataset package.

    Args:
        example:
            Example dictionary specifying how to generate the audio signal
        node_id:
            Key (e.g., 'node_0') identifying the sensor node for which the
            recorded signal should be simulated.
        std_sensor_noise:
            Standard deviation of the simulated sensor noise.
        sig_len:
            Length (in samples) of the signal to be created.
        single_channel:
            Boolean specifying whether only a single microphone channel
            should be simulated. If True, only one channel is simulated;
            otherwise, all microphone channels are simulated.
        max_sro:
            Expected maximum value (in parts per million) of the sampling
            rate offset (SRO).
    Returns:
        Example dictionary with the simulated audio signal added under
        the key 'audio_data'.
    """
    min_sto = np.minimum(np.min([sto for sto in example['sto'].values()]), 0)
    stos = {node_id: sto - min_sto for node_id, sto in example['sto'].items()}
    src_diary = example['src_diary']

    if single_channel:
        num_channels = 1
    else:
        num_channels = len(load_audio(src_diary[0]['rirs']['node_0']))

    if sig_len is not None:
        min_sig_len = sig_len
        max_sro_delay = int(np.ceil(max_sro * 1e-6 * sig_len))
        min_sig_len += \
            max_sro_delay + np.max([np.abs(sto) for sto in stos.values()])
        if min_sig_len > example['src_diary'][-1]['offset']:
            min_sig_len = example['src_diary'][-1]['offset']
            warnings.warn(
                'Specified signal length is larger than the maximum signal '
                'length defined by the source diary. The signal length is '
                'set to the maximum signal length defined by the source '
                'diary.')
    else:
        min_sig_len = src_diary[-1]['offset']

    audio_data = np.zeros((num_channels, min_sig_len))

    for source in src_diary:
        onset = source['onset']
        clean_audio = load_audio(source['audio_path'])
        rirs = load_audio(source['rirs'][node_id])
        if single_channel:
            rirs = rirs[0, None]
        reverberant_audio = reverb_signal(clean_audio, rirs)
        if onset + reverberant_audio.shape[-1] > audio_data.shape[-1]:
            missing_len = \
                onset + reverberant_audio.shape[-1] - audio_data.shape[-1]
            audio_data = \
                np.pad(audio_data, ((0, 0), (0, missing_len)), mode='constant')
            audio_data[:, onset:onset + reverberant_audio.shape[-1]] += \
                reverberant_audio
            break
        audio_data[:, onset:onset + reverberant_audio.shape[-1]] += \
            reverberant_audio

    audio_data = audio_data[:, stos[node_id]:]
    sro = example['sro'][node_id]
    if isinstance(sro, str):
        sro = load_binary(sro)
    audio_data = np.asarray([sim_sro(ch, sro) for ch in audio_data])
    audio_data += np.random.normal(0, std_sensor_noise, size=audio_data.shape)

    if sig_len is not None:
        audio_data = audio_data[:, :sig_len]

    if 'audio_data' in example.keys():
        example['audio_data'][node_id] = audio_data
    else:
        example['audio_data'] = {node_id: audio_data}
    return example
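As the docstring suggests, this function lends itself to being mapped over a dataset. A minimal sketch, assuming a lazy_dataset-style dataset with a .map method (the dataset construction and parameter values are hypothetical):

import functools

db = get_data_set()  # hypothetical dataset of simulation example dicts
db = db.map(functools.partial(
    generate_audio,
    node_id='node_0',
    std_sensor_noise=0.01,  # hypothetical sensor-noise level
    sig_len=160000,         # hypothetical length: 10 s at 16 kHz
))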