Example 1
    def __call__(self, example):
        def maybe_add_channel(signal):
            # Give single-channel signals a leading channel axis so that all
            # signals share a (channels, samples) layout.
            if signal.ndim == 1:
                return np.expand_dims(signal, axis=0)
            elif signal.ndim == 2:
                return signal
            else:
                raise ValueError('Either the signal has ndim 1 or 2',
                                 signal.shape)

        # Complex STFT of the observation and its float32 magnitude as
        # network input; frames are on the second-to-last axis.
        example[M_K.OBSERVATION_STFT] = self.stft(
            maybe_add_channel(example[OBSERVATION]))
        example[M_K.OBSERVATION_ABS] = np.abs(
            example[M_K.OBSERVATION_STFT]).astype(np.float32)
        example[NUM_FRAMES] = example[M_K.OBSERVATION_STFT].shape[-2]
        if SPEECH_IMAGE in example and NOISE_IMAGE in example:
            # Mask targets can only be computed when the clean speech and
            # noise images are part of the example.
            speech = self.stft(maybe_add_channel(example[SPEECH_IMAGE]))
            noise = self.stft(maybe_add_channel(example[NOISE_IMAGE]))
            target_mask, noise_mask = biased_binary_mask(
                np.stack([speech, noise], axis=0),
                low_cut=self.opts.low_cut,
                # A negative high_cut counts back from the number of
                # frequency bins.
                high_cut=self.opts.high_cut if self.opts.high_cut >= 0 else
                speech.shape[-1] + self.opts.high_cut)
            example[M_K.SPEECH_MASK_TARGET] = target_mask.astype(np.float32)
            example[M_K.NOISE_MASK_TARGET] = noise_mask.astype(np.float32)
        return example
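
All three examples assume numpy as np, paderbox as pb, the project's key constants (K, M_K, OBSERVATION, SPEECH_IMAGE, NOISE_IMAGE, NUM_FRAMES), and a biased_binary_mask function from the surrounding codebase. As a rough sketch of what such a mask target is, the snippet below computes a plain binary mask with numpy only; the bias and the low_cut/high_cut frequency limits of the real function are deliberately left out, so this is an illustration, not the library's implementation.

import numpy as np

def simple_binary_mask(speech_stft, noise_stft):
    # Toy stand-in for biased_binary_mask: a time-frequency bin is assigned
    # to speech when the speech image is stronger than the noise image there.
    speech_power = np.abs(speech_stft) ** 2
    noise_power = np.abs(noise_stft) ** 2
    speech_mask = (speech_power > noise_power).astype(np.float32)
    noise_mask = 1.0 - speech_mask
    return speech_mask, noise_mask

# Tiny smoke test with random "STFTs" of shape (channels, frames, bins).
rng = np.random.default_rng(0)
speech = rng.normal(size=(1, 10, 513)) + 1j * rng.normal(size=(1, 10, 513))
noise = rng.normal(size=(1, 10, 513)) + 1j * rng.normal(size=(1, 10, 513))
speech_mask, noise_mask = simple_binary_mask(speech, noise)
assert speech_mask.shape == speech.shape
assert np.allclose(speech_mask + noise_mask, 1.0)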
Example 2
def change_example_structure(example):
    # Convert a database example into the dict of STFT features and mask
    # targets that the network consumes.
    stft = pb.transform.stft
    audio_data = example[K.AUDIO_DATA]
    net_input = dict()
    net_input['observation_stft'] = stft(audio_data[K.OBSERVATION]).astype(
        np.complex64)
    net_input['observation_abs'] = np.abs(
        net_input['observation_stft']).astype(np.float32)
    speech_image = stft(audio_data[K.SPEECH_IMAGE])
    noise_image = stft(audio_data[K.NOISE_IMAGE])
    # Binary mask targets are derived from the clean speech and noise STFTs.
    target_mask, noise_mask = biased_binary_mask(
        np.stack([speech_image, noise_image], axis=0))
    net_input['speech_mask_target'] = target_mask.astype(np.float32)
    net_input['noise_mask_target'] = noise_mask.astype(np.float32)
    return net_input
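
A small sanity check can make the contract of change_example_structure explicit. The helper below is not part of any library; it only restates the dtypes and shapes that the function above produces.

import numpy as np

def check_net_input(net_input):
    # The STFT is stored as complex64, the magnitude and both mask targets
    # as float32, and the two mask targets are expected to share a shape.
    assert net_input['observation_stft'].dtype == np.complex64
    assert net_input['observation_abs'].dtype == np.float32
    assert net_input['speech_mask_target'].dtype == np.float32
    assert net_input['noise_mask_target'].dtype == np.float32
    assert (net_input['speech_mask_target'].shape
            == net_input['noise_mask_target'].shape)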
Example 3
def prepare_data(example):
    # STFT with a shift of 256 samples and a size of 1024 samples.
    stft = pb.transform.STFT(shift=256, size=1024)
    net_input = dict()
    audio_data = dict()
    for key in ['observation', 'speech_image', 'noise_image']:
        # Load every channel of the signal from disk and transform the
        # stacked array to the STFT domain.
        audio_data[key] = stft(
            np.array([
                pb.io.load_audio(audio) for audio in example['audio_path'][key]
            ]))
    net_input['observation_abs'] = np.abs(audio_data['observation']).astype(
        np.float32)
    # Binary mask targets from the clean speech and noise images.
    target_mask, noise_mask = biased_binary_mask(
        np.stack([audio_data['speech_image'], audio_data['noise_image']],
                 axis=0))
    net_input['speech_mask_target'] = target_mask.astype(np.float32)
    net_input['noise_mask_target'] = noise_mask.astype(np.float32)
    return net_input
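
prepare_data expects an example whose 'audio_path' entry maps each signal key to a list of per-channel audio files. The layout below is a hypothetical illustration of that input; the file names are placeholders and the call is commented out so the snippet stays runnable without the audio files.

# Hypothetical database example: each signal key maps to one audio file per
# microphone channel (placeholder file names).
example = {
    'audio_path': {
        'observation':  ['obs_ch1.wav',    'obs_ch2.wav'],
        'speech_image': ['speech_ch1.wav', 'speech_ch2.wav'],
        'noise_image':  ['noise_ch1.wav',  'noise_ch2.wav'],
    },
}

# net_input = prepare_data(example)
# net_input['observation_abs'] would then hold the magnitude STFT, typically
# with shape (channels, frames, frequency_bins); the two mask targets share
# the shape of the speech/noise STFTs.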