Ejemplo n.º 1
0
    def generate(self, index=None, n_sample=1):
        """Simulate one utterance and convert it into training samples.

        :param index: optional pair (source_stream_idx, utt_idx) selecting the clean utterance to simulate.
            When None, the single-source simulator chooses the utterance itself.
        :param n_sample: accepted for interface compatibility; not used by this method.
        :return: a list of tuples. In sequence mode the list holds one ``(fbank, utt_id)`` tuple, or
            ``(fbank, utt_id, frame_label, aux_label)`` when labels are loaded. Otherwise it holds one
            ``(fbank_segment, utt_id)`` or ``(fbank_segment, utt_id, label_segment)`` tuple per segment.
        """
        segment_config = self.config.segment_config
        seg_len = segment_config['seglen']
        seg_shift = segment_config['segshift']
        simulator = self.single_source_simulator

        if index is None:  # no index given: let the simulator do random sampling
            analyzer = simulator.analyzer
            # Ask for a waveform long enough to cover at least one full segment.
            min_len_sample = seg_len * analyzer.frame_shift + analyzer.frame_overlap
            mixed_wav, _, _, config = simulator.simulate(
                min_length=min_len_sample,
                normalize_gain=self.config.gain_norm)
        else:  # index given: simulate the specified sentence
            assert len(index) == 2
            stream_idx, utt_idx = index[0], index[1]
            stream = simulator.speech_streams[stream_idx]
            source_utt_id = stream.utt_id[utt_idx]
            sent_config = {
                'n_source': 1,
                'source_stream_idx': stream_idx,
                'source_utt_id': [source_utt_id],
                'source_speakers': [stream.utt2spk[source_utt_id]],
            }
            mixed_wav, _, _, config = simulator.simulate(
                sent_config=sent_config, normalize_gain=self.config.gain_norm)

        # Features are extracted from the first channel only.
        fbank = feature.feature.logfbank80(mixed_wav[:, 0])
        utt_id = config['source_utt_id']
        load_label = self.config.load_label

        if load_label:
            speech_stream = simulator.speech_streams[config['source_stream_idx']]
            _, label = speech_stream.read_label_with_id(utt_id)
            frame_label = label['label'][0].T
            aux_label = label['aux_label'] if 'aux_label' in label else np.zeros((1, 1))

            if np.abs(frame_label.shape[0] - fbank.shape[0]) > 5:
                print(
                    "DataGeneratorTrain::generate: Warning: filterbank and label have significantly different number of frames. "
                )

            # Truncate both to the common number of frames so they stay aligned.
            n_fr = np.minimum(frame_label.shape[0], fbank.shape[0])
            frame_label = frame_label[:n_fr, :]
            fbank = fbank[:n_fr, :]

        if segment_config['sequence_mode']:
            if load_label:
                return [(fbank, utt_id, frame_label, aux_label)]
            return [(fbank, utt_id)]

        fbank_seg = utils.utt2seg(fbank.T, seg_len, seg_shift)
        if load_label:
            label_seg = utils.utt2seg(frame_label.T, seg_len, seg_shift)
            train_samples = [(feat.T, utt_id, lab.T)
                             for feat, lab in zip(fbank_seg, label_seg)]
        else:
            train_samples = [(feat.T, utt_id) for feat in fbank_seg]

        if self.DEBUG:
            import matplotlib.pyplot as plt
            n_plot = len(train_samples)
            for i, sample in enumerate(train_samples):
                plt.subplot(n_plot, 2, i * 2 + 1)
                simu.imagesc(sample[0].T)
                # Samples carry a label only when labels were loaded.
                if len(sample) > 2:
                    plt.subplot(n_plot, 2, i * 2 + 2)
                    plt.plot(sample[2])

        return train_samples
Ejemplo n.º 2
0
    def get_mask_from_parallel_data(self,
                                    clean,
                                    distorted,
                                    vad=None,
                                    use_soft_mask=False,
                                    threshold=0.5,
                                    clean_mask_type='count',
                                    power_percentage_threshold=0.997):
        """Build a time-frequency mask from a clean/distorted signal pair.

        The result is the product of three factors: an SNR-based mask, the mask returned by
        ``self.get_mask_from_clean`` for the clean power spectrum, and a per-frame voice activity gate.

        :param clean: clean signal, passed to ``self.analyzer.analyze``.
        :param distorted: distorted version of ``clean``; its spectrum is subtracted from the clean spectrum,
            so both spectra must have the same shape.
        :param vad: optional voice activity values, assumed to hold one value per frame (frames are axis 1 of
            the spectrum). Frames with a value above 0.5 are kept. When None, every frame is kept.
        :param use_soft_mask: if True, the SNR mask is ``min(1, clean_power / distorted_power)``; otherwise it
            is a binary mask that is set where the SNR in dB exceeds ``threshold``.
        :param threshold: SNR threshold in dB for the binary mask (ignored for the soft mask).
        :param clean_mask_type: forwarded to ``self.get_mask_from_clean``.
        :param power_percentage_threshold: forwarded to ``self.get_mask_from_clean``.
        :return: float32 array, the combined mask gated by the voice activity decision.
        """
        clean_spec = self.analyzer.analyze(clean)
        distorted_spec = self.analyzer.analyze(distorted)
        power_clean = np.abs(clean_spec)**2
        n_fr = clean_spec.shape[1]

        if use_soft_mask:
            power_distorted = np.abs(distorted_spec)**2
            with np.errstate(divide='ignore', invalid='ignore'):
                ratio = power_clean / power_distorted
            # A bin with no energy in either signal gives 0/0; treat it as non-speech instead of NaN.
            ratio = np.where(np.isnan(ratio), 0.0, ratio)
            mask_snr = np.minimum(1, ratio)
        else:
            power_noise = np.abs(distorted_spec - clean_spec)**2
            # Floor the noise power so noise-free bins do not divide by zero.
            snr = 10 * np.log10(
                power_clean / np.maximum(power_noise,
                                         np.finfo(np.float32).eps))
            mask_snr = snr > threshold

        if vad is None:
            # No VAD supplied: keep all frames.
            vad_clean = np.ones((1, n_fr))
        else:
            # Row vector so it broadcasts over the frequency axis of the mask.
            # NOTE: the thresholded decision is applied as is; no temporal smoothing is performed.
            vad_clean = (np.asarray(vad) > 0.5).reshape(1, -1)

        mask_clean = self.get_mask_from_clean(
            power_clean,
            clean_mask_type=clean_mask_type,
            power_percentage_threshold=power_percentage_threshold)

        mask_combined = mask_snr * mask_clean

        return mask_combined.astype(np.float32) * vad_clean.astype(np.float32)
Ejemplo n.º 3
0
    def generate(self, index=None):
        """Pick one clean utterance, optionally distort it, and convert it into training samples.

        :param index: a tuple of 2 entries (source_stream_idx, utt_idx) that specifies which clean source file to
            use for simulation. If not provided, one clean source file is randomly chosen from the clean source
            streams according to their prior.
        :return: a list of training samples. In sequence mode the list holds one ``(fbank, utt_id)`` tuple, or
            ``(fbank, utt_id, frame_label, aux_label)`` when labels are loaded. Otherwise it holds one
            ``(fbank_segment, utt_id)`` or ``(fbank_segment, utt_id, label_segment)`` tuple per segment, and is
            empty when the utterance yields no segment.
        """
        segment_config = self._config.segment_config
        seg_len = segment_config['seglen']
        seg_shift = segment_config['segshift']

        if index is None:  # no index given: sample a stream, then an utterance from it
            source_stream_idx = np.random.choice(
                np.arange(len(self._source_streams)),
                replace=True,
                p=self._source_streams_prior)
            source_stream = self._source_streams[source_stream_idx]
            _, utt_id, source_wav, _ = source_stream.sample_spk_and_utt(
                n_spk=1, n_utt_per_spk=1, load_data=True)
        else:  # index given: use the specified sentence
            assert len(index) == 2
            source_stream_idx = index[0]
            source_stream = self._source_streams[source_stream_idx]
            utt_id = [source_stream.utt_id[index[1]]]
            _, _, source_wav, _ = source_stream.read_utt_with_id(
                utt_id, load_data=True)

        if np.random.random() > self._config.simulation_prob:
            # Skip simulation and use the clean waveform unchanged.
            simulated_wav = source_wav[0]
        else:
            if self._noise_streams is None:
                noise_wavs = None
            else:
                noise_stream_idx = np.random.choice(
                    np.arange(len(self._noise_streams)),
                    replace=True,
                    p=self._noise_streams_prior)
                noise_wavs, _ = self._noise_streams[noise_stream_idx].sample_data()

            if self._rir_streams is None:
                source_rir = None
                noise_rirs = None
            else:
                rir_stream_idx = np.random.choice(
                    np.arange(len(self._rir_streams)),
                    replace=True,
                    p=self._rir_streams_prior)
                # One impulse response for the source plus one per noise waveform.
                n_rir = 1 if noise_wavs is None else 1 + len(noise_wavs)
                rir_wav, _room_size, _array_position, _positions, _t60 = \
                    self._rir_streams[rir_stream_idx].sample_rir(n_rir)
                source_rir = rir_wav[0]
                noise_rirs = rir_wav[1:]

            simulated_wav, _, _, _ = self._single_source_simulator(
                source_wav[0],
                dir_noise_wavs=noise_wavs,
                source_rir=source_rir,
                dir_noise_rirs=noise_rirs,
                gen_mask=False,
                normalize_gain=self._config.gain_norm)

        # Features are extracted from the first channel only.
        fbank = self._logfbank_extractor(simulated_wav[:, 0])
        load_label = self._config.load_label

        if load_label:
            _, label = source_stream.read_label_with_id(utt_id)

            frame_label = label['label'][0].T
            aux_label = label['aux_label'] if 'aux_label' in label else np.zeros((1, 1))

            if np.abs(frame_label.shape[0] - fbank.shape[0]) > 5:
                print(
                    "DataGeneratorTrain::generate: Warning: filterbank and label have significantly different number of frames. "
                )

            # Truncate both to the common number of frames so they stay aligned.
            n_fr = np.minimum(frame_label.shape[0], fbank.shape[0])
            frame_label = frame_label[:n_fr, :]
            fbank = fbank[:n_fr, :]

        if self._config.use_cmn:
            fbank = reader.preprocess.cmn(fbank, axis=0)

        if segment_config['sequence_mode']:
            if load_label:
                return [(fbank, utt_id, frame_label, aux_label)]
            return [(fbank, utt_id)]

        fbank_seg = _utt2seg(fbank.T, seg_len, seg_shift)
        if len(fbank_seg) == 0:
            return []

        if load_label:
            label_seg = _utt2seg(frame_label.T, seg_len, seg_shift)
            train_samples = [(feat.T, utt_id, lab.T)
                             for feat, lab in zip(fbank_seg, label_seg)]
        else:
            train_samples = [(feat.T, utt_id) for feat in fbank_seg]

        if self._DEBUG:
            import matplotlib.pyplot as plt
            n_plot = len(train_samples)
            for i, sample in enumerate(train_samples):
                plt.subplot(n_plot, 2, i * 2 + 1)
                imagesc(sample[0].T)
                # Samples carry a label only when labels were loaded.
                if len(sample) > 2:
                    plt.subplot(n_plot, 2, i * 2 + 2)
                    plt.plot(sample[2])

        return train_samples