def generate(self, index=None, n_sample=1):
    """
    :param index: a tuple of 2 entries (source_stream_idx, utt_idx) that
        specifies which clean source utterance to simulate from. If not
        provided, the simulator samples one at random.
    :return: a list of training samples
    """
    seg_len = self.config.segment_config['seglen']
    seg_shift = self.config.segment_config['segshift']
    analyzer = self.single_source_simulator.analyzer
    # minimum waveform length needed to produce seg_len feature frames
    min_len_sample = seg_len * analyzer.frame_shift + analyzer.frame_overlap

    if index is None:
        # if no index is given, let the simulator do random sampling
        mixed_wav, early_reverb, mask, config = self.single_source_simulator.simulate(
            min_length=min_len_sample, normalize_gain=self.config.gain_norm)
    else:
        # if index is given, use the specified sentence
        assert len(index) == 2
        sent_config = dict()
        sent_config['n_source'] = 1
        sent_config['source_stream_idx'] = index[0]
        sent_config['source_utt_id'] = [
            self.single_source_simulator.speech_streams[index[0]].utt_id[
                index[1]]
        ]
        sent_config['source_speakers'] = [
            self.single_source_simulator.speech_streams[index[0]].utt2spk[
                sent_config['source_utt_id'][0]]
        ]
        mixed_wav, early_reverb, mask, config = self.single_source_simulator.simulate(
            sent_config=sent_config, normalize_gain=self.config.gain_norm)

    speech_stream = self.single_source_simulator.speech_streams[
        config['source_stream_idx']]
    # extract 80-dim log filterbank features from the first channel
    fbank = feature.feature.logfbank80(mixed_wav[:, 0])
    utt_id = config['source_utt_id']

    if self.config.load_label:
        _, label = speech_stream.read_label_with_id(config['source_utt_id'])
        frame_label = label['label'][0].T
        if 'aux_label' in label:
            aux_label = label['aux_label']
        else:
            aux_label = np.zeros((1, 1))
        if np.abs(frame_label.shape[0] - fbank.shape[0]) > 5:
            print("DataGeneratorTrain::generate: Warning: filterbank and label "
                  "have significantly different numbers of frames.")
        # truncate features and labels to a common number of frames
        n_fr = np.minimum(frame_label.shape[0], fbank.shape[0])
        frame_label = frame_label[:n_fr, :]
        fbank = fbank[:n_fr, :]

    if self.config.segment_config['sequence_mode']:
        if self.config.load_label:
            train_samples = [(fbank, utt_id, frame_label, aux_label)]
        else:
            train_samples = [(fbank, utt_id)]
    else:
        # chop the utterance into fixed-length segments;
        # note that aux_label is only returned in sequence mode
        fbank_seg = utils.utt2seg(fbank.T, seg_len, seg_shift)
        if self.config.load_label:
            label_seg = utils.utt2seg(frame_label.T, seg_len, seg_shift)
            train_samples = [(fbank_seg[i].T, utt_id, label_seg[i].T)
                             for i in range(len(label_seg))]
        else:
            train_samples = [(fbank_seg[i].T, utt_id)
                             for i in range(len(fbank_seg))]

    if self.DEBUG:
        import matplotlib.pyplot as plt
        n_sample = len(train_samples)
        for i in range(n_sample):
            plt.subplot(n_sample, 2, i * 2 + 1)
            simu.imagesc(train_samples[i][0].T)
            plt.subplot(n_sample, 2, i * 2 + 2)
            plt.plot(train_samples[i][2])  # assumes load_label is True

    return train_samples
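
# Usage sketch (hypothetical; assumes `gen` is a constructed generator whose
# config loads labels and has sequence_mode disabled, so fixed-length segments
# come back). The exact segment shape depends on utils.utt2seg:
#
#     >>> samples = gen.generate(index=(0, 10))   # stream 0, utterance 10
#     >>> fbank_seg, utt_id, label_seg = samples[0]
#     >>> fbank_seg.shape[0] == gen.config.segment_config['seglen']
#     True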
def get_mask_from_parallel_data(self,
                                clean,
                                distorted,
                                vad=None,
                                use_soft_mask=False,
                                threshold=0.5,
                                clean_mask_type='count',
                                power_percentage_threshold=0.997):
    clean_spec = self.analyzer.analyze(clean)
    power_clean = np.abs(clean_spec)**2
    n_fr = clean_spec.shape[1]
    distorted_spec = self.analyzer.analyze(distorted)
    noise_spec = distorted_spec - clean_spec

    if use_soft_mask:
        # soft mask: clean-to-distorted power ratio, capped at 1
        # (eps floor guards against empty bins, as in the hard-mask branch)
        power_distorted = np.abs(distorted_spec)**2
        mask_snr = np.minimum(
            1, power_clean / np.maximum(power_distorted,
                                        np.finfo(np.float32).eps))
    else:
        # binary mask: keep time-frequency bins whose SNR (dB) exceeds threshold
        power_noise = np.abs(noise_spec)**2
        snr = 10 * np.log10(
            power_clean / np.maximum(power_noise, np.finfo(np.float32).eps))
        mask_snr = snr > threshold

    if vad is not None:
        vad_clean = vad > 0.5
        # dilate the frame-level VAD decision with a 5-frame window
        vad_clean = np.convolve(vad_clean.astype(np.float32),
                                np.ones(5), mode='same') > 0
    elif n_fr > 30:
        # use energy based VAD, to be implemented
        vad_clean = np.ones((1, n_fr))
    else:
        vad_clean = np.ones((1, n_fr))

    mask_clean = self.get_mask_from_clean(
        power_clean,
        clean_mask_type=clean_mask_type,
        power_percentage_threshold=power_percentage_threshold)

    mask_combined = mask_snr * mask_clean
    mask_combined_vad = mask_combined.astype(
        np.float32) * vad_clean.astype(np.float32)

    if 0:  # flip to 1 to visualize the masks
        import matplotlib.pyplot as plt
        plt.subplot(231)
        simulation.imagesc(np.log(power_clean), title="Clean log spectrum")
        plt.plot(vad_clean.transpose() * power_clean.shape[0] * 0.8)
        plt.show()
        plt.subplot(232)
        simulation.imagesc(np.log(np.abs(distorted_spec)**2),
                           title="Distorted log spectrum")
        plt.subplot(233)
        simulation.imagesc(mask_snr.astype(float), title="SNR based mask")
        plt.subplot(234)
        simulation.imagesc(mask_clean.astype(float), title="Clean mask")
        plt.subplot(235)
        simulation.imagesc(mask_combined.astype(float), title="Combined mask")
        plt.subplot(236)
        simulation.imagesc(mask_combined_vad.astype(float),
                           title="Combined mask with VAD")

    return mask_combined_vad
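
# Minimal worked sketch of the binary SNR-mask rule above (standalone numpy,
# hypothetical values; not part of this class). With the default 0.5 dB
# threshold, a bin at ~6 dB SNR is kept and a bin at -20 dB is dropped:
#
#     >>> import numpy as np
#     >>> power_clean = np.array([[4.0, 0.01]])
#     >>> power_noise = np.array([[1.0, 1.0]])
#     >>> snr = 10 * np.log10(power_clean / np.maximum(power_noise, 1e-7))
#     >>> snr > 0.5
#     array([[ True, False]])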
def generate(self, index=None):
    """
    :param index: a tuple of 2 entries (source_stream_idx, utt_idx) that
        specifies which clean source file to use for simulation. If not
        provided, one clean source file is chosen at random from the clean
        source streams.
    :return: a list of training samples
    """
    seg_len = self._config.segment_config['seglen']
    seg_shift = self._config.segment_config['segshift']

    if index is None:
        # if no index is given, let the simulator do random sampling
        # sample a clean speech stream
        source_stream_idx = np.random.choice(
            np.arange(len(self._source_streams)),
            replace=True,
            p=self._source_streams_prior)
        # sample a clean speech utterance
        _, utt_id, source_wav, _ = self._source_streams[
            source_stream_idx].sample_spk_and_utt(n_spk=1,
                                                  n_utt_per_spk=1,
                                                  load_data=True)
    else:
        # if index is given, use the specified sentence
        assert len(index) == 2
        source_stream_idx = index[0]
        utt_id = [self._source_streams[source_stream_idx].utt_id[index[1]]]
        _, _, source_wav, _ = self._source_streams[
            source_stream_idx].read_utt_with_id(utt_id, load_data=True)

    if np.random.random() > self._config.simulation_prob:
        # skip simulation and use the clean source as-is
        simulated_wav = source_wav[0]
    else:
        if self._noise_streams is None:
            noise_wavs = None
        else:
            noise_stream_idx = np.random.choice(
                np.arange(len(self._noise_streams)),
                replace=True,
                p=self._noise_streams_prior)
            noise_wavs, noise_files = self._noise_streams[
                noise_stream_idx].sample_data()

        if self._rir_streams is None:
            source_rir = None
            noise_rirs = None
        else:
            rir_stream_idx = np.random.choice(
                np.arange(len(self._rir_streams)),
                replace=True,
                p=self._rir_streams_prior)
            # one RIR for the source plus one per directional noise
            n_rir = 1 if noise_wavs is None else 1 + len(noise_wavs)
            rir_wav, room_size, array_position, positions, t60 = \
                self._rir_streams[rir_stream_idx].sample_rir(n_rir)
            source_rir = rir_wav[0]
            noise_rirs = rir_wav[1:]

        simulated_wav, _, mask, config = self._single_source_simulator(
            source_wav[0],
            dir_noise_wavs=noise_wavs,
            source_rir=source_rir,
            dir_noise_rirs=noise_rirs,
            gen_mask=False,
            normalize_gain=self._config.gain_norm)

    # extract log filterbank features from the first channel
    fbank = self._logfbank_extractor(simulated_wav[:, 0])

    if self._config.load_label:
        _, label = self._source_streams[
            source_stream_idx].read_label_with_id(utt_id)
        frame_label = label['label'][0].T
        if 'aux_label' in label:
            aux_label = label['aux_label']
        else:
            aux_label = np.zeros((1, 1))
        if np.abs(frame_label.shape[0] - fbank.shape[0]) > 5:
            print("DataGeneratorTrain::generate: Warning: filterbank and label "
                  "have significantly different numbers of frames.")
        # truncate features and labels to a common number of frames
        n_fr = np.minimum(frame_label.shape[0], fbank.shape[0])
        frame_label = frame_label[:n_fr, :]
        fbank = fbank[:n_fr, :]

    if self._config.use_cmn:
        fbank = reader.preprocess.cmn(fbank, axis=0)

    if self._config.segment_config['sequence_mode']:
        if self._config.load_label:
            train_samples = [(fbank, utt_id, frame_label, aux_label)]
        else:
            train_samples = [(fbank, utt_id)]
    else:
        # chop the utterance into fixed-length segments
        fbank_seg = _utt2seg(fbank.T, seg_len, seg_shift)
        if len(fbank_seg) == 0:
            return []
        if self._config.load_label:
            label_seg = _utt2seg(frame_label.T, seg_len, seg_shift)
            train_samples = [(fbank_seg[i].T, utt_id, label_seg[i].T)
                             for i in range(len(label_seg))]
        else:
            train_samples = [(fbank_seg[i].T, utt_id)
                             for i in range(len(fbank_seg))]

    if self._DEBUG:
        import matplotlib.pyplot as plt
        n_sample = len(train_samples)
        for i in range(n_sample):
            plt.subplot(n_sample, 2, i * 2 + 1)
            imagesc(train_samples[i][0].T)
            plt.subplot(n_sample, 2, i * 2 + 2)
            plt.plot(train_samples[i][2])  # assumes load_label is True

    return train_samples
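
# Usage sketch (hypothetical; assumes `gen` is a constructed generator with
# noise and RIR streams attached and sequence_mode enabled). With index=None,
# the source utterance, noise, and RIR are all drawn at random:
#
#     >>> samples = gen.generate()
#     >>> fbank, utt_id = samples[0][:2]
#     >>> fbank.shape[1]   # feature dim, e.g. 80 for an 80-dim log-fbank extractor
#     80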