def _build_stft_feature(self): """ Compute STFT of waveform and slice the STFT in segment with the right length to feed the network. """ stft_name = self.stft_name spec_name = self.spectrogram_name if stft_name not in self._features: # pad input with a frame of zeros waveform = tf.concat([ tf.zeros((self._frame_length, self._n_channels)), self._features['waveform'] ], 0) stft_feature = tf.transpose( stft(tf.transpose(waveform), self._frame_length, self._frame_step, window_fn=lambda frame_length, dtype: (hann_window(frame_length, periodic=True, dtype=dtype)), pad_end=True), perm=[1, 2, 0]) self._features[f'{self._mix_name}_stft'] = stft_feature if spec_name not in self._features: self._features[spec_name] = tf.abs( pad_and_partition(self._features[stft_name], self._T))[:, :, :self._F, :]
def get_stft(
    y,
    return_magnitude=True,
    frame_length=4096,
    frame_step=1024,
    T=512,
    F=1024,
):
    """
    Compute the STFT of ``y`` and, optionally, its partitioned magnitude.

    ``y`` gets a trailing axis added and is front-padded with
    ``frame_length`` zeros before being passed to ``stft`` with a periodic
    Hann window and ``pad_end=True``. The result is transposed with
    ``perm=[1, 2, 0]``.

    Parameters:
        y: Input signal. NOTE(review): the ``(frame_length, 1)`` zero pad
            implies a 1-D, single-channel signal - confirm with callers.
        return_magnitude: When true, also return the magnitude segments.
        frame_length: STFT frame length; also the amount of zero padding.
        frame_step: Step between successive frames.
        T: Segment length handed to ``pad_and_partition``.
        F: Number of leading entries kept along the third axis of the
            partitioned magnitude.

    Returns:
        ``(stft_feature, D)`` when ``return_magnitude`` is true, otherwise
        just ``stft_feature``.
    """

    def periodic_hann(window_size, dtype):
        return hann_window(window_size, periodic=True, dtype=dtype)

    silence = tf.zeros((frame_length, 1))
    padded = tf.concat([silence, tf.expand_dims(y, -1)], 0)
    raw_stft = stft(
        tf.transpose(padded),
        frame_length,
        frame_step,
        window_fn=periodic_hann,
        pad_end=True,
    )
    stft_feature = tf.transpose(raw_stft, perm=[1, 2, 0])

    if not return_magnitude:
        return stft_feature

    segments = pad_and_partition(stft_feature, T)
    D = tf.abs(segments)[:, :, :F, :]
    return stft_feature, D
def compute_spectrogram_tf(
    waveform: tf.Tensor,
    frame_length: int = 2048,
    frame_step: int = 512,
    spec_exponent: float = 1.0,
    window_exponent: float = 1.0,
) -> tf.Tensor:
    """
    Compute a magnitude / power spectrogram of a waveform.

    Parameters:
        waveform (tensorflow.Tensor):
            Input waveform as a `(n_samples x n_channels)` tensor.
        frame_length (int):
            Length of one STFT frame.
        frame_step (int):
            Hop between successive frames.
        spec_exponent (float):
            Exponent applied to the STFT magnitude (usually 1 for a
            magnitude spectrogram, 2 for a power spectrogram).
        window_exponent (float):
            Exponent applied to the Hann window (may help to obtain
            perfect STFT / iSTFT reconstruction).

    Returns:
        tensorflow.Tensor:
            Spectrogram as a `(T x F x n_channels)` tensor.
    """

    def exponentiated_hann(size, dtype):
        # The dtype handed in by the caller is not used; the window is
        # always built with the waveform's own dtype.
        base_window = hann_window(size, periodic=True, dtype=waveform.dtype)
        return base_window ** window_exponent

    channels_first = tf.transpose(waveform)
    transformed = stft(
        channels_first,
        frame_length,
        frame_step,
        window_fn=exponentiated_hann,
    )
    stft_tensor: tf.Tensor = tf.transpose(transformed, perm=[1, 2, 0])
    magnitude = tf.abs(stft_tensor)
    return magnitude ** spec_exponent
# subprocess.Popen is not instantaneous so we need to call it a bit # before than when we actually need its effects. # In this case, we need the performance mode for preprocessing the audio. subprocess.Popen(performance) frames += stream.read(chunk) stream.stop_stream() ###### Resample #frame = np.frombuffer(io.BytesIO(b''.join(frames)).getbuffer(), dtype=np.uint16) frame = frombuffer(BytesIO(frames).getbuffer(), dtype=uint16) audio = resample_poly(frame, 1, downsample) tf_audio = convert_to_tensor(audio, dtype=float32) / 32767 - 1 ###### STFT stft__ = stft(tf_audio, frame_length=frame_length, frame_step=frame_step, fft_length=frame_length) spectrogram = tfabs(stft__) ###### MFCCs mel_spectrogram = tensordot(spectrogram, linear_to_mel_weight_matrix, 1) log_mel_spectrogram = log(mel_spectrogram + 1e-6) mfccs = mfccs_from_log_mel_spectrograms(log_mel_spectrogram)[..., :10] ###### Saving the output f_res = f'{output_folder}/mfccs{n}.bin' mfccs_ser = serialize_tensor(mfccs) write_file(f_res, mfccs_ser) ###### Printing execution time t_savefile = t.time() print(t_savefile - start)
def _apply_stft_to_input(self):
    """
    Compute one STFT per configured resolution and concatenate the results.

    For every ``(fft_size, frame_size)`` pair taken from ``self._fft_sizes``
    and ``self._frame_sizes`` the batch-major input is padded at the end of
    axis 1 when the frame is longer than ``self._reference_frame_size``,
    transformed with ``stft`` and reshaped so that the last two axes are
    merged into one. Every result after the first is cropped along axis 1
    to the length of the first result, and all results are concatenated
    along axis 2.

    NOTE(review): the input is assumed to be ``(batch, time, channel)``;
    this follows from the ``[0, 2, 1]`` transpose before ``stft`` but
    should be confirmed against the layer's input definition.

    Returns:
        The concatenated STFT tensor.

    Raises:
        AssertionError: if ``self._window`` is not one of ``"hanning"``,
            ``"blackman"``, ``"None"`` or ``"ones"``.
    """
    from returnn.tf.util.basic import get_shape

    def _crop_stft_output_to_reference_frame_size_length(
            channel_concatenated_stft, crop_size):
        """Keep the first ``crop_size`` entries along axis 1."""
        full_shape = get_shape(channel_concatenated_stft)
        return tf.slice(
            channel_concatenated_stft,
            [0, 0, 0],
            [full_shape[0], crop_size, full_shape[2]])

    def _pad_time_signal(signal, frame_size):
        """Append near-zero samples along axis 1 for long frames."""
        extra_samples = frame_size - self._reference_frame_size
        if extra_samples <= 0:
            return signal
        # Pad the signal that was passed in rather than a variable of the
        # enclosing loop, so every resolution starts from the same input.
        signal_shape = get_shape(signal)
        padding = tf.ones(
            [signal_shape[0], extra_samples, signal_shape[2]]) * 1e-7
        return tf.concat([signal, padding], axis=1)

    input_placeholder = self.input_data.get_placeholder_as_batch_major()
    channel_wise_stft_res_list = []
    for fft_size, frame_size in zip(self._fft_sizes, self._frame_sizes):

        def _get_window(window_length, dtype):
            """Window function handed to ``stft`` for this resolution."""
            if self._window == "hanning":
                try:
                    # noinspection PyPackageRequirements
                    from tensorflow.signal import hann_window
                except ImportError:
                    # noinspection PyPackageRequirements,PyUnresolvedReferences
                    from tensorflow.contrib.signal import hann_window
                return hann_window(window_length, dtype=dtype)
            if self._window == "blackman":
                # noinspection PyPackageRequirements
                import scipy.signal
                # NOTE(review): built from the loop's `frame_size` rather
                # than `window_length`, presumably because the STFT op may
                # pass the length as a tensor while scipy needs a plain int.
                return tf.constant(
                    scipy.signal.windows.blackman(frame_size),
                    dtype=tf.float32)
            if self._window == "None" or self._window == "ones":
                return tf.ones((window_length, ), dtype=dtype)
            # Raised explicitly so the check also holds when Python runs
            # with assertions disabled.
            raise AssertionError(
                "Window was not parsed correctly: {}".format(self._window))

        input_signal = _pad_time_signal(input_placeholder, frame_size)

        # Only the rfft path produces an output for this resolution.
        if not self._use_rfft:
            continue

        try:
            # noinspection PyPackageRequirements
            from tensorflow.signal import stft
        except ImportError:
            # noinspection PyPackageRequirements,PyUnresolvedReferences
            from tensorflow.contrib.signal import stft

        channel_wise_stft = stft(
            signals=tf.transpose(input_signal, [0, 2, 1]),
            frame_length=frame_size,
            frame_step=self._frame_shift,
            fft_length=fft_size,
            window_fn=_get_window,
            pad_end=self._pad_last_frame)
        channel_wise_stft = tf.transpose(channel_wise_stft, [0, 2, 1, 3])

        # Merge the last two axes into a single feature axis.
        batch_dim = tf.shape(channel_wise_stft)[0]
        time_dim = tf.shape(channel_wise_stft)[1]
        concat_feature_dim = (
            channel_wise_stft.shape[2] * channel_wise_stft.shape[3])
        channel_concatenated_stft = tf.reshape(
            channel_wise_stft, (batch_dim, time_dim, concat_feature_dim))

        if channel_wise_stft_res_list:
            # Align the length of axis 1 with the first resolution.
            reference_length = get_shape(channel_wise_stft_res_list[0])[1]
            channel_concatenated_stft = (
                _crop_stft_output_to_reference_frame_size_length(
                    channel_concatenated_stft, reference_length))
        channel_wise_stft_res_list.append(channel_concatenated_stft)

    output_placeholder = tf.concat(channel_wise_stft_res_list, axis=2)
    return output_placeholder