def test_rdft_mel_vs_merged_rdft_mel(self, mel_non_zero_only):
    """Check the merged RDFT+mel layer against the reference results.

    A model consisting of a single `MagnitudeRDFTmel` layer is built and run
    on `self.signal`. The size of its mel weight matrix and its output are
    then compared with `self.shape_rdft_mel` and `self.rdft_mel_output`
    (presumably produced by the test fixture from the non-merged pipeline;
    confirm against the fixture setup).

    Args:
        mel_non_zero_only: forwarded to `MagnitudeRDFTmel`. When true, the
            reference mel matrix must hold more than twice as many elements
            as the merged one; otherwise both element counts must be equal.
    """
    # Merged model: input -> MagnitudeRDFTmel.
    net_input = tf.keras.Input(shape=(self.signal_size, ), batch_size=1)
    layer_kwargs = dict(
        use_tf_fft=self.use_tf_fft,
        magnitude_squared=self.magnitude_squared,
        num_mel_bins=self.num_mel_bins,
        lower_edge_hertz=self.lower_edge_hertz,
        upper_edge_hertz=self.upper_edge_hertz,
        sample_rate=self.sample_rate,
        mel_non_zero_only=mel_non_zero_only)
    net_output = magnitude_rdft_mel.MagnitudeRDFTmel(**layer_kwargs)(net_input)
    merged_model = tf.keras.Model(net_input, net_output)
    merged_model.summary()
    merged_output = merged_model.predict(self.signal)

    # layers[0] is the input layer, layers[1] is the merged RDFT+mel layer.
    merged_shape = merged_model.layers[1].mel_weight_matrix.shape
    reference_size = self.shape_rdft_mel[0] * self.shape_rdft_mel[1]
    merged_size = merged_shape[0] * merged_shape[1]

    if not mel_non_zero_only:
        self.assertEqual(reference_size, merged_size)
    else:
        # With only the non zero mel range kept, the merged matrix has to be
        # more than 2x smaller than the reference one.
        self.assertGreater(reference_size, 2 * merged_size)

    self.assertAllClose(self.rdft_mel_output, merged_output)
def test_rdft_mel_vs_merged_rdft_mel(self, mel_non_zero_only):
    """Compare separate RDFT + mel layers with the merged RDFT+mel layer.

    Two models sharing one input are built and run on the same random signal:
    a reference model (`MagnitudeRDFT` followed by `MelSpectrogram`) and a
    model with the single `MagnitudeRDFTmel` layer. Their outputs must be
    close, and the sizes of their mel weight matrices are compared.

    Args:
        mel_non_zero_only: forwarded to `MagnitudeRDFTmel`. When true, the
            reference mel matrix must hold more than twice as many elements
            as the merged one; otherwise both element counts must be equal.
    """
    signal_size = 257

    # Random input signal with a batch dimension of 1.
    signal = np.random.rand(1, signal_size)

    # Model parameters, grouped by the layers which consume them.
    mode = Modes.NON_STREAM_INFERENCE
    fft_kwargs = dict(use_tf_fft=False, magnitude_squared=False)
    mel_kwargs = dict(
        num_mel_bins=40,
        lower_edge_hertz=20.0,
        upper_edge_hertz=4000.0,
        sample_rate=16000.0)

    # Reference model: input -> MagnitudeRDFT -> MelSpectrogram.
    net_input = tf.keras.Input(shape=(signal_size, ), batch_size=1)
    magnitude = magnitude_rdft.MagnitudeRDFT(mode=mode, **fft_kwargs)(net_input)
    reference_net = mel_spectrogram.MelSpectrogram(
        mode=mode, use_tf=False, **mel_kwargs)(magnitude)
    reference_model = tf.keras.Model(net_input, reference_net)
    reference_model.summary()
    reference_output = reference_model.predict(signal)

    # Merged model: input -> MagnitudeRDFTmel.
    merged_net = magnitude_rdft_mel.MagnitudeRDFTmel(
        mel_non_zero_only=mel_non_zero_only, **fft_kwargs,
        **mel_kwargs)(net_input)
    merged_model = tf.keras.Model(net_input, merged_net)
    merged_model.summary()
    merged_output = merged_model.predict(signal)

    # The mel weights live in layers[2] of the reference model and in
    # layers[1] of the merged model (layers[0] is the input layer).
    reference_shape = reference_model.layers[2].mel_weight_matrix.shape
    merged_shape = merged_model.layers[1].mel_weight_matrix.shape
    reference_size = reference_shape[0] * reference_shape[1]
    merged_size = merged_shape[0] * merged_shape[1]

    if not mel_non_zero_only:
        self.assertEqual(reference_size, merged_size)
    else:
        # With only the non zero mel range kept, the merged matrix has to be
        # more than 2x smaller than the reference one.
        self.assertGreater(reference_size, 2 * merged_size)

    self.assertAllClose(reference_output, merged_output)
def build(self, input_shape):
    """Create the sub-layers used for speech feature extraction.

    Every stage attribute is always assigned: a disabled optional stage
    (noise, preemphasis, windowing, DCT) gets an identity `Lambda` layer
    instead of the real one.

    Args:
        input_shape: shape passed through to the parent class `build`.
    """
    super(SpeechFeatures, self).build(input_shape)

    def passthrough():
        # A new identity layer for each disabled stage.
        return tf.keras.layers.Lambda(lambda x: x)

    is_training = self.mode == Modes.TRAINING

    self.data_frame = DataFrame(
        mode=self.mode,
        inference_batch_size=self.inference_batch_size,
        frame_size=self.frame_size,
        frame_step=self.frame_step)

    # Gaussian noise is added only in training mode with non zero scale.
    self.add_noise = (
        tf.keras.layers.GaussianNoise(stddev=self.noise_scale)
        if self.noise_scale != 0.0 and is_training else passthrough())

    self.preemphasis = (
        Preemphasis(preemph=self.preemph)
        if self.preemph != 0.0 else passthrough())

    self.windowing = (
        Windowing(window_size=self.frame_size, window_type=self.window_type)
        if self.window_type is not None else passthrough())

    # If use_tf_fft is False, Real Discrete Fourier Transformation (RDFT) is
    # used, which is slower than RFFT. To increase RDFT efficiency the
    # properties of the mel spectrum are used: the range of non zero values
    # in the mel spectrum is found and RDFT is computed on it only.
    # If use_tf_fft is True, TF RFFT is used, which requires signal length
    # alignment.
    # NOTE(review): the original comment says mel_non_zero_only is disabled
    # in that case, but this method forwards it unchanged - confirm where
    # (or whether) it is disabled.
    self.mag_rdft_mel = magnitude_rdft_mel.MagnitudeRDFTmel(
        use_tf_fft=self.use_tf_fft,
        magnitude_squared=self.fft_magnitude_squared,
        num_mel_bins=self.mel_num_bins,
        lower_edge_hertz=self.mel_lower_edge_hertz,
        upper_edge_hertz=self.mel_upper_edge_hertz,
        sample_rate=self.sample_rate,
        mel_non_zero_only=self.mel_non_zero_only)

    # Input is clamped from below by log_epsilon before the logarithm.
    self.log_max = tf.keras.layers.Lambda(
        lambda x: tf.math.log(tf.math.maximum(x, self.log_epsilon)))

    self.dct = (
        DCT(num_features=self.dct_num_features)
        if self.dct_num_features != 0 else passthrough())

    self.normalizer = Normalizer(mean=self.mean, stddev=self.stddev)
def build(self, input_shape):
    """Create the sub-layers used for speech feature extraction.

    Settings are read from `self.params`. Every stage attribute is always
    assigned: a disabled optional stage (noise, preemphasis, windowing, DCT,
    spec augment) gets an identity `Lambda` layer instead of the real one.

    Args:
        input_shape: shape passed through to the parent class `build`.
    """
    super(SpeechFeatures, self).build(input_shape)

    def passthrough():
        # A new identity layer for each disabled stage.
        return tf.keras.layers.Lambda(lambda x: x)

    cfg = self.params
    is_training = self.mode == modes.Modes.TRAINING

    self.data_frame = data_frame.DataFrame(
        mode=self.mode,
        inference_batch_size=self.inference_batch_size,
        frame_size=self.frame_size,
        frame_step=self.frame_step)

    # Gaussian noise is added only in training mode with non zero scale.
    self.add_noise = (
        tf.keras.layers.GaussianNoise(stddev=self.noise_scale)
        if self.noise_scale != 0.0 and is_training else passthrough())

    self.preemphasis = (
        preemphasis.Preemphasis(preemph=cfg['preemph'])
        if cfg['preemph'] != 0.0 else passthrough())

    self.windowing = (
        windowing.Windowing(
            window_size=self.frame_size, window_type=cfg['window_type'])
        if cfg['window_type'] is not None else passthrough())

    # If use_tf_fft is False, Real Discrete Fourier Transformation (RDFT) is
    # used, which is slower than RFFT. To increase RDFT efficiency the
    # properties of the mel spectrum are used: the range of non zero values
    # in the mel spectrum is found and RDFT is computed on it only.
    # If use_tf_fft is True, TF RFFT is used, which requires signal length
    # alignment.
    # NOTE(review): the original comment says mel_non_zero_only is disabled
    # in that case, but this method forwards it unchanged - confirm where
    # (or whether) it is disabled.
    self.mag_rdft_mel = magnitude_rdft_mel.MagnitudeRDFTmel(
        use_tf_fft=cfg['use_tf_fft'],
        magnitude_squared=cfg['fft_magnitude_squared'],
        num_mel_bins=cfg['mel_num_bins'],
        lower_edge_hertz=cfg['mel_lower_edge_hertz'],
        upper_edge_hertz=cfg['mel_upper_edge_hertz'],
        sample_rate=cfg['sample_rate'],
        mel_non_zero_only=cfg['mel_non_zero_only'])

    # Input is clamped from below by log_epsilon before the logarithm.
    self.log_max = tf.keras.layers.Lambda(lambda x: tf.math.log(
        tf.math.maximum(x, self.params['log_epsilon'])))

    self.dct = (
        dct.DCT(num_features=cfg['dct_num_features'])
        if cfg['dct_num_features'] != 0 else passthrough())

    self.normalizer = normalizer.Normalizer(
        mean=self.mean, stddev=self.stddev)

    # In any inference mode there is no need to add dynamic logic in the tf
    # graph, so spec augment is created for training only.
    self.spec_augment = (
        spectrogram_augment.SpecAugment(
            time_masks_number=cfg['time_masks_number'],
            time_mask_max_size=cfg['time_mask_max_size'],
            frequency_masks_number=cfg['frequency_masks_number'],
            frequency_mask_max_size=cfg['frequency_mask_max_size'])
        if cfg['use_spec_augment'] and is_training else passthrough())
def build(self, input_shape):
    """Create the sub-layers used for speech feature extraction.

    Settings are read from `self.params`. Every stage attribute is always
    assigned: a disabled optional stage gets an identity `Lambda` layer
    instead of the real one. Random shift, random stretch/squeeze, noise,
    spec augment and spec cutout are created in training mode only.

    Args:
        input_shape: shape passed through to the parent class `build`.
    """
    super(SpeechFeatures, self).build(input_shape)

    def passthrough():
        # A new identity layer for each disabled stage.
        return tf.keras.layers.Lambda(lambda x: x)

    cfg = self.params
    is_training = self.mode == modes.Modes.TRAINING

    self.rand_shift = (
        random_shift.RandomShift(cfg['sp_time_shift_samples'])
        if cfg['sp_time_shift_samples'] != 0.0 and is_training
        else passthrough())

    self.rand_stretch_squeeze = (
        random_stretch_squeeze.RandomStretchSqueeze(cfg['sp_resample'])
        if cfg['sp_resample'] != 0.0 and is_training else passthrough())

    self.data_frame = data_frame.DataFrame(
        mode=self.mode,
        inference_batch_size=self.inference_batch_size,
        frame_size=self.frame_size,
        frame_step=self.frame_step,
        use_one_step=cfg['use_one_step'],
        padding=cfg['data_frame_padding'])

    # Gaussian noise is added only in training mode with non zero scale.
    self.add_noise = (
        tf.keras.layers.GaussianNoise(stddev=self.noise_scale)
        if self.noise_scale != 0.0 and is_training else passthrough())

    self.preemphasis = (
        preemphasis.Preemphasis(preemph=cfg['preemph'])
        if cfg['preemph'] != 0.0 else passthrough())

    # If True, direct DFT, DCT and hann window are replaced by tf functions.
    # It is useful for model quantization, because these functions will not
    # be quantized.
    use_tf_function = cfg['use_tf_fft']
    requested_mel_non_zero_only = cfg['mel_non_zero_only']
    requested_window_type = cfg['window_type']

    # For tf function compatibility the mel range optimization is switched
    # off and the 'hann_tf' window is forced.
    mel_non_zero_only = (
        False if use_tf_function else requested_mel_non_zero_only)
    window_type = 'hann_tf' if use_tf_function else requested_window_type

    self.windowing = (
        windowing.Windowing(
            window_size=self.frame_size, window_type=window_type)
        if window_type is not None else passthrough())

    # If use_tf_fft is False, Real Discrete Fourier Transformation (RDFT) is
    # used, which is slower than RFFT. To increase RDFT efficiency the
    # properties of the mel spectrum are used: the range of non zero values
    # in the mel spectrum is found and RDFT is computed on it only.
    # If use_tf_fft is True, TF RFFT is used, which requires signal length
    # alignment, so mel_non_zero_only is disabled (see above).
    self.mag_rdft_mel = magnitude_rdft_mel.MagnitudeRDFTmel(
        use_tf_fft=use_tf_function,
        magnitude_squared=cfg['fft_magnitude_squared'],
        num_mel_bins=cfg['mel_num_bins'],
        lower_edge_hertz=cfg['mel_lower_edge_hertz'],
        upper_edge_hertz=cfg['mel_upper_edge_hertz'],
        sample_rate=cfg['sample_rate'],
        mel_non_zero_only=mel_non_zero_only)

    # Input is clamped from below by log_epsilon before the logarithm.
    self.log_max = tf.keras.layers.Lambda(lambda x: tf.math.log(
        tf.math.maximum(x, self.params['log_epsilon'])))

    self.dct = (
        dct.DCT(num_features=cfg['dct_num_features'])
        if cfg['dct_num_features'] != 0 else passthrough())

    self.normalizer = normalizer.Normalizer(
        mean=self.mean, stddev=self.stddev)

    # In any inference mode there is no need to add dynamic logic in the tf
    # graph, so the augmentations below are created for training only.
    self.spec_augment = (
        spectrogram_augment.SpecAugment(
            time_masks_number=cfg['time_masks_number'],
            time_mask_max_size=cfg['time_mask_max_size'],
            frequency_masks_number=cfg['frequency_masks_number'],
            frequency_mask_max_size=cfg['frequency_mask_max_size'])
        if cfg['use_spec_augment'] and is_training else passthrough())

    self.spec_cutout = (
        spectrogram_cutout.SpecCutout(
            masks_number=cfg['spec_cutout_masks_number'],
            time_mask_size=cfg['spec_cutout_time_mask_size'],
            frequency_mask_size=cfg['spec_cutout_frequency_mask_size'])
        if cfg['use_spec_cutout'] and is_training else passthrough())