def _mfcc_op(self, inputs):
    """Computes MFCC features with TF custom audio ops.

    MFCC implementation based on TF custom ops (supported by TFLite).
    It reduces model size in comparison to `_mfcc_tf`.

    Args:
      inputs: audio samples; in non-streaming modes they are used directly
        and are expected as [batch, time]. In the streaming inference modes
        they are first passed through `self.data_frame`.

    Returns:
      Result of `self.spec_augment` applied to the MFCC tensor with dims
      [channels/batch, frames, dct_coefficient_count].
    """
    streaming_modes = (
        modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE,
        modes.Modes.STREAM_INTERNAL_STATE_INFERENCE,
    )
    if self.mode in streaming_modes:
        # In streaming mode there is only one frame for FFT calculation, so
        # the framed data has dims [batch=1, time=1, frame]. audio_spectrogram
        # requires 2D input data, hence the time dim is removed.
        signal = tf.squeeze(self.data_frame(inputs), axis=1)
    else:
        signal = inputs

    # signal: [batch, time] -> [time, channels/batch], which is the layout
    # audio_spectrogram expects.
    signal = tf.transpose(signal, [1, 0])

    params = self.params

    # spectrogram: [channels/batch, frames, fft_feature]
    spectrogram = audio_ops.audio_spectrogram(
        signal,
        window_size=self.frame_size,
        stride=self.frame_step,
        magnitude_squared=params['fft_magnitude_squared'])

    # mfcc_features: [channels/batch, frames, dct_coefficient_count]
    mfcc_features = audio_ops.mfcc(
        spectrogram,
        params['sample_rate'],
        upper_frequency_limit=params['mel_upper_edge_hertz'],
        lower_frequency_limit=params['mel_lower_edge_hertz'],
        filterbank_channel_count=params['mel_num_bins'],
        dct_coefficient_count=params['dct_num_features'])

    return self.spec_augment(mfcc_features)
def random_cutout(
    inputs,
    mask_size,
    mask_value=0,
    seed=None,
    data_format='channels_last',
):
    """Applies cutout (https://arxiv.org/abs/1708.04552) to inputs.

    It is based on addons/tensorflow_addons/image/cutout_ops.py and is kept
    here for backward compatibility.

    For every batch element a random center (time, feature) is drawn and a
    rectangle around it is overwritten with `mask_value`. The rectangle
    reaches `mask_size // 2` to each side of the center and is clipped at the
    borders, so its unclipped extent is `2 * (mask_size // 2)` per dimension.
    The same rectangle is applied to every channel.

    Args:
      inputs: input tensor of rank 4. With data_format 'channels_last' it is
        read as [batch_size, time, feature, channels]; with any other
        data_format value it is read as [batch_size, channels, time, feature].
      mask_size: mask size (time, feature); a scalar is used for both dims.
      mask_value: mask will be filled with this value.
      seed: random seed passed to both random draws of the cutout center.
      data_format: dimensions order, see `inputs`.

    Returns:
      masked inputs with the same static shape as `inputs`.

    Raises:
      ValueError: if inputs.shape.rank != 4 (this includes an unknown rank).
    """
    rank = inputs.shape.rank
    if rank != 4:
        # Format with %s rather than %d: an unknown static rank is None, and
        # '%d' % None would raise TypeError instead of the documented
        # ValueError. For integer ranks the message is unchanged.
        raise ValueError('inputs.shape.rank:%s must be 4' % rank)

    mask_size = tf.convert_to_tensor(mask_size)
    if tf.rank(mask_size) == 0:
        # Scalar mask size: use the same extent for time and feature.
        mask_size = tf.stack([mask_size, mask_size])

    if data_format == 'channels_last':
        time_size, feature_size = tf.shape(inputs)[1], tf.shape(inputs)[2]
    else:
        time_size, feature_size = tf.shape(inputs)[2], tf.shape(inputs)[3]

    batch_size = tf.shape(inputs)[0]
    origin_shape = inputs.shape

    # One cutout center per batch element.
    # NOTE(review): both draws receive the same op-level `seed`; confirm that
    # any resulting correlation between time and feature centers is intended.
    cutout_center_time = tf.random.uniform(
        shape=[batch_size],
        minval=0,
        maxval=time_size,
        dtype=tf.int32,
        seed=seed,
    )
    cutout_center_feature = tf.random.uniform(
        shape=[batch_size],
        minval=0,
        maxval=feature_size,
        dtype=tf.int32,
        seed=seed,
    )

    # Half extent of the cutout on each side of its center.
    half_size = mask_size // 2

    # Amount of untouched data on each side of the cutout, clipped at 0 when
    # the cutout would reach past a border.
    lower_pads = tf.maximum(0, cutout_center_time - half_size[0])
    upper_pads = tf.maximum(0, time_size - cutout_center_time - half_size[0])
    left_pads = tf.maximum(0, cutout_center_feature - half_size[1])
    right_pads = tf.maximum(
        0, feature_size - cutout_center_feature - half_size[1])

    # cutout_shape: [batch_size, 2] holding (time, feature) extent per element.
    cutout_shape = tf.transpose(
        [
            time_size - (lower_pads + upper_pads),
            feature_size - (left_pads + right_pads),
        ],
        [1, 0],
    )

    # Build one [time, feature] mask per batch element: zeros inside the
    # cutout, padded with ones everywhere else.
    masks = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
    for i in tf.range(tf.shape(cutout_shape)[0]):
        padding_dims = [
            [lower_pads[i], upper_pads[i]],
            [left_pads[i], right_pads[i]],
        ]
        mask = tf.pad(
            tf.zeros(cutout_shape[i], dtype=inputs.dtype),
            padding_dims,
            constant_values=1,
        )
        masks = masks.write(i, mask)

    # Broadcast the per-element mask over the channel dimension.
    if data_format == 'channels_last':
        mask = tf.expand_dims(masks.stack(), -1)
        mask = tf.tile(mask, [1, 1, 1, tf.shape(inputs)[-1]])
    else:
        mask = tf.expand_dims(masks.stack(), 1)
        mask = tf.tile(mask, [1, tf.shape(inputs)[1], 1, 1])

    # Replace the cutout region (mask == 0) with mask_value.
    inputs = tf.where(
        tf.equal(mask, 0),
        tf.ones_like(inputs, dtype=inputs.dtype) * mask_value,
        inputs,
    )
    # Restore the static shape captured before masking.
    inputs.set_shape(origin_shape)
    return inputs