def _non_streaming(self, inputs):
  # depthwise 1D convolution in non streaming mode
  # it is used for training or non streaming inference.

  # pad input data
  inputs_pad = temporal_padding.TemporalPadding(
      padding=self.pad, padding_size=self.memory_size - 1)(inputs)

  # expand dimensionality for depthwise_conv2d
  # to [memory_size, 1, feature_dim, 1]
  time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1), -1)

  # run convolution
  depthwise_conv1d = tf.nn.depthwise_conv2d(
      tf.expand_dims(inputs_pad, -2),
      time_kernel_exp,
      strides=[1, 1, 1, 1],
      padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

  # [batch_size, time_steps, feature_dim]
  depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

  if self.use_bias:
    depthwise_conv1d = depthwise_conv1d + self.bias
  return depthwise_conv1d
def _non_streaming(self, inputs):
  # depthwise 1D convolution in non streaming mode
  # it is used for training or non streaming inference.

  # Zero pad inputs from the left to make conv1d causal.
  # [batch_size, time_steps, feature_dim]
  if self.pad:
    inputs_pad = tf.keras.backend.temporal_padding(
        inputs, padding=(self.memory_size - 1, 0))
  else:
    inputs_pad = inputs

  # expand dimensionality for depthwise_conv2d
  # to [memory_size, 1, feature_dim, 1]
  time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1), -1)

  # run convolution
  depthwise_conv1d = tf.nn.depthwise_conv2d(
      tf.expand_dims(inputs_pad, -2),
      time_kernel_exp,
      strides=[1, 1, 1, 1],
      padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

  # [batch_size, time_steps, feature_dim]
  depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

  if self.use_bias:
    depthwise_conv1d = depthwise_conv1d + self.bias
  return depthwise_conv1d
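# A minimal standalone sketch of the same causal depthwise 1D convolution
# trick used by _non_streaming above. All shapes, names and the example
# function itself are illustrative assumptions, not part of the layer;
# tensorflow is assumed imported as tf, as in the rest of the file.
def _example_causal_depthwise_conv1d():
  batch_size, time_steps, feature_dim, memory_size = 2, 16, 8, 4
  inputs = tf.random.normal([batch_size, time_steps, feature_dim])
  time_kernel = tf.random.normal([memory_size, feature_dim])

  # Zero pad on the left so the convolution is causal and the output length
  # equals the input length.
  inputs_pad = tf.keras.backend.temporal_padding(
      inputs, padding=(memory_size - 1, 0))

  # depthwise_conv2d expects a 4D kernel [h, w, in_channels, multiplier].
  time_kernel_exp = tf.expand_dims(tf.expand_dims(time_kernel, 1), -1)

  outputs = tf.nn.depthwise_conv2d(
      tf.expand_dims(inputs_pad, -2),  # [batch, time, 1, feature]
      time_kernel_exp,
      strides=[1, 1, 1, 1],
      padding='VALID')
  return tf.squeeze(outputs, [2])  # [batch, time, feature]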
def _mfcc_op(self, inputs):
  # MFCC implementation based on TF custom op (supported by TFLite).
  # It reduces model size in comparison to _mfcc_tf.
  if (self.mode == modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE or
      self.mode == modes.Modes.STREAM_INTERNAL_STATE_INFERENCE):
    outputs = self.data_frame(inputs)
    # in streaming mode there is only one frame for FFT calculation
    # dims will be [batch=1, time=1, frame],
    # but audio_spectrogram requires 2D input data, so we remove time dim
    outputs = tf.squeeze(outputs, axis=1)
  else:
    outputs = inputs

  # outputs has dims [batch, time]
  # but audio_spectrogram expects [time, channels/batch] so transpose it
  outputs = tf.transpose(outputs, [1, 0])

  # outputs: [time, channels/batch]
  outputs = audio_ops.audio_spectrogram(
      outputs,
      window_size=self.frame_size,
      stride=self.frame_step,
      magnitude_squared=self.params['fft_magnitude_squared'])

  # outputs: [channels/batch, frames, fft_feature]
  outputs = audio_ops.mfcc(
      outputs,
      self.params['sample_rate'],
      upper_frequency_limit=self.params['mel_upper_edge_hertz'],
      lower_frequency_limit=self.params['mel_lower_edge_hertz'],
      filterbank_channel_count=self.params['mel_num_bins'],
      dct_coefficient_count=self.params['dct_num_features'])

  # outputs: [channels/batch, frames, dct_coefficient_count]
  outputs = self.spec_augment(outputs)
  return outputs
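# A hedged, non-streaming sketch of the same TF-op-based MFCC path applied to
# a raw waveform. All parameter values are placeholders, the example function
# is not part of the library, and audio_ops is assumed to be
# tensorflow.python.ops.gen_audio_ops, as used above.
def _example_mfcc_op():
  sample_rate = 16000
  waveform = tf.random.normal([1, sample_rate])  # [batch, time]

  # audio_spectrogram expects [time, channels/batch], so transpose first.
  spectrogram = audio_ops.audio_spectrogram(
      tf.transpose(waveform, [1, 0]),
      window_size=640,  # 40 ms frames at 16 kHz
      stride=320,  # 20 ms step
      magnitude_squared=True)

  # returns [channels/batch, frames, dct_coefficient_count]
  return audio_ops.mfcc(
      spectrogram,
      sample_rate,
      upper_frequency_limit=7600,
      lower_frequency_limit=60,
      filterbank_channel_count=40,
      dct_coefficient_count=20)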
def random_stretch_squeeze(inputs, resample_offset, seed=None):
  """Stretches and squeezes audio data in time dim.

  It can be useful for augmenting training data with random stretches/squeezes
  in time dim for making the model more robust to input audio sampling
  frequency and human speech frequency.

  Args:
    inputs: input tensor [batch_size, time]
    resample_offset: defines stretch/squeeze range:
      1 - resample_offset ... 1 + resample_offset
    seed: random seed

  Returns:
    audio data stretched and squeezed in time dim

  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  image = tf.expand_dims(inputs, 2)  # feature
  image = tf.expand_dims(image, 3)  # channels

  resample = 1.0  # when it is equal to 1 - no stretching or squeezing
  time_stretch_squeeze = tf.random.uniform(
      shape=[batch_size],
      minval=resample - resample_offset,
      maxval=resample + resample_offset,
      dtype=tf.float32,
      seed=seed)
  shape = tf.shape(inputs)
  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    image_resized = tf.image.resize(
        images=image[i],
        size=(tf.cast((tf.cast(shape[1], tf.float32) *
                       time_stretch_squeeze[i]), tf.int32), 1),
        preserve_aspect_ratio=False)
    image_resized_cropped = tf.image.resize_with_crop_or_pad(
        image_resized,
        target_height=sequence_length,
        target_width=1,
    )
    outputs = outputs.write(i, image_resized_cropped)

  outputs = tf.squeeze(outputs.stack(), axis=[2, 3])
  outputs.set_shape(inputs_shape)
  return outputs
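# Hypothetical usage sketch (not part of the library): stretch/squeeze a batch
# of 1 second 16 kHz clips by up to +-10% in time; all values are illustrative.
def _example_random_stretch_squeeze():
  audio = tf.random.normal([4, 16000])  # [batch_size, time]
  augmented = random_stretch_squeeze(audio, resample_offset=0.1, seed=1)
  return augmented  # shape (4, 16000): resized, then cropped/padded back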
def random_shift(inputs, time_shift, seed=None):
  """Shifts input data randomly in time dim.

  It can be useful for augmenting training data with random shifts in time dim
  for making the model more robust to input audio shifts.

  Args:
    inputs: input tensor [batch_size, time]
    time_shift: defines time shift range: -time_shift ... time_shift;
      it is defined in samples
    seed: random seed

  Returns:
    audio data randomly shifted in time dim

  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  # below function will process 2D arrays, convert it to [batch, time, dummy]
  inputs = tf.expand_dims(inputs, 2)

  time_shift_amounts = tf.random.uniform(
      shape=[batch_size],
      minval=-time_shift,
      maxval=time_shift,
      dtype=tf.int32,
      seed=seed)
  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    time_shift_amount = time_shift_amounts[i]

    # pylint: disable=cell-var-from-loop
    time_shift_padding = tf.cond(
        time_shift_amount > 0,
        lambda: [[time_shift_amount, 0], [0, 0]],
        lambda: [[0, -time_shift_amount], [0, 0]])
    time_shift_offset = tf.cond(
        time_shift_amount > 0,
        lambda: [0, 0],
        lambda: [-time_shift_amount, 0])
    # pylint: enable=cell-var-from-loop

    padded = tf.pad(
        tensor=inputs[i], paddings=time_shift_padding, mode='CONSTANT')
    padded_sliced = tf.slice(padded, time_shift_offset, [sequence_length, -1])
    outputs = outputs.write(i, padded_sliced)

  # convert it back to [batch, time]
  outputs = tf.squeeze(outputs.stack(), axis=[2])
  outputs.set_shape(inputs_shape)
  return outputs
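# Hypothetical usage sketch (not part of the library): shift each clip by up
# to +-100 samples; samples shifted out are dropped and the gap is zero padded.
def _example_random_shift():
  audio = tf.random.normal([4, 16000])  # [batch_size, time]
  return random_shift(audio, time_shift=100, seed=1)  # shape (4, 16000)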
def call(self, inputs, training=None):
  net = inputs

  # add fake dim [batch, time, 1, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)

  net = self.dropout1(net, training=training)
  net = self.dense1(net)
  net = self.depth_cnn1(net)
  net = self.batch_norm(net, training=training)
  net = self.activation(net)
  net = self.dense2(net)

  # [batch, time, feature]
  net = tf.squeeze(net, [2])
  return net
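# A rough non-streaming analog of the block above, built from plain Keras
# layers as a sketch only: layer sizes, the dropout rate and the depthwise
# kernel are placeholders rather than the repo's defaults, and depth_cnn1 is
# approximated here with a DepthwiseConv2D over the time dim.
def _example_block():
  inputs = tf.keras.layers.Input(shape=(100, 40))  # [time, feature]
  net = tf.keras.backend.expand_dims(inputs, axis=2)  # [time, 1, feature]
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(64, use_bias=False)(net)
  net = tf.keras.layers.DepthwiseConv2D(
      kernel_size=(3, 1), padding='same')(net)
  net = tf.keras.layers.BatchNormalization()(net)
  net = tf.keras.layers.Activation('relu')(net)
  net = tf.keras.layers.Dense(64)(net)
  net = tf.keras.backend.squeeze(net, axis=2)  # [time, feature]
  return tf.keras.Model(inputs, net)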
def model(flags):
  """BC-ResNet model.

  It is based on paper
  Broadcasted Residual Learning for Efficient Keyword Spotting
  https://arxiv.org/pdf/2106.04140.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any of the input lists has a different length from
      any other, or if padding is not supported
  """
  dropouts = utils.parse(flags.dropouts)
  filters = utils.parse(flags.filters)
  blocks_n = utils.parse(flags.blocks_n)
  strides = utils.parse(flags.strides)
  dilations = utils.parse(flags.dilations)

  for l in (dropouts, filters, strides, dilations):
    if len(blocks_n) != len(l):
      raise ValueError('all input lists have to be the same length '
                       'but got %s and %s ' % (blocks_n, l))

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model, the user needs to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # make it [batch, time, feature, 1]
  net = tf.keras.backend.expand_dims(net, axis=3)

  if flags.paddings == 'same':
    net = tf.keras.layers.Conv2D(
        filters=flags.first_filters,
        kernel_size=5,
        strides=(1, 2),
        padding='same')(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=flags.first_filters,
            kernel_size=5,
            strides=(1, 2),
            padding='valid'),
        use_one_step=True,
        pad_time_dim=flags.paddings,
        pad_freq_dim='same')(net)

  for n, n_filters, dilation, stride, dropout in zip(blocks_n, filters,
                                                     dilations, strides,
                                                     dropouts):
    net = TransitionBlock(
        n_filters,
        dilation,
        stride,
        flags.paddings,
        dropout,
        sub_groups=flags.sub_groups)(net)
    for _ in range(n):
      net = NormalBlock(
          n_filters,
          dilation,
          1,
          flags.paddings,
          dropout,
          sub_groups=flags.sub_groups)(net)

  if flags.paddings == 'same':
    net = tf.keras.layers.DepthwiseConv2D(kernel_size=5, padding='same')(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(kernel_size=5, padding='valid'),
        use_one_step=True,
        pad_time_dim=flags.paddings,
        pad_freq_dim='same')(net)

  # average out frequency dim
  net = tf.keras.backend.mean(net, axis=2, keepdims=True)

  net = tf.keras.layers.Conv2D(
      filters=flags.last_filters, kernel_size=1, use_bias=False)(net)

  # average out time dim
  if flags.paddings == 'same':
    net = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.GlobalAveragePooling2D(keepdims=True))(net)

  net = tf.keras.layers.Conv2D(
      filters=flags.label_count, kernel_size=1, use_bias=False)(net)

  # 1 and 2 dims are equal to 1
  net = tf.squeeze(net, [1, 2])

  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
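# Hypothetical usage sketch, assuming `flags` comes from the repo's flag
# parser with BC-ResNet settings (comma separated strings for filters,
# dilations, strides, dropouts, blocks_n, etc.); the compile settings below
# are illustrative only. With return_softmax=False the model outputs logits,
# hence from_logits=True.
def _example_train_setup(flags):
  keras_model = model(flags)
  keras_model.compile(
      optimizer=tf.keras.optimizers.Adam(1e-3),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['accuracy'])
  keras_model.summary()
  return keras_model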