def model(flags):
  """Convolutional recurrent neural network (CRNN) model.

  It is based on paper
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf
  Represented as sequence of Conv, RNN/GRU, FC layers.
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # expand dims for the next layer 2d conv
  net = tf.keras.backend.expand_dims(net)
  # stack of streamable Conv2D layers, configured by the parallel flag lists
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides)):
    net = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides))(net)

  shape = net.shape
  # input net dimension: [batch, time, feature, channels]
  # reshape dimension: [batch, time, feature * channels]
  # so that GRU/RNN can process it
  net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

  for units, return_sequences in zip(
      parse(flags.gru_units), parse(flags.return_sequences)):
    net = gru.GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(net)

  # flatten the remaining time dim (streamable in inference mode)
  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  net = tf.keras.backend.expand_dims(net)

  # one streamable Conv2D layer per entry of the parallel flag lists
  conv_params = zip(
      utils.parse(flags.cnn_filters),
      utils.parse(flags.cnn_kernel_size),
      utils.parse(flags.cnn_act),
      utils.parse(flags.cnn_dilation_rate),
      utils.parse(flags.cnn_strides))
  for n_filters, kernel, act, dilation, stride in conv_params:
    conv_cell = tf.keras.layers.Conv2D(
        filters=n_filters,
        kernel_size=kernel,
        activation=act,
        dilation_rate=dilation,
        strides=stride)
    net = stream.Stream(cell=conv_cell)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  # classifier head: stack of dense layers
  for n_units, act in zip(utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=n_units, activation=act)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def build(self, input_shape):
  """Creates the SVDF sub-layers.

  The stages built here are: dropout -> dense (feature-dim filtering) ->
  streamable depthwise conv in time (memory_size frames) -> optional dense
  projection -> optional batch norm. Disabled stages are replaced by
  pass-through Lambda layers that keep the same (x, training) call signature.
  """
  super(Svdf, self).build(input_shape)

  if self.mode == modes.Modes.TRAINING:
    self.dropout1 = non_scaling_dropout.NonScalingDropout(self.dropout)
  else:
    # inference modes: no dropout, identity pass-through
    self.dropout1 = tf.keras.layers.Lambda(lambda x, training: x)
  # feature-dim filtering
  self.dense1 = tf.keras.layers.Dense(
      units=self.units1, use_bias=self.use_bias1)
  # time-dim filtering over the last memory_size frames, wrapped in Stream
  # so it can run in streaming inference modes
  self.depth_cnn1 = stream.Stream(
      cell=tf.keras.layers.DepthwiseConv2D(
          kernel_size=(self.memory_size, 1),
          strides=(1, 1),
          padding='valid',
          dilation_rate=(1, 1),
          use_bias=self.use_bias),
      inference_batch_size=self.inference_batch_size,
      mode=self.mode,
      use_one_step=False,
      pad_time_dim=self.pad)
  if self.units2 > 0:
    self.dense2 = tf.keras.layers.Dense(units=self.units2, use_bias=True)
  else:
    # units2 <= 0 disables the projection layer
    self.dense2 = tf.keras.layers.Lambda(lambda x, training: x)
  if self.use_batch_norm:
    self.batch_norm = tf.keras.layers.BatchNormalization(
        scale=self.bn_scale)
  else:
    self.batch_norm = tf.keras.layers.Lambda(lambda x, training: x)
def test_strided_conv_alignment(self):
  """Checks that a causal strided streaming conv stays right-aligned in time."""
  kernel_size = 4
  strides = 2
  inputs = tf.keras.layers.Input(shape=(None, 1))
  net = inputs
  net = stream.Stream(
      cell=tf.keras.layers.Conv1D(
          filters=1,
          kernel_size=kernel_size,
          strides=strides,
          padding='valid',
          kernel_initializer='ones'),
      use_one_step=False,
      pad_time_dim='causal')(net)
  model = tf.keras.Model(inputs=inputs, outputs=net)

  input_signal = np.arange(1, 5)  # [1, 2, 3, 4]
  # Sanity check for the test itself: We only care about the case when input
  # length is a multiple of strides. If not, streaming is not meaningful.
  assert len(input_signal) % strides == 0
  input_signal = input_signal[None, :, None]
  output_signal = model.predict(input_signal)
  outputs = output_signal[0, :, 0]
  # Make sure causal conv is right-aligned, so that the most recent samples
  # are never ignored. Thus we want:
  # 1 2 3 4
  # -> [0 0] 1 2 3 4 (padding)
  # -> 3 10 (conv with kernel of ones: 3=0+0+1+2, 10=1+2+3+4)
  # Note that this is different from tf.keras.layers.Conv1D(..., 'causal'),
  # which will pad 3 zeroes on the left and produce [1(=0+0+0+1), 6(=0+1+2+3)]
  # instead. The latter is less ideal, since it pads an extra zero and ignores
  # the last (and hence most recent) valid sample "4".
  self.assertAllEqual(outputs, [3, 10])
def conv_model_no_stream_wrapper(flags, conv_cell, cnn_filters,
                                 cnn_kernel_size, cnn_act, cnn_dilation_rate,
                                 cnn_strides, cnn_use_bias):
  """Toy example of convolutional model.

  It has the same model topology as in conv_model() above, but without
  wrapping conv cell by Stream layer, so that all parameters set manually.

  Args:
    flags: model and data settings
    conv_cell: cell for streaming, for example: tf.keras.layers.Conv1D
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_dilation_rate: list of dilation_rate in conv layer
    cnn_strides: list of strides in conv layer
    cnn_use_bias: list of use_bias in conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of input lists has different length from any other
  """
  if not all(
      len(cnn_filters) == len(l) for l in [
          cnn_filters, cnn_kernel_size, cnn_act, cnn_dilation_rate,
          cnn_strides, cnn_use_bias
      ]):
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio
  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides, use_bias in zip(
      cnn_filters, cnn_kernel_size, cnn_act, cnn_dilation_rate, cnn_strides,
      cnn_use_bias):
    # time receptive field of the conv cell, used as ring buffer size
    ring_buffer_size_in_time_dim = dilation_rate * (kernel_size - 1)
    # identity Stream maintains only the ring buffer; padding and the conv
    # itself are applied manually below
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=ring_buffer_size_in_time_dim,
        use_one_step=False,
        pad_time_dim=None)(net)

    padding_size = ring_buffer_size_in_time_dim
    net = temporal_padding.TemporalPadding(
        padding='causal', padding_size=padding_size)(
            net)

    net = conv_cell(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides,
        use_bias=use_bias,
        padding='valid')(net)  # padding has to be valid!

  return tf.keras.Model(input_audio, net)
def __init__(self,
             filters=8,
             dilation=1,
             stride=1,
             padding='same',
             dropout=0.5,
             use_one_step=True,
             sub_groups=5,
             **kwargs):
  """Builds the sub-layers of the transition block.

  Args:
    filters: number of output filters in the 1x1 convolutions
    dilation: dilation rate of the depthwise convolutions
    stride: stride of the depthwise convolutions
    padding: 'same' for non-streaming mode; any other value is used as the
      time-dim padding mode of a streaming-aware temporal conv
    dropout: rate for the spatial dropout layer
    use_one_step: Stream layer option for streaming inference
    sub_groups: number of groups in sub-spectral normalization
    **kwargs: additional arguments passed to the base keras layer
  """
  super(TransitionBlock, self).__init__(**kwargs)
  self.filters = filters
  self.dilation = dilation
  self.stride = stride
  self.padding = padding
  self.dropout = dropout
  self.use_one_step = use_one_step
  self.sub_groups = sub_groups
  # depthwise conv over the frequency dim only (1x3 kernel)
  self.frequency_dw_conv = tf.keras.layers.DepthwiseConv2D(
      kernel_size=(1, 3),
      strides=self.stride,
      dilation_rate=self.dilation,
      padding='same',
      use_bias=False)
  if self.padding == 'same':
    # non-streaming: plain temporal depthwise conv (3x1 kernel)
    self.temporal_dw_conv = tf.keras.layers.DepthwiseConv2D(
        kernel_size=(3, 1),
        strides=self.stride,
        dilation_rate=self.dilation,
        padding='same',
        use_bias=False)
  else:
    # streaming-aware temporal conv: Stream pads the time dim per `padding`
    self.temporal_dw_conv = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=(3, 1),
            strides=self.stride,
            dilation_rate=self.dilation,
            padding='valid',
            use_bias=False),
        use_one_step=use_one_step,
        pad_time_dim=self.padding,
        pad_freq_dim='same')
  self.batch_norm1 = tf.keras.layers.BatchNormalization()
  self.batch_norm2 = tf.keras.layers.BatchNormalization()
  self.conv1x1_1 = tf.keras.layers.Conv2D(
      filters=self.filters,
      kernel_size=1,
      strides=1,
      padding='valid',
      use_bias=False)
  self.conv1x1_2 = tf.keras.layers.Conv2D(
      filters=self.filters,
      kernel_size=1,
      strides=1,
      padding='valid',
      use_bias=False)
  self.spatial_drop = tf.keras.layers.SpatialDropout2D(rate=self.dropout)
  self.spectral_norm = sub_spectral_normalization.SubSpectralNormalization(
      self.sub_groups)
def model(flags):
  """Temporal Convolution ResNet model.

  It can be configured to reproduce model config as described in the paper below
  Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
  https://arxiv.org/pdf/1904.03814.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any of input lists has different length from any other
  """
  tc_filters = parse(flags.tc_filters)
  repeat_tc_convs = parse(flags.repeat_tc_convs)
  kernel_sizes = parse(flags.kernel_sizes)
  pool_sizes = parse(flags.pool_sizes)
  dilations = parse(flags.dilations)
  residuals = parse(flags.residuals)

  # all parallel lists must have the same length (their lengths collapse
  # into a single-element set)
  if len(
      set((len(repeat_tc_convs), len(kernel_sizes), len(pool_sizes),
           len(dilations), len(residuals), len(tc_filters)))) != 1:
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # make it [batch, time, 1, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)

  for filters, repeat, kernel_size, pool_size, dilation, residual in zip(
      tc_filters, repeat_tc_convs, kernel_sizes, pool_sizes, dilations,
      residuals):
    net = resnet_block(net, repeat, kernel_size, filters, dilation, residual,
                       flags.padding_in_time, flags.dropout, flags.activation)
    if pool_size > 1:
      # pool only in the time dim
      net = tf.keras.layers.MaxPooling2D((pool_size, 1))(net)

  net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(net)
  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """SVDF model.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  # stack of SVDF layers configured by the parallel flag lists
  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size),
          utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout),
          utils.parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=padding,
        name='svdf_%d' % i)(
            net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Fully connected layer based model.

  It is based on paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # first dense stack, applied per time frame
  dense_specs1 = zip(utils.parse(flags.units1), utils.parse(flags.act1))
  for n_units, act in dense_specs1:
    net = tf.keras.layers.Dense(units=n_units, activation=act)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  # second dense stack on the flattened representation
  dense_specs2 = zip(utils.parse(flags.units2), utils.parse(flags.act2))
  for n_units, act in dense_specs2:
    net = tf.keras.layers.Dense(units=n_units, activation=act)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def conv_model(flags, conv_cell, cnn_filters, cnn_kernel_size, cnn_act,
               cnn_dilation_rate, cnn_strides, cnn_use_bias, **kwargs):
  """Toy example of convolutional model with Stream wrapper.

  It can be used for speech enhancement.

  Args:
    flags: model and data settings
    conv_cell: cell for streaming, for example: tf.keras.layers.Conv1D
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_dilation_rate: list of dilation_rate in conv layer
    cnn_strides: list of strides in conv layer
    cnn_use_bias: list of use_bias in conv layer
    **kwargs: Additional kwargs passed on to conv_cell.

  Returns:
    Keras model

  Raises:
    ValueError: if any of input list has different length from any other
  """
  if not all(
      len(cnn_filters) == len(l) for l in [
          cnn_filters, cnn_kernel_size, cnn_act, cnn_dilation_rate,
          cnn_strides, cnn_use_bias
      ]):
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio
  # [batch, time] -> [batch, time, 1]
  net = tf.keras.backend.expand_dims(net)
  for (filters, kernel_size, activation, dilation_rate, strides,
       use_bias) in zip(cnn_filters, cnn_kernel_size, cnn_act,
                        cnn_dilation_rate, cnn_strides, cnn_use_bias):
    # Stream wraps the 'valid' conv and applies causal padding in the time dim
    net = stream.Stream(
        cell=conv_cell(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides,
            use_bias=use_bias,
            padding='valid',
            **kwargs),
        use_one_step=False,
        pad_time_dim='causal')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Fully connected layer based model on raw wav data.

  It is based on paper (with added pooling and raw audio data):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if flags.preprocess is not 'raw'
  """
  if flags.preprocess != 'raw':
    # Bug fix: the exception was previously constructed but never raised,
    # so non-raw input was silently accepted.
    raise ValueError('input audio has to be raw, but get ', flags.preprocess)

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # split raw audio into (overlapping) frames: [batch, time, frame_size]
  net = data_frame.DataFrame(
      frame_size=flags.window_size_samples,
      frame_step=flags.window_stride_samples)(
          input_audio)

  # first dense stack, applied per frame
  for units, activation in zip(
      utils.parse(flags.units1), utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(
            net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """LSTM model.

  Similar model in papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  # stack of LSTM layers configured by the parallel flag lists
  for units, return_sequences, num_proj in zip(
      utils.parse(flags.lstm_units), utils.parse(flags.return_sequences),
      utils.parse(flags.num_proj)):
    net = lstm.LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(
            net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units1), utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def test_average_pooling_stream(self):
  """Compares streaming vs non-streaming output of Stream-wrapped pooling."""
  # prepare input data
  params = test_utils.Params([1])
  params.desired_samples = 5
  batch_size = 1
  time1 = params.desired_samples  # it is time dim (will not be averaged out)
  time2 = 3  # this dim will be averaged out and become 1
  feature = 16  # it is a feature dim

  # override data shape for streaming mode testing
  params.preprocess = 'custom'
  params.data_shape = (1, time2, feature)

  inp_audio = np.random.rand(batch_size, time1, time2, feature)

  # non-streaming model with causal average pooling over (time1, time2)
  inputs = tf.keras.layers.Input(
      shape=(time1, time2, feature), batch_size=batch_size)
  net = stream.Stream(
      cell=average_pooling2d.AveragePooling2D(
          kernel_size=(time1, time2), padding='valid'),
      use_one_step=False,
      pad_time_dim='causal')(inputs)
  model = tf.keras.Model(inputs, net)
  model.summary()

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # run inference and compare streaming vs non streaming
  non_stream_out = model.predict(inp_audio)
  stream_out = test.run_stream_inference(params, model_stream, inp_audio)
  self.assertAllClose(stream_out, non_stream_out)

  net = tf.keras.layers.GlobalAveragePooling2D()(inputs)
  model_global = tf.keras.Model(inputs, net)
  model_global.summary()
  global_out = model_global.predict(inp_audio)

  # last result in streaming output has to be the same with global average
  self.assertAllClose(stream_out[0, -1, 0, :], global_out[0, :])
def test_padding(self, padding):
  """Checks that train-mode Stream pads the time dim by kernel_size - 1."""
  batch_size = 1
  time_dim = 3
  feature_dim = 3
  kernel_size = 3
  inputs = tf.keras.layers.Input(
      shape=(time_dim, feature_dim), batch_size=batch_size)

  # set it in train mode (in stream mode padding is not applied)
  net = stream.Stream(
      mode=modes.Modes.TRAINING,
      cell=tf.keras.layers.Lambda(lambda x: x),
      ring_buffer_size_in_time_dim=kernel_size,
      pad_time_dim=padding)(inputs)
  model = tf.keras.Model(inputs, net)

  test_utils.set_seed(1)
  input_signal = np.random.rand(batch_size, time_dim, feature_dim)
  outputs = model.predict(input_signal)
  # identity cell, so only the padding changes the shape:
  # time grows by kernel_size - 1
  self.assertAllEqual(
      outputs.shape,
      [batch_size, time_dim + kernel_size - 1, feature_dim])
def _residual_svdf_block(net, flags, padding, units1_list, memory_size_list,
                         pool_size, block_index):
  """Stack of SVDF layers with a projected residual connection and pooling.

  The last SVDF layer in the stack is linear; the residual path is a 1x1
  projection (Dense without bias) followed by batch norm, so the Add operates
  on matching feature dims.

  Args:
    net: input tensor [batch, time, feature]
    flags: data/model parameters
    padding: 'causal' or 'valid' padding for the SVDF layers
    units1_list: list of units1 values, one per SVDF layer in the block
    memory_size_list: list of memory_size values, one per SVDF layer
    pool_size: pool size (and stride) of the MaxPool1D applied after the block
    block_index: 1-based block index, used only in layer names

  Returns:
    output tensor [batch, time, feature]
  """
  number_of_blocks = len(units1_list)
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear

  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(units1_list, memory_size_list, activations)):
    # [batch, time, feature]
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_%d_%d' % (block_index, i))(
            net)

  # number of channels in the last layer
  units1_last = units1_list[-1]
  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  net = tf.keras.layers.Activation(flags.activation)(net)
  # [batch, time, feature]
  net = tf.keras.layers.MaxPool1D(
      pool_size, strides=pool_size, padding='valid')(
          net)
  return net


def model(flags):
  """SVDF model with residual connections.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  In addition we added residual connection

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if the number of pooling blocks is not 3
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  blocks_pool = utils.parse(flags.blocks_pool)
  if len(blocks_pool) != 3:
    raise ValueError('number of pooling blocks has to be 3, but get: ',
                     len(blocks_pool))

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  # three residual SVDF blocks; this refactors the previously triplicated
  # inline code into one helper, with identical layer order and names
  block_configs = (
      (flags.block1_units1, flags.block1_memory_size),
      (flags.block2_units1, flags.block2_memory_size),
      (flags.block3_units1, flags.block3_memory_size),
  )
  for block_index, ((units1_flag, memory_flag), pool_size) in enumerate(
      zip(block_configs, blocks_pool), start=1):
    net = _residual_svdf_block(net, flags, padding, utils.parse(units1_flag),
                               utils.parse(memory_flag), pool_size,
                               block_index)

  # [batch, time, feature]
  # convert all feature to one vector
  if flags.flatten:
    net = stream.Stream(use_one_step=False, cell=tf.keras.layers.Flatten())(net)
  else:
    net = tf.keras.backend.expand_dims(net, axis=2)
    net = stream.Stream(
        use_one_step=False,
        cell=tf.keras.layers.AveragePooling2D(
            pool_size=(int(net.shape[1]), int(net.shape[2]))))(
                net)

  net = tf.keras.layers.Flatten()(net)

  # [batch, feature]
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units in utils.parse(flags.units2):
    net = tf.keras.layers.Dense(units=units, activation=flags.activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """BC-ResNet model.

  It is based on paper
  Broadcasted Residual Learning for Efficient Keyword Spotting
  https://arxiv.org/pdf/2106.04140.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any of input list has different length from any other;
      or if padding is not supported
  """
  dropouts = utils.parse(flags.dropouts)
  filters = utils.parse(flags.filters)
  blocks_n = utils.parse(flags.blocks_n)
  strides = utils.parse(flags.strides)
  dilations = utils.parse(flags.dilations)

  for l in (dropouts, filters, strides, dilations):
    if len(blocks_n) != len(l):
      raise ValueError('all input lists have to be the same length '
                       'but get %s and %s ' % (blocks_n, l))

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # make it [batch, time, feature, 1]
  net = tf.keras.backend.expand_dims(net, axis=3)

  # first conv: 'same' in non-streaming mode, otherwise a streaming-aware
  # conv with time padded per flags.paddings and freq padded 'same'
  if flags.paddings == 'same':
    net = tf.keras.layers.Conv2D(
        filters=flags.first_filters,
        kernel_size=5,
        strides=(1, 2),
        padding='same')(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=flags.first_filters,
            kernel_size=5,
            strides=(1, 2),
            padding='valid'),
        use_one_step=True,
        pad_time_dim=flags.paddings,
        pad_freq_dim='same')(net)

  # stack of residual groups: one TransitionBlock followed by n NormalBlocks
  for n, n_filters, dilation, stride, dropout in zip(blocks_n, filters,
                                                     dilations, strides,
                                                     dropouts):
    net = TransitionBlock(
        n_filters,
        dilation,
        stride,
        flags.paddings,
        dropout,
        sub_groups=flags.sub_groups)(net)
    for _ in range(n):
      net = NormalBlock(
          n_filters,
          dilation,
          1,
          flags.paddings,
          dropout,
          sub_groups=flags.sub_groups)(net)

  if flags.paddings == 'same':
    net = tf.keras.layers.DepthwiseConv2D(kernel_size=5, padding='same')(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(kernel_size=5, padding='valid'),
        use_one_step=True,
        pad_time_dim=flags.paddings,
        pad_freq_dim='same')(net)

  # average out frequency dim
  net = tf.keras.backend.mean(net, axis=2, keepdims=True)

  net = tf.keras.layers.Conv2D(
      filters=flags.last_filters, kernel_size=1, use_bias=False)(net)

  # average out time dim
  if flags.paddings == 'same':
    net = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.GlobalAveragePooling2D(keepdims=True))(net)

  net = tf.keras.layers.Conv2D(
      filters=flags.label_count, kernel_size=1, use_bias=False)(net)
  # 1 and 2 dims are equal to 1
  net = tf.squeeze(net, [1, 2])

  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def test_streaming_with_effective_tdim(self):
  """Checks external-state streaming of a Stream-wrapped Sum cell."""
  time_size = 10
  feature_size = 3
  batch_size = 1

  time_dim = 1  # index of time dimensions
  ring_buffer_size_in_time_dim = 3  # effective size of aperture in time dim

  inputs = tf.keras.layers.Input(
      shape=(time_size, feature_size),
      batch_size=batch_size,
      name='inp_sequence')

  mode = modes.Modes.TRAINING

  # in streaming mode it will create a
  # ring buffer with time dim size ring_buffer_size_in_time_dim
  outputs = stream.Stream(
      cell=Sum(time_dim=time_dim),
      mode=mode,
      ring_buffer_size_in_time_dim=ring_buffer_size_in_time_dim)(inputs)
  model_train = tf.keras.Model(inputs, outputs)
  model_train.summary()

  mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
  input_tensors = [
      tf.keras.layers.Input(
          shape=(
              1,  # time dim is size 1 in streaming mode
              feature_size,
          ),
          batch_size=batch_size,
          name='inp_stream')
  ]
  # convert non streaming model to streaming one
  model_stream = utils.convert_to_inference_model(model_train, input_tensors,
                                                  mode)
  model_stream.summary()

  # second input to stream model is a state, so we can use its shape
  input_state_np = np.zeros(model_stream.inputs[1].shape, dtype=np.float32)

  # input test data
  non_stream_input = np.random.randint(
      1, 10, size=(batch_size, time_size, feature_size))

  # run streaming inference
  # iterate over time dim sample by sample
  for i in range(input_state_np.shape[1]):
    input_stream_np = np.expand_dims(non_stream_input[0][i], 0)
    input_stream_np = np.expand_dims(input_stream_np, 1)
    input_stream_np = input_stream_np.astype(np.float32)
    output_stream_np, output_state_np = model_stream.predict(
        [input_stream_np, input_state_np])
    input_state_np = output_state_np  # update input state

    # emulate sliding window summation
    target = np.sum(
        non_stream_input[:, max(0, i - ring_buffer_size_in_time_dim):i + 1],
        axis=time_dim)
    self.assertAllEqual(target, output_stream_np)

  # validate name tag of model's state
  expected_str = 'ExternalState'
  self.assertAllEqual(
      expected_str,
      model_stream.inputs[1].name.split('/')[-1][:len(expected_str)])
def resnet_block(inputs, repeat_tc_conv, kernel_size, filters, dilation,
                 residual, padding_in_time, dropout, activation):
  """TC(time conv) Residual block.

  Args:
    inputs: input tensor
    repeat_tc_conv: number of repeating Conv1D in time
    kernel_size: kernel size of Conv1D in time dim
    filters: number of filters in Conv1D in time and 1x1 conv
    dilation: dilation in time dim for Conv1D
    residual: if True residual connection is added
    padding_in_time: can be 'same' or 'causal'
    dropout: dropout value
    activation: type of activation function (string)

  Returns:
    output tensor

  Raises:
    ValueError: if padding has invalid value
  """
  if residual and (padding_in_time not in ('same', 'causal')):
    raise ValueError('padding should be same or causal')

  def time_conv(x):
    # 1D conv in time, implemented as a streamable Conv2D with a
    # (kernel_size, 1) kernel
    return stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=(kernel_size, 1),
            dilation_rate=(dilation, 1),
            padding='valid',
            activation='linear'),
        pad_time_dim=padding_in_time)(x)

  net = inputs
  if residual:
    # 1x1 conv projecting the input for the residual connection
    layer_res = tf.keras.layers.Conv2D(
        filters=filters, kernel_size=1, activation='linear')(net)
    layer_res = tf.keras.layers.BatchNormalization()(layer_res)

  # all but the last time conv end with BN + activation
  for _ in range(repeat_tc_conv - 1):
    net = time_conv(net)
    net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.Activation(activation)(net)

  # last time conv: activation is applied after the residual add
  net = time_conv(net)
  net = tf.keras.layers.BatchNormalization()(net)

  if residual:
    net = tf.keras.layers.Add()([net, layer_res])

  net = tf.keras.layers.Activation(activation)(net)
  net = tf.keras.layers.Dropout(rate=dropout)(net)
  return net
def resnet_block(inputs,
                 repeat,
                 kernel_size,
                 filters,
                 dilation,
                 stride,
                 residual=False,
                 padding='same',
                 dropout=0.0,
                 activation='relu'):
  """Residual block.

  It is based on paper
  Jasper: An End-to-End Convolutional Neural Acoustic Model
  https://arxiv.org/pdf/1904.03288.pdf

  Args:
    inputs: input tensor
    repeat: number of repeating DepthwiseConv1D and Conv1D block
    kernel_size: kernel size of DepthwiseConv1D in time dim
    filters: number of filters in DepthwiseConv1D and Conv1D
    dilation: dilation in time dim for DepthwiseConv1D
    stride: stride in time dim for DepthwiseConv1D
    residual: if True residual connection is added
    padding: can be 'same' or 'causal'
    dropout: dropout value
    activation: type of activation function (string)

  Returns:
    output tensor

  Raises:
    ValueError: if padding has invalid value
  """
  if padding not in ('same', 'causal'):
    raise ValueError('padding should be same or causal')
  net = inputs

  # repeat-1 full sub-blocks: depthwise conv + 1x1 conv + BN + act + dropout.
  for _ in range(repeat-1):
    # DepthwiseConv1D: 1D conv in time realized as a streamable DepthwiseConv2D
    # with a 1-wide frequency kernel.
    net = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=(kernel_size, 1),
            strides=(stride, 1),
            padding='valid',
            dilation_rate=(dilation, 1),
            use_bias=False),
        pad_time_dim=padding)(
            net)
    # Conv1D 1x1 (pointwise mixing of channels)
    net = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters, kernel_size=1, use_bias=False, padding='valid'),
        pad_time_dim=padding)(
            net)
    net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.Activation(activation)(net)
    net = tf.keras.layers.Dropout(rate=dropout)(net)

  # Last sub-block: no activation/dropout before the residual add.
  # DepthwiseConv1D
  net = stream.Stream(
      cell=tf.keras.layers.DepthwiseConv2D(
          kernel_size=(kernel_size, 1),
          strides=(stride, 1),
          padding='valid',
          dilation_rate=(dilation, 1),
          use_bias=False),
      pad_time_dim=padding)(
          net)
  # Conv1D 1x1
  net = stream.Stream(
      cell=tf.keras.layers.Conv2D(
          filters=filters, kernel_size=1, use_bias=False, padding='valid'),
      pad_time_dim=padding)(
          net)
  net = tf.keras.layers.BatchNormalization()(net)

  if residual:
    # Conv1D 1x1 projection of the block input for the shortcut path.
    net_res = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters, kernel_size=1, use_bias=False, padding='valid'),
        pad_time_dim=padding)(
            inputs)
    net_res = tf.keras.layers.BatchNormalization()(net_res)
    net = tf.keras.layers.Add()([net, net_res])

  net = tf.keras.layers.Activation(activation)(net)
  net = tf.keras.layers.Dropout(rate=dropout)(net)
  return net
def model(flags):
  """Depthwise convolutional model.

  It is based on paper:
  MobileNets: Efficient Convolutional Neural Networks for
  Mobile Vision Applications https://arxiv.org/abs/1704.04861
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  def batch_norm(x):
    # BatchNormalization configured from the shared model flags.
    return tf.keras.layers.BatchNormalization(
        momentum=flags.bn_momentum,
        center=flags.bn_center,
        scale=flags.bn_scale,
        renorm=flags.bn_renorm)(x)

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  x = input_audio

  if flags.preprocess == 'raw':
    # self contained model: it computes speech features from raw audio,
    # so the user only needs to feed raw audio
    x = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(x)

  # add channels dim for the 2d convs below
  x = tf.keras.backend.expand_dims(x)

  # first standard conv
  x = stream.Stream(
      cell=tf.keras.layers.Conv2D(
          filters=flags.cnn1_filters,
          kernel_size=utils.parse(flags.cnn1_kernel_size),
          dilation_rate=utils.parse(flags.cnn1_dilation_rate),
          strides=utils.parse(flags.cnn1_strides),
          padding=flags.cnn1_padding))(x)
  x = batch_norm(x)
  x = tf.keras.layers.Activation('relu')(x)

  # stack of depthwise separable blocks: depthwise conv + BN + act,
  # then pointwise 1x1 conv + BN + act
  block_params = zip(
      utils.parse(flags.dw2_kernel_size), utils.parse(flags.dw2_act),
      utils.parse(flags.dw2_dilation_rate), utils.parse(flags.dw2_strides),
      utils.parse(flags.cnn2_filters), utils.parse(flags.cnn2_act))
  for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in (
      block_params):
    x = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            strides=strides,
            padding=flags.dw2_padding))(x)
    x = batch_norm(x)
    x = tf.keras.layers.Activation(dw2_act)(x)
    x = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(x)
    x = batch_norm(x)
    x = tf.keras.layers.Activation(cnn2_act)(x)

  # global pooling over the remaining time x feature grid, then classifier
  x = stream.Stream(
      cell=tf.keras.layers.AveragePooling2D(
          pool_size=(int(x.shape[1]), int(x.shape[2]))))(x)
  x = stream.Stream(cell=tf.keras.layers.Flatten())(x)
  x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)
  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def model(flags):
  """CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  if flags.quantize:
    # quantize the feature input; presumably an 8-bit all-values quantizer
    # from the TF Model Optimization toolkit — confirm against imports
    net = quantize_layer.QuantizeLayer(
        AllValuesQuantizer(
            num_bits=8, per_axis=False, symmetric=False,
            narrow_range=False))(net)

  # expand dims for the next layer 2d conv
  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
      utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
      utils.parse(flags.cnn_strides)):
    # conv is kept linear here; the activation is applied separately after
    # batch norm so each stage can be quantized independently
    net = stream.Stream(
        cell=quantize.quantize_layer(
            tf.keras.layers.Conv2D(
                filters=filters,
                kernel_size=kernel_size,
                dilation_rate=dilation_rate,
                activation='linear',
                strides=strides),
            flags.quantize,
            quantize.NoOpActivationConfig(['kernel'], ['activation'], False)),
        pad_time_dim='causal',
        use_one_step=False)(net)
    net = quantize.quantize_layer(
        tf.keras.layers.BatchNormalization(),
        default_8bit_quantize_configs.NoOpQuantizeConfig())(net)
    net = quantize.quantize_layer(
        tf.keras.layers.Activation(activation))(net)

  net = stream.Stream(
      cell=quantize.quantize_layer(
          tf.keras.layers.Flatten(), apply_quantization=flags.quantize))(net)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = quantize.quantize_layer(
        tf.keras.layers.Dense(units=units, activation=activation),
        apply_quantization=flags.quantize)(net)

  net = quantize.quantize_layer(
      tf.keras.layers.Dense(units=flags.label_count),
      apply_quantization=flags.quantize)(net)
  if flags.return_softmax:
    net = quantize.quantize_layer(
        tf.keras.layers.Activation('softmax'),
        apply_quantization=flags.quantize)(net)
  return tf.keras.Model(input_audio, net)
def resnet_block(inputs,
                 repeat,
                 kernel_size,
                 filters,
                 dilation,
                 stride,
                 filter_separable,
                 residual=False,
                 padding='same',
                 dropout=0.0,
                 activation='relu',
                 scale=True):
  """Residual block.

  It is based on paper
  Jasper: An End-to-End Convolutional Neural Acoustic Model
  https://arxiv.org/pdf/1904.03288.pdf

  Args:
    inputs: input tensor
    repeat: number of repeating DepthwiseConv1D and Conv1D block
    kernel_size: kernel size of DepthwiseConv1D in time dim
    filters: number of filters in DepthwiseConv1D and Conv1D
    dilation: dilation in time dim for DepthwiseConv1D
    stride: stride in time dim for DepthwiseConv1D
    filter_separable: use separable conv or standard conv
    residual: if True residual connection is added
    padding: can be 'same' or 'causal'
    dropout: dropout value
    activation: type of activation function (string)
    scale: apply scaling in batchnormalization layer

  Returns:
    output tensor

  Raises:
    ValueError: if padding has invalid value
  """
  if residual and (padding not in ('same', 'causal')):
    raise ValueError('padding should be same or causal')

  def _conv_in_time(net):
    # One 1D-in-time conv step, shared by the repeated and the final
    # sub-blocks (previously duplicated inline).
    if filter_separable:
      # apply separable conv
      if kernel_size > 0:
        # DepthwiseConv1D: streamable DepthwiseConv2D with 1-wide freq kernel
        net = stream.Stream(
            cell=tf.keras.layers.DepthwiseConv2D(
                kernel_size=(kernel_size, 1),
                strides=(stride, 1),
                padding='valid',
                dilation_rate=(dilation, 1),
                use_bias=False),
            pad_time_dim=padding)(net)
      # Conv1D 1x1 - streamable by default
      net = tf.keras.layers.Conv2D(
          filters=filters, kernel_size=1, use_bias=False, padding='valid')(net)
    else:
      # apply 1D conv in time
      # NOTE(review): stride is not applied on this path and kernel_size is
      # not guarded by > 0; preserved from the original — confirm intent.
      net = stream.Stream(
          cell=tf.keras.layers.Conv2D(
              filters=filters,
              kernel_size=(kernel_size, 1),
              dilation_rate=(dilation, 1),
              padding='valid',
              activation='linear',
              use_bias=False),
          pad_time_dim=padding)(net)
    return net

  net = inputs
  # repeat-1 full sub-blocks with activation and dropout
  for _ in range(repeat - 1):
    net = _conv_in_time(net)
    net = tf.keras.layers.BatchNormalization(scale=scale)(net)
    net = tf.keras.layers.Activation(activation)(net)
    net = tf.keras.layers.Dropout(rate=dropout)(net)

  # last sub-block: activation is applied only after the residual add
  net = _conv_in_time(net)
  net = tf.keras.layers.BatchNormalization(scale=scale)(net)

  if residual:
    # Conv1D 1x1 - streamable by default; projects input for the shortcut
    net_res = tf.keras.layers.Conv2D(
        filters=filters, kernel_size=1, use_bias=False,
        padding='valid')(inputs)
    net_res = tf.keras.layers.BatchNormalization(scale=scale)(net_res)
    net = tf.keras.layers.Add()([net, net_res])

  net = tf.keras.layers.Activation(activation)(net)
  net = tf.keras.layers.Dropout(rate=dropout)(net)
  return net
def model(flags):
  """MatchboxNet model.

  It is based on paper
  MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network
  Architecture for Speech Commands Recognition
  https://arxiv.org/pdf/2004.08531.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any of input list has different length from any other;
      or if padding is not supported
  """
  ds_filters = parse(flags.ds_filters)
  ds_repeat = parse(flags.ds_repeat)
  ds_kernel_size = parse(flags.ds_kernel_size)
  ds_stride = parse(flags.ds_stride)
  ds_dilation = parse(flags.ds_dilation)
  ds_residual = parse(flags.ds_residual)
  ds_pool = parse(flags.ds_pool)
  ds_padding = parse(flags.ds_padding)
  ds_filter_separable = parse(flags.ds_filter_separable)

  # every per-block parameter list must line up with ds_filters
  per_block_lists = (ds_repeat, ds_kernel_size, ds_stride, ds_dilation,
                     ds_residual, ds_pool, ds_padding, ds_filter_separable)
  if any(len(ds_filters) != len(lst) for lst in per_block_lists):
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  x = input_audio

  if flags.preprocess == 'raw':
    # self contained model: computes speech features from raw audio,
    # so only raw audio needs to be fed
    x = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(x)

  # make it [batch, time, 1, feature]
  x = tf.keras.backend.expand_dims(x, axis=2)

  # encoder: stack of residual blocks, optionally followed by time pooling
  for filters, repeat, ksize, stride, sep, dilation, res, pool, pad in zip(
      ds_filters, ds_repeat, ds_kernel_size, ds_stride, ds_filter_separable,
      ds_dilation, ds_residual, ds_pool, ds_padding):
    x = resnet_block(x, repeat, ksize, filters, dilation, stride, sep, res,
                     pad, flags.dropout, flags.activation, flags.ds_scale)
    if pool > 1:
      pool_cls = (
          tf.keras.layers.MaxPooling2D
          if flags.ds_max_pool else tf.keras.layers.AveragePooling2D)
      x = pool_cls(pool_size=(pool, 1), strides=(pool, 1))(x)

  # decoder: global pooling + classifier head
  x = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(x)
  x = tf.keras.layers.Flatten()(x)
  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def conv_model(flags, cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding, dilation=1): """Toy convolutional model with sequence of convs with different paddings. It can be used for speech enhancement. Args: flags: model and data settings cnn_filters: list of filters in conv layer cnn_kernel_size: list of kernel_size in conv layer cnn_act: list of activation functions in conv layer cnn_use_bias: list of use_bias in conv layer cnn_padding: list of padding in conv layer dilation: dilation applied on all conv layers Returns: Keras model and sum delay Raises: ValueError: if any of input list has different length from any other or padding in not [same, causal] """ if not all( len(cnn_filters) == len(l) for l in [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding]): raise ValueError('all input lists have to be the same length') # it is an example of deep conv model for speech enhancement # which can be trained in non streaming mode and converted to streaming mode input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ), batch_size=flags.batch_size) net = input_audio sum_delay = 0 sum_shift = 0 net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, use_bias, padding in zip( cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding): time_buffer_size = dilation * (kernel_size - 1) if padding == 'same': # need a delay with 'same' padding in streaming mode delay_val = time_buffer_size // 2 net = delay.Delay(delay=delay_val)(net) sum_delay += delay_val * 2 elif padding == 'causal': sum_shift += kernel_size else: raise ValueError('wrong padding mode ', padding) # it is a ring buffer in streaming mode and lambda x during training net = stream.Stream(cell=tf.keras.layers.Conv1D( filters=filters, kernel_size=kernel_size, activation=activation, use_bias=use_bias, padding='valid'), use_one_step=False, pad_time_dim=padding)(net) return tf.keras.Model(input_audio, net), sum_delay, sum_shift
def residual_model(flags, cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias,
                   cnn_padding):
  """Toy deep convolutional model with residual connections.

  It can be used for speech enhancement.

  Args:
    flags: model and data settings
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_use_bias: list of use_bias in conv layer
    cnn_padding: list of padding in conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of input list has different length from any other
  """
  if not all(
      len(cnn_filters) == len(l) for l in
      [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding]):
    raise ValueError('all input lists have to be the same length')

  # it is an example of deep conv model for speech enhancement
  # which can be trained in non streaming mode and converted to streaming mode
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio

  # add channels dim for 1D conv
  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, use_bias, padding in zip(
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
    # time receptive field of the conv, minus the current sample
    ring_buffer_size_in_time_dim = (kernel_size - 1)

    # it is a ring buffer in streaming mode and lambda x during training
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=ring_buffer_size_in_time_dim,
        use_one_step=False,
        pad_time_dim=None)(net)

    # residual connection in streaming mode needs:
    # * kernel size in time dim of conv layer
    # * padding mode which was used to pad data in time dim
    net_residual = residual.Residual(
        padding=padding,
        kernel_size_time=ring_buffer_size_in_time_dim + 1)(net)

    # it is easier to convert model to streaming mode when padding function
    # is decoupled from conv layer
    net = temporal_padding.TemporalPadding(
        padding=padding, padding_size=ring_buffer_size_in_time_dim)(net)
    net = tf.keras.layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        use_bias=use_bias,
        padding='valid')(
            net)  # padding has to be valid!

    net = tf.keras.layers.Add()([net, net_residual])

  return tf.keras.Model(input_audio, net)
def residual_model(flags,
                   cnn_filters,
                   cnn_kernel_size,
                   cnn_act,
                   cnn_use_bias,
                   cnn_padding,
                   delay_also_in_non_streaming,
                   dilation=1):
  """Toy deep convolutional model with residual connections.

  It can be used for speech enhancement.

  Args:
    flags: model and data settings
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_use_bias: list of use_bias in conv layer
    cnn_padding: list of padding in conv layer
    delay_also_in_non_streaming: Whether to apply delay also in non-streaming.
    dilation: dilation applied on all conv layers

  Returns:
    Keras model and sum delay

  Raises:
    ValueError: if any of input list has different length from any other
      or padding in not [same, causal]
  """
  if not all(
      len(cnn_filters) == len(l) for l in
      [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding]):
    raise ValueError('all input lists have to be the same length')

  # it is an example of deep conv model for speech enhancement
  # which can be trained in non streaming mode and converted to streaming mode
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio
  sum_delay = 0  # total delay introduced by 'same' padded conv layers

  # add channels dim for 1D conv
  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, use_bias, padding in zip(
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
    # time receptive field of the conv, minus the current sample
    time_buffer_size = dilation * (kernel_size - 1)

    if padding == 'causal':
      # residual connection is simple with 'causal' padding
      net_residual = net
    elif padding == 'same':
      # residual connection in streaming mode needs delay with 'same' padding
      delay_val = time_buffer_size // 2
      net_residual = delay.Delay(
          delay=delay_val,
          also_in_non_streaming=delay_also_in_non_streaming)(net)
      sum_delay += delay_val
    else:
      raise ValueError('wrong padding mode ', padding)

    # it is easier to convert model to streaming mode when padding function
    # is decoupled from conv layer; with delay_also_in_non_streaming the
    # padding is forced to 'causal' to match the delayed residual path
    net = temporal_padding.TemporalPadding(
        padding='causal' if delay_also_in_non_streaming else padding,
        padding_size=time_buffer_size)(net)
    # it is a ring buffer in streaming mode and lambda x during training
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=time_buffer_size,
        use_one_step=False,
        pad_time_dim=None)(net)
    net = tf.keras.layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        use_bias=use_bias,
        padding='valid')(
            net)  # padding has to be valid!

    net = tf.keras.layers.Add()([net, net_residual])

  return tf.keras.Model(input_audio, net), sum_delay
def transposed_conv_model(flags, cnn_filters, cnn_kernel_size, cnn_act,
                          cnn_use_bias, cnn_paddings, trans_paddings):
  """Toy deep convolutional model with transposed convolutions.

  It can be used for speech enhancement.

  Args:
    flags: model and data settings
    cnn_filters: list of filters for conv layer
    cnn_kernel_size: list of kernel_size for conv layer
    cnn_act: list of activation functions for conv layer
    cnn_use_bias: list of use_bias for conv layer
    cnn_paddings: list of padding for conv layer
    trans_paddings: list of padding for transposed conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of input list has different length from any other
      or padding in not [same, causal]
  """
  if not all(
      len(cnn_filters) == len(l) for l in [
          cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_paddings,
          trans_paddings
      ]):
    raise ValueError('all input lists have to be the same length')

  # it is an example of deep conv model for speech enhancement
  # which can be trained in non streaming mode and converted to streaming mode
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio

  # add channels dim for 1D conv
  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, use_bias, padding, trans_padding in zip(
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_paddings,
      trans_paddings):
    # time receptive field (minus one) of the following 1D conv
    time_buffer_size = kernel_size - 1

    # transposed conv upsamples time by 2x; done as a streamable
    # Conv2DTranspose with fixed (3, 1) kernel on a temporary 4D tensor
    net = tf.keras.backend.expand_dims(net, axis=-2)
    net = stream.Stream(
        cell=tf.keras.layers.Conv2DTranspose(
            filters=filters,
            kernel_size=(3, 1),
            strides=(2, 1),
            padding='valid'),
        pad_time_dim=trans_padding)(net)
    net = tf.keras.backend.squeeze(net, axis=-2)

    if padding == 'same':
      # model looking into future, so introducing delay for streaming mode
      net = delay.Delay(delay=time_buffer_size // 2)(net)
    elif padding != 'causal':
      raise ValueError('wrong padding mode ', padding)

    # it is a ring buffer in streaming mode and lambda x during training
    net = stream.Stream(
        cell=tf.keras.layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            use_bias=use_bias,
            padding='valid'),
        use_one_step=False,
        pad_time_dim=padding)(net)
  return tf.keras.Model(input_audio, net)