def ds_tc_resnet_model_params(use_tf_fft=False):
  """Generate parameters for ds_tc_resnet model."""

  # model parameters
  model_name = 'ds_tc_resnet'
  params = model_params.HOTWORD_MODEL_PARAMS[model_name]
  params.causal_data_frame_padding = 1  # causal padding on DataFrame
  params.clip_duration_ms = 160
  params.use_tf_fft = use_tf_fft
  params.mel_non_zero_only = not use_tf_fft
  params.feature_type = 'mfcc_tf'
  params.window_size_ms = 5.0
  params.window_stride_ms = 2.0
  params.wanted_words = 'a,b,c'
  params.ds_padding = "'causal','causal','causal','causal'"
  params.ds_filters = '4,4,4,2'
  params.ds_repeat = '1,1,1,1'
  params.ds_residual = '0,1,1,1'  # no residuals on strided layers
  params.ds_kernel_size = '3,3,3,1'
  params.ds_dilation = '1,1,1,1'
  params.ds_stride = '2,1,1,1'  # streaming conv with stride
  params.ds_pool = '1,2,1,1'  # streaming conv with pool
  params.ds_filter_separable = '1,1,1,1'

  # convert ms to samples and compute labels count
  params = model_flags.update_flags(params)

  # compute total stride
  pools = model_utils.parse(params.ds_pool)
  strides = model_utils.parse(params.ds_stride)
  time_stride = [1]
  for pool in pools:
    if pool > 1:
      time_stride.append(pool)
  for stride in strides:
    if stride > 1:
      time_stride.append(stride)
  total_stride = np.prod(time_stride)

  # override input data shape for streaming model with stride/pool
  params.data_stride = total_stride
  params.data_shape = (total_stride * params.window_stride_samples,)

  # set desired number of frames in model
  frames_number = 16
  frames_per_call = total_stride
  frames_number = (frames_number // frames_per_call) * frames_per_call
  # number of input audio samples required to produce one output frame
  framing_stride = max(
      params.window_stride_samples,
      max(0, params.window_size_samples - params.window_stride_samples))
  signal_size = framing_stride * frames_number

  # desired number of samples in the input data to train non streaming model
  params.desired_samples = signal_size
  params.batch_size = 1
  return params
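# A minimal worked sketch of the stride arithmetic above, assuming the default
# 16 kHz sample rate (so the 2.0 ms stride becomes 32 samples and the 5.0 ms
# window becomes 80 samples; those two values are assumptions, everything else
# follows the code above):
import numpy as np

time_stride = [1, 2, 2]  # pool '1,2,1,1' and stride '2,1,1,1' each add one factor > 1
total_stride = np.prod(time_stride)  # 4
window_stride_samples = 32  # assumed: 2.0 ms at 16 kHz
data_shape = (total_stride * window_stride_samples,)  # 128 samples per streaming call

frames_number = (16 // total_stride) * total_stride  # 16, already a multiple of 4
framing_stride = max(32, max(0, 80 - 32))  # 48 samples per output frame
signal_size = framing_stride * frames_number  # 768 samples per training clip
assert data_shape == (128,) and signal_size == 768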
def model(flags):
  """Fully connected layer based model.

  It is based on paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  for units, activation in zip(utils.parse(flags.units1),
                               utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm, etc.
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(utils.parse(flags.units2),
                               utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Fully connected layer based model on raw wav data.

  It is based on paper (with added pooling and raw audio data):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if the input audio is not raw
  """
  if flags.preprocess != 'raw':
    raise ValueError('input audio has to be raw, but got: %s' %
                     flags.preprocess)

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = data_frame.DataFrame(
      frame_size=flags.window_size_samples,
      frame_step=flags.window_stride_samples)(input_audio)

  for units, activation in zip(utils.parse(flags.units1),
                               utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm, etc.
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(utils.parse(flags.units2),
                               utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """LSTM model.

  A similar model appears in:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (but with no conv layer)
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  for units, return_sequences, num_proj in zip(
      utils.parse(flags.lstm_units), utils.parse(flags.return_sequences),
      utils.parse(flags.num_proj)):
    net = lstm.LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(utils.parse(flags.units1),
                               utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """SVDF model.

  This model is based on decomposition of densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING
  https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size),
          utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout),
          utils.parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=padding,
        name='svdf_%d' % i)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(utils.parse(flags.units2),
                               utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Inception resnet model.

  It is based on paper:
  Inception-v4, Inception-ResNet and the Impact of
  Residual Connections on Learning https://arxiv.org/abs/1602.07261

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)
  # [batch, time, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)
  # [batch, time, 1, feature]

  for filters, kernel_size, stride in zip(
      utils.parse(flags.cnn1_filters),
      utils.parse(flags.cnn1_kernel_sizes),
      utils.parse(flags.cnn1_strides)):
    net = utils.conv2d_bn(
        net, filters, (kernel_size, 1), scale=flags.bn_scale, padding='valid')
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)
  # [batch, time, 1, filters]

  for stride, scale, filters_branch0, filters_branch1, filters_branch2, kernel_size in zip(
      utils.parse(flags.cnn2_strides), utils.parse(flags.cnn2_scales),
      utils.parse(flags.cnn2_filters_branch0),
      utils.parse(flags.cnn2_filters_branch1),
      utils.parse(flags.cnn2_filters_branch2),
      utils.parse(flags.cnn2_kernel_sizes)):
    net = inception_resnet_block(
        net, scale, filters_branch0, filters_branch1, kernel_size,
        bn_scale=flags.bn_scale)
    net = utils.conv2d_bn(
        net, filters_branch2, (1, 1), scale=flags.bn_scale, padding='valid')
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1),
                                         padding='valid')(net)
  # [batch, time, 1, filters]

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, filters]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Mobilenet V2 model.

  It is based on paper:
  MobileNetV2: Inverted Residuals and Linear Bottlenecks
  https://arxiv.org/abs/1801.04381
  It is applied to a sequence in time, so only 1D filters are used.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)
  # [batch, time, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)
  # [batch, time, feature, 1]

  # conv block
  net = tf.keras.layers.Conv2D(
      filters=flags.cnn1_filters,
      kernel_size=utils.parse(flags.cnn1_kernel_size),
      padding='valid',
      use_bias=False,
      strides=utils.parse(flags.cnn1_strides))(net)
  net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
  net = tf.keras.layers.ReLU(6.)(net)
  # [batch, time, feature, filters]

  for kernel_size, stride, filters, expansion in zip(
      utils.parse(flags.ds_kernel_size), utils.parse(flags.cnn_strides),
      utils.parse(flags.cnn_filters), utils.parse(flags.cnn_expansions)):
    # inverted residual block
    net_input = net
    in_channels = tf.keras.backend.int_shape(net_input)[-1]

    # expand
    net = tf.keras.layers.Conv2D(
        expansion * in_channels,
        kernel_size=1,
        padding='same',
        use_bias=False,
        activation=None)(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)
    # [batch, time, feature, filters]

    # depthwise
    net = tf.keras.layers.DepthwiseConv2D(
        kernel_size=kernel_size,
        strides=stride,
        activation=None,
        use_bias=False,
        padding='same')(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)

    # project
    net = tf.keras.layers.Conv2D(
        filters,
        kernel_size=1,
        padding='same',
        use_bias=False,
        activation=None)(net)
    net = tf.keras.layers.BatchNormalization()(net)

    if in_channels == filters and stride == (1, 1):
      net = tf.keras.layers.Add()([net_input, net])
  # [batch, time, feature, filters]

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, filters]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """SVDF model with residual connections.

  This model is based on decomposition of densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING
  https://arxiv.org/pdf/1812.02802.pdf
  In addition we added a residual connection.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if the number of pooling blocks is not 3
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  blocks_pool = utils.parse(flags.blocks_pool)
  if len(blocks_pool) != 3:
    raise ValueError('number of pooling blocks has to be 3, but got: %s' %
                     len(blocks_pool))

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  # first residual block
  number_of_blocks = len(utils.parse(flags.block1_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block1_units1),
          utils.parse(flags.block1_memory_size), activations)):
    # [batch, time, feature]
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_1_%d' % i)(net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block1_units1)[-1]
  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  # [batch, time, feature]
  net = tf.keras.layers.Activation(flags.activation)(net)
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[0], strides=blocks_pool[0], padding='valid')(net)

  # second residual block
  number_of_blocks = len(utils.parse(flags.block2_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block2_units1),
          utils.parse(flags.block2_memory_size), activations)):
    # [batch, time, feature]
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_2_%d' % i)(net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block2_units1)[-1]
  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  net = tf.keras.layers.Activation(flags.activation)(net)
  # [batch, time, feature]
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[1], strides=blocks_pool[1], padding='valid')(net)

  # third residual block
  number_of_blocks = len(utils.parse(flags.block3_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block3_units1),
          utils.parse(flags.block3_memory_size), activations)):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_3_%d' % i)(net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block3_units1)[-1]
  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  net = tf.keras.layers.Activation(flags.activation)(net)
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[2], strides=blocks_pool[2], padding='valid')(net)
  # [batch, time, feature]

  # convert all features to one vector
  if flags.flatten:
    net = stream.Stream(use_one_step=False,
                        cell=tf.keras.layers.Flatten())(net)
  else:
    net = tf.keras.backend.expand_dims(net, axis=2)
    net = stream.Stream(
        use_one_step=False,
        cell=tf.keras.layers.AveragePooling2D(
            pool_size=(int(net.shape[1]), int(net.shape[2]))))(net)

  net = tf.keras.layers.Flatten()(net)
  # [batch, feature]
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units in utils.parse(flags.units2):
    net = tf.keras.layers.Dense(units=units, activation=flags.activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def main(_):
  # Update flags
  flags = model_flags.update_flags(FLAGS)

  if flags.train:
    # Create model folders where logs and model will be stored
    os.makedirs(flags.train_dir)
    os.mkdir(flags.summaries_dir)

    # Model training
    train.train(flags)
  else:
    if not os.path.isdir(flags.train_dir):
      raise ValueError('model is not trained; set "--train 1" and retrain it')

  # write all flag settings into json
  with open(os.path.join(flags.train_dir, 'flags.json'), 'wt') as f:
    json.dump(flags.__dict__, f)

  # convert to SavedModel
  test.convert_model_saved(flags, 'non_stream',
                           modes.Modes.NON_STREAM_INFERENCE)
  try:
    test.convert_model_saved(flags, 'stream_state_internal',
                             modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  except (ValueError, IndexError) as e:
    logging.info('FAILED to run TF streaming: %s', e)

  logging.info('run TF non streaming model accuracy evaluation')
  # with TF
  folder_name = 'tf'
  test.tf_non_stream_model_accuracy(flags, folder_name)

  # with TF.
  # We can apply a non streaming model on streaming data by running inference
  # every 200ms (for example), so that the total latency is similar to a
  # streaming model which is executed every 20ms.
  # To measure the impact of sampling on model accuracy,
  # we introduce time_shift_ms during accuracy evaluation.
  # Convert milliseconds to samples:
  time_shift_samples = int(
      (flags.time_shift_ms * flags.sample_rate) / model_flags.MS_PER_SECOND)
  test.tf_non_stream_model_accuracy(
      flags,
      folder_name,
      time_shift_samples,
      accuracy_name='tf_non_stream_model_sampling_stream_accuracy.txt')

  name2opt = {
      '': None,
      'quantize_opt_for_size_': [tf.lite.Optimize.DEFAULT],
  }

  for opt_name, optimizations in name2opt.items():

    if (opt_name and flags.feature_type == 'mfcc_tf' and
        flags.preprocess == 'raw'):
      logging.info('feature type mfcc_tf needs quantization aware training '
                   'for quantization - it is not implemented')
      continue

    folder_name = opt_name + 'tflite_non_stream'
    file_name = 'non_stream.tflite'
    mode = modes.Modes.NON_STREAM_INFERENCE
    test.convert_model_tflite(flags, folder_name, mode, file_name,
                              optimizations=optimizations)
    test.tflite_non_stream_model_accuracy(flags, folder_name, file_name)

    # these models use bi-rnn, so they are non streamable by default;
    # models using striding or pooling are also not supported for streaming now
    non_streamable_models = {'att_mh_rnn', 'att_rnn', 'tc_resnet'}

    model_is_streamable = True
    if flags.model_name in non_streamable_models:
      model_is_streamable = False
    # below models can use striding in the time dimension,
    # but this is currently unsupported
    elif flags.model_name == 'cnn':
      for strides in model_utils.parse(flags.cnn_strides):
        if strides[0] > 1:
          model_is_streamable = False
          break
    elif flags.model_name == 'ds_cnn':
      if model_utils.parse(flags.cnn1_strides)[0] > 1:
        model_is_streamable = False
      for strides in model_utils.parse(flags.dw2_strides):
        if strides[0] > 1:
          model_is_streamable = False
          break

    # set input data shape for testing inference in streaming mode
    flags.data_shape = modes.get_input_data_shape(
        flags, modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE)

    # if the model can be streamed, run conversion/evaluation in streaming mode
    if model_is_streamable:

      # --------------- TF streaming model accuracy evaluation ---------------
      # Streaming model with external state evaluation using TF with state reset
      if not opt_name:
        logging.info('run TF evaluation only without '
                     'optimization/quantization')
        try:
          folder_name = 'tf'
          test.tf_stream_state_external_model_accuracy(
              flags,
              folder_name,
              accuracy_name='stream_state_external_model_accuracy_sub_set_reset1.txt',
              reset_state=True)  # with state reset between test sequences

          # Streaming (with external state) evaluation using TF no state reset
          test.tf_stream_state_external_model_accuracy(
              flags,
              folder_name,
              accuracy_name='stream_state_external_model_accuracy_sub_set_reset0.txt',
              reset_state=False)  # without state reset

          # Streaming (with internal state) evaluation using TF no state reset
          test.tf_stream_state_internal_model_accuracy(flags, folder_name)
        except (ValueError, IndexError) as e:
          logging.info('FAILED to run TF streaming: %s', e)

      logging.info('run TFlite streaming model accuracy evaluation')
      try:
        # convert model to TFlite
        folder_name = opt_name + 'tflite_stream_state_external'
        file_name = 'stream_state_external.tflite'
        mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
        test.convert_model_tflite(flags, folder_name, mode, file_name,
                                  optimizations=optimizations)

        # Streaming model accuracy evaluation with TFLite with state reset
        test.tflite_stream_state_external_model_accuracy(
            flags,
            folder_name,
            file_name,
            accuracy_name='tflite_stream_state_external_model_accuracy_reset1.txt',
            reset_state=True)

        # Streaming model accuracy evaluation with TFLite without state reset
        test.tflite_stream_state_external_model_accuracy(
            flags,
            folder_name,
            file_name,
            accuracy_name='tflite_stream_state_external_model_accuracy_reset0.txt',
            reset_state=False)
      except (ValueError, IndexError) as e:
        logging.info('FAILED to run TFLite streaming: %s', e)
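# A concrete instance of the ms-to-samples conversion used in main() above
# (a minimal sketch; the 16 kHz sample rate and MS_PER_SECOND = 1000 are
# assumed defaults, and time_shift_ms = 100 is a hypothetical value):
time_shift_ms = 100
sample_rate = 16000
MS_PER_SECOND = 1000

time_shift_samples = int((time_shift_ms * sample_rate) / MS_PER_SECOND)
assert time_shift_samples == 1600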
def model(flags):
  """BiRNN multihead attention model.

  It is based on papers:
  Attention Is All You Need
  https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf
  A neural attention model for speech command recognition
  https://arxiv.org/pdf/1808.08929.pdf
  Depending on the parameter rnn_type, the model can be biLSTM or biGRU.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if rnn_type is not supported
  """

  rnn_types = {'lstm': tf.keras.layers.LSTM, 'gru': tf.keras.layers.GRU}

  if flags.rnn_type not in rnn_types:
    raise ValueError('not supported RNN type: %s' % flags.rnn_type)
  rnn = rnn_types[flags.rnn_type]

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
      utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
      utils.parse(flags.cnn_strides)):
    net = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides,
        padding='same',
        kernel_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay),
        bias_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay))(net)
    net = tf.keras.layers.BatchNormalization()(net)

  shape = net.shape
  # input net dimension: [batch, time, feature, channels]
  # reshape dimension: [batch, time, feature * channels]
  # so that GRU/RNN can process it
  net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

  # dims: [batch, time, feature]
  for _ in range(flags.rnn_layers):
    net = tf.keras.layers.Bidirectional(
        rnn(flags.rnn_units,
            return_sequences=True,
            unroll=True,
            kernel_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay),
            bias_regularizer=tf.keras.regularizers.l2(
                flags.l2_weight_decay)))(net)

  feature_dim = net.shape[-1]
  middle = net.shape[1] // 2  # index of middle point of sequence

  # feature vector at middle point [batch, feature]
  mid_feature = net[:, middle, :]

  # prepare multihead attention
  multiheads = []
  for _ in range(flags.heads):
    # apply one projection layer with the same dim as input feature
    query = tf.keras.layers.Dense(
        feature_dim,
        kernel_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay),
        bias_regularizer=tf.keras.regularizers.l2(
            flags.l2_weight_decay))(mid_feature)

    # attention weights [batch, time]
    att_weights = tf.keras.layers.Dot(axes=[1, 2])([query, net])
    att_weights = tf.keras.layers.Softmax()(att_weights)
    # apply attention weights [batch, feature]
    multiheads.append(tf.keras.layers.Dot(axes=[1, 1])([att_weights, net]))

  net = tf.keras.layers.concatenate(multiheads)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(utils.parse(flags.units2),
                               utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(
        units=units,
        activation=activation,
        kernel_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay),
        bias_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay))(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
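# The two Dot layers above implement attention purely by shape contraction;
# a self-contained toy check of those shapes (standard Keras Dot semantics,
# toy dimensions chosen for illustration):
import tensorflow as tf

batch, time, feature = 2, 10, 8
query = tf.random.normal((batch, feature))
seq = tf.random.normal((batch, time, feature))

# Dot(axes=[1, 2]) contracts the feature dims -> attention scores [batch, time]
scores = tf.keras.layers.Dot(axes=[1, 2])([query, seq])
assert tuple(scores.shape) == (batch, time)

weights = tf.keras.layers.Softmax()(scores)

# Dot(axes=[1, 1]) takes a weighted sum over time -> context [batch, feature]
context = tf.keras.layers.Dot(axes=[1, 1])([weights, seq])
assert tuple(context.shape) == (batch, feature)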
def init_model(self, use_tf_fft=False):
  config = tf1.ConfigProto()
  config.gpu_options.allow_growth = True
  self.sess = tf1.Session(config=config)
  tf1.keras.backend.set_session(self.sess)
  test_utils.set_seed(123)
  tf.keras.backend.set_learning_phase(0)

  # model parameters
  model_name = 'ds_tc_resnet'
  self.params = model_params.HOTWORD_MODEL_PARAMS[model_name]
  self.params.causal_data_frame_padding = 1  # causal padding on DataFrame
  self.params.clip_duration_ms = 160
  self.params.use_tf_fft = use_tf_fft
  self.params.mel_non_zero_only = not use_tf_fft
  self.params.feature_type = 'mfcc_tf'
  self.params.window_size_ms = 5.0
  self.params.window_stride_ms = 2.0
  self.params.wanted_words = 'a,b,c'
  self.params.ds_padding = "'causal','causal','causal','causal'"
  self.params.ds_filters = '4,4,4,2'
  self.params.ds_repeat = '1,1,1,1'
  self.params.ds_residual = '0,1,1,1'  # no residuals on strided layers
  self.params.ds_kernel_size = '3,3,3,1'
  self.params.ds_dilation = '1,1,1,1'
  self.params.ds_stride = '2,1,1,1'  # streaming conv with stride
  self.params.ds_pool = '1,2,1,1'  # streaming conv with pool
  self.params.ds_filter_separable = '1,1,1,1'

  # convert ms to samples and compute labels count
  self.params = model_flags.update_flags(self.params)

  # compute total stride
  pools = model_utils.parse(self.params.ds_pool)
  strides = model_utils.parse(self.params.ds_stride)
  time_stride = [1]
  for pool in pools:
    if pool > 1:
      time_stride.append(pool)
  for stride in strides:
    if stride > 1:
      time_stride.append(stride)
  total_stride = np.prod(time_stride)

  # override input data shape for streaming model with stride/pool
  self.params.data_stride = total_stride
  self.params.data_shape = (total_stride *
                            self.params.window_stride_samples,)

  # set desired number of frames in model
  frames_number = 16
  frames_per_call = total_stride
  frames_number = (frames_number // frames_per_call) * frames_per_call
  # number of input audio samples required to produce one output frame
  framing_stride = max(
      self.params.window_stride_samples,
      max(0,
          self.params.window_size_samples -
          self.params.window_stride_samples))
  signal_size = framing_stride * frames_number

  # desired number of samples in the input data to train non streaming model
  self.params.desired_samples = signal_size
  self.params.batch_size = 1

  self.model = ds_tc_resnet.model(self.params)
  self.model.summary()

  self.input_data = np.random.rand(self.params.batch_size,
                                   self.params.desired_samples)

  # run non streaming inference
  self.non_stream_out = self.model.predict(self.input_data)
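# A self-contained check of the streaming chunking implied by the setup above
# (the 32-sample stride and 48-sample framing stride assume the default
# 16 kHz sample rate, as in the worked example near the top of this section):
stream_step = 4 * 32       # total_stride * window_stride_samples == data_shape[0]
desired_samples = 48 * 16  # framing_stride * frames_number

# the non streaming training clip divides exactly into streaming calls
assert desired_samples % stream_step == 0
assert desired_samples // stream_step == 6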
def model(flags):
  """Xception model.

  It is based on papers:
  Xception: Deep Learning with Depthwise Separable Convolutions
  https://arxiv.org/abs/1610.02357
  MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network
  Architecture for Speech Commands Recognition
  https://arxiv.org/pdf/2004.08531

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)
  # [batch, time, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)
  # [batch, time, 1, feature]

  # conv block
  for kernel_size, filters in zip(utils.parse(flags.cnn1_kernel_sizes),
                                  utils.parse(flags.cnn1_filters)):
    net = tf.keras.layers.Conv2D(filters, (kernel_size, 1),
                                 use_bias=False)(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.Activation('relu')(net)
  # [batch, time, 1, feature]

  if flags.stride1 > 1:
    net = tf.keras.layers.MaxPooling2D((3, 1),
                                       strides=(flags.stride1, 1),
                                       padding='valid')(net)

  net = block(net, utils.parse(flags.cnn2_kernel_sizes),
              utils.parse(flags.cnn2_filters), flags.dropout, flags.bn_scale)

  if flags.stride2 > 1:
    net = tf.keras.layers.MaxPooling2D((3, 1),
                                       strides=(flags.stride2, 1),
                                       padding='valid')(net)

  net = block(net, utils.parse(flags.cnn3_kernel_sizes),
              utils.parse(flags.cnn3_filters), flags.dropout, flags.bn_scale)

  if flags.stride3 > 1:
    net = tf.keras.layers.MaxPooling2D((3, 1),
                                       strides=(flags.stride3, 1),
                                       padding='valid')(net)

  net = block(net, utils.parse(flags.cnn4_kernel_sizes),
              utils.parse(flags.cnn4_filters), flags.dropout, flags.bn_scale)

  if flags.stride4 > 1:
    net = tf.keras.layers.MaxPooling2D((3, 1),
                                       strides=(flags.stride4, 1),
                                       padding='valid')(net)

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, filters]
  net = tf.keras.layers.Dropout(flags.dropout)(net)

  for units in utils.parse(flags.units2):
    net = tf.keras.layers.Dense(units=units, activation=None,
                                use_bias=False)(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.Activation('relu')(net)

  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  # [batch, label_count]
  return tf.keras.Model(input_audio, net)
def model(flags):
  """CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  if flags.quantize:
    net = quantize_layer.QuantizeLayer(
        AllValuesQuantizer(
            num_bits=8, per_axis=False, symmetric=False,
            narrow_range=False))(net)

  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
      utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
      utils.parse(flags.cnn_strides)):
    net = stream.Stream(
        cell=quantize.quantize_layer(
            tf.keras.layers.Conv2D(
                filters=filters,
                kernel_size=kernel_size,
                dilation_rate=dilation_rate,
                activation='linear',
                strides=strides),
            flags.quantize,
            quantize.NoOpActivationConfig(['kernel'], ['activation'], False)),
        pad_time_dim='causal',
        use_one_step=False)(net)
    net = quantize.quantize_layer(
        tf.keras.layers.BatchNormalization(),
        default_8bit_quantize_configs.NoOpQuantizeConfig())(net)
    net = quantize.quantize_layer(tf.keras.layers.Activation(activation))(net)

  net = stream.Stream(
      cell=quantize.quantize_layer(
          tf.keras.layers.Flatten(), apply_quantization=flags.quantize))(net)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(utils.parse(flags.units2),
                               utils.parse(flags.act2)):
    net = quantize.quantize_layer(
        tf.keras.layers.Dense(units=units, activation=activation),
        apply_quantization=flags.quantize)(net)

  net = quantize.quantize_layer(
      tf.keras.layers.Dense(units=flags.label_count),
      apply_quantization=flags.quantize)(net)
  if flags.return_softmax:
    net = quantize.quantize_layer(
        tf.keras.layers.Activation('softmax'),
        apply_quantization=flags.quantize)(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """MatchboxNet model.

  It is based on paper
  MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network
  Architecture for Speech Commands Recognition
  https://arxiv.org/pdf/2004.08531.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any input list has a different length from any other;
      or if padding is not supported
  """
  ds_filters = utils.parse(flags.ds_filters)
  ds_repeat = utils.parse(flags.ds_repeat)
  ds_kernel_size = utils.parse(flags.ds_kernel_size)
  ds_stride = utils.parse(flags.ds_stride)
  ds_dilation = utils.parse(flags.ds_dilation)
  ds_residual = utils.parse(flags.ds_residual)
  ds_pool = utils.parse(flags.ds_pool)
  ds_padding = utils.parse(flags.ds_padding)
  ds_filter_separable = utils.parse(flags.ds_filter_separable)

  for l in (ds_repeat, ds_kernel_size, ds_stride, ds_dilation, ds_residual,
            ds_pool, ds_padding, ds_filter_separable):
    if len(ds_filters) != len(l):
      raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # make it [batch, time, 1, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)

  # encoder
  for filters, repeat, ksize, stride, sep, dilation, res, pool, pad in zip(
      ds_filters, ds_repeat, ds_kernel_size, ds_stride, ds_filter_separable,
      ds_dilation, ds_residual, ds_pool, ds_padding):
    net = resnet_block(net, repeat, ksize, filters, dilation, stride, sep,
                       res, pad, flags.dropout, flags.activation,
                       flags.ds_scale, flags.data_stride <= 1)
    if pool > 1:
      if flags.ds_max_pool:
        net = tf.keras.layers.MaxPooling2D(
            pool_size=(pool, 1), strides=(pool, 1))(net)
      else:
        net = tf.keras.layers.AveragePooling2D(
            pool_size=(pool, 1), strides=(pool, 1))(net)

  # decoder
  net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(net)
  net = tf.keras.layers.Flatten()(net)
  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
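# The ds_* flags above arrive as comma-separated strings that utils.parse
# turns into Python lists; a stand-in sketch of that convention
# (ast.literal_eval here is an assumption, not the repo's actual parser):
import ast

def parse_flag_string(text):
  # evaluate a comma-separated flag string into a Python list
  return list(ast.literal_eval('(' + text + ',)'))

assert parse_flag_string('4,4,4,2') == [4, 4, 4, 2]
assert parse_flag_string("'causal','causal','causal','causal'") == ['causal'] * 4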
def model(flags):
  """Depthwise convolutional model.

  It is based on paper:
  MobileNets: Efficient Convolutional Neural Networks for
  Mobile Vision Applications https://arxiv.org/abs/1704.04861
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  net = tf.keras.backend.expand_dims(net)
  net = stream.Stream(
      cell=tf.keras.layers.Conv2D(
          kernel_size=utils.parse(flags.cnn1_kernel_size),
          dilation_rate=utils.parse(flags.cnn1_dilation_rate),
          filters=flags.cnn1_filters,
          padding=flags.cnn1_padding,
          strides=utils.parse(flags.cnn1_strides)))(net)
  net = tf.keras.layers.BatchNormalization(
      momentum=flags.bn_momentum,
      center=flags.bn_center,
      scale=flags.bn_scale,
      renorm=flags.bn_renorm)(net)
  net = tf.keras.layers.Activation('relu')(net)

  for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip(
      utils.parse(flags.dw2_kernel_size), utils.parse(flags.dw2_act),
      utils.parse(flags.dw2_dilation_rate), utils.parse(flags.dw2_strides),
      utils.parse(flags.cnn2_filters), utils.parse(flags.cnn2_act)):
    net = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            padding=flags.dw2_padding,
            strides=strides))(net)
    net = tf.keras.layers.BatchNormalization(
        momentum=flags.bn_momentum,
        center=flags.bn_center,
        scale=flags.bn_scale,
        renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation(dw2_act)(net)
    net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net)
    net = tf.keras.layers.BatchNormalization(
        momentum=flags.bn_momentum,
        center=flags.bn_center,
        scale=flags.bn_scale,
        renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation(cnn2_act)(net)

  net = stream.Stream(
      cell=tf.keras.layers.AveragePooling2D(
          pool_size=(int(net.shape[1]), int(net.shape[2]))))(net)
  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)
  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Inception model.

  It is based on paper:
  Rethinking the Inception Architecture for Computer Vision
  http://arxiv.org/abs/1512.00567

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)
  # [batch, time, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)
  # [batch, time, 1, feature]

  for stride, filters, kernel_size in zip(
      utils.parse(flags.cnn1_strides), utils.parse(flags.cnn1_filters),
      utils.parse(flags.cnn1_kernel_sizes)):
    net = utils.conv2d_bn(
        net, filters, (kernel_size, 1), padding='valid', scale=flags.bn_scale)
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)

  for stride, filters1, filters2, kernel_size in zip(
      utils.parse(flags.cnn2_strides), utils.parse(flags.cnn2_filters1),
      utils.parse(flags.cnn2_filters2), utils.parse(flags.cnn2_kernel_sizes)):

    branch1 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)

    branch2 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)
    branch2 = utils.conv2d_bn(
        branch2, filters1, (kernel_size, 1), scale=flags.bn_scale)

    branch3 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)
    branch3 = utils.conv2d_bn(
        branch3, filters1, (kernel_size, 1), scale=flags.bn_scale)
    branch3 = utils.conv2d_bn(
        branch3, filters1, (kernel_size, 1), scale=flags.bn_scale)

    net = tf.keras.layers.concatenate([branch1, branch2, branch3])
    # [batch, time, 1, filters*4]
    net = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale)
    # [batch, time, 1, filters2]
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, filters*4]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Mobilenet model.

  It is based on paper:
  MobileNets: Efficient Convolutional Neural Networks for
  Mobile Vision Applications https://arxiv.org/abs/1704.04861
  It is applied to a sequence in time, so only 1D filters are used.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)
  # [batch, time, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)
  # [batch, time, feature, 1]

  # conv block
  net = tf.keras.layers.Conv2D(
      filters=flags.cnn1_filters,
      kernel_size=utils.parse(flags.cnn1_kernel_size),
      padding='valid',
      use_bias=False,
      strides=utils.parse(flags.cnn1_strides))(net)
  net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
  net = tf.keras.layers.ReLU(6.)(net)
  # [batch, time, feature, filters]

  for kernel_size, strides, filters in zip(
      utils.parse(flags.ds_kernel_size), utils.parse(flags.ds_strides),
      utils.parse(flags.cnn_filters)):
    # depthwise conv block
    net = tf.keras.layers.DepthwiseConv2D(
        kernel_size,
        padding='same' if strides == (1, 1) else 'valid',
        depth_multiplier=1,
        strides=strides,
        use_bias=False)(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)
    net = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=(1, 1),
        padding='same',
        use_bias=False,
        strides=(1, 1))(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)
  # [batch, time, feature, filters]

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, filters]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  # [batch, label_count]
  return tf.keras.Model(input_audio, net)
def model(flags):
  """BC-ResNet model.

  It is based on paper
  Broadcasted Residual Learning for Efficient Keyword Spotting
  https://arxiv.org/pdf/2106.04140.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any input list has a different length from any other;
      or if padding is not supported
  """
  dropouts = utils.parse(flags.dropouts)
  filters = utils.parse(flags.filters)
  blocks_n = utils.parse(flags.blocks_n)
  strides = utils.parse(flags.strides)
  dilations = utils.parse(flags.dilations)

  for l in (dropouts, filters, strides, dilations):
    if len(blocks_n) != len(l):
      raise ValueError('all input lists have to be the same length '
                       'but got %s and %s' % (blocks_n, l))

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # make it [batch, time, feature, 1]
  net = tf.keras.backend.expand_dims(net, axis=3)

  if flags.paddings == 'same':
    net = tf.keras.layers.Conv2D(
        filters=flags.first_filters,
        kernel_size=5,
        strides=(1, 2),
        padding='same')(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=flags.first_filters,
            kernel_size=5,
            strides=(1, 2),
            padding='valid'),
        use_one_step=True,
        pad_time_dim=flags.paddings,
        pad_freq_dim='same')(net)

  for n, n_filters, dilation, stride, dropout in zip(blocks_n, filters,
                                                     dilations, strides,
                                                     dropouts):
    net = TransitionBlock(
        n_filters, dilation, stride, flags.paddings, dropout,
        sub_groups=flags.sub_groups)(net)
    for _ in range(n):
      net = NormalBlock(
          n_filters, dilation, 1, flags.paddings, dropout,
          sub_groups=flags.sub_groups)(net)

  if flags.paddings == 'same':
    net = tf.keras.layers.DepthwiseConv2D(kernel_size=5, padding='same')(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(kernel_size=5, padding='valid'),
        use_one_step=True,
        pad_time_dim=flags.paddings,
        pad_freq_dim='same')(net)

  # average out frequency dim
  net = tf.keras.backend.mean(net, axis=2, keepdims=True)

  net = tf.keras.layers.Conv2D(
      filters=flags.last_filters, kernel_size=1, use_bias=False)(net)

  # average out time dim
  if flags.paddings == 'same':
    net = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(net)
  else:
    net = stream.Stream(
        cell=tf.keras.layers.GlobalAveragePooling2D(keepdims=True))(net)

  net = tf.keras.layers.Conv2D(
      filters=flags.label_count, kernel_size=1, use_bias=False)(net)

  # dims 1 and 2 are equal to 1
  net = tf.squeeze(net, [1, 2])

  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Temporal Convolution ResNet model.

  It is based on paper:
  Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
  https://arxiv.org/pdf/1904.03814.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  time_size, feature_size = net.shape[1:3]

  channels = utils.parse(flags.channels)

  net = tf.keras.backend.expand_dims(net)

  if flags.debug_2d:
    conv_kernel = first_conv_kernel = (3, 3)
  else:
    net = tf.reshape(
        net, [-1, time_size, 1, feature_size])  # [batch, time, 1, feature]
    first_conv_kernel = (3, 1)
    conv_kernel = utils.parse(flags.kernel_size)

  net = tf.keras.layers.Conv2D(
      filters=channels[0],
      kernel_size=first_conv_kernel,
      strides=1,
      padding='same',
      activation='linear')(net)
  net = tf.keras.layers.BatchNormalization(
      momentum=flags.bn_momentum,
      center=flags.bn_center,
      scale=flags.bn_scale,
      renorm=flags.bn_renorm)(net)
  net = tf.keras.layers.Activation('relu')(net)

  if utils.parse(flags.pool_size):
    net = tf.keras.layers.AveragePooling2D(
        pool_size=utils.parse(flags.pool_size),
        strides=flags.pool_stride)(net)

  channels = channels[1:]

  # residual blocks
  for n in channels:
    if n != net.shape[-1]:
      stride = 2
      layer_in = tf.keras.layers.Conv2D(
          filters=n,
          kernel_size=1,
          strides=stride,
          padding='same',
          activation='linear')(net)
      layer_in = tf.keras.layers.BatchNormalization(
          momentum=flags.bn_momentum,
          center=flags.bn_center,
          scale=flags.bn_scale,
          renorm=flags.bn_renorm)(layer_in)
      layer_in = tf.keras.layers.Activation('relu')(layer_in)
    else:
      layer_in = net
      stride = 1

    net = tf.keras.layers.Conv2D(
        filters=n,
        kernel_size=conv_kernel,
        strides=stride,
        padding='same',
        activation='linear')(net)
    net = tf.keras.layers.BatchNormalization(
        momentum=flags.bn_momentum,
        center=flags.bn_center,
        scale=flags.bn_scale,
        renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation('relu')(net)

    net = tf.keras.layers.Conv2D(
        filters=n,
        kernel_size=conv_kernel,
        strides=1,
        padding='same',
        activation='linear')(net)
    net = tf.keras.layers.BatchNormalization(
        momentum=flags.bn_momentum,
        center=flags.bn_center,
        scale=flags.bn_scale,
        renorm=flags.bn_renorm)(net)

    # residual connection
    net = tf.keras.layers.Add()([net, layer_in])
    net = tf.keras.layers.Activation('relu')(net)

  net = tf.keras.layers.AveragePooling2D(
      pool_size=net.shape[1:3], strides=1)(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout)(net)

  # fully connected layer
  net = tf.keras.layers.Conv2D(
      filters=flags.label_count,
      kernel_size=1,
      strides=1,
      padding='same',
      activation='linear')(net)
  net = tf.reshape(net, shape=(-1, net.shape[3]))
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Convolutional recurrent neural network (CRNN) model.

  It is based on paper
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf
  Represented as a sequence of Conv, RNN/GRU, FC layers.
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # expand dims for the next 2d conv layer
  net = tf.keras.backend.expand_dims(net)

  for filters, kernel_size, activation, dilation_rate, strides in zip(
      utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
      utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
      utils.parse(flags.cnn_strides)):
    net = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides))(net)

  shape = net.shape
  # input net dimension: [batch, time, feature, channels]
  # reshape dimension: [batch, time, feature * channels]
  # so that GRU/RNN can process it
  net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

  for units, return_sequences in zip(utils.parse(flags.gru_units),
                                     utils.parse(flags.return_sequences)):
    net = gru.GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(utils.parse(flags.units1),
                               utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
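# All model builders above share one contract: a parsed flags object in, an
# uncompiled tf.keras.Model out. A hedged usage sketch (the entry points
# follow the patterns in this file; exact flag plumbing may differ):
flags = model_flags.update_flags(FLAGS)  # convert ms flags to samples, etc.
keras_model = model(flags)               # any of the builders above
keras_model.summary()
keras_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    # builders return logits unless flags.return_softmax is set
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])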