# Imports below are an assumption, matching the kws_streaming package layout
# these model constructors come from.
from kws_streaming.layers import modes
from kws_streaming.layers import speech_features
from kws_streaming.layers.compat import tf
from kws_streaming.layers.gru import GRU
from kws_streaming.layers.stream import Stream
from kws_streaming.models.utils import parse


def model(flags):
  """Convolutional recurrent neural network (CRNN) model.

  It is based on the paper
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf
  Represented as a sequence of Conv, RNN/GRU, FC layers.
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # It is a self-contained model; the user only needs to feed raw audio.
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  # expand dims for the next layer's 2d conv
  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides)):
    net = Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides))(net)

  shape = net.shape
  # input net dimension: [batch, time, feature, channels]
  # reshape dimension: [batch, time, feature * channels]
  # so that GRU/RNN can process it
  net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

  for units, return_sequences in zip(
      parse(flags.gru_units), parse(flags.return_sequences)):
    net = GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
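
# The per-layer hyperparameters above arrive as comma-separated strings and
# are unpacked by parse(), yielding one entry per Conv2D layer. Below is a
# minimal stand-alone sketch of that pattern; ast.literal_eval is an
# assumption standing in for parse(), and all sizes are illustrative.
def _conv_stack_sketch():
  """Hypothetical demo of string-driven Conv2D stacking; not part of the model."""
  import ast

  def _parse(text):
    # '16,16' -> (16, 16); '(3,3),(5,3)' -> ((3, 3), (5, 3))
    res = ast.literal_eval(text)
    return res if isinstance(res, tuple) else (res,)

  cnn_filters = '16,16'
  cnn_kernel_size = '(3,3),(5,3)'

  inputs = tf.keras.layers.Input(shape=(98, 40, 1))  # [time, feature, channels]
  net = inputs
  for filters, kernel_size in zip(_parse(cnn_filters), _parse(cnn_kernel_size)):
    net = tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size)(net)
  # Two valid convs shrink time/feature: (98, 40) -> (96, 38) -> (92, 36).
  return tf.keras.Model(inputs, net)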

def model(flags):
  """Gated Recurrent Unit (GRU) model.

  It is based on the papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)(input_audio)

  for units, return_sequences in zip(
      parse(flags.gru_units), parse(flags.return_sequences)):
    net = GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)

def model(flags):
  """Gated Recurrent Unit (GRU) model.

  It is based on the paper
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # It is a self-contained model; the user only needs to feed raw audio.
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)

  for units, return_sequences in zip(
      parse(flags.gru_units), parse(flags.return_sequences)):
    net = GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
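
# The GRU stack above consumes [batch, time, feature] tensors; every layer
# except (typically) the last returns full sequences so the next GRU still
# sees a time axis. A minimal stand-alone sketch of that wiring, with plain
# tf.keras.layers.GRU standing in for the streaming-aware GRU wrapper; all
# sizes are illustrative.
def _gru_stack_sketch():
  """Hypothetical demo of stacked GRUs; not part of the model."""
  gru_units = (256, 256)
  return_sequences = (True, False)  # last layer emits only its final state

  inputs = tf.keras.layers.Input(shape=(98, 40))  # [time, feature]
  net = inputs
  for units, ret_seq in zip(gru_units, return_sequences):
    net = tf.keras.layers.GRU(units=units, return_sequences=ret_seq)(net)

  net = tf.keras.layers.Flatten()(net)  # no-op on [batch, 256]; kept for parity
  logits = tf.keras.layers.Dense(12)(net)  # label_count=12 is illustrative
  return tf.keras.Model(inputs, logits)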

def model(flags):
  """Gated Recurrent Unit (GRU) model.

  It is based on the papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      speech_features.SpeechFeatures.get_params(flags))(input_audio)

  for units, return_sequences in zip(
      parse(flags.gru_units), parse(flags.return_sequences)):
    net = GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)

def model(flags):
  """Convolutional recurrent neural network (CRNN) model.

  It is based on the papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf
  Represented as a sequence of Conv, RNN/GRU, FC layers.
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)(input_audio)

  # expand dims for the next layer's 2d conv
  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides)):
    net = Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides))(net)

  shape = net.shape
  # input net dimension: [batch, time, feature, channels]
  # reshape dimension: [batch, time, feature * channels]
  # so that GRU/RNN can process it
  net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

  for units, return_sequences in zip(
      parse(flags.gru_units), parse(flags.return_sequences)):
    net = GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)