def model(flags): """Convolutional recurrent neural network (CRNN) model. It is based on paper Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf Represented as sequence of Conv, RNN/GRU, FC layers. Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # expand dims for the next layer 2d conv net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( parse(flags.cnn_filters), parse(flags.cnn_kernel_size), parse(flags.cnn_act), parse(flags.cnn_dilation_rate), parse(flags.cnn_strides)): net = Stream(cell=tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides))(net) shape = net.shape # input net dimension: [batch, time, feature, channels] # reshape dimension: [batch, time, feature * channels] # so that GRU/RNN can process it net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net) for units, return_sequences in zip(parse(flags.gru_units), parse(flags.return_sequences)): net = GRU(units=units, return_sequences=return_sequences, stateful=flags.stateful)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """CNN model. It is based on paper: Convolutional Neural Networks for Small-footprint Keyword Spotting http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, feature_type=flags.feature_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)( input_audio) net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( parse(flags.cnn_filters), parse(flags.cnn_kernel_size), parse(flags.cnn_act), parse(flags.cnn_dilation_rate), parse(flags.cnn_strides)): net = Stream( cell=tf.keras.layers.Conv2D( filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides))( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def E2E_1stage_v2(input_shape=(16000,), data_settings=None, dropout=0.2):
  X_input = tf.keras.Input(input_shape)
  X = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms)(X_input)
  X = svdf.Svdf(units1=256, memory_size=8, units2=64, dropout=dropout,
                activation='relu', pad=0, name='svdf_1')(X)
  X = svdf.Svdf(units1=256, memory_size=10, units2=64, dropout=dropout,
                activation='relu', pad=0, name='svdf_2')(X)
  X = svdf.Svdf(units1=256, memory_size=10, units2=128, dropout=dropout,
                activation='relu', pad=0, name='svdf_3')(X)
  X = svdf.Svdf(units1=256, memory_size=10, units2=128, dropout=dropout,
                activation='relu', pad=0, name='svdf_4')(X)
  X = svdf.Svdf(units1=256, memory_size=10, units2=128, dropout=dropout,
                activation='relu', pad=0, name='svdf_5')(X)
  X = svdf.Svdf(units1=256, memory_size=10, units2=-1, dropout=dropout,
                activation='relu', pad=0, name='svdf_6')(X)
  X = Stream(cell=tf.keras.layers.Flatten())(X)
  X = tf.keras.layers.Dropout(dropout)(X)
  X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

  # Create model
  model = tf.keras.models.Model(
      inputs=X_input, outputs=X, name='E2E_1stage_v2')
  return model
def init(self, shape=(8, 2), flat_dim="time"):
  self.batch_size = 1

  # input data placeholder
  input_tf = tf.keras.layers.Input(
      shape=shape, batch_size=self.batch_size, name="inp1")

  # input test data
  self.inputs = np.random.uniform(size=(self.batch_size,) + shape)

  # create non streamable trainable model
  mode = Modes.TRAINING
  if flat_dim == "time":
    flat_tf = Stream(cell=tf.keras.layers.Flatten(), mode=mode)(input_tf)
  else:
    flat_tf = tf.reshape(
        input_tf,
        (-1, input_tf.shape[1], input_tf.shape[2] * input_tf.shape[3]))
  # flat_tf = flatten.Flatten(mode=mode, flat_dim=flat_dim)(input_tf)
  self.model_train = tf.keras.Model(input_tf, flat_tf)
  self.model_train.summary()

  # output data, generated by non streaming model
  self.outputs = self.model_train.predict(self.inputs)
  return self.outputs
def model(flags): """CNN model. It is based on paper: Convolutional Neural Networks for Small-footprint Keyword Spotting http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( parse(flags.cnn_filters), parse(flags.cnn_kernel_size), parse(flags.cnn_act), parse(flags.cnn_dilation_rate), parse(flags.cnn_strides)): net = Stream(cell=tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides))(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def E2E_1stage_v9(input_shape=(16000,), data_settings=None, dropout=0.5):
  assert data_settings.wanted_words == ('on,off,up,down,zero,one,two,three,'
                                        'four,five,six,seven,eight,nine')
  assert data_settings.window_size_ms == 40.0
  assert data_settings.window_stride_ms == 20.0
  assert data_settings.dct_num_features == 40
  assert data_settings.mel_num_bins == 80
  assert data_settings.mel_upper_edge_hertz == 7000

  X_input = tf.keras.Input(input_shape)
  X = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)
  X = svdf.Svdf(units1=192, memory_size=4, units2=96, dropout=dropout,
                activation='relu', pad=0, name='svdf_1')(X)
  X = svdf.Svdf(units1=192, memory_size=10, units2=96, dropout=dropout,
                activation='relu', pad=0, name='svdf_2')(X)
  X = svdf.Svdf(units1=192, memory_size=10, units2=96, dropout=dropout,
                activation='relu', pad=0, name='svdf_3')(X)
  X = svdf.Svdf(units1=192, memory_size=10, units2=96, dropout=dropout,
                activation='relu', pad=0, name='svdf_4')(X)
  X = svdf.Svdf(units1=192, memory_size=10, units2=96, dropout=dropout,
                activation='relu', pad=0, name='svdf_5')(X)
  X = svdf.Svdf(units1=192, memory_size=10, units2=-1, dropout=dropout,
                activation='relu', pad=0, name='svdf_6')(X)
  X = Stream(cell=tf.keras.layers.Flatten())(X)
  X = tf.keras.layers.Dropout(dropout)(X)
  X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

  # Create model
  model = tf.keras.models.Model(
      inputs=X_input, outputs=X, name='E2E_1stage_v9')
  return model
def model(flags): """Fully connected layer based model. It is based on paper (with added pooling): SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf Hello Edge: Keyword Spotting on Microcontrollers https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)( input_audio) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) # after flattening data in time, we can apply any layer: pooling, bi-lstm etc if flags.pool_size > 1: # add fake dim for compatibility with pooling net = tf.keras.backend.expand_dims(net, axis=-1) net = tf.keras.layers.MaxPool1D( pool_size=flags.pool_size, strides=flags.strides, data_format='channels_last')(net) # remove fake dim net = tf.keras.backend.squeeze(net, axis=-1) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """SVDF model. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, feature_type=flags.feature_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)( input_audio) for i, (units1, memory_size, units2, dropout, activation) in enumerate( zip( parse(flags.svdf_units1), parse(flags.svdf_memory_size), parse(flags.svdf_units2), parse(flags.svdf_dropout), parse(flags.svdf_act))): net = svdf.Svdf( units1=units1, memory_size=memory_size, units2=units2, dropout=dropout, activation=activation, pad=flags.svdf_pad, name='svdf_%d' % i)( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def _get_conv2d_layer(self, mode, dilation_rate=(1, 1)):
  cell = tf.keras.layers.Conv2D(
      filters=self.filters,
      kernel_size=self.kernel_size,
      dilation_rate=dilation_rate,
      kernel_initializer='ones')
  return Stream(
      cell,
      mode=mode,
      inference_batch_size=self.batch_size,
      pad_time_dim='causal',
  )
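# pad_time_dim='causal' above keeps frame t a function of frames <= t only,
# which is what streaming inference needs. Keras exposes the same idea on
# Conv1D directly; a hedged, self-contained check that left-only zero padding
# preserves length and never looks ahead (the `_demo_*` name is illustrative):
def _demo_causal_padding():
  import numpy as np
  import tensorflow as tf
  conv = tf.keras.layers.Conv1D(
      filters=1, kernel_size=3, padding='causal',
      kernel_initializer='ones', use_bias=False)
  x = np.arange(1, 6, dtype=np.float32).reshape(1, 5, 1)  # frames 1..5
  y = conv(x)  # y[t] = x[t-2] + x[t-1] + x[t], zeros to the left
  assert y.numpy().ravel().tolist() == [1.0, 3.0, 6.0, 9.0, 12.0]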
def keyword_marvin_v3_vl_0_4(input_shape=(16000,), data_settings=None,
                             dropout=0.2):
  assert data_settings.window_size_ms == 30.0
  assert data_settings.window_stride_ms == 10.0
  assert data_settings.dct_num_features == 40
  assert data_settings.mel_num_bins == 80
  assert data_settings.background_volume == 0.4
  assert data_settings.mel_upper_edge_hertz == 7000
  assert data_settings.wanted_words == 'marvin'

  X_input = tf.keras.Input(input_shape)
  X = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)
  X = svdf.Svdf(units1=84, memory_size=12, units2=32, dropout=dropout,
                activation='relu', pad=0, name='svdf_1')(X)
  X = svdf.Svdf(units1=84, memory_size=12, units2=32, dropout=dropout,
                activation='relu', pad=0, name='svdf_2')(X)
  X = svdf.Svdf(units1=84, memory_size=12, units2=32, dropout=dropout,
                activation='relu', pad=0, name='svdf_3')(X)
  X = svdf.Svdf(units1=32, memory_size=32, units2=-1, dropout=dropout,
                activation='relu', pad=0, name='svdf_4')(X)
  X = svdf.Svdf(units1=32, memory_size=32, units2=-1, dropout=dropout,
                activation='relu', pad=0, name='svdf_5')(X)
  X = Stream(cell=tf.keras.layers.Flatten())(X)
  X = tf.keras.layers.Dropout(dropout)(X)
  X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

  # Create model
  model = tf.keras.models.Model(
      inputs=X_input, outputs=X, name='keyword_marvin_v3_vl_0_4')
  return model
def model(flags): """LSTM model. Similar model in papers: Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf (with no conv layer) Hello Edge: Keyword Spotting on Microcontrollers https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)( input_audio) for units, return_sequences, num_proj in zip( parse(flags.lstm_units), parse(flags.return_sequences), parse(flags.num_proj)): net = LSTM( units=units, return_sequences=return_sequences, stateful=flags.stateful, use_peepholes=flags.use_peepholes, num_proj=num_proj)( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """CNN model. It is based on paper: Convolutional Neural Networks for Small-footprint Keyword Spotting http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(input_audio) net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( parse(flags.cnn_filters), parse(flags.cnn_kernel_size), parse(flags.cnn_act), parse(flags.cnn_dilation_rate), parse(flags.cnn_strides)): net = Stream(cell=tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides))(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def E2E_1stage_v7(input_shape=(16000,), data_settings=None, dropout=0.5):
  data_settings.window_size_ms = 40.0
  data_settings.window_stride_ms = 20.0
  data_settings.dct_num_features = 40
  data_settings.mel_num_bins = 80
  data_settings.mel_upper_edge_hertz = 7000

  X_input = tf.keras.Input(input_shape)
  X = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)
  X = svdf.Svdf(units1=224, memory_size=12, units2=56, dropout=dropout,
                activation='relu', pad=0, name='svdf_1')(X)
  X = svdf.Svdf(units1=224, memory_size=12, units2=56, dropout=dropout,
                activation='relu', pad=0, name='svdf_2')(X)
  X = svdf.Svdf(units1=224, memory_size=12, units2=56, dropout=dropout,
                activation='relu', pad=0, name='svdf_3')(X)
  X = svdf.Svdf(units1=32, memory_size=32, units2=-1, dropout=dropout,
                activation='relu', pad=0, name='svdf_4')(X)
  X = svdf.Svdf(units1=32, memory_size=32, units2=-1, dropout=dropout,
                activation='relu', pad=0, name='svdf_5')(X)
  X = Stream(cell=tf.keras.layers.Flatten())(X)
  X = tf.keras.layers.Dropout(dropout)(X)
  X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

  # Create model
  model = tf.keras.models.Model(
      inputs=X_input, outputs=X, name='E2E_1stage_v7')
  return model
def model(flags): """Fully connected layer based model. It is based on paper (with added pooling): SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) # after flattening data in time, we can apply any layer: pooling, bi-lstm etc if flags.pool_size > 1: # add fake dim for compatibility with pooling net = tf.keras.backend.expand_dims(net, axis=-1) net = tf.keras.layers.MaxPool1D(pool_size=flags.pool_size, strides=flags.strides, data_format='channels_last')(net) # remove fake dim net = tf.keras.backend.squeeze(net, axis=-1) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """SVDF model. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # for streaming mode it is better to use causal padding padding = 'causal' if flags.svdf_pad else 'valid' for i, (units1, memory_size, units2, dropout, activation) in enumerate( zip(parse(flags.svdf_units1), parse(flags.svdf_memory_size), parse(flags.svdf_units2), parse(flags.svdf_dropout), parse(flags.svdf_act))): net = svdf.Svdf(units1=units1, memory_size=memory_size, units2=units2, dropout=dropout, activation=activation, pad=padding, name='svdf_%d' % i)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """LSTM model. Similar model in papers: Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf (with no conv layer) Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) for units, return_sequences, num_proj in zip( parse(flags.lstm_units), parse(flags.return_sequences), parse(flags.num_proj)): net = LSTM( units=units, return_sequences=return_sequences, stateful=flags.stateful, use_peepholes=flags.use_peepholes, num_proj=num_proj)( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """Fully connected layer based model on raw wav data. It is based on paper (with added pooling and raw audio data): SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf Args: flags: data/model parameters Returns: Keras model for training """ if flags.preprocess != 'raw': ValueError('input audio has to be raw, but get ', flags.preprocess) input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ), batch_size=flags.batch_size) net = dataframe.DataFrame( frame_size=flags.window_size_samples, frame_step=flags.window_stride_samples)(input_audio) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) # after flattening data in time, we can apply any layer: pooling, bi-lstm etc if flags.pool_size > 1: # add fake dim for compatibility with pooling net = tf.keras.backend.expand_dims(net, axis=-1) net = tf.keras.layers.MaxPool1D(pool_size=flags.pool_size, strides=flags.strides, data_format='channels_last')(net) # remove fake dim net = tf.keras.backend.squeeze(net, axis=-1) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """LSTM model. It is based on paper https://arxiv.org/pdf/1705.02411.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)(input_audio) for units, return_sequences, num_proj in zip(parse(flags.lstm_units), parse(flags.return_sequences), parse(flags.num_proj)): net = LSTM(units=units, return_sequences=return_sequences, stateful=flags.stateful, use_peepholes=flags.use_peepholes, num_proj=num_proj)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """SVDF model. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( input_audio) for i, (units1, memory_size, units2, dropout, activation) in enumerate( zip( parse(flags.svdf_units1), parse(flags.svdf_memory_size), parse(flags.svdf_units2), parse(flags.svdf_dropout), parse(flags.svdf_act))): net = svdf.Svdf( units1=units1, memory_size=memory_size, units2=units2, dropout=dropout, activation=activation, pad=flags.svdf_pad, name='svdf_%d' % i)( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """LSTM model. Similar model in papers: Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf (with no conv layer) Hello Edge: Keyword Spotting on Microcontrollers https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( input_audio) for units, return_sequences, num_proj in zip( parse(flags.lstm_units), parse(flags.return_sequences), parse(flags.num_proj)): net = LSTM( units=units, return_sequences=return_sequences, stateful=flags.stateful, use_peepholes=flags.use_peepholes, num_proj=num_proj)( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Gated Recurrent Unit(GRU) model. It is based on paper Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf (with no conv layer) Hello Edge: Keyword Spotting on Microcontrollers https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) for units, return_sequences in zip(parse(flags.gru_units), parse(flags.return_sequences)): net = GRU(units=units, return_sequences=return_sequences, stateful=flags.stateful)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Depthwise convolutional model. It is based on paper https://arxiv.org/abs/1704.04861 Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)(input_audio) net = tf.keras.backend.expand_dims(net) net = Stream(cell=tf.keras.layers.Conv2D( kernel_size=parse(flags.cnn1_kernel_size), dilation_rate=parse(flags.cnn1_dilation_rate), filters=flags.cnn1_filters, padding=flags.cnn1_padding, strides=parse(flags.cnn1_strides)))(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation('relu')(net) for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip( parse(flags.dw2_kernel_size), parse(flags.dw2_act), parse(flags.dw2_dilation_rate), parse(flags.dw2_strides), parse(flags.cnn2_filters), parse(flags.cnn2_act)): net = Stream( cell=tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size, dilation_rate=dilation_rate, padding=flags.dw2_padding, strides=strides))(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation(dw2_act)(net) net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation(cnn2_act)(net) net = Stream(cell=tf.keras.layers.AveragePooling2D( pool_size=(int(net.shape[1]), int(net.shape[2]))))(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Depthwise convolutional model. It is based on paper: MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861 Hello Edge: Keyword Spotting on Microcontrollers https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(input_audio) net = tf.keras.backend.expand_dims(net) net = Stream(cell=tf.keras.layers.Conv2D( kernel_size=parse(flags.cnn1_kernel_size), dilation_rate=parse(flags.cnn1_dilation_rate), filters=flags.cnn1_filters, padding=flags.cnn1_padding, strides=parse(flags.cnn1_strides)))(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation('relu')(net) for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip( parse(flags.dw2_kernel_size), parse(flags.dw2_act), parse(flags.dw2_dilation_rate), parse(flags.dw2_strides), parse(flags.cnn2_filters), parse(flags.cnn2_act)): net = Stream( cell=tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size, dilation_rate=dilation_rate, padding=flags.dw2_padding, strides=strides))(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation(dw2_act)(net) net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation(cnn2_act)(net) net = Stream(cell=tf.keras.layers.AveragePooling2D( pool_size=(int(net.shape[1]), int(net.shape[2]))))(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Convolutional recurrent neural network (CRNN) model. It is based on paper Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf Represented as sequence of Conv, RNN/GRU, FC layers. Hello Edge: Keyword Spotting on Microcontrollers https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, feature_type=flags.feature_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)(input_audio) # expand dims for the next layer 2d conv net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( parse(flags.cnn_filters), parse(flags.cnn_kernel_size), parse(flags.cnn_act), parse(flags.cnn_dilation_rate), parse(flags.cnn_strides)): net = Stream(cell=tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides))(net) shape = net.shape # input net dimension: [batch, time, feature, channels] # reshape dimension: [batch, time, feature * channels] # so that GRU/RNN can process it net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net) for units, return_sequences in zip(parse(flags.gru_units), parse(flags.return_sequences)): net = GRU(units=units, return_sequences=return_sequences, stateful=flags.stateful)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)