def test_tf_non_streaming_vs_streaming_inference_internal_state(self):
  """Compares streamed features (internal state) with non streamed ones.

  The whole signal is first processed at once by a non streaming model. The
  same signal is then fed, frame_step samples at a time, into a model built
  in STREAM_INTERNAL_STATE_INFERENCE mode. Every streamed frame is expected
  to match the corresponding non streaming frame.
  """
  speech_params = speech_features.SpeechFeatures.get_params(self.params)

  # reference: TF non streaming frame extraction based on tf.signal.frame
  reference_layer = speech_features.SpeechFeatures(
      speech_params, modes.Modes.NON_STREAM_INFERENCE,
      self.inference_batch_size)
  # it receives all data with size: data_size
  full_input = tf.keras.layers.Input(
      shape=(self.data_size,),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  full_output = reference_layer(full_input)
  reference_model = tf.keras.models.Model(full_input, full_output)
  # frames for the whole signal (no streaming here)
  expected_frames = reference_model.predict(self.signal)

  # streaming frame extraction:
  # it receives input data incrementally with step: frame_step
  stream_layer = speech_features.SpeechFeatures(
      speech_params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE,
      self.inference_batch_size)
  step_input = tf.keras.layers.Input(
      shape=(self.frame_step,),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  step_output = stream_layer(step_input)

  # initialize state of streaming model: frame_step zeros followed by the
  # first (frame_size - frame_step) samples of the signal
  framer = stream_layer.data_frame
  warmup = self.signal[:, 0:framer.frame_size - framer.frame_step]
  leading_zeros = np.zeros(shape=(1, framer.frame_step), dtype=np.float32)
  framer.set_weights([np.concatenate((leading_zeros, warmup), axis=1)])

  stream_model = tf.keras.models.Model(step_input, step_output)

  # run streaming frames extraction: every update covers the frame_step
  # samples that end at `stop`
  streamed_frames = []
  for stop in range(self.frame_size, self.data_size + 1, self.frame_step):
    chunk = self.signal[:, stop - self.frame_step:stop]
    # get new frame from stream of data
    streamed_frames.append(stream_model.predict(chunk))

  self.assertNotEmpty(streamed_frames)

  # compare streaming vs non streaming frames extraction
  for index, frame in enumerate(streamed_frames):
    self.assertAllClose(
        frame[0][0], expected_frames[0][index], rtol=1e-4, atol=1e-4)
def E2E_1stage_v2(input_shape=(16000, ), data_settings=None, dropout=0.2):
  """Builds the 'E2E_1stage_v2' Keras model: speech features + 6 SVDF layers.

  Args:
    input_shape: shape of the model input, passed to tf.keras.Input
    data_settings: object providing window_size_ms, window_stride_ms and
      label_count; despite the None default it is dereferenced here
    dropout: dropout value used by every SVDF layer and by the Dropout layer
      placed before the final Dense layer

  Returns:
    Keras model named 'E2E_1stage_v2'
  """
  # (name, memory_size, units2) of each SVDF layer, in stacking order;
  # every layer uses units1=256, relu activation and pad=0
  svdf_specs = (
      ('svdf_1', 8, 64),
      ('svdf_2', 10, 64),
      ('svdf_3', 10, 128),
      ('svdf_4', 10, 128),
      ('svdf_5', 10, 128),
      ('svdf_6', 10, -1),
  )

  X_input = tf.keras.Input(input_shape)
  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms)
  hidden = feature_layer(X_input)

  for layer_name, memory_size, units2 in svdf_specs:
    hidden = svdf.Svdf(
        units1=256,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation='relu',
        pad=0,
        name=layer_name)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(dropout)(hidden)
  logits = tf.keras.layers.Dense(units=data_settings.label_count)(hidden)

  # Create model
  return tf.keras.models.Model(
      inputs=X_input, outputs=logits, name='E2E_1stage_v2')
def test_tf_non_streaming_vs_streaming_inference_external_state(self):
  """Tests non stream inference vs stream inference with external state.

  A non streaming model produces frames for the whole signal. It is then
  converted with utils.convert_to_inference_model into a streaming model
  whose state is passed in and returned on every call. Each streamed frame
  has to match the corresponding non streaming frame.
  """
  speech_params = speech_features.SpeechFeatures.get_params(self.params)
  mode = modes.Modes.NON_STREAM_INFERENCE

  # TF non streaming frame extraction based on tf.signal.frame
  mel_speech_tf = speech_features.SpeechFeatures(
      speech_params, mode, self.inference_batch_size)
  # it receives all data with size: data_size
  input1 = tf.keras.layers.Input(
      shape=(self.data_size,),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  output1 = mel_speech_tf(input1)
  model_tf = tf.keras.models.Model(input1, output1)
  # generate frames for the whole signal (no streaming here)
  output_tf = model_tf.predict(self.signal)

  # input data for streaming mode
  input_tensors = [
      tf.keras.layers.Input(
          shape=(self.frame_step,),
          batch_size=self.inference_batch_size,
          dtype=tf.float32)
  ]

  # convert non streaming trainable model to
  # streaming inference with external state
  mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
  model_stream = utils.convert_to_inference_model(
      model_tf, input_tensors, mode)

  # initialize state of streaming model: frame_step zeros followed by the
  # first (frame_size - frame_step) samples of the signal
  pre_state = self.signal[:, 0:self.frame_size - self.frame_step]
  state2 = np.concatenate(
      (np.zeros(shape=(1, self.frame_step), dtype=np.float32), pre_state),
      axis=1)

  # run streaming frames extraction
  start = self.frame_size - self.frame_step
  end = self.frame_size
  streamed_frames = []
  while end <= self.data_size:
    # next data update
    stream_update = self.signal[:, start:end]

    # get new frame from stream of data; the returned state is fed back in
    # on the next iteration
    output_frame, state2 = model_stream.predict([stream_update, state2])
    streamed_frames.append(output_frame)

    # update indexes of streamed updates
    start = end
    end = start + self.frame_step

  # without this guard the comparison loop below would pass vacuously when
  # no frame was streamed at all
  self.assertNotEmpty(streamed_frames)

  # compare streaming vs non streaming frames extraction
  for i, streamed_frame in enumerate(streamed_frames):
    self.assertAllClose(
        streamed_frame[0][0], output_tf[0][i], rtol=1e-4, atol=1e-4)
def model(flags):
  """Builds a convolutional recurrent neural network (CRNN) model.

  It is based on paper
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf
  The network is a sequence of Conv, RNN/GRU and FC layers.
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  training_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=training_shape, batch_size=flags.batch_size)

  x = input_audio
  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the speech
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(x)

  # expand dims for the next layer 2d conv
  x = tf.keras.backend.expand_dims(x)

  conv_configs = zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides))
  for filters, kernel_size, activation, dilation_rate, strides in conv_configs:
    conv = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides)
    x = stream.Stream(cell=conv)(x)

  # input net dimension: [batch, time, feature, channels]
  # reshape dimension: [batch, time, feature * channels]
  # so that GRU/RNN can process it
  conv_shape = x.shape
  x = tf.keras.layers.Reshape((-1, conv_shape[2] * conv_shape[3]))(x)

  gru_configs = zip(parse(flags.gru_units), parse(flags.return_sequences))
  for units, return_sequences in gru_configs:
    x = gru.GRU(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful)(x)

  x = stream.Stream(cell=tf.keras.layers.Flatten())(x)
  x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

  dense_configs = zip(parse(flags.units1), parse(flags.act1))
  for units, activation in dense_configs:
    x = tf.keras.layers.Dense(units=units, activation=activation)(x)

  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def E2E_1stage_v9(input_shape=(16000,), data_settings=None, dropout=0.5):
  """Builds the 'E2E_1stage_v9' Keras model: speech features + 6 SVDF layers.

  The function first asserts that data_settings carries the exact values
  checked below, then builds the network.

  Args:
    input_shape: shape of the model input, passed to tf.keras.Input
    data_settings: object providing wanted_words, window_size_ms,
      window_stride_ms, dct_num_features, mel_num_bins, mel_upper_edge_hertz
      and label_count
    dropout: dropout value used by every SVDF layer and by the Dropout layer
      placed before the final Dense layer

  Returns:
    Keras model named 'E2E_1stage_v9'
  """
  assert data_settings.wanted_words == (
      'on,off,up,down,zero,one,two,three,four,five,six,seven,eight,nine')
  assert data_settings.window_size_ms == 40.0
  assert data_settings.window_stride_ms == 20.0
  assert data_settings.dct_num_features == 40
  assert data_settings.mel_num_bins == 80
  assert data_settings.mel_upper_edge_hertz == 7000

  # (name, memory_size, units2) of each SVDF layer, in stacking order;
  # every layer uses units1=192, relu activation and pad=0
  svdf_specs = (
      ('svdf_1', 4, 96),
      ('svdf_2', 10, 96),
      ('svdf_3', 10, 96),
      ('svdf_4', 10, 96),
      ('svdf_5', 10, 96),
      ('svdf_6', 10, -1),
  )

  X_input = tf.keras.Input(input_shape)
  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)
  hidden = feature_layer(X_input)

  for layer_name, memory_size, units2 in svdf_specs:
    hidden = svdf.Svdf(
        units1=192,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation='relu',
        pad=0,
        name=layer_name)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(dropout)(hidden)
  logits = tf.keras.layers.Dense(units=data_settings.label_count)(hidden)

  # Create model
  return tf.keras.models.Model(
      inputs=X_input, outputs=logits, name='E2E_1stage_v9')
def model(flags):
  """Builds a fully connected layer based model.

  It is based on paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # speech feature extractor settings, all taken from flags
  feature_kwargs = dict(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = speech_features.SpeechFeatures(**feature_kwargs)(input_audio)

  # dense layers applied before flattening
  first_stack = zip(parse(flags.units1), parse(flags.act1))
  for units, activation in first_stack:
    hidden = tf.keras.layers.Dense(units=units, activation=activation)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    hidden = tf.keras.backend.expand_dims(hidden, axis=-1)
    pooling = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')
    hidden = pooling(hidden)
    # remove fake dim
    hidden = tf.keras.backend.squeeze(hidden, axis=-1)

  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  second_stack = zip(parse(flags.units2), parse(flags.act2))
  for units, activation in second_stack:
    hidden = tf.keras.layers.Dense(units=units, activation=activation)(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
def model(flags):
  """Builds an SVDF model.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # speech feature extractor settings, all taken from flags
  feature_kwargs = dict(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = speech_features.SpeechFeatures(**feature_kwargs)(input_audio)

  # one SVDF layer per entry of the parsed svdf_* flags
  svdf_configs = zip(
      parse(flags.svdf_units1), parse(flags.svdf_memory_size),
      parse(flags.svdf_units2), parse(flags.svdf_dropout),
      parse(flags.svdf_act))
  for index, layer_config in enumerate(svdf_configs):
    units1, memory_size, units2, dropout, activation = layer_config
    hidden = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % index)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  dense_configs = zip(parse(flags.units2), parse(flags.act2))
  for units, activation in dense_configs:
    hidden = tf.keras.layers.Dense(units=units, activation=activation)(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
def model(flags):
  """Builds a CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # speech feature extractor settings, all taken from flags
  feature_kwargs = dict(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = speech_features.SpeechFeatures(**feature_kwargs)(input_audio)

  # add one more dimension before the 2d convolutions
  hidden = tf.keras.backend.expand_dims(hidden)

  conv_configs = zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides))
  for filters, kernel_size, activation, dilation_rate, strides in conv_configs:
    conv = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides)
    hidden = Stream(cell=conv)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  dense_configs = zip(parse(flags.units2), parse(flags.act2))
  for units, activation in dense_configs:
    hidden = tf.keras.layers.Dense(units=units, activation=activation)(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
def model(flags):
  """Builds an SVDF model.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  training_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=training_shape, batch_size=flags.batch_size)

  x = input_audio
  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the speech
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(x)

  # for streaming mode it is better to use causal padding
  if flags.svdf_pad:
    padding = 'causal'
  else:
    padding = 'valid'

  # one SVDF layer per entry of the parsed svdf_* flags
  svdf_configs = zip(
      utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size),
      utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout),
      utils.parse(flags.svdf_act))
  for index, layer_config in enumerate(svdf_configs):
    units1, memory_size, units2, dropout, activation = layer_config
    x = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=padding,
        name='svdf_%d' % index)(x)

  x = stream.Stream(cell=tf.keras.layers.Flatten())(x)
  x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

  dense_configs = zip(utils.parse(flags.units2), utils.parse(flags.act2))
  for units, activation in dense_configs:
    x = tf.keras.layers.Dense(units=units, activation=activation)(x)

  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def keyword_marvin_v3_vl_0_4(input_shape=(16000,), data_settings=None,
                             dropout=0.2):
  """Builds the 'keyword_marvin_v3_vl_0_4' Keras model.

  The function first asserts that data_settings carries the exact values
  checked below, then stacks speech features, five SVDF layers, Flatten,
  Dropout and a final Dense layer.

  Args:
    input_shape: shape of the model input, passed to tf.keras.Input
    data_settings: object providing window_size_ms, window_stride_ms,
      dct_num_features, mel_num_bins, background_volume,
      mel_upper_edge_hertz, wanted_words and label_count
    dropout: dropout value used by every SVDF layer and by the Dropout layer
      placed before the final Dense layer

  Returns:
    Keras model named 'keyword_marvin_v3_vl_0_4'
  """
  assert data_settings.window_size_ms == 30.0
  assert data_settings.window_stride_ms == 10.0
  assert data_settings.dct_num_features == 40
  assert data_settings.mel_num_bins == 80
  assert data_settings.background_volume == 0.4
  assert data_settings.mel_upper_edge_hertz == 7000
  assert data_settings.wanted_words == 'marvin'

  # (name, units1, memory_size, units2) of each SVDF layer, in stacking
  # order; every layer uses relu activation and pad=0
  svdf_specs = (
      ('svdf_1', 84, 12, 32),
      ('svdf_2', 84, 12, 32),
      ('svdf_3', 84, 12, 32),
      ('svdf_4', 32, 32, -1),
      ('svdf_5', 32, 32, -1),
  )

  X_input = tf.keras.Input(input_shape)
  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)
  hidden = feature_layer(X_input)

  for layer_name, units1, memory_size, units2 in svdf_specs:
    hidden = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation='relu',
        pad=0,
        name=layer_name)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(dropout)(hidden)
  logits = tf.keras.layers.Dense(units=data_settings.label_count)(hidden)

  # Create model
  return tf.keras.models.Model(
      inputs=X_input, outputs=logits, name='keyword_marvin_v3_vl_0_4')
def model(flags):
  """Builds a Temporal Convolution ResNet model.

  It can be configured to reproduce model config as described in the paper
  below
  Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
  https://arxiv.org/pdf/1904.03814.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if the per-block lists parsed from flags differ in length
  """
  tc_filters = parse(flags.tc_filters)
  repeat_tc_convs = parse(flags.repeat_tc_convs)
  kernel_sizes = parse(flags.kernel_sizes)
  pool_sizes = parse(flags.pool_sizes)
  dilations = parse(flags.dilations)
  residuals = parse(flags.residuals)

  # all per-block settings must describe the same number of blocks
  distinct_lengths = {
      len(repeat_tc_convs),
      len(kernel_sizes),
      len(pool_sizes),
      len(dilations),
      len(residuals),
      len(tc_filters),
  }
  if len(distinct_lengths) != 1:
    raise ValueError('all input lists have to be the same length')

  training_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=training_shape, batch_size=flags.batch_size)

  x = input_audio
  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the speech
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(x)

  # make it [batch, time, 1, feature]
  x = tf.keras.backend.expand_dims(x, axis=2)

  block_configs = zip(tc_filters, repeat_tc_convs, kernel_sizes, pool_sizes,
                      dilations, residuals)
  for filters, repeat, kernel_size, pool_size, dilation, residual in (
      block_configs):
    x = resnet_block(x, repeat, kernel_size, filters, dilation, residual,
                     flags.padding_in_time, flags.dropout, flags.activation)
    # pooling is only inserted for pool sizes above 1
    if pool_size > 1:
      x = tf.keras.layers.MaxPooling2D((pool_size, 1))(x)

  x = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(x)
  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def model(flags):
  """Builds an Inception resnet model.

  It is based on paper:
  Inception-v4, Inception-ResNet and the Impact of
  Residual Connections on Learning https://arxiv.org/abs/1602.07261

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  training_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=training_shape, batch_size=flags.batch_size)

  x = input_audio
  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the speech
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(x)

  # [batch, time, feature] -> [batch, time, feature, 1]
  x = tf.keras.backend.expand_dims(x, axis=-1)

  # stem: separable conv + batch norm + relu + max pooling per filter count
  for filters in utils.parse(flags.cnn_filters0):
    separable_conv = tf.keras.layers.SeparableConv2D(
        filters, (3, 3), padding='valid', use_bias=False)
    x = separable_conv(x)
    x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
    x = tf.keras.layers.Activation('relu')(x)
    x = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(x)
    # [batch, time, feature, filters]

  block_configs = zip(
      utils.parse(flags.strides), utils.parse(flags.scales),
      utils.parse(flags.filters_branch0), utils.parse(flags.filters_branch1))
  for stride, scale, filters_branch0, filters_branch1 in block_configs:
    x = inception_resnet_block(
        x, scale, filters_branch0, filters_branch1, bn_scale=flags.bn_scale)
    x = tf.keras.layers.MaxPooling2D(3, strides=stride, padding='valid')(x)
    # [batch, time, feature, filters]

  # [batch, time, feature, filters] -> [batch, filters]
  x = tf.keras.layers.GlobalAveragePooling2D()(x)
  x = tf.keras.layers.Dropout(flags.dropout)(x)
  logits = tf.keras.layers.Dense(flags.label_count)(x)
  return tf.keras.Model(input_audio, logits)
def model(flags):
  """Builds a fully connected layer based model.

  It is based on paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  training_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=training_shape, batch_size=flags.batch_size)

  x = input_audio
  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the speech
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(x)

  # dense layers applied before flattening
  first_stack = zip(utils.parse(flags.units1), utils.parse(flags.act1))
  for units, activation in first_stack:
    x = tf.keras.layers.Dense(units=units, activation=activation)(x)

  x = stream.Stream(cell=tf.keras.layers.Flatten())(x)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    x = tf.keras.backend.expand_dims(x, axis=-1)
    pooling = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')
    x = pooling(x)
    # remove fake dim
    x = tf.keras.backend.squeeze(x, axis=-1)

  x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

  second_stack = zip(utils.parse(flags.units2), utils.parse(flags.act2))
  for units, activation in second_stack:
    x = tf.keras.layers.Dense(units=units, activation=activation)(x)

  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def model(flags):
  """Builds an LSTM model.

  Similar model in papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # speech feature extractor settings, all taken from flags
  feature_kwargs = dict(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = speech_features.SpeechFeatures(**feature_kwargs)(input_audio)

  # one LSTM layer per entry of the parsed lstm flags
  lstm_configs = zip(
      parse(flags.lstm_units), parse(flags.return_sequences),
      parse(flags.num_proj))
  for units, return_sequences, num_proj in lstm_configs:
    recurrent = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)
    hidden = recurrent(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  dense_configs = zip(parse(flags.units1), parse(flags.act1))
  for units, activation in dense_configs:
    hidden = tf.keras.layers.Dense(units=units, activation=activation)(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
def E2E_1stage_v7(input_shape=(16000,), data_settings=None, dropout=0.5):
  """Builds the 'E2E_1stage_v7' Keras model: speech features + 5 SVDF layers.

  NOTE: this function modifies the data_settings object it receives. The
  feature related fields listed below are overwritten before the model is
  built, so the caller sees the new values afterwards.

  Args:
    input_shape: shape of the model input, passed to tf.keras.Input
    data_settings: object whose window_size_ms, window_stride_ms,
      dct_num_features, mel_num_bins and mel_upper_edge_hertz are overwritten
      here and whose label_count sets the size of the final Dense layer
    dropout: dropout value used by every SVDF layer and by the Dropout layer
      placed before the final Dense layer

  Returns:
    Keras model named 'E2E_1stage_v7'
  """
  # force the feature settings used by this model onto data_settings
  data_settings.window_size_ms = 40.0
  data_settings.window_stride_ms = 20.0
  data_settings.dct_num_features = 40
  data_settings.mel_num_bins = 80
  data_settings.mel_upper_edge_hertz = 7000

  # (name, units1, memory_size, units2) of each SVDF layer, in stacking
  # order; every layer uses relu activation and pad=0
  svdf_specs = (
      ('svdf_1', 224, 12, 56),
      ('svdf_2', 224, 12, 56),
      ('svdf_3', 224, 12, 56),
      ('svdf_4', 32, 32, -1),
      ('svdf_5', 32, 32, -1),
  )

  X_input = tf.keras.Input(input_shape)
  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)
  hidden = feature_layer(X_input)

  for layer_name, units1, memory_size, units2 in svdf_specs:
    hidden = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation='relu',
        pad=0,
        name=layer_name)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(dropout)(hidden)
  logits = tf.keras.layers.Dense(units=data_settings.label_count)(hidden)

  # Create model
  return tf.keras.models.Model(
      inputs=X_input, outputs=logits, name='E2E_1stage_v7')
def model(flags):
  """Builds a CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  training_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=training_shape, batch_size=flags.batch_size)

  x = input_audio
  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the speech
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(x)

  # add one more dimension before the 2d convolutions
  x = tf.keras.backend.expand_dims(x)

  conv_configs = zip(
      utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
      utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
      utils.parse(flags.cnn_strides))
  for filters, kernel_size, activation, dilation_rate, strides in conv_configs:
    conv = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides)
    x = stream.Stream(cell=conv)(x)

  x = stream.Stream(cell=tf.keras.layers.Flatten())(x)
  x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

  dense_configs = zip(utils.parse(flags.units2), utils.parse(flags.act2))
  for units, activation in dense_configs:
    x = tf.keras.layers.Dense(units=units, activation=activation)(x)

  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def model(flags):
  """Builds an LSTM model.

  Similar model in papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  training_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=training_shape, batch_size=flags.batch_size)

  x = input_audio
  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the speech
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(x)

  # one LSTM layer per entry of the parsed lstm flags
  lstm_configs = zip(
      utils.parse(flags.lstm_units), utils.parse(flags.return_sequences),
      utils.parse(flags.num_proj))
  for units, return_sequences, num_proj in lstm_configs:
    recurrent = lstm.LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)
    x = recurrent(x)

  x = stream.Stream(cell=tf.keras.layers.Flatten())(x)
  x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

  dense_configs = zip(utils.parse(flags.units1), utils.parse(flags.act1))
  for units, activation in dense_configs:
    x = tf.keras.layers.Dense(units=units, activation=activation)(x)

  x = tf.keras.layers.Dense(units=flags.label_count)(x)
  if flags.return_softmax:
    x = tf.keras.layers.Activation('softmax')(x)
  return tf.keras.Model(input_audio, x)
def model(flags):
  """Builds an LSTM model.

  It is based on paper https://arxiv.org/pdf/1705.02411.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # speech feature extractor configured from flags
  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  sequence = feature_layer(input_audio)

  # one LSTM layer per entry of the parsed lstm flags
  lstm_units = parse(flags.lstm_units)
  return_sequences_flags = parse(flags.return_sequences)
  projections = parse(flags.num_proj)
  for units, return_sequences, num_proj in zip(
      lstm_units, return_sequences_flags, projections):
    sequence = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(sequence)

  flat = Stream(cell=tf.keras.layers.Flatten())(sequence)
  flat = tf.keras.layers.Dropout(rate=flags.dropout1)(flat)

  dense_units = parse(flags.units1)
  dense_activations = parse(flags.act1)
  for units, activation in zip(dense_units, dense_activations):
    flat = tf.keras.layers.Dense(units=units, activation=activation)(flat)

  logits = tf.keras.layers.Dense(units=flags.label_count)(flat)
  return tf.keras.Model(input_audio, logits)
def test_tf_non_streaming_train(self):
  """Checks that a model built in TRAINING mode produces non empty output.

  Parameters are created from a fresh Params() instance with
  sp_time_shift_ms set to 10.0.
  """
  params = Params()
  params.sp_time_shift_ms = 10.0
  speech_params = speech_features.SpeechFeatures.get_params(params)

  # TF non streaming frame extraction based on tf.signal.frame
  feature_layer = speech_features.SpeechFeatures(
      speech_params, modes.Modes.TRAINING, self.inference_batch_size)

  # it receives all data with size: data_size
  full_input = tf.keras.layers.Input(
      shape=(self.data_size,),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  full_output = feature_layer(full_input)
  train_model = tf.keras.models.Model(full_input, full_output)

  # generate frames for the whole signal (no streaming here)
  predicted = train_model.predict(self.signal)
  self.assertNotEmpty(predicted)
def model(flags):
  """Builds an SVDF model.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # speech features are always computed inside the model
  feature_params = speech_features.SpeechFeatures.get_params(flags)
  hidden = speech_features.SpeechFeatures(feature_params)(input_audio)

  # one SVDF layer per entry of the parsed svdf_* flags
  svdf_configs = zip(
      parse(flags.svdf_units1), parse(flags.svdf_memory_size),
      parse(flags.svdf_units2), parse(flags.svdf_dropout),
      parse(flags.svdf_act))
  for index, layer_config in enumerate(svdf_configs):
    units1, memory_size, units2, dropout, activation = layer_config
    hidden = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % index)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  dense_configs = zip(parse(flags.units2), parse(flags.act2))
  for units, activation in dense_configs:
    hidden = tf.keras.layers.Dense(units=units, activation=activation)(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
def model(flags):
  """Builds an LSTM model.

  Similar model in papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # speech features are always computed inside the model
  feature_params = speech_features.SpeechFeatures.get_params(flags)
  hidden = speech_features.SpeechFeatures(feature_params)(input_audio)

  # one LSTM layer per entry of the parsed lstm flags
  lstm_configs = zip(
      parse(flags.lstm_units), parse(flags.return_sequences),
      parse(flags.num_proj))
  for units, return_sequences, num_proj in lstm_configs:
    recurrent = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)
    hidden = recurrent(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  dense_configs = zip(parse(flags.units1), parse(flags.act1))
  for units, activation in dense_configs:
    hidden = tf.keras.layers.Dense(units=units, activation=activation)(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
def model(flags):
    """CNN model.

    It is based on paper:
    Convolutional Neural Networks for Small-footprint Keyword Spotting
    http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

    The graph is assembled in this order: speech feature extraction, an
    extra axis added by expand_dims, a stack of Conv2D layers wrapped in
    Stream, a Flatten wrapped in Stream, Dropout, a stack of Dense layers
    configured by flags.units2 / flags.act2 and a final Dense layer with
    flags.label_count units.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    audio_in = tf.keras.layers.Input(
        shape=(flags.desired_samples,), batch_size=flags.batch_size)

    feature_params = speech_features.SpeechFeatures.get_params(flags)
    x = speech_features.SpeechFeatures(feature_params)(audio_in)

    # Conv2D needs one more axis than the feature extractor produces
    x = tf.keras.backend.expand_dims(x)

    # one convolutional layer per position in the parsed cnn_* flags
    conv_settings = zip(
        parse(flags.cnn_filters),
        parse(flags.cnn_kernel_size),
        parse(flags.cnn_act),
        parse(flags.cnn_dilation_rate),
        parse(flags.cnn_strides))
    for filters, kernel_size, activation, dilation_rate, strides in (
            conv_settings):
        conv = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides)
        x = Stream(cell=conv)(x)

    flatten = Stream(cell=tf.keras.layers.Flatten())
    x = flatten(x)
    x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

    # fully connected stack
    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        x = dense(x)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(x)
    return tf.keras.Model(audio_in, outputs)
def model(flags):
    """Gated Recurrent Unit(GRU) model.

    It is based on paper
    Convolutional Recurrent Neural Networks for Small-Footprint Keyword
    Spotting
    https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
    Hello Edge: Keyword Spotting on Microcontrollers
    https://arxiv.org/pdf/1711.07128.pdf

    The graph is assembled in this order: optional speech feature
    extraction (only when flags.preprocess == 'raw'), a stack of GRU
    layers, a Flatten wrapped in Stream, Dropout, a stack of Dense layers
    configured by flags.units1 / flags.act1 and a final Dense layer with
    flags.label_count units.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    audio_in = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    x = audio_in
    if flags.preprocess == 'raw':
        # it is a self contained model, user need to feed raw audio only
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        x = speech_features.SpeechFeatures(feature_params)(x)

    # one recurrent layer per position in the parsed flags
    gru_settings = zip(
        parse(flags.gru_units), parse(flags.return_sequences))
    for units, return_sequences in gru_settings:
        gru_layer = GRU(
            units=units,
            return_sequences=return_sequences,
            stateful=flags.stateful)
        x = gru_layer(x)

    flatten = Stream(cell=tf.keras.layers.Flatten())
    x = flatten(x)
    x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

    # fully connected stack
    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        x = dense(x)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(x)
    return tf.keras.Model(audio_in, outputs)
def model(flags):
    """Mobilenet model.

    It is based on paper:
    MobileNets: Efficient Convolutional Neural Networks for
    Mobile Vision Applications https://arxiv.org/abs/1704.04861
    It is applied on sequence in time, so only 1D filters applied

    The graph is assembled in this order: optional speech feature
    extraction (only when flags.preprocess == 'raw'), one Conv2D block,
    a stack of depthwise + pointwise convolution blocks, global average
    pooling, Dropout, a Dense layer with flags.label_count units and an
    optional softmax (only when flags.return_softmax is set).

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    audio_in = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    x = audio_in
    if flags.preprocess == 'raw':
        # it is a self contained model, user need to feed raw audio only
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        x = speech_features.SpeechFeatures(feature_params)(x)

    # [batch, time, feature]
    x = tf.keras.backend.expand_dims(x, axis=2)
    # the new axis is inserted at index 2: [batch, time, 1, feature]

    # it is convolutional block
    first_conv = tf.keras.layers.Conv2D(
        filters=flags.cnn1_filters,
        kernel_size=utils.parse(flags.cnn1_kernel_size),
        padding='valid',
        use_bias=False,
        strides=utils.parse(flags.cnn1_strides))
    x = first_conv(x)
    x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
    x = tf.keras.layers.ReLU(6.)(x)

    block_settings = zip(
        utils.parse(flags.ds_kernel_size),
        utils.parse(flags.ds_strides),
        utils.parse(flags.cnn_filters))
    for kernel_size, strides, filters in block_settings:
        # 'same' padding is used only when the block does not stride
        if strides == (1, 1):
            depthwise_padding = 'same'
        else:
            depthwise_padding = 'valid'

        # it is depthwise convolutional block
        depthwise = tf.keras.layers.DepthwiseConv2D(
            kernel_size,
            padding=depthwise_padding,
            depth_multiplier=1,
            strides=strides,
            use_bias=False)
        x = depthwise(x)
        x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
        x = tf.keras.layers.ReLU(6.,)(x)

        # 1x1 convolution which sets the number of output channels
        pointwise = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=(1, 1),
            padding='same',
            use_bias=False,
            strides=(1, 1))
        x = pointwise(x)
        x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
        x = tf.keras.layers.ReLU(6.)(x)

    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    # [batch, filters]
    x = tf.keras.layers.Dropout(flags.dropout)(x)
    x = tf.keras.layers.Dense(flags.label_count)(x)
    if flags.return_softmax:
        x = tf.keras.layers.Activation('softmax')(x)
    # [batch, label_count]
    return tf.keras.Model(audio_in, x)
def model(flags):
    """Temporal Convolution ResNet model.

    It is based on paper:
    Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
    https://arxiv.org/pdf/1904.03814.pdf

    The graph is assembled in this order: optional speech feature
    extraction (only when flags.preprocess == 'raw'), a first Conv2D
    block, optional average pooling, one residual block per remaining
    entry of flags.channels, average pooling over the whole remaining
    [time, feature] plane, Dropout, a 1x1 Conv2D used as the fully
    connected layer and an optional softmax.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    def conv(tensor, filters, kernel_size, strides):
        # every convolution of this model uses 'same' padding and no
        # non-linearity of its own
        layer = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding='same',
            activation='linear')
        return layer(tensor)

    def batch_norm(tensor):
        # every batch normalization of this model is configured by the
        # same four flags
        layer = tf.keras.layers.BatchNormalization(
            momentum=flags.bn_momentum,
            center=flags.bn_center,
            scale=flags.bn_scale,
            renorm=flags.bn_renorm)
        return layer(tensor)

    def relu(tensor):
        return tf.keras.layers.Activation('relu')(tensor)

    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    audio_in = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    x = audio_in
    if flags.preprocess == 'raw':
        # it is a self contained model, user need to feed raw audio only
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        x = speech_features.SpeechFeatures(feature_params)(x)

    time_size, feature_size = x.shape[1:3]
    channels = utils.parse(flags.channels)

    x = tf.keras.backend.expand_dims(x)

    if flags.debug_2d:
        first_conv_kernel = (3, 3)
        conv_kernel = first_conv_kernel
    else:
        # [batch, time, 1, feature]
        x = tf.reshape(x, [-1, time_size, 1, feature_size])
        first_conv_kernel = (3, 1)
        conv_kernel = utils.parse(flags.kernel_size)

    x = conv(x, channels[0], first_conv_kernel, 1)
    x = batch_norm(x)
    x = relu(x)

    if utils.parse(flags.pool_size):
        pooling = tf.keras.layers.AveragePooling2D(
            pool_size=utils.parse(flags.pool_size),
            strides=flags.pool_stride)
        x = pooling(x)

    # residual blocks: the first channel entry was consumed above
    for n in channels[1:]:
        if n == x.shape[-1]:
            # channel count is unchanged: identity shortcut
            shortcut = x
            stride = 1
        else:
            # channel count changes: project the shortcut with a strided
            # 1x1 convolution so it can be added to the main path
            stride = 2
            shortcut = conv(x, n, 1, stride)
            shortcut = batch_norm(shortcut)
            shortcut = relu(shortcut)

        x = conv(x, n, conv_kernel, stride)
        x = batch_norm(x)
        x = relu(x)

        x = conv(x, n, conv_kernel, 1)
        x = batch_norm(x)

        # residual connection
        x = tf.keras.layers.Add()([x, shortcut])
        x = relu(x)

    # pool over the whole remaining plane (axes 1 and 2)
    x = tf.keras.layers.AveragePooling2D(
        pool_size=x.shape[1:3], strides=1)(x)

    x = tf.keras.layers.Dropout(rate=flags.dropout)(x)

    # fully connected layer
    x = conv(x, flags.label_count, 1, 1)

    x = tf.reshape(x, shape=(-1, x.shape[3]))
    if flags.return_softmax:
        x = tf.keras.layers.Activation('softmax')(x)
    return tf.keras.Model(audio_in, x)
def model(flags):
    """Inception model.

    It is based on paper:
    Rethinking the Inception Architecture for Computer Vision
    http://arxiv.org/abs/1512.00567

    The graph is assembled in this order: optional speech feature
    extraction (only when flags.preprocess == 'raw'), a stack of separable
    convolution blocks, two factorized (3x1 then 1x3) convolutions, a
    stack of four-branch inception blocks, global average pooling, Dropout
    and a Dense layer with flags.label_count units.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    audio_in = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    x = audio_in
    if flags.preprocess == 'raw':
        # it is a self contained model, user need to feed raw audio only
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        x = speech_features.SpeechFeatures(feature_params)(x)

    # [batch, time, feature]
    x = tf.keras.backend.expand_dims(x, axis=-1)
    # [batch, time, feature, 1]

    for filters in utils.parse(flags.cnn_filters0):
        separable = tf.keras.layers.SeparableConv2D(
            filters, (3, 3), padding='valid', use_bias=False)
        x = separable(x)
        x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
        x = tf.keras.layers.Activation('relu')(x)
        x = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(x)
        # [batch, time, feature, filters]

    # factorized convolution, using the last filter count of the stack
    filters = utils.parse(flags.cnn_filters0)[-1]
    for kernel in ((3, 1), (1, 3)):
        x = utils.conv2d_bn(
            x, filters, kernel, padding='valid', scale=flags.bn_scale)

    block_settings = zip(
        utils.parse(flags.cnn_strides),
        utils.parse(flags.cnn_filters1),
        utils.parse(flags.cnn_filters2))
    for stride, filters1, filters2 in block_settings:
        if stride > 1:
            x = tf.keras.layers.MaxPooling2D((3, 3), strides=stride)(x)

        # branch 1: single 1x1 convolution
        branch1 = utils.conv2d_bn(x, filters2, (1, 1), scale=flags.bn_scale)

        # branch 2: 1x1 followed by one factorized 3x1 / 1x3 pair
        branch2 = utils.conv2d_bn(x, filters1, (1, 1), scale=flags.bn_scale)
        branch2 = utils.conv2d_bn(
            branch2, filters1, (3, 1), scale=flags.bn_scale)
        branch2 = utils.conv2d_bn(
            branch2, filters2, (1, 3), scale=flags.bn_scale)

        # branch 3: 1x1 followed by two factorized 3x1 / 1x3 pairs
        branch3 = utils.conv2d_bn(x, filters1, (1, 1), scale=flags.bn_scale)
        branch3 = utils.conv2d_bn(
            branch3, filters1, (3, 1), scale=flags.bn_scale)
        branch3 = utils.conv2d_bn(
            branch3, filters1, (1, 3), scale=flags.bn_scale)
        branch3 = utils.conv2d_bn(
            branch3, filters1, (3, 1), scale=flags.bn_scale)
        branch3 = utils.conv2d_bn(
            branch3, filters2, (1, 3), scale=flags.bn_scale)

        # branch 4: average pooling followed by a 1x1 convolution
        branch4 = tf.keras.layers.AveragePooling2D(
            (3, 3), strides=(1, 1), padding='same')(x)
        branch4 = utils.conv2d_bn(
            branch4, filters2, (1, 1), scale=flags.bn_scale)

        branches = [branch1, branch2, branch3, branch4]
        x = tf.keras.layers.concatenate(branches)
        # [batch, time, feature, filters*4]

    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    # [batch, filters*4]
    x = tf.keras.layers.Dropout(flags.dropout)(x)
    x = tf.keras.layers.Dense(flags.label_count)(x)
    return tf.keras.Model(audio_in, x)
def model(flags):
    """BiRNN attention model.

    It is based on paper:
    A neural attention model for speech command recognition
    https://arxiv.org/pdf/1808.08929.pdf

    Depending on parameter rnn_type, model can be biLSTM or biGRU

    The graph is assembled in this order: speech feature extraction, a
    stack of Conv2D + BatchNormalization layers, a reshape which merges
    the feature and channel axes, a stack of bidirectional RNN layers,
    dot-product attention queried by the middle frame of the sequence,
    Dropout, a stack of Dense layers configured by flags.units2 /
    flags.act2 and a final Dense layer with flags.label_count units.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training

    Raises:
      ValueError: if flags.rnn_type is neither 'lstm' nor 'gru'
    """
    # maps the supported rnn_type values to tf.keras.layers class names;
    # the type is validated before any layer is created, so an unsupported
    # value fails immediately with an explicit error
    rnn_layer_names = {'lstm': 'LSTM', 'gru': 'GRU'}
    if flags.rnn_type not in rnn_layer_names:
        raise ValueError('not supported RNN type %s' % (flags.rnn_type,))
    rnn = getattr(tf.keras.layers, rnn_layer_names[flags.rnn_type])

    input_audio = tf.keras.layers.Input(
        shape=(flags.desired_samples,), batch_size=flags.batch_size)

    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(input_audio)

    net = tf.keras.backend.expand_dims(net)

    for filters, kernel_size, activation, dilation_rate, strides in zip(
            parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
            parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
            parse(flags.cnn_strides)):
        net = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides,
            padding='same')(net)
        net = tf.keras.layers.BatchNormalization()(net)

    shape = net.shape
    # input net dimension: [batch, time, feature, channels]
    # reshape dimension: [batch, time, feature * channels]
    # so that GRU/RNN can process it
    net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

    # dims: [batch, time, feature]
    for _ in range(flags.rnn_layers):
        net = tf.keras.layers.Bidirectional(
            rnn(flags.rnn_units, return_sequences=True, unroll=True))(net)
    feature_dim = net.shape[-1]
    middle = net.shape[1] // 2  # index of middle point of sequence

    # feature vector at middle point [batch, feature]
    mid_feature = net[:, middle, :]
    # apply one projection layer with the same dim as input feature
    query = tf.keras.layers.Dense(feature_dim)(mid_feature)

    # attention weights [batch, time]
    att_weights = tf.keras.layers.Dot(axes=[1, 2])([query, net])
    att_weights = tf.keras.layers.Softmax(name='attSoftmax')(att_weights)

    # apply attention weights [batch, feature]
    net = tf.keras.layers.Dot(axes=[1, 1])([att_weights, net])

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
def model(flags):
    """Inception model.

    It is based on paper:
    Rethinking the Inception Architecture for Computer Vision
    http://arxiv.org/abs/1512.00567

    All convolution kernels and pooling windows of this variant have
    size 1 on their second axis.

    The graph is assembled in this order: optional speech feature
    extraction (only when flags.preprocess == 'raw'), a stack of
    convolution blocks, a stack of three-branch inception blocks, global
    average pooling, Dropout, a Dense layer with flags.label_count units
    and an optional softmax (only when flags.return_softmax is set).

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    audio_in = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    x = audio_in
    if flags.preprocess == 'raw':
        # it is a self contained model, user need to feed raw audio only
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        x = speech_features.SpeechFeatures(feature_params)(x)

    # [batch, time, feature]
    x = tf.keras.backend.expand_dims(x, axis=2)
    # [batch, time, 1, feature]

    stem_settings = zip(
        utils.parse(flags.cnn1_strides),
        utils.parse(flags.cnn1_filters),
        utils.parse(flags.cnn1_kernel_sizes))
    for stride, filters, kernel_size in stem_settings:
        x = utils.conv2d_bn(
            x, filters, (kernel_size, 1),
            padding='valid', scale=flags.bn_scale)
        if stride > 1:
            x = tf.keras.layers.MaxPooling2D(
                (3, 1), strides=(stride, 1))(x)

    block_settings = zip(
        utils.parse(flags.cnn2_strides),
        utils.parse(flags.cnn2_filters1),
        utils.parse(flags.cnn2_filters2),
        utils.parse(flags.cnn2_kernel_sizes))
    for stride, filters1, filters2, kernel_size in block_settings:
        kernel = (kernel_size, 1)

        # branch 1: single 1x1 convolution
        branch1 = utils.conv2d_bn(x, filters1, (1, 1), scale=flags.bn_scale)

        # branch 2: 1x1 followed by one wider convolution
        branch2 = utils.conv2d_bn(x, filters1, (1, 1), scale=flags.bn_scale)
        branch2 = utils.conv2d_bn(
            branch2, filters1, kernel, scale=flags.bn_scale)

        # branch 3: 1x1 followed by two wider convolutions
        branch3 = utils.conv2d_bn(x, filters1, (1, 1), scale=flags.bn_scale)
        branch3 = utils.conv2d_bn(
            branch3, filters1, kernel, scale=flags.bn_scale)
        branch3 = utils.conv2d_bn(
            branch3, filters1, kernel, scale=flags.bn_scale)

        # merge the three branches, then mix them with a 1x1 convolution
        # which uses filters2
        x = tf.keras.layers.concatenate([branch1, branch2, branch3])
        x = utils.conv2d_bn(x, filters2, (1, 1), scale=flags.bn_scale)
        # [batch, time, 1, filters2]

        if stride > 1:
            x = tf.keras.layers.MaxPooling2D(
                (3, 1), strides=(stride, 1))(x)

    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dropout(flags.dropout)(x)
    x = tf.keras.layers.Dense(flags.label_count)(x)
    if flags.return_softmax:
        x = tf.keras.layers.Activation('softmax')(x)
    return tf.keras.Model(audio_in, x)
def model(flags):
    """Xception model.

    It is based on paper:
    Xception: Deep Learning with Depthwise Separable Convolutions
    https://arxiv.org/abs/1610.02357

    The graph is assembled in this order: optional speech feature
    extraction (only when flags.preprocess == 'raw'), a stack of Conv2D
    blocks, a stack of strided residual blocks, flags.cnn3_blocks
    identity-shortcut residual blocks, global average pooling, Dropout
    and a Dense layer with flags.label_count units.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    audio_in = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    x = audio_in
    if flags.preprocess == 'raw':
        # it is a self contained model, user need to feed raw audio only
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        x = speech_features.SpeechFeatures(feature_params)(x)

    # [batch, time, feature]
    x = tf.keras.backend.expand_dims(x, axis=-1)
    # [batch, time, feature, 1]

    # conv block
    conv_settings = zip(
        parse(flags.cnn1_kernel_size),
        parse(flags.cnn1_strides),
        parse(flags.cnn1_filters))
    for kernel_size, stride, filters in conv_settings:
        conv = tf.keras.layers.Conv2D(
            filters, kernel_size, strides=stride, use_bias=False)
        x = conv(x)
        x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
        x = tf.keras.layers.Activation('relu')(x)
        # [batch, time, feature, filters]

    # first residual block
    for filters in parse(flags.cnn2_filters):
        # the shortcut is a 1x1 convolution with stride (2, 2), the same
        # stride as the max pooling on the main path
        shortcut = tf.keras.layers.Conv2D(
            filters, (1, 1), strides=(2, 2), padding='same',
            use_bias=False)(x)
        shortcut = tf.keras.layers.BatchNormalization(
            scale=flags.bn_scale)(shortcut)

        x = tf.keras.layers.SeparableConv2D(
            filters, (3, 3), padding='same', use_bias=False)(x)
        x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
        x = tf.keras.layers.MaxPooling2D(
            (3, 3), strides=(2, 2), padding='same')(x)

        x = tf.keras.layers.add([x, shortcut])
        # [batch, time, feature, filters]

    # second residual block
    filters = parse(flags.cnn2_filters)[-1]
    for _ in range(flags.cnn3_blocks):
        shortcut = x
        # three identical relu -> separable conv -> batch norm stages
        for _ in range(3):
            x = tf.keras.layers.Activation('relu')(x)
            x = tf.keras.layers.SeparableConv2D(
                filters, (3, 3), padding='same', use_bias=False)(x)
            x = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(x)
        x = tf.keras.layers.add([x, shortcut])
        # [batch, time, feature, filters]

    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    # [batch, filters]
    x = tf.keras.layers.Dropout(flags.dropout)(x)
    x = tf.keras.layers.Dense(flags.label_count)(x)
    # [batch, label_count]
    return tf.keras.Model(audio_in, x)
def model(flags):
    """Inception resnet model.

    It is based on paper:
    Inception-v4, Inception-ResNet and the Impact of
    Residual Connections on Learning https://arxiv.org/abs/1602.07261

    The graph is assembled in this order: optional speech feature
    extraction (only when flags.preprocess == 'raw'), a stack of
    convolution blocks, a stack of inception resnet blocks each followed
    by a 1x1 convolution, global average pooling, Dropout, a Dense layer
    with flags.label_count units and an optional softmax (only when
    flags.return_softmax is set).

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    audio_in = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    x = audio_in
    if flags.preprocess == 'raw':
        # it is a self contained model, user need to feed raw audio only
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        x = speech_features.SpeechFeatures(feature_params)(x)

    # [batch, time, feature]
    x = tf.keras.backend.expand_dims(x, axis=2)
    # [batch, time, 1, feature]

    stem_settings = zip(
        utils.parse(flags.cnn1_filters),
        utils.parse(flags.cnn1_kernel_sizes),
        utils.parse(flags.cnn1_strides))
    for filters, kernel_size, stride in stem_settings:
        x = utils.conv2d_bn(
            x, filters, (kernel_size, 1),
            scale=flags.bn_scale, padding='valid')
        if stride > 1:
            x = tf.keras.layers.MaxPooling2D(
                (3, 1), strides=(stride, 1))(x)
    # [batch, time, 1, filters]

    block_settings = zip(
        utils.parse(flags.cnn2_strides),
        utils.parse(flags.cnn2_scales),
        utils.parse(flags.cnn2_filters_branch0),
        utils.parse(flags.cnn2_filters_branch1),
        utils.parse(flags.cnn2_filters_branch2),
        utils.parse(flags.cnn2_kernel_sizes))
    for setting in block_settings:
        (stride, scale, filters_branch0, filters_branch1,
         filters_branch2, kernel_size) = setting

        x = inception_resnet_block(
            x, scale, filters_branch0, filters_branch1, kernel_size,
            bn_scale=flags.bn_scale)
        # 1x1 convolution which uses filters_branch2
        x = utils.conv2d_bn(
            x, filters_branch2, (1, 1),
            scale=flags.bn_scale, padding='valid')
        if stride > 1:
            pooling = tf.keras.layers.MaxPooling2D(
                (3, 1), strides=(stride, 1), padding='valid')
            x = pooling(x)
            # [batch, time, 1, filters]

    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    # [batch, filters]
    x = tf.keras.layers.Dropout(flags.dropout)(x)
    x = tf.keras.layers.Dense(flags.label_count)(x)
    if flags.return_softmax:
        x = tf.keras.layers.Activation('softmax')(x)
    return tf.keras.Model(audio_in, x)