def keyword_marvin_v3_vl_0_4(input_shape=(16000,), data_settings = None, dropout = 0.2):
  """Builds the SVDF keyword model for the 'marvin' wake word (v3, vl 0.4).

  Args:
    input_shape: shape of the raw-audio input tensor.
    data_settings: feature-extraction settings; must match the exact
      configuration this architecture was tuned for (checked below).
    dropout: dropout rate used in the SVDF layers and before the classifier.

  Returns:
    An uncompiled tf.keras.models.Model mapping raw audio to logits.
  """
  # NOTE(review): asserts are stripped under `python -O`; consider raising
  # ValueError for these configuration checks instead.
  assert data_settings.window_size_ms == 30.0
  assert data_settings.window_stride_ms == 10.0
  assert data_settings.dct_num_features == 40
  assert data_settings.mel_num_bins == 80
  assert data_settings.background_volume == 0.4
  assert data_settings.mel_upper_edge_hertz == 7000
  assert data_settings.wanted_words == 'marvin'

  audio_in = tf.keras.Input(input_shape)
  net = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(audio_in)

  # Stack of SVDF layers: three (84, 12, 32) layers followed by two
  # (32, 32, -1) layers; units2=-1 means no second projection.
  for layer_id in (1, 2, 3):
    net = svdf.Svdf(units1=84, memory_size=12, units2=32, dropout=dropout,
                    activation='relu', pad=0,
                    name='svdf_%d' % layer_id)(net)
  for layer_id in (4, 5):
    net = svdf.Svdf(units1=32, memory_size=32, units2=-1, dropout=dropout,
                    activation='relu', pad=0,
                    name='svdf_%d' % layer_id)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(dropout)(net)
  net = tf.keras.layers.Dense(units=data_settings.label_count)(net)

  # Create model
  return tf.keras.models.Model(inputs=audio_in, outputs=net,
                               name='keyword_marvin_v3_vl_0_4')
def E2E_1stage_v7(input_shape=(16000,), data_settings = None, dropout = 0.5):
  """Builds the single-stage end-to-end SVDF keyword model (v7).

  Args:
    input_shape: shape of the raw-audio input tensor.
    data_settings: feature-extraction settings; overwritten in place below.
    dropout: dropout rate used in the SVDF layers and before the classifier.

  Returns:
    An uncompiled tf.keras.models.Model mapping raw audio to logits.
  """
  # NOTE(review): this mutates the caller's data_settings in place to force
  # the feature configuration this architecture expects.
  data_settings.window_size_ms = 40.0
  data_settings.window_stride_ms = 20.0
  data_settings.dct_num_features = 40
  data_settings.mel_num_bins = 80
  data_settings.mel_upper_edge_hertz = 7000

  audio_in = tf.keras.Input(input_shape)
  net = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(audio_in)

  # Three wide (224, 12, 56) SVDF layers, then two (32, 32, -1) layers;
  # units2=-1 means no second projection.
  for layer_id in (1, 2, 3):
    net = svdf.Svdf(units1=224, memory_size=12, units2=56, dropout=dropout,
                    activation='relu', pad=0,
                    name='svdf_%d' % layer_id)(net)
  for layer_id in (4, 5):
    net = svdf.Svdf(units1=32, memory_size=32, units2=-1, dropout=dropout,
                    activation='relu', pad=0,
                    name='svdf_%d' % layer_id)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(dropout)(net)
  net = tf.keras.layers.Dense(units=data_settings.label_count)(net)

  # Create model
  return tf.keras.models.Model(inputs=audio_in, outputs=net,
                               name='E2E_1stage_v7')
def test_streaming_inference_internal_state(self):
  """Checks that per-step streaming inference matches non-streaming output.

  Builds an SVDF layer in STREAM_INTERNAL_STATE_INFERENCE mode with the same
  weights as the non-streaming reference model, feeds the input one time
  step at a time, and compares every step against the reference prediction.
  """
  reference_np, _ = self._run_non_stream_model()

  streaming_mode = Modes.STREAM_INTERNAL_STATE_INFERENCE
  # Streaming input carries exactly one time step per call.
  step_input = tf.keras.layers.Input(shape=(
      1,
      self.input_data.shape[2],
  ))
  layer = svdf.Svdf(units1=self.weights[0].shape[1],
                    memory_size=self.memory_size,
                    units2=self.weights[3].shape[1],
                    activation="linear",
                    inference_batch_size=self.batch_size,
                    mode=streaming_mode)
  step_output = layer(inputs=step_input)

  # Internal state starts at zero; in this mode it is stored as an extra
  # variable of the depthwise conv layer.
  zero_state = np.zeros(
      [self.batch_size, self.memory_size, self.weights[1].shape[-1]])
  layer.dense1.set_weights([self.weights[0]])
  layer.depth_cnn1.set_weights([self.weights[1], self.weights[2], zero_state])
  layer.dense2.set_weights([self.weights[3], self.weights[4]])

  streaming_model = tf.keras.models.Model(step_input, step_output)

  time_steps = self.input_data.shape[1]
  batch_size = self.input_data.shape[0]
  for t in range(time_steps):  # loop over every element in time
    frame = np.expand_dims(self.input_data[:, t, :], 1)
    step_np = streaming_model.predict(frame)
    for b in range(batch_size):  # loop over batch
      self.assertAllClose(step_np[b][0], reference_np[b][t])
def _run_non_stream_model(self):
  """Builds and runs the non-streaming (TRAINING-mode) reference SVDF model.

  The model expects that input_data was already initialized in tu.TestBase
  in setUp; by default input_data has 3 dimensions, and the size of each
  dimension is constant, defined by self.weights.

  Returns:
    Tuple of (predictions over self.input_data, the Keras model).
  """
  training_mode = Modes.TRAINING
  # Time dimension is unconstrained in non-streaming mode.
  inputs = tf.keras.layers.Input(shape=(
      None,
      self.input_data.shape[2],
  ))
  layer = svdf.Svdf(units1=self.weights[0].shape[1],
                    memory_size=self.memory_size,
                    units2=self.weights[3].shape[1],
                    activation="linear",
                    inference_batch_size=self.batch_size,
                    mode=training_mode)
  outputs = layer(inputs=inputs)

  # Install the shared reference weights into the three sub-layers.
  layer.dense1.set_weights([self.weights[0]])
  layer.depth_cnn1.set_weights([self.weights[1], self.weights[2]])
  layer.dense2.set_weights([self.weights[3], self.weights[4]])

  model_tf = tf.keras.models.Model(inputs, outputs)
  # run inference in non streaming mode
  predictions = model_tf.predict(self.input_data)
  return predictions, model_tf
def model(flags): """SVDF model. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( frame_size_ms=flags.window_size_ms, frame_step_ms=flags.window_stride_ms, sample_rate=flags.sample_rate, use_tf_fft=flags.use_tf_fft, preemph=flags.preemph, window_type=flags.window_type, feature_type=flags.feature_type, mel_num_bins=flags.mel_num_bins, mel_lower_edge_hertz=flags.mel_lower_edge_hertz, mel_upper_edge_hertz=flags.mel_upper_edge_hertz, mel_non_zero_only=flags.mel_non_zero_only, fft_magnitude_squared=flags.fft_magnitude_squared, dct_num_features=flags.dct_num_features)( input_audio) for i, (units1, memory_size, units2, dropout, activation) in enumerate( zip( parse(flags.svdf_units1), parse(flags.svdf_memory_size), parse(flags.svdf_units2), parse(flags.svdf_dropout), parse(flags.svdf_act))): net = svdf.Svdf( units1=units1, memory_size=memory_size, units2=units2, dropout=dropout, activation=activation, pad=flags.svdf_pad, name='svdf_%d' % i)( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """SVDF model. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) # for streaming mode it is better to use causal padding padding = 'causal' if flags.svdf_pad else 'valid' for i, (units1, memory_size, units2, dropout, activation) in enumerate( zip( utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size), utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout), utils.parse(flags.svdf_act))): net = svdf.Svdf( units1=units1, memory_size=memory_size, units2=units2, dropout=dropout, activation=activation, pad=padding, name='svdf_%d' % i)( net) net = stream.Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip( utils.parse(flags.units2), utils.parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def E2E_1stage_v2(input_shape=(16000, ), data_settings=None, dropout=0.2):
  """Builds the single-stage end-to-end SVDF keyword model (v2).

  Args:
    input_shape: shape of the raw-audio input tensor.
    data_settings: feature-extraction and label settings (window sizes,
      label_count) supplied by the caller.
    dropout: dropout rate used in the SVDF layers and before the classifier.

  Returns:
    An uncompiled tf.keras.models.Model mapping raw audio to logits.
  """
  audio_in = tf.keras.Input(input_shape)
  net = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms)(audio_in)

  # (units1, memory_size, units2) per SVDF layer; units2=-1 means no
  # second projection in the last layer.
  svdf_params = [
      (256, 8, 64),
      (256, 10, 64),
      (256, 10, 128),
      (256, 10, 128),
      (256, 10, 128),
      (256, 10, -1),
  ]
  for layer_id, (units1, memory_size, units2) in enumerate(svdf_params, 1):
    net = svdf.Svdf(units1=units1,
                    memory_size=memory_size,
                    units2=units2,
                    dropout=dropout,
                    activation='relu',
                    pad=0,
                    name='svdf_%d' % layer_id)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(dropout)(net)
  net = tf.keras.layers.Dense(units=data_settings.label_count)(net)

  # Create model
  return tf.keras.models.Model(inputs=audio_in, outputs=net,
                               name='E2E_1stage_v2')
def E2E_1stage_v9(input_shape=(16000,), data_settings = None, dropout = 0.5):
  """Builds the single-stage end-to-end SVDF keyword model (v9).

  Args:
    input_shape: shape of the raw-audio input tensor.
    data_settings: feature-extraction settings; must match the exact
      configuration this architecture was tuned for (checked below).
    dropout: dropout rate used in the SVDF layers and before the classifier.

  Returns:
    An uncompiled tf.keras.models.Model mapping raw audio to logits.
  """
  # NOTE(review): asserts are stripped under `python -O`; consider raising
  # ValueError for these configuration checks instead.
  assert data_settings.wanted_words == 'on,off,up,down,zero,one,two,three,four,five,six,seven,eight,nine'
  assert data_settings.window_size_ms == 40.0
  assert data_settings.window_stride_ms == 20.0
  assert data_settings.dct_num_features == 40
  assert data_settings.mel_num_bins == 80
  assert data_settings.mel_upper_edge_hertz == 7000

  audio_in = tf.keras.Input(input_shape)
  net = speech_features.SpeechFeatures(
      frame_size_ms=data_settings.window_size_ms,
      frame_step_ms=data_settings.window_stride_ms,
      mel_num_bins=data_settings.mel_num_bins,
      dct_num_features=data_settings.dct_num_features,
      mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(audio_in)

  # (units1, memory_size, units2) per SVDF layer; units2=-1 means no
  # second projection in the last layer.
  svdf_params = [
      (192, 4, 96),
      (192, 10, 96),
      (192, 10, 96),
      (192, 10, 96),
      (192, 10, 96),
      (192, 10, -1),
  ]
  for layer_id, (units1, memory_size, units2) in enumerate(svdf_params, 1):
    net = svdf.Svdf(units1=units1,
                    memory_size=memory_size,
                    units2=units2,
                    dropout=dropout,
                    activation='relu',
                    pad=0,
                    name='svdf_%d' % layer_id)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(dropout)(net)
  net = tf.keras.layers.Dense(units=data_settings.label_count)(net)

  # Create model
  return tf.keras.models.Model(inputs=audio_in, outputs=net,
                               name='E2E_1stage_v9')
def model(flags): """SVDF model. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=(flags.desired_samples,), batch_size=flags.batch_size) net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( input_audio) for i, (units1, memory_size, units2, dropout, activation) in enumerate( zip( parse(flags.svdf_units1), parse(flags.svdf_memory_size), parse(flags.svdf_units2), parse(flags.svdf_dropout), parse(flags.svdf_act))): net = svdf.Svdf( units1=units1, memory_size=memory_size, units2=units2, dropout=dropout, activation=activation, pad=flags.svdf_pad, name='svdf_%d' % i)( net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """SVDF model with residual connections. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf In addition we added residual connection Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) blocks_pool = parse(flags.blocks_pool) if len(blocks_pool) != 3: raise ValueError('number of pooling blocks has to be 3, but get: ', len(blocks_pool)) # for streaming mode it is better to use causal padding padding = 'causal' if flags.svdf_pad else 'valid' # first residual block number_of_blocks = len(parse(flags.block1_units1)) activations = [flags.activation] * number_of_blocks activations[-1] = 'linear' # last layer is linear residual = net for i, (units1, memory_size, activation) in enumerate( zip(parse(flags.block1_units1), parse(flags.block1_memory_size), activations)): # [batch, time, feature] net = svdf.Svdf(units1=units1, memory_size=memory_size, units2=-1, dropout=flags.svdf_dropout, activation=activation, pad=padding, use_bias=flags.svdf_use_bias, use_batch_norm=flags.use_batch_norm, bn_scale=flags.bn_scale, name='svdf_1_%d' % i)(net) # number of channels in the last layer units1_last = parse(flags.block1_units1)[-1] # equivalent to 1x1 convolution residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual) residual = tf.keras.layers.BatchNormalization( scale=flags.bn_scale)(residual) # residual connection net = tf.keras.layers.Add()([net, residual]) # [batch, time, feature] net = tf.keras.layers.Activation(flags.activation)(net) net = tf.keras.layers.MaxPool1D(3, 
strides=blocks_pool[0], padding='valid')(net) # second residual block number_of_blocks = len(parse(flags.block2_units1)) activations = [flags.activation] * number_of_blocks activations[-1] = 'linear' # last layer is linear residual = net for i, (units1, memory_size, activation) in enumerate( zip(parse(flags.block2_units1), parse(flags.block2_memory_size), activations)): # [batch, time, feature] net = svdf.Svdf(units1=units1, memory_size=memory_size, units2=-1, dropout=flags.svdf_dropout, activation=activation, pad=padding, use_bias=flags.svdf_use_bias, use_batch_norm=flags.use_batch_norm, bn_scale=flags.bn_scale, name='svdf_2_%d' % i)(net) # number of channels in the last layer units1_last = parse(flags.block2_units1)[-1] # equivalent to 1x1 convolution residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual) residual = tf.keras.layers.BatchNormalization( scale=flags.bn_scale)(residual) # residual connection net = tf.keras.layers.Add()([net, residual]) net = tf.keras.layers.Activation(flags.activation)(net) # [batch, time, feature] net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[1], padding='valid')(net) # third residual block number_of_blocks = len(parse(flags.block3_units1)) activations = [flags.activation] * number_of_blocks activations[-1] = 'linear' # last layer is linear residual = net for i, (units1, memory_size, activation) in enumerate( zip(parse(flags.block3_units1), parse(flags.block3_memory_size), activations)): net = svdf.Svdf(units1=units1, memory_size=memory_size, units2=-1, dropout=flags.svdf_dropout, activation=activation, pad=padding, use_bias=flags.svdf_use_bias, use_batch_norm=flags.use_batch_norm, bn_scale=flags.bn_scale, name='svdf_3_%d' % i)(net) # number of channels in the last layer units1_last = parse(flags.block3_units1)[-1] # equivalent to 1x1 convolution residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual) residual = tf.keras.layers.BatchNormalization( scale=flags.bn_scale)(residual) # residual 
connection net = tf.keras.layers.Add()([net, residual]) net = tf.keras.layers.Activation(flags.activation)(net) net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[2], padding='valid')(net) # [batch, time, feature] # convert all feature to one vector if flags.flatten: net = tf.keras.layers.Flatten()(net) else: net = tf.keras.layers.GlobalAveragePooling1D()(net) # [batch, feature] net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units in parse(flags.units2): net = tf.keras.layers.Dense(units=units, activation=flags.activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)