def testStreaming(self, input_samples):
  """Checks that the streaming STFT matches the precomputed STFT output.

  Args:
    input_samples: multiplier of the STFT frame step that sets the size of
      one streaming input packet; it is also stored in `params.step`.
  """
  # Non-streaming reference model: a single causal STFT layer.
  stft_op = stft.STFT(
      self.frame_size,
      self.frame_step,
      mode=modes.Modes.TRAINING,
      inference_batch_size=1,
      padding='causal')
  signal_length = self.input_signal.shape[1]
  model_input = tf.keras.layers.Input(shape=(signal_length,), batch_size=1)
  model_output = stft_op(model_input)
  model_non_stream = tf.keras.models.Model(model_input, model_output)

  params = test_utils.Params([1])
  # Input shape of one streaming inference call (batch size excluded).
  params.data_shape = (input_samples * stft_op.frame_step,)
  params.step = input_samples

  # Convert the reference model into a streaming one with internal state.
  model_stream = utils.to_streaming_inference(
      model_non_stream, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # Stream the signal through the model and compare with the reference,
  # truncated to the number of frames the streaming run produced.
  streamed = inference.run_stream_inference(
      params, model_stream, self.input_signal)
  num_frames = streamed.shape[1]
  self.assertAllClose(streamed, self.stft_out[:, 0:num_frames])
def test_delay_internal_state(self, delay_also_in_non_streaming):
  """Test delay layer with internal state.

  Feeds `time_delay` non-zero values followed by `time_delay` zeros into the
  streaming model and checks that every value is emitted `time_delay` steps
  after it was fed.

  Args:
    delay_also_in_non_streaming: flag forwarded to `delay_model` when
      building the non-streaming model.
  """
  # model and data parameters
  params = test_utils.Params([1], clip_duration_ms=1)

  # prepare non stream model
  time_delay = 3
  model = delay_model(params, time_delay, delay_also_in_non_streaming)
  model.summary()

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  # Summarize the converted streaming model, not the source model again.
  model_stream.summary()

  # fill the buffer: outputs stay zero while the delay line is being loaded
  for i in range(time_delay):
    output = model_stream.predict([i + 1])
    self.assertAllEqual(output[0, 0, 0], 0)

  # now get the data with delay: previously fed values come out in order
  for i in range(time_delay):
    output = model_stream.predict([0])
    self.assertAllEqual(output[0, 0, 0], i + 1)
def test_dynamic_shape(self):
  """Checks dynamic time dimension handling of the Conv1DTranspose model.

  The non-streaming model must run on an input with a dynamic time
  dimension, while conversion to streaming mode must raise ValueError.
  """
  # model and data parameters
  params = test_utils.Params([1], clip_duration_ms=0.25)

  # Ten-sample ramp with a leading batch dimension.
  inp_audio = np.expand_dims(np.arange(10), 0)

  # Non-streaming model with an unspecified (dynamic) number of samples.
  params.desired_samples = None
  model = conv1d_transpose_model(params, filters=1, kernel_size=3, stride=1)
  model.summary()

  # Non-streaming inference has to work with the dynamic shape.
  model.predict(inp_audio)

  expected_message = ('in streaming mode time dimension of input packet '
                      'should not be dynamic: TFLite limitation')
  with self.assertRaisesRegex(ValueError, expected_message):
    # Streaming conversion is expected to reject a dynamic time dimension.
    params.data_shape = (None,)
    utils.to_streaming_inference(
        model, params, Modes.STREAM_INTERNAL_STATE_INFERENCE)
def test_streaming_strides(self, stride):
  """Compares streaming and non-streaming Conv1DTranspose outputs.

  Args:
    stride: stride of the transposed convolution; controls the upscaling
      factor.
  """
  # Amount of data fed into the streaming model on every iteration.
  packet_size = 1
  params = test_utils.Params([packet_size], clip_duration_ms=0.25)

  # Ramp input signal with a leading batch dimension.
  ramp = np.arange(params.desired_samples)
  inp_audio = np.expand_dims(ramp, 0)

  # Non-streaming model.
  model = conv1d_transpose_model(
      params, filters=1, kernel_size=3, stride=stride)
  model.summary()

  # Streaming model with internal state.
  model_stream = utils.to_streaming_inference(
      model, params, Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # Both inference paths must produce the same result.
  expected = model.predict(inp_audio)
  streamed = test.run_stream_inference(params, model_stream, inp_audio)
  self.assertAllClose(streamed, expected)
def test_dynamic_shape(self):
  """Checks dynamic time dimension handling of the Conv2DTranspose model.

  The non-streaming model must run on an input with a dynamic time
  dimension, while conversion to streaming mode must raise ValueError.
  """
  # model and data parameters
  params = test_utils.Params([1], clip_duration_ms=0.25)

  # Random input generated before desired_samples is reset below.
  inp_audio = np.random.rand(
      1, params.desired_samples, 1, self.input_channels)

  # Non-streaming model with an unspecified (dynamic) number of samples.
  params.desired_samples = None
  model = conv2d_transpose_model(
      params,
      filters=1,
      kernel_size=(3, 1),
      strides=(1, 1),
      channels=self.input_channels)
  model.summary()

  # Non-streaming inference has to work with the dynamic shape.
  model.predict(inp_audio)

  expected_message = ('in streaming mode time dimension of input packet '
                      'should not be dynamic: TFLite limitation')
  with self.assertRaisesRegex(ValueError, expected_message):
    # Streaming conversion is expected to reject a dynamic time dimension.
    params.data_shape = (None, 1, self.input_channels)
    utils.to_streaming_inference(
        model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
def testStreaming(self, input_frames):
  """Checks that streaming inverse STFT matches the non-streaming output.

  Args:
    input_frames: value stored in `params.step`; one-step mode of the
      InverseSTFT layer is enabled only when it equals 1.
  """
  params = test_utils.Params([1])
  # Input shape of one streaming inference call (batch size excluded).
  params.data_shape = (1, self.feature_size)
  params.step = input_frames

  # Non-streaming reference model: a single inverse STFT layer.
  istft_op = inverse_stft.InverseSTFT(
      self.frame_size, self.frame_step, use_one_step=(input_frames == 1))
  model_input = tf.keras.layers.Input(
      shape=self.signal_stft.shape[1:3], batch_size=1, dtype=tf.complex64)
  model_output = istft_op(model_input)
  model_non_stream = tf.keras.models.Model(model_input, model_output)
  self.non_stream_out = model_non_stream.predict(self.signal_stft)

  # Convert the reference model into a streaming one with internal state.
  model_stream = utils.to_streaming_inference(
      model_non_stream, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # Run streaming inference.
  streamed = inference.run_stream_inference(
      params, model_stream, self.signal_stft)

  # Several samples at the end will be missing in the streaming output, so
  # compare only the part that was produced.
  num_samples = streamed.shape[1]
  self.assertAllClose(streamed, self.non_stream_out[:, 0:num_samples])
def test_transposed_conv(self):
  """Test transposed and standard conv model with 'same' padding.

  Streaming and non-streaming outputs are compared after aligning them by
  the total model delay and skipping the initial warm-up samples.
  """
  test_utils.set_seed(123)

  # model and data parameters
  cnn_filters = [1, 1]
  cnn_kernel_size = [5, 3]
  cnn_act = ['linear', 'linear']
  cnn_use_bias = [False, False]
  cnn_paddings = ['same', 'same']
  trans_paddings = ['same', 'causal']
  params = test_utils.Params([1], clip_duration_ms=2)

  # Ramp input signal with a leading batch dimension.
  inp_audio = np.expand_dims(np.arange(params.desired_samples), 0)

  # prepare non stream model
  model = transposed_conv_model(params, cnn_filters, cnn_kernel_size,
                                cnn_act, cnn_use_bias, cnn_paddings,
                                trans_paddings)

  # Replace every array weight with random values of the same shape;
  # anything that is not an ndarray is replaced by True.
  randomized = [
      np.random.rand(*weight.shape) if isinstance(weight, np.ndarray)
      else True
      for weight in model.get_weights()
  ]
  model.set_weights(randomized)

  model.summary()
  non_stream_out = model.predict(inp_audio)

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()
  stream_out = inference.run_stream_inference(params, model_stream,
                                              inp_audio)

  # shift defines the index after which data in streaming mode become valid:
  # in streaming mode we use ring buffers initialized with zeros and it needs
  # several cycles until they are filled with real data.
  shift = 2

  # the total conv delay is (5//2) * 2 + 3//2 = 5
  # (there is no delay from the k=3 s=2 transposed convs, 'same' or 'causal'),
  # and the explicit Delay layers add an additional same amount.
  total_delay = 10

  # Align both outputs on the same time range and compare them.
  expected = non_stream_out[0, shift:-total_delay]
  streamed = stream_out[0, total_delay + shift:]
  self.assertAllClose(streamed, expected)
def test_streaming_on_2d_data_strides(self, stride):
  """Tests Conv2DTranspose on 2d in streaming mode with different strides.

  Args:
    stride: stride used for both dimensions of the transposed convolution;
      controls the upscaling factor.
  """
  # Start from a clean graph and a session with on-demand GPU memory.
  tf1.reset_default_graph()
  session_config = tf1.ConfigProto()
  session_config.gpu_options.allow_growth = True
  sess = tf1.Session(config=session_config)
  tf1.keras.backend.set_session(sess)

  # Amount of data fed into the streaming model on every iteration.
  packet_size = 1
  params = test_utils.Params([packet_size], clip_duration_ms=0.25)

  input_features = 3
  # Random input data: [batch, time, features, channels].
  inp_audio = np.random.rand(1, params.desired_samples, input_features,
                             self.input_channels)

  # Non-streaming model.
  model = conv2d_transpose_model(
      params,
      filters=1,
      kernel_size=(3, 3),
      strides=(stride, stride),
      features=input_features,
      channels=self.input_channels)
  model.summary()

  # Fixed weights for every transposed conv: all-ones kernel, bias of 0.5.
  for keras_layer in model.layers:
    if not isinstance(keras_layer, tf.keras.layers.Conv2DTranspose):
      continue
    kernel = np.ones(keras_layer.weights[0].shape)
    bias = np.zeros(keras_layer.weights[1].shape) + 0.5
    keras_layer.set_weights([kernel, bias])

  # Shape of one streaming packet (batch size excluded).
  params.data_shape = (1, input_features, self.input_channels)

  # Streaming model with internal state.
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # Both inference paths must produce the same result.
  expected = model.predict(inp_audio)
  streamed = inference.run_stream_inference(params, model_stream, inp_audio)
  self.assertAllClose(streamed, expected)
def test_residual(self, step, padding, delay_also_in_non_streaming):
  """Test residual connection in streaming mode with conv layer.

  Args:
    step: value passed to `test_utils.Params` as the only stride entry.
    padding: padding mode used for both convolution layers.
    delay_also_in_non_streaming: forwarded to `residual_model`; also selects
      how the non-streaming output is trimmed before the comparison.
  """
  # model and data parameters
  cnn_filters = [1, 1]
  cnn_kernel_size = [5, 3]
  cnn_act = ['elu', 'elu']
  cnn_use_bias = [False, False]
  cnn_padding = [padding, padding]
  params = test_utils.Params([step], clip_duration_ms=2)

  # Ramp input signal with a leading batch dimension.
  inp_audio = np.expand_dims(np.arange(params.desired_samples), 0)

  # prepare non stream model
  model, sum_delay = residual_model(params, cnn_filters, cnn_kernel_size,
                                    cnn_act, cnn_use_bias, cnn_padding,
                                    delay_also_in_non_streaming)
  model.summary()

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # run inference
  non_stream_out = model.predict(inp_audio)
  stream_out = inference.run_stream_inference(params, model_stream,
                                              inp_audio)

  # Keep the first channel of the single batch item and cut both outputs
  # to their common length.
  channel = 0
  expected = non_stream_out[0, :, channel]
  streamed = stream_out[0, :, channel]
  min_len = min(streamed.shape[0], expected.shape[0])
  streamed = streamed[0:min_len]
  expected = expected[0:min_len]

  shift = 1
  if delay_also_in_non_streaming:
    # Delay was also applied in non-streaming, as well as streaming mode.
    expected = expected[shift + sum_delay:min_len]
  else:
    expected = expected[shift:min_len - sum_delay]
  streamed = streamed[sum_delay + shift:]

  self.assertAllEqual(expected.shape, (31 - sum_delay,))
  self.assertAllClose(streamed, expected)
def test_average_pooling_stream(self):
  """Compares streaming average pooling with non-streaming and global pooling.

  The streaming output must equal the non-streaming output, and its last
  time step must equal the result of GlobalAveragePooling2D on the input.
  """
  # prepare input data
  params = test_utils.Params([1])
  params.desired_samples = 5

  batch_size = 1
  time1 = params.desired_samples  # it is time dim (will not be averaged out)
  time2 = 3  # this dim will be averaged out and become 1
  feature = 16  # it is a feature dim

  # override data shape for streaming mode testing
  params.preprocess = 'custom'
  params.data_shape = (1, time2, feature)

  inp_audio = np.random.rand(batch_size, time1, time2, feature)
  inputs = tf.keras.layers.Input(
      shape=(time1, time2, feature), batch_size=batch_size)

  # Average pooling wrapped into a causal Stream layer.
  pooling_cell = average_pooling2d.AveragePooling2D(
      kernel_size=(time1, time2), padding='valid')
  stream_layer = stream.Stream(
      cell=pooling_cell, use_one_step=False, pad_time_dim='causal')
  model = tf.keras.Model(inputs, stream_layer(inputs))
  model.summary()

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # run inference and compare streaming vs non streaming
  non_stream_out = model.predict(inp_audio)
  stream_out = test.run_stream_inference(params, model_stream, inp_audio)
  self.assertAllClose(stream_out, non_stream_out)

  # Reference: global average pooling over the same input.
  global_pooled = tf.keras.layers.GlobalAveragePooling2D()(inputs)
  model_global = tf.keras.Model(inputs, global_pooled)
  model_global.summary()
  global_out = model_global.predict(inp_audio)

  # last result in streaming output has to be the same with global average
  self.assertAllClose(stream_out[0, -1, 0, :], global_out[0, :])
def test_stream_strided_convolution(self, get_model, conv_cell):
  """Test streaming convolutional layers with striding, dilation.

  Args:
    get_model: callable that builds the non-streaming model from the test
      parameters.
    conv_cell: convolution cell passed through to `get_model`.
  """
  cnn_filters = [1, 1, 1, 1]
  cnn_kernel_size = [3, 3, 3, 3]
  cnn_act = ['linear', 'linear', 'elu', 'elu']
  cnn_dilation_rate = [1, 1, 1, 2]
  cnn_strides = [2, 1, 3, 1]
  cnn_use_bias = [False, False, False, False]

  # Input data: a cosine with additive uniform noise.
  params = test_utils.Params(cnn_strides)
  x = np.arange(params.desired_samples)
  frequency = 2.0
  phase = (2.0 * np.pi / params.desired_samples) * frequency * x
  inp_audio = np.cos(phase) + np.random.rand(1, params.desired_samples) * 0.5

  # prepare non stream model
  model = get_model(params, conv_cell, cnn_filters, cnn_kernel_size, cnn_act,
                    cnn_dilation_rate, cnn_strides, cnn_use_bias)
  model.summary()

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # run inference
  non_stream_out = model.predict(inp_audio)
  stream_out = test.run_stream_inference(params, model_stream, inp_audio)

  # Keep the first channel of the single batch item and cut both outputs
  # to their common length before comparing.
  channel = 0
  expected = non_stream_out[0, :, channel]
  streamed = stream_out[0, :, channel]
  min_len = min(streamed.shape[0], expected.shape[0])
  streamed = streamed[0:min_len]
  expected = expected[0:min_len]

  self.assertAllEqual(expected.shape, (42,))
  self.assertAllClose(streamed, expected)
def test_residual(self, step):
  """Compares streaming and non-streaming outputs of the residual model.

  Args:
    step: value passed to `test_utils.Params` as the only stride entry.
  """
  # model and data parameters
  cnn_filters = [1, 1, 1, 1]
  cnn_kernel_size = [3, 3, 3, 3]
  cnn_act = ['linear', 'linear', 'elu', 'elu']
  cnn_use_bias = [False, False, False, False]
  cnn_padding = ['causal', 'causal', 'causal', 'causal']
  params = test_utils.Params([step], clip_duration_ms=2)

  # Input data: a cosine with additive uniform noise.
  x = np.arange(params.desired_samples)
  frequency = 2.0
  phase = (2.0 * np.pi / params.desired_samples) * frequency * x
  inp_audio = np.cos(phase) + np.random.rand(1, params.desired_samples) * 0.5

  # prepare non stream model
  model = residual_model(params, cnn_filters, cnn_kernel_size, cnn_act,
                         cnn_use_bias, cnn_padding)
  model.summary()

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # run inference
  non_stream_out = model.predict(inp_audio)
  stream_out = test.run_stream_inference(params, model_stream, inp_audio)

  # Keep the first channel of the single batch item and cut both outputs
  # to their common length before comparing.
  channel = 0
  expected = non_stream_out[0, :, channel]
  streamed = stream_out[0, :, channel]
  min_len = min(streamed.shape[0], expected.shape[0])
  streamed = streamed[0:min_len]
  expected = expected[0:min_len]

  self.assertAllEqual(expected.shape, (32,))
  self.assertAllClose(streamed, expected)
def test_conv(self):
  """Test conv model with 'same' padding.

  Streaming and non-streaming outputs are compared after aligning them by
  the delay and shift values returned by `conv_model`.
  """
  # model and data parameters
  cnn_filters = [1, 1, 1]
  cnn_kernel_size = [5, 3, 5]
  cnn_act = ['elu', 'elu', 'elu']
  cnn_use_bias = [False, False, False]
  cnn_padding = ['same', 'causal', 'same']
  params = test_utils.Params([1], clip_duration_ms=2)

  # Ramp input signal with a leading batch dimension.
  inp_audio = np.expand_dims(np.arange(params.desired_samples), 0)

  # prepare non stream model
  model, sum_delay, sum_shift = conv_model(params, cnn_filters,
                                           cnn_kernel_size, cnn_act,
                                           cnn_use_bias, cnn_padding)
  model.summary()
  non_stream_out = model.predict(inp_audio)

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()
  stream_out = inference.run_stream_inference(params, model_stream,
                                              inp_audio)

  shift = sum_shift + 1

  # Align both outputs on the same time range and compare them.
  expected = non_stream_out[0, shift:-sum_delay]
  streamed = stream_out[0, sum_delay + shift:]
  self.assertAllClose(streamed, expected)
def test_stream_strided_convolution(self, get_model, conv_cell):
  """Test streaming convolutional layers with striding, dilation.

  The non-streaming model is compared both with its streaming version and
  with a Keras native model built with the same seeded initializers.

  Args:
    get_model: callable that builds the non-streaming model from the test
      parameters.
    conv_cell: convolution cell passed to both model builders; it also
      selects which initializer keyword arguments are used.
  """
  cnn_filters = [1, 1, 1, 1]
  cnn_kernel_size = [3, 3, 3, 3]
  cnn_act = ['linear', 'linear', 'elu', 'elu']
  cnn_dilation_rate = [1, 1, 1, 2]
  cnn_strides = [2, 1, 3, 1]
  cnn_use_bias = [False, False, False, False]

  # Input data: a cosine with additive uniform noise.
  params = test_utils.Params(cnn_strides)
  x = np.arange(params.desired_samples)
  frequency = 2.0
  phase = (2.0 * np.pi / params.desired_samples) * frequency * x
  inp_audio = np.cos(phase) + np.random.rand(1, params.desired_samples) * 0.5

  # Seeded initializers so that both models start from the same weights.
  if conv_cell == tf.keras.layers.SeparableConv1D:
    kwargs = {
        'depthwise_initializer':
            tf.keras.initializers.GlorotUniform(seed=123),
        'pointwise_initializer':
            tf.keras.initializers.GlorotUniform(seed=456),
    }
  else:
    kwargs = {
        'kernel_initializer': tf.keras.initializers.GlorotUniform(seed=123),
    }

  # Prepare Keras native model.
  model_native = conv_model_keras_native(params, conv_cell, cnn_filters,
                                         cnn_kernel_size, cnn_act,
                                         cnn_dilation_rate, cnn_strides,
                                         cnn_use_bias, **kwargs)
  model_native.summary()

  # prepare non stream model
  model = get_model(params, conv_cell, cnn_filters, cnn_kernel_size, cnn_act,
                    cnn_dilation_rate, cnn_strides, cnn_use_bias, **kwargs)
  model.summary()

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # run inference
  non_stream_out = model.predict(inp_audio)
  native_out = model_native.predict(inp_audio)
  stream_out = test.run_stream_inference(params, model_stream, inp_audio)

  # Keep the first channel of the single batch item and cut all outputs to
  # the common length of the streaming and non-streaming results.
  channel = 0
  expected = non_stream_out[0, :, channel]
  native = native_out[0, :, channel]
  streamed = stream_out[0, :, channel]
  min_len = min(streamed.shape[0], expected.shape[0])
  streamed = streamed[0:min_len]
  native = native[0:min_len]
  expected = expected[0:min_len]

  expected_length = params.desired_samples / np.prod(cnn_strides)
  self.assertAllEqual(expected.shape, (expected_length,))

  with self.subTest(name='stream_vs_non_stream'):
    self.assertAllClose(streamed, expected)

  with self.subTest(name='non_stream_vs_native'):
    self.assertAllClose(expected, native)
def test_cnn_model_end_to_end(self):
  """Trains a quantization-aware CNN and checks its streaming TFLite output.

  The model is trained in non-streaming mode, converted to an int8 TFLite
  model with external streaming state, and the TFLite streaming output is
  compared with the non-streaming TF prediction.
  """
  session_config = tf1.ConfigProto()
  session_config.gpu_options.allow_growth = True
  sess = tf1.Session(config=session_config)
  tf1.keras.backend.set_session(sess)

  test_utils.set_seed(123)

  # data parameters
  num_time_bins = 12
  feature_size = 12

  # model params.
  total_stride = 2
  params = test_utils.Params([total_stride], 0)
  params.model_name = 'cnn'
  params.cnn_filters = '2'
  params.cnn_kernel_size = '(3,3)'
  params.cnn_act = "'relu'"
  params.cnn_dilation_rate = '(1,1)'
  params.cnn_strides = '(2,2)'
  params.dropout1 = 0.5
  params.units2 = ''
  params.act2 = ''
  params.label_count = 2
  params.return_softmax = True
  params.quantize = 1  # apply quantization aware training
  params.data_shape = (num_time_bins, feature_size)
  params.preprocess = 'custom'

  model = cnn.model(params)
  model.summary()

  # Training data; the same samples are reused for validation.
  train_images, train_labels = test_utils.generate_data(
      img_size_y=num_time_bins, img_size_x=feature_size, n_samples=32)
  test_images, test_labels = train_images, train_labels

  # create and train quantization aware model in non streaming mode
  model.compile(
      optimizer='adam',
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      metrics=['accuracy'])
  model.fit(
      train_images,
      train_labels,
      epochs=1,
      validation_data=(test_images, test_labels))
  model.summary()

  # one test image
  train_image = train_images[:1]

  # run tf non streaming inference
  non_stream_output_tf = model.predict(train_image)

  # specify input data shape for streaming mode
  params.data_shape = (total_stride, feature_size)
  # TODO(rybakov) add params structure for model with no feature extractor

  # prepare tf streaming model and use it to generate representative_dataset
  with quantize.quantize_scope():
    stream_quantized_model = utils.to_streaming_inference(
        model, params, Modes.STREAM_EXTERNAL_STATE_INFERENCE)

  calibration_data = prepare_calibration_data(stream_quantized_model,
                                              total_stride, train_image)

  def representative_dataset(dtype):
    """Returns a generator function over calibration data cast to dtype."""

    def _representative_dataset_gen():
      for sample in calibration_data:
        yield [
            sample[0].astype(dtype),  # input audio packet
            sample[1].astype(dtype),  # conv state
            sample[2].astype(dtype),  # flatten state
        ]

    return _representative_dataset_gen

  # convert streaming quantization aware model to tflite
  # and apply post training quantization
  with quantize.quantize_scope():
    tflite_streaming_model = utils.model_to_tflite(
        sess,
        model,
        params,
        Modes.STREAM_EXTERNAL_STATE_INFERENCE,
        optimizations=[tf.lite.Optimize.DEFAULT],
        inference_type=tf.int8,
        experimental_new_quantizer=True,
        representative_dataset=representative_dataset(np.float32))

  # run tflite in streaming mode and compare output logits with tf
  interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
  interpreter.allocate_tensors()

  # One zero-filled array per interpreter input.
  input_states = [
      np.zeros(detail['shape'], dtype=np.float32)
      for detail in interpreter.get_input_details()
  ]
  stream_out_tflite = inference.run_stream_inference_classification_tflite(
      params, interpreter, train_image, input_states)
  self.assertAllClose(stream_out_tflite, non_stream_output_tf, atol=0.001)
def test_streaming_on_1d_data_strides(self, stride):
  """Tests Conv2DTranspose on 1d in streaming mode with different strides.

  Both the TF streaming model with internal state and the TFLite streaming
  model with external state are compared with the non-streaming TF model.

  Args:
    stride: stride of the transposed convolution along the time dimension;
      controls the upscaling factor.
  """
  # Start from a clean graph and a session with on-demand GPU memory.
  tf1.reset_default_graph()
  session_config = tf1.ConfigProto()
  session_config.gpu_options.allow_growth = True
  sess = tf1.Session(config=session_config)
  tf1.keras.backend.set_session(sess)

  # Amount of data fed into the streaming model on every iteration.
  packet_size = 1
  params = test_utils.Params([packet_size], clip_duration_ms=0.25)

  # Random input data: [batch, time, 1, channels].
  inp_audio = np.random.rand(1, params.desired_samples, 1,
                             self.input_channels)

  # Non-streaming model.
  model = conv2d_transpose_model(
      params,
      filters=1,
      kernel_size=(3, 1),
      strides=(stride, 1),
      channels=self.input_channels)
  model.summary()

  # Fixed weights for every transposed conv: all-ones kernel, bias of 0.5.
  for keras_layer in model.layers:
    if not isinstance(keras_layer, tf.keras.layers.Conv2DTranspose):
      continue
    kernel = np.ones(keras_layer.weights[0].shape)
    bias = np.zeros(keras_layer.weights[1].shape) + 0.5
    keras_layer.set_weights([kernel, bias])

  # Shape of one streaming packet (batch size excluded).
  params.data_shape = (1, 1, self.input_channels)

  # Streaming model with internal state.
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()

  # TF streaming output must match the non-streaming output.
  non_stream_out = model.predict(inp_audio)
  stream_out = inference.run_stream_inference(params, model_stream,
                                              inp_audio)
  self.assertAllClose(stream_out, non_stream_out)

  # Convert TF non-streaming model to TFLite external-state streaming model.
  tflite_streaming_model = utils.model_to_tflite(
      sess, model, params, modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE)
  self.assertTrue(tflite_streaming_model)

  # Run TFLite external-state streaming inference.
  interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
  interpreter.allocate_tensors()

  # Before processing the test sequence, create the model state: one
  # zero-filled array per interpreter input.
  input_states = [
      np.zeros(detail['shape'], dtype=np.float32)
      for detail in interpreter.get_input_details()
  ]

  stream_out_tflite_external_st = inference.run_stream_inference_tflite(
      params, interpreter, inp_audio, input_states, concat=True)

  # compare streaming TFLite with external-state vs TF non-streaming
  self.assertAllClose(stream_out_tflite_external_st, non_stream_out)