Example 1
    def testModelEndToEnd(self, model_fn):
        # 1. Check whether quantized model graph can be constructed.
        model = model_fn(self)
        model = quantize.quantize_model(model)

        # 2. Sanity check to ensure basic training on random data works.
        x_train, y_train = self._create_test_data(model)
        model.compile(loss='mse', optimizer='sgd', metrics=['accuracy'])
        model.fit(x_train, y_train, epochs=100)

        x_test, y_test = self._create_test_data(model)

        y_tf = model.predict(x_test)

        # 3. Ensure conversion to TFLite works.
        _, tflite_file = tempfile.mkstemp('.tflite')
        print('TFLite File: ', tflite_file)
        with quantize.quantize_scope():
            utils.convert_keras_to_tflite(model, tflite_file)

        # 4. Verify input runs on converted model.
        y_tfl = self._execute_tflite(tflite_file, x_test, y_test)

        # 5. Verify results are the same in TF and TFL.
        # TODO(pulkitb): Temporarily raise tolerances since some rounding
        # changes in x86 kernels are causing values to differ by 'scale'.
        self.assertAllClose(y_tf, y_tfl, atol=1e-1, rtol=1e-1)
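
The `utils.convert_keras_to_tflite` helper used above is project-specific. A minimal sketch of an equivalent TF2 conversion using only public TensorFlow and TFMOT APIs might look like this (the function name and the choice of optimizations are illustrative):

import tensorflow as tf
import tensorflow_model_optimization as tfmot

def convert_quantized_keras_to_tflite(model, tflite_path):
    # The scope is only strictly needed when custom quantize objects must be
    # deserialized, but mirroring the examples above keeps conversion uniform.
    with tfmot.quantization.keras.quantize_scope():
        converter = tf.lite.TFLiteConverter.from_keras_model(model)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        tflite_model = converter.convert()
    with open(tflite_path, 'wb') as f:
        f.write(tflite_model)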
Example 2
def measure_sparsity(model):
    assert quantize_base.SET_CUSTOM_TNH_FLAG, (
        "TFMOD needs to be modified with quantizer disabled for proper "
        "running")

    # Helper function uses `quantize_annotate_layer` to annotate every layer
    # so that its sparsity can be measured.
    def add_sparsity_annotation(layer):
        quantize_config = SparsityMeter()
        log.info(
            "**Sparsity Measure annotation added to layer {} with {}".format(
                layer.name, quantize_config))
        quantized_layer = quantize_annotate_layer(
            to_annotate=layer, quantize_config=quantize_config)
        return quantized_layer

    log.info("Annotating model {}".format(model.name))
    tf.keras.backend.clear_session()
    annotated_model = tf.keras.models.clone_model(
        model, clone_function=add_sparsity_annotation)

    with quantize_scope({
            'SparsityMeter': SparsityMeter,
            'ActivSparsityMeasure': ActivSparsityMeasure,
            'WeightsSparsityMeasure': WeightsSparsityMeasure
    }):
        # Use `quantize_apply` to actually make the model Sparsity Measure aware.
        quant_aware_model = quantize_apply(annotated_model)

        return quant_aware_model
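
A hypothetical call site, assuming the `SparsityMeter`, `ActivSparsityMeasure` and `WeightsSparsityMeasure` classes referenced above are importable:

import numpy as np
import tensorflow as tf

base_model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(8,)),
    tf.keras.layers.Dense(4),
])
sparsity_model = measure_sparsity(base_model)
# Running inference triggers the per-layer sparsity measurements.
sparsity_model.predict(np.random.rand(32, 8).astype(np.float32))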
Example 3
    def testQuantizeApply_KeepTrainableWeightOrder(self):
        layer = self.CustomConvLayer(input_shape=(28, 28, 3))
        model = keras.Sequential([layer])

        def apply_quantization_to_dense(layer):
            if isinstance(layer, self.CustomConvLayer):
                return quantize_annotate_layer(
                    layer, quantize_config=self.CustomConvQuantizeConfig())
            return layer

        annotated_model = tf.keras.models.clone_model(
            model,
            clone_function=apply_quantization_to_dense,
        )

        with quantize.quantize_scope({
                'CustomConvQuantizeConfig': self.CustomConvQuantizeConfig,
                'CustomConvLayer': self.CustomConvLayer
        }):
            quant_aware_model = quantize_apply(annotated_model)

        self._assert_weights_different_objects(
            model.trainable_weights, quant_aware_model.trainable_weights)
        self._assert_weights_equal_value(model.trainable_weights,
                                         quant_aware_model.trainable_weights)
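
The two assertion helpers are not shown in this snippet; plausible implementations of the test-case helpers (assumptions, not the original code) would check object identity and value equality respectively:

    def _assert_weights_different_objects(self, weights_a, weights_b):
        # quantize_apply clones layers, so the variables must be distinct objects.
        for w_a, w_b in zip(weights_a, weights_b):
            self.assertIsNot(w_a, w_b)

    def _assert_weights_equal_value(self, weights_a, weights_b):
        # ...while the cloned variables still hold the original values.
        for w_a, w_b in zip(weights_a, weights_b):
            self.assertAllEqual(w_a, w_b)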
Example 4
def apply_quantization(model):
    # Helper function uses `quantize_annotate_layer` to annotate the layer
    # types listed in `kernelization_map` (Conv2D here) for kernelization.
    def add_quantize_annotation(layer):
        kernelization_map = [
            # tf.keras.layers.Dense,
            tf.keras.layers.Conv2D
        ]

        for layer_type in kernelization_map:
            if isinstance(layer, layer_type):
                quantize_config = SLCQuantizeConfig()

                log.info(
                    "**Kernelization annotation added to layer {} of type {} "
                    "with {}".format(layer.name, layer_type, quantize_config))

                quantized_layer = quantize_annotate_layer(
                    to_annotate=layer, quantize_config=quantize_config)
                return quantized_layer
        log.info(
            "**Kernelization annotation not added to layer {} of type {}".format(
                layer.name, type(layer)))

        return layer

    # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
    # to the layers of the model.
    log.info("Annotating model {}".format(model.name))
    annotated_model = tf.keras.models.clone_model(
        model,
        clone_function=add_quantize_annotation,
    )

    with quantize_scope({
        'SLCQuantizeConfig': SLCQuantizeConfig,
        "SLCWeightGenerator": SLCWeightGenerator,
        "SLCRegularizer": SLCRegularizer
    }):
        # Use `quantize_apply` to actually make the model kernelization aware.
        quant_aware_model = quantize_apply(annotated_model)

        original_size = 0
        compressed_size = 0
        for layer in quant_aware_model.layers:
            try:
                original_size = original_size + layer.original_size
                if layer.compressed is True:
                    compressed_size = compressed_size + layer.compressed_size
                else:
                    compressed_size = compressed_size + layer.original_size

            except AttributeError:
                pass
        try:
            ratio = compressed_size * 100.0 / original_size
            log.info(
                "Model original size: {}, compressed size: {}, ratio: {:.2f}%"
                .format(original_size, compressed_size, ratio))
        except ZeroDivisionError:
            log.info(
                "Cannot compute compression ratio (original size is zero). "
                "Model original size: {}, compressed size: {}".format(
                    original_size, compressed_size))

        return quant_aware_model
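
The size-accounting loop above assumes each compressed layer exposes `original_size`, `compressed_size` and `compressed` attributes. A toy stand-in illustrating that contract (purely hypothetical, not part of the original API):

import tensorflow as tf

class FakeCompressedLayer(tf.keras.layers.Layer):
    """Minimal layer satisfying the attribute contract used by the loop."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.original_size = 1000    # e.g. number of weights before compression
        self.compressed_size = 250   # e.g. number of weights actually stored
        self.compressed = True

    def call(self, inputs):
        return inputs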
Example 5
def to_streaming_inference(model_non_stream, flags, mode):
    """Converts a non-streaming trained model to an inference mode.

    Args:
      model_non_stream: trained, non-streamable Keras model
      flags: settings with global data and model properties
      mode: one of non-streaming inference, streaming inference with internal
        states, or streaming inference with external states

    Returns:
      Keras inference model for the requested inference mode
    """
    tf.keras.backend.set_learning_phase(0)
    input_data_shape = modes.get_input_data_shape(flags, mode)

    # get input data type and use it for input streaming type
    dtype = (model_non_stream.input[0].dtype if isinstance(
        model_non_stream.input, tuple) else model_non_stream.input.dtype)
    input_tensors = [
        tf.keras.layers.Input(shape=input_data_shape,
                              batch_size=1,
                              dtype=dtype,
                              name='input_audio')
    ]
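    # Building the streaming model inside quantize_scope lets Keras re-create
    # any quantize-aware wrapper layers when the graph is cloned for inference.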
    quantize_stream_scope = quantize.quantize_scope()
    with quantize_stream_scope:
        model_inference = convert_to_inference_model(model_non_stream,
                                                     input_tensors, mode)
    return model_inference
Example 6
  def testSerialization_TF1SavedModel(self):
    if not compat.is_v1_apis():
      return

    model = test_utils.build_simple_dense_model()
    quantized_model = quantize.quantize_model(model)
    self._train_model(quantized_model)

    saved_model_dir = tempfile.mkdtemp()
    with quantize.quantize_scope():
      tf.keras.experimental.export_saved_model(quantized_model, saved_model_dir)

    with quantize.quantize_scope():
      loaded_model = tf.keras.experimental.load_from_saved_model(
          saved_model_dir)

    self._assert_outputs_equal(quantized_model, loaded_model)
Example 7
    def _test_equivalent_to_tflite(self, model, is_tflite_quantized=False):
        _, keras_file = tempfile.mkstemp('.h5')
        _, tflite_file = tempfile.mkstemp('.tflite')

        model.compile(loss='categorical_crossentropy',
                      optimizer='sgd',
                      metrics=['accuracy'])

        model.fit(np.random.uniform(0, 1, size=[self.batch_size, 10, 10, 3]),
                  np.random.uniform(0, 10, size=[self.batch_size, 8, 8, 2]),
                  epochs=1,
                  callbacks=[])

        # Prepare for inference.
        inp = np.random.uniform(0, 1, size=[self.batch_size, 10, 10, 3])
        inp = inp.astype(np.float32)

        # TensorFlow inference.
        tf_out = model.predict(inp)

        if is_tflite_quantized:
            scale, zero_point = self._compute_quantization_params(model)

            # TFLite input needs to be quantized.
            inp = inp * 255
            inp = inp.astype(np.uint8)

        # TensorFlow Lite inference.
        tf.keras.models.save_model(model, keras_file)
        with quantize.quantize_scope():
            utils.convert_keras_to_tflite(
                keras_file,
                tflite_file,
                custom_objects={'_ConvBatchNorm2D': _ConvBatchNorm2D},
                is_quantized=is_tflite_quantized)

        interpreter = tf.lite.Interpreter(model_path=tflite_file)
        interpreter.allocate_tensors()
        input_index = interpreter.get_input_details()[0]['index']
        output_index = interpreter.get_output_details()[0]['index']

        interpreter.set_tensor(input_index, inp)
        interpreter.invoke()
        tflite_out = interpreter.get_tensor(output_index)

        if is_tflite_quantized:
            # dequantize outputs
            tflite_out = [scale * (x - zero_point) for x in tflite_out]
            # Off by 1 in quantized output. Notably we cannot reduce this. There is
            # an existing mismatch between TensorFlow and TFLite (from
            # contrib.quantize days).
            self.assertAllClose(tf_out, tflite_out, atol=scale)
        else:
            # Taken from testFoldFusedBatchNorms from
            # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference_test.py#L230
            self.assertAllClose(tf_out, tflite_out, rtol=1e-04, atol=1e-06)
Example 8
  def testSerialization_KerasModel(self):
    model = test_utils.build_simple_dense_model()
    quantized_model = quantize.quantize_model(model)
    self._train_model(quantized_model)

    _, model_file = tempfile.mkstemp('.h5')
    tf.keras.models.save_model(quantized_model, model_file)
    with quantize.quantize_scope():
      loaded_model = tf.keras.models.load_model(model_file)

    self._assert_models_equal(quantized_model, loaded_model)
Example 9
    def testProductionModelConversionToTFLite(self):
        # small input shape to keep test running quickly.
        model = tf.keras.applications.mobilenet.MobileNet(weights=None,
                                                          input_shape=(32, 32,
                                                                       3))

        annotated = quantize_annotate(model)
        quantized_model = quantize_apply(annotated)

        _, tflite_file = tempfile.mkstemp('.tflite')

        with quantize.quantize_scope():
            utils.convert_keras_to_tflite(quantized_model, tflite_file)
Example 10
  def testTransformsConvBNPattern(self):
    model = Conv2DModel.get_nonfolded_batchnorm_model(
        model_type='functional')
    folded_model = Conv2DModel.get_folded_batchnorm_model(
        is_quantized=True)

    with quantize.quantize_scope():
      transformed_model, _ = ModelTransformer(
          model, [default_8bit_transforms.Conv2DBatchNormFold()]).transform()

    inputs = np.random.standard_normal(Conv2DModel.get_batched_input_shape())
    self.assertAllClose(
        transformed_model.predict(inputs), folded_model.predict(inputs))
Example 11
def apply_quantization(model):
    assert quantize_base.SET_CUSTOM_TNH_FLAG, (
        "TFMOD needs to be modified with quantizer disabled for proper "
        "running")

    # Helper function uses `quantize_annotate_layer` to annotate that only the
    # Conv2D layers should be quantized.
    def add_quantize_annotation(layer):
        # create new layer to break link with old model
        layer = layer.__class__.from_config(layer.get_config())

        quantization_map = [
            # tf.keras.layers.Dense,
            tf.keras.layers.Conv2D
            # tf.keras.layers.Input: BFPInputQuantizerConfig()
        ]

        for layer_type in quantization_map:

            if isinstance(layer, layer_type):
                quantize_config = SLGQuantizeConfig()

                log.info(
                    "**SLG annotation added to layer {} of type {} with {}".
                    format(layer.name, layer_type, quantize_config))

                quantized_layer = quantize_annotate_layer(
                    to_annotate=layer, quantize_config=quantize_config)
                return quantized_layer
        log.info("**SLG annotation not added to layer {} of type {}".format(
            layer.name, type(layer)))

        return layer

    # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
    # to the layers of the model.
    log.info("Annotating model {}".format(model.name))

    tf.keras.backend.clear_session()
    annotated_model = tf.keras.models.clone_model(
        model,
        clone_function=add_quantize_annotation,
    )

    with quantize_scope({
            "SLGWeightGenerator": SLGWeightGenerator,
            "SLGQuantizeConfig": SLGQuantizeConfig,
    }):
        # Use `quantize_apply` to actually make the model quantization aware.
        quant_aware_model = quantize_apply(annotated_model)
        return quant_aware_model
Example 12
    def testTransformsDepthwiseConvBNReLUPattern(self):
        model = DepthwiseConv2DModel.get_nonfolded_batchnorm_model(
            post_bn_activation=keras.layers.ReLU(6.0), model_type='functional')
        folded_model = DepthwiseConv2DModel.get_folded_batchnorm_model(
            post_bn_activation=keras.layers.ReLU(6.0), is_quantized=True)

        with quantize.quantize_scope():
            transformed_model = ModelTransformer(
                model, [tflite_transforms.DepthwiseConv2DBatchNormReLU6Fold()
                        ]).transform()

        inputs = np.random.standard_normal(
            DepthwiseConv2DModel.get_batched_input_shape())
        self.assertAllClose(transformed_model.predict(inputs),
                            folded_model.predict(inputs))
Example 13
  def testCustomWeightQuantizers_Run(self, quantizer_type):
    init_params = self._get_quant_params(quantizer_type)

    # Additional test that the same quantizer object can be shared
    # between QuantizeConfigs, though we don't explicitly promote this
    # anywhere in the documentation.
    quantizer = quantizer_type(**init_params)

    class DenseQuantizeConfig(QuantizeConfig):
      """Custom QuantizeConfig for Dense layer."""

      def get_weights_and_quantizers(self, layer):
        return [(layer.kernel, quantizer)]

      def get_activations_and_quantizers(self, layer):
        # Defaults.
        return [(layer.activation,
                 MovingAverageQuantizer(
                     num_bits=8,
                     per_axis=False,
                     symmetric=False,
                     narrow_range=False))]

      def set_quantize_weights(self, layer, quantize_weights):
        layer.kernel = quantize_weights[0]

      def set_quantize_activations(self, layer, quantize_activations):
        return

      def get_output_quantizers(self, layer):
        return []

      def get_config(self):
        return {}

    annotated_model = tf.keras.Sequential([
        quantize.quantize_annotate_layer(
            l.Dense(8, input_shape=(10,)), DenseQuantizeConfig()),
        quantize.quantize_annotate_layer(
            l.Dense(5), DenseQuantizeConfig())
    ])

    with quantize.quantize_scope(
        {'DenseQuantizeConfig': DenseQuantizeConfig}):
      quant_model = quantize.quantize_apply(annotated_model)

    # Check no error happens.
    self._train_model(quant_model)
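
The parameterized `quantizer_type` and `_get_quant_params` are not shown; one concrete instantiation (an assumption, not the original parameter list) would be TFMOT's built-in LastValueQuantizer:

import tensorflow_model_optimization as tfmot

quantizer = tfmot.quantization.keras.quantizers.LastValueQuantizer(
    num_bits=8, per_axis=False, symmetric=True, narrow_range=False)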
Example 14
    def testTransformsConvBNPatternPreservesWeights(self):
        # Use random_init to prevent deterministic initialization from
        # producing identical weights in the transformed and non-transformed
        # models.
        model = Conv2DModel.get_nonfolded_batchnorm_model(
            model_type='functional', random_init=True)

        with quantize.quantize_scope():
            transformed_model = ModelTransformer(
                model, [tflite_transforms.Conv2DBatchNormFold()]).transform()

        transformed_weights = transformed_model.get_weights()
        # Remove quantization related weights.
        del transformed_weights[3:8]

        self.assertEqual(len(transformed_weights), len(model.get_weights()))
        for i in range(len(transformed_weights)):
            self.assertAllEqual(transformed_weights[i], model.get_weights()[i])
Example 15
    def testQuantizesMnist(self):
        if not compat.is_v1_apis():
            return

        model = test_utils_mnist.sequential_model()
        x_train, y_train, x_test, y_test = test_utils_mnist.preprocessed_data()

        model.compile(loss='categorical_crossentropy',
                      optimizer='sgd',
                      metrics=['accuracy'])
        model.fit(x_train, y_train, batch_size=500)
        _, model_accuracy = model.evaluate(x_test, y_test, verbose=0)

        quantized_model = quantize.quantize_model(model)
        quantized_model.compile(loss='categorical_crossentropy',
                                optimizer='sgd',
                                metrics=['accuracy'])

        quantized_model.fit(x_train, y_train, batch_size=500)
        _, quantized_model_accuracy = quantized_model.evaluate(x_test,
                                                               y_test,
                                                               verbose=0)

        self.assertGreater(quantized_model_accuracy, 0.6)

        _, quantized_tflite_file = tempfile.mkstemp('.tflite')

        with quantize.quantize_scope():
            test_utils.convert_keras_to_tflite(
                model=quantized_model,
                output_path=quantized_tflite_file,
                is_quantized=True)
        quantized_model_tflite_accuracy = test_utils_mnist.eval_tflite(
            quantized_tflite_file)

        # Ensure the accuracies of the quantized TF and TFLite models are
        # similar to the original model. There is no precise bound on
        # quantization loss, but for MNIST results that differ a lot likely
        # indicate an error in quantization.
        self.assertAllClose(model_accuracy,
                            quantized_model_accuracy,
                            rtol=0.2,
                            atol=0.2)
        self.assertAllClose(quantized_model_accuracy,
                            quantized_model_tflite_accuracy,
                            rtol=0.2,
                            atol=0.2)
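
`test_utils_mnist.eval_tflite` is project-internal; a minimal evaluation loop with the public TFLite interpreter (illustrative only, assumes one-hot `y_test`) could look like:

import numpy as np
import tensorflow as tf

def eval_tflite_accuracy(tflite_file, x_test, y_test):
    interpreter = tf.lite.Interpreter(model_path=tflite_file)
    interpreter.allocate_tensors()
    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']

    correct = 0
    for i in range(len(x_test)):
        interpreter.set_tensor(input_index, x_test[i:i + 1].astype(np.float32))
        interpreter.invoke()
        prediction = interpreter.get_tensor(output_index)
        correct += int(np.argmax(prediction) == np.argmax(y_test[i]))
    return correct / len(x_test)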
Example 16
def apply_quantization(model):
    # Helper function uses `quantize_annotate_layer` to annotate that only the
    # Dense and Conv2D layers should be quantized.
    def add_quantize_annotation(layer):
        # create new layer to break link with old model
        layer = layer.__class__.from_config(layer.get_config())

        quantization_map = {
            tf.keras.layers.Dense: BFPQuantizeConfig(),
            tf.keras.layers.Conv2D: BFPQuantizeConfig()
        }

        for layer_type, quantize_config in quantization_map.items():
            if isinstance(layer, layer_type):
                print(
                    "**Quantization annotation added to layer {} of type {} with {}"
                    .format(layer.name, layer_type, quantize_config))

                quantized_layer = quantize_annotate_layer(
                    to_annotate=layer, quantize_config=quantize_config)
                return quantized_layer
        print("**Quantization annotation not added to layer {} of type {}".
              format(layer.name, type(layer)))

        return layer

    # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
    # to the layers of the model.
    print("Annotating model {}".format(model.name))

    annotated_model = tf.keras.models.clone_model(
        model,
        clone_function=add_quantize_annotation,
    )

    with quantize_scope({
            'BFPQuantizeConfig': BFPQuantizeConfig,
            "BFPActivQuantizer": BFPActivQuantizer,
            "BFPWeightQuantizer": BFPWeightQuantizer,
            "BFPBiasQuantizer": BFPBiasQuantizer,
            "PolynomialDecay": PolynomialDecay
    }):
        # Use `quantize_apply` to actually make the model quantization aware.
        quant_aware_model = quantize_apply(annotated_model)
        return quant_aware_model
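
A hypothetical call site for this variant, assuming `BFPQuantizeConfig` and the BFP quantizer classes above are importable:

import tensorflow as tf

base_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(8, 3, activation='relu', input_shape=(16, 16, 3)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10),
])
qat_model = apply_quantization(base_model)
qat_model.summary()  # Conv2D and Dense layers now appear as quant_* wrapper layers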
Example 17
    def testSerialization(self):
        model = test_utils.build_simple_dense_model()

        quantized_model = quantize_apply(quantize_annotate(model))
        quantized_model.compile(loss='categorical_crossentropy',
                                optimizer='sgd',
                                metrics=['accuracy'])
        quantized_model.fit(np.random.rand(20, 10),
                            tf.keras.utils.to_categorical(
                                np.random.randint(5, size=(20, 1)), 5),
                            batch_size=20)

        _, model_file = tempfile.mkstemp('.h5')
        keras.models.save_model(quantized_model, model_file)
        with quantize.quantize_scope():
            loaded_model = keras.models.load_model(model_file)

        self._assert_models_equal(quantized_model, loaded_model)
Example 18
    def testQuantizeSingleLayer_ProducesFullIntegerModel_TF1(
            self, layer_type, kwargs):
        if not compat.is_v1_apis():
            return

        if 'input_shape' not in kwargs:
            kwargs['input_shape'] = (5, )

        layer = layer_type(**kwargs)
        model = tf.keras.Sequential([layer])
        quantized_model = quantize.quantize_model(model)

        with quantize.quantize_scope():
            test_utils.convert_keras_to_tflite(model=quantized_model,
                                               output_path=None,
                                               is_quantized=True,
                                               inference_type=tf.uint8,
                                               inference_input_type=tf.uint8,
                                               input_quant_params=(0., 1.))
Example 19
  def testModelEndToEnd(self, model_type):
    # 1. Check whether quantized model graph can be constructed.
    model = self._get_model(model_type)
    model = quantize.quantize_model(model)

    # 2. Sanity check to ensure basic training on random data works.
    x_train, y_train = self._create_test_data(model)
    model.compile(
        loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    model.fit(x_train, y_train)

    # 3. Ensure conversion to TFLite works.
    _, tflite_file = tempfile.mkstemp('.tflite')
    print('TFLite File: ', tflite_file)
    with quantize.quantize_scope():
      utils.convert_keras_to_tflite(model, tflite_file)

    # 4. Verify input runs on converted model.
    self._verify_tflite(tflite_file, x_train, y_train)
Example 20
    def testQuantizeSingleLayer_ProducesFullIntegerModel_TF2(
            self, layer_type, kwargs):
        # "FullInteger" in the sense that it ignores inputs and outputs.
        if compat.is_v1_apis():
            return

        if 'input_shape' not in kwargs:
            kwargs['input_shape'] = (5, )

        layer = layer_type(**kwargs)
        model = tf.keras.Sequential([layer])
        quantized_model = quantize.quantize_model(model)

        _, quantized_tflite_file = tempfile.mkstemp('.tflite')

        with quantize.quantize_scope():
            test_utils.convert_keras_to_tflite(
                model=quantized_model,
                output_path=quantized_tflite_file,
                is_quantized=True,
                input_quant_params=(0., 1.),
                experimental_new_converter=True)

        interpreter = tf.lite.Interpreter(model_path=quantized_tflite_file)
        interpreter.allocate_tensors()

        input_tensor_details = interpreter.get_input_details()
        self.assertEqual(input_tensor_details[0]['dtype'], np.float32)

        output_tensor_details = interpreter.get_output_details()
        self.assertEqual(output_tensor_details[0]['dtype'], np.float32)

        tensor_details = interpreter.get_tensor_details()
        float_tensor_details = [
            t for t in tensor_details if t['dtype'] == np.float32
        ]
        # Only the input and outputs are float. The rest are integer.
        #
        # TODO(tfmot): update this test to use the full-integer path when available,
        # so that float_tensor_details should be length 0.
        self.assertLen(float_tensor_details, 2)
Example 21
    def testConv2DBatchNormReLUQuantize(self, layer_type):
        model = self._get_model(layer_type, True)
        input_shape = self._get_input_shape(layer_type)

        with quantize.quantize_scope():
            transformed_model, updated_metadata = ModelTransformer(
                model,
                [tflite_transforms.Conv2DBatchNormReLUQuantize()],
            ).transform()

        conv_layer = transformed_model.layers[1]
        bn_layer = transformed_model.layers[2]

        self.assertIsInstance(conv_layer.activation,
                              quantize_aware_activation.NoOpActivation)
        self.assertIsInstance(
            updated_metadata.get(bn_layer.name).get('quantize_provider'),
            tflite_quantize_providers.NoOpQuantizeProvider)

        inputs = np.random.standard_normal(input_shape)
        self.assertAllClose(transformed_model.predict(inputs),
                            model.predict(inputs))
Example 22
    def testQuantizeSingleLayer_ProducesFullIntegerModel_TF1(
            self, layer_type, kwargs):
        if not compat.is_v1_apis():
            return

        if 'input_shape' not in kwargs:
            kwargs['input_shape'] = (5, )

        layer = layer_type(**kwargs)
        model = tf.keras.Sequential([layer])
        quantized_model = quantize.quantize_model(model)

        with quantize.quantize_scope():
            test_utils.convert_keras_to_tflite(
                model=quantized_model,
                output_path=None,
                is_quantized=True,
                inference_type=tf.uint8,
                inference_input_type=tf.uint8,
                input_quant_params=(0., 1.),
                # Set to False so that errors are thrown when FakeQuants are
                # not placed everywhere needed to create a full-integer model.
                # Errors are not thrown when set to True.
                experimental_new_converter=False)
Example 23
  def test_cnn_model_end_to_end(self):

    config = tf1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf1.Session(config=config)
    tf1.keras.backend.set_session(sess)
    test_utils.set_seed(123)

    # data parameters
    num_time_bins = 12
    feature_size = 12

    # model params.
    total_stride = 2
    params = test_utils.Params([total_stride], 0)
    params.model_name = 'cnn'
    params.cnn_filters = '2'
    params.cnn_kernel_size = '(3,3)'
    params.cnn_act = "'relu'"
    params.cnn_dilation_rate = '(1,1)'
    params.cnn_strides = '(2,2)'
    params.dropout1 = 0.5
    params.units2 = ''
    params.act2 = ''

    params.label_count = 2
    params.return_softmax = True
    params.quantize = 1  # apply quantization aware training

    params.data_shape = (num_time_bins, feature_size)
    params.preprocess = 'custom'

    model = cnn.model(params)
    model.summary()

    # prepare training and testing data
    train_images, train_labels = test_utils.generate_data(
        img_size_y=num_time_bins, img_size_x=feature_size, n_samples=32)
    test_images = train_images
    test_labels = train_labels

    # create and train quantization aware model in non streaming mode
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'])
    model.fit(
        train_images,
        train_labels,
        epochs=1,
        validation_data=(test_images, test_labels))
    model.summary()

    # one test image
    train_image = train_images[:1,]

    # run tf non streaming inference
    non_stream_output_tf = model.predict(train_image)

    # specify input data shape for streaming mode
    params.data_shape = (total_stride, feature_size)
    # TODO(rybakov) add params structure for model with no feature extractor

    # prepare tf streaming model and use it to generate representative_dataset
    with quantize.quantize_scope():
      stream_quantized_model = utils.to_streaming_inference(
          model, params, Modes.STREAM_EXTERNAL_STATE_INFERENCE)

    calibration_data = prepare_calibration_data(stream_quantized_model,
                                                total_stride, train_image)

    def representative_dataset(dtype):
      def _representative_dataset_gen():
        for i in range(len(calibration_data)):
          yield [
              calibration_data[i][0].astype(dtype),  # input audio packet
              calibration_data[i][1].astype(dtype),  # conv state
              calibration_data[i][2].astype(dtype)  # flatten state
          ]

      return _representative_dataset_gen

    # convert streaming quantization aware model to tflite
    # and apply post training quantization
    with quantize.quantize_scope():
      tflite_streaming_model = utils.model_to_tflite(
          sess, model, params,
          Modes.STREAM_EXTERNAL_STATE_INFERENCE,
          optimizations=[tf.lite.Optimize.DEFAULT],
          inference_type=tf.int8,
          experimental_new_quantizer=True,
          representative_dataset=representative_dataset(np.float32))

    # run tflite in streaming mode and compare output logits with tf
    interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
    interpreter.allocate_tensors()
    input_states = []
    for detail in interpreter.get_input_details():
      input_states.append(np.zeros(detail['shape'], dtype=np.float32))
    stream_out_tflite = inference.run_stream_inference_classification_tflite(
        params, interpreter, train_image, input_states)
    self.assertAllClose(stream_out_tflite, non_stream_output_tf, atol=0.001)
Example 24
    def _test_equal_tf_and_tflite_outputs(self,
                                          tf_model,
                                          is_tflite_quantized=False):
        _, tflite_file = tempfile.mkstemp('.tflite')

        batched_input_shape = self._get_batched_input_shape()
        output_shape = self._get_output_shape()

        tf_model.compile(loss='categorical_crossentropy',
                         optimizer='sgd',
                         metrics=['accuracy'])

        tf_model.fit(np.random.uniform(0, 1, size=batched_input_shape),
                     np.random.uniform(0, 10, size=output_shape),
                     epochs=1,
                     callbacks=[])
        # Prepare for inference.
        inp = np.random.uniform(0, 1, size=batched_input_shape)
        inp = inp.astype(np.float32)

        if is_tflite_quantized:
            real_min = keras.backend.eval(
                tf_model.layers[-1]._activation_min_var)
            real_max = keras.backend.eval(
                tf_model.layers[-1]._activation_max_var)
            scale, zero_point = self._get_asymmetric_quant_params(
                real_min, real_max, -128.0, 127.0)

            # TFLite input needs to be quantized.
            real_input_min = 0.0
            real_input_max = 1.0
            inp_scale, inp_zp = self._get_asymmetric_quant_params(
                real_input_min, real_input_max, -128.0, 127.0)

            inp8 = np.round(inp / inp_scale + inp_zp)
            inp8 = inp8.astype(np.int8)

            # Dequant
            inp = (inp8.astype(np.float32) - inp_zp) * inp_scale

        # TensorFlow inference.
        tf_out = tf_model.predict(inp)

        # TensorFlow Lite inference.
        with quantize.quantize_scope():
            utils.convert_keras_to_tflite(
                tf_model,
                tflite_file,
                custom_objects={
                    '_ConvBatchNorm2D': _ConvBatchNorm2D,
                    '_DepthwiseConvBatchNorm2D': _DepthwiseConvBatchNorm2D,
                },
                is_quantized=is_tflite_quantized,
                inference_input_type=tf.lite.constants.INT8)

        interpreter = tf.lite.Interpreter(model_path=tflite_file)
        interpreter.allocate_tensors()
        input_index = interpreter.get_input_details()[0]['index']
        output_index = interpreter.get_output_details()[0]['index']

        if is_tflite_quantized:
            interpreter.set_tensor(input_index, inp8)
        else:
            interpreter.set_tensor(input_index, inp)

        interpreter.invoke()
        tflite_out = interpreter.get_tensor(output_index)

        if is_tflite_quantized:
            # dequantize outputs
            tflite_out = [scale * (x - zero_point) for x in tflite_out]

            # TODO(pulkitb): DConv quantized test somehow has a single value (0.065%)
            # of total values, which falls off by 1 scale. Investigate further and
            # introduce stricter testing by removing atol=scale.
            self.assertAllClose(tf_out, tflite_out, atol=scale)
        else:
            # Taken from testFoldFusedBatchNorms from
            # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference_test.py#L230
            self.assertAllClose(tf_out, tflite_out, rtol=1e-04, atol=1e-06)
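
`_get_asymmetric_quant_params` is not shown above; the standard asymmetric quantization formula it presumably implements is sketched below (an assumption, not the original helper):

def _get_asymmetric_quant_params(real_min, real_max, quant_min, quant_max):
    # Ensure zero is exactly representable, as TFLite requires.
    real_min = min(real_min, 0.0)
    real_max = max(real_max, 0.0)
    scale = (real_max - real_min) / (quant_max - quant_min)
    zero_point = int(round(quant_min - real_min / scale))
    zero_point = max(int(quant_min), min(int(quant_max), zero_point))
    return scale, zero_point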
Example 25
def apply_quantization(model,
                       pruning_policy=None,
                       weight_precision=None,
                       activation_precision=None,
                       activation_margin=None):
    # assert quantize_base.SET_CUSTOM_TNH_FLAG, log.info("TFMOD needs to be modified with quantizer disabled for proper "
    #                                                    "running")

    if weight_precision is not None:
        global _WEIGHTS_NUM_BITS  # global declaration required to rebind the module-level value
        _WEIGHTS_NUM_BITS = weight_precision

    if activation_precision is not None:
        global _ACTIV_NUM_BITS
        _ACTIV_NUM_BITS = activation_precision

    if activation_margin is not None:
        global _ACTIV_MARGIN
        _ACTIV_MARGIN = activation_margin

    log.info(
        "Weights num bits: {} - Activ num bits: {} - Activ margin: {}".format(
            _WEIGHTS_NUM_BITS, _ACTIV_NUM_BITS, _ACTIV_MARGIN))

    # Helper function uses `quantize_annotate_layer` to annotate the layer
    # types listed in `quantization_map` for quantization.
    def add_quantize_annotation(layer):
        # create new layer to break link with old model
        try:
            layer = layer.__class__.from_config(layer.get_config())
        except Exception:
            # Some layers cannot be re-created from their config; keep the
            # original layer object in that case.
            pass

        for layer_type in quantization_map:

            if isinstance(layer, layer_type):

                if isinstance(pruning_policy, float) or pruning_policy is None:
                    layer_pruning = pruning_policy
                elif isinstance(pruning_policy, dict):
                    layer_pruning = pruning_policy[layer.name]
                else:
                    raise ValueError("Illegal layer pruning policy {}".format(
                        pruning_policy))

                quantize_config = BFPQuantizeConfig(
                    pruning_policy=layer_pruning)

                log.info(
                    "**Quantization annotation added to layer {} of type {} with {}"
                    .format(layer.name, layer_type, quantize_config))

                quantized_layer = quantize_annotate_layer(
                    to_annotate=layer, quantize_config=quantize_config)
                return quantized_layer
        log.info("**Quantization annotation not added to layer {} of type {}".
                 format(layer.name, type(layer)))

        return layer

    # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
    # to the layers of the model.
    log.info("Annotating model {}".format(model.name))

    tf.keras.backend.clear_session()
    annotated_model = tf.keras.models.clone_model(
        model,
        clone_function=add_quantize_annotation,
    )

    with quantize_scope({
            'BFPQuantizeConfig': BFPQuantizeConfig,
            "BFPActivQuantizer": BFPActivQuantizer,
            "BFPWeightQuantizer": BFPWeightQuantizer,
            "BFPBiasQuantizer": BFPBiasQuantizer,
            "PolynomialDecay": PolynomialDecay
    }):
        # Use `quantize_apply` to actually make the model quantization aware.
        quant_aware_model = quantize_apply(annotated_model)

    for q_layer in quant_aware_model.layers:
        if isinstance(q_layer, QuantizeWrapper):
            for quant_type in quantization_map:
                if isinstance(q_layer.layer, quant_type):
                    original_name = q_layer.name.replace("quant_", "")
                    old_layer = model.get_layer(original_name)

                    q_weights = q_layer.get_weights()
                    orig_weights = old_layer.get_weights()

                    q_weights[0] = orig_weights[0]
                    try:
                        q_weights[1] = orig_weights[1]
                    except IndexError:
                        pass
                    q_layer.set_weights(q_weights)

    return quant_aware_model
Example 26
    f.write(str(graph_def))

model.compile(loss=tf.keras.losses.categorical_crossentropy,
              optimizer=tf.keras.optimizers.Adadelta(),
              metrics=['accuracy'])

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Export to Keras.
keras_file = '/tmp/quantized_mnist.h5'
tf.keras.models.save_model(model, keras_file)

# Convert to TFLite model.
with quantize.quantize_scope():
    converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_file)
converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
input_arrays = converter.get_input_arrays()
converter.quantized_input_stats = {
    input_arrays[0]: (0., 255.)
}  # mean, std_dev
tflite_model = converter.convert()
with open('/tmp/quantized_mnist.tflite', 'wb') as f:
    f.write(tflite_model)
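
An illustrative follow-up (not part of the original example): because `inference_type` is `QUANTIZED_UINT8`, the converted model expects uint8 inputs in [0, 255].

interpreter = tf.lite.Interpreter(model_path='/tmp/quantized_mnist.tflite')
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()[0]
sample = (x_test[:1] * 255).astype(np.uint8)  # assumes x_test is scaled to [0, 1]
interpreter.set_tensor(input_details['index'], sample)
interpreter.invoke()
print(interpreter.get_tensor(interpreter.get_output_details()[0]['index']))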