    def test_dequantize_op_weights(self):
        program = fluid.Program()
        with fluid.program_guard(program):
            self.prepare_program_mul(program)
            graph = IrGraph(core.Graph(program.desc), for_test=True)

            op_node = None
            for op in graph.all_op_nodes():
                if op.op().type() == "mul":
                    op_node = op
                    break
            assert op_node is not None, "mul op not found in the graph"

            qpass = Quant2Int8MkldnnPass(self.quantized_ops,
                                         _scope=self.scope,
                                         _place=self.place,
                                         _core=core,
                                         _debug=False)
            qpass._weight_scales["mul_output"] = self.mul_output_scale
            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights"], self.place)
            qpass._dequantize_op_weights(graph, op_node, "Y", "Out")

            assert np.allclose(
                self.scope.find_var("mul_weights").get_tensor(),
                [[127, 63.5, 42.3333, 31.75, 25.4],
                 [127, 63.5, 42.3333, 31.75, 25.4],
                 [127, 63.5, 42.3333, 31.75, 25.4]])

            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights_bad"], self.place)
            with self.assertRaises(ValueError):
                qpass._dequantize_op_weights(graph, op_node, "Y", "Out")
# Imports this standalone example relies on (taken from the surrounding module):
import os

import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
def transform_and_save_int8_model(original_path,
                                  save_path,
                                  ops_to_quantize='',
                                  op_ids_to_skip='',
                                  debug=False,
                                  quant_model_filename='',
                                  quant_params_filename='',
                                  save_model_filename="__model__",
                                  save_params_filename=None):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
    with fluid.scope_guard(inference_scope):
        if not quant_model_filename:
            if os.path.exists(os.path.join(original_path, '__model__')):
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(original_path, exe)
            else:
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(original_path, exe, 'model',
                                                   'params')
        else:
            [inference_program, feed_target_names, fetch_targets
             ] = fluid.io.load_inference_model(original_path, exe,
                                               quant_model_filename,
                                               quant_params_filename)

        ops_to_quantize_set = set()
        print(ops_to_quantize)
        if len(ops_to_quantize) > 0:
            ops_to_quantize_set = set(ops_to_quantize.split(','))

        op_ids_to_skip_set = set([-1])
        print(op_ids_to_skip)
        if len(op_ids_to_skip) > 0:
            op_ids_to_skip_set = set(map(int, op_ids_to_skip.split(',')))

        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
        if (debug):
            graph.draw('.', 'quant_orig', graph.all_op_nodes())
        transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
            ops_to_quantize_set,
            _op_ids_to_skip=op_ids_to_skip_set,
            _scope=inference_scope,
            _place=place,
            _core=core,
            _debug=debug)
        graph = transform_to_mkldnn_int8_pass.apply(graph)
        inference_program = graph.to_program()
        with fluid.scope_guard(inference_scope):
            fluid.io.save_inference_model(save_path,
                                          feed_target_names,
                                          fetch_targets,
                                          exe,
                                          inference_program,
                                          model_filename=save_model_filename,
                                          params_filename=save_params_filename)
        print(
            "Success! INT8 model obtained from the Quant model can be found at {}\n"
            .format(save_path))
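A minimal usage sketch of the function above. The directory names and the op list are hypothetical placeholders, not part of the original script; empty `ops_to_quantize` would mean "quantize all supported ops".

if __name__ == '__main__':
    # Convert a Quant model into an INT8 (oneDNN) model; paths are illustrative only.
    transform_and_save_int8_model(
        original_path='quant_model_dir',    # directory holding the Quant model
        save_path='int8_model_dir',         # where the INT8 model will be written
        ops_to_quantize='conv2d,pool2d',    # comma-separated op types to quantize
        op_ids_to_skip='',                  # comma-separated op ids to leave untouched
        debug=False)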
    def test_dequantize_op_weights(self):
        program = fluid.Program()
        with fluid.program_guard(program):
            self.prepare_program_mul(program)
            graph = IrGraph(core.Graph(program.desc), for_test=True)

            op_node = ""
            for op in graph.all_op_nodes():
                if op.op().type() == self.op_name():
                    op_node = op
                    break
            assert op_node != "", "op of type %s not found" % self.op_name()

            qpass = Quant2Int8MkldnnPass(self.quantized_ops,
                                         _scope=self.scope,
                                         _place=self.place,
                                         _core=core,
                                         _debug=False)
            qpass._weight_thresholds["mul_output"] = self.mul_output_scale
            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights"], self.place)
            qpass._dequantize_op_weights(graph, op_node, "Y", "Out")

            assert np.allclose(
                self.scope.find_var("mul_weights").get_tensor(),
                [[1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.],
                 [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.],
                 [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.]])

            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights_bad"], self.place)
            with self.assertRaises(ValueError):
                qpass._dequantize_op_weights(graph, op_node, "Y", "Out")
    def test_quant_update_activation(self):
        program = fluid.Program()
        with fluid.program_guard(program):
            self.prepare_program(program)
            graph = IrGraph(core.Graph(program.desc), for_test=True)
            quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass(
                self.quantized_ops,
                _scope=self.scope,
                _place=self.place,
                _core=core,
                _debug=False)

            input_scale_tensor = quant2_int8_mkldnn_pass._convert_scale2tensor(
                np.array(self.scale).astype(np.float64))
            output_scale_tensor = quant2_int8_mkldnn_pass._convert_scale2tensor(
                np.array(1. / self.scale * self.scale).astype(np.float64))
            var_scale = {
                "input": (False, input_scale_tensor),
                "filter": (False, input_scale_tensor),
                "conv_output": (False, output_scale_tensor),
            }
            if core.avx_supported():
                quant2_int8_mkldnn_pass._var_quant_scales = var_scale
                graph = quant2_int8_mkldnn_pass._propagate_scales(graph)
                graph = quant2_int8_mkldnn_pass._quantize_fp32_graph(graph)
                self.check_graph_after_pass(graph)
    def test_quant_update_activation(self):
        program = fluid.Program()
        with fluid.program_guard(program):
            self.prepare_program(program)
            graph = IrGraph(core.Graph(program.desc), for_test=True)
            graph = self.remove_fuse_activation_attribute(graph)
            self.check_graph_before_pass(graph)
            quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass(self.quantized_ops,
                                                           _scope=self.scope,
                                                           _place=self.place,
                                                           _core=core,
                                                           _debug=False)
            graph = quant2_int8_mkldnn_pass._update_activations(graph)
            self.check_graph_after_pass(graph)
    def _predict(self,
                 test_reader=None,
                 model_path=None,
                 batch_size=1,
                 batch_num=1,
                 skip_batch_num=0,
                 target='quant'):
        assert target in ['quant', 'int8', 'fp32']
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        inference_scope = fluid.executor.global_scope()
        with fluid.scope_guard(inference_scope):
            if os.path.exists(os.path.join(model_path, '__model__')):
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe)
            else:
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe, 'model',
                                                   'params')

            graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
            if (self._debug):
                graph.draw('.', 'quant_orig', graph.all_op_nodes())
            quant_transform_pass = Quant2Int8MkldnnPass(
                self._quantized_ops,
                _op_ids_to_skip=self._op_ids_to_skip,
                _scope=inference_scope,
                _place=place,
                _core=core,
                _debug=self._debug)
            if (target == 'quant'):
                graph = self._prepare_for_fp32_mkldnn(graph)
            elif (target == 'int8'):
                graph = quant_transform_pass.apply(graph)
            else:  # target == fp32
                graph = quant_transform_pass.prepare_and_optimize_fp32(graph)

            inference_program = graph.to_program()

            dshape = [3, 224, 224]
            outputs = []
            infer_accs1 = []
            infer_accs5 = []
            batch_acc1 = 0.0
            batch_acc5 = 0.0
            fpses = []
            batch_times = []
            batch_time = 0.0
            total_samples = 0
            iters = 0
            infer_start_time = time.time()
            for data in test_reader():
                if batch_num > 0 and iters >= batch_num:
                    break
                if iters == skip_batch_num:
                    total_samples = 0
                    infer_start_time = time.time()
                if six.PY2:
                    images = map(lambda x: x[0].reshape(dshape), data)
                if six.PY3:
                    images = list(map(lambda x: x[0].reshape(dshape), data))
                images = np.array(images).astype('float32')
                labels = np.array([x[1] for x in data]).astype('int64')

                if (target == 'fp32'):
                    # FP32 models have accuracy measuring layers
                    labels = labels.reshape([-1, 1])
                    start = time.time()
                    out = exe.run(inference_program,
                                  feed={
                                      feed_target_names[0]: images,
                                      feed_target_names[1]: labels
                                  },
                                  fetch_list=fetch_targets)
                    batch_time = (time.time() - start) * 1000  # in milliseconds
                    batch_acc1, batch_acc5 = out[1][0], out[2][0]
                    outputs.append(batch_acc1)
                else:
                    # Quant INT8 models do not have accuracy measuring layers
                    start = time.time()
                    out = exe.run(inference_program,
                                  feed={feed_target_names[0]: images},
                                  fetch_list=fetch_targets)
                    batch_time = (time.time() - start) * 1000  # in milliseconds
                    outputs.append(out[0])
                    # Calculate accuracy result
                    batch_acc1, batch_acc5 = self._get_batch_accuracy(
                        out[0], labels)

                infer_accs1.append(batch_acc1)
                infer_accs5.append(batch_acc5)
                samples = len(data)
                total_samples += samples
                batch_times.append(batch_time)
                fps = samples / batch_time * 1000
                fpses.append(fps)
                iters += 1
                appx = ' (warm-up)' if iters <= skip_batch_num else ''
                _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
                             'latency: {3:.4f} ms, fps: {4:.2f}'.format(
                                 iters, batch_acc1, batch_acc5,
                                 batch_time / batch_size, fps, appx))

            # Postprocess benchmark data
            batch_latencies = batch_times[skip_batch_num:]
            batch_latency_avg = np.average(batch_latencies)
            latency_avg = batch_latency_avg / batch_size
            fpses = fpses[skip_batch_num:]
            fps_avg = np.average(fpses)
            infer_total_time = time.time() - infer_start_time
            acc1_avg = np.mean(infer_accs1)
            acc5_avg = np.mean(infer_accs5)
            _logger.info(
                'Total inference run time: {:.2f} s'.format(infer_total_time))

            return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
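
A hedged sketch of how this _predict helper might be driven to compare the three targets; the method name _compare_targets, the reader and the model paths are placeholders and not part of the original test.

    def _compare_targets(self, val_reader, quant_model_dir, fp32_model_dir):
        # Run the same validation data through the Quant, INT8 and FP32 variants.
        quant_out = self._predict(val_reader, quant_model_dir, batch_size=50, target='quant')
        int8_out = self._predict(val_reader, quant_model_dir, batch_size=50, target='int8')
        fp32_out = self._predict(val_reader, fp32_model_dir, batch_size=50, target='fp32')
        # _predict returns (outputs, acc1_avg, acc5_avg, fps_avg, latency_avg).
        _logger.info('acc1 quant/int8/fp32: {:.4f}/{:.4f}/{:.4f}'.format(
            quant_out[1], int8_out[1], fp32_out[1]))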
    def _predict(self,
                 test_reader=None,
                 model_path=None,
                 batch_size=1,
                 batch_num=1,
                 skip_batch_num=0,
                 target='quant'):
        assert target in ['quant', 'int8', 'fp32']
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        inference_scope = fluid.executor.global_scope()
        with fluid.scope_guard(inference_scope):
            if os.path.exists(os.path.join(model_path, '__model__')):
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe)
            else:
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe, 'model',
                                                   'params')

            graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
            if (self._debug):
                graph.draw('.', 'quant_orig', graph.all_op_nodes())
            if (target != 'quant'):
                quant_transform_pass = Quant2Int8MkldnnPass(
                    self._quantized_ops,
                    _op_ids_to_skip=self._op_ids_to_skip,
                    _scope=inference_scope,
                    _place=place,
                    _core=core,
                    _debug=self._debug)
                if (target == 'int8'):
                    graph = quant_transform_pass.apply(graph)
                else:  # target == fp32
                    graph = quant_transform_pass.prepare_and_optimize_fp32(
                        graph)

            inference_program = graph.to_program()

            total_correct = 0
            total_samples = 0
            batch_times = []
            ppses = []  # predictions per second
            iters = 0
            infer_start_time = time.time()
            for data in test_reader():
                if batch_num > 0 and iters >= batch_num:
                    break
                if iters == skip_batch_num:
                    total_samples = 0
                    infer_start_time = time.time()
                input0 = np.array([x[0] for x in data]).astype('int64')
                input1 = np.array([x[1] for x in data]).astype('int64')
                labels = np.array([x[2] for x in data]).astype('int64')

                start = time.time()
                out = exe.run(inference_program,
                              feed={
                                  feed_target_names[0]: input0,
                                  feed_target_names[1]: input1
                              },
                              fetch_list=fetch_targets)
                batch_time = (time.time() - start) * 1000  # in milliseconds
                batch_times.append(batch_time)
                batch_correct = self._get_batch_correct(out, labels)
                batch_len = len(data)
                total_samples += batch_len
                total_correct += batch_correct
                batch_acc = float(batch_correct) / float(batch_len)
                pps = batch_len / batch_time * 1000
                ppses.append(pps)
                latency = batch_time / batch_len
                iters += 1
                appx = ' (warm-up)' if iters <= skip_batch_num else ''
                _logger.info(
                    'batch {0}{4}, acc: {1:.4f}, latency: {2:.4f} ms, predictions per sec: {3:.2f}'
                    .format(iters, batch_acc, latency, pps, appx))

            # Postprocess benchmark data
            infer_total_time = time.time() - infer_start_time
            batch_latencies = batch_times[skip_batch_num:]
            batch_latency_avg = np.average(batch_latencies)
            latency_avg = batch_latency_avg / batch_size
            ppses = ppses[skip_batch_num:]
            pps_avg = np.average(ppses)
            acc_avg = float(np.sum(total_correct)) / float(total_samples)
            _logger.info(
                'Total inference run time: {:.2f} s'.format(infer_total_time))

            return acc_avg, pps_avg, latency_avg