def linear_fc_quant(self,
                     activation_quant_type,
                     weight_quantize_type,
                     for_ci=True):
     main = fluid.Program()
     startup = fluid.Program()
     with fluid.program_guard(main, startup):
         loss = linear_fc(3)
         opt = fluid.optimizer.Adam(learning_rate=0.001)
         opt.minimize(loss)
     place = fluid.CPUPlace()
     graph = IrGraph(core.Graph(main.desc), for_test=False)
     transform_pass = QuantizationTransformPass(
         scope=fluid.global_scope(),
         place=place,
         activation_quantize_type=activation_quant_type,
         weight_quantize_type=weight_quantize_type)
     transform_pass.apply(graph)
     if not for_ci:
         marked_nodes = set()
         for op in graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         graph.draw('.', 'quantize_fc_' + activation_quant_type,
                    marked_nodes)
     program = graph.to_program()
     self.check_program(program)
     val_graph = IrGraph(core.Graph(program.desc), for_test=False)
     if not for_ci:
         val_marked_nodes = set()
         for op in val_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_fc_' + activation_quant_type,
                        val_marked_nodes)
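For context, helpers like linear_fc_quant are usually driven by tiny test methods; a hedged sketch of one such caller (the method name and argument combination are assumptions, not taken from this listing):

 def test_linear_fc_quant_abs_max(self):
     # Hypothetical caller: quantize activations and weights with the
     # 'abs_max' strategy; for_ci=True suppresses the .dot graph dumps.
     self.linear_fc_quant('abs_max', 'abs_max', for_ci=True)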
Example #2
 def residual_block_quant(self,
                          quantizable_op_type,
                          skip_pattern=None,
                          for_ci=True):
     main = fluid.Program()
     startup = fluid.Program()
     with fluid.program_guard(main, startup):
         loss = quant_dequant_residual_block(2, skip_pattern)
         opt = fluid.optimizer.Adam(learning_rate=0.001)
         opt.minimize(loss)
     place = fluid.CPUPlace()
     graph = IrGraph(core.Graph(main.desc), for_test=False)
     add_quant_dequant_pass = AddQuantDequantPass(
         scope=fluid.global_scope(),
         place=place,
         skip_pattern=skip_pattern,
         quantizable_op_type=quantizable_op_type)
     add_quant_dequant_pass.apply(graph)
     if not for_ci:
         marked_nodes = set()
         for op in graph.all_op_nodes():
             if op.name().find('quant') > -1:
                 marked_nodes.add(op)
         graph.draw('.', 'add_quant_dequant_graph', marked_nodes)
     self.check_graph(graph, skip_pattern)
     program = graph.to_program()
     val_graph = IrGraph(core.Graph(program.desc), for_test=False)
     if not for_ci:
         val_marked_nodes = set()
         for op in val_graph.all_op_nodes():
             if op.name().find('quant') > -1:
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_add_quant_dequant_graph', val_marked_nodes)
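A hedged sketch of how residual_block_quant might be invoked, assuming 'elementwise_add' and 'pool2d' are among the op types AddQuantDequantPass supports (the op list and skip pattern are assumptions):

 def test_residual_block_skip_pattern(self):
     # Hypothetical caller: add quant/dequant only around elementwise_add
     # and pool2d, leaving ops matching the skip pattern untouched.
     self.residual_block_quant(['elementwise_add', 'pool2d'],
                               skip_pattern='skip_quant',
                               for_ci=True)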
Example #3
def residual_block_quant(self, quant_type):
     main = fluid.Program()
     startup = fluid.Program()
     with fluid.program_guard(main, startup):
         loss = residual_block(2)
         opt = fluid.optimizer.Adam(learning_rate=0.001)
         opt.minimize(loss)
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     graph = IrGraph(core.Graph(main.desc), for_test=False)
     transform_pass = QuantizationTransformPass(
         scope=fluid.global_scope(),
         place=place,
         activation_quantize_type=quant_type)
     transform_pass.apply(graph)
     marked_nodes = set()
     for op in graph.all_op_nodes():
         if op.name().find('quantize') > -1:
             marked_nodes.add(op)
     graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
     program = graph.to_program()
     self.check_program(transform_pass, program)
     val_graph = IrGraph(core.Graph(program.desc), for_test=False)
     val_marked_nodes = set()
     for op in val_graph.all_op_nodes():
         if op.name().find('quantize') > -1:
             val_marked_nodes.add(op)
     val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
Example #4
    def test_dequantize_op_weights(self):
        program = fluid.Program()
        with fluid.program_guard(program):
            self.prepare_program_mul(program)
            graph = IrGraph(core.Graph(program.desc), for_test=True)

            for op in graph.all_op_nodes():
                if op.op().type() == "mul":
                    op_node = op
                    break

            qpass = Quant2Int8MkldnnPass(self.quantized_ops,
                                         _scope=self.scope,
                                         _place=self.place,
                                         _core=core,
                                         _debug=False)
            qpass._weight_scales["mul_output"] = self.mul_output_scale
            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights"], self.place)
            qpass._dequantize_op_weights(graph, op_node, "Y", "Out")

            assert np.allclose(
                self.scope.find_var("mul_weights").get_tensor(),
                [[127, 63.5, 42.3333, 31.75, 25.4],
                 [127, 63.5, 42.3333, 31.75, 25.4],
                 [127, 63.5, 42.3333, 31.75, 25.4]])

            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights_bad"], self.place)
            with self.assertRaises(ValueError):
                qpass._dequantize_op_weights(graph, op_node, "Y", "Out")
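The expected matrix in the assertion above is consistent with a 3x5 fixture whose rows are [1, 2, 3, 4, 5] being mapped to 127 / w elementwise; a minimal numpy sketch of that arithmetic (the fixture values are assumptions, not taken from this listing):

import numpy as np

# Assumed fixture: three identical rows [1, 2, 3, 4, 5].
mul_weights = np.tile(np.arange(1, 6, dtype=np.float32), (3, 1))

# 127 / w reproduces the expected rows [127, 63.5, 42.3333, 31.75, 25.4].
expected = 127.0 / mul_weights
assert np.allclose(expected[0], [127, 63.5, 42.3333, 31.75, 25.4], atol=1e-4)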
Example #5
def transform_and_save_int8_model(original_path,
                                  save_path,
                                  ops_to_quantize='',
                                  op_ids_to_skip='',
                                  debug=False,
                                  quant_model_filename='',
                                  quant_params_filename='',
                                  save_model_filename="__model__",
                                  save_params_filename=None):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
    with fluid.scope_guard(inference_scope):
        if not quant_model_filename:
            if os.path.exists(os.path.join(original_path, '__model__')):
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(original_path, exe)
            else:
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(original_path, exe, 'model',
                                                   'params')
        else:
            [inference_program, feed_target_names, fetch_targets
             ] = fluid.io.load_inference_model(original_path, exe,
                                               quant_model_filename,
                                               quant_params_filename)

        ops_to_quantize_set = set()
        print(ops_to_quantize)
        if len(ops_to_quantize) > 0:
            ops_to_quantize_set = set(ops_to_quantize.split(','))

        op_ids_to_skip_set = set([-1])
        print(op_ids_to_skip)
        if len(op_ids_to_skip) > 0:
            op_ids_to_skip_set = set(map(int, op_ids_to_skip.split(',')))

        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
        if debug:
            graph.draw('.', 'quant_orig', graph.all_op_nodes())
        transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
            ops_to_quantize_set,
            _op_ids_to_skip=op_ids_to_skip_set,
            _scope=inference_scope,
            _place=place,
            _core=core,
            _debug=debug)
        graph = transform_to_mkldnn_int8_pass.apply(graph)
        inference_program = graph.to_program()
        with fluid.scope_guard(inference_scope):
            fluid.io.save_inference_model(save_path,
                                          feed_target_names,
                                          fetch_targets,
                                          exe,
                                          inference_program,
                                          model_filename=save_model_filename,
                                          params_filename=save_params_filename)
        print(
            "Success! INT8 model obtained from the Quant model can be found at {}\n"
            .format(save_path))
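A possible invocation of the helper above; the paths and the op list are placeholders, not values from this listing:

# Hypothetical call: read a Quant model, quantize conv2d and pool2d ops,
# and write the resulting INT8 model to the given save path.
transform_and_save_int8_model(original_path='/path/to/quant_model',
                              save_path='/path/to/int8_model',
                              ops_to_quantize='conv2d,pool2d')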
Example #6
    def test_dequantize_op_weights(self):
        program = fluid.Program()
        with fluid.program_guard(program):
            self.prepare_program_mul(program)
            graph = IrGraph(core.Graph(program.desc), for_test=True)

            op_node = ""
            for op in graph.all_op_nodes():
                if op.op().type() == self.op_name():
                    op_node = op
                    break
            assert op_node != "", "op of type %s not found" % self.op_name()

            qpass = Quant2Int8MkldnnPass(self.quantized_ops,
                                         _scope=self.scope,
                                         _place=self.place,
                                         _core=core,
                                         _debug=False)
            qpass._weight_thresholds["mul_output"] = self.mul_output_scale
            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights"], self.place)
            qpass._dequantize_op_weights(graph, op_node, "Y", "Out")

            assert np.allclose(
                self.scope.find_var("mul_weights").get_tensor(),
                [[1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.],
                 [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.],
                 [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.]])

            param = self.scope.var("mul_weights").get_tensor()
            param.set(self.variables_mul["mul_weights_bad"], self.place)
            with self.assertRaises(ValueError):
                qpass._dequantize_op_weights(graph, op_node, "Y", "Out")
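The expected values here match a per-tensor dequantization w * threshold / 127 applied to rows [1, 2, 3, 4, 5] with a threshold of 1.0; a numpy sketch under those assumptions (the fixture and threshold values are not taken from this listing):

import numpy as np

mul_weights = np.tile(np.arange(1, 6, dtype=np.float32), (3, 1))  # assumed fixture
threshold = 1.0  # assumed value of self.mul_output_scale
expected = mul_weights * threshold / 127.0
assert np.allclose(expected[0],
                   [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.])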
Example #7
    def get_op_number(self, prog):

        graph = IrGraph(core.Graph(prog.desc), for_test=False)
        quant_op_nums = 0
        op_nums = 0
        for op in graph.all_op_nodes():
            if op.name() in ['conv2d', 'depthwise_conv2d', 'mul']:
                op_nums += 1
            elif 'fake_' in op.name():
                quant_op_nums += 1
        return op_nums, quant_op_nums
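A hedged sketch of how get_op_number might back an assertion in a test, assuming the transform pass inserts one fake quantize and one fake dequantize op per quantizable op (the method name and the 2x relation are assumptions):

    def check_op_number(self, prog):
        # Hypothetical check: every conv2d/depthwise_conv2d/mul should have
        # gained fake quantize/dequantize companions after the pass ran.
        op_nums, quant_op_nums = self.get_op_number(prog)
        self.assertEqual(quant_op_nums, 2 * op_nums)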
Example #8
 def test_graph_functions(self):
     main = fluid.Program()
     startup = fluid.Program()
     with fluid.program_guard(main, startup):
         loss = residual_block(2)
         opt = fluid.optimizer.Adam(learning_rate=0.001)
         opt.minimize(loss)
     graph = IrGraph(core.Graph(main.desc), for_test=False)
     marked_nodes = set()
     for op in graph.all_op_nodes():
         if op.name().find('conv2d') > -1:
             marked_nodes.add(op)
     graph.draw('.', 'residual', marked_nodes)
     self.assertFalse(graph.has_circle())
     self.assertEqual(graph.graph_num(), 1)
     nodes = graph.topology_sort()
     self.assertEqual(len(nodes), len(graph.all_op_nodes()))
     nodes_map = graph.build_adjacency_list()
     self.assertEqual(len(nodes_map), len(graph.all_op_nodes()))
     nodes_num = len(graph.all_nodes())
     graph.safe_remove_nodes(marked_nodes)
     self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
Example #9
def quant_embedding(program, place, config=None, scope=None):
    """quantize lookup_table op parameters

    Args:
        program(paddle.static.Program): infer program
        scope(paddle.static.Scope, optional): Scope records the mapping between variable names and variables, similar to brackets in programming languages. Usually users can use `paddle.static.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_ . When ``None`` will use `paddle.static.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_. Default : ``None``.
        place(paddle.CPUPlace or paddle.CUDAPlace): This parameter represents the executor run on which device.
        config(dict, optional): config to quantize. The keys are 'quantize_op_types'. For op in quantize_op_types, you can define 'quantize_type', \
                'quantize_bits', 'dtype', 'threshold'. \
                ``quantize_type`` is  quantize type, supported types are ['abs_max'], default is "abs_max".
                ``quantize_bits`` supported bits are [8] and default is 8.
                ``dtype`` is quantize dtype, supported dtype are ['int8'], default is 'int8'.
                ``threshold`` is threshold to clip tensor before quant. When threshold is not set, \
                        tensor will not be clipped.

    Returns:
        None
    """
    config = config or {}
    config = _merge_config(copy.deepcopy(_default_config), config)
    scope = paddle.static.global_scope() if scope is None else scope

    graph = IrGraph(core.Graph(program.desc), for_test=True)
    quantize_params_map = {}
    all_op = graph.all_op_nodes()
    for op in all_op:
        if op.inputs == [] and op.outputs == []:
            continue
        op_type = op.name()
        if op_type in config['quantize_op_types']:
            weight_name = op.input('W')[0]
            if weight_name in quantize_params_map.values():
                continue
            embedding_node = graph._find_node_by_name(op.inputs,
                                                      op.input('W')[0])
            for op_node in embedding_node.outputs:
                if op_node.name() == 'fused_embedding_seq_pool':
                    _split_embedding_seq_pool(graph, op_node)
            if config[op_type]['quantize_type'] == 'abs_max':
                _quant_embedding_abs_max(graph, scope, place, config[op_type],
                                         weight_name, embedding_node)
            elif config[op_type]['quantize_type'] == 'log':
                _quant_embedding_log(graph, scope, place, config[op_type],
                                     weight_name, embedding_node)
            quantize_params_map[weight_name] = _get_quant_var_name(weight_name)
    for op in all_op:
        if op.name() == 'fused_embedding_seq_pool':
            graph.safe_remove_nodes(op)

    return graph.to_program()
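A hypothetical end-to-end use of quant_embedding; the model path and config values are placeholders, with the config layout following the docstring above:

import paddle

paddle.enable_static()
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
program, feed_names, fetch_targets = paddle.static.load_inference_model(
    '/path/to/embedding_model', exe)  # placeholder path
config = {
    'quantize_op_types': ['lookup_table'],
    'lookup_table': {'quantize_type': 'abs_max', 'quantize_bits': 8,
                     'dtype': 'int8'},
}
quant_program = quant_embedding(program, place, config)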
Example #10
def generate_dot_for_model(model_path, save_graph_dir, save_graph_name):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
    with fluid.scope_guard(inference_scope):
        if os.path.exists(os.path.join(model_path, '__model__')):
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        else:
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe,
                                                            'model', 'params')
        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
        if not os.path.exists(save_graph_dir):
            os.makedirs(save_graph_dir)
        model_name = os.path.basename(os.path.normpath(save_graph_dir))
        if save_graph_name == '':
            save_graph_name = model_name
        graph.draw(save_graph_dir, save_graph_name, graph.all_op_nodes())
        print(
            "Success! Generated dot and pdf files for the {0} model; they can be found at {1}, named {2}.\n".
            format(model_name, save_graph_dir, save_graph_name))
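A possible call, with placeholder paths; graph.draw emits a .dot file and, when the Graphviz dot binary is available, a rendered PDF:

# Hypothetical usage: visualize every op node of a saved inference model.
generate_dot_for_model(model_path='/path/to/inference_model',
                       save_graph_dir='./graphs',
                       save_graph_name='my_model')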
Example #11
    def _predict(self,
                 test_reader=None,
                 model_path=None,
                 batch_size=1,
                 batch_num=1,
                 skip_batch_num=0,
                 transform_to_int8=False):
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        inference_scope = fluid.executor.global_scope()
        with fluid.scope_guard(inference_scope):
            if os.path.exists(os.path.join(model_path, '__model__')):
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe)
            else:
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe, 'model',
                                                   'params')

            graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
            if self._debug:
                graph.draw('.', 'qat_orig', graph.all_op_nodes())
            if transform_to_int8:
                transform_to_mkldnn_int8_pass = Qat2Int8MkldnnPass(
                    self._quantized_ops,
                    _scope=inference_scope,
                    _place=place,
                    _core=core,
                    _debug=self._debug)
                graph = transform_to_mkldnn_int8_pass.apply(graph)

            inference_program = graph.to_program()

            total_correct = 0
            total_samples = 0
            batch_times = []
            ppses = []  # predictions per second
            iters = 0
            infer_start_time = time.time()
            for data in test_reader():
                if batch_num > 0 and iters >= batch_num:
                    break
                if iters == skip_batch_num:
                    total_samples = 0
                    infer_start_time = time.time()
                input0 = np.array([x[0] for x in data]).astype('int64')
                input1 = np.array([x[1] for x in data]).astype('int64')
                labels = np.array([x[2] for x in data]).astype('int64')

                start = time.time()
                out = exe.run(inference_program,
                              feed={
                                  feed_target_names[0]: input0,
                                  feed_target_names[1]: input1
                              },
                              fetch_list=fetch_targets)
                batch_time = (time.time() - start) * 1000  # in milliseconds
                batch_times.append(batch_time)
                batch_correct = self._get_batch_correct(out, labels)
                batch_len = len(data)
                total_samples += batch_len
                total_correct += batch_correct
                batch_acc = float(batch_correct) / float(batch_len)
                pps = batch_len / batch_time * 1000
                ppses.append(pps)
                latency = batch_time / batch_len
                iters += 1
                appx = ' (warm-up)' if iters <= skip_batch_num else ''
                _logger.info(
                    'batch {0}{4}, acc: {1:.4f}, latency: {2:.4f} ms, predictions per sec: {3:.2f}'
                    .format(iters, batch_acc, latency, pps, appx))

            # Postprocess benchmark data
            infer_total_time = time.time() - infer_start_time
            batch_latencies = batch_times[skip_batch_num:]
            batch_latency_avg = np.average(batch_latencies)
            latency_avg = batch_latency_avg / batch_size
            ppses = ppses[skip_batch_num:]
            pps_avg = np.average(ppses)
            acc_avg = float(np.sum(total_correct)) / float(total_samples)
            _logger.info(
                'Total inference run time: {:.2f} s'.format(infer_total_time))

            return acc_avg, pps_avg, latency_avg
Example #12
    def freeze_graph(self,
                     use_cuda,
                     seed,
                     activation_quant_type,
                     weight_quant_type='abs_max',
                     for_ci=True,
                     quant_skip_pattern='skip_quant'):
        def build_program(main, startup, is_test):
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(
                        name='image', shape=[1, 28, 28], dtype='float32')
                    label = fluid.layers.data(
                        name='label', shape=[1], dtype='int64')
                    loss = conv_net(img, label, quant_skip_pattern)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.001)
                        opt.minimize(loss)
            return [img, label], loss

        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type,
            skip_pattern=quant_skip_pattern)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)
        dev_name = '_gpu_' if use_cuda else '_cpu_'
        if not for_ci:
            marked_nodes = set()
            for op in main_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_'
                            + weight_quant_type, marked_nodes)
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_'
                            + weight_quant_type, marked_nodes)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=500),
            batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                if not for_ci:
                    print('{}: {}'.format('loss' + dev_name +
                                          activation_quant_type + '_' +
                                          weight_quant_type, loss_v))

        test_data = next(test_reader())
        with fluid.program_guard(quantized_test_program):
            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                             quantized_test_program)
        # Testing
        with fluid.scope_guard(scope):
            test_loss1, w_quant = exe.run(program=quantized_test_program,
                                          feed=feeder.feed(test_data),
                                          fetch_list=[loss, w_var])

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(
            scope=scope, place=place, weight_quantize_type=weight_quant_type)
        freeze_pass.apply(test_graph)
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_freeze' + dev_name +
                            activation_quant_type + '_' + weight_quant_type,
                            marked_nodes)

        server_program = test_graph.to_program()
        with fluid.scope_guard(scope):
            test_loss2, = exe.run(program=server_program,
                                  feed=feeder.feed(test_data),
                                  fetch_list=[loss])
        self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
        if not for_ci:
            print(
                '{}: {}'.format('test_loss1' + dev_name + activation_quant_type
                                + '_' + weight_quant_type, test_loss1))
            print(
                '{}: {}'.format('test_loss2' + dev_name + activation_quant_type
                                + '_' + weight_quant_type, test_loss2))
        w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
        # This assertion may fail due to limited calculation precision:
        # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
        if not for_ci:
            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
                                  + '_' + weight_quant_type, np.sum(w_freeze)))
            print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
                                  '_' + weight_quant_type, np.sum(w_quant)))

        # Convert parameter to 8-bit.
        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
        convert_int8_pass.apply(test_graph)
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type
                            + '_' + weight_quant_type, marked_nodes)
        server_program_int8 = test_graph.to_program()
        # Save the 8-bit parameter and model file.
        with fluid.scope_guard(scope):
            fluid.io.save_inference_model(
                'server_int8' + dev_name + activation_quant_type + '_' +
                weight_quant_type, ['image', 'label'], [loss], exe,
                server_program_int8)
            # Test whether the 8-bit parameter and model file can be loaded successfully.
            [infer, feed, fetch] = fluid.io.load_inference_model(
                'server_int8' + dev_name + activation_quant_type + '_' +
                weight_quant_type, exe)
        # Check the loaded 8-bit weight.
        w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
        self.assertEqual(w_8bit.dtype, np.int8)
        self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
        if not for_ci:
            print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
                                  '_' + weight_quant_type, np.sum(w_8bit)))
            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
                                  + '_' + weight_quant_type, np.sum(w_freeze)))

        mobile_pass = TransformForMobilePass()
        mobile_pass.apply(test_graph)
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_mobile' + dev_name +
                            activation_quant_type + '_' + weight_quant_type,
                            marked_nodes)

        mobile_program = test_graph.to_program()
        with fluid.scope_guard(scope):
            fluid.io.save_inference_model(
                'mobile_int8' + dev_name + activation_quant_type + '_' +
                weight_quant_type, ['image', 'label'], [loss], exe,
                mobile_program)
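Condensed, the pipeline exercised by freeze_graph above is a fixed pass sequence; a minimal sketch reusing the names defined in the method, in the order they are applied above:

# 1. QuantizationTransformPass: insert fake quant/dequant ops for training.
transform_pass.apply(test_graph)
# 2. QuantizationFreezePass: fold scales for inference; weights stay float.
freeze_pass.apply(test_graph)
# 3. ConvertToInt8Pass: store fc/conv weights as int8 parameters.
convert_int8_pass.apply(test_graph)
# 4. TransformForMobilePass: rewrite quant ops for mobile deployment.
mobile_pass.apply(test_graph)
mobile_program = test_graph.to_program()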
Example #13
    def mkldnn_based_freeze_graph(self,
                                  use_cuda,
                                  seed,
                                  activation_quant_type,
                                  weight_quant_type='abs_max',
                                  quant_perf=False,
                                  for_ci=False):
        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = self.build_program(main, startup, False, seed)
        self.build_program(test_program, startup, True, seed)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        # Apply the QuantizationTransformPass
        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        test_reader = paddle.batch(paddle.dataset.mnist.test(),
                                   batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)

        # Training the model to get the weights value
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(
            scope=scope, place=place, weight_quantize_type=weight_quant_type)
        freeze_pass.apply(test_graph)

        # Transform quantized graph for MKL-DNN INT8 inference
        mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=scope, _place=place)
        mkldnn_int8_pass.apply(test_graph)
        dev_name = '_cpu_'
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw(
                '.', 'test_mkldnn' + dev_name + activation_quant_type + '_' +
                weight_quant_type, marked_nodes)
        mkldnn_program = test_graph.to_program()

        # Check the transformation weights of conv2d and mul
        conv_w_mkldnn = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
        mul_w_mkldnn = np.array(scope.find_var('fc_0.w_0').get_tensor())
        # Check if weights are still integer
        self.assertFalse(self.isinteger(np.sum(conv_w_mkldnn)))
        self.assertFalse(self.isinteger(np.sum(mul_w_mkldnn)))

        # Check if the conv2d output and mul output are correctly linked to fake_dequantize's
        # output
        self.check_program(mkldnn_program)
        if not for_ci:
            print('{}: {}'.format(
                'conv_w_mkldnn' + dev_name + activation_quant_type + '_' +
                weight_quant_type, np.sum(conv_w_mkldnn)))
            print('{}: {}'.format(
                'mul_w_mkldnn' + dev_name + activation_quant_type + '_' +
                weight_quant_type, np.sum(mul_w_mkldnn)))
Example #14
def create_quant_model(model,
                       params,
                       activation_quantize_type='moving_average_abs_max',
                       weight_quantize_type='channel_wise_abs_max',
                       save=False):
    place = paddle.CUDAPlace(0)
    scope = global_scope()
    exe = paddle.static.Executor(place)
    [inference_program, feed_target_names, fetch_targets
     ] = paddle.static.load_inference_model(path_prefix=None,
                                            executor=exe,
                                            model_filename=model,
                                            params_filename=params)
    graph = IrGraph(core.Graph(inference_program.desc), for_test=True)

    out_scale_op_list = [
        "conv2d",
        "depthwise_conv2d",
        "mul",
        "matmul",
        "relu",
        "leaky_relu",
        "relu6",
        "sigmoid",
        "tanh",
        "prelu",
        "swish",
        "softmax",
        "batch_norm",
        "layer_norm",
        "elementwise_add",
        "pool2d",
        "reshape2",
        "transpose2",
        "concat",
        "elementwise_mul",
        "scale",
        "slice",
        "hard_swish",
        "hard_sigmoid",
        "conv2d_transpose",
        "gru",
        "bilinear_interp",
        "nearest_interp",
        "trilinear_interp",
        "flatten",
        "flatten2",
        "transpose",
        "pad2d",
        "reshape",
        "layer_norm",
    ]
    op_real_in_out_name = {
        "conv2d": [["Input", "Filter"], ["Output"]],
        "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
        "conv2d_transpose": [["Input", "Filter"], ["Output"]],
        "mul": [["X", "Y"], ["Out"]],
        "matmul": [["X", "Y"], ["Out"]],
        "pool2d": [["X"], ["Out"]],
        "elementwise_add": [["X", "Y"], ["Out"]],
        "concat": [["X"], ["Out"]],
        "softmax": [["X"], ["Out"]],
        "argmax": [["X"], ["Out"]],
        "transpose": [["X"], ["Out"]],
        "equal": [["X", "Y"], ["Out"]],
        "gather": [["X"], ["Out"]],
        "greater_equal": [["X", "Y"], ["Out"]],
        "greater_than": [["X", "Y"], ["Out"]],
        "less_equal": [["X", "Y"], ["Out"]],
        "less_than": [["X", "Y"], ["Out"]],
        "mean": [["X"], ["Out"]],
        "not_equal": [["X", "Y"], ["Out"]],
        "reshape": [["X"], ["Out"]],
        "reshape2": [["X"], ["Out"]],
        "transpose2": [["X"], ["Out"]],
        "bilinear_interp": [["X"], ["Out"]],
        "nearest_interp": [["X"], ["Out"]],
        "trilinear_interp": [["X"], ["Out"]],
        "slice": [["Input"], ["Out"]],
        "squeeze": [["X"], ["Out"]],
        "elementwise_sub": [["X", "Y"], ["Out"]],
        "relu": [["X"], ["Out"]],
        "relu6": [["X"], ["Out"]],
        "leaky_relu": [["X"], ["Out"]],
        "prelu": [["X"], ["Out"]],
        "tanh": [["X"], ["Out"]],
        "swish": [["X"], ["Out"]],
        "dropout": [["X"], ["Out"]],
        "batch_norm": [["X"], ["Y"]],
        "layer_norm": [["X"], ["Y"]],
        "sigmoid": [["X"], ["Out"]],
        "elementwise_mul": [["X", "Y"], ["Out"]],
        "scale": [["X"], ["Out"]],
        "hard_swish": [["X"], ["Out"]],
        "hard_sigmoid": [["X"], ["Out"]],
        "gru": [["Input", "Weight"], ["Hidden"]],
        "lstm": [["Input", "Weight"], ["Hidden"]],
        "pad2d": [["X"], ["Out"]],
        "flatten": [["X"], ["Out"]],
        "flatten2": [["X"], ["Out"]],
    }

    def _get_op_output_var_names(op):
        """ """
        assert isinstance(op, (IrNode, Operator)), \
            "The input op should be IrNode or Operator."
        var_names = []
        op_name = op.name() if isinstance(op, IrNode) \
            else op.type
        if op_name not in op_real_in_out_name:
            return []

        name_list = op_real_in_out_name[op_name][1]
        for name in name_list:
            var_name = op.output(name)
            if isinstance(var_name, list):
                var_names.extend(var_name)
            else:
                var_names.append(var_name)
        return var_names

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quantize_type,
        weight_quantize_type=weight_quantize_type)
    transform_pass.apply(graph)

    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() in out_scale_op_list:
            var_names = _get_op_output_var_names(op_node)
            for var_name in var_names:
                in_node = graph._find_node_by_name(op_node.outputs, var_name)
                if in_node.dtype() not in \
                    [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
                    continue

                op_node.op()._set_attr("out_threshold", 3.0)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quantize_type)
    freeze_pass.apply(graph)

    main_program = graph.to_program()

    # modify fake_quantize_moving_average_abs_max(InScale) and fake_channel_wise_dequantize_max_abs(Scales)
    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() == 'fake_quantize_moving_average_abs_max':
            var_name = op_node.input("InScale")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.array([1], dtype=np.float32), place)
        elif op_node.name() == 'fake_channel_wise_dequantize_max_abs':
            var_name = op_node.input("Scales")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)

    if save:
        fluid.io.save_inference_model('test_inference_model',
                                      feed_target_names,
                                      fetch_targets,
                                      exe,
                                      main_program=main_program)

    feed_vars = [
        main_program.global_block().var(name) for name in feed_target_names
    ]
    serialized_program = paddle.static.serialize_program(feed_vars,
                                                         fetch_targets,
                                                         program=main_program)
    serialized_params = paddle.static.serialize_persistables(
        feed_vars, fetch_targets, executor=exe, program=main_program)
    return serialized_program, serialized_params
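A hypothetical call to create_quant_model; the file names are placeholders, and a CUDA device is required because the function pins paddle.CUDAPlace(0):

serialized_program, serialized_params = create_quant_model(
    model='inference.pdmodel', params='inference.pdiparams')  # placeholders
with open('quant.pdmodel', 'wb') as f:
    f.write(serialized_program)
with open('quant.pdiparams', 'wb') as f:
    f.write(serialized_params)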
Example #15
    def _predict(self,
                 test_reader=None,
                 model_path=None,
                 batch_size=1,
                 batch_num=1,
                 skip_batch_num=0,
                 transform_to_int8=False):
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        inference_scope = fluid.executor.global_scope()
        with fluid.scope_guard(inference_scope):
            if os.path.exists(os.path.join(model_path, '__model__')):
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe)
            else:
                [inference_program, feed_target_names, fetch_targets
                 ] = fluid.io.load_inference_model(model_path, exe, 'model',
                                                   'params')

            graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
            if self._debug:
                graph.draw('.', 'quant_orig', graph.all_op_nodes())
            if transform_to_int8:
                mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=inference_scope,
                                                       _place=place)
                graph = mkldnn_int8_pass.apply(graph)
            else:
                graph = self._prepare_for_fp32_mkldnn(graph)

            inference_program = graph.to_program()

            dshape = [3, 224, 224]
            outputs = []
            infer_accs1 = []
            infer_accs5 = []
            fpses = []
            batch_times = []
            total_samples = 0
            iters = 0
            infer_start_time = time.time()
            for data in test_reader():
                if batch_num > 0 and iters >= batch_num:
                    break
                if iters == skip_batch_num:
                    total_samples = 0
                    infer_start_time = time.time()
                if six.PY2:
                    images = map(lambda x: x[0].reshape(dshape), data)
                if six.PY3:
                    images = list(map(lambda x: x[0].reshape(dshape), data))
                images = np.array(images).astype('float32')
                labels = np.array([x[1] for x in data]).astype('int64')

                start = time.time()
                out = exe.run(inference_program,
                              feed={feed_target_names[0]: images},
                              fetch_list=fetch_targets)
                batch_time = (time.time() - start) * 1000  # in milliseconds
                outputs.append(out[0])
                batch_acc1, batch_acc5 = self._get_batch_accuracy(
                    out[0], labels)
                infer_accs1.append(batch_acc1)
                infer_accs5.append(batch_acc5)
                samples = len(data)
                total_samples += samples
                batch_times.append(batch_time)
                fps = samples / batch_time * 1000
                fpses.append(fps)
                iters += 1
                appx = ' (warm-up)' if iters <= skip_batch_num else ''
                _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
                             'latency: {3:.4f} ms, fps: {4:.2f}'.format(
                                 iters, batch_acc1, batch_acc5,
                                 batch_time / batch_size, fps, appx))

            # Postprocess benchmark data
            batch_latencies = batch_times[skip_batch_num:]
            batch_latency_avg = np.average(batch_latencies)
            latency_avg = batch_latency_avg / batch_size
            fpses = fpses[skip_batch_num:]
            fps_avg = np.average(fpses)
            infer_total_time = time.time() - infer_start_time
            acc1_avg = np.mean(infer_accs1)
            acc5_avg = np.mean(infer_accs5)
            _logger.info(
                'Total inference run time: {:.2f} s'.format(infer_total_time))

            return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
Example #16
    def graph_apis(self, use_cuda=False, for_ci=True):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                feeds, loss = conv_block()
                opt = fluid.optimizer.Adam(learning_rate=0.001)
                opt.minimize(loss)
        graph = IrGraph(core.Graph(main.desc), for_test=False)
        backup_graph = graph.clone()
        self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes()))
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        backup_binary = fluid.CompiledProgram(
            backup_graph.graph).with_data_parallel(
                loss_name=loss.name, build_strategy=build_strategy)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)
        iters = 5
        batch_size = 8
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)

        def _train(binary):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss.name])
                if not for_ci:
                    print('{}: {}'.format('loss', loss_v))

        _train(origin_binary)
        _train(backup_binary)

        checkpoint_dir = "checkpoint_gpu" if use_cuda else "checkpoint_cpu"

        def _set_zero(var_name, scope, place):
            var = scope.find_var(var_name).get_tensor()
            var_array = np.zeros(var._get_dims()).astype("float32")
            var.set(var_array, place)

        sum_before = np.sum(
            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
            )))
        fluid.io._save_persistable_nodes(exe, checkpoint_dir, graph)
        _set_zero('conv2d_1.w_0', fluid.global_scope(), place)
        set_after = np.sum(
            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
            )))
        self.assertEqual(set_after, 0)
        fluid.io._load_persistable_nodes(exe, checkpoint_dir, graph)
        sum_after = np.sum(
            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
            )))
        self.assertEqual(sum_before, sum_after)

        marked_nodes = set()
        for op in graph.all_op_nodes():
            if op.name().find('conv2d') > -1:
                marked_nodes.add(op)
        if not for_ci:
            graph.draw('.', 'residual', marked_nodes)
            backup_marked_nodes = set()
            for op in backup_graph.all_op_nodes():
                if op.name().find('conv2d') > -1:
                    backup_marked_nodes.add(op)
            backup_graph.draw('./origin', 'backup', backup_marked_nodes)
        self.assertFalse(graph.has_circle())
        self.assertEqual(graph.graph_num(), 1)
        nodes = graph.topology_sort()
        self.assertEqual(len(nodes), len(graph.all_op_nodes()))
        nodes_map = graph.build_adjacency_list()
        self.assertEqual(len(nodes_map), len(graph.all_op_nodes()))
        nodes_num = len(graph.all_nodes())
        graph.safe_remove_nodes(marked_nodes)
        self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
Example #17
    def quantization_scale(self,
                           use_cuda,
                           seed,
                           activation_quant_type,
                           weight_quant_type='abs_max',
                           for_ci=False):
        def build_program(main, startup, is_test):
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(name='image',
                                            shape=[1, 28, 28],
                                            dtype='float32')
                    label = fluid.layers.data(name='label',
                                              shape=[1],
                                              dtype='int64')
                    loss = residual_block(img, label, 1)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.0001)
                        opt.minimize(loss)
            return [img, label], loss

        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)

        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)

        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
        add_quant_dequant_pass.apply(main_graph)
        add_quant_dequant_pass.apply(test_graph)

        scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
        scale_training_pass.apply(main_graph)

        dev_name = '_gpu' if use_cuda else '_cpu'
        if not for_ci:
            marked_nodes = set()
            for op in main_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            main_graph.draw('.', 'main_scale' + dev_name, marked_nodes)
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_scale' + dev_name, marked_nodes)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                if not for_ci:
                    print('{}: {}'.format('loss' + dev_name, loss_v))

        scale_inference_pass = OutScaleForInferencePass(scope=scope)
        scale_inference_pass.apply(test_graph)

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(
            scope=scope, place=place, weight_quantize_type=weight_quant_type)
        freeze_pass.apply(test_graph)
        server_program = test_graph.to_program()

        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'quant_scale' + dev_name, marked_nodes)

        with open('quant_scale_model' + dev_name + '.txt', 'w') as f:
            f.write(str(server_program))

        with fluid.scope_guard(scope):
            fluid.io.save_inference_model('quant_scale_model' + dev_name,
                                          ['image', 'label'], [loss], exe,
                                          server_program)
Example #18
    def check_output_with_option(self,
                                 use_gpu,
                                 atol=1e-5,
                                 flatten=False,
                                 quant=False):
        '''
        Check that the outputs are the same whether computed on CPU or GPU,
        with TensorRT enabled or disabled, and with MKLDNN enabled or
        disabled.
        '''
        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
        executor = fluid.Executor(place)
        scope = fluid.Scope()
        device = "GPU" if use_gpu else "CPU"
        with fluid.scope_guard(scope):
            executor.run(self.startup_program)

        if quant:
            main_graph = IrGraph(core.Graph(self.main_program.desc),
                                 for_test=True)

            transform_pass = QuantizationTransformPass(
                scope=scope,
                place=place,
                activation_quantize_type=self.activation_quant_type,
                weight_quantize_type=self.weight_quant_type,
                quantizable_op_type=[
                    'conv2d', 'mul', 'depthwise_conv2d', 'conv2d_transpose'
                ])
            transform_pass.apply(main_graph)
            weight_scale_map = {
                "conv2d": "conv2d_0.w_0.scale",
                "mul": "fc_0.w_0.scale"
            }

            weight_scale_tensor = scope.var(
                weight_scale_map[self.quantized_op_type]).get_tensor()
            weight_scale = np.ones(self.channels).astype("float32")
            weight_scale_tensor.set(weight_scale, place)

            op_nodes = main_graph.all_op_nodes()
            for op_node in op_nodes:
                if op_node.name() in [self.quantized_op_type, "relu"]:
                    op_node.op()._set_attr("out_threshold", 0.5)

            with fluid.scope_guard(scope):
                executor.run(program=self.main_program,
                             feed=self.feeds,
                             fetch_list=self.fetch_list)

            freeze_pass = QuantizationFreezePass(
                scope=scope,
                place=place,
                weight_quantize_type=self.weight_quant_type)
            freeze_pass.apply(main_graph)
            self.main_program = main_graph.to_program()

        outs = self._save_models(executor, self.main_program, scope)

        analysis_outputs = self._get_analysis_outputs(
            self._get_analysis_config(use_gpu=use_gpu))

        # Check whether the results calculated on CPU and on GPU are the same.
        self.assertTrue(
            len(outs) == len(analysis_outputs),
            "The number of outputs is different between inference and training forward at {}"
            .format(device))

        for out, analysis_output in zip(outs, analysis_outputs):
            out = np.array(out)
            if flatten:
                out = out.flatten()
                analysis_output = analysis_output.flatten()

            self.assertTrue(
                np.allclose(out, analysis_output, atol=atol),
                "Output has diff between inference and training forward at {} "
                .format(device))

        # Check whether the trt results and the GPU results are the same.
        if use_gpu and self.enable_trt:
            tensorrt_outputs = self._get_analysis_outputs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_trt=self.enable_trt))

            if self.trt_parameters.use_static:
                # deserialize
                tensorrt_outputs = self._get_analysis_outputs(
                    self._get_analysis_config(use_gpu=use_gpu,
                                              use_trt=self.enable_trt))

            self.assertTrue(
                len(tensorrt_outputs) == len(outs),
                "The number of outputs is different between GPU and TensorRT. "
            )

            for out, tensorrt_output in zip(outs, tensorrt_outputs):
                out = np.array(out)
                if flatten:
                    out = out.flatten()
                    tensorrt_output = tensorrt_output.flatten()

                self.assertTrue(np.allclose(out, tensorrt_output, atol=atol),
                                "Output has diff between GPU and TensorRT. ")

        # Check whether the mkldnn results and the CPU results are the same.
        if (not use_gpu) and self.enable_mkldnn:
            mkldnn_outputs = self._get_analysis_outputs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_mkldnn=self.enable_mkldnn))

            self.assertTrue(
                len(outs) == len(mkldnn_outputs),
                "The number of outputs is different between CPU and MKLDNN. ")

            if self.enable_mkldnn_bfloat16:
                atol = 0.01
            for out, mkldnn_output in zip(outs, mkldnn_outputs):
                self.assertTrue(
                    np.allclose(np.array(out), mkldnn_output, atol=atol),
                    "Output has diff between CPU and MKLDNN. ")