Example #1
    def testFoldFusedBatchNormsWithSharedWeights(self):
        for data_format, conv2d_func in [
            ("NHWC", nn_ops.conv2d), ("NCHW", nn_ops.conv2d),
            ("NHWC", nn_ops.depthwise_conv2d_native),
            ("NCHW", nn_ops.depthwise_conv2d_native)
        ]:
            # Use a fresh graph per iteration so node names do not collide
            # across loop iterations.
            with tf.compat.v1.Session(graph=tf.Graph()) as sess:
                _generate_fused_batchnorm(data_format, conv2d_func, 2)
                original_graph_def = sess.graph_def
                original_result = sess.run(["output:0"])
            optimized_graph_def = fold_batch_norms.fold_batch_norms(
                original_graph_def)

            with tf.compat.v1.Session(graph=tf.Graph()) as sess:
                _ = importer.import_graph_def(optimized_graph_def,
                                              input_map={},
                                              name="optimized")
                optimized_result = sess.run(["optimized/output:0"])

                self.assertAllClose(original_result,
                                    optimized_result,
                                    rtol=1e-04,
                                    atol=1e-06)

                for node in optimized_graph_def.node:
                    self.assertNotEqual("FusedBatchNormV3", node.op)
Example #2
    def testFoldFusedBatchNormWithBias(self):
        for data_format, conv2d_func in [
            ("NHWC", nn_ops.conv2d),
            ("NHWC", nn_ops.depthwise_conv2d_native),
        ]:
            graph = tf1.Graph()
            with tf1.Session(graph=graph) as sess:
                count = 1
                add_bias = True
                _generate_fused_batchnorm(data_format, conv2d_func, count,
                                          add_bias)
                original_graph_def = sess.graph_def
                original_result = sess.run(["output:0"])
            optimized_graph_def = fold_batch_norms.fold_batch_norms(
                original_graph_def)
            with tf1.Session(graph=tf1.Graph()) as sess:
                _ = importer.import_graph_def(optimized_graph_def,
                                              input_map={},
                                              name="optimized")
                optimized_result = sess.run(["optimized/output:0"])

                self.assertAllClose(original_result,
                                    optimized_result,
                                    rtol=1e-04,
                                    atol=1e-06)

                bias_nodes = [
                    node for node in optimized_graph_def.node
                    if node.op == 'BiasAdd'
                ]
                self.assertEqual(len(bias_nodes), 1)
                for node in optimized_graph_def.node:
                    self.assertNotEqual("FusedBatchNormV3", node.op)
Example #3
    def testFoldBatchNorms(self):
        with tf.compat.v1.Session() as sess:
            inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
            input_op = constant_op.constant(np.array(inputs),
                                            shape=[1, 1, 6, 2],
                                            dtype=dtypes.float32)
            weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
            weights_op = constant_op.constant(np.array(weights),
                                              shape=[1, 2, 2, 2],
                                              dtype=dtypes.float32)
            conv_op = nn_ops.conv2d(input_op,
                                    weights_op, [1, 1, 1, 1],
                                    padding="SAME",
                                    name="conv_op")
            mean_op = constant_op.constant(np.array([10, 20]),
                                           shape=[2],
                                           dtype=dtypes.float32)
            variance_op = constant_op.constant(np.array([0.25, 0.5]),
                                               shape=[2],
                                               dtype=dtypes.float32)
            beta_op = constant_op.constant(np.array([0.1, 0.6]),
                                           shape=[2],
                                           dtype=dtypes.float32)
            gamma_op = constant_op.constant(np.array([1.0, 2.0]),
                                            shape=[2],
                                            dtype=dtypes.float32)
            test_util.set_producer_version(ops.get_default_graph(), 8)
            gen_nn_ops._batch_norm_with_global_normalization(conv_op,
                                                             mean_op,
                                                             variance_op,
                                                             beta_op,
                                                             gamma_op,
                                                             0.00001,
                                                             False,
                                                             name="output")
            original_graph_def = sess.graph_def
            original_result = sess.run(["output:0"])
        optimized_graph_def = fold_batch_norms.fold_batch_norms(
            original_graph_def)
        with tf.compat.v1.Session() as sess:
            _ = importer.import_graph_def(optimized_graph_def,
                                          input_map={},
                                          name="optimized")
            optimized_result = sess.run(["optimized/output:0"])

        self.assertAllClose(original_result, optimized_result)

        for node in optimized_graph_def.node:
            self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
Example #4
def optimize_graph(graph,
                   signature_def,
                   output_graph,
                   tf_version,
                   quantization_dtype=None,
                   skip_op_check=False,
                   strip_debug_ops=False,
                   weight_shard_size_bytes=1024 * 1024 * 4):
    """Takes a Python Graph object and optimizes the graph.

  Args:
    graph: The frozen graph to optimize.
    signature_def: the SignatureDef of the inference graph.
    output_graph: The location of the output graph.
    tf_version: Tensorflow version of the input graph.
    quantization_dtype: An optional numpy dtype to quantize weights to for
      compression. Only np.uint8 and np.uint16 are supported.
    skip_op_check: Bool whether to skip the op check.
    strip_debug_ops: Bool whether to strip debug ops.
    weight_shard_size_bytes: Shard size (in bytes) of the weight files.
      The size of each weight file will be <= this value.
  """

    # Add a collection 'train_op' so that Grappler knows the outputs.
    for _, output in signature_def.outputs.items():
        name = output.name.split(':')[0]
        graph.add_to_collection('train_op', graph.get_operation_by_name(name))

    graph_def = graph.as_graph_def()

    unsupported = validate(graph_def.node, skip_op_check, strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model before optimization\n' +
                         ', '.join(unsupported))

    # First pass of Grappler optimization; this is needed for batch norm folding.
    config = config_pb2.ConfigProto()
    rewriter_config = config.graph_options.rewrite_options
    rewriter_config.optimizers[:] = [
        'pruning', 'constfold', 'arithmetic', 'dependency', 'pruning',
        'constfold', 'arithmetic', 'dependency'
    ]
    if strip_debug_ops:
        rewriter_config.optimizers.insert(0, 'debug_stripper')

    optimized_graph = _run_grappler(config, graph_def, graph, signature_def)

    # batch norm folding
    optimized_graph = fold_batch_norms.fold_batch_norms(optimized_graph)

    # Set the device to CPU for all Conv2D and MatMul nodes, since the Grappler
    # remap optimizer only supports FusedConv2D and FusedMatMul on CPU.
    for node in optimized_graph.node:
        if node.op == 'Conv2D' or node.op == 'MatMul':
            node.device = '/device:CPU:0'

    # Rerun Grappler to fuse Conv2D/MatMul.
    config.graph_options.rewrite_options.optimizers[:] = [
        'remap', 'constfold', 'arithmetic', 'dependency'
    ]

    optimized_graph = _run_grappler(config, optimized_graph, graph,
                                    signature_def)
    optimized_graph = _remove_unused_control_flow_inputs(optimized_graph)

    # Because TF breaks the Prelu op into 6 ops, for performance we fuse
    # those ops back into a single Prelu.
    optimized_graph = fuse_prelu.fuse_ops_for_prelu(optimized_graph)

    # Because Grappler does not support DepthwiseConv2d fusing, we have
    # implemented it here.
    optimized_graph = fuse_depthwise_conv2d.fuse_depthwise_conv2d(
        optimized_graph)

    # Since the Grappler remap optimizer does not support Prelu as the
    # activation function for the _FusedConv2D op, we do it manually here.
    optimized_graph = fuse_prelu.fuse_prelu_with_fused_conv2d_or_matmul(
        optimized_graph)

    unsupported = validate(optimized_graph.node, skip_op_check,
                           strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model after optimization\n' +
                         ', '.join(unsupported))

    extract_weights(optimized_graph, output_graph, tf_version, signature_def,
                    quantization_dtype, weight_shard_size_bytes)
    return optimized_graph
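The `_run_grappler` helper is not shown in this listing. A plausible sketch, assuming it wraps TensorFlow's `tf_optimizer.OptimizeGraph`, which expects a MetaGraphDef carrying the signature so Grappler can identify the fetch nodes (the 'not_used_key' map key is an assumption):

from tensorflow.python.grappler import tf_optimizer
from tensorflow.python.training.saver import export_meta_graph


def _run_grappler(config, graph_def, graph, signature_def):
    """Runs the Grappler passes listed in `config` over `graph_def`.

    Sketch for illustration; the real helper lives in the same module as
    optimize_graph.
    """
    meta_graph = export_meta_graph(graph_def=graph_def, graph=graph)
    # Grappler looks up fetch nodes via the signature; the key is arbitrary.
    meta_graph.signature_def['not_used_key'].CopyFrom(signature_def)
    return tf_optimizer.OptimizeGraph(config, meta_graph)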
Example #5
def optimize_graph(graph,
                   output_node_names,
                   output_graph,
                   tf_version,
                   quantization_dtype=None,
                   skip_op_check=False,
                   strip_debug_ops=False):
    """Takes a Python Graph object and optimizes the graph.

  Args:
    graph: The frozen graph to optimize.
    output_node_names: List of output node names.
    output_graph: The location of the output graph.
    tf_version: Tensorflow version of the input graph.
    quantization_dtype: An optional numpy dtype to quantize weights to for
      compression. Only np.uint8 and np.uint16 are supported.
    skip_op_check: Bool whether to skip the op check.
    strip_debug_ops: Bool whether to strip debug ops.
  """
    fuse_prelu.register_prelu_func(graph)

    # Add a collection 'train_op' so that Grappler knows the outputs.
    for output in output_node_names:
        graph.add_to_collection('train_op',
                                graph.get_operation_by_name(output))

    graph_def = graph.as_graph_def()

    unsupported = validate(graph_def.node, skip_op_check, strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model before optimization\n' +
                         ', '.join(unsupported))

    # Because TF breaks the Prelu op into 6 ops, for performance we fuse
    # those ops back into a single Prelu.
    optimized_graph = fuse_prelu.fuse_ops_for_prelu(graph_def)

    # First pass of Grappler optimization; this is needed for batch norm folding.
    config = config_pb2.ConfigProto()
    rewriter_config = config.graph_options.rewrite_options
    rewriter_config.optimizers[:] = [
        'pruning', 'constfold', 'arithmetic', 'dependency', 'pruning',
        'constfold', 'arithmetic', 'dependency'
    ]
    if strip_debug_ops:
        rewriter_config.optimizers.insert(0, 'debug_stripper')

    optimized_graph = _run_grappler(config, optimized_graph, graph)

    # batch norm folding
    optimized_graph = fold_batch_norms.fold_batch_norms(optimized_graph)

    # Set the device to CPU for all Conv2D nodes, since the Grappler remap
    # optimizer only supports FusedConv2D on CPU.
    for node in optimized_graph.node:
        if node.op == 'Conv2D':
            node.device = '/device:CPU:0'

    # Rerun Grappler to fuse Conv2D.
    config.graph_options.rewrite_options.optimizers[:] = [
        'remap', 'constfold', 'arithmetic', 'dependency'
    ]

    optimized_graph = _run_grappler(config, optimized_graph, graph)

    # Since the Grappler remap optimizer does not support Prelu as the
    # activation function for the _FusedConv2D op, we do it manually here.
    optimized_graph = fuse_prelu.fuse_prelu_with_fused_conv2d(optimized_graph)

    unsupported = validate(optimized_graph.node, skip_op_check,
                           strip_debug_ops)

    if unsupported:
        raise ValueError('Unsupported Ops in the model after optimization\n' +
                         ', '.join(unsupported))

    extract_weights(optimized_graph, output_graph, tf_version,
                    quantization_dtype)
    return optimized_graph
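A typical invocation of this older variant might look like the following; the model path ('frozen_model.pb'), the output node name ('Softmax'), and the output location are hypothetical:

import tensorflow.compat.v1 as tf1

# Load a frozen GraphDef from disk (hypothetical path).
graph_def = tf1.GraphDef()
with tf1.gfile.GFile('frozen_model.pb', 'rb') as f:
    graph_def.ParseFromString(f.read())

# Rehydrate it into a Graph object, which is what optimize_graph expects.
with tf1.Graph().as_default() as graph:
    tf1.import_graph_def(graph_def, name='')

optimize_graph(graph,
               output_node_names=['Softmax'],  # hypothetical output node
               output_graph='web_model/model.json',
               tf_version=tf1.VERSION,
               strip_debug_ops=True)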
Example #6
    def testFoldFusedBatchNorms(self):
        for data_format, conv2d_func in [
            ("NHWC", nn_ops.conv2d), ("NCHW", nn_ops.conv2d),
            ("NHWC", nn_ops.depthwise_conv2d_native),
            ("NCHW", nn_ops.depthwise_conv2d_native)
        ]:
            # Use a fresh graph per iteration so node names do not collide
            # across loop iterations.
            with tf.compat.v1.Session(graph=tf.Graph()) as sess:
                inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
                input_op = constant_op.constant(
                    np.array(inputs),
                    shape=[1, 1, 6, 2]
                    if data_format == "NHWC" else [1, 2, 1, 6],
                    dtype=dtypes.float32)
                if conv2d_func == nn_ops.conv2d:
                    weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
                    weights_op = constant_op.constant(np.array(weights),
                                                      shape=[1, 2, 2, 2],
                                                      dtype=dtypes.float32)
                else:
                    weights = [1, 2, 0.3, 0.4]
                    weights_op = constant_op.constant(np.array(weights),
                                                      shape=[1, 2, 2, 1],
                                                      dtype=dtypes.float32)
                conv_op = conv2d_func(input_op,
                                      weights_op, [1, 1, 1, 1],
                                      padding="SAME",
                                      data_format=data_format,
                                      name="conv_op")
                mean_op = constant_op.constant(np.array([10, 20]),
                                               shape=[2],
                                               dtype=dtypes.float32)
                variance_op = constant_op.constant(np.array([0.25, 0.5]),
                                                   shape=[2],
                                                   dtype=dtypes.float32)
                beta_op = constant_op.constant(np.array([0.1, 0.6]),
                                               shape=[2],
                                               dtype=dtypes.float32)
                gamma_op = constant_op.constant(np.array([1.0, 2.0]),
                                                shape=[2],
                                                dtype=dtypes.float32)
                ops.get_default_graph().graph_def_versions.producer = 9
                gen_nn_ops._fused_batch_norm(conv_op,
                                             gamma_op,
                                             beta_op,
                                             mean_op,
                                             variance_op,
                                             0.00001,
                                             is_training=False,
                                             data_format=data_format,
                                             name="output")
                original_graph_def = sess.graph_def
                original_result = sess.run(["output:0"])
            optimized_graph_def = fold_batch_norms.fold_batch_norms(
                original_graph_def)
            with tf.compat.v1.Session(graph=tf.Graph()) as sess:
                _ = importer.import_graph_def(optimized_graph_def,
                                              input_map={},
                                              name="optimized")
                optimized_result = sess.run(["optimized/output:0"])

                self.assertAllClose(original_result,
                                    optimized_result,
                                    rtol=1e-04,
                                    atol=1e-06)

                for node in optimized_graph_def.node:
                    self.assertNotEqual("FusedBatchNorm", node.op)