Example #1
0
  def testSymGradAttr(self):
    """Verifies _noinline is recorded on a noinline Defun and that its
    symbolic gradient evaluates to the expected value."""

    @function.Defun(noinline=True)
    def Foo(x):
      return x * 2

    # The instantiated FunctionDef must carry the _noinline attr.
    self.assertTrue(
        Foo.instantiate([dtypes.float32]).definition.attr["_noinline"].b)

    graph = ops.Graph()
    with graph.as_default():
      inp = constant_op.constant(3.0)
      out = Foo(inp)
      grad, = gradients_impl.gradients(out, [inp])

    # Opt level L0 with explicit CSE / inlining / folding flags.
    opts = config_pb2.OptimizerOptions(
        opt_level=config_pb2.OptimizerOptions.L0,
        do_common_subexpression_elimination=True,
        do_function_inlining=True,
        do_constant_folding=True)
    cfg = config_pb2.ConfigProto(
        graph_options=config_pb2.GraphOptions(optimizer_options=opts))

    with self.test_session(graph=graph, config=cfg):
      self.assertAllClose(out.eval(), 6.)   # Foo(3) = 2 * 3
      self.assertAllClose(grad.eval(), 2.)  # d(2x)/dx = 2
Example #2
0
def _OptimizerOptions():
    """Yields ConfigProtos covering all 8 combinations of CSE, function
    inlining and constant folding.

    Every config uses opt_level L0 and sets the matching grappler rewriter
    toggles (arithmetic_optimization, function_optimization,
    constant_folding) ON/OFF in lockstep with the legacy optimizer flags.

    Yields:
        config_pb2.ConfigProto: one configuration per flag combination.
    """
    ON = rewriter_config_pb2.RewriterConfig.ON
    OFF = rewriter_config_pb2.RewriterConfig.OFF
    for cse in [False, True]:
        for inline in [False, True]:
            for cfold in [False, True]:
                cfg = config_pb2.ConfigProto(
                    graph_options=config_pb2.GraphOptions(
                        optimizer_options=config_pb2.OptimizerOptions(
                            opt_level=config_pb2.OptimizerOptions.L0,
                            do_common_subexpression_elimination=cse,
                            do_function_inlining=inline,
                            do_constant_folding=cfold)))
                rewrites = cfg.graph_options.rewrite_options
                # Conditional expressions replace the original's three
                # duplicated if/else blocks; behavior is unchanged.
                rewrites.arithmetic_optimization = ON if cse else OFF
                rewrites.function_optimization = ON if inline else OFF
                rewrites.constant_folding = ON if cfold else OFF
                yield cfg
    def testConstantWithScopedAllocator(self):
        """ScopedAllocator optimization with constant-folded inputs.

        Each virtual CPU device feeds a constant through two independent
        all-reduces with the scoped-allocator rewrite enabled for
        CollectiveReduce; every reduced value should equal the sum of the
        per-device constants.
        """
        group_size = 2
        group_key = 1
        instance_key1 = 1  # two instance keys -> two independent reductions
        instance_key2 = 2

        graph_options = config_pb2.GraphOptions(
            optimizer_options=config_pb2.OptimizerOptions(
                do_constant_folding=True))
        cfg = config_pb2.ConfigProto(device_count={'CPU': group_size},
                                     graph_options=graph_options)
        rewrite_options = cfg.graph_options.rewrite_options
        rewrite_options.scoped_allocator_optimization = (
            rewriter_config_pb2.RewriterConfig.ON)
        # Restrict the scoped-allocator rewrite to CollectiveReduce ops only.
        del rewrite_options.scoped_allocator_opts.enable_op[:]
        rewrite_options.scoped_allocator_opts.enable_op.append(
            'CollectiveReduce')

        with self.session(config=cfg) as sess:
            run_ops = []
            for i in range(group_size):
                with ops.device('CPU:%d' % i):
                    constant = constant_op.constant(i + 1.)
                    input_tensor1 = array_ops.identity(constant)
                    input_tensor2 = array_ops.identity(constant)
                    reduced_tensor1 = collective_ops.all_reduce(
                        input_tensor1, group_size, group_key, instance_key1,
                        'Add', 'Id')
                    reduced_tensor2 = collective_ops.all_reduce(
                        input_tensor2, group_size, group_key, instance_key2,
                        'Add', 'Id')
                    run_ops.append(array_ops.identity(reduced_tensor1))
                    run_ops.append(array_ops.identity(reduced_tensor2))
            results = sess.run(run_ops)
            # Device constants are 1. and 2., so every reduction sums to 3.
            self.assertEqual(results, [3., 3., 3., 3.])
Example #4
0
def npu_optimizer_options(optimizer_options=None):
    """Return optimizer options adjusted for NPU execution.

    Args:
        optimizer_options: an existing ``config_pb2.OptimizerOptions`` to
            adjust in place, or any other value (including None) to start
            from a freshly constructed one.

    Returns:
        A ``config_pb2.OptimizerOptions`` whose ``global_jit_level`` is
        forced to ``OFF``.
    """
    # isinstance() already accepts subclass instances, so the original's
    # extra `issubclass(type(...), ...)` clause was redundant.
    if not isinstance(optimizer_options, config_pb2.OptimizerOptions):
        optimizer_options = config_pb2.OptimizerOptions()
    optimizer_options.global_jit_level = config_pb2.OptimizerOptions.OFF
    return optimizer_options
Example #5
0
    def _Run(compiled):
      """Evaluates d/dx log(x) at x=100 through a Defun and returns trace labels.

      Args:
        compiled: bool forwarded to @function.Defun's `compiled` flag.

      Returns:
        RunMetadataLabels(run_metadata) for the fully-traced gradient run
        (RunMetadataLabels is a helper defined elsewhere in this file).
      """

      @function.Defun(compiled=compiled)
      def Forward(x):
        return math_ops.log(x)

      g = ops.Graph()
      with g.as_default():
        x = array_ops.placeholder(dtypes.float32)
        y = Forward(x)
        dx, = gradients_impl.gradients(y, [x], 1.0)

      # Opt level L1 with function inlining enabled.
      cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
          optimizer_options=config_pb2.OptimizerOptions(
              opt_level=config_pb2.OptimizerOptions.L1,
              do_function_inlining=True)))
      with session_lib.Session(graph=g, config=cfg) as sess:
        run_metadata = config_pb2.RunMetadata()
        dx_val = sess.run(dx,
                          feed_dict={x: 100.},
                          run_metadata=run_metadata,
                          options=config_pb2.RunOptions(
                              trace_level=config_pb2.RunOptions.FULL_TRACE))
      # d/dx log(x) = 1/x, so the gradient at x=100 is 0.01.
      self.assertAllClose(dx_val, 0.01)
      return RunMetadataLabels(run_metadata)
Example #6
0
def _OptimizerOptions():
  """Yields ConfigProtos for every combination of CSE, function inlining
  and constant folding (8 configs total), all at opt level L0."""
  flags = [False, True]
  for do_cse in flags:
    for do_inline in flags:
      for do_fold in flags:
        opts = config_pb2.OptimizerOptions(
            opt_level=config_pb2.OptimizerOptions.L0,
            do_common_subexpression_elimination=do_cse,
            do_function_inlining=do_inline,
            do_constant_folding=do_fold)
        yield config_pb2.ConfigProto(
            graph_options=config_pb2.GraphOptions(optimizer_options=opts))
Example #7
0
    def testFoo(self):
        """Checks that Cell's noinline setting controls whether its call
        frames appear in the runtime trace, without changing results.
        """
        dtype = dtypes.float32
        # Opt level L0 plus explicit CSE / inlining / folding flags.
        cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
            optimizer_options=config_pb2.OptimizerOptions(
                opt_level=config_pb2.OptimizerOptions.L0,
                do_common_subexpression_elimination=True,
                do_function_inlining=True,
                do_constant_folding=True)))
        # Used below to detect Cell function-call frames in timeline labels.
        cell_func_call_pattern = re.compile(r"Cell[^/]*\(")
        for noinline in [False, True]:

            @function.Defun(dtype, noinline=noinline)
            def Cell(v):
                # If v is a vector [n, 1], x is a big square matrix.
                x = math_ops.tanh(v + array_ops.transpose(v, [1, 0]))
                return math_ops.reduce_sum(x, 1, keep_dims=True)

            @function.Defun(dtype)
            def Forward(x):
                # Apply Cell ten times; keep_dims keeps the [n, 1] shape.
                for _ in range(10):
                    # pylint: disable=cell-var-from-loop
                    x = Cell(x)
                return math_ops.reduce_sum(x, [0, 1])

            self.assertEqual(noinline, Cell.definition.attr["_noinline"].b)

            g = ops.Graph()
            with g.as_default():
                x = array_ops.placeholder(dtype)
                y = Forward(x)
                dx, = gradients_impl.gradients([y], [x])

            np.random.seed(321)  # fixed seed so the expected values hold
            inp = np.random.uniform(-1, 1, [16, 1]).astype(np.float32)
            run_metadata = config_pb2.RunMetadata()
            with session.Session(graph=g, config=cfg) as sess:
                ans = sess.run(
                    [y, dx], {x: inp},
                    run_metadata=run_metadata,
                    options=config_pb2.RunOptions(
                        trace_level=config_pb2.RunOptions.FULL_TRACE))
                print(ans[0], np.sum(ans[1]))
                self.assertAllClose(ans[0], 255.971, rtol=1e-3)
                self.assertAllClose(np.sum(ans[1]), 13.0408, rtol=1e-3)

            def MetadataHasCell(run_metadata):
                # True iff any traced node's timeline label matches the Cell
                # call pattern.
                for dev_stats in run_metadata.step_stats.dev_stats:
                    for node_stats in dev_stats.node_stats:
                        if cell_func_call_pattern.search(
                                node_stats.timeline_label):
                            return True
                return False

            # Cell appears in the trace exactly when it was not inlined.
            self.assertEqual(MetadataHasCell(run_metadata), noinline)
    def _run_graph(self, device, output_shape, variable, num_outputs, axis):
        """Build a split benchmark graph and report it via run_op_benchmark.

        Does not return a value; the original docstring's "Returns" section
        was inaccurate.

        Args:
          device: string, the device to run on.
          output_shape: shape of each output tensor.
          variable: if True, per-output sizes along `axis` are sampled
            randomly near output_shape[axis]; if False, all outputs share
            output_shape.
          num_outputs: the number of outputs to split the input into.
          axis: axis to be split.
        """
        graph = ops.Graph()
        with graph.as_default():
            if not variable:
                if axis == 0:
                    input_shape = [
                        output_shape[0] * num_outputs, output_shape[1]
                    ]
                    sizes = [output_shape[0] for _ in range(num_outputs)]
                else:
                    input_shape = [
                        output_shape[0], output_shape[1] * num_outputs
                    ]
                    sizes = [output_shape[1] for _ in range(num_outputs)]
            else:
                # Random per-output sizes in [max(1, s - 2), s + 2) where
                # s = output_shape[axis].
                sizes = np.random.randint(low=max(1, output_shape[axis] - 2),
                                          high=output_shape[axis] + 2,
                                          size=num_outputs)
                total_size = np.sum(sizes)
                if axis == 0:
                    input_shape = [total_size, output_shape[1]]
                else:
                    input_shape = [output_shape[0], total_size]

            outputs = build_graph(device, input_shape, sizes, axis)
        # Opt level L0 — presumably to keep the graph optimizer from
        # rewriting the benchmarked graph; TODO confirm.
        config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
            optimizer_options=config_pb2.OptimizerOptions(
                opt_level=config_pb2.OptimizerOptions.L0)))
        with session_lib.Session(graph=graph, config=config) as session:
            logging.set_verbosity("info")
            variables.global_variables_initializer().run()
            bench = benchmark.TensorFlowBenchmark()
            # mbs: megabytes per step. 4 = sizeof(float32); the extra
            # "* 2 * 100" factors match the concat benchmark's throughput
            # formula — TODO confirm their intended meaning.
            bench.run_op_benchmark(session,
                                   outputs,
                                   mbs=input_shape[0] * input_shape[1] * 4 *
                                   2 * 100 / 1e6,
                                   extras={
                                       "input_shape": input_shape,
                                       "variable": variable,
                                       "axis": axis
                                   })
    def _run_graph(self, device, input_shape, variable, num_inputs, axis, grad,
                   num_iters):
        """Run the concat benchmark graph and print/report its timings.

        Args:
          device: string, the device to run on.
          input_shape: shape of the input tensors.
          variable: whether or not the input shape should be fixed.
          num_inputs: the number of inputs to concat.
          axis: axis to be concat'ed.
          grad: if True compute the gradient.
          num_iters: number of steps to run.

        Returns:
          Total wall-clock duration of the num_iters timed runs, in seconds
          (warm-up run excluded).
        """
        graph = ops.Graph()
        with graph.as_default():
            outputs = build_graph(device, input_shape, variable, num_inputs,
                                  axis, grad)
        # Opt level L0 — presumably to keep the graph optimizer from
        # rewriting the benchmarked graph; TODO confirm.
        config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
            optimizer_options=config_pb2.OptimizerOptions(
                opt_level=config_pb2.OptimizerOptions.L0)))
        with session_lib.Session(graph=graph, config=config) as session:
            variables.global_variables_initializer().run()
            _ = session.run(outputs)  # warm up.
            start_time = time.time()
            for _ in range(num_iters):
                _ = session.run(outputs)
            duration = time.time() - start_time
            # Throughput estimate: 4 = sizeof(float32); the "* 2 * 100"
            # factors mirror the split benchmark's mbs formula — TODO
            # confirm their intended meaning.
            print(
                "%s shape:%d/%d var: %r #inputs:%d axis:%d grad:%r - %f secs - %f "
                "GB/sec" %
                (device, input_shape[0], input_shape[1], variable, num_inputs,
                 axis, grad, duration / num_iters,
                 num_inputs * input_shape[0] * input_shape[1] * 4 * 2 * 100 /
                 (duration / num_iters) / 1e9))

        name_template = (
            "concat_bench_{device}_input_shape_{shape}_variable_{variable}"
            "_num_inputs_{num_inputs}_axis_{axis}_grad_{grad}")

        # NOTE(review): the template has no {iters} placeholder, so the
        # iters= keyword below is silently ignored by format() — harmless,
        # but worth confirming it was meant to be part of the name.
        self.report_benchmark(
            name=name_template.format(device=device,
                                      num_inputs=num_inputs,
                                      variable=variable,
                                      grad=grad,
                                      shape=str(input_shape).replace(" ", ""),
                                      axis=str(axis),
                                      iters=num_iters))

        return duration
def randn_sampler_switchover(shape, num_iters, use_gpu=False):
  """Benchmarks parameterized_truncated_normal around the randn switchover.

  Builds two samplers whose lower bound lies just above and just below the
  stddev threshold at which the randn rejection sampler is used, and times
  num_iters runs of each.

  Args:
    shape: output shape for parameterized_truncated_normal.
    num_iters: number of timed session runs per sampler.
    use_gpu: place the ops on /gpu:0 when True, else /cpu:0.

  Returns:
    (randn_dt, uniform_dt): total wall-clock seconds for the below-threshold
    and above-threshold samplers respectively.
  """
  # Benchmark by constructing samplers on the threshold of using the randn
  # rejection sampling and check that this threshold is set correctly by
  # benchmarking with bounds just above and below this threshold.
  # The uniform and randn samplers should have about the same performance
  # at this point.

  stddev_inside_bounds_before_using_randn = (
      _get_stddev_inside_bounds_before_using_randn(use_gpu))

  epsilon = 0.001

  np.random.seed(1618)  # Make it reproducible.

  # No CSE/CF.
  optimizer_options = config_pb2.OptimizerOptions(
      opt_level=config_pb2.OptimizerOptions.L0)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(
          optimizer_options=optimizer_options))

  with session.Session(config=config) as sess:
    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
      # Lower bound just above the threshold (uniform sampling regime).
      uniform_sampler_op = control_flow_ops.group(
          random_ops.parameterized_truncated_normal(
              shape,
              means=0.,
              stddevs=1.0,
              minvals=-stddev_inside_bounds_before_using_randn + epsilon,
              maxvals=0.01))
      # Lower bound just below the threshold (randn sampling regime).
      randn_sampler_op = control_flow_ops.group(
          random_ops.parameterized_truncated_normal(
              shape,
              means=0.,
              stddevs=1.0,
              minvals=-stddev_inside_bounds_before_using_randn - epsilon,
              maxvals=0.01))

    # Burn-in to avoid session setup costs in the timing.
    sess.run(uniform_sampler_op)
    sess.run(uniform_sampler_op)
    uniform_dt = timeit.timeit(
        lambda: sess.run(uniform_sampler_op), number=num_iters)

    sess.run(randn_sampler_op)
    sess.run(randn_sampler_op)
    randn_dt = timeit.timeit(
        lambda: sess.run(randn_sampler_op), number=num_iters)

    return randn_dt, uniform_dt
Example #11
0
  def benchmark_reduce_sum_grad_graph(self):
    """Times the gradient op of reduce_sum over a 100x1000 float32 zero
    tensor in graph mode with optimizations at L0."""
    opts = config_pb2.OptimizerOptions(
        opt_level=config_pb2.OptimizerOptions.L0)
    config = config_pb2.ConfigProto(
        graph_options=config_pb2.GraphOptions(optimizer_options=opts))
    with ops.Graph().as_default(), session.Session(config=config) as sess:

      data = constant_op.constant(np.zeros([100, 1000], dtype=np.float32))
      total = math_ops.reduce_sum(data)
      grad, = gradients_impl.gradients(total, data)

      def fn():
        self.evaluate(grad.op)

      self._run(fn, 10000)
    def testScopedAllocatorWithXla(self):
        """ScopedAllocator optimization composed with jit-compiled inputs.

        Each virtual CPU device computes f(x) = 2*x + 1 under
        jit_compile=True and feeds the result into two independent
        all-reduces; every reduced element should equal the sum over
        devices.
        """
        group_size = 2
        group_key = 1
        instance_key1 = 1  # two instance keys -> two independent reductions
        instance_key2 = 2
        tensor_size = 10

        # Constant folding is off here — presumably so f()'s outputs are
        # not folded away before the collectives run; confirm.
        graph_options = config_pb2.GraphOptions(
            optimizer_options=config_pb2.OptimizerOptions(
                do_constant_folding=False))
        cfg = config_pb2.ConfigProto(device_count={'CPU': group_size},
                                     graph_options=graph_options)
        rewrite_options = cfg.graph_options.rewrite_options
        rewrite_options.scoped_allocator_optimization = (
            rewriter_config_pb2.RewriterConfig.ON)
        # Restrict the scoped-allocator rewrite to CollectiveReduce ops only.
        del rewrite_options.scoped_allocator_opts.enable_op[:]
        rewrite_options.scoped_allocator_opts.enable_op.append(
            'CollectiveReduce')

        # Tests that execute collectives need to be enclosed in graph or tf.function
        with ops.Graph().as_default(), self.session(config=cfg) as sess:
            run_ops = []
            for i in range(group_size):
                with ops.device('CPU:%d' % i):
                    tensor_val = [i + 1.] * tensor_size
                    constant = constant_op.constant(tensor_val)

                    @def_function.function(jit_compile=True)
                    def f(x):
                        return 2 * x + 1

                    input_tensor1 = array_ops.identity(f(constant))
                    input_tensor2 = array_ops.identity(f(constant))
                    reduced_tensor1 = collective_ops.all_reduce(
                        input_tensor1, group_size, group_key, instance_key1,
                        'Add', 'Id')
                    reduced_tensor2 = collective_ops.all_reduce(
                        input_tensor2, group_size, group_key, instance_key2,
                        'Add', 'Id')
                    run_ops.append(array_ops.identity(reduced_tensor1))
                    run_ops.append(array_ops.identity(reduced_tensor2))
            results = sess.run(run_ops)
            # Device 0 contributes f(1) = 3 and device 1 contributes
            # f(2) = 5, so the all-reduce sum is 8 in every position.
            for result in results:
                for result_val in result:
                    self.assertEqual(result_val, 8.)
Example #13
0
def native_op_vs_composed_ops(batch_size, num_classes, num_samples, num_iters):
  """Times the native sampler against the composed-ops sampler.

  Returns a (native_dt, composed_dt) pair of total wall-clock seconds over
  num_iters session runs each, with CSE and constant folding disabled.
  """
  np.random.seed(1618)  # Make it reproducible.
  shape = [batch_size, num_classes]
  logits_np = np.random.randn(*shape).astype(np.float32)

  # No CSE/CF.
  opts = config_pb2.OptimizerOptions(opt_level=config_pb2.OptimizerOptions.L0)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(optimizer_options=opts))

  with session.Session(config=config) as sess:
    logits = constant_op.constant(logits_np, shape=shape)
    native_op = control_flow_ops.group(native_sampler(logits, num_samples))
    composed_op = control_flow_ops.group(
        composed_sampler(logits, num_samples))

    native_dt = timeit.timeit(lambda: sess.run(native_op), number=num_iters)
    composed_dt = timeit.timeit(
        lambda: sess.run(composed_op), number=num_iters)
    return native_dt, composed_dt
    def testTanhSymGrad(self):
        """Checks the symbolic gradient of sum(tanh(x)) through a Defun."""

        @function.Defun(dtypes.float32)
        def Forward(x):
            return math_ops.reduce_sum(math_ops.tanh(x))

        graph = ops.Graph()
        with graph.as_default():
            inp_ph = array_ops.placeholder(dtypes.float32)
            out = Forward(inp_ph)
            grads = gradients_impl.gradients([out], [inp_ph])

        values = np.array([-1, 1, 2, -2], dtype=np.float32)
        # Opt level L1 with function inlining enabled.
        cfg = config_pb2.ConfigProto(
            graph_options=config_pb2.GraphOptions(
                optimizer_options=config_pb2.OptimizerOptions(
                    opt_level=config_pb2.OptimizerOptions.L1,
                    do_function_inlining=True)))
        with session.Session(graph=graph, config=cfg) as sess:
            grad_out, = sess.run(grads, {inp_ph: values})
        # d/dx tanh(x) = 1 - tanh(x)^2.
        self.assertAllClose(1 - np.square(np.tanh(values)), grad_out)
  def testControlFlowStrictness(self):
    """Inlined functions must not execute in an untaken control flow branch."""

    @function.Defun(dtypes.int32)
    def AssertFail(x):
      # Assertion that always fails and does not have a data dependency on `x`.
      assert_false = control_flow_ops.Assert(False, [42])
      with ops.control_dependencies([assert_false]):
        return array_ops.identity(x)

    with ops.device("CPU"):
      pred = array_ops.placeholder(dtypes.bool)
      x = array_ops.placeholder(dtypes.int32)
      # AssertFail sits on the False branch of cond and in the loop body.
      cond = control_flow_ops.cond(pred, lambda: x + 1, lambda: AssertFail(x))
      # pylint: disable=unnecessary-lambda
      loop = control_flow_ops.while_loop(lambda y: pred,
                                         lambda y: AssertFail(y), [x])
      # pylint: enable=unnecessary-lambda

    # Enables inlining.
    config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
        optimizer_options=config_pb2.OptimizerOptions(
            opt_level=config_pb2.OptimizerOptions.L0,
            do_common_subexpression_elimination=True,
            do_function_inlining=True,
            do_constant_folding=True)))

    with session.Session(config=config) as sess:
      # Since the 'False' branch is not taken, the assertion should not fire.
      self.assertEqual(4, sess.run(cond, {pred: True, x: 3}))

      # The assertion should still fire if the False branch is taken.
      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                   "assertion"):
        sess.run(cond, {pred: False, x: 3})

      # Similarly for loops.
      self.assertEqual(3, sess.run(loop, {pred: False, x: 3}))
      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                   "assertion"):
        sess.run(loop, {pred: True, x: 3})
Example #16
0
def parameterized_vs_naive(shape, num_iters, use_gpu=False):
  """Times parameterized_truncated_normal against truncated_normal.

  Returns a (param_dt, naive_dt) pair of total wall-clock seconds for
  num_iters session runs of each sampler, with CSE/constant folding off.
  """
  np.random.seed(1618)  # Make it reproducible.

  # No CSE/CF.
  opts = config_pb2.OptimizerOptions(opt_level=config_pb2.OptimizerOptions.L0)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(optimizer_options=opts))

  with session.Session(config=config) as sess:
    device_name = None if use_gpu else "/cpu:0"
    with ops.device(device_name):
      param_op = control_flow_ops.group(
          random_ops.parameterized_truncated_normal(shape))
      naive_op = control_flow_ops.group(random_ops.truncated_normal(shape))

    # Burn-in to keep session setup costs out of the timings.
    for _ in range(2):
      sess.run(param_op)
    param_dt = timeit.timeit(lambda: sess.run(param_op), number=num_iters)

    for _ in range(2):
      sess.run(naive_op)
    naive_dt = timeit.timeit(lambda: sess.run(naive_op), number=num_iters)
    return param_dt, naive_dt