def testPeek(self):
    with ops.Graph().as_default() as G:
      with ops.device('/cpu:0'):
        x = array_ops.placeholder(dtypes.int32, name='x')
        pi = array_ops.placeholder(dtypes.int64)
        gi = array_ops.placeholder(dtypes.int64)
        p = array_ops.placeholder(dtypes.int32, name='p')
      with ops.device(test.gpu_device_name()):
        stager = data_flow_ops.MapStagingArea(
            [
                dtypes.int32,
            ], shapes=[[]])
        stage = stager.put(pi, [x], [0])
        peek = stager.peek(gi)
        size = stager.size()

    G.finalize()

    n = 10

    with self.session(use_gpu=True, graph=G) as sess:
      for i in range(n):
        sess.run(stage, feed_dict={x: i, pi: i})

      for i in range(n):
        self.assertTrue(sess.run(peek, feed_dict={gi: i})[0] == i)

      self.assertTrue(sess.run(size) == 10)
 def testColocation(self):
   with ops.device("/job:ps"):
     var = variables.VariableV1(0, name="v")
   with ops.device("/job:worker/task:7"):
     assign_op = var.assign(1)
   self.assertDeviceEqual("/job:ps", assign_op.device)
   self.assertEqual([b"loc:@v"], assign_op.op.colocation_groups())
Example #3
 def testManyCPUs(self):
   run_options = config_pb2.RunOptions(
       trace_level=config_pb2.RunOptions.FULL_TRACE)
   run_metadata = config_pb2.RunMetadata()
   config = config_pb2.ConfigProto(device_count={'CPU': 3})
   with session.Session(config=config) as sess:
     with ops.device('/cpu:0'):
       num1 = variables.Variable(1.0, name='num1')
     with ops.device('/cpu:1'):
       num2 = variables.Variable(2.0, name='num2')
     with ops.device('/cpu:2'):
       result = num1 + num2 + num1 * num2
     self.evaluate(variables.global_variables_initializer())
     sess.run(result, options=run_options, run_metadata=run_metadata)
   self.assertTrue(run_metadata.HasField('step_stats'))
   step_stats = run_metadata.step_stats
   devices = [d.device for d in step_stats.dev_stats]
   self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:0' in devices)
   self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:1' in devices)
   self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:2' in devices)
   tl = timeline.Timeline(step_stats)
   ctf = tl.generate_chrome_trace_format()
   self._validateTrace(ctf)
   tl = timeline.Timeline(step_stats)
   ctf = tl.generate_chrome_trace_format(show_dataflow=False)
   self._validateTrace(ctf)
   tl = timeline.Timeline(step_stats)
   ctf = tl.generate_chrome_trace_format(show_memory=False)
   self._validateTrace(ctf)
   tl = timeline.Timeline(step_stats)
   ctf = tl.generate_chrome_trace_format(
       show_memory=False, show_dataflow=False)
   self._validateTrace(ctf)
  def testSizeAndClear(self):
    with ops.Graph().as_default() as G:
      with ops.device('/cpu:0'):
        x = array_ops.placeholder(dtypes.float32, name='x')
        pi = array_ops.placeholder(dtypes.int64)
        gi = array_ops.placeholder(dtypes.int64)
        v = 2. * (array_ops.zeros([128, 128]) + x)
      with ops.device(test.gpu_device_name()):
        stager = data_flow_ops.MapStagingArea(
            [dtypes.float32, dtypes.float32],
            shapes=[[], [128, 128]],
            names=['x', 'v'])
        stage = stager.put(pi, {'x': x, 'v': v})
        size = stager.size()
        clear = stager.clear()

    G.finalize()

    with self.session(use_gpu=True, graph=G) as sess:
      sess.run(stage, feed_dict={x: -1, pi: 3})
      self.assertEqual(sess.run(size), 1)
      sess.run(stage, feed_dict={x: -1, pi: 1})
      self.assertEqual(sess.run(size), 2)
      sess.run(clear)
      self.assertEqual(sess.run(size), 0)
Example #5
  def testCopyToGPU(self):
    if not test_util.is_gpu_available():
      self.skipTest("No GPU available")

    with ops.device("/cpu:0"):
      optional_with_value = optional_ops.Optional.from_value(
          (constant_op.constant(37.0), constant_op.constant("Foo"),
           constant_op.constant(42)))
      optional_none = optional_ops.Optional.none_from_structure(
          structure.TensorStructure(dtypes.float32, []))

    with ops.device("/gpu:0"):
      gpu_optional_with_value = optional_ops._OptionalImpl(
          array_ops.identity(optional_with_value._variant_tensor),
          optional_with_value.value_structure)
      gpu_optional_none = optional_ops._OptionalImpl(
          array_ops.identity(optional_none._variant_tensor),
          optional_none.value_structure)

      gpu_optional_with_value_has_value = gpu_optional_with_value.has_value()
      gpu_optional_with_value_values = gpu_optional_with_value.get_value()

      gpu_optional_none_has_value = gpu_optional_none.has_value()

    self.assertTrue(self.evaluate(gpu_optional_with_value_has_value))
    self.assertEqual((37.0, b"Foo", 42),
                     self.evaluate(gpu_optional_with_value_values))
    self.assertFalse(self.evaluate(gpu_optional_none_has_value))
  def benchmarkMatrixExponentialOp(self):
    for shape in self.shapes:
      with ops.Graph().as_default(), \
          session.Session() as sess, \
          ops.device("/cpu:0"):
        matrix = self._GenerateMatrix(shape)
        expm = linalg_impl.matrix_exponential(matrix)
        variables.global_variables_initializer().run()
        self.run_op_benchmark(
            sess,
            control_flow_ops.group(expm),
            min_iters=25,
            name="matrix_exponential_cpu_{shape}".format(
                shape=shape))

      if test.is_gpu_available(True):
        with ops.Graph().as_default(), \
            session.Session() as sess, \
            ops.device("/gpu:0"):
          matrix = self._GenerateMatrix(shape)
          expm = linalg_impl.matrix_exponential(matrix)
          variables.global_variables_initializer().run()
          self.run_op_benchmark(
              sess,
              control_flow_ops.group(expm),
              min_iters=25,
              name="matrix_exponential_gpu_{shape}".format(
                  shape=shape))
Example #7
  def testAnalysisAndAllocations(self):
    run_options = config_pb2.RunOptions(
        trace_level=config_pb2.RunOptions.FULL_TRACE)
    run_metadata = config_pb2.RunMetadata()
    config = config_pb2.ConfigProto(device_count={'CPU': 3})

    with session.Session(config=config) as sess:
      with ops.device('/cpu:0'):
        num1 = variables.Variable(1.0, name='num1')
      with ops.device('/cpu:1'):
        num2 = variables.Variable(2.0, name='num2')
      with ops.device('/cpu:2'):
        result = num1 + num2 + num1 * num2
      self.evaluate(variables.global_variables_initializer())
      sess.run(result, options=run_options, run_metadata=run_metadata)

    self.assertTrue(run_metadata.HasField('step_stats'))
    tl = timeline.Timeline(run_metadata.step_stats)
    step_analysis = tl.analyze_step_stats()
    ctf = step_analysis.chrome_trace.format_to_string()
    self._validateTrace(ctf)
    maximums = step_analysis.allocator_maximums
    cpuname = 'mklcpu' if test_util.IsMklEnabled() else 'cpu'
    self.assertTrue(cpuname in maximums)
    cpu_max = maximums[
        'cuda_host_bfc'] if 'cuda_host_bfc' in maximums else maximums[cpuname]
    # At least num1 + num2, both float32s (4 bytes each)
    self.assertGreaterEqual(cpu_max.num_bytes, 8)
    self.assertGreater(cpu_max.timestamp, 0)
Example #8
def _build_recursive_hd_scatter(input_tensors, devices):
  """Construct the scatter phase of recursive halving-doublng all-reduce.

  Args:
    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
    devices: a list of strings naming the devices on which the reconstituted
      full tensors should be placed.

  Returns:
    list of T `tf.Tensor` which are the fully reduced tensors.
  """
  num_devices = len(devices)
  num_hops = int(math.log(num_devices, 2))
  assert num_devices == (2 ** num_hops), "num_devices must be a power of 2"
  chunks = input_tensors
  for h in reversed(range(0, num_hops)):
    span = 2 ** h
    group_size = span * 2
    new_chunks = [[] for _ in devices]
    for d in range(0, num_devices):
      if (d % group_size) >= (group_size / 2):
        # skip right half of a pair
        continue
      left_idx = d
      right_idx = d + span
      left_dev = devices[left_idx]
      right_dev = devices[right_idx]
      with ops.device(left_dev):
        new_chunks[left_idx] = array_ops.concat([chunks[left_idx],
                                                 chunks[right_idx]], 0)
      with ops.device(right_dev):
        new_chunks[right_idx] = array_ops.concat([chunks[left_idx],
                                                  chunks[right_idx]], 0)
    chunks = new_chunks
  return chunks
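The pairing schedule in the loop above can be traced without TensorFlow. The short pure-Python sketch below (illustrative only, not part of the original module) prints which (left, right) device pairs exchange and concatenate their chunks at each hop for 8 devices.

import math

num_devices = 8
num_hops = int(math.log(num_devices, 2))
for h in reversed(range(num_hops)):
  span = 2 ** h
  group_size = span * 2
  # Devices in the left half of each group pair with the device `span` ahead.
  pairs = [(d, d + span) for d in range(num_devices)
           if (d % group_size) < group_size // 2]
  print("hop", h, "pairs:", pairs)
# hop 2 pairs: [(0, 4), (1, 5), (2, 6), (3, 7)]
# hop 1 pairs: [(0, 2), (1, 3), (4, 6), (5, 7)]
# hop 0 pairs: [(0, 1), (2, 3), (4, 5), (6, 7)]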
  def testSplitPipelineFailsWithPlacementError(self):
    with session.Session(
        target="",
        config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:

      dataset = dataset_ops.Dataset.from_tensors(0)

      # Define a pipeline that attempts to use variables on two
      # different devices.
      #
      # Initialize the variables before creating the iterator, to avoid the
      # placement algorithm overriding the DT_RESOURCE colocation constraints.
      with ops.device("/cpu:0"):
        var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
        dataset = dataset.map(lambda x: x + var_0.read_value())
      sess.run(var_0.initializer)

      with ops.device("/cpu:1"):
        var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
        dataset = dataset.map(lambda x: x + var_1.read_value())
      sess.run(var_1.initializer)

      iterator = dataset.make_initializable_iterator()
      sess.run(iterator.initializer)

      with self.assertRaisesRegexp(
          errors.FailedPreconditionError,
          "Error while reading resource variable Variable"):
        sess.run(iterator.get_next())
  def benchmarkCholeskyOp(self):
    for shape in self.shapes:
      with ops.Graph().as_default(), \
          session.Session() as sess, \
          ops.device("/cpu:0"):
        matrix = variables.Variable(self._GenerateMatrix(shape))
        l = linalg_ops.cholesky(matrix)
        variables.global_variables_initializer().run()
        self.run_op_benchmark(
            sess,
            control_flow_ops.group(
                l,),
            min_iters=25,
            name="cholesky_cpu_{shape}".format(shape=shape))

      if test.is_gpu_available(True):
        with ops.Graph().as_default(), \
            session.Session() as sess, \
            ops.device("/device:GPU:0"):
          matrix = variables.Variable(self._GenerateMatrix(shape))
          l = linalg_ops.cholesky(matrix)
          variables.global_variables_initializer().run()
          self.run_op_benchmark(
              sess,
              control_flow_ops.group(
                  l,),
              min_iters=25,
              name="cholesky_gpu_{shape}".format(shape=shape))
Example #11
def _build_ring_gather(input_tensors, devices, num_subchunks,
                       pred_by_s_d, rank_by_s_d, red_op):
  """Construct a subgraph for the first (reduction) pass of ring all-reduce.

  Args:
    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
      shape and type.
    devices: array of device name strings
    num_subchunks: number of subchunks each device should process in one tick.
    pred_by_s_d: as produced by _ring_permutations
    rank_by_s_d: as produced by _ring_permutations
    red_op: a binary operator for elementwise reduction

  Raises:
    ValueError: tensors must all be one dimensional.

  Returns:
    list of list of T `tf.Tensor` of (partially) reduced values where
    exactly num_subchunks chunks at each device are fully reduced.
  """
  num_devices = len(input_tensors)
  if num_devices == 0:
    return []
  if num_devices == 1:
    return input_tensors
  shape = input_tensors[0].shape
  if 1 != len(shape):
    raise ValueError("input tensors must be 1D")
  num_chunks = num_devices * num_subchunks
  num_ticks = num_devices - 1
  # Initialize chunks_by_dev with splits of the input tensors.
  chunks_by_dev = []
  split_pad_len = 0
  for d in range(0, num_devices):
    with ops.device(devices[d]):
      splits, split_pad_len = _padded_split(input_tensors[d], num_chunks)
      chunks_by_dev.append(splits)
  # Reduction phase
  for tick in range(0, num_ticks):
    # One new partial reduction for every chunk
    new_partial_reductions = [None for _ in range(0, num_chunks)]
    # Compute reductions with respect to last tick's values
    for d in range(0, num_devices):
      with ops.device(devices[d]):
        for s in range(0, num_subchunks):
          rank = rank_by_s_d[s][d]
          seg_index = (rank + num_devices - (2 + tick)) % num_devices
          pred_dev = pred_by_s_d[s][d]
          chunk_index = (seg_index * num_subchunks) + s
          new_partial_reductions[chunk_index] = red_op(
              chunks_by_dev[pred_dev][chunk_index],
              chunks_by_dev[d][chunk_index])
    # Update chunks_by_dev with the new values at the end of the tick.
    for d in range(0, num_devices):
      for s in range(0, num_subchunks):
        rank = rank_by_s_d[s][d]
        seg_index = (rank + num_devices - (2 + tick)) % num_devices
        chunk_index = (seg_index * num_subchunks) + s
        chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index]
  return chunks_by_dev, split_pad_len
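The index arithmetic in the reduction loop is easier to see with scalars. The pure-Python sketch below simulates the same schedule with integers standing in for chunks and operator.add as red_op; the permutation tables are simplified stand-ins (identity ranks, simple ring predecessor) for what _ring_permutations would produce, so this is an illustration rather than the real wiring.

import operator

num_devices, num_subchunks = 4, 1
red_op = operator.add
# Simplified stand-ins for the _ring_permutations outputs.
rank_by_s_d = [[d for d in range(num_devices)] for _ in range(num_subchunks)]
pred_by_s_d = [[(d - 1) % num_devices for d in range(num_devices)]
               for _ in range(num_subchunks)]
num_chunks = num_devices * num_subchunks
# chunks_by_dev[d][c] is device d's contribution to chunk c.
chunks_by_dev = [[10 * d + c for c in range(num_chunks)]
                 for d in range(num_devices)]

for tick in range(num_devices - 1):
  new_partial = [None] * num_chunks
  for d in range(num_devices):
    for s in range(num_subchunks):
      rank = rank_by_s_d[s][d]
      seg_index = (rank + num_devices - (2 + tick)) % num_devices
      chunk_index = seg_index * num_subchunks + s
      new_partial[chunk_index] = red_op(
          chunks_by_dev[pred_by_s_d[s][d]][chunk_index],
          chunks_by_dev[d][chunk_index])
  for d in range(num_devices):
    for s in range(num_subchunks):
      rank = rank_by_s_d[s][d]
      seg_index = (rank + num_devices - (2 + tick)) % num_devices
      chunk_index = seg_index * num_subchunks + s
      chunks_by_dev[d][chunk_index] = new_partial[chunk_index]

# After num_devices - 1 ticks, device d holds the fully reduced chunk d.
for d in range(num_devices):
  assert chunks_by_dev[d][d] == sum(10 * k + d for k in range(num_devices))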
  def _create_ops(self, ds, ds_iterator, buffer_name, device0, device1):
    ds_iterator_handle = ds_iterator.string_handle()

    @function.Defun(dtypes.string)
    def _remote_fn(h):
      remote_iterator = iterator_ops.Iterator.from_string_handle(
          h, ds.output_types, ds.output_shapes)
      return remote_iterator.get_next()

    target = constant_op.constant(device0)
    with ops.device(device1):
      buffer_resource_handle = prefetching_ops.function_buffering_resource(
          f=_remote_fn,
          output_types=[dtypes.float32],
          target_device=target,
          string_arg=ds_iterator_handle,
          buffer_size=3,
          shared_name=buffer_name)

    with ops.device(device1):
      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
          function_buffer_resource=buffer_resource_handle,
          output_types=[dtypes.float32])
      reset_op = prefetching_ops.function_buffering_resource_reset(
          function_buffer_resource=buffer_resource_handle)
      destroy_op = resource_variable_ops.destroy_resource_op(
          buffer_resource_handle, ignore_lookup_error=True)

    return (prefetch_op, reset_op, destroy_op)
Example #13
  def test_subscribe_tensors_on_different_devices(self):
    """Side effect ops are added with the same device of the subscribed op."""
    c1 = constant_op.constant(10)
    c2 = constant_op.constant(20)

    with ops.device('cpu:0'):
      add = math_ops.add(c1, c2)

    with ops.device('cpu:1'):
      mul = math_ops.multiply(c1, c2)

    def sub(t):
      return t

    add_sub = subscribe.subscribe(
        add, lambda t: script_ops.py_func(sub, [t], [t.dtype]))

    mul_sub = subscribe.subscribe(
        mul, lambda t: script_ops.py_func(sub, [t], [t.dtype]))

    # Expect the identity tensors injected by subscribe to have been created
    # on the same device as their original tensors.
    self.assertNotEqual(add_sub.device, mul_sub.device)
    self.assertEqual(add.device, add_sub.device)
    self.assertEqual(mul.device, mul_sub.device)
  def _between_graph_with_monitored_session(self, strategy):
    context = distribute_coordinator_context.get_current_worker_context()
    self.assertTrue(context is not None)
    with ops.device("/job:ps/task:0"):
      # TODO(yuefengz): investigate why not using resource variable will make
      # the test flaky.
      x = variable_scope.get_variable("xx", initializer=10.0, use_resource=True)
    with ops.device("/job:ps/task:1"):
      y = variable_scope.get_variable("yy", initializer=20.0, use_resource=True)

    x_add = x.assign_add(2.0)
    y_sub = y.assign_sub(2.0)
    train_op = control_flow_ops.group([x_add, y_sub])

    # The monitored session will run init or ready ops.
    with monitored_session.MonitoredSession() as sess:
      sess.run(train_op)

      # Synchronize workers after one step to make sure they all have finished
      # training.
      if context.has_barrier:
        context.wait_for_other_workers()
      else:
        self._barrier.wait()

      x_val, y_val = sess.run([x, y])

    self.assertEqual(x_val, 16.0)
    self.assertEqual(y_val, 14.0)
    if x_val == 16.0 and y_val == 14.0:
      with self._lock:
        self._result_correct += 1
def pack_range(key, packing, grad_vars, rng):
  """Form the concatenation of a specified range of gradient tensors.

  Args:
    key: Value under which to store meta-data in packing that will be used
      later to restore the grad_var list structure.
    packing: Dict holding data describing packed ranges of small tensors.
    grad_vars: List of (grad, var) pairs for one replica.
    rng: A pair of integers giving the first, last indices of a consecutive
      range of tensors to be packed.

  Returns:
    A tensor that is the concatenation of all the specified small tensors.
  """
  to_pack = grad_vars[rng[0]:rng[1] + 1]
  members = []
  variables = []
  restore_shapes = []
  with ops.name_scope('pack'):
    for g, v in to_pack:
      variables.append(v)
      restore_shapes.append(g.shape)
      with ops.device(g.device):
        members.append(array_ops.reshape(g, [-1]))
    packing[key] = GradPackTuple(
        indices=range(rng[0], rng[1] + 1),
        vars=variables,
        shapes=restore_shapes)
    with ops.device(members[0].device):
      return array_ops.concat(members, 0)
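A minimal TF 2.x sketch of the pack/unpack round trip that pack_range implements one half of: flatten each gradient, concatenate, then split by the known element counts and reshape to restore the originals. The names below are illustrative and not part of the original module.

import tensorflow as tf

grads = [tf.ones([2, 3]), tf.range(4, dtype=tf.float32)]
shapes = [g.shape for g in grads]
# Pack: flatten each gradient and concatenate into a single 1-D tensor.
packed = tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)
# Unpack: split by the static element counts and restore the original shapes.
sizes = [s.num_elements() for s in shapes]
restored = [tf.reshape(flat, shape)
            for flat, shape in zip(tf.split(packed, sizes), shapes)]
for g, r in zip(grads, restored):
  tf.debugging.assert_near(g, r)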
  def benchmarkMatrixSolveLsOp(self):
    run_gpu_test = test_lib.is_gpu_available(True)
    regularizer = 1.0
    for matrix_shape in self.matrix_shapes:
      for num_rhs in 1, 2, matrix_shape[-1]:

        with ops.Graph().as_default(), \
            session.Session(config=benchmark.benchmark_config()) as sess, \
            ops.device("/cpu:0"):
          matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
          x = linalg_ops.matrix_solve_ls(matrix, rhs, regularizer)
          variables.global_variables_initializer().run()
          self.run_op_benchmark(
              sess,
              control_flow_ops.group(x),
              min_iters=25,
              store_memory_usage=False,
              name=("matrix_solve_ls_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}"
                   ).format(matrix_shape=matrix_shape, num_rhs=num_rhs))

        if run_gpu_test and (len(matrix_shape) < 3 or matrix_shape[0] < 513):
          with ops.Graph().as_default(), \
                session.Session(config=benchmark.benchmark_config()) as sess, \
                ops.device("/gpu:0"):
            matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
            x = linalg_ops.matrix_solve_ls(matrix, rhs, regularizer)
            variables.global_variables_initializer().run()
            self.run_op_benchmark(
                sess,
                control_flow_ops.group(x),
                min_iters=25,
                store_memory_usage=False,
                name=("matrix_solve_ls_gpu_shape_{matrix_shape}_num_rhs_"
                      "{num_rhs}").format(
                          matrix_shape=matrix_shape, num_rhs=num_rhs))
Example #17
def _set_checkpoint_initializer(variable,
                                ckpt_file,
                                tensor_name,
                                slice_spec,
                                name="checkpoint_initializer"):
  """Overrides given variable's initialization op.

  Sets variable initializer to assign op that initializes variable from tensor's
  value in the checkpoint.

  Args:
    variable: `tf.Variable` object.
    ckpt_file: string, full path of the checkpoint.
    tensor_name: Name of the tensor to load from the checkpoint.
    slice_spec: Slice specification for loading partitioned tensors.
    name: Name of the operation.
  """
  base_type = variable.dtype.base_dtype
  # Do not colocate with variable since RestoreV2 op only runs on CPU and
  # colocation will force variable (and other ops that colocate with variable)
  # to be on CPU as well. It is okay to place the variable's initializer op on
  # CPU since it will only be run once at the start.
  with ops.device(variable.device), ops.device("/cpu:0"):
    restore_op = io_ops.restore_v2(
        ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
    if isinstance(variable, resource_variable_ops.ResourceVariable):
      init_op = variable.assign(restore_op, read_value=False)
    else:
      init_op = state_ops.assign(variable, restore_op)
    variable._initializer_op = init_op  # pylint:disable=protected-access
    restore_op.set_shape(variable.shape)
    variable._initial_value = restore_op  # pylint:disable=protected-access
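The nested device scopes above rely on the fact that an inner scope only overrides the fields it sets: job and task placement inherited from the variable's device are kept while the device type is pinned to CPU. A small illustrative graph-mode sketch (not from the original module):

import tensorflow as tf

g = tf.Graph()
with g.as_default():
  # Outer scope mimics a variable placed on a parameter-server GPU; the inner
  # "/cpu:0" overrides only the device type and index.
  with tf.device("/job:ps/task:0/device:GPU:0"), tf.device("/cpu:0"):
    c = tf.constant(1.0)
print(c.device)  # expected: /job:ps/task:0/device:CPU:0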
Example #18
  def __init__(self,
               params,
               device_assigner,
               training=True,
               tree_variables_class=TreeVariables,
               tree_configs=None,
               tree_stats=None):
    self.variables = []
    # Set up some scalar variables to run through the device assigner, then
    # we can use those to colocate everything related to a tree.
    self.device_dummies = []
    with ops.device(device_assigner):
      for i in range(params.num_trees):
        self.device_dummies.append(
            variable_scope.get_variable(name='device_dummy_%d' % i, shape=0))

    for i in range(params.num_trees):
      with ops.device(self.device_dummies[i].device):
        kwargs = {}
        if tree_configs is not None:
          kwargs.update(dict(tree_config=tree_configs[i]))
        if tree_stats is not None:
          kwargs.update(dict(tree_stat=tree_stats[i]))
        self.variables.append(
            tree_variables_class(params, i, training, **kwargs))
Example #19
  def inference_graph(self, input_data, data_spec=None, **inference_args):
    """Constructs a TF graph for evaluating a random forest.

    Args:
      input_data: A tensor or SparseTensor or placeholder for input data.
      data_spec: A list of tf.dtype values specifying the original types of
        each column.
      **inference_args: Keyword arguments to pass through to each tree.

    Returns:
      The last op in the random forest inference graph.
    """
    data_spec = [constants.DATA_FLOAT] if data_spec is None else data_spec
    probabilities = []
    for i in range(self.params.num_trees):
      with ops.device(self.device_assigner.get_device(i)):
        tree_data = input_data
        if self.params.bagged_features:
          tree_data = self._bag_features(i, input_data)
        probabilities.append(self.trees[i].inference_graph(
            tree_data, data_spec, **inference_args))
    with ops.device(self.device_assigner.get_device(0)):
      all_predict = array_ops.pack(probabilities)
      return math_ops.div(
          math_ops.reduce_sum(all_predict, 0), self.params.num_trees,
          name='probabilities')
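The final reduction stacks the per-tree probability tensors and divides by the number of trees; array_ops.pack is the pre-1.0 name of what is now tf.stack. A tiny modern sketch of the same averaging, with illustrative values:

import tensorflow as tf

tree_probs = [tf.constant([[0.2, 0.8]]), tf.constant([[0.6, 0.4]])]
all_predict = tf.stack(tree_probs)            # shape [num_trees, batch, classes]
probabilities = tf.reduce_sum(all_predict, axis=0) / len(tree_probs)
# probabilities -> [[0.4, 0.6]]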
  def testRemoteFunction(self):
    worker_config = config_pb2.ConfigProto()
    worker_config.device_count["CPU"] = 2
    worker, _ = test_util.create_local_cluster(
        1, 1, worker_config=worker_config)

    @function.Defun(dtypes.int32, dtypes.int32)
    def _remote_fn(a, b):
      return math_ops.multiply(a, b)

    with ops.device("/job:ps/task:0"):
      a = variables.Variable(2, dtype=dtypes.int32)
      b = variables.Variable(3, dtype=dtypes.int32)

    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
      remote_op = functional_ops.remote_call(
          args=[a, b],
          Tout=[dtypes.int32],
          f=_remote_fn,
          target="/job:worker/replica:0/task:0/cpu:1")

    with session.Session(worker[0].target) as sess:
      self.evaluate(variables.global_variables_initializer())
      mul = self.evaluate(remote_op)
      self.assertEqual(mul, [6])
 def Body(x, y):
   with ops.device("/gpu:0"):
     a = x + x
     b = y + y
   with ops.device("/cpu:0"):
     c = a + b
     return c
  def benchmarkMatrixBandPartOp(self):
    for shape_ in self.shapes:
      for limits in (-1, -1), (-1, 0), (0, -1), (2, 2):
        with ops.Graph().as_default(), \
            session.Session() as sess, \
            ops.device("/cpu:0"):
          matrix = variables.Variable(array_ops.ones(shape_))
          band = array_ops.matrix_band_part(matrix, limits[0], limits[1])
          variables.global_variables_initializer().run()
          self.run_op_benchmark(
              sess,
              control_flow_ops.group(band),
              min_iters=10,
              name="matrix_band_part_cpu_{shape}_{limits}".format(
                  shape=shape_, limits=limits))

        if test_lib.is_gpu_available(True):
          with ops.Graph().as_default(), \
              session.Session() as sess, \
              ops.device("/gpu:0"):
            matrix = variables.Variable(array_ops.ones(shape_))
            band = array_ops.matrix_band_part(matrix, limits[0], limits[1])
            variables.global_variables_initializer().run()
            self.run_op_benchmark(
                sess,
                control_flow_ops.group(band),
                min_iters=10,
                name="matrix_band_part_gpu_{shape}_{limits}".format(
                    shape=shape_, limits=limits))
  def benchmarkMatrixDeterminantOp(self):
    for shape in self.shapes:
      with ops.Graph().as_default(), session.Session(
          config=benchmark.benchmark_config()) as sess, ops.device("/cpu:0"):
        matrix = self._GenerateMatrix(shape)
        d = linalg_ops.matrix_determinant(matrix)
        variables.global_variables_initializer().run()
        self.run_op_benchmark(
            sess,
            control_flow_ops.group(
                d,),
            min_iters=25,
            name="matrix_determinant_cpu_{shape}".format(shape=shape))

      if test.is_gpu_available(True):
        with ops.Graph().as_default(), session.Session(
            config=benchmark.benchmark_config()) as sess, ops.device("/gpu:0"):
          matrix = self._GenerateMatrix(shape)
          d = linalg_ops.matrix_determinant(matrix)
          variables.global_variables_initializer().run()
          self.run_op_benchmark(
              sess,
              control_flow_ops.group(
                  d,),
              min_iters=25,
              name="matrix_determinant_gpu_{shape}".format(shape=shape))
Example #24
  def __init__(self,
               input_dataset,
               device,
               buffer_size):
    with ops.device("/device:CPU:0"):
      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
      input_iterator_handle = core_gen_dataset_ops.iterator_to_string_handle(
          self._resource)

    self._device = device

    @function.Defun(dtypes.string)
    def _prefetch_fn(handle):
      """Prefetches one element from `input_iterator`."""
      remote_iterator = iterator_ops.Iterator.from_string_handle(
          handle, self.output_types, self.output_shapes, self.output_classes)
      ret = remote_iterator.get_next()
      return nest.flatten(sparse.serialize_sparse_tensors(ret))

    _prefetch_fn.add_to_graph(None)

    with ops.device(device):
      self._buffering_resource = function_buffering_resource(
          f=_prefetch_fn,
          output_types=self._flat_output_types,
          target_device=gen_dataset_ops.iterator_get_device(self._resource),
          string_arg=input_iterator_handle,
          buffer_size=buffer_size,
          shared_name=iterator_ops._generate_shared_name(
              "function_buffer_resource"))
  def __init__(self,
               input_dataset,
               device,
               buffer_size):
    with ops.device("/device:CPU:0"):
      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
      input_iterator_handle = gen_dataset_ops.iterator_to_string_handle(
          self._resource)

    self._device = device

    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
    def _prefetch_fn(handle):
      """Prefetches one element from `input_iterator`."""
      remote_iterator = iterator_ops.Iterator.from_string_handle(
          handle, self.output_types, self.output_shapes, self.output_classes)
      ret = remote_iterator.get_next()
      return nest.flatten(sparse.serialize_sparse_tensors(ret))

    self._prefetch_fn = _prefetch_fn._get_concrete_function_internal()  # pylint: disable=protected-access

    with ops.device(device):
      self._buffering_resource = function_buffering_resource(
          f=self._prefetch_fn,
          output_types=self._flat_output_types,
          target_device=ged_ops.experimental_iterator_get_device(
              self._resource),
          string_arg=input_iterator_handle,
          buffer_size=buffer_size,
          shared_name=iterator_ops._generate_shared_name(
              "function_buffer_resource"))
Example #26
  def testReturnsSingleCheckpointIfOneShardedCheckpoint(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(),
                                  'one_checkpoint_found_sharded')
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    global_step = variables.get_or_create_global_step()

    # This will result in 3 different checkpoint shard files.
    with ops.device('/cpu:0'):
      variables_lib.Variable(10, name='v0')
    with ops.device('/cpu:1'):
      variables_lib.Variable(20, name='v1')

    saver = saver_lib.Saver(sharded=True)

    with session_lib.Session(
        target='',
        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as session:

      session.run(variables_lib.global_variables_initializer())
      save_path = os.path.join(checkpoint_dir, 'model.ckpt')
      saver.save(session, save_path, global_step=global_step)

    num_found = 0
    for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
Example #27
 def _TestRandomGraphWithDevices(self,
                                 sess,
                                 seed,
                                 op_placement,
                                 devices,
                                 debug_mode=False):
   data = []
   shape = (self._dim, self._dim)
   feed_dict = {}
   # Initialize the matrices
   for i in range(len(devices)):
     with ops.device(devices[i]):
       var = array_ops.placeholder(dtypes.float32, shape=shape)
       np.random.seed(seed + i)
       feed_dict[var] = np.random.uniform(
           low=0, high=0.1, size=shape).astype(np.float32)
       data.append(var)
   # Run the 'add' operations on those matrices
   for op in op_placement:
     with ops.device(devices[op[2]]):
       data[op[2]] = math_ops.add(data[op[0]], data[op[1]])
   with ops.device('/cpu:0'):
     s = data[0]
     for i in range(1, len(data)):
       s = math_ops.add(s, data[i])
   if debug_mode:
     logging.info(ops.get_default_graph().as_graph_def())
   result = sess.run(s, feed_dict=feed_dict)
   self._LogMatrix(result, self._dim)
   return result
Example #28
  def testServerDefChanged(self):
    """Update server def, and run ops on new cluster."""
    context.set_server_def(
        server_def=get_server_def(
            ALT_JOB_NAME,
            local_server_port=0,
            remote_server_addresses=[
                self._cached_server1_target, self._cached_server2_target
            ],
            task_index=0))

    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % ALT_JOB_NAME):
      x1 = array_ops.ones([2, 2])
    y = math_ops.matmul(x1, x1)
    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())

    # Set the server def back to JOB_NAME
    context.set_server_def(
        server_def=get_server_def(
            JOB_NAME,
            local_server_port=0,
            remote_server_addresses=[
                self._cached_server1_target, self._cached_server2_target
            ],
            task_index=0))

    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME):
      x1 = array_ops.ones([2, 2])
    y = math_ops.matmul(x1, x1)
    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
  def testDictionary(self):
    with ops.Graph().as_default() as G:
      with ops.device('/cpu:0'):
        x = array_ops.placeholder(dtypes.float32)
        pi = array_ops.placeholder(dtypes.int64)
        gi = array_ops.placeholder(dtypes.int64)
        v = 2. * (array_ops.zeros([128, 128]) + x)
      with ops.device(test.gpu_device_name()):
        stager = data_flow_ops.MapStagingArea(
            [dtypes.float32, dtypes.float32],
            shapes=[[], [128, 128]],
            names=['x', 'v'])
        stage = stager.put(pi, {'x': x, 'v': v})
        key, ret = stager.get(gi)
        z = ret['x']
        y = ret['v']
        y = math_ops.reduce_max(z * math_ops.matmul(y, y))

    G.finalize()

    with self.session(use_gpu=True, graph=G) as sess:
      sess.run(stage, feed_dict={x: -1, pi: 0})
      for i in range(10):
        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
        self.assertAllClose(
            4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
  def testFunctionWithResourcesOnDifferentDevices(self):
    if not test_util.is_gpu_available():
      self.skipTest("No GPUs available.")

    with ops.device("/cpu:0"):
      v_cpu_zero = resource_variable_ops.ResourceVariable(
          [0.0, 1.0, 2.0], name="v_cpu_zero")

    with ops.device("/cpu:1"):
      v_cpu_one = resource_variable_ops.ResourceVariable(
          [0.0, 1.0, 2.0], name="v_cpu_one")

    with ops.device("/gpu:0"):
      v_gpu = resource_variable_ops.ResourceVariable(
          [0.0, 1.0, 2.0], name="v_gpu")

    def sum_gather():
      cpu_result = math_ops.reduce_sum(array_ops.gather(v_cpu_zero, [1, 2]))
      also_cpu_result = math_ops.reduce_sum(array_ops.gather(v_cpu_one, [1, 2]))
      gpu_result = math_ops.reduce_sum(array_ops.gather(v_gpu, [1, 2]))
      return cpu_result, also_cpu_result, gpu_result

    defined = function.Defun()(sum_gather)
    with self.test_session(
        config=config_pb2.ConfigProto(
            allow_soft_placement=False,
            log_device_placement=True,
            device_count={"CPU": 2})) as sess:
      self.evaluate(variables.global_variables_initializer())
      expected = self.evaluate(sum_gather())
      result = sess.run(
          functional_ops.partitioned_call(
              args=defined.captured_inputs, f=defined))
      self.assertAllEqual(expected, result)
Example #31
    def batch_all_reduce(self,
                         input_tensor_packs,
                         communication_hint='AUTO',
                         timeout=0):
        """Batch all-reduce dense tensors.

    This takes a list of batches of tensors. Using multiple batches has the
    benefit that the all-reduce does not need to wait for all inputs to be ready
    before it can start.

    This can be called in eager mode if an async executor is supplied when
    creating the launcher.

    Args:
      input_tensor_packs: a list of lists of dense tensors.
      communication_hint: string providing hint to runtime for choosing
        collective implementation.
      timeout: a float. The timeout in seconds.

    Returns:
      A flat list of reduced tensors.
    """
        # We don't batch with concat in eager. It's easy to get wrong because we
        # need to avoid any numpy() calls on values produced by the async
        # executor. This effectively disables batching in eager, but it is
        # unlikely that a large number of tensors would be all-reduced in eager
        # mode anyway.
        batch_with_concat = (not self._use_scoped_allocator()
                             and not context.executing_eagerly())
        outputs = []
        for pack in input_tensor_packs:
            # TODO(b/169168846): insert a parallel all_gather to verify packings
            # are the same on each replica.
            if batch_with_concat:
                with ops.device(self._device):
                    flat_tensors = [array_ops.reshape(t, [-1]) for t in pack]
                    shapes = [array_ops.shape(t) for t in pack]
                    if communication_hint == 'NCCL' and outputs:
                        control_input = outputs[-1]
                    else:
                        control_input = None
                    reduced = self.all_reduce(
                        array_ops.concat(flat_tensors, axis=0), control_input,
                        communication_hint, timeout)
                    num_elements = [math_ops.reduce_prod(s) for s in shapes]
                    flat_outputs = array_ops.split(reduced,
                                                   num_elements,
                                                   axis=0)
                    for shape, flat_output in zip(shapes, flat_outputs):
                        outputs.append(array_ops.reshape(flat_output, shape))
            else:
                # By placing all CollectiveReduce ops in a batch under single name
                # scope, we ensure they will be picked up by the `ScopedAllocator`
                # grappler optimizer and packed into a single all-reduce.
                with ops.name_scope('allreduce'):
                    for input_tensor in pack:
                        if communication_hint == 'NCCL' and outputs:
                            control_input = outputs[-1]
                        else:
                            control_input = None
                        outputs.append(
                            self.all_reduce(input_tensor, control_input,
                                            communication_hint, timeout))

        return outputs
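The batch_with_concat branch can be exercised without collectives: the sketch below (TF 2.x, illustrative only) performs the same flatten/concat, then splits by the runtime element counts and reshapes, with the concatenated tensor itself standing in for the all-reduce output.

import tensorflow as tf

pack = [tf.ones([2, 2]), tf.range(3, dtype=tf.float32)]
flat_tensors = [tf.reshape(t, [-1]) for t in pack]
shapes = [tf.shape(t) for t in pack]
reduced = tf.concat(flat_tensors, axis=0)     # stand-in for the all-reduce result
num_elements = [tf.reduce_prod(s) for s in shapes]
flat_outputs = tf.split(reduced, num_elements, axis=0)
outputs = [tf.reshape(flat, shape) for shape, flat in zip(shapes, flat_outputs)]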
Example #32
  def __init__(self, input_dataset, target_device, source_device="/cpu:0"):
    """Constructs a _CopyToDeviceDataset.

    Args:
      input_dataset: `Dataset` to be copied
      target_device: The name of the device to which elements would be copied.
      source_device: Device where input_dataset would be placed.
    """
    self._input_dataset = input_dataset
    self._target_device = target_device
    spec = framework_device.DeviceSpec().from_string(self._target_device)
    self._is_gpu_target = (spec.device_type == "GPU")
    self._source_device_string = source_device
    self._source_device = ops.convert_to_tensor(source_device)

    wrap_ds_variant = gen_dataset_ops.wrap_dataset_variant(
        self._input_dataset._variant_tensor)  # pylint: disable=protected-access

    @function.defun()
    def _init_func():
      """Creates an iterator for the input dataset.

      Returns:
        A `string` tensor that encapsulates the iterator created.
      """
      ds_variant = gen_dataset_ops.unwrap_dataset_variant(wrap_ds_variant)
      resource = gen_dataset_ops.anonymous_iterator(
          **dataset_ops.flat_structure(self._input_dataset))
      with ops.control_dependencies(
          [gen_dataset_ops.make_iterator(ds_variant, resource)]):
        return gen_dataset_ops.iterator_to_string_handle(resource)

    init_func_concrete = _init_func._get_concrete_function_internal()  # pylint: disable=protected-access

    @function.defun()
    def _remote_init_func():
      return functional_ops.remote_call(
          target=self._source_device,
          args=init_func_concrete.captured_inputs,
          Tout=[dtypes.string],
          f=init_func_concrete)

    self._init_func = _remote_init_func._get_concrete_function_internal()  # pylint: disable=protected-access
    self._init_captured_args = self._init_func.captured_inputs

    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
    def _next_func(string_handle):
      """Calls get_next for created iterator.

      Args:
        string_handle: An iterator string handle created by _init_func
      Returns:
        The elements generated from `input_dataset`
      """
      with ops.device(self._source_device_string):
        iterator = iterator_ops.Iterator.from_string_handle(
            string_handle,
            dataset_ops.get_legacy_output_types(self),
            dataset_ops.get_legacy_output_shapes(self),
            dataset_ops.get_legacy_output_classes(self))
      return self._element_structure._to_tensor_list(iterator.get_next())  # pylint: disable=protected-access

    next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access

    @function.defun_with_attributes(
        input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
        attributes={"experimental_ints_on_device": True})
    def _remote_next_func(string_handle):
      return functional_ops.remote_call(
          target=self._source_device,
          args=[string_handle] +
          next_func_concrete.captured_inputs,
          Tout=self._input_dataset._element_structure._flat_types,  # pylint: disable=protected-access
          f=next_func_concrete)

    self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
    self._next_captured_args = self._next_func.captured_inputs

    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
    def _finalize_func(string_handle):
      """Destroys the iterator resource created.

      Args:
        string_handle: An iterator string handle created by _init_func
      Returns:
        Tensor constant 0
      """
      iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
          string_handle,
          **dataset_ops.flat_structure(self._input_dataset))
      with ops.control_dependencies([
          resource_variable_ops.destroy_resource_op(
              iterator_resource, ignore_lookup_error=True)]):
        return array_ops.constant(0, dtypes.int64)

    finalize_func_concrete = _finalize_func._get_concrete_function_internal()  # pylint: disable=protected-access

    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
    def _remote_finalize_func(string_handle):
      return functional_ops.remote_call(
          target=self._source_device,
          args=[string_handle] + finalize_func_concrete.captured_inputs,
          Tout=[dtypes.int64],
          f=finalize_func_concrete)

    self._finalize_func = _remote_finalize_func._get_concrete_function_internal(  # pylint: disable=protected-access
    )
    self._finalize_captured_args = self._finalize_func.captured_inputs

    g = ops.get_default_graph()
    self._init_func.add_to_graph(g)
    self._next_func.add_to_graph(g)
    self._finalize_func.add_to_graph(g)
    # pylint: enable=protected-access

    with ops.device(self._target_device):
      variant_tensor = gen_dataset_ops.generator_dataset(
          self._init_captured_args,
          self._next_captured_args,
          self._finalize_captured_args,
          init_func=self._init_func,
          next_func=self._next_func,
          finalize_func=self._finalize_func,
          **dataset_ops.flat_structure(self._input_dataset))
    super(_CopyToDeviceDataset, self).__init__(input_dataset, variant_tensor)
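This class backs the copy_to_device transformation in this version of TF; a typical (hedged) usage sketch with the public API, targeting "/cpu:0" only so the sketch runs on CPU-only machines (the usual target would be a GPU device):

import tensorflow as tf

dataset = tf.data.Dataset.range(4)
dataset = dataset.apply(tf.data.experimental.copy_to_device("/cpu:0"))
for element in dataset:
  print(element.numpy())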
Example #33
 def test_application_model(self, app):
     # Run on CPU since model weights may exhaust GPU memory
     with policy.policy_scope('mixed_float16'), ops.device('/CPU:0'):
         app(weights=None)
Example #34
    def test_save_variable_devices(self, save_devices, meta_graph_only):
        context._reset_context()
        cpus = context.context().list_physical_devices("CPU")
        if len(cpus) == 1:
            context.context().set_logical_device_configuration(
                cpus[0], [
                    context.LogicalDeviceConfiguration(),
                    context.LogicalDeviceConfiguration()
                ])
        context.ensure_initialized()

        root = tracking.AutoTrackable()
        with ops.device("CPU:0"):
            root.v0 = variables.Variable(1., name="v0")
        with ops.device("CPU:1"):
            root.v1 = variables.Variable(1., name="v1")

        options = save_options.SaveOptions(
            experimental_variable_policy=save_devices)
        file_name = os.path.join(self.get_temp_dir(), "saved_model")
        if meta_graph_only:
            save.export_meta_graph(obj=root,
                                   filename=file_name,
                                   options=options)
        else:
            save.save(obj=root, export_dir=file_name, options=options)

        meta = None
        if meta_graph_only:
            meta = meta_graph.read_meta_graph_file(file_name)
        else:
            meta = loader_impl.parse_saved_model(file_name).meta_graphs[0]

        # Check devices in meta graph nodes.
        graph_def = meta.graph_def
        v0 = next((n for n in graph_def.node if n.name == "v0"), None)
        v1 = next((n for n in graph_def.node if n.name == "v1"), None)
        self.assertIsNotNone(v0)
        self.assertIsNotNone(v1)
        if save_devices == save_options.VariablePolicy.SAVE_VARIABLE_DEVICES:
            self.assertIn("CPU:0", v0.device)
            self.assertIn("CPU:1", v1.device)
        else:
            self.assertEmpty(v0.device)
            self.assertEmpty(v1.device)

        # Check devices in object graph nodes.
        object_graph_def = meta.object_graph_def
        v0 = next((n.variable for n in object_graph_def.nodes
                   if n.HasField("variable") and n.variable.name == "v0"),
                  None)
        v1 = next((n.variable for n in object_graph_def.nodes
                   if n.HasField("variable") and n.variable.name == "v1"),
                  None)
        self.assertIsNotNone(v0)
        self.assertIsNotNone(v1)
        if save_devices == save_options.VariablePolicy.SAVE_VARIABLE_DEVICES:
            self.assertIn("CPU:0", v0.device)
            self.assertIn("CPU:1", v1.device)
        else:
            self.assertEmpty(v0.device)
            self.assertEmpty(v1.device)
Example #35
    def all_reduce_indexed_slices(self,
                                  input_slices,
                                  communication_hint='AUTO',
                                  timeout=0):
        """All-reduce an IndexedSlices.

    This method must be called inside a tf.function.

    Args:
      input_slices: an IndexedSlices.
      communication_hint: string providing hint to runtime for choosing
        collective implementation.
      timeout: a float. The timeout in seconds.

    Returns:
      The reduced IndexedSlices.

    Raises:
      RuntimeError: if called in eager mode.
    """
        if context.executing_eagerly():
            raise RuntimeError(
                'all_reduce_indexed_slices in eager mode is not supported')

        # Current CollectiveAllGather implementations require input IndexedSlices to
        # have consistent length across the board, so we handle the reduction of
        # IndexedSlices as follows:
        #   1. Gather the lengths of IndexedSlices from all participants.
        #   2. If they have consistent length, apply all_gather.
        #   3. Otherwise convert IndexedSlices to dense tensors and apply
        #      all_reduce.
        with ops.device(self._device):

            def all_gather():
                """Use all_gather to aggregate `IndexedSlices`."""
                all_values = self._all_gather(input_slices.values,
                                              communication_hint,
                                              timeout=timeout)
                # Add control dependency to order the all-gather.
                control = [all_values] if communication_hint == 'NCCL' else []
                with ops.control_dependencies(control):
                    all_indices = self._all_gather(input_slices.indices,
                                                   communication_hint,
                                                   timeout=timeout)
                return ops.IndexedSlices(values=all_values,
                                         indices=all_indices,
                                         dense_shape=input_slices.dense_shape)

            def densify_and_all_reduce():
                """Use all_reduce to aggregate `IndexedSlices`."""
                densified = ops.convert_to_tensor(input_slices)
                reduced = self.all_reduce(
                    densified,
                    communication_hint=communication_hint,
                    timeout=timeout)
                # We have to convert dense grad to IndexedSlice because all_reduce()
                # and all_gather() must have the same return type as required by
                # control_flow_ops.cond.
                return ops.IndexedSlices(values=reduced,
                                         indices=math_ops.range(
                                             array_ops.shape(reduced)[0]),
                                         dense_shape=input_slices.dense_shape)

            length = array_ops.shape(input_slices.indices)
            all_lengths = self._all_gather(length,
                                           communication_hint,
                                           timeout=timeout)
            return control_flow_ops.cond(
                math_ops.equal(math_ops.reduce_max(all_lengths),
                               math_ops.reduce_min(all_lengths)), all_gather,
                densify_and_all_reduce)
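The densify path in densify_and_all_reduce() relies on the standard IndexedSlices-to-dense conversion. A small eager sketch of that step (illustrative, not part of the launcher):

import tensorflow as tf

slices = tf.IndexedSlices(values=tf.ones([2, 3]),
                          indices=tf.constant([0, 2]),
                          dense_shape=tf.constant([4, 3]))
dense = tf.convert_to_tensor(slices)   # dense [4, 3] tensor; rows 1 and 3 are zero
# After an all-reduce on `dense`, the result is wrapped back into IndexedSlices
# covering every row, so both cond branches return the same type.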
Example #36
def aggregate_gradients_using_hierarchical_copy(avail_devices, replica_grads):
    """Aggregate gradients using hierarchical copies.

  Args:
    avail_devices: available GPU devices.
    replica_grads: List of lists of (gradient, variable) tuples. The outer list
      is over replicas. The inner list is over individual gradients.

  Returns:
    The list of (aggregated_gradient, variable), where the gradient has been
      summed across all replicas and the variable is chosen from the first
      replica.
  """
    # This only works for DGX-1 type of machine topology
    # Device peer to peer matrix
    # DMA: 0 1 2 3 4 5 6 7
    # 0:   Y Y Y Y Y N N N
    # 1:   Y Y Y Y N Y N N
    # 2:   Y Y Y Y N N Y N
    # 3:   Y Y Y Y N N N Y
    # 4:   Y N N N Y Y Y Y
    # 5:   N Y N N Y Y Y Y
    # 6:   N N Y N Y Y Y Y
    # 7:   N N N Y Y Y Y Y
    agg_grads = []
    num_devices = len(avail_devices)
    # In the special case of DGX-1 machine topology, the two groups have equal
    # size.
    group_size = num_devices // 2
    for i, single_grads in enumerate(zip(*replica_grads)):
        group_0_main_device = i % num_devices
        group_1_main_device = (group_0_main_device + group_size) % num_devices
        if group_0_main_device < group_size:
            group_0_begin = 0
            group_1_begin = group_size
        else:
            group_0_begin = group_size
            group_1_begin = 0

        # Aggregate the first group.
        group_0_device_grads = single_grads[group_0_begin:group_0_begin +
                                            group_size]
        with ops.device(avail_devices[group_0_main_device]):
            group_0_agg_grads, _ = aggregate_single_gradient_using_copy(
                group_0_device_grads, False, False)

        # Aggregate the second group.
        group_1_device_grads = single_grads[group_1_begin:group_1_begin +
                                            group_size]
        with ops.device(avail_devices[group_1_main_device]):
            group_1_agg_grads, _ = aggregate_single_gradient_using_copy(
                group_1_device_grads, False, False)

        # Aggregate between the groups.
        with ops.device(avail_devices[group_0_main_device]):
            (agg_total_grads, _), _ = aggregate_single_gradient_using_copy(
                [group_0_agg_grads, group_1_agg_grads], False, False)

        # Broadcast the result back into the root of each group.
        with ops.device(avail_devices[group_0_main_device]):
            group_0_agg_grads_bcast = array_ops.identity(agg_total_grads)
        with ops.device(avail_devices[group_1_main_device]):
            group_1_agg_grads_bcast = array_ops.identity(agg_total_grads)

        agg_grads_bcast = []
        for j in range(len(single_grads)):
            with ops.device(avail_devices[j]):
                # Broadcast the result back to each member in the group from the root.
                if (group_0_main_device < group_size) == (j < group_size):
                    src_device_grad = group_0_agg_grads_bcast
                else:
                    src_device_grad = group_1_agg_grads_bcast
                agg_grads_bcast.append(array_ops.identity(src_device_grad))

        agg_grads.append([(g, v)
                          for g, (_, v) in zip(agg_grads_bcast, single_grads)])

    agg_grads = list(zip(*agg_grads))

    return agg_grads
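The group bookkeeping above rotates which device acts as each group's aggregation root per gradient index. A pure-Python trace (illustrative only) for an 8-device topology:

num_devices, group_size = 8, 8 // 2
for i in range(4):   # first few gradient indices
  group_0_main_device = i % num_devices
  group_1_main_device = (group_0_main_device + group_size) % num_devices
  if group_0_main_device < group_size:
    group_0, group_1 = range(0, group_size), range(group_size, num_devices)
  else:
    group_0, group_1 = range(group_size, num_devices), range(0, group_size)
  print("grad", i, "group 0 root", group_0_main_device, list(group_0),
        "group 1 root", group_1_main_device, list(group_1))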
Example #37
    def all_gather(self,
                   input_tensor,
                   axis,
                   communication_hint='AUTO',
                   timeout=0):
        """All-gather a dense tensor.

    This method must be called inside a tf.function.

    Args:
      input_tensor: a dense tensor. It must have the same rank on all replicas,
        and dimensions other than `axis` need to be the same as well.
      axis: 0-D int32 Tensor. Dimension along which to gather. Must be in the
        range [0, rank(value)).
      communication_hint: string providing hint to runtime for choosing
        collective implementation. Available options are `AUTO`, `NCCL`, and
        `RING`.
      timeout: a float. The timeout in seconds.

    Returns:
      The gathered Tensor.

    Raises:
      RuntimeError: if called in eager mode.
    """
        if context.executing_eagerly():
            raise RuntimeError('all_gather in eager mode is not supported')

        with ops.device(self._device), \
             ops.control_dependencies([array_ops.identity(input_tensor)]):
            # 1. Transpose
            # E.g. Given an input_tensor with shape [2,2,5,1] and axis to gather is 3,
            # we use perm_pre=[3 0 1 2] to transpose it to [1,2,2,5], which
            # brings the 3rd dim first; afterwards we use perm_after=[1,2,3,0] to
            # place it back.
            perm_pre = array_ops.concat(
                ([axis], math_ops.range(axis),
                 math_ops.range(axis + 1, array_ops.rank(input_tensor))),
                axis=0)
            input_tensor_t = array_ops.transpose(input_tensor, perm=perm_pre)
            # 2. Pad
            gathered_shape = self._all_gather(array_ops.expand_dims_v2(
                array_ops.shape_v2(input_tensor_t), axis=0),
                                              communication_hint,
                                              timeout=timeout)
            first_dims = gathered_shape[:, 0]
            full_axis_dim = math_ops.reduce_max(first_dims)
            padded_input_tensor = _pad_util(input_tensor_t, full_axis_dim)

            # 3. Gather
            gather_padded_out_tensor = self._all_gather(padded_input_tensor,
                                                        communication_hint,
                                                        timeout=timeout)
            # 4. Unpad
            split_tensors = []
            for i in range(self._group_size):
                start_pos = i * full_axis_dim
                split_tensors.append(
                    gather_padded_out_tensor[start_pos:start_pos +
                                             first_dims[i]])
            out_tensor_t = array_ops.concat(split_tensors, 0)

            # 5. Transpose back
            perm_after = array_ops.concat(
                (math_ops.range(1, axis + 1), [0],
                 math_ops.range(axis + 1, array_ops.rank(input_tensor_t))),
                axis=0)
            return array_ops.transpose(out_tensor_t, perm=perm_after)
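The two permutations computed above are inverses of each other; this can be checked numerically with NumPy for the shape and axis used in the comment (an illustrative check, not part of the launcher):

import numpy as np

x = np.arange(2 * 2 * 5 * 1).reshape(2, 2, 5, 1)
axis, rank = 3, x.ndim
perm_pre = [axis] + list(range(axis)) + list(range(axis + 1, rank))        # [3, 0, 1, 2]
xt = np.transpose(x, perm_pre)                                             # shape (1, 2, 2, 5)
perm_after = list(range(1, axis + 1)) + [0] + list(range(axis + 1, rank))  # [1, 2, 3, 0]
assert np.array_equal(np.transpose(xt, perm_after), x)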
    def _init_from_args(self,
                        initial_value=None,
                        trainable=True,
                        collections=None,
                        validate_shape=True,
                        caching_device=None,
                        name=None,
                        dtype=None,
                        constraint=None):
        """Creates a variable.

    Args:
      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
        which is the initial value for the Variable. The initial value must have
        a shape specified unless `validate_shape` is set to False. Can also be a
        callable with no argument that returns the initial value when called.
        (Note that initializer functions from init_ops.py must first be bound
         to a shape before being used here.)
      trainable: If `True`, the default, also adds the variable to the graph
        collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
        the default list of variables to use by the `Optimizer` classes.
      collections: List of graph collections keys. The new variable is added to
        these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
      validate_shape: Ignored. Provided for compatibility with tf.Variable.
      caching_device: Optional device string or function describing where the
        Variable should be cached for reading.  Defaults to the Variable's
        device.  If not `None`, caches on another device.  Typical use is to
        cache on the device where the Ops using the Variable reside, to
        deduplicate copying through `Switch` and other conditional statements.
      name: Optional name for the variable. Defaults to `'Variable'` and gets
        uniquified automatically.
      dtype: If set, initial_value will be converted to the given type.
        If None, either the datatype will be kept (if initial_value is
        a Tensor) or float32 will be used (if it is a Python object convertible
        to a Tensor).
      constraint: An optional projection function to be applied to the variable
        after being updated by an `Optimizer` (e.g. used to implement norm
        constraints or value constraints for layer weights). The function must
        take as input the unprojected Tensor representing the value of the
        variable and return the Tensor for the projected value
        (which must have the same shape). Constraints are not safe to
        use when doing asynchronous distributed training.

    Raises:
      ValueError: If the initial value is not specified, or does not have a
        shape and `validate_shape` is `True`.
    """
        if initial_value is None:
            raise ValueError("initial_value must be specified.")
        init_from_fn = callable(initial_value)

        if collections is None:
            collections = [ops.GraphKeys.GLOBAL_VARIABLES]
        if not isinstance(collections, (list, tuple, set)):
            raise ValueError(
                "collections argument to Variable constructor must be a list, tuple, "
                "or set. Got %s of type %s" % (collections, type(collections)))
        if constraint is not None and not callable(constraint):
            raise ValueError("The `constraint` argument must be a callable.")

        self._trainable = trainable
        if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
            collections = list(collections) + [
                ops.GraphKeys.TRAINABLE_VARIABLES
            ]
        self._save_slice_info = None
        self._in_graph_mode = context.in_graph_mode()
        with ops.control_dependencies(None):
            with ops.name_scope(
                    name, "Variable",
                [] if init_from_fn else [initial_value]) as name:
                # pylint: disable=protected-access
                handle_name = ops._name_from_scope_name(name)
                if init_from_fn:
                    # Use attr_scope and device(None) to simulate the behavior of
                    # colocate_with when the variable we want to colocate with doesn't
                    # yet exist.
                    if self._in_graph_mode:
                        attr = attr_value_pb2.AttrValue(
                            list=attr_value_pb2.AttrValue.ListValue(
                                s=[compat.as_bytes("loc:@%s" % handle_name)]))
                        with ops.get_default_graph()._attr_scope(
                            {"_class": attr}):
                            with ops.name_scope("Initializer"), ops.device(
                                    None):
                                initial_value = ops.convert_to_tensor(
                                    initial_value(),
                                    name="initial_value",
                                    dtype=dtype)
                            self._handle = _eager_safe_variable_handle(
                                shape=initial_value.get_shape(),
                                dtype=initial_value.dtype.base_dtype,
                                shared_name=handle_name,
                                name=name)
                            self._handle_device = (
                                self._handle.device if self._in_graph_mode else
                                context.get_default_context().device_name)
                    else:
                        initial_value = initial_value()
                        with ops.name_scope("Initializer"):
                            initial_value = ops.convert_to_tensor(
                                initial_value,
                                name="initial_value",
                                dtype=dtype)
                        self._handle = _eager_safe_variable_handle(
                            shape=initial_value.get_shape(),
                            dtype=initial_value.dtype.base_dtype,
                            shared_name=handle_name,
                            name=name,
                            container="")
                        self._handle_device = (
                            self._handle.device if self._in_graph_mode else
                            context.get_default_context().device_name)
                # pylint: enable=protected-access

                # Or get the initial value from a Tensor or Python object.
                else:
                    with ops.name_scope("Initializer"):
                        initial_value = ops.convert_to_tensor(
                            initial_value, name="initial_value", dtype=dtype)
                    # pylint: disable=protected-access
                    if (self._in_graph_mode and initial_value is not None
                            and initial_value.op._get_control_flow_context()
                            is not None):
                        raise ValueError(
                            "Initializer for variable %s is from inside a control-flow "
                            "construct, such as a loop or conditional. When creating a "
                            "variable inside a loop or conditional, use a lambda as the "
                            "initializer." % name)
                    # pylint: enable=protected-access
                    self._handle = _eager_safe_variable_handle(
                        shape=initial_value.get_shape(),
                        dtype=initial_value.dtype.base_dtype,
                        shared_name=handle_name,
                        name=name,
                        container="")
                    self._handle_device = (
                        self._handle.device if self._in_graph_mode else
                        context.get_default_context().device_name)

                self._initial_value = initial_value if self._in_graph_mode else None
                self._handle_name = handle_name + ":0"
                self._dtype = initial_value.dtype.base_dtype
                self._constraint = constraint

                if self._in_graph_mode:
                    with ops.name_scope("IsInitialized"):
                        self._is_initialized_op = (
                            gen_resource_variable_ops.var_is_initialized_op(
                                self._handle))
                    if initial_value is not None:
                        with ops.name_scope("Assign") as n, ops.colocate_with(
                                self._handle):
                            self._initializer_op = (
                                gen_resource_variable_ops.assign_variable_op(
                                    self._handle,
                                    self._build_initializer_expr(
                                        initial_value),
                                    name=n))
                    with ops.name_scope("Read"), ops.colocate_with(
                            self._handle):
                        # Manually assign reads to the handle's device to avoid log
                        # messages.
                        with ops.device(self._handle_device):
                            value = self._read_variable_op()
                        self._graph_element = value
                        if caching_device is not None:
                            # Variables may be created in a tf.device() or ops.colocate_with()
                            # context. At the same time, users would expect caching device to
                            # be independent of this context, and/or would not expect the
                            # current device context to be merged with the caching device
                            # spec.  Therefore we reset the colocation stack before creating
                            # the cached value. Note that resetting the colocation stack will
                            # also reset the device stack.
                            with ops.colocate_with(None, ignore_existing=True):
                                with ops.device(caching_device):
                                    self._cached_value = array_ops.identity(
                                        value)
                        else:
                            self._cached_value = None
                else:
                    gen_resource_variable_ops.assign_variable_op(
                        self._handle, initial_value)
                    self._is_initialized_op = None
                    self._initializer_op = None
                    self._graph_element = None
                    if caching_device:
                        with ops.device(caching_device):
                            self._cached_value = self._read_variable_op()
                    else:
                        self._cached_value = None
                ops.add_to_collections(collections, self)
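
For orientation, here is a hedged usage sketch of the same argument surface through the public `tf.Variable` API in TF 2.x; `unit_norm`, the variable name, and the shape are illustrative choices, not taken from the source.

```python
import tensorflow as tf

def unit_norm(t):
    # Projection applied after optimizer updates (the `constraint` argument).
    return t / (tf.norm(t) + 1e-12)

v = tf.Variable(
    initial_value=lambda: tf.random.normal([3, 3]),  # callable initializer
    trainable=True,
    name="projected_weights",
    dtype=tf.float32,
    constraint=unit_norm)
print(v.name, v.dtype, v.shape)
```
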
    def _create_variable(self, next_creator, **kwargs):
        if self._num_replicas_in_sync > 1:
            aggregation = kwargs.pop("aggregation",
                                     vs.VariableAggregation.NONE)
            if aggregation not in (vs.VariableAggregation.NONE,
                                   vs.VariableAggregation.SUM,
                                   vs.VariableAggregation.MEAN,
                                   vs.VariableAggregation.ONLY_FIRST_REPLICA):
                raise ValueError("Invalid variable aggregation mode: " +
                                 aggregation + " for variable: " +
                                 kwargs["name"])

            def var_creator(**kwargs):
                """Create an AggregatingVariable and fix up collections."""
                # Record what collections this variable should be added to.
                collections = kwargs.pop("collections", None)
                if collections is None:
                    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
                kwargs["collections"] = []

                # Create and wrap the variable.
                v = next_creator(**kwargs)
                wrapped = values.AggregatingVariable(
                    self._container_strategy(), v, aggregation)

                # Add the wrapped variable to the requested collections.
                # The handling of eager mode and the global step matches
                # ResourceVariable._init_from_args().
                if not context.executing_eagerly():
                    g = ops.get_default_graph()
                    # If "trainable" is True, next_creator() will add the contained
                    # variable to the TRAINABLE_VARIABLES collection, so we manually
                    # remove it and replace with the wrapper. We can't set "trainable"
                    # to False for next_creator() since that causes functions like
                    # implicit_gradients to skip those variables.
                    if kwargs.get("trainable", True):
                        collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
                        l = g.get_collection_ref(
                            ops.GraphKeys.TRAINABLE_VARIABLES)
                        if v in l:
                            l.remove(v)
                    g.add_to_collections(collections, wrapped)
                elif ops.GraphKeys.GLOBAL_STEP in collections:
                    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)

                return wrapped
        else:
            var_creator = next_creator

        if "colocate_with" in kwargs:
            colocate_with = kwargs["colocate_with"]
            if isinstance(colocate_with, numpy_dataset.SingleDevice):
                with ops.device(colocate_with.device):
                    return var_creator(**kwargs)
            with ops.device(None):
                with ops.colocate_with(colocate_with):
                    return var_creator(**kwargs)

        with ops.colocate_with(None, ignore_existing=True):
            with ops.device(self._variable_device):
                return var_creator(**kwargs)
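
The `next_creator` chaining above follows the same pattern that is exposed publicly as `tf.variable_creator_scope`. The sketch below is illustrative (a simple logging wrapper, not `values.AggregatingVariable`): the custom creator pops a kwarg, delegates to the base creator, and returns the result.

```python
import tensorflow as tf

def logging_creator(next_creator, **kwargs):
    # Pop a kwarg before delegating, mirroring how the strategy code above
    # pops "aggregation" before calling next_creator.
    kwargs.pop("aggregation", None)
    v = next_creator(**kwargs)
    print("created", v.name, "with shape", v.shape)
    return v

with tf.variable_creator_scope(logging_creator):
    w = tf.Variable(tf.zeros([2]), name="w")  # prints: created w:0 with shape (2,)
```
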
Example #40
 def __init__(self, graph_def, device, element_spec):
     self._elem_spec = element_spec
     with ops.device(device):
         variant_tensor = ged_ops.dataset_from_graph(graph_def)
     super(_RemoteDataset, self).__init__(variant_tensor)
Example #41
    def restore(self, save_path, session=None):
        """Restore a training checkpoint.

    Restores `root_checkpointable` and any objects that it tracks
    (transitive). Either assigns values immediately if variables to restore have
    been created already, or defers restoration until the variables are
    created. Dependencies added to the `root_checkpointable` passed to the
    constructor after this call will be matched if they have a corresponding
    object in the checkpoint.

    When building a graph, restorations are added to the graph but not run. A
    session is required to retrieve checkpoint metadata.

    To disallow deferred loading, assert immediately that all checkpointed
    variables have been matched to variable objects:

    ```python
    saver = Saver(root)
    saver.restore(path).assert_consumed()
    ```

    An exception will be raised unless every object was matched and its
    variables already exist.

    When graph building, `assert_consumed()` indicates that all of the restore
    ops which will be created for this checkpoint have been created. They can be
    run via the `run_restore_ops()` function of the status object:

    ```python
    saver.restore(path).assert_consumed().run_restore_ops()
    ```

    If the checkpoint has not been consumed completely, then the list of restore
    ops will grow as more objects are added to the dependency graph.

    Name-based `tf.train.Saver` checkpoints can be loaded using this
    method. There is no deferred loading, and names are used to match
    variables. No restore ops are created/run until `run_restore_ops()` or
    `initialize_or_restore()` are called on the returned status object, even
    when executing eagerly. Re-encode name-based checkpoints using this
    object-based `Saver.save` as soon as possible.

    Args:
      save_path: The path to the checkpoint, as returned by `save` or
        `tf.train.latest_checkpoint`. If None (as when there is no latest
        checkpoint for `tf.train.latest_checkpoint` to return), returns an
        object which may run initializers for objects in the dependency
        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
        names are used to match variables.
      session: The session to retrieve metadata with. Ignored when executing
        eagerly. If not provided when graph building, the default session is
        used.

    Returns:
      A load status object, which can be used to make assertions about the
      status of checkpoint restoration and run initialization/restore ops
      (of type `CheckpointLoadStatus`, or `InitializationOnlyStatus` if
      `save_path` is `None`).

      If `save_path` points to a name-based checkpoint, a `NameBasedSaverStatus`
      object is returned which runs restore ops from a name-based saver.
    """
        if save_path is None:
            return InitializationOnlyStatus(self._root_checkpointable)
        in_graph_mode = not context.executing_eagerly()
        if in_graph_mode:
            if session is None:
                session = ops.get_default_session()
            file_prefix_tensor = self._file_prefix_placeholder
            file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
        else:
            session = None
            with ops.device("/cpu:0"):
                file_prefix_tensor = constant_op.constant(save_path)
            file_prefix_feed_dict = None
        try:
            if not in_graph_mode or self._object_graph_restore_tensor is None:
                with ops.device("/cpu:0"):
                    object_graph_string, = io_ops.restore_v2(
                        prefix=file_prefix_tensor,
                        tensor_names=[_OBJECT_GRAPH_PROTO_KEY],
                        shape_and_slices=[""],
                        dtypes=[dtypes.string],
                        name="object_graph_proto_read")
                if in_graph_mode:
                    self._object_graph_restore_tensor = object_graph_string
            if in_graph_mode:
                object_graph_string = session.run(
                    self._object_graph_restore_tensor,
                    feed_dict=file_prefix_feed_dict)
            else:
                object_graph_string = object_graph_string.numpy()
        except errors_impl.NotFoundError:
            # The object graph proto does not exist in this checkpoint. Try again with
            # name-based saving.
            return NameBasedSaverStatus(self, save_path)

        object_graph_proto = (
            checkpointable_object_graph_pb2.CheckpointableObjectGraph())
        object_graph_proto.ParseFromString(object_graph_string)
        if in_graph_mode and object_graph_proto == self._last_restore_object_graph:
            checkpoint = self._last_restore_checkpoint
        else:
            if in_graph_mode:
                dtype_map = None
            else:
                reader = pywrap_tensorflow.NewCheckpointReader(save_path)
                dtype_map = reader.get_variable_to_dtype_map()
            checkpoint = core_checkpointable_utils._Checkpoint(  # pylint: disable=protected-access
                object_graph_proto=object_graph_proto,
                save_path=file_prefix_tensor,
                dtype_map=dtype_map)
            if in_graph_mode:
                if self._last_restore_object_graph is not None:
                    raise NotImplementedError(
                        "Using a single Saver to restore different object graphs is not "
                        "currently supported when graph building. Use a different Saver "
                        "for each object graph (restore ops will be duplicated), or "
                        "file a feature request if this limitation bothers you."
                    )
                self._last_restore_checkpoint = checkpoint
                self._last_restore_object_graph = object_graph_proto
        core_checkpointable._CheckpointPosition(  # pylint: disable=protected-access
            checkpoint=checkpoint,
            proto_id=0).restore(self._root_checkpointable)
        load_status = CheckpointLoadStatus(checkpoint,
                                           feed_dict=file_prefix_feed_dict)
        return load_status
def assign_sub_on_device(device, variable, tensor):
  with ops.device(device):
    return variable.assign_sub(tensor)
 def run_reduce():
     with ops.device(self._local_device):
         t_in = array_ops.ones(tensor_shape) * worker_id
         return strategy.reduce(reduce_util.ReduceOp.MEAN,
                                t_in,
                                axis=None)
    def _test_reduction(self, task_type, task_id, num_gpus, local_mode=False):
        collective_all_reduce, devices, master_target = self._get_test_objects(
            task_type, task_id, num_gpus, local_mode=local_mode)
        if local_mode:
            num_workers = 1
            worker_device = None
        else:
            num_workers = len(self._cluster_spec.get("chief", [])) + len(
                self._cluster_spec.get("worker", []))
            worker_device = "/job:%s/task:%d" % (task_type, task_id)
        with ops.Graph().as_default(), \
             ops.device(worker_device), \
             self.test_session(target=master_target) as sess:
            # Collective ops don't support scalar tensors, so we have to construct
            # 1-d tensors.
            values = [
                constant_op.constant([float(d)]) for d in range(len(devices))
            ]
            per_device = _make_per_device(values, devices, regroup=True)
            mean = np.array([(len(devices) - 1.) / 2.])

            values_2 = [
                constant_op.constant([d + 1.0]) for d in range(len(devices))
            ]
            per_device_2 = _make_per_device(values_2, devices)
            mean_2 = np.array([mean[0] + 1.])

            destination_mirrored = _fake_mirrored(1., devices)
            destination_different = _fake_mirrored(1., _cpu_device)
            destination_str = _cpu_device
            destination_list = devices

            all_destinations = [
                destination_different, destination_mirrored, destination_str,
                destination_list
            ]

            # test reduce()
            for destinations in all_destinations:
                self._assert_values_equal(
                    collective_all_reduce.reduce(vs.VariableAggregation.MEAN,
                                                 per_device,
                                                 destinations=destinations),
                    _fake_mirrored(mean, destinations), sess)
                self._assert_values_equal(
                    collective_all_reduce.reduce(vs.VariableAggregation.MEAN,
                                                 per_device_2,
                                                 destinations=destinations),
                    _fake_mirrored(mean_2, destinations), sess)
                self._assert_values_equal(
                    collective_all_reduce.reduce(vs.VariableAggregation.SUM,
                                                 per_device,
                                                 destinations=destinations),
                    _fake_mirrored(mean * len(devices) * num_workers,
                                   destinations), sess)
                self._assert_values_equal(
                    collective_all_reduce.reduce(vs.VariableAggregation.SUM,
                                                 per_device_2,
                                                 destinations=destinations),
                    _fake_mirrored(mean_2 * len(devices) * num_workers,
                                   destinations), sess)

            # test batch_reduce()
            for d1, d2 in itertools.product(all_destinations,
                                            all_destinations):
                self._assert_values_equal(
                    collective_all_reduce.batch_reduce(
                        vs.VariableAggregation.MEAN, [(per_device, d1),
                                                      (per_device_2, d2)]),
                    [_fake_mirrored(mean, d1),
                     _fake_mirrored(mean_2, d2)], sess)
                self._assert_values_equal(
                    collective_all_reduce.batch_reduce(
                        vs.VariableAggregation.SUM, [(per_device, d1),
                                                     (per_device_2, d2)]),
                    [
                        _fake_mirrored(mean * len(devices) * num_workers, d1),
                        _fake_mirrored(mean_2 * len(devices) * num_workers, d2)
                    ], sess)

        return True
Example #45
  def _train_model(self, input_fn, hooks):
    all_hooks = []
    with ops.Graph().as_default() as g, g.device(self._device_fn):
      random_seed.set_random_seed(self._config.tf_random_seed)
      global_step_tensor = self._create_and_assert_global_step(g)
      with ops.device('/cpu:0'):
        features, labels = input_fn()
      estimator_spec = self._call_model_fn(features, labels,
                                           model_fn_lib.ModeKeys.TRAIN)
      ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
      all_hooks.extend(hooks)
      all_hooks.extend([
          training.NanTensorHook(estimator_spec.loss),
          training.LoggingTensorHook(
              {
                  'loss': estimator_spec.loss,
                  'step': global_step_tensor
              },
              every_n_iter=100)
      ])
      all_hooks.extend(estimator_spec.training_hooks)

      if not (estimator_spec.scaffold.saver or
              ops.get_collection(ops.GraphKeys.SAVERS)):
        ops.add_to_collection(
            ops.GraphKeys.SAVERS,
            training.Saver(
                sharded=True,
                max_to_keep=self._config.keep_checkpoint_max,
                keep_checkpoint_every_n_hours=(
                    self._config.keep_checkpoint_every_n_hours),
                defer_build=True,
                save_relative_paths=True))

      chief_hooks = []
      if (self._config.save_checkpoints_secs or
          self._config.save_checkpoints_steps):
        saver_hook_exists = any([
            isinstance(h, training.CheckpointSaverHook)
            for h in (all_hooks + chief_hooks +
                      list(estimator_spec.training_chief_hooks))
        ])
        if not saver_hook_exists:
          chief_hooks = [
              training.CheckpointSaverHook(
                  self._model_dir,
                  save_secs=self._config.save_checkpoints_secs,
                  save_steps=self._config.save_checkpoints_steps,
                  scaffold=estimator_spec.scaffold)
          ]
      with training.MonitoredTrainingSession(
          master=self._config.master,
          is_chief=self._config.is_chief,
          checkpoint_dir=self._model_dir,
          scaffold=estimator_spec.scaffold,
          hooks=all_hooks,
          chief_only_hooks=(
              tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
          save_checkpoint_secs=0,  # Saving is handled by a hook.
          save_summaries_steps=self._config.save_summary_steps,
          config=self._session_config) as mon_sess:
        loss = None
        while not mon_sess.should_stop():
          _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
      return loss
Example #46
    def save(self, file_prefix, checkpoint_number=None, session=None):
        """Save a training checkpoint.

    The saved checkpoint includes variables created by this object and any
    Checkpointable objects it depends on at the time `Saver.save()` is called.

    Args:
      file_prefix: A prefix to use for the checkpoint filenames
        (/path/to/directory/and_a_prefix). Names are generated based on this
        prefix and `checkpoint_number`, if provided.
      checkpoint_number: An integer variable or Tensor, used to number
        checkpoints. Typically this value is saved along with other variables in
        training checkpoints, which will happen automatically if it was created
        by `root_checkpointable` or one of its dependencies (via
        `Checkpointable._add_variable`).
      session: The session to evaluate variables in. Ignored when executing
        eagerly. If not provided when graph building, the default session is
        used.

    Returns:
      The full path to the checkpoint.
    """
        named_variables, graph_proto = _serialize_object_graph(
            self._root_checkpointable)
        in_graph_mode = not context.executing_eagerly()
        if in_graph_mode:
            if session is None:
                session = ops.get_default_session()
            if self._object_graph_feed_tensor is None:
                with ops.device("/cpu:0"):
                    self._object_graph_feed_tensor = constant_op.constant(
                        "", dtype=dtypes.string)
            object_graph_tensor = self._object_graph_feed_tensor
            feed_additions = {
                object_graph_tensor: graph_proto.SerializeToString()
            }
        else:
            session = None
            with ops.device("/cpu:0"):
                object_graph_tensor = constant_op.constant(
                    graph_proto.SerializeToString(), dtype=dtypes.string)
            feed_additions = None
        assert _OBJECT_GRAPH_PROTO_KEY not in named_variables
        named_variables[_OBJECT_GRAPH_PROTO_KEY] = _NoRestoreSaveable(
            tensor=object_graph_tensor, name=_OBJECT_GRAPH_PROTO_KEY)
        if not in_graph_mode or self._last_save_object_graph != graph_proto:
            if self._last_save_object_graph is not None and in_graph_mode:
                raise NotImplementedError(
                    "Using a single Saver to save a mutated object graph is not "
                    "currently supported when graph building. Use a different Saver "
                    "when the object graph changes (save ops will be duplicated), or "
                    "file a feature request if this limitation bothers you.")
            saver = saver_lib.Saver(var_list=named_variables)
            if in_graph_mode:
                self._last_save_saver = saver
                self._last_save_object_graph = graph_proto
        else:
            saver = self._last_save_saver
        with ops.device("/cpu:0"):
            save_path = saver.save(sess=_SessionWithFeedDictAdditions(
                session=session, feed_additions=feed_additions),
                                   save_path=file_prefix,
                                   write_meta_graph=False,
                                   global_step=checkpoint_number)
        return save_path
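
As a hedged, end-to-end companion to the `save()`/`restore()` pair above, the public object-based API `tf.train.Checkpoint` exercises the same flow; the temporary prefix and attribute names are illustrative.

```python
import os
import tempfile
import tensorflow as tf

root = tf.train.Checkpoint(step=tf.Variable(0), w=tf.Variable([1.0, 2.0]))
prefix = os.path.join(tempfile.mkdtemp(), "ckpt")
path = root.save(prefix)                     # e.g. ".../ckpt-1"

root.w.assign([0.0, 0.0])                    # clobber the value, then restore it
status = root.restore(path)
status.assert_existing_objects_matched()     # every object in this program matched
print(root.w.numpy())                        # [1. 2.]
```
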
Example #47
 def initial_value_fn(device=d):
     with ops.device(device):
         return array_ops.identity(
             index[devices[0]].initial_value)
Example #48
  def map_resources(self):
    """Makes new resource handle ops corresponding to existing resource tensors.

    Creates resource handle ops in the current default graph, whereas
    `accessible_objects` will be from an eager context. Resource mapping adds
    resource handle ops to the main GraphDef of a SavedModel, which allows the
    C++ loader API to interact with variables.

    Returns:
      A tuple of (object_map, resource_map, asset_info):
        object_map: A dictionary mapping from object in `accessible_objects` to
          replacement objects created to hold the new resource tensors.
        resource_map: A dictionary mapping from resource tensors extracted from
          `accessible_objects` to newly created resource tensors.
        asset_info: An _AssetInfo tuple describing external assets referenced
          from accessible_objects.
    """
    # Only makes sense when adding to the export Graph
    assert not context.executing_eagerly()
    # TODO(allenl): Handle MirroredVariables and other types of variables which
    # may need special casing.
    object_map = object_identity.ObjectIdentityDictionary()
    resource_map = {}
    asset_info = _AssetInfo(
        asset_defs=[],
        asset_initializers_by_resource={},
        asset_filename_map={},
        asset_index={})
    for node_id, obj in enumerate(self.nodes):
      if isinstance(obj, tracking.CapturableResource):
        # pylint: disable=protected-access
        with ops.device(obj._resource_device):
          new_resource = obj._create_resource()
        # pylint: enable=protected-access
        resource_map[obj.resource_handle] = new_resource
        self.captured_tensor_node_ids[obj.resource_handle] = node_id
      elif resource_variable_ops.is_resource_variable(obj):
        new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
        object_map[obj] = new_variable
        resource_map[obj.handle] = new_variable.handle
        self.captured_tensor_node_ids[obj.handle] = node_id
      elif isinstance(obj, tracking.TrackableAsset):
        _process_asset(obj, asset_info, resource_map)
        self.captured_tensor_node_ids[obj.asset_path] = node_id

    for concrete_function in self.concrete_functions:
      for capture in concrete_function.captured_inputs:
        if (tensor_util.is_tensor(capture)
            and capture.dtype not in _UNCOPIABLE_DTYPES
            and capture not in self.captured_tensor_node_ids):
          copied_tensor = constant_op.constant(
              tensor_util.constant_value(capture))
          node_id = len(self.nodes)
          node = _CapturedConstant(
              eager_tensor=capture, graph_tensor=copied_tensor)
          self.nodes.append(node)
          self.node_ids[capture] = node_id
          self.node_ids[node] = node_id
          self.captured_tensor_node_ids[capture] = node_id
          resource_map[capture] = copied_tensor

    return object_map, resource_map, asset_info
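
For context, a hedged illustration of where this mapping gets exercised: `tf.saved_model.save` is the public entry point that ends up creating graph resource handles for the variables an exported object holds. The `Scale` module and temporary export directory below are illustrative.

```python
import tempfile
import tensorflow as tf

class Scale(tf.Module):
    def __init__(self):
        super().__init__()
        self.w = tf.Variable(3.0)            # an eager resource variable

    @tf.function(input_signature=[tf.TensorSpec([], tf.float32)])
    def scale(self, x):
        return self.w * x

export_dir = tempfile.mkdtemp()
tf.saved_model.save(Scale(), export_dir)     # variables mapped into the export graph
restored = tf.saved_model.load(export_dir)
print(restored.scale(tf.constant(2.0)).numpy())  # 6.0
```
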
Example #49
 def mod():
     with ops.device('/device:GPU:0'):
         a = constant_op.constant(1.0)
         b = constant_op.constant(1.0)
         return math_ops.mod(a, b)
    def test_matmul_biasadd_gelu_fusion(self, mode):
        """Test MatMul+BiasAdd+Gelu fusion."""
        self._maybe_skip(mode)
        is_bf16_supported = _pywrap_utils.IsBF16SupportedByOneDNNOnThisCPU()
        run_options = config_pb2.RunOptions(output_partition_graphs=True)
        metadata = config_pb2.RunMetadata()

        m, n, k = (3, 3, 4)  # Matrix dimensions
        for precision in ('float32', 'bfloat16'):
            for approximate in (False, True):
                # Gelu exact (approximate=False) is not supported with bfloat16
                # precision since there is no Erf support for the bfloat16 data
                # type.
                # TODO(intel-tf): Enable gelu exact with bfloat16 once the Erf op
                # supports bfloat16.
                if precision == 'bfloat16':
                    if not (approximate and is_bf16_supported):
                        continue

                # Create MatMul + BiasAdd + Gelu graph
                ops.reset_default_graph()
                with ops.device('/device:CPU:0'):
                    x = _input([m, k])
                    w = _weight([k, n])
                    b = _bias([n])
                    if precision == 'bfloat16':
                        x = math_ops.cast(x, dtypes.bfloat16)
                        w = math_ops.cast(w, dtypes.bfloat16)
                        b = math_ops.cast(b, dtypes.bfloat16)
                    y = math_ops.matmul(x, w)
                    z = nn.bias_add(y, b)
                    out = nn.gelu(z, approximate=approximate)

                # Compute reference value.
                config = _get_config(remapping_on=False)
                with session.Session(config=config) as sess:
                    sess.run(variables.global_variables_initializer())
                    output_val_ref = sess.run(out,
                                              options=run_options,
                                              run_metadata=metadata)
                # Compute output with fusion.
                config = _get_config(remapping_on=True)
                with session.Session(config=config) as sess:
                    sess.run(variables.global_variables_initializer())
                    output_val = sess.run(out,
                                          options=run_options,
                                          run_metadata=metadata)
                    graph = metadata.partition_graphs[0]

                # Graph should contain fused op.
                found_fused_op = False
                gelu_type = b'GeluApproximate' if approximate else b'GeluExact'
                for node in graph.node:
                    if node.op in ('_MklNativeFusedMatMul', '_MklFusedMatMul'):
                        fused_ops = node.attr['fused_ops'].list.s
                        found_fused_op = len(fused_ops) == 2 and \
                            fused_ops[0] == b'BiasAdd' and fused_ops[1] == gelu_type
                        break
                self.assertTrue(found_fused_op)

                # Computed output value should be close to reference value.
                tol = 1e-5 if precision == 'float32' else 1e-2
                self.assertAllClose(output_val_ref,
                                    output_val,
                                    atol=tol,
                                    rtol=tol)
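
As a quick numeric aside (not part of the fusion test itself), the exact and tanh-approximate GELU definitions that the `approximate` flag selects between agree to well within the tolerances used above:

```python
import numpy as np
from math import erf, sqrt, pi

x = np.linspace(-3.0, 3.0, 7)
# Exact GELU: x * Phi(x), with Phi the standard normal CDF expressed via erf.
gelu_exact = np.array([0.5 * v * (1.0 + erf(v / sqrt(2.0))) for v in x])
# Tanh approximation used when approximate=True.
gelu_approx = 0.5 * x * (1.0 + np.tanh(sqrt(2.0 / pi) * (x + 0.044715 * x**3)))
print(np.max(np.abs(gelu_exact - gelu_approx)))  # small, well below the 1e-2 tolerance
```
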
Example #51
def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
                                                  m,
                                                  k,
                                                  n,
                                                  adjoint_a,
                                                  adjoint_b,
                                                  use_gpu,
                                                  skip_dense=False):
    config = config_pb2.ConfigProto()
    config.allow_soft_placement = True

    # Configurable for benchmarking:
    # config.intra_op_parallelism_threads = 100
    # config.gpu_options.per_process_gpu_memory_fraction = 0.3

    np.random.seed([6, 117])  # Reproducibility
    x = np.random.rand(m, k).astype(np.float32)
    x[x < thresh] = 0
    y = np.random.randn(k, n).astype(np.float32)
    if adjoint_a:
        x = x.T
    if adjoint_b:
        y = y.T

    def _timer(sess, ops_fn, iterations):
        # Warm-up run
        sess.run(ops_fn(10, sess))

        # Timing run
        start = time.time()
        sess.run(ops_fn(iterations, sess))
        end = time.time()

        return (end - start) / (1.0 * iterations)  # Average runtime per iteration

    # Using regular matmul, marking one of the matrices as dense.
    if skip_dense:
        delta_dense = float("nan")
    else:
        with session.Session(config=config, graph=ops.Graph()) as sess:
            if not use_gpu:
                with ops.device("/cpu:0"):
                    x_t = constant_op.constant(x)
                    y_t = constant_op.constant(y)
                    ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
                        x_t, y_t, adjoint_a, adjoint_b)
            else:
                with ops.device("/device:GPU:0"):
                    x_t = constant_op.constant(x)
                    y_t = constant_op.constant(y)
                    ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
                        x_t, y_t, adjoint_a, adjoint_b)
            delta_dense = _timer(sess, ops_fn, 200)

    # Using sparse_tensor_dense_matmul.
    with session.Session("", config=config, graph=ops.Graph()) as sess:
        if not use_gpu:
            with ops.device("/cpu:0"):
                x_ind = constant_op.constant(
                    np.vstack(np.where(x)).astype(np.int64).T)
                x_val = constant_op.constant(x[np.where(x)])
                x_shape = constant_op.constant(
                    np.array(x.shape).astype(np.int64))
                y_t = constant_op.constant(y)
                ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
                    x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
        else:
            with ops.device("/device:GPU:0"):
                x_ind = constant_op.constant(
                    np.vstack(np.where(x)).astype(np.int64).T)
                x_val = constant_op.constant(x[np.where(x)])
                x_shape = constant_op.constant(
                    np.array(x.shape).astype(np.int64))
                y_t = constant_op.constant(y)
                ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
                    x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
        delta_sparse = _timer(sess, ops_fn, 200)

    print("%g \t %d \t %s \t %d \t %d \t %g \t %g \t %g" %
          (1 - thresh, n, use_gpu, m, k, delta_dense, delta_sparse,
           delta_sparse / delta_dense))
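
A hedged mini-example of the two code paths being timed, checking on a small case that `tf.sparse.sparse_dense_matmul` and a plain dense matmul agree; the shapes and sparsity threshold are illustrative.

```python
import numpy as np
import tensorflow as tf

np.random.seed(0)
x = np.random.rand(4, 6).astype(np.float32)
x[x < 0.8] = 0                               # make the left operand mostly zero
y = np.random.randn(6, 3).astype(np.float32)

dense_out = tf.matmul(tf.constant(x), tf.constant(y))
sparse_out = tf.sparse.sparse_dense_matmul(tf.sparse.from_dense(x), tf.constant(y))
print(np.allclose(dense_out.numpy(), sparse_out.numpy(), atol=1e-5))  # True
```
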
Example #52
def embedding_lookup(params, ids, partition_strategy="mod", name=None):
  """Looks up `ids` in a list of embedding tensors.

  This function is used to perform parallel lookups on the list of
  tensors in `params`.  It is a generalization of
  [`tf.gather()`](../../api_docs/python/array_ops.md#gather), where `params` is
  interpreted as a partition of a larger embedding tensor.

  If `len(params) > 1`, each element `id` of `ids` is partitioned between
  the elements of `params` according to the `partition_strategy`.
  In all strategies, if the id space does not evenly divide the number of
  partitions, each of the first `(max_id + 1) % len(params)` partitions will
  be assigned one more id.

  If `partition_strategy` is `"mod"`, we assign each id to partition
  `p = id % len(params)`. For instance,
  13 ids are split across 5 partitions as:
  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`

  If `partition_strategy` is `"div"`, we assign ids to partitions in a
  contiguous manner. In this case, 13 ids are split across 5 partitions as:
  `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`

  The results of the lookup are concatenated into a dense
  tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.

  Args:
    params: A list of tensors with the same type and which can be concatenated
      along dimension 0. Each `Tensor` must be appropriately sized for the given
      `partition_strategy`.
    ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
      up in `params`.
    partition_strategy: A string specifying the partitioning strategy, relevant
      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
      is `"mod"`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` with the same type as the tensors in `params`.

  Raises:
    ValueError: If `params` is empty.
  """
  if not isinstance(params, list):
    params = [params]
  with ops.op_scope(params + [ids], name, "embedding_lookup") as name:
    if not params:
      raise ValueError("Need at least one param")
    np = len(params)  # Number of partitions
    params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
    if np == 1:
      with ops.device(params[0].device):
        return array_ops.gather(params[0], ids, name=name)
    else:
      ids = ops.convert_to_tensor(ids, name="ids")
      flat_ids = array_ops.reshape(ids, [-1])
      original_indices = math_ops.range(array_ops.size(flat_ids))

      # Create p_assignments and set new_ids depending on the strategy.
      if partition_strategy == "mod":
        p_assignments = flat_ids % np
        new_ids = flat_ids // np
      elif partition_strategy == "div":
        # Compute num_total_ids as the sum of dim-0 of params, then assign to
        # partitions based on a constant number of ids per partition. Optimize
        # if we already know the full shape statically.
        dim_0_size = params[0].get_shape()[0]
        for p in xrange(1, np):
          dim_0_size += params[p].get_shape()[0]
        if dim_0_size.value:
          num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
        else:
          dim_0_sizes = []
          for p in xrange(np):
            with ops.device(params[p].device):
              dim_0_sizes.append(array_ops.shape(params[p])[0])
          num_total_ids = math_ops.reduce_sum(
              math_ops.cast(array_ops.pack(dim_0_sizes), flat_ids.dtype))
        ids_per_partition = num_total_ids // np
        extras = num_total_ids % np

        p_assignments = math_ops.maximum(
            flat_ids // (ids_per_partition + 1),
            (flat_ids - extras) // ids_per_partition)

        # Emulate a conditional using a boolean indicator tensor
        is_in_first_extras_partitions = math_ops.cast(
            p_assignments < extras, flat_ids.dtype)
        new_ids = (
            is_in_first_extras_partitions * (
                flat_ids % (ids_per_partition + 1)) +
            (1 - is_in_first_extras_partitions) * (
                (flat_ids - extras) % ids_per_partition))
      else:
        raise ValueError("Unrecognized partition strategy: " +
                         partition_strategy)

      # Cast partition assignments to int32 for use in dynamic_partition.
      # There really should not be more than 2^32 partitions.
      p_assignments = math_ops.cast(p_assignments, dtypes.int32)
      # Partition list of ids based on assignments into np separate lists
      gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
      # Similarly, partition the original indices.
      pindices = data_flow_ops.dynamic_partition(original_indices,
                                                 p_assignments, np)
      # Do np separate lookups, finding embeddings for plist[p] in params[p]
      partitioned_result = []
      for p in xrange(np):
        with ops.device(params[p].device):
          partitioned_result.append(array_ops.gather(params[p], gather_ids[p]))
      # Stitch these back together
      ret = data_flow_ops.dynamic_stitch(pindices, partitioned_result,
                                         name=name)
      # Reshape to reverse the flattening of ids.
      # It's important that we compute params[0].shape on the right device
      # to avoid data motion.
      with ops.device(params[0].device):
        params_shape = array_ops.shape(params[0])
      ret = array_ops.reshape(ret, array_ops.concat(0, [
          array_ops.shape(ids), array_ops.slice(params_shape, [1], [-1])]))
      # output shape = ids.shape + params[*].shape[1:]
      # Normally the reshape is sufficient, but setting shape explicitly
      # teaches shape inference that params[1:].get_shape() matters.
      element_shape = params[0].get_shape()[1:]
      for p in params[1:]:
        element_shape = element_shape.merge_with(p.get_shape()[1:])
      ret.set_shape(ids.get_shape().concatenate(element_shape))
      return ret
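
The two partition layouts worked through in the docstring can be reproduced with a small stand-alone helper (hypothetical, not part of the embedding code) that mirrors the `"mod"` and `"div"` assignment formulas used above:

```python
def split_ids(num_ids, num_partitions, strategy):
    parts = [[] for _ in range(num_partitions)]
    ids_per_partition, extras = divmod(num_ids, num_partitions)
    for i in range(num_ids):
        if strategy == "mod":
            p = i % num_partitions
        else:  # "div": the first `extras` partitions each take one extra id
            p = max(i // (ids_per_partition + 1), (i - extras) // ids_per_partition)
        parts[p].append(i)
    return parts

print(split_ids(13, 5, "mod"))  # [[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]
print(split_ids(13, 5, "div"))  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]
```
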
Example #53
def _GradientsHelper(ys,
                     xs,
                     grad_ys=None,
                     name="gradients",
                     colocate_gradients_with_ops=False,
                     gate_gradients=False,
                     aggregation_method=None,
                     stop_gradients=None,
                     src_graph=None):
    """Implementation of gradients()."""
    if context.executing_eagerly():
        raise RuntimeError(
            "tf.gradients is not supported when eager execution "
            "is enabled. Use tf.GradientTape instead.")
    if src_graph is None:
        src_graph = ops.get_default_graph()

    ys = _AsList(ys)
    xs = _AsList(xs)
    stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)

    with ops.name_scope(
            name, "gradients",
            list(ys) + list(xs) + list(stop_gradients) +
            list(grad_ys)) as grad_scope:
        # Get a uid for this call to gradients that can be used to help
        # cluster ops for compilation.
        gradient_uid = ops.get_default_graph().unique_name("uid")
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = [
            x.handle if resource_variable_ops.is_resource_variable(x) else x
            for x in xs
        ]
        xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs,
                                                                name="x",
                                                                as_ref=True)
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
                                 gradient_uid)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        if len(ys) > 1:
            ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        stop_gradient_ops = [t.op for t in stop_gradients]
        reachable_to_ops, pending_count, loop_state = _PendingCount(
            to_ops, from_ops, colocate_gradients_with_ops)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into an Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            ready = (pending_count[op] == 0)
            if ready and op not in to_ops_set and op in reachable_to_ops:
                to_ops_set.add(op)
                queue.append(op)

        if loop_state:
            loop_exits = loop_state.ProcessUnusedLoopExits(
                pending_count, to_ops_set)
            for y in loop_exits:
                if _IsTrainable(y):
                    _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with _maybe_colocate_with(op, gradient_uid,
                                      colocate_gradients_with_ops):
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=True)
                out_grads = _AggregatedGrads(grads, op, gradient_uid,
                                             loop_state, aggregation_method)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=True)

                grad_fn = None
                func_call = None
                # pylint: disable=protected-access
                is_func_call = src_graph._is_function(op.type)
                # pylint: enable=protected-access
                has_out_grads = any(
                    isinstance(g, ops.Tensor) or g for g in out_grads)
                if has_out_grads and (op not in stop_ops):
                    if is_func_call:
                        func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
                        # Note that __defun is not set if the graph is
                        # imported. If it's set, we prefer to access the original
                        # defun.
                        func_call = getattr(op, "__defun", func_call)
                        grad_fn = func_call.python_grad_func
                    else:
                        # A grad_fn must be defined, either as a function or as None
                        # for ops that do not have gradients.
                        try:
                            grad_fn = ops.get_gradient_function(op)
                        except LookupError:
                            raise LookupError(
                                "No gradient defined for operation '%s' (op type: %s)"
                                % (op.name, op.type))
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=False)

                # NOTE(skyewm): We don't support computing gradients wrt a loop variable
                # unless it's within the context of a single iteration (i.e. the
                # gradient is wrt to the loop parameter in the body function, not wrt or
                # through the initial value). This means if we're in a while loop
                # context, we should never see a switch node from this context.
                # pylint: disable=protected-access
                if (control_flow_util.IsSwitch(op)
                        and op._control_flow_context is not None
                        and op._control_flow_context.IsWhileContext()
                        and op._control_flow_context ==
                        ops.get_default_graph()._get_control_flow_context()):
                    _RaiseNoGradWrtInitialLoopValError(op, from_ops)
                # pylint: enable=protected-access

                if (grad_fn or is_func_call) and has_out_grads:
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if (not isinstance(out_grad, ops.Tensor)
                                and not out_grad) and (
                                    (not grad_fn and is_func_call)
                                    or _IsTrainable(op.outputs[i])):
                            # Only trainable outputs or outputs for a function call that
                            # will use SymbolicGradient get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            # TODO(apassos) gradients of resource handles might be an
                            # issue here because of zeros.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            else:
                                out_grads[i] = (
                                    control_flow_ops.ZerosLikeOutsideLoop(op, i))
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with src_graph._original_op(op):
                            # pylint: enable=protected-access
                            if grad_fn:
                                # If grad_fn was found, do not use SymbolicGradient even for
                                # functions.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: grad_fn(op, *out_grads))
                            else:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: _SymGrad(op, out_grads))
                            in_grads = _AsList(in_grads)
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                [x for x in in_grads if x is not None]) > 1:
                                with ops.device(None):
                                    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
                                            None,
                                            gradient_uid,
                                            ignore_existing=True):
                                        in_grads = control_flow_ops.tuple(
                                            in_grads)
                    _LogOpGradients(op, out_grads, in_grads)
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagate a list of None backwards.
                    in_grads = [None] * len(op.inputs)
                for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
                    if in_grad is not None:
                        if (isinstance(in_grad, ops.Tensor)
                                and t_in.dtype != dtypes.resource):
                            try:
                                in_grad.set_shape(t_in.get_shape())
                            except ValueError:
                                raise ValueError(
                                    "Incompatible shapes between op input and calculated "
                                    "input gradient.  Forward operation: %s.  Input index: %d. "
                                    "Original input shape: %s.  "
                                    "Calculated input gradient shape: %s" %
                                    (op.name, i, t_in.shape, in_grad.shape))
                        _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=False)

            # Update pending count for the inputs of op and enqueue ready ops.
            _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                          loop_state)

    if loop_state:
        loop_state.PostProcessing()
    return [_GetGrad(grads, x) for x in xs]
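For orientation, a minimal sketch of how this machinery is normally reached from user code, assuming the TF 1.x graph-mode `tf.gradients` front end (the toy graph below is illustrative; `gate_gradients=True` exercises the `control_flow_ops.tuple` path above):

import tensorflow as tf  # TF 1.x graph mode assumed

x = tf.constant(3.0)
w = tf.Variable(2.0)
y = w * x + w ** 2

# One gradient is returned per entry in xs; outputs the cost does not depend on come back as None.
dy_dw, dy_dx = tf.gradients(y, [w, x], gate_gradients=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([dy_dw, dy_dx]))  # [x + 2*w, w] -> [7.0, 2.0]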
  def _VerifyValues(self,
                    tensor_in_sizes,
                    filter_in_sizes,
                    stride,
                    padding,
                    data_type,
                    data_format="NHWC"):
    """Verifies the output values of the convolution function.

    Args:
      tensor_in_sizes: Input tensor dimensions in
        [batch, input_rows, input_cols, input_depth].
      filter_in_sizes: Filter tensor dimensions in
        [filter_rows, filter_cols, input_depth, depth_multiplier].
      stride: Stride.
      padding: Padding type.
      data_type: The data type to use.
      data_format: The data_format of the input. "NHWC" or "NCHW".
    """
    total_size_1 = 1
    total_size_2 = 1
    for s in tensor_in_sizes:
      total_size_1 *= s
    for s in filter_in_sizes:
      total_size_2 *= s
    # Initializes the input and filter tensors with numbers incrementing from 1.
    x1 = np.array([f * 1.0 for f in range(1, total_size_1 + 1)],
                  dtype=data_type).reshape(tensor_in_sizes)
    x2 = np.array([f * 1.0 for f in range(1, total_size_2 + 1)],
                  dtype=data_type).reshape(filter_in_sizes)
    with self.test_session() as sess:
      if data_type == np.float32:
        tolerance = 1e-5
      else:
        self.assertEqual(data_type, np.float64)
        tolerance = 1e-8

      t1 = array_ops.placeholder(shape=tensor_in_sizes, dtype=data_type)
      t2 = array_ops.placeholder(shape=filter_in_sizes, dtype=data_type)

      native_t1 = t1
      strides = [1, stride, stride, 1]
      if data_format == "NCHW":
        # Transpose from NHWC input to NCHW
        # Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
        native_t1 = array_ops.transpose(t1, [0, 3, 1, 2])
        strides = [1, 1, stride, stride]

      with self.test_scope():
        conv_native = nn_ops.depthwise_conv2d_native(
            native_t1,
            t2,
            strides=strides,
            data_format=data_format,
            padding=padding)

      if data_format == "NCHW":
        # Transpose back from NCHW to NHWC
        conv_native = array_ops.transpose(conv_native, [0, 2, 3, 1])

      with ops.device("CPU"):
        conv_interface = ReferenceDepthwiseConv2D(
            t1, t2, strides=[1, stride, stride, 1], padding=padding)

      native_result = sess.run(conv_native, {t1: x1, t2: x2})
      interface_result = sess.run(conv_interface, {t1: x1, t2: x2})

    print("data_type:", data_type, "max diff = ",
          np.amax(np.absolute(native_result - interface_result)))
    self.assertAllClose(
        np.ravel(native_result), np.ravel(interface_result), rtol=tolerance)
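A hedged sketch of how a test case might invoke `_VerifyValues`; the sizes, stride, and padding here are illustrative, not taken from the original suite:

def testDepthwiseConv2DBasic(self):
    # Hypothetical shapes: one 5x5x2 image, a 3x3 filter with depth_multiplier=3.
    for data_type in [np.float32, np.float64]:
        self._VerifyValues(
            tensor_in_sizes=[1, 5, 5, 2],
            filter_in_sizes=[3, 3, 2, 3],
            stride=1,
            padding="SAME",
            data_type=data_type)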
 def reduce_fn(state, value):
   with ops.device("/gpu:0"):
     return state + value
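`reduce_fn` has the `(state, value)` signature expected by `tf.data.Dataset.reduce`; a hedged usage sketch (the dataset, initial state, and soft-placement config are illustrative):

import numpy as np
import tensorflow as tf  # TF 1.x graph mode assumed

dataset = tf.data.Dataset.range(10)             # int64 scalar elements
total = dataset.reduce(np.int64(0), reduce_fn)  # reduce_fn(state, value) applied per element

# allow_soft_placement lets the /gpu:0 annotation fall back to CPU on GPU-less machines.
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    print(sess.run(total))  # 45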
    def testSelectEverythingDetail(self):
        ops.reset_default_graph()
        dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
        outfile = os.path.join(test.get_temp_dir(), 'dump')
        opts = (builder(
            builder.trainable_variables_parameter()).with_file_output(
                outfile).with_accounted_types(['.*']).select([
                    'micros', 'bytes', 'params', 'float_ops', 'occurrence',
                    'device', 'op_types', 'input_shapes'
                ]).build())

        with profile_context.ProfileContext(test.get_temp_dir(),
                                            trace_steps=[],
                                            dump_steps=[]) as pctx:
            with session.Session() as sess, ops.device(dev):
                x = lib.BuildSmallModel()

                sess.run(variables.global_variables_initializer())
                pctx.trace_next_step()
                pctx.dump_next_step()
                _ = sess.run(x)

                pctx.profiler.profile_name_scope(options=opts)

                with gfile.Open(outfile, 'r') as f:
                    # pylint: disable=line-too-long
                    dump_str = f.read()
                    outputs = dump_str.split('\n')

                    self.assertEqual(
                        outputs[0],
                        'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes'
                    )
                    for o in outputs[1:]:
                        if o.find('Conv2D ') > 0:
                            metrics = o[o.find('(') + 1:o.find(')')].split(',')
                            # Make sure time is profiled.
                            gap = 1 if test.is_gpu_available() else 2
                            for i in range(3, 6, gap):
                                mat = re.search('(.*)[um]s/(.*)[um]s',
                                                metrics[i])
                                self.assertGreater(float(mat.group(1)), 0.0)
                                self.assertGreater(float(mat.group(2)), 0.0)
                            # Make sure device is profiled.
                            if test.is_gpu_available():
                                self.assertTrue(metrics[6].find('gpu') > 0)
                                self.assertFalse(metrics[6].find('cpu') > 0)
                            else:
                                self.assertFalse(metrics[6].find('gpu') > 0)
                                self.assertTrue(metrics[6].find('cpu') > 0)
                            # Make sure float_ops is profiled.
                            mat = re.search('(.*)k/(.*)k flops',
                                            metrics[1].strip())
                            self.assertGreater(float(mat.group(1)), 0.0)
                            self.assertGreater(float(mat.group(2)), 0.0)
                            # Make sure op_count is profiled.
                            self.assertEqual(metrics[8].strip(), '1/1|1/1')
                            # Make sure input_shapes is profiled.
                            self.assertEqual(metrics[9].strip(),
                                             '0:2x6x6x3|1:3x3x3x6')

                        if o.find('DW (3x3x3x6') > 0:
                            metrics = o[o.find('(') + 1:o.find(')')].split(',')
                            mat = re.search('(.*)/(.*) params',
                                            metrics[1].strip())
                            self.assertGreater(float(mat.group(1)), 0.0)
                            self.assertGreater(float(mat.group(2)), 0.0)
                    # pylint: enable=line-too-long

        # Test that profiler restored from profile file gives the same result.
        gfile.Remove(outfile)
        profile_file = os.path.join(test.get_temp_dir(), 'profile_1')
        with lib.ProfilerFromFile(profile_file) as profiler:
            profiler.profile_name_scope(options=opts)
            with gfile.Open(outfile, 'r') as f:
                self.assertEqual(dump_str, f.read())
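A hedged, stripped-down sketch of the option-builder pattern this test drives, via the public `tf.profiler` surface (the output path and selected fields are illustrative):

import tensorflow as tf  # TF 1.x assumed

builder = tf.profiler.ProfileOptionBuilder
opts = (builder(builder.trainable_variables_parameter())
        .with_file_output('/tmp/profile_dump')  # hypothetical output path
        .with_accounted_types(['.*'])
        .select(['params', 'float_ops', 'micros', 'bytes'])
        .build())

# Profile the default graph grouped by name scope and write the table to the file above.
tf.profiler.profile(tf.get_default_graph(), cmd='scope', options=opts)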
Example #57
0
def many2one_attention_seq2seq(encoder_inputs_list,
                               decoder_inputs,
                               text_len,
                               text_cell,
                               speech_cell,
                               parse_cell,
                               num_encoder_symbols,
                               num_decoder_symbols,
                               embedding_size,
                               output_projection=None,
                               feed_previous=False,
                               dtype=dtypes.float32,
                               scope=None,
                               initial_state_attention=False,
                               attention_vec_size=None):

    text_encoder_inputs, speech_encoder_inputs = encoder_inputs_list
    with variable_scope.variable_scope(scope or "many2one_attention_seq2seq"):
        with ops.device("/cpu:0"):
            embedding_words = variable_scope.get_variable(
                "embedding_words", [num_encoder_symbols, embedding_size])

        text_encoder_inputs = [
            embedding_ops.embedding_lookup(embedding_words, i)
            for i in text_encoder_inputs
        ]
        # Encoder.
        with variable_scope.variable_scope(scope or "text_encoder"):
            text_encoder_outputs, text_encoder_state = rnn.rnn(
                text_cell,
                text_encoder_inputs,
                sequence_length=text_len,
                dtype=dtype)

        with variable_scope.variable_scope(scope or "speech_encoder"):
            speech_encoder_outputs, speech_encoder_state = rnn.rnn(
                speech_cell, speech_encoder_inputs, dtype=dtype)

        # First calculate a concatenation of encoder outputs to put attention on.
        text_top_states = [
            array_ops.reshape(e, [-1, 1, text_cell.output_size])
            for e in text_encoder_outputs
        ]
        # h_states corresponds to attention_states in the original code
        h_states = array_ops.concat(1, text_top_states)

        speech_top_states = [
            array_ops.reshape(e, [-1, 1, speech_cell.output_size])
            for e in speech_encoder_outputs
        ]
        m_states = array_ops.concat(1, speech_top_states)

        attention_states = [h_states, m_states]
        both_encoder_states = [text_encoder_state, speech_encoder_state]

        # Decoder.
        output_size = None
        if output_projection is None:
            parse_cell = rnn_cell.OutputProjectionWrapper(
                parse_cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return many2one_embedding_attention_decoder(
                decoder_inputs,
                both_encoder_states,
                attention_states,
                parse_cell,
                num_decoder_symbols,
                embedding_size,
                output_size=output_size,
                output_projection=output_projection,
                feed_previous=feed_previous,
                initial_state_attention=initial_state_attention,
                attention_vec_size=attention_vec_size)

        # If feed_previous is a Tensor, we construct 2 graphs and use cond.
        def decoder(feed_previous_bool):
            reuse = None if feed_previous_bool else True
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(), reuse=reuse):
                outputs, state = many2one_embedding_attention_decoder(
                    decoder_inputs,
                    both_encoder_states,
                    attention_states,
                    parse_cell,
                    num_decoder_symbols,
                    embedding_size,
                    output_size=output_size,
                    output_projection=output_projection,
                    feed_previous=feed_previous_bool,
                    update_embedding_for_previous=False,
                    initial_state_attention=initial_state_attention,
                    attention_vec_size=attention_vec_size)
                return outputs + [state]

        outputs_and_state = control_flow_ops.cond(feed_previous,
                                                  lambda: decoder(True),
                                                  lambda: decoder(False))
        return outputs_and_state[:-1], outputs_and_state[-1]
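The Tensor-valued `feed_previous` branch above relies on a standard TF 1.x idiom: build both decoder graphs in the same variable scope, reusing variables for the second one, and pick between them with `cond`. A hedged, stripped-down sketch of that idiom (the toy variable and outputs are illustrative):

import tensorflow as tf  # TF 1.x graph mode assumed

feed_previous = tf.placeholder(tf.bool, shape=[])

def branch(feed_previous_bool):
    # The second branch reuses the variables created by the first, so both graphs share weights.
    reuse = None if feed_previous_bool else True
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
        w = tf.get_variable("w", shape=[], initializer=tf.zeros_initializer())
        return w + (1.0 if feed_previous_bool else 2.0)

out = tf.cond(feed_previous, lambda: branch(True), lambda: branch(False))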
Example #58
0
    def testColocationWithDeviceFn(self):
        original_graph_def = self._MakeGraphDef("""
          node { name: 'A' op: 'None' attr {
            key: '_class'
            value { list { s: 'loc:@A' } }
          } }
          node { name: 'B' op: 'None'  attr {
            key: '_class'
            value { list { s: 'loc:@A' } }
          } }""")

        # A device function that places "A" on one device and "B" on
        # another device.  Because B is colocated with A, we test that B's
        # device function is overridden by A.
        def CustomDeviceFn(op):
            if "A" in op.name:
                return "/device:A:0"
            else:
                return "/device:B:0"

        with ops.Graph().as_default():
            with ops.device(CustomDeviceFn):
                b, = importer.import_graph_def(original_graph_def,
                                               return_elements=["B"],
                                               name="imported_graph")

            self.assertProtoEqualsVersion(
                """
          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
                attr {
                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
                }
          }
          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
                attr {
                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
          } }""", b.graph.as_graph_def())

        # Test a scenario where 'A' doesn't get a device; 'A' should
        # not have a device, but during runtime will get colocated with
        # 'B' because of the colocation attribute.
        def BDeviceFn(op):
            if "B" in op.name:
                return "/device:B:0"
            return ""

        with ops.Graph().as_default():
            with ops.device(BDeviceFn):
                b, = importer.import_graph_def(original_graph_def,
                                               return_elements=["B"],
                                               name="imported_graph")

            self.assertProtoEqualsVersion(
                """
          node { name: 'imported_graph/A' op: 'None'
                attr {
                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
                }
          }
          node { name: 'imported_graph/B' op: 'None'
                attr {
                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
          } }""", b.graph.as_graph_def())

        # Only A gets a device, so B inherits it implicitly.
        def ADeviceFn(op):
            if "A" in op.name:
                return "/device:A:0"
            return ""

        with ops.Graph().as_default():
            with ops.device(ADeviceFn):
                b, = importer.import_graph_def(original_graph_def,
                                               return_elements=["B"],
                                               name="imported_graph")

            self.assertProtoEqualsVersion(
                """
          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
                attr {
                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
                }
          }
          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
                attr {
                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
          } }""", b.graph.as_graph_def())
Example #59
0
 def ambiguous_device(i):
     with ops.device('cpu:0'):
         return i + constant_op.constant([2])
Example #60
0
    def __init__(self,
                 dtype,
                 size=None,
                 dynamic_size=None,
                 clear_after_read=None,
                 tensor_array_name=None,
                 handle=None,
                 flow=None,
                 infer_shape=True,
                 element_shape=None,
                 colocate_with_first_write_call=True,
                 name=None):
        """Construct a new TensorArray or wrap an existing TensorArray handle.

    A note about the parameter `name`:

    The name of the `TensorArray` (even if passed in) is uniquified: each time
    a new `TensorArray` is created at runtime it is assigned its own name for
    the duration of the run.  This avoids name collisions if a `TensorArray`
    is created within a `while_loop`.

    Args:
      dtype: (required) data type of the TensorArray.
      size: (optional) int32 scalar `Tensor`: the size of the TensorArray.
        Required if handle is not provided.
      dynamic_size: (optional) Python bool: If true, writes to the TensorArray
        can grow the TensorArray past its initial size.  Default: False.
      clear_after_read: Boolean (optional, default: True).  If True, clear
        TensorArray values after reading them.  This disables read-many
        semantics, but allows early release of memory.
      tensor_array_name: (optional) Python string: the name of the TensorArray.
        This is used when creating the TensorArray handle.  If this value is
        set, handle should be None.
      handle: (optional) A `Tensor` handle to an existing TensorArray.  If this
        is set, tensor_array_name should be None.
      flow: (optional) A float `Tensor` scalar coming from an existing
        `TensorArray.flow`.
      infer_shape: (optional, default: True) If True, shape inference
        is enabled.  In this case, all elements must have the same shape.
      element_shape: (optional, default: None) A `TensorShape` object specifying
        the shape constraints of each of the elements of the TensorArray.
        Need not be fully defined.
      colocate_with_first_write_call: If `True`, the TensorArray will be
        colocated on the same device as the Tensor used on its first write
        (write operations include `write`, `unstack`, and `split`).  If `False`,
        the TensorArray will be placed on the device determined by the
        device context available during its initialization.
      name: A name for the operation (optional).

    Raises:
      ValueError: if both handle and tensor_array_name are provided.
      TypeError: if handle is provided but is not a Tensor.
    """
        if handle is not None and tensor_array_name:
            raise ValueError(
                "Cannot construct with both handle and tensor_array_name")
        if handle is not None and not isinstance(handle, ops.Tensor):
            raise TypeError("Handle must be a Tensor")
        if handle is None and size is None:
            raise ValueError("Size must be provided if handle is not provided")
        if handle is not None and size is not None:
            raise ValueError("Cannot provide both a handle and size "
                             "at the same time")
        if handle is not None and element_shape is not None:
            raise ValueError("Cannot provide both a handle and element_shape "
                             "at the same time")
        if handle is not None and dynamic_size is not None:
            raise ValueError("Cannot provide both a handle and dynamic_size "
                             "at the same time")
        if handle is not None and clear_after_read is not None:
            raise ValueError(
                "Cannot provide both a handle and clear_after_read "
                "at the same time")

        if clear_after_read is None:
            clear_after_read = True
        dynamic_size = dynamic_size or False

        self._dtype = dtype

        # Used to keep track of what tensors the TensorArray should be
        # colocated with.  We choose to colocate the TensorArray with the
        # first tensor written to it.
        self._colocate_with_first_write_call = colocate_with_first_write_call
        if colocate_with_first_write_call:
            self._colocate_with = []
        else:
            self._colocate_with = None

        # Record the current static shape for the array elements. The element
        # shape is defined either by `element_shape` or the shape of the tensor
        # of the first write. If `infer_shape` is true, every write is checked
        # for shape equality.
        if element_shape is None:
            self._infer_shape = infer_shape
            self._element_shape = []
        else:
            self._infer_shape = True
            self._element_shape = [tensor_shape.TensorShape(element_shape)]
        with ops.name_scope(name, "TensorArray",
                            [handle, size, flow]) as scope:
            if handle is not None:
                self._handle = handle
                if flow is None:
                    raise ValueError(
                        "flow must not be None if handle is not None.")
                self._flow = flow
            else:
                # Construct the TensorArray with an empty device.  The first
                # write into the TensorArray from a Tensor with a set device
                # will retroactively set the device value of this op.
                def create():
                    return gen_data_flow_ops._tensor_array_v3(
                        dtype=dtype,
                        size=size,
                        element_shape=element_shape,
                        dynamic_size=dynamic_size,
                        clear_after_read=clear_after_read,
                        tensor_array_name=tensor_array_name,
                        name=scope)

                if colocate_with_first_write_call:
                    with ops.device(None), ops.colocate_with(
                            None, ignore_existing=True):
                        self._handle, self._flow = create()
                else:
                    self._handle, self._flow = create()
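A hedged usage sketch for the constructor documented above, through the public `tf.TensorArray` API (sizes and values are illustrative; `clear_after_read=False` keeps elements readable after `stack`):

import tensorflow as tf  # TF 1.x graph mode assumed

ta = tf.TensorArray(dtype=tf.float32, size=3, clear_after_read=False)
ta = ta.write(0, tf.constant(1.0))  # each write returns a new TensorArray carrying the updated flow
ta = ta.write(1, tf.constant(2.0))
ta = ta.write(2, tf.constant(3.0))

stacked = ta.stack()  # shape [3]
first = ta.read(0)

with tf.Session() as sess:
    print(sess.run([stacked, first]))  # [array([1., 2., 3.], dtype=float32), 1.0]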