Example 1
  def decorated(self, **kwargs):
    """A wrapped test method that treats some arguments in a special way."""
    mode = kwargs.pop("mode", "graph")

    distribution = kwargs.get("distribution", None)
    required_tpu = kwargs.pop("required_tpu", False)
    required_gpus = kwargs.pop("required_gpus", None)

    if distribution:
      assert required_gpus is None, (
          "Do not use `required_gpus` and `distribution` together.")
      assert required_tpu is False, (
          "Do not use `required_tpu` and `distribution` together.")
      required_gpus = distribution.required_gpus
      required_tpu = distribution.required_tpu

    if required_tpu and not TPU_TEST:
      self.skipTest("Test requires a TPU, but it's not available.")
    if not required_tpu and TPU_TEST:
      self.skipTest("Test that doesn't require a TPU.")

    if not required_gpus:
      if GPU_TEST:
        self.skipTest("Test that doesn't require GPUs.")
    elif context.num_gpus() < required_gpus:
      self.skipTest(
          "{} GPUs are not available for this test. {} GPUs are available".
          format(required_gpus, context.num_gpus()))

    # At this point, `kwargs` no longer has the `mode`, `required_gpus` or
    # `required_tpu` entries that the user might have specified; `mode` was
    # captured above and is used to pick the execution context below.
    requested_arguments = tf_inspect.getfullargspec(test_method).args
    missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
        set(requested_arguments + ["mode"]))
    if missing_arguments:
      raise ValueError("The test is missing arguments {} .".format(
          missing_arguments))

    kwargs_to_pass = {}
    for arg in requested_arguments:
      if arg == "self":
        kwargs_to_pass[arg] = self
      else:
        kwargs_to_pass[arg] = kwargs[arg]

    if mode == "eager":
      with ops.Graph().as_default(), context.eager_mode():
        if distribution:
          kwargs_to_pass["distribution"] = distribution.strategy
        test_method(**kwargs_to_pass)
    elif mode == "graph":
      with ops.Graph().as_default(), context.graph_mode():
        if distribution:
          kwargs_to_pass["distribution"] = distribution.strategy
        test_method(**kwargs_to_pass)
    else:
      raise ValueError(
          "'mode' has to be either 'eager' or 'graph' and not {}".format(
              mode))
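The decorator above pops the framework-reserved arguments, skips the test when the required hardware is absent, and forwards only the arguments the wrapped method declares. Below is a minimal, self-contained sketch of that argument-filtering logic; `with_mode` is an illustrative name rather than the original API, and no TensorFlow is required to run it.

import inspect

def with_mode(test_method):
  """Illustrative wrapper mirroring the argument filtering shown above."""
  def decorated(self, **kwargs):
    mode = kwargs.get("mode", "graph")  # reserved; a test may accept or ignore it
    assert mode in ("eager", "graph"), mode
    requested = inspect.getfullargspec(test_method).args
    extra = set(list(kwargs.keys()) + ["self"]).difference(
        set(requested + ["mode"]))
    if extra:
      raise ValueError("The test is missing arguments {}.".format(extra))
    kwargs_to_pass = {}
    for arg in requested:
      kwargs_to_pass[arg] = self if arg == "self" else kwargs[arg]
    # The real decorator would enter a graph or eager context here, based on
    # `mode`, before invoking the test.
    return test_method(**kwargs_to_pass)
  return decorated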
Example 2
 def _get_distribution_strategy(self):
   devices = ["/device:CPU:0", "/device:GPU:0"]
   if GPU_TEST:
     self.assertGreater(context.num_gpus(), 0)
     if context.num_gpus() > 1:
       devices = ["/device:GPU:0", "/device:GPU:1"]
   print(self.id().split(".")[-1], "devices:", ", ".join(devices))
   return mirrored_strategy.MirroredStrategy(devices)
Example 3
  def test_end_to_end_keras_2_gpu(self):
    if context.num_gpus() < 2:
      self.skipTest(
          "{} GPUs are not available for this test. {} GPUs are available".
          format(2, context.num_gpus()))

    integration.run_synthetic(
        ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
        extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '2'])
Example 4
def maybe_skip_test(test_case, is_tpu_required, num_gpus_required):
  if is_tpu_required and not TPU_TEST:
    test_case.skipTest("Test requires a TPU, but it's not available.")
  if not is_tpu_required and TPU_TEST:
    test_case.skipTest("Test that doesn't require a TPU.")

  if not num_gpus_required:
    if GPU_TEST:
      test_case.skipTest("Test that doesn't require GPUs.")
  elif context.num_gpus() < num_gpus_required:
    # TODO(priyag): Consider allowing tests in graph mode using soft
    # placement.
    test_case.skipTest(
        "{} GPUs are not available for this test. {} GPUs are available".format(
            num_gpus_required, context.num_gpus()))
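A hypothetical call site for the helper above, assuming the surrounding module defines `TPU_TEST`, `GPU_TEST` and `context` as in the snippet; the body runs only when at least two GPUs are available and this is not a TPU run.

import unittest

class TwoGpuTest(unittest.TestCase):

  def test_needs_two_gpus(self):
    maybe_skip_test(self, is_tpu_required=False, num_gpus_required=2)
    # ... actual test body ...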
Example 5
 def testInstantError(self):
   if context.num_gpus():
     # TODO(nareshmodi): make this test better
     self.skipTest("Gather doesn't do index checking on GPUs")
   with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                r'indices = 7 is not in \[0, 3\)'):
     array_ops.gather([0, 1, 2], 7)
Example 6
 def benchmark_defun_matmul_100_by_784_GPU(self):
   if not context.num_gpus():
     return
   with context.device(GPU):
     m = self._m_100_by_784.gpu()
     self._benchmark_defun_matmul(
         m, transpose_b=True, num_iters=self._num_iters_100_by_784)
Example 7
 def testVariableInitialization(self, num_gpus):
   if context.num_gpus() < num_gpus:
     return
   self._run_between_graph_clients(
       self._test_variable_initialization,
       self._cluster_spec,
       num_gpus=num_gpus)
Example 8
 def testVariableInitialization(self, num_gpus):
   if context.num_gpus() < num_gpus:
     self.skipTest('Not enough GPUs')
   self._run_between_graph_clients(
       self._test_variable_initialization,
       self._cluster_spec,
       num_gpus=num_gpus)
Example 9
 def testNumpyIterator(self, use_core_strategy):
   num_gpus = 2
   if context.num_gpus() < num_gpus:
     self.skipTest('Not enough GPUs')
   strategy, _, _ = self._get_test_object(
       None, None, num_gpus=num_gpus, use_core_strategy=use_core_strategy)
   self._test_numpy_iterator(strategy)
Example 10
  def testSaveAndRestoreMirroredOneGraph(self):
    if context.num_gpus() < 1 and context.executing_eagerly():
      # Graph mode can work without GPU because the Placer "moves" the
      # variable to a CPU. In other words, if there is no GPU available, but
      # user requested to create a variable on GPU, Placer will ignore the
      # user request and assign the VarHandleOp to CPU. This requires
      # soft_placement, which is on by default.
      self.skipTest("A GPU is not available for this test in eager mode.")

    with self.cached_session(config=self.config) as sess:
      v, device_map, mirrored = _make_mirrored()
      devices = device_map.all_devices

      # Overwrite the initial values.
      self._assign_mirrored(devices, v, [3., 4.])

      # Saves the current value of v[0], 3.
      save_path, saver = self._save_return_saver(sess, mirrored)

      # Change the values between save and restore.
      self._assign_mirrored(devices, v, [5., 6.])

      # Restores the saved value of 3. to both variables.
      saver.restore(sess, save_path)
      self.assertEqual([3., 3.], self.evaluate([v[0], v[1]]))
Example 11
 def _get_distribution_strategy(self):
   cluster_spec = server_lib.ClusterSpec({
       "worker": ["/job:worker/task:0", "/job:worker/task:1"]
   })
   strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
   strategy.configure(cluster_spec=cluster_spec)
   return strategy
Example 12
  def __init__(self,
               devices=None,
               num_gpus=None,
               cross_tower_ops=None,
               prefetch_on_device=None):
    super(MirroredStrategy, self).__init__()
    # Convert `num_gpus` into `devices`; the caller shouldn't specify both.
    if devices is None:
      if num_gpus is None:
        num_gpus = context.num_gpus()
      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
    elif num_gpus is not None:
      raise ValueError("Must only specify one of `devices` and `num_gpus`.")

    assert devices, "Must specify at least one device."
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument.")
    # TODO(josh11b): Require at least 2 devices?
    self._devices = devices
    self._canonical_device_set = set(
        [device_util.canonicalize(d) for d in devices])
    self._device_index = values.PerDevice(
        dict((d, i) for i, d in enumerate(devices)))
    self._cross_tower_ops = cross_tower_ops
    self._prefetch_on_device = prefetch_on_device
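The constructor above makes `devices` and `num_gpus` mutually exclusive. A short usage sketch, assuming the class is importable from the surrounding module:

# Either enumerate the devices explicitly...
strategy = MirroredStrategy(devices=["/device:GPU:0", "/device:GPU:1"])
# ...or give a count and let the constructor build the device list:
strategy = MirroredStrategy(num_gpus=2)
# Passing both raises ValueError; passing neither mirrors across all GPUs
# reported by context.num_gpus().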
Example 13
 def _get_strategy_object(self, strategy_cls, eval_strategy=False):
   if strategy_cls == mirrored_strategy.CoreMirroredStrategy:
     if eval_strategy:
       return strategy_cls()
     else:
       return strategy_cls(
           cross_device_ops=self._make_cross_device_ops(
               num_gpus_per_worker=context.num_gpus()))
   elif (strategy_cls == mirrored_strategy.MirroredStrategy and
         not eval_strategy):
     return strategy_cls(
         num_gpus_per_worker=context.num_gpus(),
         cross_device_ops=self._make_cross_device_ops(
             num_gpus_per_worker=context.num_gpus()))
   else:
     return strategy_cls(num_gpus_per_worker=context.num_gpus())
Example 14
  def testMakeInputFnIteratorDistributed(self, num_gpus, use_core_strategy,
                                         use_dataset):
    if context.num_gpus() < num_gpus:
      self.skipTest('Not enough GPUs')
    if use_dataset:
      fn = lambda: dataset_ops.Dataset.range(100)
    else:
      def fn():
        dataset = dataset_ops.Dataset.range(100)
        it = dataset.make_one_shot_iterator()
        return it.get_next
    expected_values = [[i+j for j in range(num_gpus)]
                       for i in range(0, 100, num_gpus)]

    input_fn = self._input_fn_to_test_input_context(
        fn,
        expected_num_replicas_in_sync=num_gpus,
        expected_num_input_pipelines=3,
        expected_input_pipeline_id=1)  # because task_id = 1
    self._test_input_fn_iterator(
        'worker',
        1,
        num_gpus,
        input_fn,
        expected_values,
        test_reinitialize=use_dataset,
        use_core_strategy=use_core_strategy)
Example 15
def _all_devices():
  devices = []
  tfconfig = TFConfigClusterResolver()
  if tfconfig.cluster_spec().as_dict():
    devices = _cluster_spec_to_device_list(tfconfig.cluster_spec(),
                                           context.num_gpus())
  return devices if devices else all_local_devices()
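`_all_devices` prefers the cluster-derived device list and falls back to local devices only when no cluster is configured. An illustration of the fallback, assuming `TFConfigClusterResolver` reads the standard `TF_CONFIG` environment variable:

import json
import os

os.environ["TF_CONFIG"] = json.dumps({})  # no cluster configured
devices = _all_devices()  # empty cluster spec -> all_local_devices()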
Example 16
    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      # TODO(rchao/yuefengz): The following is run by both worker and ps
      # threads. The distribute coordinator should run std server immediately
      # without configuring the session (or building the graph) on PS.
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        batch_size = 64
        steps = 10
        strategy = strategy_cls(num_gpus_per_worker=context.num_gpus())
        verification_callback.is_between_graph = \
            strategy.extended.experimental_between_graph

        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))

          # TODO(b/123868066): Verify callback for model.evaluate().
          callbacks_for_fit = nest.flatten(
              kwargs.get('verification_callback', []))
          history = model.fit(
              x=train_ds,
              epochs=num_epoch,
              steps_per_epoch=steps,
              validation_data=val_ds,
              validation_steps=steps,
              callbacks=callbacks_for_fit)
        self.assertIsInstance(history, keras.callbacks.History)
Example 17
  def DISABLED_testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
                                            use_dataset):
    if context.num_gpus() < num_gpus:
      self.skipTest('Not enough GPUs')
    if use_dataset:
      fn = lambda: dataset_ops.Dataset.range(100)
    else:
      def fn():
        dataset = dataset_ops.Dataset.range(100)
        it = dataset.make_one_shot_iterator()
        return it.get_next
    expected_values = [[i+j for j in range(num_gpus)]
                       for i in range(0, 100, num_gpus)]

    input_fn = self._input_fn_to_test_input_context(
        fn,
        expected_num_replicas_in_sync=num_gpus,
        expected_num_input_pipelines=1,
        expected_input_pipeline_id=0)  # only one worker and pipeline for local.
    self._test_input_fn_iterator(
        None,
        None,
        num_gpus,
        input_fn,
        expected_values,
        test_reinitialize=use_dataset,
        use_core_strategy=use_core_strategy)
Example 18
 def benchmark_read_variable_op_with_tape_2_by_2_GPU(self):
   if not context.num_gpus():
     return
   with context.device(GPU):
     m = resource_variable_ops.ResourceVariable(self._m_2_by_2.gpu())
     self._benchmark_read_variable_with_tape(
         m, num_iters=self._num_iters_2_by_2)
Example 19
  def DISABLED_testMakeInputFnIterator(self, num_gpus, use_dataset,
                                       use_core_strategy):
    if context.num_gpus() < num_gpus:
      self.skipTest('Not enough GPUs')
    if use_dataset:
      fn = lambda: dataset_ops.Dataset.range(100)
    else:
      def fn():
        dataset = dataset_ops.Dataset.range(100)
        it = dataset.make_one_shot_iterator()
        return it.get_next
    # We use CPU as the device when num_gpus = 0
    devices_per_worker = max(1, num_gpus)
    expected_values = [[i+j for j in range(devices_per_worker)]
                       for i in range(0, 100, devices_per_worker)]

    input_fn = self._input_fn_to_test_input_context(
        fn,
        expected_num_replicas_in_sync=3*devices_per_worker,
        expected_num_input_pipelines=3,
        expected_input_pipeline_id=1)  # because task_id = 1
    self._test_input_fn_iterator(
        'worker',
        1,
        num_gpus,
        input_fn,
        expected_values,
        test_reinitialize=use_dataset,
        use_core_strategy=use_core_strategy)
Example 20
 def benchmark_defun_matmul_2_by_2_GPU(self):
   if not context.num_gpus():
     return
   with context.device(GPU):
     m = self._m_2_by_2.gpu()
     self._benchmark_defun_matmul(
         m, transpose_b=False, num_iters=self._num_iters_2_by_2)
Example 21
 def testComplexModel(self, num_gpus, use_core_strategy):
   if context.num_gpus() < num_gpus:
     self.skipTest('Not enough GPUs')
   self._run_between_graph_clients(
       self._test_complex_model,
       self._cluster_spec,
       num_gpus=num_gpus,
       use_core_strategy=use_core_strategy)
Example 22
 def testDataDistributionTwoDevicePerWorker(self):
   if context.num_gpus() < 1:
     self.skipTest("A GPU is not available for this test.")
   worker_device_map, devices = self._cpu_and_one_gpu_devices()
   with context.graph_mode():
     dataset_fn = lambda: dataset_ops.Dataset.range(8)
     self._test_dataset(dataset_fn, worker_device_map, devices,
                        [[0, 2, 1, 3], [4, 6, 5, 7]])
Example 23
 def testReductionDistributed(self, num_gpus, use_strategy_object):
   if context.num_gpus() < num_gpus:
     return
   self._run_between_graph_clients(
       self._test_reduction,
       self._cluster_spec,
       num_gpus,
       use_strategy_object=use_strategy_object)
Example 24
 def testTwoDevicesPerWorker(self):
   if context.num_gpus() < 1:
     self.skipTest("A GPU is not available for this test.")
   worker_devices = self._cpu_and_one_gpu_devices()
   with context.graph_mode(), self.cached_session() as sess:
     input_fn = lambda: dataset_ops.Dataset.range(4)
     self._test_iterator(input_fn, worker_devices,
                         [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
Example 25
 def setUp(self):
   super(SummaryWriterTest, self).setUp()
   self._test_device = "gpu:0" if context.num_gpus() else "cpu:0"
   self._tmp_logdir = tempfile.mkdtemp()
   with context.device(self._test_device):
     # Use max_queue=0 so that summaries are immediately flushed to filesystem,
     # making testing easier.
     self._writer = summary_writer.SummaryWriter(self._tmp_logdir, max_queue=0)
Example 26
    def decorated(self, **kwargs):
      """A wrapped test method that sets up `test_function`."""
      assert "mode" in kwargs
      mode = kwargs["mode"]

      if "distribution" in kwargs:
        distribution = kwargs["distribution"]
        kwargs["distribution"] = distribution.strategy
        if distribution.required_tpu and not TPU_TEST:
          self.skipTest("Test requires a TPU, but it's not available.")
        if not distribution.required_tpu and TPU_TEST:
          self.skipTest("Test that doesn't require a TPU.")

        if not distribution.required_gpus:
          if GPU_TEST:
            self.skipTest("Test that doesn't require GPUs.")
        elif context.num_gpus() < distribution.required_gpus:
          self.skipTest(
              "{} GPUs are not available for this test. {} GPUs are available".
              format(distribution.required_gpus, context.num_gpus()))

      requested_arguments = tf_inspect.getfullargspec(test_function).args
      missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
          set(requested_arguments + ["mode"]))
      if missing_arguments:
        raise ValueError("The test is missing arguments {} .".format(
            missing_arguments))

      kwargs_to_pass = {}
      for arg in requested_arguments:
        if arg == "self":
          kwargs_to_pass[arg] = self
        else:
          kwargs_to_pass[arg] = kwargs[arg]

      if mode == "eager":
        with context.eager_mode(), ops.Graph().as_default():
          test_function(**kwargs_to_pass)
      elif mode == "graph":
        with context.graph_mode(), ops.Graph().as_default():
          test_function(**kwargs_to_pass)
      else:
        raise ValueError(
            "'mode' has to be either 'eager' or 'graph' and not {}".format(
                mode))
Example 27
  def test_complete_flow_standalone_client(self, train_distribute_cls,
                                           eval_distribute_cls):
    train_distribute = train_distribute_cls(
        num_gpus_per_worker=context.num_gpus())

    if eval_distribute_cls:
      eval_distribute = eval_distribute_cls(
          num_gpus_per_worker=context.num_gpus())
    else:
      eval_distribute = None

    cluster_spec = copy.deepcopy(self._cluster_spec)
    if (train_distribute_cls !=
        parameter_server_strategy.ParameterServerStrategy):
      cluster_spec.pop("ps", None)
    estimator = self._complete_flow(train_distribute, eval_distribute,
                                    cluster_spec)
    self._inspect_train_and_eval_events(estimator)
Example 28
  def testProperties(self):
    if context.num_gpus() < 1 and context.executing_eagerly():
      self.skipTest("A GPU is not available for this test in eager mode.")

    v, _, mirrored = _make_mirrored()

    self.assertEqual(v[0].name, mirrored.name)
    self.assertEqual(v[0].dtype, mirrored.dtype)
    self.assertEqual(v[0].shape, mirrored.shape)
Example 29
 def testMinimizeLoss(self, num_gpus):
   # Collective ops don't support a strategy with a single device.
   if context.num_gpus() < num_gpus:
     self.skipTest('Not enough GPUs')
   if context.executing_eagerly():
     strategy, _, _ = self._get_test_object(None, None, num_gpus)
     self._test_minimize_loss_eager(strategy)
   else:
     self._test_minimize_loss_graph(None, None, num_gpus)
Example 30
  def call(self, inputs, mask=None, training=None, initial_state=None):
    if isinstance(inputs, list):
      initial_state = inputs[1:]
      inputs = inputs[0]
    elif initial_state is not None:
      pass
    elif self.stateful:
      initial_state = self.states
    else:
      initial_state = self.get_initial_state(inputs)

    if len(initial_state) != len(self.states):
      raise ValueError('Layer has ' + str(len(self.states)) +
                       ' states but was passed ' + str(len(initial_state)) +
                       ' initial states.')

    if self.go_backwards:
      # Reverse time axis.
      inputs = K.reverse(inputs, 1)

    if ops.executing_eagerly_outside_functions():
      if context.num_gpus() > 0:
        outputs, [new_h, new_c], runtime = cudnn_lstm(
            inputs, initial_state[0], initial_state[1], self.kernel,
            self.recurrent_kernel, self.bias, self.units)
      else:
        outputs, [new_h, new_c], runtime = normal_lstm(
            inputs, initial_state[0], initial_state[1], self.kernel,
            self.recurrent_kernel, self.bias, self.units, self.activation,
            self.recurrent_activation)
    else:
      outputs, [new_h, new_c], runtime = normal_lstm(
          inputs, initial_state[0], initial_state[1], self.kernel,
          self.recurrent_kernel, self.bias, self.units, self.activation,
          self.recurrent_activation)

      function.register(cudnn_lstm, inputs, initial_state[0], initial_state[1],
                        self.kernel, self.recurrent_kernel, self.bias,
                        self.units)

    states = [new_h, new_c]

    if self.stateful:
      updates = []
      for i in range(len(states)):
        updates.append(state_ops.assign(self.states[i], states[i]))
      self.add_update(updates, inputs)

    if self.return_sequences:
      output = outputs
    else:
      output = outputs[:, -1, :]

    if self.return_state:
      return [output] + states
    else:
      return output, runtime
Example 31
 def testPrefetchToDevice(self):
     if not context.num_gpus():
         self.skipTest("No GPU available")
     dataset = dataset_ops.Dataset.range(10)
     dataset = dataset.apply(prefetching_ops.prefetch_to_device("/gpu:0"))
Example 32
 def testVariableInitialization(self, num_gpus):
     if context.num_gpus() < num_gpus:
         self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(self._test_variable_initialization,
                                     self._cluster_spec,
                                     num_gpus=num_gpus)
Example 33
 def testComplexModel(self, num_gpus):
     if context.num_gpus() < num_gpus:
         self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(self._test_complex_model,
                                     self._cluster_spec,
                                     num_gpus=num_gpus)
Example 34
  def _test_device_assignment_local(self,
                                    d,
                                    compute_device='CPU',
                                    variable_device='CPU',
                                    num_gpus=0):
    with ops.Graph().as_default(), \
         self.cached_session(target=self._default_target,
                             config=self._sess_config) as sess, \
         d.scope():

      def model_fn():
        if 'CPU' in compute_device:
          replica_compute_device = '/device:CPU:0'
        else:
          replica_id = _get_replica_id_integer()
          replica_compute_device = ('/device:GPU:%d' % replica_id)
        replica_compute_device = device_util.canonicalize(
            replica_compute_device)

        if 'CPU' in variable_device:
          replica_variable_device = '/device:CPU:0'
        else:
          replica_id = _get_replica_id_integer()
          replica_variable_device = ('/device:GPU:%d' % replica_id)
        replica_variable_device = device_util.canonicalize(
            replica_variable_device)

        a = constant_op.constant(1.0)
        b = constant_op.constant(2.0)
        c = a + b
        self.assertEqual(a.device, replica_compute_device)
        self.assertEqual(b.device, replica_compute_device)
        self.assertEqual(c.device, replica_compute_device)

        # The device scope is ignored for variables but not for normal ops.
        with ops.device('/device:GPU:2'):
          x = variable_scope.get_variable(
              'x', initializer=10.0,
              aggregation=variable_scope.VariableAggregation.SUM)
          x_add = x.assign_add(c)
          e = a + c
        self.assertEqual(
            device_util.canonicalize(x.device), replica_variable_device)
        self.assertEqual(x_add.device, x.device)
        self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))

        # The colocate_vars_with can override the distribution's device.
        with d.extended.colocate_vars_with(x):
          y = variable_scope.get_variable(
              'y', initializer=20.0,
              aggregation=variable_scope.VariableAggregation.SUM)
        # We add an identity here to avoid complaints about summing
        # non-distributed values.
        y_add = y.assign_add(array_ops.identity(x_add))
        self.assertEqual(
            device_util.canonicalize(y.device), replica_variable_device)
        self.assertEqual(y_add.device, y.device)
        self.assertEqual(y.device, x.device)

        z = variable_scope.get_variable(
            'z', initializer=10.0,
            aggregation=variable_scope.VariableAggregation.SUM)
        self.assertEqual(
            device_util.canonicalize(z.device), replica_variable_device)

        with ops.control_dependencies([y_add]):
          # We add an identity here to avoid complaints about summing
          # non-distributed values.
          z_add = z.assign_add(array_ops.identity(y))
        with ops.control_dependencies([z_add]):
          f = z + c
        self.assertEqual(f.device, replica_compute_device)

        # The device scope would merge with the default worker device.
        with ops.device('/CPU:1'):
          g = e + 1.0
        self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1'))

        # The ops.colocate_with will be ignored when defining a variable but
        # not for a normal tensor.
        with ops.colocate_with(x):
          u = variable_scope.get_variable('u', initializer=30.0)
          h = f + 1.0
        self.assertEqual(
            device_util.canonicalize(u.device), replica_variable_device)
        self.assertEqual(
            device_util.canonicalize(x.device),
            device_util.canonicalize(h.device))
        return y_add, z_add, f

      y, z, f = d.extended.call_for_each_replica(model_fn)
      self.assertNotEqual(y, None)
      self.assertNotEqual(z, None)
      self.assertNotEqual(f, None)

      if context.num_gpus() >= 1 and num_gpus <= 1:
        variables.global_variables_initializer().run()
        y_val, z_val, f_val = sess.run([y, z, f])
        self.assertEqual(y_val, 33.0)
        self.assertEqual(z_val, 43.0)
        self.assertEqual(f_val, 46.0)
Example 35
  def decorated(self, **kwargs):
    """A wrapped test method that treats some arguments in a special way."""
    mode = kwargs.pop("mode", "graph")

    distribution = kwargs.get("distribution", None)
    required_tpu = kwargs.pop("required_tpu", False)
    required_gpus = kwargs.pop("required_gpus", None)

    if distribution:
      assert required_gpus is None, (
          "Do not use `required_gpus` and `distribution` together.")
      assert required_tpu is False, (
          "Do not use `required_tpu` and `distribution` together.")
      required_gpus = distribution.required_gpus
      required_tpu = distribution.required_tpu

    if required_tpu and not TPU_TEST:
      self.skipTest("Test requires a TPU, but it's not available.")
    if not required_tpu and TPU_TEST:
      self.skipTest("Test that doesn't require a TPU.")

    if not required_gpus:
      if GPU_TEST:
        self.skipTest("Test that doesn't require GPUs.")
    elif context.num_gpus() < required_gpus:
      # TODO(priyag): Consider allowing tests in graph mode using soft
      # placement.
      self.skipTest(
          "{} GPUs are not available for this test. {} GPUs are available".
          format(required_gpus, context.num_gpus()))

    # At this point, `kwargs` no longer has the `mode`, `required_gpus` or
    # `required_tpu` entries that the user might have specified; `mode` was
    # captured above and is used to pick the execution context below.
    requested_arguments = tf_inspect.getfullargspec(test_method).args
    missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
        set(requested_arguments + ["mode"]))
    if missing_arguments:
      raise ValueError("The test is missing arguments {} .".format(
          missing_arguments))

    kwargs_to_pass = {}
    for arg in requested_arguments:
      if arg == "self":
        kwargs_to_pass[arg] = self
      else:
        kwargs_to_pass[arg] = kwargs[arg]

    if mode == "eager":
      with context.eager_mode():
        if distribution:
          kwargs_to_pass["distribution"] = distribution.strategy
        test_method(**kwargs_to_pass)
    elif mode == "graph":
      with ops.Graph().as_default(), context.graph_mode():
        if distribution:
          kwargs_to_pass["distribution"] = distribution.strategy
        test_method(**kwargs_to_pass)
    else:
      raise ValueError(
          "'mode' has to be either 'eager' or 'graph' and not {}".format(
              mode))
Example 36
def all_local_devices(num_gpus=None):
    if num_gpus is None:
        num_gpus = context.num_gpus()
    return device_util.local_devices_from_num_gpus(num_gpus)
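Hedged usage notes for the helper above; the exact return format comes from `device_util.local_devices_from_num_gpus` and is assumed here.

all_local_devices()            # detect GPUs via context.num_gpus()
all_local_devices(num_gpus=0)  # force CPU-only, e.g. ("/device:CPU:0",)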
Example 37
  def _initialize_multi_worker(self, cluster_resolver):
    """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      cluster_resolver: a descendant of `ClusterResolver` object.

    Raises:
      ValueError: if the cluster doesn't have ps jobs.
    """
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if isinstance(cluster_resolver, TFConfigClusterResolver):
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

    # Save the num_gpus_per_worker for configure method.
    self._num_gpus_per_worker = num_gpus

    cluster_spec = cluster_resolver.cluster_spec()
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if not task_type or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    assert cluster_spec.as_dict()

    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._input_host_device = numpy_dataset.SingleDevice(worker_device)

    # Define compute devices, a list of device strings with one entry per
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on the CPU.
    if num_gpus > 0:
      compute_devices = tuple(
          "%s/device:GPU:%d" % (worker_device, i) for i in range(num_gpus))
    else:
      compute_devices = (worker_device,)

    self._compute_devices = [
        device_util.canonicalize(d) for d in compute_devices]
    self._input_workers = input_lib.InputWorkers(
        [(worker_device, compute_devices)])

    # In distributed mode, place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
      raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job.
    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
                                        range(num_ps_replicas)))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
        num_ps_replicas, self._is_chief, self._compute_devices,
        self._variable_device)
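The comments above describe assigning variables to ps tasks in a round-robin fashion via `replica_device_setter`. A self-contained sketch of that placement policy (illustrative only, not the actual device function):

def make_round_robin_ps_placer(num_ps_replicas):
  """Returns a callable assigning each successive variable to the next ps task."""
  state = {"next": 0}
  def placer():
    device = "/job:ps/task:%d" % (state["next"] % num_ps_replicas)
    state["next"] += 1
    return device
  return placer

placer = make_round_robin_ps_placer(3)
print([placer() for _ in range(4)])
# ['/job:ps/task:0', '/job:ps/task:1', '/job:ps/task:2', '/job:ps/task:0']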
Example 38
 def benchmark_read_variable_op_2_by_2_GPU(self):
     if not context.num_gpus():
         return
     with context.device(GPU):
         m = resource_variable_ops.ResourceVariable(self._m_2_by_2.gpu())
         self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2)
Example 39
  def _test_device_assignment_distributed_enable_partitioner(
      self, task_type, task_id, num_gpus):
    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
    num_shards = len(d.parameter_devices)
    partitioner = partitioned_variables.fixed_size_partitioner(num_shards)
    with ops.Graph().as_default(), \
         self.cached_session(target=self._default_target,
                             config=sess_config) as sess, \
         d.scope():

      n = variable_scope.get_variable(
          'n',
          initializer=constant_op.constant([10.0, 20.0]),
          aggregation=variable_scope.VariableAggregation.SUM,
          partitioner=partitioner)

      for part_id, var in enumerate(n):
        self.assertEqual(var.device, '/job:ps/task:%d' % part_id)

      def model_fn():
        a = constant_op.constant([3.0, 5.0])
        # The device scope is ignored for variables but not for normal ops.
        with ops.device('/job:worker/task:0'):
          x = variable_scope.get_variable(
              'x',
              initializer=constant_op.constant([10.0, 20.0]),
              aggregation=variable_scope.VariableAggregation.SUM,
              partitioner=partitioner)
          x_add = x.assign_add(a, name='x_add')
        # The variable x is on task 1 since the device function has been
        # called once before the model_fn.
        for part_id, var in enumerate(x):
          self.assertEqual(var.device, '/job:ps/task:%d' % part_id)
          self.assertEqual(var.device, x_add[part_id].device)

        # The colocate_vars_with can override the distribution's device.
        with d.colocate_vars_with(x_add[0]):
          y = variable_scope.get_variable(
              'y',
              initializer=constant_op.constant([20.0, 10.0]),
              aggregation=variable_scope.VariableAggregation.SUM,
              partitioner=partitioner)
        y_add = y.assign_add(
            [array_ops.identity(x_add[0]),
             array_ops.identity(x_add[1])])

        for part_id, var in enumerate(y):
          self.assertEqual(var.device, '/job:ps/task:0')
          self.assertEqual(y_add[part_id].device, var.device)
          self.assertEqual(var.device, x_add[0].device)

        return x_add, y_add

      x, y = d.call_for_each_replica(model_fn)

      if context.num_gpus() >= 1:
        variables.global_variables_initializer().run()
        x_val, y_val = sess.run([x, y])
        if num_gpus < 1:
          self.assertEqual(x_val, [13.0, 25.0])
          self.assertEqual(y_val, [33.0, 35.0])
        else:
          x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus]
          y_expect = [
              20.0 + x_expect[0] * num_gpus, 10.0 + x_expect[1] * num_gpus
          ]
          self.assertEqual(x_val, x_expect)
          self.assertEqual(y_val, y_expect)
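The assertions above rely on `fixed_size_partitioner(num_shards)` splitting a variable into contiguous shards, with shard i living on ps task i. A self-contained sketch of that split (the real partitioner works on variable shapes, not Python lists):

def fixed_size_partition(values, num_shards):
  shard_size = (len(values) + num_shards - 1) // num_shards
  return [values[i * shard_size:(i + 1) * shard_size]
          for i in range(num_shards)]

print(fixed_size_partition([10.0, 20.0], 2))  # [[10.0], [20.0]]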
Example 40
 def testVariableInitialization(self, num_gpus):
     if context.num_gpus() < num_gpus:
         return
     self._run_between_graph_clients(self._test_variable_initialization,
                                     self._cluster_spec,
                                     num_gpus=num_gpus)
Example 41
  def _test_simple_increment(self,
                             task_type,
                             task_id,
                             num_gpus,
                             use_core_strategy=False):
    d, master_target, sess_config = self._get_test_objects(
        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
    if d.extended._cluster_spec:
      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
      if 'chief' in d.extended._cluster_spec.as_dict():
        num_workers += 1
    else:
      num_workers = 1
    with ops.Graph().as_default(), \
         self.cached_session(target=master_target,
                             config=sess_config) as sess, \
         d.scope():

      def model_fn():
        x = variable_scope.get_variable(
            'x', initializer=10.0,
            aggregation=variable_scope.VariableAggregation.SUM)
        y = variable_scope.get_variable(
            'y', initializer=20.0,
            aggregation=variable_scope.VariableAggregation.SUM)
        z = variable_scope.get_variable(
            'z', initializer=30.0,
            aggregation=variable_scope.VariableAggregation.ONLY_FIRST_REPLICA)

        # We explicitly make a constant tensor here to avoid complaints about
        # summing non-distributed values.
        one = constant_op.constant(1.0)
        x_add = x.assign_add(one, use_locking=True)
        y_add = y.assign_add(one, use_locking=True)
        z_add = z.assign_add(one, use_locking=True)

        train_op = control_flow_ops.group(x_add, y_add, z_add)
        return x, y, z, train_op

      x, y, z, train_op = d.extended.call_for_each_replica(model_fn)
      train_op = d.group(train_op)

      if context.num_gpus() < d.extended._num_gpus_per_worker:
        return True

      if task_id == 0:
        variables.global_variables_initializer().run()

      # Workers wait for the chief worker to finish initializing variables.
      self._init_condition.acquire()
      self._init_reached += 1
      while self._init_reached != num_workers:
        self._init_condition.wait()
      self._init_condition.notify_all()
      self._init_condition.release()

      sess.run(train_op)

      # Wait for other workers to finish training.
      self._finish_condition.acquire()
      self._finish_reached += 1
      while self._finish_reached != num_workers:
        self._finish_condition.wait()
      self._finish_condition.notify_all()
      self._finish_condition.release()

      x_val, y_val, z_val = sess.run([x, y, z])
      self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas_in_sync)
      self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync)
      self.assertEqual(z_val, 30.0 + 1.0 * num_workers)
      return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas_in_sync and
              y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and
              z_val == 30.0 + 1.0 * num_workers)
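The init/finish synchronization above is a hand-rolled barrier built on a condition variable. A self-contained equivalent of the pattern, for reference:

import threading

class SimpleBarrier:
  """Mirrors the acquire/wait/notify_all/release dance in the test above."""

  def __init__(self, num_workers):
    self._condition = threading.Condition()
    self._reached = 0
    self._num_workers = num_workers

  def wait(self):
    self._condition.acquire()
    self._reached += 1
    while self._reached != self._num_workers:
      self._condition.wait()
    self._condition.notify_all()
    self._condition.release()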
Example 42
 def _dev(self):
   return '/device:GPU:0' if context.num_gpus() else '/device:CPU:0'
Example 43
 def benchmark_create_int32_tensor_from_np_array_GPU(self):
     # int32 values are kept in host memory even when executing on a GPU.
     if not context.num_gpus():
         return
     self._benchmark_create_tensor(np.array([[3]], dtype=np.int32),
                                   dtypes.int32.as_datatype_enum, GPU)
Example 44
def create_mirrored_strategy():
    if context.num_gpus() >= 1:
        return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
    else:
        return mirrored_strategy.MirroredStrategy(['cpu:0'])
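A short usage sketch for the helper above (TF 1.x-era API names, as elsewhere on this page); variables created under the scope are mirrored across the chosen devices:

strategy = create_mirrored_strategy()
with strategy.scope():
  pass  # build mirrored variables / models here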
Example 45
 def testSimpleBetweenGraph(self):
   self._run_between_graph_clients(self._test_simple_increment,
                                   self._cluster_spec, context.num_gpus())
Example 46
 def benchmark_create_float_tensor_from_np_array_GPU(self):
     if not context.num_gpus():
         return
     self._benchmark_create_tensor(np.array([[3.0]], dtype=np.float32),
                                   dtypes.float32.as_datatype_enum, GPU)
Example 47
 def testGlobalStepUpdate(self):
   strategy = parameter_server_strategy.ParameterServerStrategy(
       num_gpus_per_worker=context.num_gpus())
   self._test_global_step_update(strategy)
Example 48
 def testComplexModel(self, num_gpus):
     if context.num_gpus() < num_gpus:
         return
     self._run_between_graph_clients(self._test_complex_model,
                                     self._cluster_spec,
                                     num_gpus=num_gpus)
Example 49
 def testComplexModel(self, num_gpus=2):
     # Collective ops don't support a strategy with a single device.
     if context.num_gpus() < num_gpus:
         self.skipTest('Not enough GPUs')
     self._test_complex_model(None, None, num_gpus)
Example 50
  def _test_device_assignment_distributed(self,
                                          task_type,
                                          task_id,
                                          num_gpus,
                                          use_core_strategy=False):
    worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id)
    d, _, sess_config = self._get_test_objects(
        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
    with ops.Graph().as_default(), \
         self.cached_session(target=self._default_target,
                             config=sess_config) as sess, \
         d.scope():

      # Define a variable outside the call_for_each_replica scope.
      n = variable_scope.get_variable('n', initializer=10.0)
      self.assertEqual(n.device, '/job:ps/task:0')

      def model_fn():
        if num_gpus == 0:
          last_part_device = 'device:CPU:0'
        else:
          replica_id = _get_replica_id_integer()
          last_part_device = ('device:GPU:%d' % replica_id)

        a = constant_op.constant(1.0)
        b = constant_op.constant(2.0)
        c = a + b
        self.assertEqual(a.device, worker_device + '/' + last_part_device)
        self.assertEqual(b.device, worker_device + '/' + last_part_device)
        self.assertEqual(c.device, worker_device + '/' + last_part_device)

        # The device scope is ignored for variables but not for normal ops.
        with ops.device('/job:worker/task:0'):
          x = variable_scope.get_variable(
              'x', initializer=10.0,
              aggregation=variable_scope.VariableAggregation.SUM)
          x_add = x.assign_add(c)
          e = a + c
        # The variable x is on task 1 since the device function has been
        # called once before the model_fn.
        self.assertEqual(x.device, '/job:ps/task:1')
        self.assertEqual(x_add.device, x.device)
        self.assertEqual(e.device,
                         '/job:worker/replica:0/task:0/%s' % last_part_device)

        # The colocate_vars_with can override the distribution's device.
        with d.extended.colocate_vars_with(x):
          y = variable_scope.get_variable(
              'y', initializer=20.0,
              aggregation=variable_scope.VariableAggregation.SUM)
        # We add an identity here to avoid complaints about summing
        # non-distributed values.
        y_add = y.assign_add(array_ops.identity(x_add))
        self.assertEqual(y.device, '/job:ps/task:1')
        self.assertEqual(y_add.device, y.device)
        self.assertEqual(y.device, x.device)

        z = variable_scope.get_variable(
            'z', initializer=10.0,
            aggregation=variable_scope.VariableAggregation.SUM)
        self.assertEqual(z.device, '/job:ps/task:0')
        self.assertNotEqual(z.device, x.device)

        with ops.control_dependencies([y_add]):
          # We add an identity here to avoid complaints about summing
          # non-distributed values.
          z_add = z.assign_add(array_ops.identity(y))
        with ops.control_dependencies([z_add]):
          f = z + c
        self.assertEqual(f.device, worker_device + '/' + last_part_device)

        # The device scope would merge with the default worker device.
        with ops.device('/CPU:1'):
          g = e + 1.0
        self.assertEqual(g.device, worker_device + '/device:CPU:1')

        # The ops.colocate_with will be ignored when defining a variable but
        # not for a normal tensor.
        with ops.colocate_with(x):
          u = variable_scope.get_variable('u', initializer=30.0)
          v = variable_scope.get_variable('v', initializer=30.0)
          h = f + 1.0
        self.assertIn('/job:ps/', u.device)
        self.assertIn('/job:ps/', v.device)
        # u and v are on different parameter servers.
        self.assertTrue(u.device != x.device or v.device != x.device)
        self.assertTrue(u.device == x.device or v.device == x.device)
        # Here h is not on the worker but on the ps job, colocated with x.
        # Note that h.device is canonical while x.device is not.
        self.assertIn('/job:ps/', h.device)
        return y_add, z_add, f

      y, z, f = d.extended.call_for_each_replica(model_fn)
      self.assertNotEqual(y, None)
      self.assertNotEqual(z, None)
      self.assertNotEqual(f, None)

      if context.num_gpus() >= 1 and num_gpus <= 1:
        variables.global_variables_initializer().run()
        y_val, z_val, f_val = sess.run([y, z, f])
        self.assertEqual(y_val, 33.0)
        self.assertEqual(z_val, 43.0)
        self.assertEqual(f_val, 46.0)
Example 51
 def testMinimizeLossGraph(self, num_gpus=2):
     # Collective ops don't support a strategy with a single device.
     if context.num_gpus() < num_gpus:
         self.skipTest('Not enough GPUs')
     self._test_minimize_loss_graph(None, None, num_gpus)
Example 52
    def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
        d, master_target = self._get_test_object(task_type, task_id, num_gpus)
        with ops.Graph().as_default(), \
             self.test_session(config=self._sess_config,
                               target=master_target) as sess, \
             d.scope():
            l = core.Dense(1,
                           use_bias=False,
                           name='gpu_%d' % d._num_gpus_per_worker)

            def loss_fn(x):
                y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
                return y * y

            # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
            # multiple graphs (b/111216820).
            def grad_fn(x):
                loss = loss_fn(x)
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
                grads = gradients.gradients(loss, var_list)
                ret = list(zip(grads, var_list))
                return ret

            def update(v, g):
                return v.assign_sub(0.05 * g, use_locking=True)

            one = d.broadcast(constant_op.constant([[1.]]))

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.call_for_each_tower(grad_fn, one)
                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.read_var(v)
                    before_list.append(fetched)
                    with ops.control_dependencies([fetched]):
                        # TODO(yuefengz): support non-Mirrored variable as destinations.
                        g = d.reduce(variable_scope.VariableAggregation.SUM,
                                     g,
                                     destinations=v)
                        with ops.control_dependencies(
                                d.unwrap(d.update(v, update, g))):
                            after_list.append(d.read_var(v))
                return before_list, after_list

            before_out, after_out = step()

            if context.num_gpus() < d._num_gpus_per_worker:
                return True

            sess.run(variables.global_variables_initializer(),
                     options=self._run_options)

            for i in range(10):
                b, a = sess.run((before_out, after_out),
                                options=self._run_options)
                if i == 0:
                    before, = b
                after, = a

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
            return error_after < error_before
Example 53
 def testReduceIndexedSlicesDistributed(self, num_gpus, batch_reduce):
     if context.num_gpus() < num_gpus:
         return
     self._run_between_graph_clients(self._test_reduce_indexed_slices,
                                     self._cluster_spec, num_gpus,
                                     batch_reduce)
Example 54
 def test_train_no_dist_strat(self):
     if context.num_gpus() >= 2:
         self.skipTest(
             'No need to test 2+ GPUs without a distribution strategy.')
     t = tm.TransformerTask(FLAGS)
     t.train()
Example 55
 def benchmark_create_float_tensor_from_list_GPU(self):
     if not context.num_gpus():
         return
     self._benchmark_create_tensor([[3.0]], dtypes.float32.as_datatype_enum,
                                   GPU)
Example 56
    def _testReductionAndBroadcast(self, cross_device_ops, devices):
        if context.num_gpus() < sum(1 for d in devices if "GPU" in d.upper()):
            self.skipTest("Not enough GPUs")

        values = [constant_op.constant(float(d)) for d in range(len(devices))]
        per_replica = _make_per_replica(values, devices)
        mean = (len(devices) - 1.) / 2.

        values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))]
        per_replica_2 = _make_per_replica(values_2, devices)
        mean_2 = mean + 1.

        destination_mirrored = _fake_mirrored(1., devices)
        destination_different = _fake_mirrored(1., _cpu_device)
        destination_str = _cpu_device

        all_destinations = [
            destination_mirrored,
            destination_different,
            destination_str,
        ]

        # test reduce()
        for destinations in all_destinations:
            self._assert_values_equal(
                cross_device_ops.reduce(reduce_util.ReduceOp.MEAN,
                                        per_replica,
                                        destinations=destinations),
                _fake_mirrored(mean, destinations))
            self._assert_values_equal(
                cross_device_ops.reduce(reduce_util.ReduceOp.MEAN,
                                        per_replica_2,
                                        destinations=destinations),
                _fake_mirrored(mean_2, destinations))
            self._assert_values_equal(
                cross_device_ops.reduce(reduce_util.ReduceOp.SUM,
                                        per_replica,
                                        destinations=destinations),
                _fake_mirrored(mean * len(devices), destinations))
            self._assert_values_equal(
                cross_device_ops.reduce(reduce_util.ReduceOp.SUM,
                                        per_replica_2,
                                        destinations=destinations),
                _fake_mirrored(mean_2 * len(devices), destinations))

        # test batch_reduce()
        for d1, d2 in itertools.product(all_destinations, all_destinations):
            self._assert_values_equal(
                cross_device_ops.batch_reduce(reduce_util.ReduceOp.MEAN,
                                              [(per_replica, d1),
                                               (per_replica_2, d2)]),
                [_fake_mirrored(mean, d1),
                 _fake_mirrored(mean_2, d2)])
            self._assert_values_equal(
                cross_device_ops.batch_reduce(reduce_util.ReduceOp.SUM,
                                              [(per_replica, d1),
                                               (per_replica_2, d2)]),
                [
                    _fake_mirrored(mean * len(devices), d1),
                    _fake_mirrored(mean_2 * len(devices), d2)
                ])

        # test broadcast()
        for destinations in all_destinations:
            self._assert_values_equal(
                cross_device_ops.broadcast(constant_op.constant(1.),
                                           destinations),
                _fake_mirrored(1., destinations))
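A quick sanity check of the expected values in the reduction test above: with per-replica values 0..n-1 (one per device), MEAN is (n-1)/2 and SUM is MEAN * n, which is exactly what the `_fake_mirrored` expectations encode.

n = 4  # stands in for len(devices)
values = [float(d) for d in range(n)]
mean = (n - 1.) / 2.
assert sum(values) / n == mean
assert sum(values) == mean * n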
Example 57
 def testSimpleBetweenGraph(self, use_core_strategy):
   self._run_between_graph_clients(
       self._test_simple_increment,
       self._cluster_spec,
       context.num_gpus(),
       use_core_strategy=use_core_strategy)
Example 58
    def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
        d, master_target, config = self._get_test_object(
            task_type, task_id, num_gpus)
        with ops.Graph().as_default(), \
             self.cached_session(config=config,
                                 target=master_target) as sess, \
             d.scope():
            initializer = functools.partial(init_ops_v2.GlorotUniform(),
                                            (1, 1),
                                            dtype=dtypes.float32)
            kernel = variables.Variable(initial_value=initializer,
                                        name='gpu_%d/kernel' %
                                        d.extended._num_devices_per_worker,
                                        trainable=True)

            def loss_fn(x):
                y = array_ops.reshape(gen_math_ops.mat_mul(x, kernel),
                                      []) - constant_op.constant(1.)
                return y * y

            # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
            # multiple graphs (b/111216820).
            def grad_fn(x):
                loss = loss_fn(x)
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
                grads = gradients.gradients(loss, var_list)
                ret = list(zip(grads, var_list))
                return ret

            def update(v, g):
                return v.assign_sub(0.05 * g, use_locking=True)

            one = constant_op.constant([[1.]])

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.extended.call_for_each_replica(grad_fn, args=[one])
                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.extended.read_var(v)
                    before_list.append(fetched)
                    with ops.control_dependencies([fetched]):
                        # TODO(yuefengz): support non-Mirrored variable as destinations.
                        g = d.extended.reduce_to(reduce_util.ReduceOp.SUM,
                                                 g,
                                                 destinations=v)
                        with ops.control_dependencies(
                                d.extended.update(v,
                                                  update,
                                                  args=(g, ),
                                                  group=False)):
                            after_list.append(d.extended.read_var(v))
                return before_list, after_list

            before_out, after_out = step()

            if (d.extended._local_device_type == 'GPU' and
                    context.num_gpus() < d.extended._num_devices_per_worker):
                return True

            sess.run(variables.global_variables_initializer())

            for i in range(10):
                b, a = sess.run((before_out, after_out))
                if i == 0:
                    before, = b
                after, = a

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
Example 59
  def _test_minimize_loss_graph(self,
                                task_type,
                                task_id,
                                num_gpus,
                                use_core_strategy=False):
    d, master_target, sess_config = self._get_test_objects(
        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
    if task_type:
      # Multi-worker
      assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
      if CHIEF in d.extended._cluster_spec.as_dict():
        num_workers += 1
    else:
      # local
      num_workers = 1

    with ops.Graph().as_default(), \
         self.cached_session(target=master_target,
                             config=sess_config) as sess, \
         d.scope():
      l = core.Dense(1, use_bias=False)

      def loss_fn(x):
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y

      # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
      # multiple graphs (b/111216820).
      def grad_fn(x):
        loss = loss_fn(x)
        var_list = (
            variables.trainable_variables() + ops.get_collection(
                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        grads = gradients.gradients(loss, var_list)
        ret = list(zip(grads, var_list))
        return ret

      def update(v, g):
        return v.assign_sub(0.05 * g, use_locking=True)

      one = constant_op.constant([[1.]])

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.extended.read_var(v)
          before_list.append(fetched)
          with ops.control_dependencies([fetched]):
            # TODO(yuefengz): support non-Mirrored variable as destinations.
            g = d.extended.reduce_to(
                reduce_util.ReduceOp.SUM, g, destinations=v)
            with ops.control_dependencies(
                d.extended.update(v, update, args=(g,), group=False)):
              after_list.append(d.extended.read_var(v))
        return before_list, after_list

      before_out, after_out = step()

      if context.num_gpus() < d.extended._num_gpus_per_worker:
        return True

      if (not task_type or
          multi_worker_util.is_chief(
              d.extended._cluster_spec, task_type, task_id)):
        variables.global_variables_initializer().run()

      # Workers wait for the chief worker to finish initializing variables.
      self._init_condition.acquire()
      self._init_reached += 1
      while self._init_reached != num_workers:
        self._init_condition.wait()
      self._init_condition.notify_all()
      self._init_condition.release()

      for i in range(10):
        b, a = sess.run((before_out, after_out))
        if i == 0:
          before, = b
        after, = a

      error_before = abs(before - 1)
      error_after = abs(after - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
      return error_after < error_before
Example 60
 def benchmark_tf_multiply_op_GPU(self):
     if not context.num_gpus():
         return
     with context.device(GPU):
         m = self._m_2.gpu()
         self._benchmark_tf_multiply_op(m, 30000)