Example #1
0
  def __init__(self, model_dir=None, config=None):
    """Initializes a BaseEstimator instance.

    Args:
      model_dir: Directory to save model parameters, graph and etc. When
        `None`, a new temporary directory is created and a warning is logged.
      config: A RunConfig instance. When `None`, a default
        `BaseEstimator._Config()` is used instead.
    """
    # Resolve the model directory first, falling back to a temporary folder.
    if model_dir is None:
      model_dir = tempfile.mkdtemp()
      logging.warning('Using temporary folder as model directory: %s',
                      model_dir)
    self._model_dir = model_dir

    # Run configuration: the caller's, or a default one.
    self._config = BaseEstimator._Config() if config is None else config

    # A device function is only needed when parameter-server replicas exist.
    ps_replicas = self._config.num_ps_replicas
    if ps_replicas > 0:
      self._device_fn = device_setter.replica_device_setter(
          ps_tasks=ps_replicas,
          merge_devices=False,
          ps_ops=['Variable', 'AutoReloadVariable'])
    else:
      self._device_fn = None

    # Features and targets TensorSignature objects; unset at this point.
    # TODO(wicke): Rename these to something more descriptive
    self._features_info = None
    self._targets_info = None

    self._graph = None
Example #2
0
  def __init__(self, model_dir=None, config=None):
    """Stores the model directory and run config and picks a device function.

    Args:
      model_dir: Model directory. When `None`, a temporary folder is created
        and its path is logged at info level.
      config: Run configuration object; its `num_ps_replicas` attribute is
        read here. When `None`, `BaseEstimator._Config()` is created.
    """
    # Model directory (temporary folder when the caller gave none).
    using_temp_dir = model_dir is None
    self._model_dir = tempfile.mkdtemp() if using_temp_dir else model_dir
    if using_temp_dir:
      logging.info('Using temporary folder as model directory: %s',
                   self._model_dir)

    # Run configuration.
    if config is not None:
      self._config = config
    else:
      self._config = BaseEstimator._Config()

    # Device function: a replica device setter when there are parameter-server
    # replicas, otherwise none at all.
    replica_count = self._config.num_ps_replicas
    self._device_fn = (
        device_setter.replica_device_setter(
            ps_tasks=replica_count,
            merge_devices=False,
            ps_ops=['Variable', 'AutoReloadVariable'])
        if replica_count > 0 else None)

    # Features and targets TensorSignature objects.
    self._features_info = None
    self._targets_info = None

    self._graph = None
  def benchmark_create_1000_partitions_with_100_parameter_servers(self):
    """Benchmarks reading 1000-shard partitioned variables as one tensor.

    Starts a local cluster with 1 worker and 100 parameter servers, builds one
    partitioned float32 variable per shard size, and then reports a benchmark
    for converting each variable to a single tensor via the worker session.
    """
    workers, _ = test.create_local_cluster(num_workers=1, num_ps=100)
    sessions = [session_lib.Session(w.target) for w in workers]
    sess = sessions[0]
    floats_per_shard = (1, 512, 1024 * 32, 1024 * 128)

    concat_reads = []
    for n_floats in floats_per_shard:
      # A float32 takes 4 bytes, so capping a shard at 4 * n_floats bytes should
      # split the 1000 * n_floats vector into 1000 shards of n_floats each.
      print("Building partitioned variable with %d floats per partition" %
            n_floats)
      with ops.device(device_setter.replica_device_setter(ps_tasks=100)):
        shard_partitioner = (
            partitioned_variables.variable_axis_size_partitioner(
                max_shard_bytes=4 * n_floats))
        sharded_var = variable_scope.get_variable(
            "partitioned_%d" % n_floats,
            shape=[1000 * n_floats],
            dtype=dtypes.float32,
            partitioner=shard_partitioner)
        # Concatenates along axis 0
        concat_reads.append(ops.convert_to_tensor(sharded_var))

    variables.global_variables_initializer().run(session=sess)

    for n_floats, read_op in zip(floats_per_shard, concat_reads):
      print("Running benchmark having partitions with %d floats" % n_floats)
      benchmark_name = ("read_concat_1000_partitions_from_"
                        "100_parameter_servers_partsize_%d_floats" % n_floats)
      self.run_op_benchmark(sess, read_op, name=benchmark_name)
Example #4
0
  def __init__(self, model_dir=None, config=None):
    """Initializes a BaseEstimator instance.

    Args:
      model_dir: Directory to save model parameters, graph and etc. A
        temporary directory is created (with a logged warning) if omitted.
      config: A RunConfig instance; defaults to `BaseEstimator._Config()`.
    """
    # Start with everything that does not depend on the arguments.
    self._graph = None
    # Features and targets TensorSignature objects.
    # TODO(wicke): Rename these to something more descriptive
    self._features_info = None
    self._targets_info = None

    # Model directory.
    self._model_dir = model_dir
    if model_dir is None:
      temp_dir = tempfile.mkdtemp()
      self._model_dir = temp_dir
      logging.warning('Using temporary folder as model directory: %s',
                      temp_dir)

    # Create a run configuration when none was handed in.
    run_config = config
    if run_config is None:
      run_config = BaseEstimator._Config()
    self._config = run_config

    # Set device function depending if there are replicas or not.
    has_ps_replicas = run_config.num_ps_replicas > 0
    if not has_ps_replicas:
      self._device_fn = None
    else:
      ps_op_types = ['Variable', 'AutoReloadVariable']
      self._device_fn = device_setter.replica_device_setter(
          ps_tasks=run_config.num_ps_replicas,
          merge_devices=False, ps_ops=ps_op_types)
Example #5
0
    def __init__(self, model_dir=None, config=None):
        """Sets up model directory, run configuration and device function.

        Args:
            model_dir: Model directory; a temporary folder is created and
                logged at info level when this is `None`.
            config: Run configuration whose `num_ps_replicas` is read; a
                `BaseEstimator._Config()` is created when this is `None`.
        """
        # Model directory.
        if model_dir is not None:
            self._model_dir = model_dir
        else:
            self._model_dir = tempfile.mkdtemp()
            logging.info('Using temporary folder as model directory: %s',
                         self._model_dir)

        # Run configuration.
        self._config = config if config is not None else BaseEstimator._Config()

        # Device function: only set up when parameter-server replicas exist.
        num_ps = self._config.num_ps_replicas
        device_fn = None
        if num_ps > 0:
            device_fn = device_setter.replica_device_setter(
                ps_tasks=num_ps,
                merge_devices=False,
                ps_ops=['Variable', 'AutoReloadVariable'])
        self._device_fn = device_fn

        # Features and targets TensorSignature objects.
        self._features_info = None
        self._targets_info = None

        self._graph = None
Example #6
0
 def testVariableWithReplicaDeviceSetter(self):
   """Checks variable placement under a two-task replica device setter.

   Asserts the device of each created variable, that the first four variables
   share colocation groups with their initial values, and that an initial
   value built under an explicit device keeps that device on the worker job.
   """
   with self.test_session():
     setter = device_setter.replica_device_setter(ps_tasks=2)
     with ops.device(setter):
       a = variables_lib2.variable('a', [])
       b = variables_lib2.variable('b', [])
       c = variables_lib2.variable('c', [], device='cpu:12')
       d = variables_lib2.variable('d', [])
       with ops.device('cpu:99'):
         e_init = constant_op.constant(12)
       e = variables_lib2.variable('e', initializer=e_init)
     # The values below highlight how the replica_device_setter puts initial
     # values on the worker job, and how it merges explicit devices.
     colocated_cases = (
         (a, '/job:ps/task:0/cpu:0'),
         (b, '/job:ps/task:1/cpu:0'),
         (c, '/job:ps/task:0/cpu:12'),
         (d, '/job:ps/task:1/cpu:0'),
     )
     for var, expected_device in colocated_cases:
       self.assertDeviceEqual(var.device, expected_device)
       self.assertEqual(var.initial_value.op.colocation_groups(),
                        var.op.colocation_groups())
     self.assertDeviceEqual(e.device, '/job:ps/task:0/cpu:0')
     self.assertDeviceEqual(e.initial_value.device, '/job:worker/cpu:99')
Example #7
0
def _get_replica_device_setter(config):
  """Creates a replica device setter if required.

  Args:
    config: A RunConfig instance. `num_ps_replicas` is always read; `job_name`,
      `task` and `cluster_spec` are only read when a setter is built.

  Returns:
    A replica device setter, or None when `config.num_ps_replicas` is not
    positive.
  """
  # Without parameter servers no setter is needed. Return before touching the
  # worker-related fields so that a config with a job name but no usable task
  # index does not fail while formatting a device string nobody will use.
  if config.num_ps_replicas <= 0:
    return None

  ps_ops = [
      'Variable', 'AutoReloadVariable', 'MutableHashTable',
      'MutableHashTableOfTensors', 'MutableDenseHashTable'
  ]

  # Pin worker ops to this job/task when the job name is known; otherwise use
  # the generic worker job.
  if config.job_name:
    worker_device = '/job:%s/task:%d' % (config.job_name, config.task)
  else:
    worker_device = '/job:worker'

  return device_setter.replica_device_setter(
      ps_tasks=config.num_ps_replicas, worker_device=worker_device,
      merge_devices=False, ps_ops=ps_ops, cluster=config.cluster_spec)
Example #8
0
def _get_replica_device_setter(config):
    """Creates a replica device setter if required.

    Args:
        config: A RunConfig instance. `num_ps_replicas` is always read;
            `job_name`, `task` and `cluster_spec` are only read when a setter
            is actually built.

    Returns:
        A replica device setter, or None when `config.num_ps_replicas` is not
        positive.
    """
    # No parameter servers means no setter. Returning here, before the worker
    # device string is formatted, avoids a needless failure for configs that
    # carry a job name but no integer task index.
    if config.num_ps_replicas <= 0:
        return None

    ps_ops = [
        'Variable', 'AutoReloadVariable', 'MutableHashTable',
        'MutableHashTableOfTensors'
    ]

    # Use the concrete job/task as worker device when the job name is known.
    if config.job_name:
        worker_device = '/job:%s/task:%d' % (config.job_name, config.task)
    else:
        worker_device = '/job:worker'

    return device_setter.replica_device_setter(
        ps_tasks=config.num_ps_replicas,
        worker_device=worker_device,
        merge_devices=False,
        ps_ops=ps_ops,
        cluster=config.cluster_spec)
Example #9
0
def _get_replica_device_setter(num_ps_replicas):
  """Returns a replica device setter, or None without parameter servers.

  Args:
    num_ps_replicas: Number of parameter-server replicas; a setter is only
      built when this is greater than zero.

  Returns:
    The result of `device_setter.replica_device_setter(...)`, or None.
  """
  needs_setter = num_ps_replicas > 0
  if not needs_setter:
    return None

  # Op types that are listed as parameter-server ops for the setter.
  ps_op_types = [
      'Variable',
      'AutoReloadVariable',
      'MutableHashTable',
      'MutableHashTableOfTensors',
  ]
  return device_setter.replica_device_setter(
      ps_tasks=num_ps_replicas, merge_devices=False, ps_ops=ps_op_types)
Example #10
0
 def testPS2TasksWithClusterSpecClass(self):
   """Checks default placement for a setter built from `self._cluster_spec`.

   Asserts that the two variables (and their initializers) land on ps tasks 0
   and 1 respectively, and that the op adding them lands on the worker job.
   """
   setter = device_setter.replica_device_setter(cluster=self._cluster_spec)
   with ops.device(setter):
     first = variables.Variable([1, 2])
     second = variables.Variable([2, 1])
     total = first + second
     # Variables are expected on consecutive ps tasks.
     self.assertDeviceEqual("/job:ps/task:0", first.device)
     self.assertDeviceEqual("/job:ps/task:0", first.initializer.device)
     self.assertDeviceEqual("/job:ps/task:1", second.device)
     self.assertDeviceEqual("/job:ps/task:1", second.initializer.device)
     # The non-variable op is expected on the worker job.
     self.assertDeviceEqual("/job:worker", total.device)
 def testPS2TasksWithClusterSpecClass(self):
   """Verifies ps/worker placement when only a cluster spec is supplied.

   Each variable and its initializer must share one ps task (task 0 for the
   first variable, task 1 for the second); their sum must be on the worker.
   """
   with ops.device(
       device_setter.replica_device_setter(cluster=self._cluster_spec)):
     v = variables.Variable([1, 2])
     w = variables.Variable([2, 1])
     a = v + w
     placements = (("/job:ps/task:0", v), ("/job:ps/task:1", w))
     for expected_device, var in placements:
       self.assertDeviceEqual(expected_device, var.device)
       self.assertDeviceEqual(expected_device, var.initializer.device)
     self.assertDeviceEqual("/job:worker", a.device)
Example #12
0
 def testPS2TasksUseCpuForPS(self):
   """Checks placement when the ps device is given as plain "/cpu:0".

   Asserts that a default variable ends up on "/cpu:0", that a variable built
   under "/job:moon" ends up on "/job:moon/cpu:0", and that their sum is
   placed on the worker job.
   """
   cpu_ps_setter = device_setter.replica_device_setter(
       ps_tasks=1, ps_device="/cpu:0")
   with ops.device(cpu_ps_setter):
     default_var = variables.Variable([1, 2])
     with ops.device("/job:moon"):
       moon_var = variables.Variable([2, 1])
     total = default_var + moon_var
     self.assertDeviceEqual("/cpu:0", default_var.device)
     self.assertDeviceEqual("/cpu:0", default_var.initializer.device)
     self.assertDeviceEqual("/job:moon/cpu:0", moon_var.device)
     self.assertDeviceEqual("/job:moon/cpu:0", moon_var.initializer.device)
     self.assertDeviceEqual("/job:worker", total.device)
 def testPS2TasksUseCpuForPS(self):
   """Verifies device strings for a setter using `ps_device="/cpu:0"`.

   The variable created directly under the setter and the one created under
   an extra "/job:moon" scope are checked together with their initializers;
   the op adding them is checked against the worker job.
   """
   with ops.device(
       device_setter.replica_device_setter(ps_tasks=1, ps_device="/cpu:0")):
     v = variables.Variable([1, 2])
     with ops.device("/job:moon"):
       w = variables.Variable([2, 1])
     a = v + w
     for expected_device, var in (("/cpu:0", v), ("/job:moon/cpu:0", w)):
       self.assertDeviceEqual(expected_device, var.device)
       self.assertDeviceEqual(expected_device, var.initializer.device)
     self.assertDeviceEqual("/job:worker", a.device)
Example #14
0
 def testCPUOverride(self):
   """Checks that an explicit "/cpu:0" scope is merged into the placement.

   Asserts that the variable built under "/cpu:0" is on ps task 0 with cpu:0,
   the unscoped variable is on ps task 1, and the sum built under "/cpu:0" is
   on the worker job with cpu:0.
   """
   setter = device_setter.replica_device_setter(cluster=self._cluster_spec)
   with ops.device(setter):
     with ops.device("/cpu:0"):
       cpu_var = variables.Variable([1, 2])
     plain_var = variables.Variable([2, 1])
     with ops.device("/cpu:0"):
       total = cpu_var + plain_var
     self.assertDeviceEqual("/job:ps/task:0/cpu:0", cpu_var.device)
     self.assertDeviceEqual("/job:ps/task:0/cpu:0", cpu_var.initializer.device)
     self.assertDeviceEqual("/job:ps/task:1", plain_var.device)
     self.assertDeviceEqual("/job:ps/task:1", plain_var.initializer.device)
     self.assertDeviceEqual("/job:worker/cpu:0", total.device)
 def testPS2TasksNoMerging(self):
   """Checks placement when the setter is built with `merge_devices=False`.

   Asserts that the unscoped variable is on ps task 0, that the variable
   created under an explicit "/job:ps" scope keeps exactly "/job:ps", and
   that the sum of both is on the worker job.
   """
   no_merge_setter = device_setter.replica_device_setter(
       cluster=self._cluster_spec, merge_devices=False)
   with ops.device(no_merge_setter):
     auto_var = variables.Variable([1, 2])
     # Won't assign task when merge_devices=False.
     with ops.device("/job:ps"):
       pinned_var = variables.Variable([2, 1])
     total = auto_var + pinned_var
     self.assertDeviceEqual("/job:ps/task:0", auto_var.device)
     self.assertDeviceEqual("/job:ps/task:0", auto_var.initializer.device)
     self.assertDeviceEqual("/job:ps", pinned_var.device)
     self.assertDeviceEqual("/job:ps", pinned_var.initializer.device)
     self.assertDeviceEqual("/job:worker", total.device)
 def testCPUOverride(self):
   """Verifies device strings when ops are created under "/cpu:0" scopes.

   Both variables are checked together with their initializers; the sum,
   created under its own "/cpu:0" scope, is checked against the worker job.
   """
   with ops.device(
       device_setter.replica_device_setter(cluster=self._cluster_spec)):
     with ops.device("/cpu:0"):
       v = variables.Variable([1, 2])
     w = variables.Variable([2, 1])
     with ops.device("/cpu:0"):
       a = v + w
     expectations = (("/job:ps/task:0/cpu:0", v), ("/job:ps/task:1", w))
     for expected_device, var in expectations:
       self.assertDeviceEqual(expected_device, var.device)
       self.assertDeviceEqual(expected_device, var.initializer.device)
     self.assertDeviceEqual("/job:worker/cpu:0", a.device)
Example #17
0
 def testByteSizeLoadFnWithScalar(self):
   """Checks that a scalar variable is placed with the byte-size load function.

   Asserts that the variable and its initializer both end up on ps task 0
   when a greedy load-balancing strategy using `byte_size_load_fn` is used.
   """
   greedy_strategy = device_setter_lib.GreedyLoadBalancingStrategy(
       2, device_setter_lib.byte_size_load_fn)
   setter = device_setter.replica_device_setter(
       cluster=self._cluster_spec, ps_strategy=greedy_strategy)
   with ops.device(setter):
     # Note: we must test the load function as part of the device function
     # instead of passing u.op to the function directly, because the only
     # time that the output Tensor has unknown shape for scalars is during
     # Variable construction.
     scalar_var = variables.Variable(0)
     self.assertDeviceEqual("/job:ps/task:0", scalar_var.device)
     self.assertDeviceEqual("/job:ps/task:0", scalar_var.initializer.device)
Example #18
0
 def testPS2TasksNoMerging(self):
   """Verifies that `merge_devices=False` leaves an explicit "/job:ps" as is.

   The unscoped variable must be on ps task 0, the explicitly scoped one on
   "/job:ps" with no task, and the op adding them on the worker job.
   """
   with ops.device(
       device_setter.replica_device_setter(
           cluster=self._cluster_spec, merge_devices=False)):
     v = variables.Variable([1, 2])
     with ops.device("/job:ps"):  # Won't assign task when merge_devices=False.
       w = variables.Variable([2, 1])
     a = v + w
     for expected_device, var in (("/job:ps/task:0", v), ("/job:ps", w)):
       self.assertDeviceEqual(expected_device, var.device)
       self.assertDeviceEqual(expected_device, var.initializer.device)
     self.assertDeviceEqual("/job:worker", a.device)
def _get_workers(num_workers, steps, workers):
  """Builds one graph, train op and monitored session per worker.

  Args:
    num_workers: Number of workers to build.
    steps: Passed as `interval_steps` to `ModelAverageOptimizer`.
    workers: Indexable collection whose items expose `.target`; item
      `worker_id` provides the target for that worker's session.

  Returns:
    A `(sessions, graphs, train_ops)` tuple of lists with one entry per
    worker; every `train_ops` entry is a one-element list.
  """
  sessions, graphs, train_ops = [], [], []
  for worker_id in range(num_workers):
    graph = ops.Graph()
    is_chief = (worker_id == 0)
    with graph.as_default():
      worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
      ma_getter = ModelAverageCustomGetter(worker_device=worker_device)
      with variable_scope.variable_scope('', custom_getter=ma_getter):
        setter = device_setter.replica_device_setter(
            worker_device=worker_device,
            ps_device="/job:ps/task:0/cpu:0",
            ps_tasks=1)
        with ops.device(setter):
          global_step = variables.Variable(
              0, name='global_step', trainable=False)
          var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
          var_1 = variable_scope.get_variable(initializer=1.0, name="v1")

      with ops.device("/job:worker/task:" + str(worker_id)):
        # The first worker uses gradients of -1.0, every other worker -2.0.
        grad_value = -1.0 if worker_id == 0 else -2.0
        grads_0 = constant_op.constant(grad_value)
        grads_1 = constant_op.constant(grad_value)
        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
        opt = ModelAverageOptimizer(
            opt=sgd_opt,
            num_worker=num_workers,
            ma_custom_getter=ma_getter,
            is_chief=is_chief,
            interval_steps=steps)
        grads_and_vars = [[grads_0, var_0], [grads_1, var_1]]
        train_op = [opt.apply_gradients(grads_and_vars, global_step)]
      ma_hook = opt.make_session_run_hook()
      # Creates MonitoredSession
      sess = training.MonitoredTrainingSession(
          workers[worker_id].target, hooks=[ma_hook])

    sessions.append(sess)
    graphs.append(graph)
    train_ops.append(train_op)
  return sessions, graphs, train_ops
def _get_workers(num_workers, steps, workers):
    """Creates a graph, a train op list and a session for every worker.

    Args:
        num_workers: How many workers to set up.
        steps: Forwarded as `interval_steps` to the model average optimizer.
        workers: Indexable collection; `workers[worker_id].target` is used as
            the target of that worker's monitored session.

    Returns:
        Three lists `(sessions, graphs, train_ops)`, each holding one entry
        per worker in worker-id order.
    """
    all_sessions = []
    all_graphs = []
    all_train_ops = []
    for worker_id in range(num_workers):
        worker_graph = ops.Graph()
        chief = (worker_id == 0)
        with worker_graph.as_default():
            worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
            custom_getter = model_average_optimizer.ModelAverageCustomGetter(
                worker_device=worker_device)
            with variable_scope.variable_scope(
                    "", custom_getter=custom_getter), ops.device(
                        device_setter.replica_device_setter(
                            worker_device=worker_device,
                            ps_device="/job:ps/task:0/cpu:0",
                            ps_tasks=1)):
                step_var = variables.Variable(
                    0, name="global_step", trainable=False)
                var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
                var_1 = variable_scope.get_variable(initializer=1.0, name="v1")

            with ops.device("/job:worker/task:" + str(worker_id)):
                # Chief worker: gradients of -1.0; all others: -2.0.
                if chief:
                    grads_0 = constant_op.constant(-1.0)
                    grads_1 = constant_op.constant(-1.0)
                else:
                    grads_0 = constant_op.constant(-2.0)
                    grads_1 = constant_op.constant(-2.0)
                base_opt = gradient_descent.GradientDescentOptimizer(1.0)
                opt = model_average_optimizer.ModelAverageOptimizer(
                    opt=base_opt,
                    num_worker=num_workers,
                    ma_custom_getter=custom_getter,
                    is_chief=chief,
                    interval_steps=steps)
                apply_op = opt.apply_gradients(
                    [[grads_0, var_0], [grads_1, var_1]], step_var)
                train_op = [apply_op]
            session_hook = opt.make_session_run_hook()
            # Creates MonitoredSession
            sess = training.MonitoredTrainingSession(
                workers[worker_id].target, hooks=[session_hook])

        all_sessions.append(sess)
        all_graphs.append(worker_graph)
        all_train_ops.append(train_op)
    return all_sessions, all_graphs, all_train_ops
Example #21
0
 def test_treated_as_worker_op_by_device_setter(self):
   """Checks table placement of a variable built under a replica setter.

   Creates `p1` with two explicit ps devices inside a replica device setter
   scope, runs an embedding lookup on it, and asserts that table `i` of `p1`
   has a resource handle whose device contains "/job:ps/task:i".
   """
   num_ps_tasks = 2
   with ops.device("/job:worker/task:0"):
     ids = constant_op.constant(
         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtypes.int64)
   setter = device_setter.replica_device_setter(
       ps_tasks=num_ps_tasks, ps_device="/job:ps", worker_device="/job:worker")
   with ops.device(setter):
     p1 = de.get_variable(
         name="p1", devices=["/job:ps/task:0", "/job:ps/task:1"])
     _ = de.embedding_lookup(p1, ids, name="emb")
   for task_index, wanted in enumerate(("/job:ps/task:0", "/job:ps/task:1")):
     self.assertTrue(
         wanted in p1._tables[task_index].resource_handle.device)
Example #22
0
  def testVariableDevicePlacement(self):
    """Checks the device of the local variable made by rejection resampling.

    Builds a shuffled, mapped and rejection-resampled dataset under a replica
    device setter and asserts that exactly one local variable exists and that
    its device string is empty.
    """
    classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
    target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
    setter = device_setter.replica_device_setter(ps_tasks=1, ps_device="/cpu:0")
    with ops.device(setter):
      dataset = dataset_ops.Dataset.from_tensor_slices(classes)
      dataset = dataset.shuffle(200, seed=21)
      dataset = dataset.map(lambda c: (c, string_ops.as_string(c)))
      dataset = dataset_ops.rejection_resample(
          dataset,
          target_dist=target_dist,
          initial_dist=None,
          class_func=lambda c, _: c,
          seed=27)

      local_vars = variables.local_variables()
      self.assertEqual(1, len(local_vars))
      self.assertEqual(b"", compat.as_bytes(local_vars[0].device))
Example #23
0
 def _get_replica_device_setter(self):
     """Creates a replica device setter.

     The setter is built for `self._num_ps_replicas` ps tasks with device
     merging enabled, using an `_OpRoundRobinStrategy` over the op types
     listed below.

     Returns:
         The result of `device_setter.replica_device_setter(...)`.
     """
     num_ps = self._num_ps_replicas
     # Op types handed to both the strategy and the setter as ps ops.
     ps_op_types = [
         "Variable",
         "VariableV2",
         "DecisionTreeEnsembleResourceHandleOp",
         "StatsAccumulatorScalarResourceHandleOp",
         "StatsAccumulatorTensorResourceHandleOp",
         "QuantileStreamResourceHandleOp",
     ]
     return device_setter.replica_device_setter(
         ps_tasks=num_ps,
         merge_devices=True,
         ps_ops=ps_op_types,
         ps_strategy=_OpRoundRobinStrategy(ps_op_types, num_ps))
 def testPS2TasksPinVariableToJob(self):
   """Checks placement of variables pinned to explicit jobs.

   Asserts that the unscoped variable is on ps task 0, the one under
   "/job:moon" stays on "/job:moon", the one under a nested "/job:ps" scope is
   on ps task 1, and the sum of all three is on the worker job.
   """
   setter = device_setter.replica_device_setter(cluster=self._cluster_spec)
   with ops.device(setter):
     auto_var = variables.Variable([1, 2])
     with ops.device("/job:moon"):
       moon_var = variables.Variable([2, 1])
       # Explicit PS job will get task set.
       with ops.device("/job:ps"):
         ps_var = variables.Variable([0, 1])
     total = auto_var + moon_var + ps_var
     self.assertDeviceEqual("/job:ps/task:0", auto_var.device)
     self.assertDeviceEqual("/job:ps/task:0", auto_var.initializer.device)
     self.assertDeviceEqual("/job:moon", moon_var.device)
     self.assertDeviceEqual("/job:moon", moon_var.initializer.device)
     self.assertDeviceEqual("/job:ps/task:1", ps_var.device)
     self.assertDeviceEqual("/job:ps/task:1", ps_var.initializer.device)
     self.assertDeviceEqual("/job:worker", total.device)
Example #25
0
 def testPS2TasksPinVariableToJob(self):
   """Verifies device strings for variables created under explicit job scopes.

   Each of the three variables is checked together with its initializer; the
   op adding all of them is checked against the worker job.
   """
   with ops.device(
       device_setter.replica_device_setter(cluster=self._cluster_spec)):
     v = variables.Variable([1, 2])
     with ops.device("/job:moon"):
       w = variables.Variable([2, 1])
       with ops.device("/job:ps"):  # Explicit PS job will get task set.
         x = variables.Variable([0, 1])
     a = v + w + x
     expectations = (
         ("/job:ps/task:0", v),
         ("/job:moon", w),
         ("/job:ps/task:1", x),
     )
     for expected_device, var in expectations:
       self.assertDeviceEqual(expected_device, var.device)
       self.assertDeviceEqual(expected_device, var.initializer.device)
     self.assertDeviceEqual("/job:worker", a.device)
Example #26
0
def _get_replica_device_setter(config):
    """Creates a replica device setter if required.

    Args:
        config: A RunConfig instance. `num_ps_replicas` is always read;
            `cluster_spec` is only read when a setter is built.

    Returns:
        A replica device setter, or None when `config.num_ps_replicas` is not
        greater than zero.
    """
    wants_setter = config.num_ps_replicas > 0
    if not wants_setter:
        return None

    # Op types listed as parameter-server ops for the setter.
    ps_op_types = [
        "Variable",
        "AutoReloadVariable",
        "MutableHashTable",
        "MutableHashTableOfTensors",
    ]
    return device_setter.replica_device_setter(
        ps_tasks=config.num_ps_replicas,
        merge_devices=False,
        ps_ops=ps_op_types,
        cluster=config.cluster_spec,
    )
def _get_workers(num_workers, period, workers, moving_rate):
    """Builds one graph, train op list and monitored session per worker.

    Args:
        num_workers: Number of workers to build.
        period: Passed as `communication_period` to `ElasticAverageOptimizer`.
        workers: Indexable collection; `workers[worker_id].target` is the
            target of that worker's monitored session.
        moving_rate: Passed as `moving_rate` to `ElasticAverageOptimizer`.

    Returns:
        A `(sessions, graphs, train_ops)` tuple of lists, one entry per worker.
    """
    sessions, graphs, train_ops = [], [], []
    for worker_id in range(num_workers):
        graph = ops.Graph()
        is_chief = (worker_id == 0)
        with graph.as_default():
            worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
            ea_getter = ElasticAverageCustomGetter(worker_device=worker_device)
            with variable_scope.variable_scope('', custom_getter=ea_getter):
                setter = device_setter.replica_device_setter(
                    worker_device=worker_device,
                    ps_device="/job:ps/task:0/cpu:0",
                    ps_tasks=1)
                with ops.device(setter):
                    global_step = variables.Variable(
                        0, name='global_step', trainable=False)
                    var_0 = variable_scope.get_variable(
                        initializer=0.0, name="v0")
                    var_1 = variable_scope.get_variable(
                        initializer=1.0, name="v1")

            with ops.device("/job:worker/task:" + str(worker_id)):
                grads_0 = constant_op.constant(-1.0)
                grads_1 = constant_op.constant(-1.0)

                base_opt = gradient_descent.GradientDescentOptimizer(1.0)
                opt = ElasticAverageOptimizer(
                    opt=base_opt,
                    num_worker=num_workers,
                    moving_rate=moving_rate,
                    communication_period=period,
                    ea_custom_getter=ea_getter)
                grads_and_vars = ([grads_0, var_0], [grads_1, var_1])
                train_op = [opt.apply_gradients(grads_and_vars, global_step)]
                # The hook is created inside the worker device scope.
                ea_hook = opt.make_session_run_hook(is_chief, worker_id)
            # Creates MonitoredSession
            sess = training.MonitoredTrainingSession(
                workers[worker_id].target, hooks=[ea_hook])

        sessions.append(sess)
        graphs.append(graph)
        train_ops.append(train_op)

    return sessions, graphs, train_ops
Example #28
0
    def testVariableDevicePlacement(self):
        """Verifies that rejection resampling leaves its local variable unplaced.

        After building the resampled dataset under a replica device setter,
        exactly one local variable must exist and its device must be empty.
        """
        classes = np.random.randint(5, size=(20000, ))  # Uniformly sampled
        target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]

        def attach_string(c):
            # Pair every class id with its string form.
            return (c, string_ops.as_string(c))

        with ops.device(
                device_setter.replica_device_setter(ps_tasks=1,
                                                    ps_device="/cpu:0")):
            source = dataset_ops.Dataset.from_tensor_slices(classes)
            dataset = source.shuffle(200, seed=21).map(attach_string)
            dataset = dataset_ops.rejection_resample(
                dataset,
                target_dist=target_dist,
                initial_dist=None,
                class_func=lambda c, _: c,
                seed=27)

            self.assertEqual(1, len(variables.local_variables()))
            first_local = variables.local_variables()[0]
            self.assertEqual(b"", compat.as_bytes(first_local.device))
Example #29
0
 def _get_replica_device_setter(self):
   """Creates a replica device setter.

   Returns:
     A replica device setter for `self._num_ps_replicas` ps tasks, built with
     device merging enabled and an `_OpRoundRobinStrategy` over the op types
     listed in this method.
   """
   replica_count = self._num_ps_replicas
   resource_ops = [
       "Variable",
       "VariableV2",
       "DecisionTreeEnsembleResourceHandleOp",
       "StatsAccumulatorScalarResourceHandleOp",
       "StatsAccumulatorTensorResourceHandleOp",
       "QuantileStreamResourceHandleOp",
   ]
   round_robin = _OpRoundRobinStrategy(resource_ops, replica_count)
   setter_kwargs = dict(
       ps_tasks=replica_count,
       merge_devices=True,
       ps_ops=resource_ops,
       ps_strategy=round_robin)
   return device_setter.replica_device_setter(**setter_kwargs)
def _get_workers(num_workers, period, workers, moving_rate):
  sessions = []
  graphs = []
  train_ops = []
  for worker_id in range(num_workers):
    graph = ops.Graph()
    is_chief = (worker_id == 0)
    with graph.as_default():
      worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
      ea_coustom = ElasticAverageCustomGetter(worker_device=worker_device)
      with variable_scope.variable_scope(
          "", custom_getter=ea_coustom), ops.device(
              device_setter.replica_device_setter(
                  worker_device=worker_device,
                  ps_device="/job:ps/task:0/cpu:0",
                  ps_tasks=1)):
        global_step = variables.Variable(0, name="global_step", trainable=False)
        var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
        var_1 = variable_scope.get_variable(initializer=1.0, name="v1")

      with ops.device("/job:worker/task:" + str(worker_id)):
        grads_0 = constant_op.constant(-1.0)
        grads_1 = constant_op.constant(-1.0)

        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
        opt = ElasticAverageOptimizer(
            opt=sgd_opt,
            num_worker=num_workers,
            moving_rate=moving_rate,
            communication_period=period,
            ea_custom_getter=ea_coustom)
        train_op = [
            opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]),
                                global_step)
        ]
        easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
      # Creates MonitoredSession
      sess = training.MonitoredTrainingSession(
          workers[worker_id].target, hooks=[easgd_hook])

    sessions.append(sess)
    graphs.append(graph)
    train_ops.append(train_op)

  return sessions, graphs, train_ops
 def testAGNCustomGetter(self):
     """Checks device placement of variables created via `AGNCustomGetter`.

     Two variables are created under a replica device setter for a cluster
     with two ps tasks and three workers. Each variable is expected on
     worker task 0, and the entries the getter's `_global_map` holds for them
     are expected on ps tasks 0 and 1 respectively.
     """
     local_device = "/job:worker/task:0"
     cluster = server_lib.ClusterSpec({
         "ps": ["ps0:2222", "ps1:2222"],
         "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
     getter = agn_optimizer.AGNCustomGetter(worker_device=local_device)
     with ops.device(
             device_setter.replica_device_setter(
                 cluster=cluster,
                 worker_device=local_device,
                 ps_device="/job:ps")):
         with variable_scope.variable_scope("", custom_getter=getter):
             v = variable_scope.get_variable(initializer=[1, 2], name="v")
             w = variable_scope.get_variable(initializer=[2, 1], name="w")
             v_global = getter._global_map[v]
             w_global = getter._global_map[w]
             expectations = (
                 ("/job:worker/task:0", v),
                 ("job:ps/task:0", v_global),
                 ("/job:worker/task:0", w),
                 ("job:ps/task:1", w_global),
             )
             for expected_device, variable in expectations:
                 self.assertDeviceEqual(expected_device, variable.device)
Example #32
0
  def testPS2TasksWithCPUConstraint(self):
    """Checks placement when the ps device carries a "/cpu:0" constraint.

    The cluster names its jobs "sun" (worker, three tasks) and "moon" (ps,
    two tasks). The two variables and their initializers are expected on moon
    tasks 0 and 1 with "/cpu:0" kept, and their sum on "/job:sun".
    """
    cluster = server_lib.ClusterSpec({
        "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
        "moon": ["moon0:2222", "moon1:2222"]
    })
    setter = device_setter.replica_device_setter(
        ps_device="/job:moon/cpu:0",
        worker_device="/job:sun",
        cluster=cluster.as_cluster_def())

    with ops.device(setter):
      first = variables.Variable([1, 2])
      second = variables.Variable([2, 1])
      total = first + second
      placements = (
          (first, "/job:moon/task:0/cpu:0"),
          (second, "/job:moon/task:1/cpu:0"),
      )
      for variable, expected_device in placements:
        self.assertDeviceEqual(expected_device, variable.device)
        self.assertDeviceEqual(expected_device, variable.initializer.device)
      self.assertDeviceEqual("/job:sun", total.device)
 def testPS2TasksWithClusterSpecClass(self):
   """Checks placement of variables made through `ElasticAverageCustomGetter`.

   With a two-ps, three-worker `ClusterSpec`, both variables are expected on
   worker task 0, and the entries the getter's `_global_map` holds for them
   are expected on ps tasks 0 and 1 respectively.
   """
   from tensorflow.python.training import device_setter
   local_device = "/job:worker/task:0"
   cluster = server_lib.ClusterSpec({
       "ps": ["ps0:2222", "ps1:2222"],
       "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
   })
   getter = ElasticAverageCustomGetter(worker_device=local_device)
   with ops.device(
       device_setter.replica_device_setter(
           cluster=cluster,
           worker_device=local_device,
           ps_device="/job:ps")):
     with variable_scope.variable_scope("", custom_getter=getter):
       v = variable_scope.get_variable(initializer=[1, 2], name="v")
       w = variable_scope.get_variable(initializer=[2, 1], name="w")
       v_global = getter._global_map[v]
       w_global = getter._global_map[w]
       for expected_device, variable in (
           ("/job:worker/task:0", v),
           ("job:ps/task:0", v_global),
           ("/job:worker/task:0", w),
           ("job:ps/task:1", w_global),
       ):
         self.assertDeviceEqual(expected_device, variable.device)
  def testPS2TasksWithDevice(self):
    """Checks placement when custom ps and worker job names are supplied.

    The cluster names its jobs "sun" (worker, three tasks) and "moon" (ps,
    two tasks). The two variables and their initializers are expected on moon
    tasks 0 and 1, and their sum on "/job:sun".
    """
    cluster_def = server_lib.ClusterSpec({
        "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
        "moon": ["moon0:2222", "moon1:2222"]
    }).as_cluster_def()

    with ops.device(
        device_setter.replica_device_setter(
            ps_device="/job:moon",
            worker_device="/job:sun",
            cluster=cluster_def)):
      first = variables.Variable([1, 2])
      second = variables.Variable([2, 1])
      total = first + second
      for variable, task_index in ((first, 0), (second, 1)):
        expected_device = "/job:moon/task:%d" % task_index
        self.assertDeviceEqual(expected_device, variable.device)
        self.assertDeviceEqual(expected_device, variable.initializer.device)
      self.assertDeviceEqual("/job:sun", total.device)
Example #35
0
 def testByteSizeLoadFn(self):
   """Checks placement under greedy load balancing with `byte_size_load_fn`.

   Four variables of shapes [2, 2], [2, 1], [2, 2] and [1, 3] are created in
   that order. They and their initializers are expected on ps tasks
   0, 1, 1 and 0, and the sum of the middle two on "/job:worker".
   """
   strategy = device_setter_lib.GreedyLoadBalancingStrategy(
       2, device_setter_lib.byte_size_load_fn)
   setter = device_setter.replica_device_setter(
       cluster=self._cluster_spec, ps_strategy=strategy)
   with ops.device(setter):
     u = variables.Variable(array_ops.zeros([2, 2]))
     v = variables.Variable(array_ops.zeros([2, 1]))
     w = variables.Variable(array_ops.zeros([2, 2]))
     x = variables.Variable(array_ops.zeros([1, 3]))
     total = v + w
     placements = (
         (u, "/job:ps/task:0"),
         (v, "/job:ps/task:1"),
         (w, "/job:ps/task:1"),
         (x, "/job:ps/task:0"),
     )
     for variable, expected_device in placements:
       self.assertDeviceEqual(expected_device, variable.device)
       self.assertDeviceEqual(expected_device, variable.initializer.device)
     self.assertDeviceEqual("/job:worker", total.device)
Example #36
0
 def testBasic(self):
     """Checks placement under `RandomStrategy(2, seed=0)`.

     Four variables are created in order. They and their initializers are
     expected on ps tasks 1, 0, 1 and 1, and the sum of the second and third
     on "/job:worker".
     """
     setter = device_setter.replica_device_setter(
         cluster=_CLUSTER_SPEC,
         ps_strategy=device_setter_lib.RandomStrategy(2, seed=0))
     with ops.device(setter):
         u = variables.Variable(array_ops.zeros([2, 2]))
         v = variables.Variable(array_ops.zeros([2, 1]))
         w = variables.Variable(array_ops.zeros([2, 2]))
         x = variables.Variable(array_ops.zeros([1, 3]))
         total = v + w
         # Randomly distributed with seed 0.
         expected_tasks = ((u, 1), (v, 0), (w, 1), (x, 1))
         for variable, task_index in expected_tasks:
             expected_device = "/job:ps/task:%d" % task_index
             self.assertDeviceEqual(expected_device, variable.device)
             self.assertDeviceEqual(expected_device,
                                    variable.initializer.device)
         self.assertDeviceEqual("/job:worker", total.device)
 def testPS2TasksWithClusterSpecClass(self):
     """Checks placement of variables made via `ElasticAverageCustomGetter`.

     With a two-ps, three-worker `ClusterSpec`, both variables are expected
     on worker task 0, and the entries the getter's `_global_map` holds for
     them are expected on ps tasks 0 and 1 respectively.
     """
     from tensorflow.python.training import device_setter
     local_device = "/job:worker/task:0"
     jobs = {
         "ps": ["ps0:2222", "ps1:2222"],
         "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     }
     cluster = server_lib.ClusterSpec(jobs)
     getter = ElasticAverageCustomGetter(worker_device=local_device)
     with ops.device(
             device_setter.replica_device_setter(
                 cluster=cluster,
                 worker_device=local_device,
                 ps_device="/job:ps")):
         with variable_scope.variable_scope("", custom_getter=getter):
             v = variable_scope.get_variable(initializer=[1, 2], name="v")
             w = variable_scope.get_variable(initializer=[2, 1], name="w")
             global_v = getter._global_map[v]
             global_w = getter._global_map[w]
             checks = [
                 ("/job:worker/task:0", v),
                 ("job:ps/task:0", global_v),
                 ("/job:worker/task:0", w),
                 ("job:ps/task:1", global_w),
             ]
             for expected_device, variable in checks:
                 self.assertDeviceEqual(expected_device, variable.device)
Example #38
0
 def testBasic(self):
   """Checks placement under `RandomStrategy(2, seed=0)`.

   Four variables are created in order. They and their initializers are
   expected on ps tasks 1, 0, 1 and 1, and the sum of the second and third
   on "/job:worker".
   """
   random_strategy = device_setter_lib.RandomStrategy(2, seed=0)
   setter = device_setter.replica_device_setter(
       cluster=_CLUSTER_SPEC, ps_strategy=random_strategy)
   with ops.device(setter):
     shapes = ([2, 2], [2, 1], [2, 2], [1, 3])
     # Creation order matters: variables are built one by one, in shape order.
     u, v, w, x = [variables.Variable(array_ops.zeros(s)) for s in shapes]
     total = v + w
     # Randomly distributed with seed 0.
     placements = (
         (u, "/job:ps/task:1"),
         (v, "/job:ps/task:0"),
         (w, "/job:ps/task:1"),
         (x, "/job:ps/task:1"),
     )
     for variable, expected_device in placements:
       self.assertDeviceEqual(expected_device, variable.device)
       self.assertDeviceEqual(expected_device, variable.initializer.device)
     self.assertDeviceEqual("/job:worker", total.device)
Example #39
0
    def testUniformLoadEqualsRoundRobin(self):
        """Checks greedy load balancing with a constant load of 1 per op.

        Four variables are created in order. They and their initializers are
        expected to alternate between ps tasks 0 and 1, and the sum of the
        second and third is expected on "/job:worker".
        """
        def _constant_load(unused_op):
            return 1

        strategy = device_setter_lib.GreedyLoadBalancingStrategy(
            2, _constant_load)
        setter = device_setter.replica_device_setter(cluster=_CLUSTER_SPEC,
                                                     ps_strategy=strategy)
        with ops.device(setter):
            u = variables.Variable(array_ops.zeros([2, 2]))
            v = variables.Variable(array_ops.zeros([2, 1]))
            w = variables.Variable(array_ops.zeros([2, 2]))
            x = variables.Variable(array_ops.zeros([1, 3]))
            total = v + w
            for position, variable in enumerate((u, v, w, x)):
                # Even positions -> task 0, odd positions -> task 1.
                expected_device = "/job:ps/task:%d" % (position % 2)
                self.assertDeviceEqual(expected_device, variable.device)
                self.assertDeviceEqual(expected_device,
                                       variable.initializer.device)
            self.assertDeviceEqual("/job:worker", total.device)
    def benchmark_create_1000_partitions_with_100_parameter_servers(self):
        """Benchmarks reading partitioned variables created under 100 ps tasks.

        For every partition size, a float32 variable of
        `1000 * partition_size` elements is created under a 100-task replica
        device setter with a shard cap of `4 * partition_size` bytes and
        converted to a single tensor. After initialization, evaluating each
        of those tensors is benchmarked on the first worker session.
        """
        workers, _ = test.create_local_cluster(num_workers=1, num_ps=100)
        worker_sessions = [session_lib.Session(w.target) for w in workers]
        session = worker_sessions[0]
        sizes = (1, 512, 1024 * 32, 1024 * 128)

        concatenated = []
        for size in sizes:
            # The shard cap is 4 * size bytes and the shape is 1000 * size
            # float32s, which should partition into 1000 shards of `size`
            # float32s each.
            print(
                "Building partitioned variable with %d floats per partition" %
                size)
            with ops.device(device_setter.replica_device_setter(ps_tasks=100)):
                # Each partition to have exactly N float32s.
                shard_partitioner = (
                    partitioned_variables.variable_axis_size_partitioner(
                        max_shard_bytes=4 * size))
                sharded = variable_scope.get_variable(
                    "partitioned_%d" % size,
                    shape=[1000 * size],
                    dtype=dtypes.float32,
                    partitioner=shard_partitioner)
                # Concatenates along axis 0.
                concatenated.append(ops.convert_to_tensor(sharded))

        variables.global_variables_initializer().run(session=session)

        for size, tensor in zip(sizes, concatenated):
            print("Running benchmark having partitions with %d floats" % size)
            benchmark_name = ("read_concat_1000_partitions_from_"
                              "100_parameter_servers_partsize_%d_floats" %
                              size)
            self.run_op_benchmark(session, tensor, name=benchmark_name)
  def testUniformLoadEqualsRoundRobin(self):
    """Checks greedy load balancing with a constant load of 1 per op.

    Four variables are created in order. They and their initializers are
    expected on ps tasks 0, 1, 0 and 1, and the sum of the second and third
    on "/job:worker".
    """

    def _unit_load(unused_op):
      return 1

    balancing = device_setter_lib.GreedyLoadBalancingStrategy(2, _unit_load)
    setter = device_setter.replica_device_setter(
        cluster=self._cluster_spec, ps_strategy=balancing)
    with ops.device(setter):
      u = variables.Variable(array_ops.zeros([2, 2]))
      v = variables.Variable(array_ops.zeros([2, 1]))
      w = variables.Variable(array_ops.zeros([2, 2]))
      x = variables.Variable(array_ops.zeros([1, 3]))
      total = v + w
      placements = (
          (u, "/job:ps/task:0"),
          (v, "/job:ps/task:1"),
          (w, "/job:ps/task:0"),
          (x, "/job:ps/task:1"),
      )
      for variable, expected_device in placements:
        self.assertDeviceEqual(expected_device, variable.device)
        self.assertDeviceEqual(expected_device, variable.initializer.device)
      self.assertDeviceEqual("/job:worker", total.device)
  def _initialize_devices(self, num_gpus_per_worker, cluster_spec, task_type,
                          task_id):
    """Initialize internal devices.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per tower.
    The variable device is a device function or device string. The default
    variable device assigns variables to parameter servers in a round-robin
    fashion.

    Sets `_task_type`, `_task_id`, `_worker_device`, `_compute_devices`,
    `_canonical_compute_device_set`, `_variable_device`, `_parameter_devices`
    and `_default_device` on `self`.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations. `None` selects local mode.
      task_type: the current task type; falls back to "worker" when falsy.
      task_id: the current task id; falls back to 0 when falsy.

    Raises:
      ValueError: if the cluster_spec doesn't have ps jobs.
    """
    self._task_type = task_type or "worker"
    self._task_id = task_id or 0
    self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)

    # TODO(yuefengz): maybe clearer to split it into two classes, one for
    # the distributed case and one for the local case, once we have the factory
    # class/method.

    # Define compute devices which is a list of device strings and one for each
    # tower. When there are GPUs, replicate operations on these GPUs. Otherwise,
    # place operations on CPU.
    if cluster_spec is None:
      # Local mode.
      if num_gpus_per_worker > 0:
        self._compute_devices = list(
            map("/device:GPU:{}".format, range(num_gpus_per_worker)))
      else:
        self._compute_devices = [_LOCAL_CPU]
    else:
      # Distributed mode.
      if num_gpus_per_worker > 0:
        self._compute_devices = [
            "%s/device:GPU:%d" % (self._worker_device, i)
            for i in range(num_gpus_per_worker)
        ]
      else:
        self._compute_devices = [self._worker_device]

    self._compute_devices = list(
        map(device_util.resolve, self._compute_devices))
    self._canonical_compute_device_set = set(self._compute_devices)

    # Define variable device which is a device string in the local case and a
    # device function in the distributed case. It is used to open a device scope
    # where variables are defined.
    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices.
    if cluster_spec is None:
      # Local mode. If there is only one GPU, put everything on that GPU.
      # Otherwise, place variables on CPU.
      if num_gpus_per_worker == 1:
        assert len(list(self._compute_devices)) == 1
        self._variable_device = _LOCAL_GPU_0
        self._parameter_devices = [_LOCAL_GPU_0]
      else:
        self._variable_device = _LOCAL_CPU
        self._parameter_devices = [_LOCAL_CPU]
    else:
      # Distributed mode. Place variables on ps jobs in a round-robin fashion.
      # Note that devices returned from `replica_device_setter` are not
      # canonical and therefore we don't canonicalize all variable devices to
      # make them consistent.
      # TODO(yuefengz): support passing a strategy object to control variable
      # assignment.
      # TODO(yuefengz): merge the logic of replica_device_setter into this
      # class.
      num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
      if num_ps_replicas == 0:
        raise ValueError("The cluster spec needs to have `ps` jobs.")
      self._variable_device = device_setter.replica_device_setter(
          ps_tasks=num_ps_replicas,
          worker_device=self._worker_device,
          merge_devices=True,
          cluster=cluster_spec)

      # Parameter devices are all tasks of the "ps" job. Build a real list:
      # on Python 3 a bare `map` object is a one-shot iterator, so it would
      # be empty after the first traversal and would not match the list type
      # used in local mode.
      self._parameter_devices = [
          "/job:ps/task:{}".format(i) for i in range(num_ps_replicas)
      ]

    # Define the default device in cross-tower mode. In the distributed case, we
    # set the default device to the corresponding worker to prevent these ops
    # from being placed on other workers.
    if cluster_spec is None:
      self._default_device = None
    else:
      self._default_device = self._worker_device
    def _initialize_devices(self, num_gpus_per_worker, cluster_spec, task_type,
                            task_id):
        """Initialize internal devices.

        It creates variable devices and compute devices. Variables and
        operations will be assigned to them respectively. We have one compute
        device per tower. The variable device is a device function or device
        string. The default variable device assigns variables to parameter
        servers in a round-robin fashion.

        Sets `_task_type`, `_task_id`, `_worker_device`, `_compute_devices`,
        `_canonical_compute_device_set`, `_variable_device`,
        `_parameter_devices` and `_default_device` on `self`.

        Args:
          num_gpus_per_worker: number of local GPUs or GPUs per worker.
          cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
            cluster configurations. `None` selects local mode.
          task_type: the current task type; falls back to "worker" when falsy.
          task_id: the current task id; falls back to 0 when falsy.

        Raises:
          ValueError: if the cluster_spec doesn't have ps jobs.
        """
        self._task_type = task_type or "worker"
        self._task_id = task_id or 0
        self._worker_device = "/job:%s/task:%d" % (self._task_type,
                                                   self._task_id)

        # TODO(yuefengz): maybe clearer to split it into two classes, one for
        # the distributed case and one for the local case, once we have the
        # factory class/method.

        # Define compute devices which is a list of device strings and one for
        # each tower. When there are GPUs, replicate operations on these GPUs.
        # Otherwise, place operations on CPU.
        if cluster_spec is None:
            # Local mode.
            if num_gpus_per_worker > 0:
                self._compute_devices = list(
                    map("/device:GPU:{}".format, range(num_gpus_per_worker)))
            else:
                self._compute_devices = [_LOCAL_CPU]
        else:
            # Distributed mode.
            if num_gpus_per_worker > 0:
                self._compute_devices = [
                    "%s/device:GPU:%d" % (self._worker_device, i)
                    for i in range(num_gpus_per_worker)
                ]
            else:
                self._compute_devices = [self._worker_device]

        self._compute_devices = list(
            map(device_util.resolve, self._compute_devices))
        self._canonical_compute_device_set = set(self._compute_devices)

        # Define variable device which is a device string in the local case and
        # a device function in the distributed case. It is used to open a
        # device scope where variables are defined.
        # The `_parameter_devices` is needed for the `parameter_devices`
        # property and is a list of all variable devices.
        if cluster_spec is None:
            # Local mode. If there is only one GPU, put everything on that GPU.
            # Otherwise, place variables on CPU.
            if num_gpus_per_worker == 1:
                assert len(list(self._compute_devices)) == 1
                self._variable_device = _LOCAL_GPU_0
                self._parameter_devices = [_LOCAL_GPU_0]
            else:
                self._variable_device = _LOCAL_CPU
                self._parameter_devices = [_LOCAL_CPU]
        else:
            # Distributed mode. Place variables on ps jobs in a round-robin
            # fashion. Note that devices returned from `replica_device_setter`
            # are not canonical and therefore we don't canonicalize all
            # variable devices to make them consistent.
            # TODO(yuefengz): support passing a strategy object to control
            # variable assignment.
            # TODO(yuefengz): merge the logic of replica_device_setter into
            # this class.
            num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
            if num_ps_replicas == 0:
                raise ValueError("The cluster spec needs to have `ps` jobs.")
            self._variable_device = device_setter.replica_device_setter(
                ps_tasks=num_ps_replicas,
                worker_device=self._worker_device,
                merge_devices=True,
                cluster=cluster_spec)

            # Parameter devices are all tasks of the "ps" job. Build a real
            # list: on Python 3 a bare `map` object is a one-shot iterator, so
            # it would be empty after the first traversal and would not match
            # the list type used in local mode.
            self._parameter_devices = [
                "/job:ps/task:{}".format(i) for i in range(num_ps_replicas)
            ]

        # Define the default device in cross-tower mode. In the distributed
        # case, we set the default device to the corresponding worker to
        # prevent these ops from being placed on other workers.
        if cluster_spec is None:
            self._default_device = None
        else:
            self._default_device = self._worker_device
  def _initialize_multi_worker(self, cluster_resolver):
    """Set up devices for multi-worker training from a cluster resolver.

    Reads the cluster spec, task type and task id from `cluster_resolver`,
    then derives the worker device, one compute device per replica (one per
    GPU when GPUs are present, otherwise the worker device itself), the
    variable device function that places variables on the "ps" job, and the
    tuple of parameter devices (every task of the "ps" job).

    Args:
      cluster_resolver: a descendant of `ClusterResolver` object.

    Raises:
      ValueError: if the task type or task id is missing, or if the cluster
        doesn't have ps jobs.
    """
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if not isinstance(cluster_resolver, TFConfigClusterResolver):
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
    else:
      num_gpus = context.num_gpus()

    # Save the num_gpus_per_worker for configure method.
    self._num_gpus_per_worker = num_gpus

    cluster_spec = cluster_resolver.cluster_spec()
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if not (task_type and task_id is not None):
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    assert cluster_spec.as_dict()

    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._input_host_device = numpy_dataset.SingleDevice(worker_device)

    # One compute device string per replica: the worker's GPUs when it has
    # any, otherwise the worker device itself.
    compute_devices = (
        tuple("%s/device:GPU:%d" % (worker_device, gpu_index)
              for gpu_index in range(num_gpus))
        if num_gpus > 0 else (worker_device,))

    self._device_map = values.ReplicaDeviceMap(compute_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(worker_device, compute_devices)])

    # In distributed mode, place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    ps_task_count = len(cluster_spec.as_dict().get("ps", []))
    if ps_task_count == 0:
      raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=ps_task_count,
        worker_device=worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job.
    self._parameter_devices = tuple(
        "/job:ps/task:{}".format(ps_index) for ps_index in range(ps_task_count))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    summary_format = (
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
        "variable_device = %r")
    logging.info(summary_format, cluster_spec.as_dict(), task_type, task_id,
                 ps_task_count, self._is_chief, self._device_map,
                 self._variable_device)
Example #45
0
 def testResource(self):
   """Expects a `ResourceVariable` created under the setter on ps task 0."""
   setter = device_setter.replica_device_setter(cluster=self._cluster_spec)
   with ops.device(setter):
     resource_var = resource_variable_ops.ResourceVariable([1, 2])
     self.assertDeviceEqual("/job:ps/task:0", resource_var.device)
  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                               task_type, task_id):
    """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      ValueError: if the cluster_spec doesn't have ps jobs.
    """
    assert cluster_spec
    if not task_type or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

    self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)

    # Define compute devices which is a list of device strings and one for each
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus_per_worker > 0:
      self._compute_devices = [
          "%s/device:GPU:%d" % (self._worker_device, i)
          for i in range(num_gpus_per_worker)
      ]
    else:
      self._compute_devices = [self._worker_device]

    self._compute_devices = list(
        map(device_util.resolve, self._compute_devices))
    self._canonical_compute_device_set = set(self._compute_devices)

    # In distributed mode, place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
      raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=self._worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job.
    self._parameter_devices = map("/job:ps/task:{}".format,
                                  range(num_ps_replicas))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = self._worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
        num_ps_replicas, self._is_chief, self._compute_devices,
        self._variable_device)
 def testResource(self):
   """Expects a `ResourceVariable` created under the setter on ps task 0."""
   placement_fn = device_setter.replica_device_setter(
       cluster=self._cluster_spec)
   with ops.device(placement_fn):
     variable = resource_variable_ops.ResourceVariable([1, 2])
     self.assertDeviceEqual("/job:ps/task:0", variable.device)
def _get_workers(num_workers, period, workers, num_ps=1):
    """Build a graph, a train op and a monitored session for every worker.

    Each worker gets its own `ops.Graph` holding two scalar variables
    ("v0", "v1") created through an `AGNCustomGetter`, an `AGNOptimizer`
    wrapping Adam, and a `MonitoredTrainingSession` connected to
    `workers[worker_id].target`.

    Args:
      num_workers: number of workers to build; also forwarded to the
        optimizer as `num_worker`.
      period: forwarded to the optimizer as `communication_period`.
      workers: sequence indexed by worker id; only `.target` is read.
      num_ps: when greater than 1, a partitioned variable with `num_ps`
        fixed-size partitions is also created and its first two partitions
        are added to the gradient update.

    Returns:
      A `(sessions, graphs, train_ops)` tuple of lists with one entry per
      worker. Every `train_ops` entry is a single-element list.
    """
    sessions, graphs, train_ops = [], [], []

    for worker_id in range(num_workers):
        graph = ops.Graph()
        is_chief = worker_id == 0

        with graph.as_default():
            worker_device = "/job:worker/task:%d/cpu:0" % worker_id
            placement = device_setter.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/task:0/cpu:0",
                ps_tasks=1)
            getter = agn_optimizer.AGNCustomGetter(
                worker_device=worker_device)

            # Plain (unpartitioned) variables plus the global step.
            with variable_scope.variable_scope("", custom_getter=getter):
                with ops.device(placement):
                    global_step = training_util.get_or_create_global_step()
                    var_0 = variable_scope.get_variable(
                        initializer=0.0, name="v0")
                    var_1 = variable_scope.get_variable(
                        initializer=0.5, name="v1")

            # Optional partitioned variable, only with several ps tasks.
            if num_ps > 1:
                shard_fn = partitioned_variables.fixed_size_partitioner(
                    num_ps, axis=0)
                with variable_scope.variable_scope(
                        "", partitioner=shard_fn, custom_getter=getter):
                    with ops.device(placement):
                        partition_var = variable_scope.get_variable(
                            "partition_var",
                            shape=[2, 4],
                            initializer=init_ops.zeros_initializer)
                        shards = list(partition_var)
                        part_0 = shards[0]
                        part_1 = shards[1]

            with ops.device("/job:worker/task:" + str(worker_id)):
                grads_0 = constant_op.constant(-1.0)
                grads_1 = constant_op.constant(-1.0)
                grads_part_0 = constant_op.constant([[-1., -1., -1., -1.]])
                grads_part_1 = constant_op.constant([[-1., -1., -1., -1.]])

                base_opt = adam.AdamOptimizer(
                    learning_rate=0.1, beta1=0.0, beta2=0.0)
                opt = agn_optimizer.AGNOptimizer(
                    base_opt,
                    num_worker=num_workers,
                    communication_period=period,
                    custom_getter=getter)

                # The partition gradients join the update whenever num_ps is
                # anything other than 1.
                grads_and_vars = ([grads_0, var_0], [grads_1, var_1])
                if num_ps != 1:
                    grads_and_vars += ([grads_part_0, part_0],
                                       [grads_part_1, part_1])
                train_op = [opt.apply_gradients(grads_and_vars, global_step)]
                hook = opt.make_session_run_hook(is_chief, worker_id)

            # Creates MonitoredSession
            sess = training.MonitoredTrainingSession(
                workers[worker_id].target, hooks=[hook])

        sessions.append(sess)
        graphs.append(graph)
        train_ops.append(train_op)

    return sessions, graphs, train_ops
Example #49
0
    def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                                 task_type, task_id):
        """Initialize devices for multiple workers.

        It creates variable devices and compute devices. Variables and
        operations will be assigned to them respectively. We have one compute
        device per replica. The variable device is a device function or device
        string. The default variable device assigns variables to parameter
        servers in a round-robin fashion.

        Args:
          num_gpus_per_worker: number of local GPUs or GPUs per worker.
          cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
            cluster configurations.
          task_type: the current task type.
          task_id: the current task id.

        Raises:
          ValueError: if `task_type` or `task_id` is missing, or if the
            cluster_spec doesn't have ps jobs.
        """
        assert cluster_spec
        if not task_type or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

        # Use the validated arguments here: `self._task_type` and
        # `self._task_id` are only assigned further down in this method.
        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)

        # Define compute devices which is a list of device strings and one for each
        # replica. When there are GPUs, replicate operations on these GPUs.
        # Otherwise, place operations on CPU.
        if num_gpus_per_worker > 0:
            self._compute_devices = [
                "%s/device:GPU:%d" % (self._worker_device, i)
                for i in range(num_gpus_per_worker)
            ]
        else:
            self._compute_devices = [self._worker_device]

        self._compute_devices = list(
            map(device_util.resolve, self._compute_devices))
        self._canonical_compute_device_set = set(self._compute_devices)

        # In distributed mode, place variables on ps jobs in a round-robin fashion.
        # Note that devices returned from `replica_device_setter` are not
        # canonical and therefore we don't canonicalize all variable devices to
        # make them consistent.
        # TODO(yuefengz): support passing a strategy object to control variable
        # assignment.
        # TODO(yuefengz): merge the logic of replica_device_setter into this
        # class.
        num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
        if num_ps_replicas == 0:
            raise ValueError("The cluster spec needs to have `ps` jobs.")
        self._variable_device = device_setter.replica_device_setter(
            ps_tasks=num_ps_replicas,
            worker_device=self._worker_device,
            merge_devices=True,
            cluster=cluster_spec)

        # The `_parameter_devices` is needed for the `parameter_devices` property
        # and is a list of all variable devices. Here parameter devices are all
        # tasks of the "ps" job. Built as a real list: on Python 3 a bare
        # `map(...)` is a one-shot iterator that is empty after the first read.
        self._parameter_devices = [
            "/job:ps/task:{}".format(ps_index)
            for ps_index in range(num_ps_replicas)
        ]

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = self._worker_device

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker ParameterServerStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
            "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
            num_ps_replicas, self._is_chief, self._compute_devices,
            self._variable_device)
Example #50
0
    def common_run_context(self, var_list, opt_list, name):
        """Grow two distributed `deo` variables by training, then restrict them.

        The graph is built under a `replica_device_setter` for a two-ps
        cluster. Training steps run until both variables report a size above
        `threshold * factor` (at most `max_iter` steps). The sizes of the
        variables, of each policy's `tstp_var`, and of the optimizer slot
        variables are asserted to exceed that limit. After `restrict_op` runs
        they are asserted to be below `threshold * factor + 1`.

        Args:
          var_list: not read by this method.
          opt_list: not read; it is rebound to `get_multiple_optimizers()`
            before first use.
            NOTE(review): confirm whether the caller's list was meant to be
            honoured.
          name: not read by this method.
        """
        batch_size = 2
        sample_length = 3
        emb_domain_list = list()
        tws = list()

        cluster = ps_worker_cluster(ps_num=2)
        ps_servers, worker_servers, cluster_def = cluster

        config = config_pb2.ConfigProto(
            cluster_def=cluster_def,
            experimental=config_pb2.ConfigProto.Experimental(
                share_session_state_in_clusterspec_propagation=True, ),
            allow_soft_placement=False,
            inter_op_parallelism_threads=2,
            intra_op_parallelism_threads=2,
            gpu_options=config_pb2.GPUOptions(allow_growth=True),
        )

        dev_placement = device_setter.replica_device_setter(
            ps_tasks=2,
            ps_device='/job:ps',
            worker_device='/job:worker',
            cluster=cluster_def,
        )

        with ops.device(dev_placement):
            shared_var_0 = deo.get_variable('distributed_sp_var_0',
                                            initializer=0.0,
                                            devices=['/job:worker/task:0'],
                                            dim=8)
            shared_var_1 = deo.get_variable('distributed_sp_var_1',
                                            initializer=0.0,
                                            devices=['/job:worker/task:0'],
                                            dim=4)
            opt_list = get_multiple_optimizers()

            distributed_var_list = [shared_var_0, shared_var_1]
            for _v in distributed_var_list:
                ids = random_ops.random_uniform((batch_size, sample_length),
                                                maxval=1000000,
                                                dtype=_v.key_dtype)
                ids = array_ops.reshape(ids, (-1, ))

                _, tw = deo.embedding_lookup(_v, ids, return_trainable=True)
                tws.append(tw)
                _collapse = array_ops.reshape(tw, (batch_size, -1))
                _logits = math_ops.reduce_sum(_collapse, axis=1)
                _logits = math_ops.cast(_logits, dtypes.float32)
                emb_domain_list.append(_logits)
            logits = math_ops.add_n(emb_domain_list)

            labels = array_ops.zeros((batch_size, ), dtype=dtypes.float32)
            loss = math_ops.reduce_mean(
                nn_impl.sigmoid_cross_entropy_with_logits(
                    logits=logits,
                    labels=labels,
                ))

            _train_ops = [_opt.minimize(loss) for _opt in opt_list]
            train_op = control_flow_ops.group(_train_ops)

            restrictor = dvr.VariableRestrictor(var_list=distributed_var_list,
                                                optimizer_list=opt_list)
            update_op = restrictor.update()
            threshold = int(batch_size * sample_length * 1.5)
            factor = 1.2
            restrict_op = restrictor.restrict(threshold=threshold,
                                              factor=factor)

        # Size every table must exceed before, and fall back under after,
        # the restrict op.
        size_limit = threshold * factor

        policies = list(itertools.chain(*restrictor.policy_group.values()))
        tstp_vars = [policy.tstp_var for policy in policies]
        slot_vars = list()
        for tw in tws:
            for opt in opt_list:
                slot_vars += select_slot_vars(tw, opt)

        # The `with` block closes the session on exit; no explicit close needed.
        with session.Session(worker_servers[0].target, config=config) as sess:
            sess.run(variables.global_variables_initializer())

            # Bounded loop: the previous `while n < MAX_ITER` never advanced
            # `n`, so it could spin forever if the tables stopped growing. If
            # the bound is hit, the assertions below report the failure.
            max_iter = 1000
            for _ in range(max_iter):
                sess.run([train_op, update_op])
                if all(
                        sess.run(var.size()) > size_limit
                        for var in distributed_var_list):
                    break

            s1 = sess.run([var.size() for var in distributed_var_list])
            s2 = sess.run([tv.size() for tv in tstp_vars])
            s3 = sess.run([sv.size() for sv in slot_vars])

            self.assertAllGreater(s1, size_limit)
            self.assertAllGreater(s2, size_limit)
            if s3:
                self.assertAllGreater(s3, size_limit)

            sess.run(restrict_op)
            s1 = sess.run([var.size() for var in distributed_var_list])
            s2 = sess.run([tv.size() for tv in tstp_vars])
            s3 = sess.run([sv.size() for sv in slot_vars])

            self.assertAllLess(s1, size_limit + 1)
            self.assertAllLess(s2, size_limit + 1)
            if s3:
                self.assertAllLess(s3, size_limit + 1)
Example #51
0
def _get_workers(num_workers, period, workers, moving_rate, num_ps=1):
    """Build a graph, train op, session and saver for every worker.

    Each worker gets its own `ops.Graph` holding two scalar variables
    ("v0", "v1") created through an `ElasticAverageCustomGetter`, an
    `ElasticAverageOptimizer` wrapping plain gradient descent, and a
    `MonitoredTrainingSession` connected to `workers[worker_id].target`.

    Args:
      num_workers: number of workers to build; also forwarded to the
        optimizer as `num_worker`.
      period: forwarded to the optimizer as `communication_period`.
      workers: sequence indexed by worker id; only `.target` is read.
      moving_rate: forwarded to the optimizer as `moving_rate`.
      num_ps: when greater than 1, a partitioned variable with `num_ps`
        fixed-size partitions is also created and its first two partitions
        are added to the gradient update.

    Returns:
      A `(sessions, graphs, train_ops, savers)` tuple of lists with one entry
      per worker. Every `train_ops` entry is a single-element list.
    """
    sessions, graphs, train_ops, savers = [], [], [], []

    for worker_id in range(num_workers):
        graph = ops.Graph()
        is_chief = worker_id == 0

        with graph.as_default():
            worker_device = "/job:worker/task:%d/cpu:0" % worker_id
            ea_custom = ElasticAverageCustomGetter(worker_device=worker_device)

            # Plain (unpartitioned) variables plus the global step.
            with variable_scope.variable_scope("", custom_getter=ea_custom):
                with ops.device(
                        device_setter.replica_device_setter(
                            worker_device=worker_device,
                            ps_device="/job:ps/task:0/cpu:0",
                            ps_tasks=1)):
                    global_step = training_util.get_or_create_global_step()
                    var_0 = variable_scope.get_variable(
                        initializer=0.0, name="v0")
                    var_1 = variable_scope.get_variable(
                        initializer=1.0, name="v1")

            # Optional partitioned variable, only with several ps tasks.
            if num_ps > 1:
                with variable_scope.variable_scope(
                        "",
                        partitioner=partitioned_variables.
                        fixed_size_partitioner(num_ps, axis=0),
                        custom_getter=ea_custom):
                    with ops.device(
                            device_setter.replica_device_setter(
                                worker_device=worker_device,
                                ps_device="/job:ps/task:0/cpu:0",
                                ps_tasks=num_ps)):
                        partition_var = variable_scope.get_variable(
                            'partition_var',
                            shape=[2, 4],
                            initializer=init_ops.ones_initializer)
                        shards = list(partition_var)
                        part_0 = shards[0]
                        part_1 = shards[1]

            with ops.device("/job:worker/task:" + str(worker_id)):
                grads_0 = constant_op.constant(-1.0)
                grads_1 = constant_op.constant(-1.0)
                grads_part_0 = constant_op.constant([[-1., -1., -1., -1.]])
                grads_part_1 = constant_op.constant([[-1., -1., -1., -1.]])

                sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
                opt = ElasticAverageOptimizer(
                    opt=sgd_opt,
                    num_worker=num_workers,
                    moving_rate=moving_rate,
                    communication_period=period,
                    ea_custom_getter=ea_custom)

                # The partition gradients join the update whenever num_ps is
                # anything other than 1.
                grads_and_vars = ([grads_0, var_0], [grads_1, var_1])
                if num_ps != 1:
                    grads_and_vars += ([grads_part_0, part_0],
                                       [grads_part_1, part_1])
                train_op = [opt.apply_gradients(grads_and_vars, global_step)]
                easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
                saver = opt.swapping_saver()

            # Creates MonitoredSession
            sess = training.MonitoredTrainingSession(
                workers[worker_id].target, hooks=[easgd_hook])

        sessions.append(sess)
        graphs.append(graph)
        train_ops.append(train_op)
        savers.append(saver)

    return sessions, graphs, train_ops, savers
    def _initialize_multi_worker(self, cluster_resolver):
        """Set up compute, variable and parameter devices for a multi-worker run.

        One compute device is created per replica (one per GPU, or the worker
        itself when there are no GPUs). Variables are placed through the
        device function returned by `replica_device_setter`, and the parameter
        devices are all tasks of the "ps" job.

        Args:
          cluster_resolver: a descendant of `ClusterResolver` object.

        Raises:
          ValueError: if the resolver has no task type or task id, or if the
            cluster doesn't have ps jobs.
        """
        gpu_count = cluster_resolver.num_accelerators()
        cluster_spec = cluster_resolver.cluster_spec()
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_id is None or not task_type:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        assert cluster_spec.as_dict()

        this_worker = "/job:%s/task:%d" % (task_type, task_id)
        self._input_host_device = numpy_dataset.SingleDevice(this_worker)

        # One device string per replica: every local GPU when there are any,
        # otherwise just the worker device itself.
        if gpu_count > 0:
            replica_devices = tuple(
                "%s/device:GPU:%d" % (this_worker, gpu_index)
                for gpu_index in range(gpu_count))
        else:
            replica_devices = (this_worker, )

        self._device_map = values.ReplicaDeviceMap(replica_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(this_worker, replica_devices)])

        # Variables go to the ps job through `replica_device_setter`. The
        # devices it returns are not canonical, so variable devices are left
        # un-canonicalized to keep them consistent.
        # TODO(yuefengz): support passing a strategy object to control variable
        # assignment.
        # TODO(yuefengz): merge the logic of replica_device_setter into this
        # class.
        ps_count = len(cluster_spec.as_dict().get("ps", []))
        if ps_count == 0:
            raise ValueError("The cluster spec needs to have `ps` jobs.")
        self._variable_device = device_setter.replica_device_setter(
            ps_tasks=ps_count,
            worker_device=this_worker,
            merge_devices=True,
            cluster=cluster_spec)

        # Backs the `parameter_devices` property: every task of the "ps" job.
        self._parameter_devices = tuple(
            "/job:ps/task:{}".format(ps_index) for ps_index in range(ps_count))

        # Default device, so ops without an explicit device do not end up on
        # other workers.
        self._default_device = this_worker

        self._is_chief = multi_worker_util.is_chief(
            cluster_spec, task_type, task_id)
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker ParameterServerStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
            "variable_device = %r",
            cluster_spec.as_dict(),
            task_type,
            task_id,
            ps_count,
            self._is_chief,
            self._device_map,
            self._variable_device)
def _get_workers(num_workers, period, workers, moving_rate, num_ps=1):
  """Build a graph, train op, session and saver for every worker.

  Each worker gets its own `ops.Graph` holding two scalar variables
  ("v0", "v1") created through an `ElasticAverageCustomGetter`, an
  `ElasticAverageOptimizer` wrapping plain gradient descent, and a
  `MonitoredTrainingSession` connected to `workers[worker_id].target`.

  Args:
    num_workers: number of workers to build; also forwarded to the optimizer
      as `num_worker`.
    period: forwarded to the optimizer as `communication_period`.
    workers: sequence indexed by worker id; only `.target` is read.
    moving_rate: forwarded to the optimizer as `moving_rate`.
    num_ps: when greater than 1, a partitioned variable with `num_ps`
      fixed-size partitions is also created and its first two partitions are
      added to the gradient update.

  Returns:
    A `(sessions, graphs, train_ops, savers)` tuple of lists with one entry
    per worker. Every `train_ops` entry is a single-element list.
  """
  sessions, graphs, train_ops, savers = [], [], [], []

  for worker_id in range(num_workers):
    graph = ops.Graph()
    is_chief = worker_id == 0

    with graph.as_default():
      worker_device = "/job:worker/task:%d/cpu:0" % worker_id
      ea_custom = ElasticAverageCustomGetter(worker_device=worker_device)

      # Plain (unpartitioned) variables plus the global step.
      with variable_scope.variable_scope("", custom_getter=ea_custom):
        with ops.device(
            device_setter.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/task:0/cpu:0",
                ps_tasks=1)):
          global_step = training_util.get_or_create_global_step()
          var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
          var_1 = variable_scope.get_variable(initializer=1.0, name="v1")

      # Optional partitioned variable, only with several ps tasks.
      if num_ps > 1:
        with variable_scope.variable_scope(
            "",
            partitioner=partitioned_variables.fixed_size_partitioner(
                num_ps, axis=0),
            custom_getter=ea_custom):
          with ops.device(
              device_setter.replica_device_setter(
                  worker_device=worker_device,
                  ps_device="/job:ps/task:0/cpu:0",
                  ps_tasks=num_ps)):
            partition_var = variable_scope.get_variable(
                'partition_var',
                shape=[2, 4],
                initializer=init_ops.ones_initializer)
            shards = list(partition_var)
            part_0 = shards[0]
            part_1 = shards[1]

      with ops.device("/job:worker/task:" + str(worker_id)):
        grads_0 = constant_op.constant(-1.0)
        grads_1 = constant_op.constant(-1.0)
        grads_part_0 = constant_op.constant([[-1., -1., -1., -1.]])
        grads_part_1 = constant_op.constant([[-1., -1., -1., -1.]])

        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
        opt = ElasticAverageOptimizer(
            opt=sgd_opt,
            num_worker=num_workers,
            moving_rate=moving_rate,
            communication_period=period,
            ea_custom_getter=ea_custom)

        # The partition gradients join the update whenever num_ps is anything
        # other than 1.
        grads_and_vars = ([grads_0, var_0], [grads_1, var_1])
        if num_ps != 1:
          grads_and_vars += ([grads_part_0, part_0], [grads_part_1, part_1])
        train_op = [opt.apply_gradients(grads_and_vars, global_step)]
        easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
        saver = opt.swapping_saver()

      # Creates MonitoredSession
      sess = training.MonitoredTrainingSession(
          workers[worker_id].target, hooks=[easgd_hook])

    sessions.append(sess)
    graphs.append(graph)
    train_ops.append(train_op)
    savers.append(saver)

  return sessions, graphs, train_ops, savers