Example No. 1
 def testNoPS(self):
     p = cluster_factory.Cluster.Params()
     p.worker.name = '/job:trainer'
     p.worker.replicas = 1
     p.ps.name = '/job:trainer'
     p.ps.replicas = 1
     c = cluster_factory.Cluster(p)
     g = tf.Graph()
     vs = []
     with g.as_default():
         with tf.device(c.GetPlacer()):
             for i in range(10):
                 vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
             sum_all = tf.add_n(vs)
     for v in vs:
         self.assertEqual(
             v.device,
             cluster.MakeDeviceString(job_name='/job:trainer',
                                      task_id=0,
                                      device_name='CPU',
                                      device_id=0))
     self.assertEqual(
         sum_all.device,
         cluster.MakeDeviceString(job_name='/job:trainer',
                                  task_id=0,
                                  device_name='CPU',
                                  device_id=0))
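For reference, cluster.MakeDeviceString assembles a standard TensorFlow device string from its parts. A minimal sketch of the format the assertions above expect (the real helper lives in lingvo.core.cluster and its exact signature may differ):

    def make_device_string(job_name, task_id, device_name, device_id, replica_id=0):
        # Hypothetical re-implementation for illustration only.
        return '%s/replica:%d/task:%d/device:%s:%d' % (
            job_name, replica_id, task_id, device_name, device_id)

    # make_device_string('/job:trainer', 0, 'CPU', 0)
    # -> '/job:trainer/replica:0/task:0/device:CPU:0'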
Example No. 2
 def testDeviceListMultiReplicaSyncSgd(self):
     p = cluster_factory.Cluster.Params()
     p.mode = 'sync'
     p.job = 'trainer_client'
     p.worker.name = '/job:localhost'
     p.worker.replicas = 2
     p.worker.gpus_per_replica = 2
     c = cluster_factory.Cluster(p)
     gpu_devices = c.available_devices
     expected_gpu_devices = [
         [
             cluster.MakeDeviceString(job_name='/job:localhost',
                                      task_id=0,
                                      device_name='GPU',
                                      device_id=0),
             cluster.MakeDeviceString(job_name='/job:localhost',
                                      task_id=0,
                                      device_name='GPU',
                                      device_id=1),
         ],
         [
             cluster.MakeDeviceString(job_name='/job:localhost',
                                      task_id=1,
                                      device_name='GPU',
                                      device_id=0),
             cluster.MakeDeviceString(job_name='/job:localhost',
                                      task_id=1,
                                      device_name='GPU',
                                      device_id=1),
         ]
     ]
     self.assertAllEqual(gpu_devices, expected_gpu_devices)
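The expected value is simply a replicas x gpus_per_replica grid of device strings, one row per worker task. Assuming the device-string format sketched under Example No. 1, the literal above collapses to a comprehension:

    expected_gpu_devices = [[
        cluster.MakeDeviceString(job_name='/job:localhost', task_id=task,
                                 device_name='GPU', device_id=gpu)
        for gpu in range(2)   # p.worker.gpus_per_replica
    ] for task in range(2)]   # p.worker.replicas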
Example No. 3
 def testDefaultParamsWithDynamicShape(self):
     p = cluster_factory.Cluster.Params()
     c = cluster_factory.Cluster(p)
     g = tf.Graph()
     vs = []
     with g.as_default():
         with tf.device(c.GetPlacer()):
             for i in range(10):
                 dyn_shape = tf.constant([2], dtype=tf.int32)
                 dyn_shape = tf.placeholder_with_default(dyn_shape,
                                                         shape=[None])
                 v = tf.get_variable('x%d_wb/var' % i,
                                     initializer=tf.random.uniform(
                                         dyn_shape, dtype=tf.float64),
                                     validate_shape=False)
                 vs.append(v)
             sum_all = tf.add_n(vs)
     for v in vs:
         self.assertEqual(
             v.device,
             cluster.MakeDeviceString(job_name='/job:localhost',
                                      task_id=0,
                                      device_name='CPU',
                                      device_id=0))
     self.assertEqual(
         sum_all.device,
         cluster.MakeDeviceString(job_name='/job:localhost',
                                  task_id=0,
                                  device_name='CPU',
                                  device_id=0))
Example No. 4
    def testDeviceListMultiReplicaNoSyncSgd(self):
        p = cluster_factory.Cluster.Params()
        p.mode = 'async'
        p.job = 'trainer'
        p.task = 1
        p.worker.replicas = 2
        p.worker.gpus_per_replica = 2
        c = cluster_factory.Cluster(p)
        gpu_devices = c.available_devices
        expected_gpu_devices = [[
            cluster.MakeDeviceString(job_name='/job:localhost',
                                     task_id=1,
                                     device_name='GPU',
                                     device_id=0),
            cluster.MakeDeviceString(job_name='/job:localhost',
                                     task_id=1,
                                     device_name='GPU',
                                     device_id=1),
        ]]
        self.assertAllEqual(gpu_devices, expected_gpu_devices)

        # Compute the total number of worker devices for a multi
        # replica setup.
        self.assertEqual(4, c.total_worker_devices)

        # Even when the job is different, we still look at the worker
        # information.
        p.job = 'controller'
        p.task = 0
        c = cluster_factory.Cluster(p)
        self.assertEqual(4, c.total_worker_devices)
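total_worker_devices is evidently the flat device count across all worker replicas, i.e. replicas * gpus_per_replica, independent of which job the current process runs as. A quick sanity check under that assumption:

    # 2 worker replicas x 2 GPUs per replica = 4 worker devices in total.
    assert p.worker.replicas * p.worker.gpus_per_replica == 4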
Example No. 5
 def testPSWithGPUs(self):
     p = cluster_factory.Cluster.Params()
     p.worker.name = '/job:trainer'
     p.worker.replicas = 1
     p.ps.name = '/job:ps'
     p.ps.replicas = 4
     p.ps.gpus_per_replica = 2
     c = cluster_factory.Cluster(p)
     g = tf.Graph()
     vs = []
     with g.as_default():
         with tf.device(c.GetPlacer()):
             for i in range(10):
                 vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
             sum_all = tf.add_n(vs)
     for i, v in enumerate(vs):
         self.assertEqual(
             v.device,
             cluster.MakeDeviceString(job_name='/job:ps',
                                       task_id=(i // 2) % 4,
                                      device_name='GPU',
                                      device_id=i % 2))
     self.assertEqual(
         sum_all.device,
         cluster.MakeDeviceString(job_name='/job:trainer',
                                  task_id=0,
                                  device_name='CPU',
                                  device_id=0))
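The loop assertion encodes a round-robin placement over the 8 parameter-server GPUs (4 replicas x 2 GPUs each): variable i goes to GPU i % 2 of task (i // 2) % 4, so consecutive variables fill both GPUs of one task before moving on to the next. A sketch of the resulting order, assuming that indexing:

    for i in range(10):
        task, gpu = (i // 2) % 4, i % 2
        print('x%d -> /job:ps task:%d GPU:%d' % (i, task, gpu))
    # x0 -> task:0 GPU:0, x1 -> task:0 GPU:1,
    # x2 -> task:1 GPU:0, x3 -> task:1 GPU:1, ... x8 wraps back to task:0.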
Example No. 6
 def testDefaultParams(self):
   p = cluster_factory.Cluster.Params()
   c = cluster_factory.Cluster(p)
   self.assertFalse(c.add_summary)
   g = tf.Graph()
   vs = []
   with g.as_default():
     with tf.device(c.GetPlacer()):
       for i in range(10):
         vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
       sum_all = tf.add_n(vs)
   for v in vs:
     self.assertEqual(
         v.device,
         cluster.MakeDeviceString(
             job_name='/job:localhost',
             replica_id=0,
             task_id=0,
             device_name='CPU',
             device_id=0))
   self.assertEqual(
       sum_all.device,
       cluster.MakeDeviceString(
           job_name='/job:localhost',
           replica_id=0,
           task_id=0,
           device_name='CPU',
           device_id=0))
Example No. 7
 def testDeviceListOneReplicaCpu(self):
     p = cluster_factory.Cluster.Params()
     p.mode = 'async'
     p.job = 'trainer'
     p.worker.cpus_per_replica = 2
     c = cluster_factory.Cluster(p)
     cpu_devices = c.available_devices
     expected_cpu_devices = [[
         cluster.MakeDeviceString(job_name='/job:localhost',
                                  task_id=0,
                                  device_name='CPU',
                                  device_id=0),
         cluster.MakeDeviceString(job_name='/job:localhost',
                                  task_id=0,
                                  device_name='CPU',
                                  device_id=1),
     ]]
     self.assertAllEqual(cpu_devices, expected_cpu_devices)
Example No. 8
 def testInputDevice(self):
     p = cluster_factory.Cluster.Params()
     p.mode = 'sync'
     p.job = 'decoder'
     p.decoder.replicas = 1
     p.task = 0
     p.input.name = '/job:input'
     p.input.replicas = 1
     c = cluster_factory.Cluster(p)
     input_device = c.input_device
     expected_device = cluster.MakeDeviceString(job_name='/job:input',
                                                task_id=0,
                                                device_name='CPU',
                                                device_id=0)
     self.assertEqual(input_device, expected_device)
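The returned string would typically be used to pin the input pipeline onto the dedicated input job. A hedged usage sketch (input_generator and its method are hypothetical names):

    with tf.device(c.input_device):
        # Input ops are built on /job:input's CPU, away from the decoder.
        batch = input_generator.GetInputBatch()  # hypothetical call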
Example No. 9
 def testWorkerDeviceInModelSplitSync(self):
     p = cluster_factory.Cluster.Params()
     p.mode = 'sync'
     p.job = 'trainer_client'
     p.worker.name = '/job:trainer'
     p.worker.replicas = 4
     p.worker.gpus_per_replica = 4
     p.worker.devices_per_split = 2
     with cluster_factory.Cluster(p):
         with cluster_factory.SetModelSplit(1) as c:
             d = c.WorkerDeviceInModelSplit(1)
             expected_device = cluster.MakeDeviceString(
                 job_name='/job:trainer',
                 task_id=0,
                 device_name='GPU',
                 device_id=3)
     self.assertEqual(expected_device, d)
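The expected GPU id follows from simple split arithmetic: with devices_per_split=2, split s owns GPUs [2*s, 2*s+1], so device 1 of split 1 is GPU 1*2 + 1 = 3. A sketch of that indexing, assuming this is what WorkerDeviceInModelSplit computes:

    def device_in_split(split_id, device_index, devices_per_split=2):
        # Hypothetical mirror of the indexing the test asserts.
        return split_id * devices_per_split + device_index

    assert device_in_split(split_id=1, device_index=1) == 3  # device_id=3 above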
Example No. 10
 def testPSRandomSize(self):
   p = cluster_factory.Cluster.Params()
   p.worker.name = '/job:trainer'
   p.ps.name = '/job:ps'
   p.ps.replicas = 10
   c = cluster_factory.Cluster(p)
   g = tf.Graph()
   vs = []
   np.random.seed(301)
   with g.as_default():
     with tf.device(c.GetPlacer()):
       # Creates 200 variables with different sizes.
       for i in range(200):
         if i % 13:
           size = np.random.randint(10000)
         elif i % 7:
           size = np.random.randint(100)
         else:
           size = np.random.randint(10)
          vs.append(tf.get_variable('x%d' % i, shape=(size,)))
       sum_all = tf.add_n([tf.reduce_sum(x) for x in vs])
   # Computes the total size of variables placed on each device.
   total_size = {}  # device name -> size
   for v in vs:
     size = tf.TensorShape(v.op.get_attr('shape')).num_elements()
     if v.device in total_size:
       total_size[v.device] += size
     else:
       total_size[v.device] = size
   for (device, allocated) in zip(
       sorted(total_size),
       [91701, 91361, 90346, 88738, 87240, 89265, 91944, 92472, 88051, 95053]):
     self.assertEqual(total_size[device], allocated)
   self.assertEqual(
       sum_all.device,
       cluster.MakeDeviceString(
           job_name='/job:trainer',
           replica_id=0,
           task_id=0,
           device_name='CPU',
           device_id=0))
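The hard-coded per-device totals all land near 90k elements, which is consistent with a placer that greedily assigns each new variable to the least-loaded parameter server. A minimal sketch of that strategy, assuming (it is not shown here) that GetPlacer behaves this way when several ps replicas are configured:

    import heapq

    def least_loaded_place(sizes, num_devices):
        # Greedily put each variable on the device with the smallest
        # running total; returns the final per-device totals.
        heap = [(0, d) for d in range(num_devices)]
        heapq.heapify(heap)
        for size in sizes:
            load, d = heapq.heappop(heap)
            heapq.heappush(heap, (load + size, d))
        return [load for load, _ in sorted(heap, key=lambda x: x[1])]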