def generate_data(args):
    dense_variables = generate_dense_variables(
        args.slot_num * args.nnz_per_slot * args.embedding_vec_size,
        [args.num_dense_units for _ in range(args.num_dense_layers)])
    vocabulary_tensors = generate_vocabulary_table(
        args.max_vocabulary_size_per_gpu, args.embedding_vec_size, hvd.size())
    samples, labels = utils.generate_random_samples(
        num_of_samples=args.global_batch_size,
        vocabulary_size=args.max_vocabulary_size_per_gpu * hvd.size(),
        slot_num=args.slot_num,
        max_nnz=args.nnz_per_slot,
        use_sparse_mask=False)
    samples, labels = tf.convert_to_tensor(samples), tf.convert_to_tensor(
        labels)

    for i in range(args.num_dense_layers):
        # dense_variables[0] means weight, dense_variables[1] means bias
        dense_variables[0][i] = hvd.broadcast(dense_variables[0][i],
                                              root_rank=0)
        dense_variables[1][i] = hvd.broadcast(dense_variables[1][i],
                                              root_rank=0)
    for i in range(hvd.size()):
        vocabulary_tensors[i] = hvd.broadcast(vocabulary_tensors[i],
                                              root_rank=0)
    samples = hvd.broadcast(samples, root_rank=0)
    labels = hvd.broadcast(labels, root_rank=0)

    return dense_variables, vocabulary_tensors, samples, labels
Example #2
    def test_horovod_broadcast_grad_gpu(self):
        """Test the correctness of the broadcast gradient on GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        if os.environ.get('HOROVOD_MIXED_INSTALL'):
            # Skip if compiled with CUDA but without HOROVOD_GPU_BROADCAST.
            return

        hvd.init()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            if _executing_eagerly():
                tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
            else:
                tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = tf.cast(tensor, dtype=dtype)
                    broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                with tf.device("/gpu:%d" % local_rank):
                    grad_out = tape.gradient(broadcasted_tensor, tensor)
            else:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                with tf.device("/gpu:%d" % local_rank):
                    grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #3
def _add_broadcast_ops():
    bcast_global_variables_ops = []
    for var in tf.global_variables():
        bcast_global_variables_ops.append(tf.assign(var, hvd.broadcast(var,
                                                                       0)))
    with tf.control_dependencies(bcast_global_variables_ops):
        tf.no_op(name='auto_parallel_bcast_global_vars')
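Example #4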
    def test_horovod_broadcast_grad(self):
        """Test the correctness of the broadcast gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            root_ranks = list(range(size))
            for dtype, dim, root_rank in itertools.product(
                    dtypes, dims, root_ranks):
                tensor = tf.ones([5] * dim) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                c = size if rank == root_rank else 0
                expected = np.ones([5] * dim) * c
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" % (grad_out, expected, str(err)))
Example #5
    def test_horovod_broadcast_grad(self):
        """Test the correctness of the broadcast gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors
            dtypes = [tf.float32, tf.float64]
            dims = [1, 2, 3]
            root_ranks = list(range(size))
            for dtype, dim, root_rank in itertools.product(
                    dtypes, dims, root_ranks):
                tensor = tf.ones([5] * dim) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                c = size if rank == root_rank else 0
                expected = np.ones([5] * dim) * c
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(
                    err, 0.00000001, "gradient %s differs from expected %s, "
                    "error: %s" % (grad_out, expected, str(err)))
Example #6
    def test_horovod_broadcast(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            dtypes = [
                tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
                tf.float16, tf.float32, tf.float64, tf.bool
            ]
            dims = [1, 2, 3]
            root_ranks = list(range(size))
            for dtype, dim, root_rank in itertools.product(
                    dtypes, dims, root_ranks):
                tensor = tf.ones([17] * dim) * rank
                root_tensor = tf.ones([17] * dim) * root_rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                    root_tensor = root_tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                root_tensor = tf.cast(root_tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                self.assertTrue(
                    session.run(
                        tf.reduce_all(
                            tf.equal(tf.cast(root_tensor, tf.int32),
                                     tf.cast(broadcasted_tensor, tf.int32)))),
                    "hvd.broadcast produces incorrect broadcasted tensor")
Example #7
    def test_horovod_broadcast_cpu(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors on CPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        dtypes = [
            tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
            tf.float16, tf.float32, tf.float64, tf.bool
        ]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            with tf.device("/cpu:0"):
                tensor = tf.ones([17] * dim) * rank
                root_tensor = tf.ones([17] * dim) * root_rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                    root_tensor = root_tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                root_tensor = tf.cast(root_tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            self.assertTrue(
                self.evaluate(
                    tf.reduce_all(
                        tf.equal(tf.cast(root_tensor, tf.int32),
                                 tf.cast(broadcasted_tensor, tf.int32)))),
                "hvd.broadcast produces incorrect broadcasted tensor")
Example #8
    def test_horovod_broadcast(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            dims = [1, 2, 3]
            root_ranks = list(range(size))
            for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
                try:
                    tensor = tf.ones([17] * dim) * rank
                    root_tensor = tf.ones([17] * dim) * root_rank
                    if dtype == tf.bool:
                        tensor = tensor % 2
                        root_tensor = root_tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
                    root_tensor = tf.cast(root_tensor, dtype=dtype)
                    broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                    self.assertTrue(
                        session.run(tf.reduce_all(tf.equal(
                            tf.cast(root_tensor, tf.int32), tf.cast(broadcasted_tensor, tf.int32)))),
                        "hvd.broadcast produces incorrect broadcasted tensor")
                except Exception:
                    import traceback
                    traceback.print_exc()
Example #9
def _add_broadcast_ops(target, worker_id):
    bcast_global_variables_ops = []
    with tf.device('/job:worker/task:%d' % worker_id):
        for var in target:
            bcast_global_variables_ops.append(
                tf.assign(var, hvd.broadcast(var, 0)))
        with tf.control_dependencies(bcast_global_variables_ops):
            tf.no_op(name='auto_parallel_bcast_global_vars')
Example #10
    def test_horovod_broadcast_grad_cpu(self):
        """Test the correctness of the broadcast gradient on CPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            if _executing_eagerly():
                tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
            else:
                tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = tf.cast(tensor, dtype=dtype)
                    broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                with tf.device("/cpu:0"):
                    grad_out = tape.gradient(broadcasted_tensor, tensor)
            else:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones([5] * dim)
                with tf.device("/cpu:0"):
                    grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = self.evaluate(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Example #11
def batch_shuffle(tensor):  # nx...
    total, rank = hvd.size(), hvd.rank()
    batch_size = tf.shape(tensor)[0]
    with tf.device('/cpu:0'):
        all_idx = tf.range(total * batch_size)
        shuffle_idx = tf.random.shuffle(all_idx)
        shuffle_idx = hvd.broadcast(shuffle_idx, 0)
        my_idxs = tf.slice(shuffle_idx, [rank * batch_size], [batch_size])

    all_tensor = allgather(tensor, 'batch_shuffle_key')  # gn x ...
    return tf.gather(all_tensor, my_idxs), shuffle_idx
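Example #12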
    def send_receive(self, tensors, ctx):
        decompressed_tensors = []
        for ranki in range(self.world_size):
            ranki_tensors = [
                broadcast(tensor, root_rank=ranki) for tensor in tensors
            ]
            ranki_decompressed = self.compressor.decompress(ranki_tensors, ctx)
            decompressed_tensors.append(ranki_decompressed)

        aggregated_tensor = self.compressor.aggregate(decompressed_tensors)
        return aggregated_tensor
Example #13
def broadcast(value, root_rank, name=None):
    """
    Perform a broadcast on a tensor-compatible value.

    Arguments:
        value: A tensor-compatible value to broadcast.
               The shape of the input must be identical across all ranks.
        root_rank: Rank of the process from which global variables will be
                   broadcasted to all other processes.
        name: Optional name for the constants created by this operation.
    """
    bcast_op = hvd.broadcast(tf.constant(value, name=name), root_rank)
    return K.get_session().run(bcast_op)
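# Minimal usage sketch for the wrapper above (illustration only; assumes
# hvd.init() has been called, numpy is imported as np, and `K` is the Keras
# backend, as the snippet implies). The value differs per rank before the
# call and equals rank 0's value on every rank afterwards.
seed = np.random.randint(0, 2**31 - 1)
seed = broadcast(seed, root_rank=0, name='shared_seed')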
Example #14
def broadcast(value, root_rank, name=None):
    """
    Perform a broadcast on a tensor-compatible value.

    Arguments:
        value: A tensor-compatible value to broadcast.
               The shape of the input must be identical across all ranks.
        root_rank: Rank of the process from which global variables will be
                   broadcasted to all other processes.
        name: Optional name for the constants created by this operation.
    """
    bcast_op = hvd.broadcast(tf.constant(value, name=name), root_rank)
    return K.get_session().run(bcast_op)
Example #15
    def broadcasting_dataloader_wrapper(self):
        if hvd.rank() == 0:
            (numerical_features,
             categorical_features), labels = self.pipe.get_next()

            # Bitcasting to float32 before the broadcast and back to int32 right afterwards
            # is necessary; otherwise TensorFlow performs a spurious D2H and H2D transfer on
            # this tensor. Without this call, the columnwise-split mode gets about 2x slower.
            categorical_features = tf.bitcast(categorical_features,
                                              type=tf.float32)
        else:
            # using random uniform instead of e.g., tf.zeros is necessary here
            # tf.zeros would be placed on CPU causing a device clash in the broadcast
            numerical_features = tf.random.uniform(
                shape=[self.dlrm.batch_size, self.dlrm.num_numerical_features],
                dtype=tf.float16)
            categorical_features = tf.random.uniform(
                maxval=1,
                dtype=tf.float32,
                shape=[self.dlrm.batch_size,
                       len(self.dlrm.table_sizes)])
            labels = tf.random.uniform(maxval=1,
                                       shape=[self.dlrm.batch_size],
                                       dtype=tf.int32)
            labels = tf.cast(labels, dtype=tf.int8)

        numerical_features = hvd.broadcast(numerical_features,
                                           root_rank=0,
                                           name='numerical_broadcast')

        categorical_features = hvd.broadcast(categorical_features,
                                             root_rank=0,
                                             name='cat_broadcast')

        labels = hvd.broadcast(labels, root_rank=0, name='labels_broadcast')

        categorical_features = tf.bitcast(categorical_features, type=tf.int32)
        return (numerical_features, categorical_features), labels
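# Minimal sketch of the bitcast trick used above (illustration only):
# tf.bitcast reinterprets the underlying bytes instead of converting values,
# so int32 -> float32 -> int32 is lossless, unlike tf.cast.
x = tf.constant([7, 11, 13], dtype=tf.int32)
y = tf.bitcast(tf.bitcast(x, type=tf.float32), type=tf.int32)  # y holds the same values as x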
Example #16
    def test_horovod_broadcast_rank_error(self):
        """Test that the broadcast returns an error if different ranks
        specify different root rank."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        tensor = tf.ones([17] * 3, dtype=tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            self.evaluate(hvd.broadcast(tensor, rank))
Example #17
    def test_horovod_broadcast_rank_error(self):
        """Test that the broadcast returns an error if different ranks
        specify different root rank."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor = tf.ones([17] * 3, dtype=tf.float32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, rank))
Example #18
    def test_horovod_broadcast_type_error(self):
        """Test that the broadcast returns an error if the types being broadcasted
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            self.evaluate(hvd.broadcast(tensor, 0))
Example #19
    def test_horovod_broadcast_error(self):
        """Test that the broadcast returns an error if any dimension besides
        the first is different among the tensors being broadcasted."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            self.evaluate(hvd.broadcast(tensor, 0))
Example #20
    def test_horovod_broadcast_type_error(self):
        """Test that the broadcast returns an error if the types being broadcasted
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor_size = [17] * 3
            dtype = tf.int32 if rank % 2 == 0 else tf.float32
            tensor = tf.ones(tensor_size, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, 0))
Example #21
    def test_horovod_broadcast_error(self):
        """Test that the broadcast returns an error if any dimension besides
        the first is different among the tensors being broadcasted."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            tensor_size = [17] * 3
            tensor_size[1] = 10 * (rank + 1)
            tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, 0))
Example #22
    def test_horovod_broadcast_gpu(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors on GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            self.skipTest(("No GPUs available"))

        if os.environ.get('HOROVOD_MIXED_INSTALL'):
            # Skip if compiled with CUDA but without HOROVOD_GPU_ALLREDUCE.
            self.skipTest("Not compiled with HOROVOD_GPU_ALLREDUCE")

        hvd.init()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        dtypes = [
            tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
            tf.float16, tf.float32, tf.float64, tf.bool
        ]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            with tf.device("/gpu:%d" % local_rank):
                tensor = tf.ones([17] * dim) * rank
                root_tensor = tf.ones([17] * dim) * root_rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                    root_tensor = root_tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                root_tensor = tf.cast(root_tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            self.assertTrue(
                self.evaluate(
                    tf.reduce_all(
                        tf.equal(tf.cast(root_tensor, tf.int32),
                                 tf.cast(broadcasted_tensor, tf.int32)))),
                "hvd.broadcast produces incorrect broadcasted tensor")
Example #23
# Parameters:
# eps -- time resolution
# damping -- wave damping
eps = tf.placeholder(tf.float32, shape=())
damping = tf.placeholder(tf.float32, shape=())

# Create variables for simulation state
U = tf.Variable(u_init)
Ut = tf.Variable(ut_init)

#The complete (N/2 + 3) x N matrices on which we'll perform calculations
U_full = tf.Variable(np.zeros([(N / 2) + 3, N], dtype=np.float32))
Ut_full = tf.Variable(np.zeros([(N / 2) + 3, N], dtype=np.float32))

rank_bcast = tf.group(
    tf.assign(U_full[N / 2:], hvd.broadcast(
        U[:3], 1)),  #Sending first 3 rows of rank 1 to rank 0 for U
    tf.assign(Ut_full[N / 2:], hvd.broadcast(
        Ut[:3], 1)),  #Sending first 3 rows of rank 1 to rank 0 for Ut
    tf.assign(U_full[:3], hvd.broadcast(
        U[-3:], 0)),  #Sending last 3 rows of rank 0 to rank 1 for U
    tf.assign(Ut_full[:3],
              hvd.broadcast(Ut[-3:],
                            0)))  #Sending last 3 rows of rank 0 to rank 1 for Ut

#Copy the rest of U and Ut for rank 0
U_full_rank0_group = tf.group(U_full[:N / 2].assign(U),
                              Ut_full[:N / 2].assign(Ut))

#Copy the rest of U and Ut for rank 1
U_full_rank1_group = tf.group(U_full[3:].assign(U), Ut_full[3:].assign(Ut))
Example #24
def broadcast(tensor, name=None):
    return hvd.broadcast(tensor, root_rank=0)
Example #25
# Print initial state
print("Rank " + str(hvd.rank()) + " send initial: " + str(send_buf))
if hvd.rank() == 0:
    print("Rank " + str(hvd.rank()) + " recv initial: " + str(recv0_buf))
else:
    print("Rank " + str(hvd.rank()) + " recv initial: " + str(recv1_buf))

# Create tensorflow variables
Send_Buffer = tf.Variable(send_buf, name='Send_Buffer')
Recv0_Buffer = tf.Variable(recv0_buf, name='Recv0_Buffer')
Recv1_Buffer = tf.Variable(recv1_buf, name='Recv1_Buffer')

#communicate
bcast = tf.group(
    tf.assign(Recv1_Buffer,
              hvd.broadcast(Send_Buffer,
                            0)),  #Rank 0's send_buffer to Rank 1's recv
    tf.assign(Recv0_Buffer,
              hvd.broadcast(Send_Buffer,
                            1)))  #Rank 1's send_buffer to Rank 0's recv

# Initialize state to initial conditions
tf.global_variables_initializer().run()

bcast.run()

# Print final state
if hvd.rank() == 0:
    print("Rank " + str(hvd.rank()) + " recv final: " + str(Recv0_Buffer.eval()))
else:
    print("Rank " + str(hvd.rank()) + " recv final: " + str(Recv1_Buffer.eval()))
Example #26
  U.assign(tf.concat(values=[tf.slice(U, [0,0],[N,N]), tf_recv_buf_0], axis=0)),
  U.assign(U_),
  Ut.assign(Ut_),
  tf_send_buf.assign(send_buf))
  

# Update the state for Rank 1
r1_step = tf.group(
  U.assign(tf.concat(values=[tf_recv_buf_1, tf.slice(U, [2,0], [N,N])], axis=0)),
  U.assign(U_),
  Ut.assign(Ut_),
  tf_send_buf.assign(send_buf))

# Broadcast the two rows
broadcast = tf.group(
  tf.assign(tf_recv_buf_1, hvd.broadcast(tf_send_buf, 0)),
  tf.assign(tf_recv_buf_0, hvd.broadcast(tf_send_buf, 1)))

# Initialize state to initial conditions
tf.global_variables_initializer().run()

# Run num_iter steps of PDE
start = time.time()
for i in range(num_iter):
  broadcast.run()
  # Step simulation
  if hvd.rank() == 0:
    r0_step.run({eps: 0.06, damping: 0.03})
  else:
    r1_step.run({eps: 0.06, damping: 0.03})
Example #27
  U.assign(U_),
  Ut.assign(Ut_))



#create send and receive buffers
send_buf  = np.zeros([2, N], dtype=np.float32)
recv0_buf  = np.zeros([2, N], dtype=np.float32)
recv1_buf  = np.zeros([2, N], dtype=np.float32)

Send_Buffer  = tf.Variable(send_buf,  name='Send_Buffer')
Recv0_Buffer  = tf.Variable(recv0_buf,  name='Recv0_Buffer')
Recv1_Buffer  = tf.Variable(recv1_buf,  name='Recv1_Buffer')

bcast = tf.group(
  tf.assign(Recv1_Buffer, hvd.broadcast(Send_Buffer, 0)),
  tf.assign(Recv0_Buffer, hvd.broadcast(Send_Buffer, 1)))


fill_row = None
if hvd.rank() == 0:
  #fill the bottom 2 rows' values into send_buffer
  fill_row = tf.scatter_update(Send_Buffer, [0,1], Ut[N-2:N, :])
else:
  #fill the top 2 rows' values into send_buffer
  fill_row = tf.scatter_update(Send_Buffer, [0,1], Ut[2:4, :])

update_row = None
if hvd.rank() == 0:
  #copy the 2 received rows from Recv0_Buffer into Ut
  update_row = tf.scatter_update(Ut, [N,N+1], Recv0_Buffer[:, :])
Example #28
    airlineData['IsDelayed'],
    test_size=0.25,
    random_state=42)

xTrain, xTest, yTrain, yTest = np.array(X_train), np.array(X_test), np.array(
    y_train), np.array(y_test)

indices = splitData(xTrain, yTrain, numTrees)
currSess = tf.InteractiveSession()

indexBC = tf.get_variable(initializer=tf.constant(indices),
                          dtype=tf.int32,
                          name="IndexBC")

### broadcast the index assignment from rank 0
indicesBroadCast = hvd.broadcast(indexBC, 0)


def tree_fit_predict(xTrain, yTrain, xTest, index):
    model = decisionTree.DecisionTree(maxDepth=maxDepth, verbose=True)
    #   new = tf.gather(xTrain,index)
    #   print(new)
    # #   index = np.array(index)
    # #   index = index.astype(int)
    # #   print(index.dtype)
    result = model.fit(xTrain[index], yTrain[index]).predict(xTest)
    return tf.convert_to_tensor(result)


xTrainTensor = tf.placeholder(tf.float32)
yTrainTensor = tf.placeholder(tf.float32)
Example #29
def worker(rank,
           size,
           input_file_specs,
           batch_size=256,
           warmup_sec=10.0,
           run_sec=60 * 60 * 4,
           num_threads=0,
           sync=False,
           warn_latency_sec=4.0,
           report_period_sec=2.0,
           round_robin_files=True,
           throttle_sleep_sec=0.01,
           throttle_total_rate_bytes_per_sec=0):

    if rank == 0:
        print('storage_benchmark_tensorflow: BEGIN')
        print(datetime.datetime.utcnow())

    metrics_file_name = '/imagenet-scratch/logs/storage_benchmark_tensorflow_metrics-%d.log' % rank
    with open(metrics_file_name, 'a') as metrics_file:

        hostname = socket.gethostname()

        # Set random seed to have deterministic behavior.
        tf.set_random_seed(rank + 1)

        # Round robin the input file spec. This allows multiple mount points to be used.
        input_file_spec = input_file_specs[hvd.local_rank() %
                                           len(input_file_specs)]
        print('rank=%3d: %s: input_file_spec=%s' %
              (rank, hostname, input_file_spec))

        if round_robin_files:
            # Distribute sets of file names evenly over all processes and without overlap.
            all_input_filenames = sorted(glob.glob(input_file_spec))
            num_files = len(all_input_filenames)
            i = rank
            input_filenames = []
            while i < num_files:
                input_filenames.append(all_input_filenames[i])
                i += size
            print(
                'rank=%3d: Found %d total files. %d files assigned to this process.'
                % (rank, len(all_input_filenames), len(input_filenames)))
            if len(input_filenames) == 0:
                raise ValueError('Not enough matching files.')
            input_file_spec = None
        else:
            # This will use tf.data.TFRecordDataset.list_files to randomly distribute files.
            input_filenames = None

        #
        # Build execution graph.
        #

        ds_iterator = create_iterator(batch_size,
                                      num_threads,
                                      input_file_spec=input_file_spec,
                                      input_filenames=input_filenames)

        # num_bytes_tensor is an int64 tensor of shape (batch_size).
        num_bytes_tensor = ds_iterator.get_next()

        # When num_bytes_for_step_tensor is evaluated, it reads the TFRecord files.
        num_bytes_for_step_tensor = tf.reduce_sum(num_bytes_tensor)

        # The following operations are used to synchronize the processes when running in sync mode.
        if sync:
            stop_flag_placeholder = tf.placeholder(tf.bool, shape=())
            stop_flag_broadcast_tensor = hvd.broadcast(stop_flag_placeholder,
                                                       0,
                                                       'stop_flag_broadcast')
            num_bytes_for_step_placeholder = tf.placeholder(tf.int64, shape=())
            total_bytes_for_step_tensor = hvd.allreduce(
                num_bytes_for_step_placeholder, average=False)

        #
        # Start the TensorFlow session and execute the graph.
        #

        config = tf.ConfigProto()
        config.device_count['GPU'] = 0
        config.intra_op_parallelism_threads = 1
        config.inter_op_parallelism_threads = 1
        print('rank=%3d: Creating session' % rank)
        with tf.Session(config=config) as session:
            print('rank=%3d: Session created' % rank)
            session.run(
                [tf.initializers.global_variables(),
                 tf.tables_initializer()])
            print('rank=%3d: Initialized variables' % rank)

            # Run first step. This can take 30 seconds for 100,000 files.
            print('rank=%3d: Running first step' % rank)
            _ = session.run(num_bytes_for_step_tensor)
            print('rank=%3d: First step complete' % rank)

            # Wait for barrier so we know when all processes have finished the first step.
            print('rank=%3d: Waiting for barrier' % rank)
            session.run(hvd.allreduce(tf.constant(0)))
            if rank == 0:
                print('rank=%3d: Completed waiting for barrier' % rank)

            # To ensure that all processes finish warmup and stop at exactly the same time,
            # the rank 0 node broadcasts its time to all other ranks.
            # This also serves as a synchronization barrier.
            local_t0 = time.time()
            t0_tensor = tf.constant(local_t0, tf.float64)
            t0_tensor = hvd.broadcast(t0_tensor, 0, 't0')
            t0 = session.run(t0_tensor)

            start_time = t0 + warmup_sec
            stop_time = start_time + run_sec
            step = 0
            warmed_up = False
            num_records = 0
            num_bytes = 0
            total_bytes = 0
            next_report_time = time.time() + report_period_sec

            if throttle_total_rate_bytes_per_sec:
                throttle_rate_bytes_per_sec = throttle_total_rate_bytes_per_sec / size
                burst_sec = 1.0
                throttle = TokenBucket(tokens=throttle_rate_bytes_per_sec *
                                       burst_sec,
                                       fill_rate=throttle_rate_bytes_per_sec)
            else:
                throttle = None

            while True:
                # Reset all counters when warmup completes.
                t = time.time()
                if not warmed_up and t >= start_time:
                    print('rank=%3d: warmup complete at step %d' %
                          (rank, step))
                    warmed_up = True
                    t0 = start_time
                    step = 0
                    num_records = 0
                    num_bytes = 0
                    total_bytes = 0

                # Run a single step of batch_size records per process.
                run_options = tf.RunOptions()
                # run_options.timeout_in_ms = 10000
                num_bytes_for_step = np.int64(0)
                try:
                    num_bytes_for_step = session.run(num_bytes_for_step_tensor,
                                                     options=run_options)
                except Exception as e:
                    print('rank=%3d: %s: ERROR: %s' % (rank, hostname, e))

                step_dt = time.time() - t

                if (warmed_up or step >= 1) and step_dt > warn_latency_sec:
                    print('rank=%3d: %s: WARNING: step %d took %0.3f seconds' %
                          (rank, hostname, step, step_dt))
                    next_report_time = 0.0

                # Calculate local stop flag. In sync mode, this is broadcast from rank 0.
                stop_flag = time.time() >= stop_time

                # Use Horovod to aggregate the byte counter across all processes.
                # This also acts as a synchronization barrier, much like gradient descent when
                # it shares gradients.
                # Also coordinate the stop flag so all processes stop at the same step.
                sync_dt = 0.0
                if sync:
                    t = time.time()
                    total_bytes_for_step, stop_flag = session.run(
                        [
                            total_bytes_for_step_tensor,
                            stop_flag_broadcast_tensor
                        ],
                        feed_dict={
                            num_bytes_for_step_placeholder: num_bytes_for_step,
                            stop_flag_placeholder: stop_flag,
                        },
                    )

                    total_bytes += total_bytes_for_step

                    sync_dt = time.time() - t
                    if warmed_up and sync_dt > 30.0:
                        print(
                            'rank=%3d: %s: WARNING: sync after step %d took %0.3f seconds'
                            % (rank, hostname, step, sync_dt))
                        next_report_time = 0.0

                num_records += batch_size
                num_bytes += num_bytes_for_step
                t = time.time()

                metrics = {
                    '@timestamp': datetime.datetime.utcnow().isoformat() + 'Z',
                    'batch_size': batch_size,
                    'rank': rank,
                    'hostname': hostname,
                    'step': step,
                    'num_bytes': int(num_bytes_for_step),
                    'latency_sec': step_dt,
                    'sync_latency_sec': sync_dt,
                }
                json.dump(metrics, metrics_file)
                metrics_file.write("\n")
                metrics_file.flush()

                if t >= next_report_time:
                    dt = t - t0
                    if not sync:
                        records_per_sec = num_records / dt
                        bytes_per_sec = num_bytes / dt
                        MB_per_sec = bytes_per_sec / 1e6
                        print(
                            'rank=%3d: warmed_up=%d, step=%6d, records/sec=%8.0f, MB/sec=%11.3f, records=%10d, bytes=%15d, dt=%9.3f'
                            % (rank, warmed_up, step, records_per_sec,
                               MB_per_sec, num_records, num_bytes, dt))
                    if sync:
                        if rank == 0:
                            total_records = num_records * size
                            records_per_sec = total_records / dt
                            bytes_per_sec = total_bytes / dt
                            MB_per_sec = bytes_per_sec / 1e6
                            print(
                                'TOTAL:    warmed up=%d, step=%6d, records/sec=%8.0f, MB/sec=%11.3f, records=%10d, bytes=%15d, dt=%9.3f'
                                % (warmed_up, step, records_per_sec,
                                   MB_per_sec, total_records, total_bytes, dt))
                    next_report_time = t + report_period_sec

                # Throttle byte rate.
                if throttle:
                    while not throttle.consume(num_bytes_for_step):
                        # print('sleeping')
                        time.sleep(throttle_sleep_sec)

                if stop_flag:
                    print('rank=%3d: %s: complete at step %d' %
                          (rank, hostname, step))
                    break

                step += 1

            # Use Horovod to aggregate the final counters across all processes.
            num_steps_tensor = tf.constant(step)
            num_bytes_tensor = tf.constant(num_bytes)
            total_steps_tensor = hvd.allreduce(num_steps_tensor, average=False)
            total_bytes_tensor = hvd.allreduce(num_bytes_tensor, average=False)
            total_steps, total_bytes = session.run(
                [total_steps_tensor, total_bytes_tensor])
            if rank == 0:
                dt = stop_time - start_time
                num_records = total_steps * batch_size
                records_per_sec = num_records / dt
                total_GB = total_bytes / 1e9
                bytes_per_sec = total_bytes / dt
                MB_per_sec = bytes_per_sec / 1e6
                print('FINAL: number of processes: %12d' % size)
                print('FINAL: batch size:          %12d' % batch_size)
                print('FINAL: sync:                %12s' % sync)
                print('FINAL: round robin files:   %12s' % round_robin_files)
                print('FINAL: number of records:   %12d' % num_records)
                print('FINAL: GB:                  %12.3f' % total_GB)
                print('FINAL: elapsed sec:         %12.3f' % dt)
                print('FINAL: records/sec:         %12.0f' % records_per_sec)
                print('FINAL: MB/sec:              %12.3f' % MB_per_sec)

        if rank == 0:
            print('storage_benchmark_tensorflow: END')
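Example #30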
rank = hvd.rank()
print('rank', rank)
points_per_device = int(num_points / hvd.size())

np_points = np_points[rank * points_per_device:rank * points_per_device +
                      points_per_device - 1]

#points = tf.constant(np_points, dtype=tf.float32)
points = tf.placeholder(dtype=tf.float32, name='global_sum_place_hold')

centroids = tf.get_variable(name='centroids',
                            shape=[num_cluster, dim],
                            initializer=tf.initializers.random_uniform(
                                minval=0, maxval=10.0, seed=123))

bcast_result = hvd.broadcast(centroids, 0)

init_centroids_sync = tf.assign(centroids, bcast_result)

expanded_points = tf.expand_dims(points, 0)
expanded_centroids = tf.expand_dims(centroids, 1)

distances = tf.reduce_sum(tf.square(tf.subtract(points, expanded_centroids)),
                          2)
assignments = tf.argmin(distances, 0)

loss_op = tf.reduce_sum(tf.reduce_min(distances, 0))
tf_sum = tf.unsorted_segment_sum(points, assignments, num_cluster)
tf_count = tf.unsorted_segment_sum(tf.ones_like(points), assignments,
                                   num_cluster)
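Example #31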
Ut_Send = tf.Variable(ut_init, name='Ut_Send')

# Create tensor flow variable to receive into
Ur0 = tf.Variable(np.zeros([3, N], dtype=np.float32))
Utr0 = tf.Variable(np.zeros([3, N], dtype=np.float32))
Ur1 = tf.Variable(np.zeros([3, N], dtype=np.float32))
Utr1 = tf.Variable(np.zeros([3, N], dtype=np.float32))

#Used for calculations
U_main = tf.Variable(np.zeros([N + 3, N], dtype=np.float32))
Ut_main = tf.Variable(np.zeros([N + 3, N], dtype=np.float32))

#Communicate 3 rows
rank_bcast = tf.group(
    tf.assign(Ur0,
              hvd.broadcast(U[:3],
                            1)),  #Rank 1's send_buffer to Rank 0's recv for U
    tf.assign(Utr0,
              hvd.broadcast(Ut[:3],
                            1)),  #Rank 1's send_buffer to Rank 0's recv for Ut
    tf.assign(Ur1,
              hvd.broadcast(U[-3:],
                            0)),  #Rank 0's send_buffer to Rank 1's recv for U
    tf.assign(Utr1,
              hvd.broadcast(Ut[-3:],
                            0)))  #Rank 0's send_buffer to Rank 1's recv for Ut

rank0_join = tf.group(U_main.assign(tf.concat([U, Ur0], 0)),
                      Ut_main.assign(tf.concat([Ut, Utr0], 0)))

rank1_join = tf.group(U_main.assign((tf.concat([Ur1, U], 0))),
                      Ut_main.assign((tf.concat([Utr1, Ut], 0))))
Example #32
    def get_image_labels(self):
        if self.is_all_shared:
            ### ALL SHARED ###
            img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, 
                                                                 is_training=True)
            with tf.device("/cpu:0"):
                with tf.name_scope("reading"):
                    data_provider = slim.dataset_data_provider.DatasetDataProvider(
                        self.dataset, num_readers=self.FLAGS.num_data_readers,
                        common_queue_capacity=20*self.FLAGS.batch_size,
                        common_queue_min=10*self.FLAGS.batch_size,
                        seed=self.rank)
                    [image, label] = data_provider.get(['image', 'label'])
                with tf.name_scope("to-preprocessing"):
                    capacity = 20 * self.FLAGS.batch_size
                    to_pre_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                           dtypes=[image.dtype, label.dtype],
                                                           shapes=None,
                                                           name="to_pre_queue")
                    to_pre_op = to_pre_queue.enqueue([image, label])
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_pre_queue, [to_pre_op] * Pipeline.QR_THREADS))
                    tf.summary.scalar("to_pre_fraction_of_%d_full" % capacity,
                                    math_ops.to_float(to_pre_queue.size()) * (1. / capacity))
                    image, label = to_pre_queue.dequeue()
                with tf.name_scope("preprocessing"):#TODO
                    image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)
                with tf.name_scope("to-allgather"):
                    capacity = 20 * self.FLAGS.batch_size
                    to_allg_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                            dtypes=[image.dtype, label.dtype],
                                                            shapes=[[self.train_image_size, self.train_image_size, 3], []],
                                                            name="to_allgather_queue")#[image.get_shape(), label.get_shape()])
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_allg_queue, [to_allg_queue.enqueue([image, label])] * Pipeline.QR_THREADS))
                    tf.summary.scalar("to_allgather_fraction_of_%d_full" % capacity,
                                   math_ops.to_float(to_allg_queue.size()) * (1. / capacity))

                # num_preprocessors = tf.placeholder(tf.int32, shape=[], name="num_preprocessors)
                # self.num_hvd_send_tensor = 
                send_images, send_labels = to_allg_queue.dequeue_many(self.num_hvd_send)
                # if rank == #TODO
                all_images = hvd.allgather(send_images, name="hvd_allgather")
                all_labels = hvd.allgather(send_labels, name="hvd_allgather")
                #TODO: Remove extra queues
                with tf.name_scope("to-compute"):
                    capacity = 30 * self.FLAGS.batch_size
                    to_compute_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                               dtypes=[image.dtype, label.dtype],
                                                               shapes=[[self.train_image_size, self.train_image_size, 3], []],#TODO
                                                               name="to_compute_queue")#[image.get_shape(), label.get_shape()])
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_queue, [to_compute_queue.enqueue_many([all_images, all_labels])]))#1 thread!
                    tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity,
                                   math_ops.to_float(to_compute_queue.size()) * (1. / capacity))
                image, label = to_compute_queue.dequeue()
        elif self.is_single_bcast:
            ### SINGLE BROADCAST ###
            img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, 
                                                                 is_training=True)
            allg_images_name = "allgather-images-op"
            allg_labels_name = "allgather-labels-op"
            bcast_images_name = "bcast-images-op"
            bcast_labels_name = "bcast-labels-op"
            if 0 in self.member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline
                with tf.device("/cpu:0"):
                    with tf.name_scope("reading"):
                        data_provider = slim.dataset_data_provider.DatasetDataProvider(
                            self.dataset, num_readers=self.FLAGS.num_data_readers,
                            common_queue_capacity=20*self.FLAGS.batch_size,
                            common_queue_min=10*self.FLAGS.batch_size,
                            seed=self.rank)
                        [image, label] = data_provider.get(['image', 'label'])
                    image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False)

                    with tf.name_scope("preprocessing"):
                        image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)

                    send_images, send_labels = create_qr("to-allg", 10 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], Pipeline.QR_THREADS, False, True, self.num_hvd_send)
                all_images = hvd.allgather(send_images, group=0, name=allg_images_name)
                all_labels = hvd.allgather(send_labels, group=0, name=allg_labels_name)
                all_images, all_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, True, self.images_per_bcast)
            if 1 in self.member_of_group:
                # For the middle man rank, reset all_images and all_labels
                # names to their broadcasted tensors so that the bcast is
                # performed. Note that the bcast root is rank 0 since the
                # group1 sent to init had this rank listed first, meaning that
                # the resulting MPI group comm has this rank as rank 0
                if len(self.member_of_group) == 1:
                    # Then not middle man, so construct holder variable WITH CORRECT NAME!
                    # tf.Variable(self.num_hvd_send?
                    all_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype)
                    all_labels = tf.zeros([self.images_per_bcast]                                       , dtype=post_pre_label_dtype) #shape of [] turns into 1D instead of 0D
                all_images = hvd.broadcast(all_images, 0, group=1, name=bcast_images_name)
                all_labels = hvd.broadcast(all_labels, 0, group=1, name=bcast_labels_name)
            image, label = create_qr("to-compute", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, False)
        elif self.is_multi_bcast:
            ### MULTIPLE BROADCAST
            # print("Rank:", rank, member_of_group, group_rank_list)
            img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, 
                                                                 is_training=True)
            # allg_image_name = "allgathered-image" # need some naming commonalities
            # allg_label_name = "allgathered-label"
            allg_images_name = "allgather-images-op"
            allg_labels_name = "allgather-labels-op"
            bcast_images_name = "bcast-images-op"
            bcast_labels_name = "bcast-labels-op"
            # if 0 in member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline
            if self.rank < self.FLAGS.num_pre:
                with tf.device("/cpu:0"):
                    with tf.name_scope("reading"):
                        data_provider = slim.dataset_data_provider.DatasetDataProvider(
                            self.dataset, num_readers=self.FLAGS.num_data_readers,
                            common_queue_capacity=20*self.FLAGS.batch_size,
                            common_queue_min=10*self.FLAGS.batch_size,
                            seed=self.rank)
                        [image, label] = data_provider.get(['image', 'label'])

                    image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False)

                    with tf.name_scope("preprocessing"):
                        image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)
                        # image = tf.Print(image, ["using preprocessed image"])
                    send_images, send_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], 2 * Pipeline.QR_THREADS, False, True, self.images_per_bcast)
            else:
                send_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype)
                send_labels = tf.zeros([self.images_per_bcast]                                                 , dtype=post_pre_label_dtype)
            with tf.device("/cpu:0"):
                bcast_images_root = "broadcast-images-"
                bcast_labels_root = "broadcast-labels-"
                bcast_images_per_group = [hvd.broadcast(send_images, i, group=i, name=bcast_images_root + str(i)) for i in range(self.FLAGS.num_pre)]
                bcast_labels_per_group = [hvd.broadcast(send_labels, i, group=i, name=bcast_labels_root + str(i)) for i in range(self.FLAGS.num_pre)]
                
                with tf.name_scope("to-compute"):
                    capacity = 30 * self.FLAGS.batch_size
                    to_compute_q = data_flow_ops.FIFOQueue(capacity=capacity,
                                                    dtypes=[post_pre_image_dtype, post_pre_label_dtype],
                                                    shapes=[[self.train_image_size, self.train_image_size, 3], []], 
                                                    name="to-compute-queue")
                    to_comp_ops = [to_compute_q.enqueue_many([bcast_images_per_group[i], bcast_labels_per_group[i]]) for i in range(self.FLAGS.num_pre)]
                    queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_q, to_comp_ops))
                    tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity,
                                      math_ops.to_float(to_compute_q.size()) * (1. / capacity))
                    image, label = to_compute_q.dequeue()
        return image, label
Example #33
    if (hvd.rank() == 0):
        u_init[a, b] = np.random.uniform()
    else:
        u_init[a + 3, b] = np.random.uniform()
# Parameters:
# eps -- time resolution
# damping -- wave damping
eps = tf.placeholder(tf.float32, shape=())
damping = tf.placeholder(tf.float32, shape=())

# Create variables for simulation state
U = tf.Variable(u_init)
Ut = tf.Variable(ut_init)

#communicate rows for calculations
bcast = tf.group(tf.assign(U[0:3], hvd.broadcast(U[N - 3:N], 0)),
                 tf.assign(Ut[0:3], hvd.broadcast(Ut[N - 3:N], 0)),
                 tf.assign(U[N - 3:N], hvd.broadcast(U[0:3], 1)),
                 tf.assign(Ut[N - 3:N], hvd.broadcast(Ut[0:3], 1)))

# Discretized PDE update rules
U_ = U + eps * Ut
Ut_ = Ut + eps * (laplace(U) - damping * Ut)

# Operation to update the state
step = tf.group(U.assign(U_), Ut.assign(Ut_))

#sliced output n*n
U_slice0 = tf.group(U[0:N].assign(tf.slice(U, [0, 0], [N, N])))
Ut_slice0 = tf.group(Ut[0:N].assign(tf.slice(Ut, [0, 0], [N, N])))
Example #34
def broadcast(backend, value, root_rank, name):
    bcast_op = hvd.broadcast(tf.constant(value, name=name), root_rank)
    return backend.get_session().run(bcast_op)
Example #35
def broadcast(backend, value, root_rank, name):
    return _eval(backend,
                 hvd.broadcast(tf.constant(value, name=name), root_rank))
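Example #36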
    def __call__(self, *args, **kwargs):
        weights = self.wrapped(*args, **kwargs)
        weights = hvd.broadcast(weights,
                                root_rank=0,
                                name='BroadcastingInitializer')
        return weights
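# Hedged, self-contained sketch of the wrapper class the __call__ above
# appears to belong to. The class name `BroadcastingInitializer` and the
# `wrapped` attribute are assumptions for illustration; the pattern is to
# evaluate the wrapped initializer on every rank, then replace the result
# with rank 0's tensor so all workers start from identical weights.
import tensorflow as tf
import horovod.tensorflow as hvd


class BroadcastingInitializer(tf.keras.initializers.Initializer):
    def __init__(self, wrapped):
        self.wrapped = wrapped

    def __call__(self, *args, **kwargs):
        weights = self.wrapped(*args, **kwargs)
        weights = hvd.broadcast(weights,
                                root_rank=0,
                                name='BroadcastingInitializer')
        return weights


# Possible usage (hypothetical):
# init = BroadcastingInitializer(tf.keras.initializers.GlorotUniform())
# layer = tf.keras.layers.Dense(128, kernel_initializer=init)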