Code Example #1
    def get_gradients(self, loss, params):
        """
        Compute gradients of all trainable variables.
        See Optimizer.get_gradients() for more info.
        In DistributedOptimizer, get_gradients() is overridden to also
        push_pull the gradients before returning them.
        """
        gradients = super(self.__class__, self).get_gradients(loss, params)
        if bps.size() > 1:
            averaged_gradients = []
            with tf.name_scope(self._name + "_Push_Pull") as scope:
                for grad in gradients:
                    if grad is not None:
                        if self._sparse_as_dense and \
                                isinstance(grad, tf.IndexedSlices):
                            grad = tf.convert_to_tensor(grad)
                        avg_grad = bps.push_pull(
                            grad,
                            scope,
                            device_dense=self._device_dense,
                            device_sparse=self._device_sparse,
                            compression=self._compression)
                        averaged_gradients.append(avg_grad)
                    else:
                        averaged_gradients.append(None)
                return averaged_gradients
        else:
            return gradients
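
The override above only matters once the base optimizer is actually wrapped. As a point of reference, here is a minimal TF1-style sketch of that wiring, based on the bps.DistributedOptimizer usage shown in Code Example #8; the toy variable and loss are placeholders added here for illustration:

import tensorflow as tf
import byteps.tensorflow as bps

bps.init()

# Toy variable and loss, just to show the wiring.
w = tf.Variable(1.0)
loss = tf.square(w)

# Scale the learning rate by the number of workers, then wrap the optimizer
# so that get_gradients() push_pulls gradients across workers before they
# are applied.
opt = tf.train.RMSPropOptimizer(0.001 * bps.size())
opt = bps.DistributedOptimizer(opt)
train_op = opt.minimize(loss)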
Code Example #2
    def _byteps_average_metrics_in_place(self, logs):
        logs = logs or {}
        reduced_logs = {}
        import byteps.tensorflow as bps

        if self._allreduce_ranks <= 1.:
            self._allreduce_ranks = float(bps.size())
        # Reduce every metric among workers. Sort metrics by name
        # to ensure consistent order.
        for metric, value in sorted(logs.items()):
            from tensorflow.python.eager import context
            if context.executing_eagerly():
                with tf.device(self._device):
                    reduced_logs[metric] = bps.push_pull(
                        K.constant(value, name=metric)).numpy()
            else:
                if metric not in self.variables:
                    with tf.name_scope('MetricAverageCallback') as scope:
                        var = tf.Variable(value, name=metric)
                        K.get_session().run(var.initializer)
                        self._m_vars[metric] = var
                        self._allreduce_ops[metric] = bps.push_pull(
                            var, scope, device_dense=self._device)
                else:
                    K.set_value(self._m_vars[metric], value)
                reduced_logs[metric] = K.get_session().run(
                    self._allreduce_ops[metric])

        # Write the reduced values back into the logs dictionary
        # for other callbacks to use.
        for metric, value in reduced_logs.items():
            logs[metric] = value / self._allreduce_ranks
Code Example #3
def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (bps.size(), device, bps.size() * img_sec_mean, bps.size() * img_sec_conf))
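
run() only times whatever callable it is handed. Below is a hedged sketch of a benchmark_step it could be given, reusing the eager-mode names from Code Example #11 (mnist_model, loss, opt); the synthetic random batch and the pairing with run() are illustrative assumptions, not part of the original script:

def benchmark_step():
    # One synthetic training step on random data; a real benchmark would
    # iterate over an input pipeline instead.
    images = tf.random.uniform([args.batch_size, 28, 28, 1])
    labels = tf.random.uniform([args.batch_size], maxval=10, dtype=tf.int64)
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)
    tape = bps.DistributedGradientTape(tape)
    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

run(benchmark_step)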
Code Example #4
    def reduce_implementation(self, reduce_op, per_replica_value,
                              destinations):
        if tf_cross_device_ops.check_destinations(destinations):
            devices = tf_cross_device_ops.get_devices_from(destinations)
        else:
            devices = tf_cross_device_ops.get_devices_from(per_replica_value)
        reduce_to_device = devices[0]
        logging.log_first_n(logging.INFO,
                            "Using byteps push pull to aggregate values", 1)
        reduced = _simple_reduce(per_replica_value, reduce_to_device,
                                 self.accumulation_fn, reduce_op)
        if size() > 1:
            reduced = _push_pull(reduced)
        return reduced
Code Example #5
    def on_batch_end(self, batch, logs=None):
        if self.broadcast_done:
            return

        if bps.size() <= 1:
            return

        with tf.device(self.device):
            if bps._executing_eagerly() and hasattr(self.model, 'variables'):
                # TensorFlow 2.0 or TensorFlow eager
                bps.broadcast_variables(self.model.variables,
                                        root_rank=self.root_rank)
                bps.broadcast_variables(self.model.optimizer.variables(),
                                        root_rank=self.root_rank)
            else:
                bcast_op = bps.broadcast_global_variables(self.root_rank)
                self.backend.get_session().run(bcast_op)

        self.broadcast_done = True
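
The broadcast is deferred to the end of the first batch (rather than done in on_train_begin as in Code Example #12) because under eager execution the model and optimizer variables may not exist until the first step has run. The following is a minimal self-contained sketch of such a callback; the class name and constructor arguments are hypothetical, chosen here for illustration:

import tensorflow as tf
import byteps.tensorflow as bps


class BroadcastVariablesOnFirstBatch(tf.keras.callbacks.Callback):
    """Broadcast rank-0 model/optimizer variables after the first batch,
    once all variables have been created (hypothetical sketch)."""

    def __init__(self, root_rank=0, device=''):
        super().__init__()
        self.root_rank = root_rank
        self.device = device
        self.broadcast_done = False

    def on_batch_end(self, batch, logs=None):
        if self.broadcast_done or bps.size() <= 1:
            return
        with tf.device(self.device):
            bps.broadcast_variables(self.model.variables,
                                    root_rank=self.root_rank)
            bps.broadcast_variables(self.model.optimizer.variables(),
                                    root_rank=self.root_rank)
        self.broadcast_done = True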
Code Example #6
File: __init__.py Project: zprhhs/byteps
    def _push_pull(self, gradients):
        self._aggregated_gradients = True
        if bps.size() > 1:
            averaged_gradients = []
            with tf.name_scope(self._name + "_Push_Pull") as scope:
                for grad in gradients:
                    if grad is not None:
                        if self._sparse_as_dense and \
                                isinstance(grad, tf.IndexedSlices):
                            grad = tf.convert_to_tensor(grad)
                        avg_grad = bps.push_pull(
                            grad,
                            scope,
                            device_dense=self._device_dense,
                            device_sparse=self._device_sparse,
                            compression=self._compression)
                        averaged_gradients.append(avg_grad)
                    else:
                        averaged_gradients.append(None)
                return averaged_gradients
        else:
            return gradients
Code Example #7
    def _push_pull(self, grads):
        self._aggregated_gradients = True
        import byteps.tensorflow as bps
        if bps.size() > 1:
            averaged_gradients = []
            with tf.name_scope(
                    "DistributedLossScaleOptimizer_Push_Pull") as scope:
                for grad in grads:
                    if grad is not None:
                        if self._sparse_as_dense and isinstance(
                                grad, tf.IndexedSlices):
                            grad = tf.convert_to_tensor(grad)
                        avg_grad = bps.push_pull(
                            grad,
                            scope,
                            device_dense=self._device_dense,
                            device_sparse=self._device_sparse,
                            compression=self._compression)
                        averaged_gradients.append(avg_grad)
                    else:
                        averaged_gradients.append(None)
                return averaged_gradients
        else:
            return grads
Code Example #8
File: tensorflow_mnist.py Project: yuxihu/byteps
def main(_):
    # BytePS: initialize BytePS.
    bps.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % bps.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # BytePS: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * bps.size())

    # BytePS: add BytePS Distributed Optimizer.
    opt = bps.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=200000 // bps.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # BytePS: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    # BytePS: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if bps.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
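
train_input_generator is referenced above but not included in this excerpt. The following is a plausible minimal implementation, assuming it simply shuffles the data and yields fixed-size (image, label) batches forever; the details are an assumption, not taken from the original file:

def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle once per pass, then yield consecutive batches.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size]
            index += batch_size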
Code Example #9
def loss_function():
    logits = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, logits)


def log(s, nl=True):
    if bps.rank() != 0:
        return
    print(s, end='\n' if nl else '')
    sys.stdout.flush()

log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, bps.size()))


def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)
Code Example #10
    def multiplier(epoch):
        # Adjust epoch to produce round numbers at the end of each epoch,
        # so that TensorBoard learning rate graphs look better.
        epoch += 1. / self.steps_per_epoch
        return 1. / bps.size() * (epoch * (bps.size() - 1) / warmup_epochs + 1)
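
Plugging in the endpoints shows what this schedule does: the effective learning-rate multiplier starts near 1/bps.size() and ramps linearly up to 1.0 once warmup_epochs have elapsed, after which the full size()-scaled learning rate applies. A quick standalone check with illustrative values (4 workers, 5 warmup epochs):

# Sanity check of the warmup schedule endpoints with assumed values.
size = 4
warmup_epochs = 5.0

def multiplier(epoch):
    return 1.0 / size * (epoch * (size - 1) / warmup_epochs + 1)

print(multiplier(0.0))            # 0.25 -> starts near base_lr / size
print(multiplier(warmup_epochs))  # 1.0  -> reaches the full scaled rate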
Code Example #11
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images, tf.float32), tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()

opt = tf.optimizers.Adam(0.001 * bps.size())

checkpoint_dir = './checkpoints'
checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    tape = bps.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
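
The first_batch flag passed to training_step is not used inside this excerpt; Code Example #13 shows the idiom it normally enables: broadcasting the freshly created variables from rank 0 right after the first apply_gradients call. A sketch of how the tail of training_step typically continues, adapted to the names used here (the trailing return is an assumption):

    # BytePS: broadcast initial variable states from rank 0 to all other
    # processes after the first gradient update, when both the model and
    # the optimizer slot variables exist.
    if first_batch:
        bps.broadcast_variables(mnist_model.variables, root_rank=0)
        bps.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value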
Code Example #12
    def on_train_begin(self, logs=None):
        if bps.size() <= 1:
            return
        with tf.device(self.device):
            bcast_op = bps.broadcast_global_variables(self.root_rank)
            self.backend.get_session().run(bcast_op)
Code Example #13
    # BytePS: broadcast initial variable states from rank 0 to all other
    # processes to ensure consistent initialization.
    if first_batch:
        bps.broadcast_variables(model.variables, root_rank=0)
        bps.broadcast_variables(opt.variables(), root_rank=0)


def log(s, nl=True):
    if bps.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, bps.size()))

with tf.device(device):
    # Warm-up
    log('Running warmup...')
    benchmark_step(first_batch=True)
    timeit.timeit(lambda: benchmark_step(first_batch=False),
                  number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(lambda: benchmark_step(first_batch=False),
                             number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
Code Example #14
def main(_):
    bps.init()
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig(256)

    model_fn = model_fn_builder(bert_config=bert_config,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps)

    max_seq_length = FLAGS.max_seq_length
    max_predictions_per_seq = FLAGS.max_predictions_per_seq

    with tf.name_scope("input"):
        input_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        input_mask = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        segment_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        masked_lm_positions = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_weights = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.float32)
        next_sentence_labels = tf.placeholder(
            shape=[FLAGS.train_batch_size, 1], dtype=tf.int32)

    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "masked_lm_positions": masked_lm_positions,
        "masked_lm_ids": masked_lm_ids,
        "masked_lm_weights": masked_lm_weights,
        "next_sentence_labels": next_sentence_labels
    }

    train_op = model_fn(features, None, None, None)

    infer_shape_ops = add_infer_shape_ops()

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=205 // bps.size()),
    ]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    training_batch_generator = train_input_generator(features)

    with tf.train.MonitoredTrainingSession(hooks=hooks,
                                           config=config) as mon_sess:
        mon_sess = TimelineSession(mon_sess, infer_shape_ops)
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            feed_dict = next(training_batch_generator)
            mon_sess.run([train_op], feed_dict=feed_dict)
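
train_input_generator(features) is another local helper that is not part of this excerpt. Below is a hypothetical minimal version, assuming it just yields feed_dicts of random dummy data matching each placeholder's static shape and dtype (sufficient for throughput measurement, not for real training):

def train_input_generator(features):
    while True:
        feed_dict = {}
        for placeholder in features.values():
            shape = placeholder.shape.as_list()
            if placeholder.dtype == tf.int32:
                # Small random integers as dummy token ids / labels.
                feed_dict[placeholder] = np.random.randint(
                    0, 2, size=shape).astype(np.int32)
            else:
                feed_dict[placeholder] = np.random.rand(
                    *shape).astype(np.float32)
        yield feed_dict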