def __init__(self, average=True):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
    """
    import byteps.tensorflow as bps
    self.hvd = bps  # BytePS has the same interface as Horovod
    self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8

    assert os.environ.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
    bps.init()
    self.is_chief = bps.rank() == 0

    self._local_rank = bps.local_rank()
    self._rank = bps.rank()
    self._average = average

    self._compression = None
    self._has_compression = False
    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
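# The asserts above expect the DMLC/BytePS environment to be bootstrapped
# before the trainer is constructed; in real deployments the BytePS launcher
# exports these variables. A minimal, hand-rolled sketch of a single-machine,
# single-worker environment (the server/scheduler entries and their values
# are illustrative assumptions, not something this trainer sets itself):
import os

os.environ["DMLC_ROLE"] = "worker"            # this process runs training
os.environ["DMLC_WORKER_ID"] = "0"            # rank of this worker
os.environ["DMLC_NUM_WORKER"] = "1"           # total number of workers
os.environ["DMLC_NUM_SERVER"] = "1"           # parameter servers (assumed)
os.environ["DMLC_PS_ROOT_URI"] = "127.0.0.1"  # scheduler address (illustrative)
os.environ["DMLC_PS_ROOT_PORT"] = "1234"      # scheduler port (illustrative)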
parser.add_argument('--num-warmup-batches', type=int, default=10,
                    help='number of warm-up batches that don\'t count towards benchmark')
parser.add_argument('--num-batches-per-iter', type=int, default=10,
                    help='number of batches per benchmark iteration')
parser.add_argument('--num-iters', type=int, default=10000,
                    help='number of benchmark iterations')
parser.add_argument('--eager', action='store_true', default=False,
                    help='enables eager execution')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')

args = parser.parse_args()
args.cuda = not args.no_cuda

bps.init()

# BytePS: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
if args.cuda:
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
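# The flags above describe a warm-up-then-measure benchmark. A sketch of how
# such a loop might consume them (the timeit-based timing and the stand-in
# benchmark_step/args values are assumptions for illustration; the real
# script times actual training steps):
import timeit
import types

# Stand-ins so the sketch is self-contained; in the real script these come
# from argparse and the model definition.
args = types.SimpleNamespace(num_warmup_batches=10, num_batches_per_iter=10,
                             num_iters=3, batch_size=32)

def benchmark_step():
    # placeholder for one training step (forward + backward + push_pull)
    sum(i * i for i in range(1000))

# Warm-up batches run but do not count towards the benchmark.
timeit.timeit(benchmark_step, number=args.num_warmup_batches)

for _ in range(args.num_iters):
    elapsed = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
    img_sec = args.batch_size * args.num_batches_per_iter / elapsed
    print('%.1f img/sec per device' % img_sec)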
def main(_):
    # BytePS: initialize BytePS.
    bps.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race condition among
    # the workers that share the same filesystem. If the directory already
    # exists by the time this worker gets around to creating it, ignore the
    # resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % bps.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # BytePS: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * bps.size())

    # BytePS: add BytePS Distributed Optimizer.
    opt = bps.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=200000 // bps.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # BytePS: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    # BytePS: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if bps.rank() == 0 else None

    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
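# The MNIST example above uses a train_input_generator helper that is not
# shown. A plausible sketch (the endless shuffle-and-slice behavior is an
# assumption based on how the generator is consumed with next() above):
import numpy as np

def train_input_generator(x_train, y_train, batch_size=64):
    # Yield shuffled mini-batches forever, reshuffling after each full pass.
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        for i in range(0, len(x_train) - batch_size + 1, batch_size):
            yield x_train[i:i + batch_size], y_train[i:i + batch_size]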
def main(_):
    bps.init()
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig(256)
    model_fn = model_fn_builder(bert_config=bert_config,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps)

    max_seq_length = FLAGS.max_seq_length
    max_predictions_per_seq = FLAGS.max_predictions_per_seq

    with tf.name_scope("input"):
        input_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        input_mask = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        segment_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        masked_lm_positions = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_weights = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.float32)
        next_sentence_labels = tf.placeholder(
            shape=[FLAGS.train_batch_size, 1], dtype=tf.int32)

    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "masked_lm_positions": masked_lm_positions,
        "masked_lm_ids": masked_lm_ids,
        "masked_lm_weights": masked_lm_weights,
        "next_sentence_labels": next_sentence_labels
    }

    train_op = model_fn(features, None, None, None)
    infer_shape_ops = add_infer_shape_ops()

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=205 // bps.size()),
    ]

    # BytePS: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    training_batch_generator = train_input_generator(features)
    with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as mon_sess:
        mon_sess = TimelineSession(mon_sess, infer_shape_ops)
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            feed_dict = next(training_batch_generator)
            mon_sess.run([train_op], feed_dict=feed_dict)
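# Here train_input_generator receives the placeholder dict itself, which
# suggests it synthesizes batches rather than reading a dataset. A sketch
# under that assumption (random data matching each placeholder's static
# shape, purely illustrative; a real pretraining job would feed tokenized
# text, and tf is the TensorFlow module already imported by this script):
import numpy as np

def train_input_generator(features):
    # Feed every placeholder a random tensor matching its static shape.
    while True:
        feed_dict = {}
        for placeholder in features.values():
            shape = placeholder.shape.as_list()
            if placeholder.dtype == tf.float32:
                feed_dict[placeholder] = np.random.rand(*shape).astype(np.float32)
            else:
                feed_dict[placeholder] = np.random.randint(
                    0, 2, size=shape, dtype=np.int32)
        yield feed_dict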