def allreduce(self, grads):
    if self.hvd.size() == 1:
        return grads

    # Copied from https://github.com/uber/horovod/blob/master/horovod/tensorflow/__init__.py
    # GRACE: build the compression pipeline once, outside the per-gradient loop,
    # so that ResidualMemory keeps its residual state across gradients and steps
    # instead of being re-created (and reset) for every tensor.
    import horovod.tensorflow as hvd
    from grace_dl.tensorflow.communicator.allgather import Allgather
    from grace_dl.tensorflow.compressor.topk import TopKCompressor
    from grace_dl.tensorflow.memory.residual import ResidualMemory

    world_size = hvd.size()
    grc = Allgather(TopKCompressor(0.3), ResidualMemory(), world_size)

    averaged_gradients = []
    with tf.name_scope("AllReduce"):
        for grad, var in grads:
            if grad is not None:
                # Apply gradient compression using GRACE.
                if self._compression is not None and self._has_compression:
                    avg_grad = self.hvd.allreduce(grad,
                                                  grace=grc,
                                                  average=self._average,
                                                  compression=self._compression)
                else:
                    avg_grad = self.hvd.allreduce(grad,
                                                  grace=grc,
                                                  average=self._average)
                averaged_gradients.append((avg_grad, var))
            else:
                averaged_gradients.append((None, var))
    return averaged_gradients
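# For context, a minimal sketch of the setup this hook assumes: the GRACE
# wrapper is built once and handed to Horovod's DistributedOptimizer, whose
# allreduce path is the method above. The optimizer and learning rate here are
# illustrative, not values taken from this file.
import tensorflow as tf
import horovod.tensorflow as hvd
from grace_dl.tensorflow.communicator.allgather import Allgather
from grace_dl.tensorflow.compressor.topk import TopKCompressor
from grace_dl.tensorflow.memory.residual import ResidualMemory

hvd.init()
grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size())
opt = hvd.DistributedOptimizer(tf.train.AdamOptimizer(0.001 * hvd.size()),
                               grace=grc)
# Gradients computed through `opt` are now compressed, exchanged via
# allgather, and corrected with residual error feedback.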
def grace_from_params(params):
    comp = params.get('compressor', 'none')
    mem = params.get('memory', 'none')
    comm = params.get('communicator', 'allreduce')

    if comp == 'adapsparse':
        from grace_dl.tensorflow.compressor.adapsparse import AdapSparseCompressor
        compressor = AdapSparseCompressor()
    elif comp == 'adaq':
        from grace_dl.tensorflow.compressor.adaq import AdaqCompressor
        compressor = AdaqCompressor()
    elif comp == 'dgc':
        from grace_dl.tensorflow.compressor.dgc import DgcCompressor
        compressor = DgcCompressor()
    elif comp == 'efsignsgd':
        from grace_dl.tensorflow.compressor.efsignsgd import EFSignSGDCompressor
        compressor = EFSignSGDCompressor()
    elif comp == 'fp16':
        from grace_dl.tensorflow.compressor.fp16 import FP16Compressor
        compressor = FP16Compressor()
    elif comp == 'inceptionnc':
        from grace_dl.tensorflow.compressor.inceptionn import INCEPTIONNCompressor
        compressor = INCEPTIONNCompressor()
    elif comp == 'natural':
        from grace_dl.tensorflow.compressor.natural import NaturalCompressor
        compressor = NaturalCompressor()
    elif comp == 'none':
        from grace_dl.tensorflow.compressor.none import NoneCompressor
        compressor = NoneCompressor()
    elif comp == 'onebit':
        from grace_dl.tensorflow.compressor.onebit import OneBitCompressor
        compressor = OneBitCompressor()
    elif comp == 'powersgd':
        from grace_dl.tensorflow.compressor.powersgd import PowerSGDCompressor
        compressor = PowerSGDCompressor()
    elif comp == 'qsgd':
        from grace_dl.tensorflow.compressor.qsgd import QSGDCompressor
        compressor = QSGDCompressor()
    elif comp == 'randomk':
        from grace_dl.tensorflow.compressor.randomk import RandomKCompressor
        compressor = RandomKCompressor()
    elif comp == 'signsgd':
        from grace_dl.tensorflow.compressor.signsgd import SignSGDCompressor
        compressor = SignSGDCompressor()
    elif comp == 'signum':
        from grace_dl.tensorflow.compressor.signum import SignumCompressor
        compressor = SignumCompressor()
    elif comp == 'sketch':
        from grace_dl.tensorflow.compressor.sketch import SketchCompressor
        compressor = SketchCompressor()
    elif comp == 'terngrad':
        from grace_dl.tensorflow.compressor.terngrad import TernGradCompressor
        compressor = TernGradCompressor()
    elif comp == 'threshold':
        from grace_dl.tensorflow.compressor.threshold import ThresholdCompressor
        compressor = ThresholdCompressor()
    elif comp == 'topk':
        from grace_dl.tensorflow.compressor.topk import TopKCompressor
        compressor = TopKCompressor()
    elif comp == 'u8bit':
        from grace_dl.tensorflow.compressor.u8bit import U8bitCompressor
        compressor = U8bitCompressor()
    else:
        raise NotImplementedError(comp)

    if mem == 'dgc':
        from grace_dl.tensorflow.memory.dgc import DgcMemory
        memory = DgcMemory()
    elif mem == 'none':
        from grace_dl.tensorflow.memory.none import NoneMemory
        memory = NoneMemory()
    elif mem == 'powersgd':
        from grace_dl.tensorflow.memory.powersgd import PowerSGDMemory
        memory = PowerSGDMemory()
    elif mem == 'residual':
        from grace_dl.tensorflow.memory.residual import ResidualMemory
        memory = ResidualMemory()
    else:
        raise NotImplementedError(mem)

    if comm == 'allreduce':
        from grace_dl.tensorflow.communicator.allreduce import Allreduce
        return Allreduce(compressor, memory, params['world_size'])
    elif comm == 'allgather':
        from grace_dl.tensorflow.communicator.allgather import Allgather
        return Allgather(compressor, memory, params['world_size'])
    elif comm == 'broadcast':
        from grace_dl.tensorflow.communicator.broadcast import Broadcast
        return Broadcast(compressor, memory, params['world_size'])
    else:
        raise NotImplementedError(comm)
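# A minimal sketch of driving the factory above; hvd.init() is assumed to have
# run already, and this variant expects 'world_size' to be supplied in params.
# The compressor/memory/communicator choice mirrors the combination hard-coded
# elsewhere in these scripts.
import horovod.tensorflow as hvd

params = {
    'compressor': 'topk',
    'memory': 'residual',
    'communicator': 'allgather',
    'world_size': hvd.size(),
}
grc = grace_from_params(params)
# `grc` can then be passed to hvd.DistributedOptimizer(opt, grace=grc).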
data = tf.random.uniform([args.batch_size, 224, 224, 3])
target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)

loss = tf.losses.SparseCategoricalCrossentropy()

# Horovod: adjust learning rate based on number of GPUs.
opt = tf.optimizers.Adam(0.001 * hvd.size())

checkpoint_dir = './checkpoints'
checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)

# GRACE: compression algorithm
grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size())


@tf.function
def benchmark_step(first_batch):
    # Horovod: (optional) compression algorithm.
    # compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: use DistributedGradientTape
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.sparse_categorical_crossentropy(target, probs)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, grace=grc)

    # Completed as in Horovod's synthetic benchmark: apply the (compressed,
    # averaged) gradients, then broadcast initial state after the first step so
    # all workers start from identical weights and optimizer slots.
    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
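# A minimal driver sketch for benchmark_step above, patterned after Horovod's
# synthetic benchmark: a short warm-up, then a timed run. The iteration counts
# here are illustrative, not values taken from this script.
import timeit

for i in range(5):  # warm-up; the first call also broadcasts initial state
    benchmark_step(first_batch=(i == 0))

duration = timeit.timeit(lambda: benchmark_step(first_batch=False), number=10)
if hvd.rank() == 0:
    print('Images/sec per worker: %.1f' % (10 * args.batch_size / duration))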
def __call__(self, features, labels, mode, params):

    if "debug_verbosity" not in params.keys():
        raise RuntimeError("Parameter `debug_verbosity` is missing...")

    if mode == tf.estimator.ModeKeys.TRAIN:
        if "rmsprop_decay" not in params.keys():
            raise RuntimeError("Parameter `rmsprop_decay` is missing...")
        if "rmsprop_momentum" not in params.keys():
            raise RuntimeError("Parameter `rmsprop_momentum` is missing...")
        if "learning_rate" not in params.keys():
            raise RuntimeError("Parameter `learning_rate` is missing...")
        if "learning_rate_decay_steps" not in params.keys():
            raise RuntimeError("Parameter `learning_rate_decay_steps` is missing...")
        if "learning_rate_decay_factor" not in params.keys():
            raise RuntimeError("Parameter `learning_rate_decay_factor` is missing...")
        if "weight_decay" not in params.keys():
            raise RuntimeError("Parameter `weight_decay` is missing...")
        if "loss_fn_name" not in params.keys():
            raise RuntimeError("Parameter `loss_fn_name` is missing...")

    if mode == tf.estimator.ModeKeys.PREDICT:
        y_pred, y_pred_logits = self.build_model(
            features,
            training=False,
            reuse=False,
            debug_verbosity=params["debug_verbosity"])
        predictions = {'logits': y_pred}
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    input_image, mask_image = features

    with tf.device("/gpu:0"):
        tf.identity(input_image, name="input_image_ref")
        tf.identity(mask_image, name="mask_image_ref")
        tf.identity(labels, name="labels_ref")

        y_pred, y_pred_logits = self.build_model(
            input_image,
            training=mode == tf.estimator.ModeKeys.TRAIN,
            reuse=False,
            debug_verbosity=params["debug_verbosity"])

        all_trainable_vars = tf.reduce_sum(
            [tf.reduce_prod(v.shape) for v in tf.trainable_variables()])
        tf.identity(all_trainable_vars, name='trainable_parameters_count_ref')

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metrics = dict()

        # ==================== Samples ==================== #
        image_uint8 = tf.cast((input_image + 1) * 127.5, dtype=tf.uint8)
        input_image_jpeg = tf.image.encode_jpeg(
            image_uint8[0], format='grayscale', quality=100)
        tf.identity(input_image_jpeg, name="input_image_jpeg_ref")

        for threshold in [None, 0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99]:
            binarize_img, binarize_img_jpeg = image_processing.binarize_output(
                y_pred[0], threshold=threshold)
            tf.identity(binarize_img_jpeg, name="output_sample_ths_%s_ref" % threshold)
            tf.summary.image('output_sample_ths_%s' % threshold, binarize_img, 10)

        # ==============+ Evaluation Metrics ==================== #
        with tf.name_scope("IoU_Metrics"):
            for threshold in [0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99]:
                iou_score = metrics.iou_score(
                    y_pred=y_pred, y_true=mask_image, threshold=threshold)
                tf.identity(iou_score, name='iou_score_ths_%s_ref' % threshold)
                tf.summary.scalar('iou_score_ths_%s' % threshold, iou_score)

                if mode == tf.estimator.ModeKeys.EVAL:
                    eval_metrics["IoU_THS_%s" % threshold] = tf.metrics.mean(iou_score)

        labels = tf.cast(labels, tf.float32)
        labels_preds = tf.reduce_max(y_pred, axis=(1, 2, 3))

        # The original code asserted on an `(expression, message)` tuple, which
        # is always truthy and never fires; a graph-mode assertion keeps the
        # intended sanity check that clipping is a near no-op.
        with tf.control_dependencies([
                tf.debugging.assert_less(
                    tf.abs(labels_preds - tf.clip_by_value(labels_preds, 0, 1)),
                    0.00001,
                    message="Clipping labels_preds introduces non-trivial loss.")
        ]):
            labels_preds = tf.clip_by_value(labels_preds, 0, 1)

        with tf.variable_scope("Confusion_Matrix") as scope:
            thresholds = [0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99]

            tp, update_tp = tf.metrics.true_positives_at_thresholds(
                labels=labels, predictions=labels_preds, thresholds=thresholds)
            tn, update_tn = tf.metrics.true_negatives_at_thresholds(
                labels=labels, predictions=labels_preds, thresholds=thresholds)
            fp, update_fp = tf.metrics.false_positives_at_thresholds(
                labels=labels, predictions=labels_preds, thresholds=thresholds)
            fn, update_fn = tf.metrics.false_negatives_at_thresholds(
                labels=labels, predictions=labels_preds, thresholds=thresholds)

            if mode == tf.estimator.ModeKeys.TRAIN:
                local_vars = tf.get_collection(
                    tf.GraphKeys.LOCAL_VARIABLES, scope=scope.name)
                confusion_matrix_reset_op = tf.initializers.variables(
                    local_vars, name='reset_op')

                with tf.control_dependencies([confusion_matrix_reset_op]):
                    with tf.control_dependencies(
                            [update_tp, update_tn, update_fp, update_fn]):
                        tp = tf.identity(tp)
                        tn = tf.identity(tn)
                        fp = tf.identity(fp)
                        fn = tf.identity(fn)
            else:
                eval_metrics["Confusion_Matrix_TP"] = tp, update_tp
                eval_metrics["Confusion_Matrix_TN"] = tn, update_tn
                eval_metrics["Confusion_Matrix_FP"] = fp, update_fp
                eval_metrics["Confusion_Matrix_FN"] = fn, update_fn

            tf.identity(tp, name='true_positives_ref')   # Confusion_Matrix/true_positives_ref:0
            tf.identity(tn, name='true_negatives_ref')   # Confusion_Matrix/true_negatives_ref:0
            tf.identity(fp, name='false_positives_ref')  # Confusion_Matrix/false_positives_ref:0
            tf.identity(fn, name='false_negatives_ref')  # Confusion_Matrix/false_negatives_ref:0

            tf.summary.scalar('true_positives', tp[3])   # For Ths = 0.5
            tf.summary.scalar('true_negatives', tn[3])   # For Ths = 0.5
            tf.summary.scalar('false_positives', fp[3])  # For Ths = 0.5
            tf.summary.scalar('false_negatives', fn[3])  # For Ths = 0.5

        binarized_mask, binarized_mask_jpeg = image_processing.binarize_output(
            mask_image[0], threshold=0.5)
        tf.identity(binarized_mask_jpeg, name="mask_sample_ref")
        tf.summary.image('sample_mask', binarized_mask, 10)

        ##########################
        mask_max_val = tf.reduce_max(mask_image)
        tf.identity(mask_max_val, name='mask_max_val_ref')
        mask_min_val = tf.reduce_min(mask_image)
        tf.identity(mask_min_val, name='mask_min_val_ref')
        mask_mean_val = tf.reduce_mean(mask_image)
        tf.identity(mask_mean_val, name='mask_mean_val_ref')
        mask_std_val = tf.math.reduce_std(mask_image)
        tf.identity(mask_std_val, name='mask_std_val_ref')

        ##########################
        output_max_val = tf.reduce_max(y_pred)
        tf.identity(output_max_val, name='output_max_val_ref')
        output_min_val = tf.reduce_min(y_pred)
        tf.identity(output_min_val, name='output_min_val_ref')
        output_mean_val = tf.reduce_mean(y_pred)
        tf.identity(output_mean_val, name='output_mean_val_ref')
        output_std_val = tf.math.reduce_std(y_pred)
        tf.identity(output_std_val, name='output_std_val_ref')

        with tf.variable_scope("losses"):
            # ==============+ Reconstruction Loss ==================== #
            if params["loss_fn_name"] == "x-entropy":
                reconstruction_loss = losses.reconstruction_x_entropy(
                    y_pred=y_pred, y_true=mask_image)
            elif params["loss_fn_name"] == "l2_loss":
                reconstruction_loss = losses.reconstruction_l2loss(
                    y_pred=y_pred, y_true=mask_image)
            elif params["loss_fn_name"] == "dice_sorensen":
                reconstruction_loss = 1 - losses.dice_coe(
                    y_pred=y_pred, y_true=mask_image, loss_type='sorensen')
            elif params["loss_fn_name"] == "dice_jaccard":
                reconstruction_loss = 1 - losses.dice_coe(
                    y_pred=y_pred, y_true=mask_image, loss_type='jaccard')
            elif params["loss_fn_name"] == "adaptive_loss":
                reconstruction_loss = losses.adaptive_loss(
                    y_pred=y_pred,
                    y_pred_logits=y_pred_logits,
                    y_true=mask_image,
                    switch_at_threshold=0.3,
                    loss_type='sorensen')
            else:
                raise ValueError("Unknown loss function received: %s" %
                                 params["loss_fn_name"])

            tf.identity(reconstruction_loss, name='reconstruction_loss_ref')
            tf.summary.scalar('reconstruction_loss', reconstruction_loss)

            if mode == tf.estimator.ModeKeys.TRAIN:
                # ============== Regularization Loss ==================== #
                l2_loss = losses.regularization_l2loss(
                    weight_decay=params["weight_decay"])
                tf.identity(l2_loss, name='l2_loss_ref')
                tf.summary.scalar('l2_loss', l2_loss)
                total_loss = tf.add(reconstruction_loss, l2_loss, name="total_loss")
            else:
                total_loss = reconstruction_loss

            tf.identity(total_loss, name='total_loss_ref')
            tf.summary.scalar('total_loss', total_loss)

        if mode == tf.estimator.ModeKeys.TRAIN:
            with tf.variable_scope("optimizers"):
                # Update Global Step
                global_step = tf.train.get_or_create_global_step()
                tf.identity(global_step, name="global_step_ref")

                learning_rate = tf.train.exponential_decay(
                    learning_rate=params["learning_rate"],
                    decay_steps=params["learning_rate_decay_steps"],
                    decay_rate=params["learning_rate_decay_factor"],
                    global_step=global_step,
                    staircase=True)
                tf.identity(learning_rate, name="learning_rate_ref")
                tf.summary.scalar('learning_rate_ref', learning_rate)

                opt = tf.train.RMSPropOptimizer(
                    learning_rate=learning_rate,
                    use_locking=False,
                    centered=True,
                    decay=params["rmsprop_decay"],
                    momentum=params["rmsprop_momentum"],
                )

                if hvd_utils.is_using_hvd():
                    # Apply gradient compression using GRACE.
                    from grace_dl.tensorflow.communicator.allgather import Allgather
                    from grace_dl.tensorflow.compressor.topk import TopKCompressor
                    from grace_dl.tensorflow.memory.residual import ResidualMemory

                    world_size = hvd.size()
                    grc = Allgather(TopKCompressor(0.3), ResidualMemory(), world_size)
                    opt = hvd.DistributedOptimizer(opt, grace=grc, device_dense='/gpu:0')

                if params["apply_manual_loss_scaling"]:
                    # if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
                    #     Logger.log("Applying manual Loss Scaling ...")
                    loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
                        init_loss_scale=2**32,  # 4,294,967,296
                        incr_every_n_steps=1000)
                    opt = tf.contrib.mixed_precision.LossScaleOptimizer(
                        opt, loss_scale_manager)

                deterministic = True
                gate_gradients = (tf.train.Optimizer.GATE_OP
                                  if deterministic else tf.train.Optimizer.GATE_NONE)

                backprop_op = opt.minimize(total_loss,
                                           gate_gradients=gate_gradients,
                                           global_step=global_step)

                train_op = tf.group(backprop_op,
                                    tf.get_collection(tf.GraphKeys.UPDATE_OPS))

                return tf.estimator.EstimatorSpec(
                    mode,
                    loss=total_loss,
                    train_op=train_op,
                )

        elif mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode,
                loss=total_loss,
                eval_metric_ops=eval_metrics,
                predictions={"output": y_pred})

        else:
            raise NotImplementedError('Unknown mode {}'.format(mode))
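# A minimal sketch of wiring the model function above into an Estimator. The
# class name `UNetModel`, `train_input_fn`, and the numeric param values are
# illustrative assumptions, not the repository's defaults; with Horovod, each
# process pins its own GPU and only rank 0 keeps a model_dir.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
session_config = tf.ConfigProto()
session_config.gpu_options.visible_device_list = str(hvd.local_rank())

estimator = tf.estimator.Estimator(
    model_fn=UNetModel(),  # the callable defined above
    model_dir='./results' if hvd.rank() == 0 else None,
    config=tf.estimator.RunConfig(session_config=session_config),
    params={
        'debug_verbosity': 0,
        'rmsprop_decay': 0.9,
        'rmsprop_momentum': 0.8,
        'learning_rate': 1e-4,
        'learning_rate_decay_steps': 500,
        'learning_rate_decay_factor': 0.8,
        'weight_decay': 1e-5,
        'loss_fn_name': 'adaptive_loss',
        'apply_manual_loss_scaling': False,
    })
estimator.train(input_fn=train_input_fn,
                hooks=[hvd.BroadcastGlobalVariablesHook(0)])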
def ncf_model_ops(users, items, labels, dup_mask, params, mode='TRAIN'):
    """
    Constructs the training and evaluation graphs
    """
    # Validation params
    val_batch_size = params['val_batch_size']
    K = params['top_k']
    # Training params
    learning_rate = params['learning_rate']
    beta_1 = params['beta_1']
    beta_2 = params['beta_2']
    epsilon = params['epsilon']
    # Model params
    fp16 = params['fp16']
    nb_users = params['num_users']
    nb_items = params['num_items']
    mf_dim = params['num_factors']
    mf_reg = params['mf_reg']
    mlp_layer_sizes = params['layer_sizes']
    mlp_layer_regs = params['layer_regs']
    dropout = params['dropout']
    sigmoid = False  # params['sigmoid']
    loss_scale = params['loss_scale']

    model_dtype = tf.float16 if fp16 else tf.float32

    # If manually enabling mixed precision, use the custom variable getter
    custom_getter = None if not fp16 else float32_variable_storage_getter

    # Allow soft device placement
    with tf.device(None), \
         tf.variable_scope('neumf', custom_getter=custom_getter):
        # Model graph
        logits = neural_mf(
            users,
            items,
            model_dtype,
            nb_users,
            nb_items,
            mf_dim,
            mf_reg,
            mlp_layer_sizes,
            mlp_layer_regs,
            dropout,
            sigmoid
        )
        logits = tf.squeeze(logits)

        if mode == 'INFERENCE':
            return logits

        # Evaluation Ops
        found_positive, dcg = compute_eval_metrics(logits, dup_mask,
                                                   val_batch_size, K)
        # Metrics
        hit_rate = tf.metrics.mean(found_positive, name='hit_rate')
        ndcg = tf.metrics.mean(dcg, name='ndcg')

        eval_op = tf.group(hit_rate[1], ndcg[1])

        if mode == 'EVAL':
            return hit_rate[0], ndcg[0], eval_op, None

        # Labels
        labels = tf.reshape(labels, [-1, 1])
        logits = tf.reshape(logits, [-1, 1])

        # Use adaptive momentum optimizer
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate,
            beta1=beta_1,
            beta2=beta_2,
            epsilon=epsilon)

        loss = tf.losses.sigmoid_cross_entropy(
            labels,
            logits,
            reduction=tf.losses.Reduction.MEAN)

        # Apply loss scaling if manually enabling mixed precision
        if fp16:
            if loss_scale is None:
                loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
                    2**32, 1000)
            else:
                loss_scale_manager = tf.contrib.mixed_precision.FixedLossScaleManager(
                    loss_scale)
            optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(
                optimizer, loss_scale_manager)

        # Horovod wrapper for distributed training.
        # Apply gradient compression using GRACE.
        from grace_dl.tensorflow.communicator.allgather import Allgather
        from grace_dl.tensorflow.compressor.topk import TopKCompressor
        from grace_dl.tensorflow.memory.residual import ResidualMemory

        world_size = hvd.size()
        grc = Allgather(TopKCompressor(0.3), ResidualMemory(), world_size)
        optimizer = hvd.DistributedOptimizer(optimizer, grace=grc)

        # Update ops
        global_step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss, global_step=global_step)

    return hit_rate[0], ndcg[0], eval_op, train_op
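# A sketch of the `params` dict the function above consumes. The values are
# illustrative, not the repository's defaults; the input tensors (users, items,
# labels, dup_mask) are assumed to come from the dataset pipeline, and
# 'num_users'/'num_items' are dataset-dependent.
params = {
    'val_batch_size': 2**20,
    'top_k': 10,
    'learning_rate': 0.0045,
    'beta_1': 0.25,
    'beta_2': 0.5,
    'epsilon': 1e-8,
    'fp16': False,
    'num_users': 138493,   # e.g. ml-20m
    'num_items': 26744,    # e.g. ml-20m
    'num_factors': 64,
    'mf_reg': 0.0,
    'layer_sizes': [256, 256, 128, 64],
    'layer_regs': [0.0, 0.0, 0.0, 0.0],
    'dropout': 0.5,
    'loss_scale': None,
}
hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
    users, items, labels, dup_mask, params, mode='TRAIN')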
def grace_from_params(params):
    import horovod.tensorflow as hvd
    world_size = hvd.size()

    comp = params.get('compressor', 'none')
    mem = params.get('memory', 'none')
    comm = params.get('communicator', 'allreduce')

    if comp == 'adaq':
        from grace_dl.tensorflow.compressor.adaq import AdaqCompressor
        compressor = AdaqCompressor(compress_ratio=0.01)
    elif comp == 'dgc':
        from grace_dl.tensorflow.compressor.dgc import DgcCompressor
        compressor = DgcCompressor(compress_ratio=0.01)
    elif comp == 'efsignsgd':
        from grace_dl.tensorflow.compressor.efsignsgd import EFSignSGDCompressor
        compressor = EFSignSGDCompressor(lr=0.1)
    elif comp == 'fp16':
        from grace_dl.tensorflow.compressor.fp16 import FP16Compressor
        compressor = FP16Compressor()
    elif comp == 'inceptionn':
        from grace_dl.tensorflow.compressor.inceptionn import INCEPTIONNCompressor
        compressor = INCEPTIONNCompressor(error_bound=2e-10)
    elif comp == 'natural':
        from grace_dl.tensorflow.compressor.natural import NaturalCompressor
        compressor = NaturalCompressor()
    elif comp == 'none':
        from grace_dl.tensorflow.compressor.none import NoneCompressor
        compressor = NoneCompressor()
    elif comp == 'onebit':
        from grace_dl.tensorflow.compressor.onebit import OneBitCompressor
        compressor = OneBitCompressor()
    elif comp == 'powersgd':
        from grace_dl.tensorflow.compressor.powersgd import PowerSGDCompressor
        compressor = PowerSGDCompressor(momentum_factor=0.9, world_size=world_size)
    elif comp == 'qsgd':
        from grace_dl.tensorflow.compressor.qsgd import QSGDCompressor
        compressor = QSGDCompressor(quantum_num=64)
    elif comp == 'randomk':
        from grace_dl.tensorflow.compressor.randomk import RandomKCompressor
        compressor = RandomKCompressor(compress_ratio=0.01)
    elif comp == 'signsgd':
        from grace_dl.tensorflow.compressor.signsgd import SignSGDCompressor
        compressor = SignSGDCompressor()
    elif comp == 'signum':
        from grace_dl.tensorflow.compressor.signum import SignumCompressor
        compressor = SignumCompressor(momentum=0.9)
    elif comp == 'sketch':
        from grace_dl.tensorflow.compressor.sketch import SketchCompressor
        compressor = SketchCompressor(quantiles=64)
    elif comp == 'terngrad':
        from grace_dl.tensorflow.compressor.terngrad import TernGradCompressor
        compressor = TernGradCompressor()
    elif comp == 'threshold':
        from grace_dl.tensorflow.compressor.threshold import ThresholdCompressor
        compressor = ThresholdCompressor(threshold=0.01)
    elif comp == 'topk':
        from grace_dl.tensorflow.compressor.topk import TopKCompressor
        compressor = TopKCompressor(compress_ratio=0.01)
    elif comp == 'u8bit':
        from grace_dl.tensorflow.compressor.u8bit import U8bitCompressor
        compressor = U8bitCompressor()
    else:
        raise NotImplementedError(comp)

    if mem == 'dgc':
        from grace_dl.tensorflow.memory.dgc import DgcMemory
        memory = DgcMemory(momentum=0.9, gradient_clipping=False,
                           world_size=world_size)
    elif mem == 'none':
        from grace_dl.tensorflow.memory.none import NoneMemory
        memory = NoneMemory()
    elif mem == 'powersgd':
        from grace_dl.tensorflow.memory.powersgd import PowerSGDMemory
        memory = PowerSGDMemory(
            q_memory=compressor.q_memory,
            compress_rank=1,
        )
    elif mem == 'residual':
        from grace_dl.tensorflow.memory.residual import ResidualMemory
        memory = ResidualMemory()
    elif mem == 'efsignsgd':
        from grace_dl.tensorflow.memory.efsignsgd import EFSignSGDMemory
        memory = EFSignSGDMemory(lr=0.1)
    else:
        raise NotImplementedError(mem)

    if comm == 'allreduce':
        from grace_dl.tensorflow.communicator.allreduce import Allreduce
        return Allreduce(compressor, memory, world_size)
    elif comm == 'allgather':
        from grace_dl.tensorflow.communicator.allgather import Allgather
        return Allgather(compressor, memory, world_size)
    elif comm == 'broadcast':
        from grace_dl.tensorflow.communicator.broadcast import Broadcast
        return Broadcast(compressor, memory, world_size)
    else:
        raise NotImplementedError(comm)
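# Example invocation of this variant of the factory: the same
# compressor/memory/communicator combination used elsewhere in these scripts
# (top-k sparsification with residual memory over allgather). A sketch;
# hvd.init() must have run already, since this version reads hvd.size() itself
# rather than taking 'world_size' from params.
grc = grace_from_params({'compressor': 'topk',
                         'memory': 'residual',
                         'communicator': 'allgather'})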
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.AdamOptimizer(0.001 * hvd.size())

    # GRACE: compression algorithm
    grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt, grace=grc)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
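# The `_` argument of main() matches tf.app.run's calling convention; a minimal
# entry point, with each worker process typically launched via e.g.
# `horovodrun -np 4 python tensorflow_mnist.py` (script name assumed).
if __name__ == "__main__":
    tf.app.run(main=main)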