def allgather_object(obj):
    """
    Serializes and allgathers an object from all other processes.

    The object is pickled with cloudpickle, the resulting bytes are
    allgathered as a uint8 tensor together with each rank's byte count, and
    the concatenated buffer is then split back into one payload per rank.

    Arguments:
        obj: An object capable of being serialized without losing any context.

    Returns:
        The list of objects that were allgathered across all ranks, in rank
        order.
    """
    # Horovod and its deps are optional, so do not import them at the top.
    # TensorFlow is included here as certain modules (like comet) must import before TF.
    import cloudpickle
    import tensorflow as tf
    from horovod.tensorflow import allgather, size

    def load(byte_array):
        # NOTE(review): this unpickles bytes received from the other ranks;
        # only safe when every participating process is trusted.
        buf = io.BytesIO(byte_array.tobytes())
        return cloudpickle.load(buf)

    b = io.BytesIO()
    cloudpickle.dump(obj, b)

    t = tf.convert_to_tensor(bytearray(b.getvalue()), dtype=tf.uint8)
    sz = tf.convert_to_tensor([t.shape[0]], dtype=tf.int32)

    # NOTE(review): .numpy() presumably requires eager execution - confirm.
    sizes = allgather(sz, name=type(obj).__name__ + '.sz').numpy()
    gathered = allgather(t, name=type(obj).__name__ + '.t').numpy()

    # Payload i starts at the sum of all preceding payload sizes. The offset
    # is accumulated as a Python int so it cannot wrap around like int32.
    objects = []
    start = 0
    for i in range(size()):
        end = start + int(sizes[i])
        objects.append(load(gathered[start:end]))
        start = end
    return objects
def _setup_graph(self):
    """Create the predictors, dataflows and gather ops used for evaluation.

    With ``cfg.TRAINER == 'replicated'`` a list of predictors and a matching
    list of sharded dataflows is built. Otherwise a single predictor is built,
    the dataflow is sharded by Horovod rank, and allgather ops are created for
    a uint8 result buffer and its length.
    """
    num_gpu = cfg.TRAIN.NUM_GPUS

    if cfg.TRAINER != 'replicated':
        self.predictor = self._build_coco_predictor(0)
        self.dataflow = get_eval_dataflow(
            shard=hvd.rank(), num_shards=hvd.size())

        # Strings are aggregated as raw uint8 bytes.
        result_bytes = tf.placeholder(
            tf.uint8, shape=[None], name='local_result_string')
        self.local_result_tensor = result_bytes
        self.concat_results = hvd.allgather(
            result_bytes, name='concat_results')
        # The per-rank length is gathered as well (as a 1-element tensor).
        local_size = tf.expand_dims(tf.size(result_bytes), 0)
        self.string_lens = hvd.allgather(local_size, name='concat_sizes')
        return

    # Two predictor threads per GPU give better throughput.
    predictor_count = num_gpu * 2
    self.num_predictor = predictor_count
    self.predictors = [
        self._build_coco_predictor(idx % num_gpu)
        for idx in range(predictor_count)
    ]
    self.dataflows = [
        get_eval_dataflow(shard=idx, num_shards=predictor_count)
        for idx in range(predictor_count)
    ]
def hvd_gather_parameters_gradients(list_of_weights, list_of_gradients):
    """Flatten the local weights and gradients and allgather them.

    Arguments:
        list_of_weights: Iterable of per-agent weight lists; the nesting is
            flattened by one level before use.
        list_of_gradients: Iterable of per-agent gradient lists, flattened
            the same way.

    Returns:
        A tuple ``(gathered_weights, gathered_gradients, shape_list)``: the
        two allgather results, each built from a single-row tensor holding
        all flattened values, and the list of the original weight shapes.
    """
    def _as_single_row(tensors):
        # Concatenate every tensor as a 1-D vector, then add a leading axis.
        flat = tf.concat([tf.reshape(item, [-1]) for item in tensors], axis=0)
        return tf.expand_dims(flat, axis=0)

    # assume 1 mpi process per agent
    weights = list(chain.from_iterable(list_of_weights))
    shape_list = [weight.shape for weight in weights]
    grads = list(chain.from_iterable(list_of_gradients))

    grad_row = _as_single_row(grads)
    weight_row = _as_single_row(weights)

    gathered_weights = hvd.allgather(weight_row)
    gathered_grads = hvd.allgather(grad_row)
    return gathered_weights, gathered_grads, shape_list
def send_receive(self, tensors, ctx):
    """Allgather compressed tensors, then decompress and aggregate them.

    Every tensor is flattened and allgathered. The element counts of all
    ranks are either replicated from the local counts (when
    ``self.compressor.tensors_size_are_same`` is set) or allgathered too.
    The gathered buffers are then cut into one piece per rank, decompressed
    per rank and combined with ``self.compressor.aggregate``.

    Arguments:
        tensors: Sequence of tensors produced by the compressor.
        ctx: Context object passed through to ``self.compressor.decompress``.

    Returns:
        The result of ``self.compressor.aggregate`` over the per-rank
        decompressed values.
    """
    tensors_size = []
    tensors_shape = []
    tensors_ag = []
    for tensor in tensors:
        tensors_size.append(tf.reshape(tf.size(tensor), [-1]))
        tensors_shape.append(tf.shape(tensor))
        tensors_1d = tf.reshape(tensor, [-1])
        tensors_ag.append(allgather(tensors_1d))

    local_sizes = tf.concat(tensors_size, 0)
    if self.compressor.tensors_size_are_same:
        tensors_size_ag = tf.concat([local_sizes] * self.world_size, 0)
    else:
        tensors_size_ag = allgather(local_sizes)

    num = len(tensors)
    # One running read offset per gathered tensor. This list is indexed by
    # tensor position, so it must have ``num`` entries; sizing it by
    # ``world_size`` raised IndexError whenever num > world_size.
    index = [0] * num
    decompressed_tensors = []
    for ranki in range(self.world_size):
        # Element counts that rank ``ranki`` contributed for each tensor.
        rank_sizes = tensors_size_ag[num * ranki:num * (ranki + 1)]
        ranki_tensors = []
        for i, (tensor, shape) in enumerate(zip(tensors_ag, tensors_shape)):
            a = index[i]
            b = a + rank_sizes[i]
            # NOTE(review): pieces are reshaped with the *local* shape, which
            # presumably assumes matching shapes on all ranks - confirm.
            ranki_tensors.append(tf.reshape(tensor[a:b], shape))
            index[i] = b

        ranki_decompressed = self.compressor.decompress(ranki_tensors, ctx)
        decompressed_tensors.append(ranki_decompressed)

    aggregated_tensor = self.compressor.aggregate(decompressed_tensors)
    return aggregated_tensor
def test_horovod_allgather_grad_cpu(self):
    """Check the hvd.allgather gradient when it is computed on the CPU.

    Each rank contributes a tensor filled with its rank whose first dimension
    differs per rank. The upstream gradient for rank r's slice is filled with
    r, and the local gradient is compared with a tensor of ``rank * size``.
    """
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # Gradients are not supported on integer tensors as of TensorFlow v1.9,
    # so only floating point dtypes are exercised.
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        first_dims = ([3, 2, 7, 4, 6, 8, 10] * 5)[:size]
        trailing = [17] * (dim - 1)
        local_shape = [first_dims[rank]] + trailing

        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = self.tfe.Variable(tf.ones(local_shape) * rank)
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_ys = tf.concat(
                    [tf.ones([n] + trailing) * r
                     for r, n in enumerate(first_dims)],
                    axis=0)

            with tf.device("/cpu:0"):
                grad_out = tape.gradient(gathered, tensor, grad_ys)
        else:
            tensor = tf.ones(local_shape) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            grad_ys = tf.concat(
                [tf.ones([n] + trailing) * r
                 for r, n in enumerate(first_dims)],
                axis=0)

            with tf.device("/cpu:0"):
                grad = tf.gradients(gathered, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        expected = np.ones(local_shape) * rank * size
        err = np.linalg.norm(expected - grad_out)
        message = ("gradient %s differs from expected %s, "
                   "error: %s" % (grad_out, expected, str(err)))
        self.assertLess(err, 1e-8, message)
def __init__(self, name):
    """Create the placeholders and allgather ops for this named instance.

    Arguments:
        name: Stored on the instance and appended to the
            ``horovod_python_ops/`` name scope that wraps the created ops.
    """
    self.name = name
    scope = "horovod_python_ops/" + name
    with tf.name_scope(scope):
        # Inputs: a 1-D int32 size vector and a 1-D uint8 byte buffer.
        size_input = tf.placeholder(
            name="allgather_obj_size", dtype=tf.int32, shape=[None])
        self.allgather_obj_size_inp = size_input
        bytes_input = tf.placeholder(
            name="allgather_obj", dtype=tf.uint8, shape=[None])
        self.allgather_obj_inp = bytes_input

        # Outputs: the allgathered counterparts of both inputs.
        self.allgather_obj_size_result = hvd.allgather(size_input)
        self.allgather_obj_result = hvd.allgather(bytes_input)
def all_gather(tensors, axis=0, comm_options=None):
    """Gather tensors across workers via tf.distribute or Horovod.

    When a ``tf.distribute`` strategy is active, the call is delegated to the
    current replica context's ``all_gather`` with ``axis`` and
    ``comm_options``. Otherwise Horovod is imported lazily and each tensor is
    gathered with ``hvd.allgather``; ``axis`` and ``comm_options`` are not
    used on that path.

    Returns:
        The replica context's ``all_gather`` result, or a list with one
        ``hvd.allgather`` result per input tensor.
    """
    if not tf.distribute.has_strategy():
        import horovod.tensorflow as hvd
        return [hvd.allgather(item) for item in tensors]

    context = tf.distribute.get_replica_context()
    return context.all_gather(tensors, axis=axis, options=comm_options)
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    failure_message = "hvd.allgather produces incorrect gathered tensor"
    with self.test_session() as session:
        dtypes = [
            tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
            tf.float32, tf.float64
        ]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Every rank contributes a 17^dim tensor filled with its rank.
            tensor = tf.ones([17] * dim, dtype=dtype) * rank
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            expected_shape = [17 * size] + [17] * (dim - 1)
            self.assertEqual(list(gathered_tensor.shape), expected_shape)

            for i in range(size):
                begin = [i * 17] + [0] * (dim - 1)
                extent = [17] + [-1] * (dim - 1)
                rank_tensor = tf.slice(gathered_tensor, begin, extent)
                self.assertEqual(list(rank_tensor.shape), [17] * dim)

                # tf.equal() lacks tf.uint16 support as of TensorFlow 1.2,
                # hence the cast of rank_tensor to tf.int32.
                matches = tf.reduce_all(
                    tf.equal(tf.cast(rank_tensor, tf.int32), i))
                self.assertTrue(session.run(matches), failure_message)
def test_horovod_allgather_variable_size_fused(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors with
    Tensor Fusion, even if those tensors have different sizes along the
    first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [
        tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
        tf.float16, tf.float32, tf.float64, tf.bool
    ]
    dims = [1, 2, 3]
    value_checks = []
    shape_checks = []
    for dtype, dim in itertools.product(dtypes, dims):
        # Only up to 35 per-rank sizes are defined below.
        if size > 35:
            break

        first_dims = ([17, 32, 81, 12, 15, 23, 22] * 5)[:size]
        trailing = [17] * (dim - 1)

        tensor = tf.ones([first_dims[rank]] + trailing) * rank
        if dtype == tf.bool:
            tensor = tensor % 2
        tensor = tf.cast(tensor, dtype=dtype)
        gathered = hvd.allgather(tensor)

        expected_shape = [sum(first_dims)] + trailing
        shape_checks.append(
            tf.reduce_all(tf.equal(tf.shape(gathered), expected_shape)))

        for i in range(size):
            rank_size = [first_dims[i]] + trailing
            begin = [sum(first_dims[:i])] + [0] * (dim - 1)
            rank_tensor = tf.slice(gathered, begin, rank_size)
            self.assertEqual(list(rank_tensor.shape), rank_size)

            value = i % 2 if dtype == tf.bool else i

            # tf.equal() lacks tf.uint16 support as of TensorFlow 1.2,
            # hence the cast of rank_tensor to tf.int32.
            value_checks.append(
                tf.reduce_all(
                    tf.equal(tf.cast(rank_tensor, tf.int32), value)))

    # All collected checks are evaluated together in a single call.
    shape_tests_passed, value_tests_passed = self.evaluate(
        [tf.reduce_all(shape_checks), tf.reduce_all(value_checks)])

    self.assertTrue(
        shape_tests_passed,
        "hvd.allgather produces incorrect gathered tensor")
    self.assertTrue(
        value_tests_passed,
        "hvd.allgather produces incorrect gathered tensor")
def evaluate(validation_pipeline, dlrm, timer, auc_thresholds,
             data_parallel_splitter, max_steps=None, cast_dtype=None):
    """Run the model over the validation pipeline and compute AUC and loss.

    Arguments:
        validation_pipeline: Object providing ``op()`` (whose iterator
            supplies ``get_next()``) and ``len()``.
        dlrm: The model; called as ``dlrm(inputs, sigmoid=False,
            training=False)``.
        timer: Object whose ``step_test()`` is called once per evaluated batch.
        auc_thresholds: ``num_thresholds`` for the Keras AUC metric.
        data_parallel_splitter: Callable applied to the numerical features
            when ``dlrm.data_parallel_bottom_mlp`` exists and is truthy.
        max_steps: Optional cap on the number of evaluated batches.
        cast_dtype: Optional dtype the numerical features are cast to.

    Returns:
        ``(auc, test_loss, latencies)``. ``auc`` and ``test_loss`` stay 0 on
        every rank except rank 0; ``latencies`` holds the per-batch wall time
        from before the batch is fetched until the forward pass returns.
    """
    auc, test_loss = 0, 0
    latencies, all_test_losses = [], []
    distributed = hvd.size() != 1

    pipe = iter(validation_pipeline.op())

    auc_metric = tf.keras.metrics.AUC(num_thresholds=auc_thresholds,
                                      curve='ROC',
                                      summation_method='interpolation',
                                      from_logits=True)
    bce_op = tf.keras.losses.BinaryCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE, from_logits=True)

    for eval_step in range(len(validation_pipeline)):
        begin = time.time()

        (numerical_features, categorical_features), labels = pipe.get_next()

        if hasattr(dlrm, 'data_parallel_bottom_mlp') \
                and dlrm.data_parallel_bottom_mlp:
            numerical_features = data_parallel_splitter(numerical_features)

        if cast_dtype is not None:
            numerical_features = tf.cast(numerical_features, cast_dtype)

        if max_steps is not None and eval_step >= max_steps:
            break

        inputs = _create_inputs_dict(numerical_features, categorical_features)
        y_pred = dlrm(inputs, sigmoid=False, training=False)
        end = time.time()
        latency = end - begin
        latencies.append(latency)

        if distributed:
            y_pred = hvd.allgather(y_pred)

        timer.step_test()
        if hvd.rank() == 0 and auc_metric is not None:
            update_auc_metric(auc_metric, labels, y_pred)
            test_loss = compute_bce_loss(bce_op, labels, y_pred)
            all_test_losses.append(test_loss)

    # Guard on the metric that was actually updated above. Reading
    # ``dlrm.auc_metric`` here raised AttributeError for models without that
    # attribute and could discard the results accumulated by this function.
    if hvd.rank() == 0 and auc_metric is not None:
        auc = auc_metric.result().numpy().item()
        test_loss = tf.reduce_mean(all_test_losses).numpy().item()

    auc_metric.reset_state()
    return auc, test_loss, latencies
def hvd_gather_scores(list_of_epoch_scores):
    """Flatten the given scores into one row and allgather it.

    Arguments:
        list_of_epoch_scores: Iterable of score tensors.

    Returns:
        The ``hvd.allgather`` result for the single-row tensor holding all
        flattened scores of this process.
    """
    flattened = [tf.reshape(score, [-1]) for score in list_of_epoch_scores]
    score_row = tf.expand_dims(tf.concat(flattened, axis=0), axis=0)
    return hvd.allgather(score_row)
def evaluate(validation_pipeline, dlrm, timer, auc_thresholds,
             data_parallel_splitter, max_steps=None, cast_dtype=None):
    """Run the model over the validation pipeline and compute AUC and loss.

    The model's own ``auc_metric`` and ``compute_bce_loss`` are used when
    ``dlrm.auc_metric`` is a Keras AUC instance; otherwise a fresh AUC metric
    and binary cross-entropy loss are created locally.

    Arguments:
        validation_pipeline: Iterable yielding
            ``((numerical_features, categorical_features), labels)``.
        dlrm: The model; called as ``dlrm(inputs, False)``.
        timer: Object whose ``step_test()`` is called once per evaluated batch.
        auc_thresholds: ``num_thresholds`` for a locally created AUC metric.
        data_parallel_splitter: Callable applied to the numerical features
            when ``dlrm.data_parallel_bottom_mlp`` exists and is truthy.
        max_steps: Optional cap on the number of evaluated batches.
        cast_dtype: Optional dtype the numerical features are cast to.

    Returns:
        ``(auc, test_loss, latencies)``. ``auc`` and ``test_loss`` stay 0 on
        every rank except rank 0; ``latencies`` holds the per-batch wall time
        from before the batch is fetched until the forward pass returns.
    """
    auc, test_loss = 0, 0
    latencies, all_test_losses = [], []
    distributed = hvd.size() != 1
    iterator = enumerate(validation_pipeline)

    if hasattr(dlrm, 'auc_metric') \
            and isinstance(dlrm.auc_metric, tf.keras.metrics.AUC):
        auc_metric = dlrm.auc_metric
        bce_op = dlrm.compute_bce_loss
    else:
        auc_metric = tf.keras.metrics.AUC(num_thresholds=auc_thresholds,
                                          curve='ROC',
                                          summation_method='interpolation',
                                          from_logits=True)
        bce_op = tf.keras.losses.BinaryCrossentropy(
            reduction=tf.keras.losses.Reduction.NONE, from_logits=True)

    while True:
        # The clock starts before the fetch so latency includes data loading.
        begin = time.time()

        try:
            eval_step, ((numerical_features, categorical_features), labels) = \
                next(iterator)
        except StopIteration:
            break

        if hasattr(dlrm, 'data_parallel_bottom_mlp') \
                and dlrm.data_parallel_bottom_mlp:
            numerical_features = data_parallel_splitter(numerical_features)

        if cast_dtype is not None:
            numerical_features = tf.cast(numerical_features, cast_dtype)

        if max_steps is not None and eval_step >= max_steps:
            break

        inputs = _create_inputs_dict(numerical_features, categorical_features)
        y_pred = dlrm(inputs, False)
        end = time.time()
        latency = end - begin
        latencies.append(latency)

        if distributed:
            y_pred = hvd.allgather(y_pred)

        timer.step_test()
        if hvd.rank() == 0 and auc_metric is not None:
            auc_metric.update_state(labels, y_pred)
            test_loss = bce_op(labels, y_pred)
            all_test_losses.append(test_loss)

    # Guard on the metric that was actually updated above. The fallback
    # branch exists for models without ``auc_metric``, so reading
    # ``dlrm.auc_metric`` here raised AttributeError in exactly that case.
    if hvd.rank() == 0 and auc_metric is not None:
        auc = auc_metric.result().numpy().item()
        test_loss = tf.reduce_mean(all_test_losses).numpy().item()

    auc_metric.reset_state()
    return auc, test_loss, latencies
def evaluate(validation_pipeline, dlrm, timer, auc_thresholds,
             data_parallel_splitter, max_steps=None):
    """Evaluate ``dlrm`` on the validation pipeline.

    An AUC metric is only created when ``auc_thresholds`` is given. The
    model is called with ``sigmoid=True``, predictions are allgathered when
    more than one Horovod process is running, and rank 0 accumulates the AUC
    metric and the binary cross-entropy losses.

    Returns:
        ``(auc, test_loss, latencies)``; ``auc`` and ``test_loss`` remain 0
        unless this is rank 0 and an AUC metric was created.
    """
    auc_metric = None
    if auc_thresholds is not None:
        auc_metric = tf.keras.metrics.AUC(num_thresholds=auc_thresholds,
                                          curve='ROC',
                                          summation_method='interpolation',
                                          name='my_auc')

    bce_op = tf.keras.losses.BinaryCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE, from_logits=False)

    auc = 0
    test_loss = 0
    latencies = []
    all_test_losses = []
    distributed = hvd.size() != 1

    batches = enumerate(validation_pipeline)
    while True:
        # The clock starts before the fetch so latency includes data loading.
        started = time.time()

        fetched = next(batches, None)
        if fetched is None:
            break
        eval_step, ((numerical_features, categorical_features), labels) = \
            fetched

        if dlrm.data_parallel_bottom_mlp:
            numerical_features = data_parallel_splitter(numerical_features)

        if max_steps is not None and eval_step >= max_steps:
            break

        y_pred = dlrm((numerical_features, categorical_features),
                      sigmoid=True)
        latencies.append(time.time() - started)

        if distributed:
            y_pred = hvd.allgather(y_pred)

        timer.step_test()
        if hvd.rank() != 0 or auc_metric is None:
            continue

        auc_metric.update_state(labels, y_pred)
        test_loss = bce_op(labels, y_pred)
        all_test_losses.append(test_loss)

    if hvd.rank() == 0 and auc_metric is not None:
        auc = auc_metric.result().numpy().item()
        test_loss = tf.reduce_mean(all_test_losses).numpy().item()
    return auc, test_loss, latencies
def allgather(value, name=None):
    """
    Perform an allgather on a tensor-compatible value.

    The values are concatenated along the first dimension, so on every
    process the input must have the same rank and shape except for the first
    dimension, which may differ.

    Arguments:
        value: A tensor-compatible value to gather.
        name: Optional name prefix for the constants created by this operation.

    Returns:
        The result of running the allgather op in the Keras backend session.
    """
    constant = tf.constant(value, name=name)
    gather_op = hvd.allgather(constant)
    session = K.get_session()
    return session.run(gather_op)
def test_horovod_allgather_fused(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors
    with Tensor Fusion."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [
        tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
        tf.float16, tf.float32, tf.float64, tf.bool
    ]
    dims = [1, 2, 3]
    value_checks = []
    shape_checks = []
    for dtype, dim in itertools.product(dtypes, dims):
        # Every rank contributes a 17^dim tensor filled with its rank
        # (taken modulo 2 for booleans).
        tensor = tf.ones([17] * dim) * rank
        if dtype == tf.bool:
            tensor = tensor % 2
        tensor = tf.cast(tensor, dtype=dtype)
        gathered = hvd.allgather(tensor)

        expected_shape = [17 * size] + [17] * (dim - 1)
        shape_checks.append(
            tf.reduce_all(tf.equal(tf.shape(gathered), expected_shape)))

        for i in range(size):
            begin = [i * 17] + [0] * (dim - 1)
            extent = [17] + [-1] * (dim - 1)
            rank_tensor = tf.slice(gathered, begin, extent)

            value = i % 2 if dtype == tf.bool else i

            # tf.equal() lacks tf.uint16 support as of TensorFlow 1.2,
            # hence the cast of rank_tensor to tf.int32.
            value_checks.append(
                tf.reduce_all(
                    tf.equal(tf.cast(rank_tensor, tf.int32), value)))

    # All collected checks are evaluated together in a single call.
    shape_tests_passed, value_tests_passed = self.evaluate(
        [tf.reduce_all(shape_checks), tf.reduce_all(value_checks)])

    self.assertTrue(
        shape_tests_passed,
        "hvd.allgather produces incorrect gathered tensor")
    self.assertTrue(
        value_tests_passed,
        "hvd.allgather produces incorrect gathered tensor")
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # With a single worker there is nothing to mismatch against.
    if size == 1:
        self.skipTest("Only one worker available")

    # The second dimension depends on the rank, so it differs per worker.
    mismatched_shape = [17, 10 * (rank + 1), 17]
    tensor = tf.ones(mismatched_shape, dtype=tf.float32) * rank
    with self.assertRaises(tf.errors.FailedPreconditionError):
        self.evaluate(hvd.allgather(tensor))
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # With a single worker there is nothing to mismatch against.
    if size == 1:
        self.skipTest("Only one worker available")

    # Even ranks send int32 and odd ranks send float32.
    if rank % 2 == 0:
        dtype = tf.int32
    else:
        dtype = tf.float32
    tensor = tf.ones([17, 17, 17], dtype=dtype) * rank
    with self.assertRaises(tf.errors.FailedPreconditionError):
        self.evaluate(hvd.allgather(tensor))
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    failure_message = "hvd.allgather produces incorrect gathered tensor"
    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Only up to 35 per-rank sizes are defined below.
            if size > 35:
                break

            first_dims = ([17, 32, 81, 12, 15, 23, 22] * 5)[:size]
            trailing = [17] * (dim - 1)

            tensor = tf.ones([first_dims[rank]] + trailing) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            expected_size = sum(first_dims)
            self.assertEqual(list(gathered_tensor.shape),
                             [expected_size] + trailing)

            for i in range(size):
                rank_size = [first_dims[i]] + trailing
                begin = [sum(first_dims[:i])] + [0] * (dim - 1)
                rank_tensor = tf.slice(gathered, begin, rank_size)
                self.assertEqual(list(rank_tensor.shape), rank_size)

                value = i % 2 if dtype == tf.bool else i
                # tf.equal() lacks tf.uint16 support as of TensorFlow 1.2,
                # hence the cast of rank_tensor to tf.int32.
                matches = tf.reduce_all(
                    tf.equal(tf.cast(rank_tensor, tf.int32), value))
                self.assertTrue(session.run(matches), failure_message)
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker. Report it as
    # skipped rather than returning, which would be counted as a pass.
    if size == 1:
        self.skipTest("Only one worker available")

    with self.test_session() as session:
        # The second dimension depends on the rank, so it differs per worker.
        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(tensor))
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker. Report it as
    # skipped rather than returning, which would be counted as a pass.
    if size == 1:
        self.skipTest("Only one worker available")

    with self.test_session() as session:
        tensor_size = [17] * 3
        # Even ranks send int32 and odd ranks send float32.
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(tensor))
def test_horovod_allgather():
    """Test that the allgather gathers one float per rank in rank order.

    Each process contributes a single-element float32 tensor holding its
    rank. The session is pinned to the GPU given by ``hvd.local_rank()``.
    The gathered result is printed once and checked against
    ``[0.0, 1.0, ..., size - 1]``.

    Raises:
        AssertionError: If the gathered values are not the ranks in order.
    """
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=config) as session:
        tensor = tf.ones([1]) * rank
        tensor = tf.cast(tensor, dtype=tf.float32)
        gathered = hvd.allgather(tensor)
        gathered_tensor = session.run(gathered)

    # Print once; the previous ``while True`` loop never returned, so the
    # test could not finish on any rank.
    print('gathered_tensor = ', gathered_tensor)

    expected = [float(r) for r in range(size)]
    assert list(gathered_tensor) == expected, \
        "hvd.allgather produces incorrect gathered tensor"
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    failure_message = "hvd.allgather produces incorrect gathered tensor"
    with self.test_session() as session:
        dtypes = [
            tf.uint8, tf.int8, tf.uint16, tf.int16, tf.int32, tf.int64,
            tf.float32, tf.float64
        ]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Only up to 35 per-rank sizes are defined below.
            if size > 35:
                break

            first_dims = ([17, 32, 81, 12, 15, 23, 22] * 5)[:size]
            trailing = [17] * (dim - 1)

            tensor = tf.ones([first_dims[rank]] + trailing,
                             dtype=dtype) * rank
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            expected_size = sum(first_dims)
            self.assertEqual(list(gathered_tensor.shape),
                             [expected_size] + trailing)

            for i in range(size):
                rank_size = [first_dims[i]] + trailing
                begin = [sum(first_dims[:i])] + [0] * (dim - 1)
                rank_tensor = tf.slice(gathered, begin, rank_size)
                self.assertEqual(list(rank_tensor.shape), rank_size)

                # tf.equal() lacks tf.uint16 support as of TensorFlow 1.2,
                # hence the cast of rank_tensor to tf.int32.
                matches = tf.reduce_all(
                    tf.equal(tf.cast(rank_tensor, tf.int32), i))
                self.assertTrue(session.run(matches), failure_message)
def evaluate_wilcoxon(model, dataset):
    """Compute a rank-sum based AUC over predictions from all workers.

    The model is run on every ``(samples, labels)`` batch of ``dataset``,
    predictions and labels are concatenated column-wise, and the rows of all
    processes are combined with ``hvd.allgather``. Labels are then ordered by
    predicted value and the AUC is derived from the sum of the 1-based ranks
    of the labels.

    Returns:
        The AUC as returned by ``.numpy()`` on the resulting tensor.
    """
    @tf.function
    def _step(samples, labels):
        probs = model(samples, training=False)
        return tf.concat([probs, labels], axis=1)

    per_batch = [_step(samples, labels) for samples, labels in dataset]
    local_rows = tf.concat(per_batch, axis=0)
    all_rows = hvd.allgather(local_rows, name='wilcoxon_AUC')

    # Column 0 holds the predictions and column 1 the labels.
    order = tf.argsort(all_rows[:, 0])
    sorted_label = tf.gather(all_rows[:, 1], order)
    total = sorted_label.shape[0]
    rank = tf.cast(tf.range(1, total + 1), tf.float32)
    num_true = tf.reduce_sum(sorted_label)
    num_false = total - num_true

    rank_sum = tf.reduce_sum(rank * sorted_label)
    correction = num_true * (num_true + 1) / 2
    auc = (rank_sum - correction) / (num_true * num_false)
    return auc.numpy()
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient.

    Each rank contributes a tensor filled with its rank whose first dimension
    differs per rank. The upstream gradient for rank r's slice is filled with
    r, and the local gradient is compared with a tensor of ``rank * size``.
    """
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    with self.test_session(config=self.config) as session:
        # Gradients are not supported on integer tensors as of
        # TensorFlow v1.9, so only floating point dtypes are exercised.
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            first_dims = ([3, 2, 7, 4, 6, 8, 10] * 5)[:size]
            trailing = [17] * (dim - 1)
            local_shape = [first_dims[rank]] + trailing

            tensor = tf.ones(local_shape) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            grad_ys = tf.concat(
                [tf.ones([n] + trailing) * r
                 for r, n in enumerate(first_dims)],
                axis=0)

            grad = tf.gradients(gathered, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            expected = np.ones(local_shape) * rank * size
            err = np.linalg.norm(expected - grad_out)
            message = ("gradient %s differs from expected %s, "
                       "error: %s" % (grad_out, expected, str(err)))
            self.assertLess(err, 1e-8, message)
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    failure_message = "hvd.allgather produces incorrect gathered tensor"
    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Every rank contributes a 17^dim tensor filled with its rank
            # (taken modulo 2 for booleans).
            tensor = tf.ones([17] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            gathered_tensor = session.run(gathered)
            expected_shape = [17 * size] + [17] * (dim - 1)
            self.assertEqual(list(gathered_tensor.shape), expected_shape)

            for i in range(size):
                begin = [i * 17] + [0] * (dim - 1)
                extent = [17] + [-1] * (dim - 1)
                rank_tensor = tf.slice(gathered_tensor, begin, extent)
                self.assertEqual(list(rank_tensor.shape), [17] * dim)

                value = i % 2 if dtype == tf.bool else i
                # tf.equal() lacks tf.uint16 support as of TensorFlow 1.2,
                # hence the cast of rank_tensor to tf.int32.
                matches = tf.reduce_all(
                    tf.equal(tf.cast(rank_tensor, tf.int32), value))
                self.assertTrue(session.run(matches), failure_message)
f"Creating graph for KNN of {num_train_images} training images ...") local_train_files = [(idx, fname, label) for idx, (fname, label) in enumerate(all_train_files) if idx % hvd.size() == hvd.rank()] image_input = tf.placeholder(tf.uint8, [None, 224, 224, 3], "image") idx_input = tf.placeholder(tf.int64, [None], "image_idx") feat_buffer = tf.get_variable("feature_buffer", shape=[num_train_images, 128], trainable=False) net = ResNetModel(num_output=(2048, 128) if args.v2 else (128, )) with TowerContext("", is_training=False): feat = net.forward(image_input) feat = tf.math.l2_normalize(feat, axis=1) # Nx128 all_feat = hvd.allgather(feat) # GN x 128 all_idx_input = hvd.allgather(idx_input) # GN update_buffer = tf.scatter_update(feat_buffer, all_idx_input, all_feat) dist = tf.matmul(feat, tf.transpose(feat_buffer)) # N x #DS _, topk_indices = tf.math.top_k(dist, k=args.top_k) # Nxtopk train_ds = build_dataflow(local_train_files) config = get_default_sess_config() config.gpu_options.visible_device_list = str(hvd.local_rank()) def evaluate(checkpoint_file): result_file = get_checkpoint_path( checkpoint_file) + f".knn{args.top_k}.txt" if os.path.isfile(result_file):
def test_horovod_allgather_grad_gpu(self):
    """Check the hvd.allgather gradient when it is computed on the GPU.

    Skipped when no CUDA GPU is available or when the
    ``HOROVOD_MIXED_INSTALL`` environment variable is set. Each rank
    contributes a tensor filled with its rank whose first dimension differs
    per rank; the local gradient is compared with a tensor of
    ``rank * size``.
    """
    # The test is only meaningful with a GPU present.
    if not tf.test.is_gpu_available(cuda_only=True):
        self.skipTest(("No GPUs available"))

    if os.environ.get('HOROVOD_MIXED_INSTALL'):
        # Built with CUDA but without HOROVOD_GPU_ALLREDUCE.
        self.skipTest("Not compiled with HOROVOD_GPU_ALLREDUCE")

    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # Gradients are not supported on integer tensors as of TensorFlow v1.9,
    # so only floating point dtypes are exercised.
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        first_dims = ([3, 2, 7, 4, 6, 8, 10] * 5)[:size]
        trailing = [17] * (dim - 1)
        local_shape = [first_dims[rank]] + trailing

        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = self.tfe.Variable(tf.ones(local_shape) * rank)
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                grad_ys = tf.concat(
                    [tf.ones([n] + trailing) * r
                     for r, n in enumerate(first_dims)],
                    axis=0)

            with tf.device("/gpu:%d" % local_rank):
                grad_out = tape.gradient(gathered, tensor, grad_ys)
        else:
            tensor = tf.ones(local_shape) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            gathered = hvd.allgather(tensor)

            grad_ys = tf.concat(
                [tf.ones([n] + trailing) * r
                 for r, n in enumerate(first_dims)],
                axis=0)

            with tf.device("/gpu:%d" % local_rank):
                grad = tf.gradients(gathered, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        expected = np.ones(local_shape) * rank * size
        err = np.linalg.norm(expected - grad_out)
        message = ("gradient %s differs from expected %s, "
                   "error: %s" % (grad_out, expected, str(err)))
        self.assertLess(err, 1e-8, message)
xTrainTensor = tf.placeholder(tf.float32) yTrainTensor = tf.placeholder(tf.float32) xTestTensor = tf.placeholder(tf.float32) indexTensor = tf.placeholder(tf.int32) inputTensor = [xTrainTensor, yTrainTensor, xTestTensor, indexTensor] tree_fit_predict_tensor = tf.py_func(tree_fit_predict, inputTensor, tf.float32) currSess.run(tf.global_variables_initializer()) indices = indicesBroadCast.eval() result = tree_fit_predict(xTrain, yTrain, xTest, indices[rank]) allgatherOp = hvd.allgather(result) #print("rank is : ",rank, "result " , result.eval()) print("rank is : ", rank, "indices ", indices[0][0]) #print(allgatherOp.eval()) AllPreds = allgatherOp.eval() #print("juggled up : ", np.sum(AllPreds[rank*len(y_test):rank*len(y_test)+len(y_test)] != result.eval() ),"rank: ",rank) #print("rank : ",rank,"allpreds :" ,AllPreds) if rank == 0: #print("yTest is ",yTest) AllPreds = AllPreds.reshape(-1, len(y_test)).T print("Shape is : ", AllPreds.shape) MajorityPreds = stats.mode(AllPreds, axis=1)[0]
def get_image_labels(self):
    """Build the input pipeline for the active mode and return one example.

    Exactly one of three branches is taken, selected by
    ``self.is_all_shared``, ``self.is_single_bcast`` or
    ``self.is_multi_bcast``:

    * all shared: read, preprocess, ``hvd.allgather`` the dequeued batches
      and feed the gathered data into a "to-compute" queue.
    * single broadcast: members of group 0 read/preprocess/allgather,
      members of group 1 receive the data through ``hvd.broadcast`` with
      root 0.
    * multiple broadcast: ranks below ``self.FLAGS.num_pre`` read and
      preprocess, every other rank supplies zero tensors, and one
      broadcast per preprocessing rank feeds the "to-compute" queue.

    Returns:
        The ``(image, label)`` tensor pair produced by the last stage of
        the selected branch.

    NOTE(review): if none of the three mode flags is set, ``image`` and
    ``label`` are never assigned and the final ``return`` raises.
    NOTE(review): ``post_pre_image_dtype`` / ``post_pre_label_dtype`` and
    ``create_qr`` are not defined in this method; presumably they are
    module-level names -- verify.
    NOTE(review): the nesting of the ``with`` scopes below was
    reconstructed; confirm which statements belong to each
    ``tf.device`` / ``tf.name_scope`` block.
    """
    if self.is_all_shared:
        ### ALL SHARED ###
        img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True)
        with tf.device("/cpu:0"):
            with tf.name_scope("reading"):
                data_provider = slim.dataset_data_provider.DatasetDataProvider(
                    self.dataset,
                    num_readers=self.FLAGS.num_data_readers,
                    common_queue_capacity=20*self.FLAGS.batch_size,
                    common_queue_min=10*self.FLAGS.batch_size,
                    seed=self.rank)
                [image, label] = data_provider.get(['image', 'label'])

            # Stage 1: raw examples -> queue in front of preprocessing.
            with tf.name_scope("to-preprocessing"):
                capacity = 20 * self.FLAGS.batch_size
                to_pre_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                       dtypes=[image.dtype, label.dtype],
                                                       shapes=None,
                                                       name="to_pre_queue")
                to_pre_op = to_pre_queue.enqueue([image, label])
                # The same enqueue op is listed QR_THREADS times in the runner.
                queue_runner.add_queue_runner(queue_runner.QueueRunner(to_pre_queue, [to_pre_op] * Pipeline.QR_THREADS))
                # Summary value is queue size divided by capacity.
                tf.summary.scalar("to_pre_fraction_of_%d_full" % capacity,
                                  math_ops.to_float(to_pre_queue.size()) * (1. / capacity))
                image, label = to_pre_queue.dequeue()

            with tf.name_scope("preprocessing"):#TODO
                image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)

            # Stage 2: preprocessed examples -> queue in front of the allgather.
            with tf.name_scope("to-allgather"):
                capacity = 20 * self.FLAGS.batch_size
                to_allg_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                        dtypes=[image.dtype, label.dtype],
                                                        shapes=[[self.train_image_size, self.train_image_size, 3], []],
                                                        name="to_allgather_queue")#[image.get_shape(), label.get_shape()])
                queue_runner.add_queue_runner(queue_runner.QueueRunner(to_allg_queue, [to_allg_queue.enqueue([image, label])] * Pipeline.QR_THREADS))
                tf.summary.scalar("to_allgather_fraction_of_%d_full" % capacity,
                                  math_ops.to_float(to_allg_queue.size()) * (1. / capacity))

                # num_preprocessors = tf.placeholder(tf.int32, shape=[], name="num_preprocessors)
                # self.num_hvd_send_tensor =
                # Dequeue self.num_hvd_send examples at once for the allgather.
                send_images, send_labels = to_allg_queue.dequeue_many(self.num_hvd_send)
                # if rank == #TODO

            # NOTE(review): both allgather calls are given the same name
            # "hvd_allgather" -- confirm this is intended.
            all_images = hvd.allgather(send_images, name="hvd_allgather")
            all_labels = hvd.allgather(send_labels, name="hvd_allgather")
            #TODO: Remove extra queues

            # Stage 3: gathered batches -> queue that serves single examples.
            with tf.name_scope("to-compute"):
                capacity = 30 * self.FLAGS.batch_size
                to_compute_queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                                           dtypes=[image.dtype, label.dtype],
                                                           shapes=[[self.train_image_size, self.train_image_size, 3], []],#TODO
                                                           name="to_compute_queue")#[image.get_shape(), label.get_shape()])
                queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_queue, [to_compute_queue.enqueue_many([all_images, all_labels])]))#1 thread!
                tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity,
                                  math_ops.to_float(to_compute_queue.size()) * (1. / capacity))
                image, label = to_compute_queue.dequeue()

    elif self.is_single_bcast:
        ### SINGLE BROADCAST ###
        img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True)
        allg_images_name = "allgather-images-op"
        allg_labels_name = "allgather-labels-op"
        bcast_images_name = "bcast-images-op"
        bcast_labels_name = "bcast-labels-op"
        if 0 in self.member_of_group:
            #If we belong to group 0, initialize the reading and preprocessing pipeline
            with tf.device("/cpu:0"):
                with tf.name_scope("reading"):
                    data_provider = slim.dataset_data_provider.DatasetDataProvider(
                        self.dataset,
                        num_readers=self.FLAGS.num_data_readers,
                        common_queue_capacity=20*self.FLAGS.batch_size,
                        common_queue_min=10*self.FLAGS.batch_size,
                        seed=self.rank)
                    [image, label] = data_provider.get(['image', 'label'])
                image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None,
                                         [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False)
                with tf.name_scope("preprocessing"):
                    image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)
                send_images, send_labels = create_qr("to-allg", 10 * self.FLAGS.batch_size, [image, label],
                                                     [[self.train_image_size, self.train_image_size, 3], []],
                                                     [image.dtype, label.dtype], Pipeline.QR_THREADS, False, True, self.num_hvd_send)
                # Allgather restricted to group 0.
                all_images = hvd.allgather(send_images, group=0, name=allg_images_name)
                all_labels = hvd.allgather(send_labels, group=0, name=allg_labels_name)
                all_images, all_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [all_images, all_labels],
                                                   [[self.train_image_size, self.train_image_size, 3], []],
                                                   [post_pre_image_dtype, post_pre_label_dtype], 1, True, True, self.images_per_bcast)
        if 1 in self.member_of_group:
            # For the middle man rank, reset all_images and all_labels
            # names to their broadcasted tensors so that the bcast is
            # performed. Note that the bcast root is rank 0 since the
            # group1 sent to init had this rank listed first, meaning that
            # the resulting mpi group comm has this rank has rank 0
            if len(self.member_of_group) == 1:
                # Then not middle man, so construct holder variable WITH CORRECT NAME!
                # tf.Variable(self.num_hvd_send?
                all_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype)
                all_labels = tf.zeros([self.images_per_bcast] , dtype=post_pre_label_dtype) #shape of [] turns into 1D instead of 0D
            all_images = hvd.broadcast(all_images, 0, group=1, name=bcast_images_name)
            all_labels = hvd.broadcast(all_labels, 0, group=1, name=bcast_labels_name)
            # NOTE(review): placement of this statement inside the
            # `if 1 in self.member_of_group` block was reconstructed -- confirm.
            image, label = create_qr("to-compute", 20 * self.FLAGS.batch_size, [all_images, all_labels],
                                     [[self.train_image_size, self.train_image_size, 3], []],
                                     [post_pre_image_dtype, post_pre_label_dtype], 1, True, False)

    elif self.is_multi_bcast:
        ### MULTIPLE BROADCAST
        # print("Rank:", rank, member_of_group, group_rank_list)
        img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True)
        # allg_image_name = "allgathered-image" # need some naming commonalities
        # allg_label_name = "allgathered-label"
        # NOTE(review): the four *_name variables below are assigned but not
        # used anywhere in this branch.
        allg_images_name = "allgather-images-op"
        allg_labels_name = "allgather-labels-op"
        bcast_images_name = "bcast-images-op"
        bcast_labels_name = "bcast-labels-op"
        # if 0 in member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline
        if self.rank < self.FLAGS.num_pre:
            with tf.device("/cpu:0"):
                with tf.name_scope("reading"):
                    data_provider = slim.dataset_data_provider.DatasetDataProvider(
                        self.dataset,
                        num_readers=self.FLAGS.num_data_readers,
                        common_queue_capacity=20*self.FLAGS.batch_size,
                        common_queue_min=10*self.FLAGS.batch_size,
                        seed=self.rank)
                    [image, label] = data_provider.get(['image', 'label'])
                image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None,
                                         [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False)
                with tf.name_scope("preprocessing"):
                    image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode)
                    # image = tf.Print(image, ["using preprocessed image"])
                send_images, send_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [image, label],
                                                     [[self.train_image_size, self.train_image_size, 3], []],
                                                     [image.dtype, label.dtype], 2 * Pipeline.QR_THREADS, False, True, self.images_per_bcast)
        else:
            # Ranks that do not preprocess supply zero-filled tensors of the
            # broadcast shape as their broadcast input.
            send_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype)
            send_labels = tf.zeros([self.images_per_bcast] , dtype=post_pre_label_dtype)

        with tf.device("/cpu:0"):
            bcast_images_root = "broadcast-images-"
            bcast_labels_root = "broadcast-labels-"
            # One broadcast per preprocessing rank i, with root i and group i.
            bcast_images_per_group = [hvd.broadcast(send_images, i, group=i, name=bcast_images_root + str(i)) for i in range(self.FLAGS.num_pre)]
            bcast_labels_per_group = [hvd.broadcast(send_labels, i, group=i, name=bcast_labels_root + str(i)) for i in range(self.FLAGS.num_pre)]
            with tf.name_scope("to-compute"):
                capacity = 30 * self.FLAGS.batch_size
                to_compute_q = data_flow_ops.FIFOQueue(capacity=capacity,
                                                       dtypes=[post_pre_image_dtype, post_pre_label_dtype],
                                                       shapes=[[self.train_image_size, self.train_image_size, 3], []],
                                                       name="to-compute-queue")
                # One enqueue_many op per broadcast; all are handed to a
                # single QueueRunner.
                to_comp_ops = [to_compute_q.enqueue_many([bcast_images_per_group[i], bcast_labels_per_group[i]]) for i in range(self.FLAGS.num_pre)]
                queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_q, to_comp_ops))
                tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity,
                                  math_ops.to_float(to_compute_q.size()) * (1. / capacity))
                image, label = to_compute_q.dequeue()
    return image, label
def gather(self, tensor, axis):
    """Return the Horovod allgather of ``tensor``.

    ``axis`` is part of the signature but is not used by this
    implementation; the tensor is passed to ``allgather`` unchanged.
    """
    # Function-scope import: Horovod is only imported when this method runs.
    from horovod.tensorflow import allgather as hvd_allgather

    gathered = hvd_allgather(tensor)
    return gathered
def train(*,
          flow_constructor,
          logdir,
          lr_schedule,
          dropout_p,
          seed,
          init_bs,
          total_bs,
          ema_decay,
          steps_per_log,
          epochs_per_val,
          max_grad_norm,
          dtype=tf.float32,
          scale_loss=None,
          restore_checkpoint=None,
          scale_grad=None,
          dataset='cifar10',
          steps_per_extra_samples=None):
    """Build the training graph and run the Horovod training loop.

    All arguments are keyword-only.

    Args:
        flow_constructor: Callable returning ``(dequant_flow, flow)``.
        logdir: Base log directory. The directory actually used is
            ``'{logdir}_mpi{hvd.size()}_{time.time()}'`` (after
            ``os.path.expanduser``); checkpoints go in its ``checkpoints``
            subdirectory.
        lr_schedule: Callable mapping the global step to a learning rate.
        dropout_p: Dropout value passed to the init and training graphs
            (the validation/sampling graph uses 0).
        seed: Base seed; each rank seeds with
            ``hvd.rank() + hvd.size() * seed``.
        init_bs: Batch size of the data-dependent initialization pass.
        total_bs: Batch size summed over all ranks; must be divisible by
            ``hvd.size()``.
        ema_decay: Decay of the exponential moving average of the
            trainable variables.
        steps_per_log: Logging period in steps; also the window length of
            the loss / gradient-norm histories that are averaged.
        epochs_per_val: Validation (and checkpoint) period in epochs.
        max_grad_norm: Global-norm clipping threshold, or ``None`` to
            disable clipping (the logged gradient norm is then 0).
        dtype: TensorFlow dtype of the placeholders and the loaded data.
        scale_loss: If not ``None``, the loss is multiplied by this value
            before differentiation and the gradients divided by it after.
        restore_checkpoint: Checkpoint path to resume from, or ``None`` to
            run data-dependent initialization. The text after the last
            ``'-'`` in the path is parsed as the step to resume at.
        scale_grad: If not ``None``, every gradient is divided by it.
        dataset: Dataset name forwarded to ``load_data``.
        steps_per_extra_samples: If not ``None``, samples are drawn and
            logged every this many steps in addition to validation time.

    The epoch loop is effectively unbounded (``range(99999999999)``).
    """
    hvd, MPI, is_root, mpi_average = setup_horovod()

    # Seeding and logging setup
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()

    logger = None
    logdir = '{}_mpi{}_{}'.format(os.path.expanduser(logdir), hvd.size(),
                                  time.time())
    checkpointdir = os.path.join(logdir, 'checkpoints')
    if is_root:
        print('Floating point format:', dtype)
        pprint(locals())
        os.makedirs(logdir)
        os.makedirs(checkpointdir)
        logger = TensorBoardOutput(logdir)

    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)
    # Multiplier applied to losses before they are reported as "bpd":
    # 1 / (ln 2 * number of elements per image).
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))

    # Build graph
    if is_root:
        print('Building graph')
    dequant_flow, flow = flow_constructor()

    # Data-dependent init
    if is_root:
        print('===== Init graph =====')
    x_init_sym = tf.placeholder(dtype, [init_bs] + img_shp)
    _, _, init_loss_sym, _ = build_forward(x=x_init_sym,
                                           dequant_flow=dequant_flow,
                                           flow=flow,
                                           flow_kwargs=dict(
                                               vcfg=VarConfig(init=True,
                                                              ema=None,
                                                              dtype=dtype),
                                               dropout_p=dropout_p,
                                               verbose=is_root))

    # Training
    if is_root:
        print('===== Training graph =====')
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    _, y_sym, loss_sym, _ = build_forward(x=x_sym,
                                          dequant_flow=dequant_flow,
                                          flow=flow,
                                          flow_kwargs=dict(vcfg=VarConfig(
                                              init=False,
                                              ema=None,
                                              dtype=dtype),
                                                           dropout_p=dropout_p,
                                                           verbose=is_root))

    # EMA
    params = tf.trainable_variables()
    if is_root:
        # for p in params:
        #     print(p.name, p.shape)
        print('Parameters',
              sum(np.prod(p.get_shape().as_list()) for p in params))
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    maintain_averages_op = tf.group(ema.apply(params))
    # Op for setting the ema params to the current non-ema params (for use after data-dependent init)
    name2var = {v.name: v for v in tf.global_variables()}
    copy_params_to_ema = tf.group([
        name2var[p.name.replace(':0', '') +
                 '/ExponentialMovingAverage:0'].assign(p) for p in params
    ])

    # Validation and sampling (with EMA)
    if is_root:
        print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0,
                           verbose=is_root)
    val_dequant_x_sym, val_y_sym, val_loss_sym, _ = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=val_flow_kwargs)
    # for debugging invertibility
    val_inverr_sym = tf.reduce_max(
        tf.abs(val_dequant_x_sym -
               flow.inverse(val_y_sym, **val_flow_kwargs)[0]))

    if is_root:
        print('===== Sampling graph =====')
    samples_sym, _ = flow.inverse(
        tf.random_normal(y_sym.shape.as_list(), dtype=dtype),
        **val_flow_kwargs)
    allgathered_samples_sym = hvd.allgather(tf.to_float(samples_sym))

    # The validation/sampling graphs must not have created new trainables.
    assert len(tf.trainable_variables()) == len(params)

    def run_validation(sess, i_step):
        """Evaluate this rank's validation shard, average over ranks, log on root."""
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses, shard_inverrs = zip(*[
            sess.run([val_loss_sym, val_inverr_sym], {x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        inv_err, _ = mpi_average(shard_inverrs)
        # Every rank runs the allgather; only root logs the result.
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('val_bpd', bpd_scale_factor * val_loss),
                 ('val_inverr', inv_err),
                 ('num_val_examples', total_count * local_bs),
                 ('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    def run_sampling_only(sess, i_step):
        """Draw samples on every rank and log the gathered images on root."""
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    # Optimization
    lr_sym = tf.placeholder(dtype, [], 'lr')
    optimizer = hvd.DistributedOptimizer(tf.train.AdamOptimizer(lr_sym))

    if scale_loss is None:
        grads_and_vars = optimizer.compute_gradients(loss_sym,
                                                     var_list=params)
    else:
        # Differentiate the scaled loss, then undo the scaling on the grads.
        grads_and_vars = [(g / scale_loss, v)
                          for (g, v) in optimizer.compute_gradients(
                              loss_sym * scale_loss, var_list=params)]

    if scale_grad is not None:
        grads_and_vars = [(g / scale_grad, v) for (g, v) in grads_and_vars]
    if max_grad_norm is not None:
        clipped_grads, grad_norm_sym = tf.clip_by_global_norm(
            [g for (g, _) in grads_and_vars], max_grad_norm)
        grads_and_vars = [
            (cg, v) for (cg, (_, v)) in zip(clipped_grads, grads_and_vars)
        ]
    else:
        grad_norm_sym = tf.constant(0.)
    opt_sym = tf.group(optimizer.apply_gradients(grads_and_vars),
                       maintain_averages_op)

    def loop(sess: tf.Session):
        """Initialize or restore, broadcast variables from rank 0, then train."""
        i_step = 0

        if is_root:
            print('Initializing')
        sess.run(tf.global_variables_initializer())
        if restore_checkpoint is not None:
            # Restore from checkpoint.
            # The step counter is derived from the checkpoint path on EVERY
            # rank: it drives lr_schedule and decides when the collective
            # calls below (MPI gather for logging, allgather for extra
            # samples) are issued, so all ranks have to agree on it.
            restore_step = int(restore_checkpoint.split('-')[-1])
            i_step = restore_step
            if is_root:
                saver = tf.train.Saver()
                print('Restoring checkpoint:', restore_checkpoint)
                print('Restoring from step:', restore_step)
                saver.restore(sess, restore_checkpoint)
            else:
                saver = None
        else:
            # No checkpoint: perform data dependent initialization
            if is_root:
                print('Data dependent init')
            init_loss = sess.run(
                init_loss_sym, {
                    x_init_sym:
                    data_train[np.random.randint(0, data_train.shape[0],
                                                 init_bs)]
                })
            if is_root:
                print('Init loss:', init_loss * bpd_scale_factor)
            sess.run(copy_params_to_ema)
            saver = tf.train.Saver() if is_root else None
        if is_root:
            print('Broadcasting initial parameters')
        sess.run(hvd.broadcast_global_variables(0))
        sess.graph.finalize()

        if is_root:
            print('Training')

        loss_hist = deque(maxlen=steps_per_log)
        gnorm_hist = deque(maxlen=steps_per_log)
        for i_epoch in range(99999999999):
            if i_epoch % epochs_per_val == 0:
                run_validation(sess, i_step=i_step)
                # Only root holds a Saver, so only root writes checkpoints.
                if saver is not None:
                    saver.save(sess,
                               os.path.join(checkpointdir, 'model'),
                               global_step=i_step)

            epoch_start_t = time.time()
            for i_epoch_step, (batch, ) in enumerate(
                    iterbatches(  # non-sharded: each gpu goes through the whole dataset
                        [data_train],
                        batch_size=local_bs,
                        include_final_partial_batch=False,
                    )):

                if steps_per_extra_samples is not None and i_step % steps_per_extra_samples == 0:
                    run_sampling_only(sess, i_step)

                lr = lr_schedule(i_step)
                loss, gnorm, _ = sess.run([loss_sym, grad_norm_sym, opt_sym], {
                    x_sym: batch,
                    lr_sym: lr
                })
                loss_hist.append(loss)
                gnorm_hist.append(gnorm)

                # Skip timing the very first step, which will be unusually slow due to TF initialization
                if i_epoch == i_epoch_step == 0:
                    epoch_start_t = time.time()

                if i_step % steps_per_log == 0:
                    # Collective: every rank contributes its windowed means.
                    loss_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(loss_hist)),
                                                            root=0)
                    gnorm_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(gnorm_hist)),
                                                             root=0)
                    steps_per_sec = (i_epoch_step + 1) / (time.time() -
                                                          epoch_start_t)
                    if is_root:
                        kvs = [
                            ('iter', i_step),
                            ('epoch', i_epoch + i_epoch_step * local_bs /
                             data_train.shape[0]),  # epoch for this gpu
                            ('bpd',
                             float(
                                 np.mean(loss_hist_means) * bpd_scale_factor)),
                            ('gnorm', float(np.mean(gnorm_hist_means))),
                            ('lr', float(lr)),
                            ('fps', steps_per_sec * total_bs
                             ),  # fps calculated over all gpus (this epoch)
                            ('sps', steps_per_sec),
                        ]
                        logger.writekvs(kvs, i_step)

                i_step += 1
            # End of epoch

    # Train
    config = tf.ConfigProto()
    # config.log_device_placement = True
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    with tf.Session(config=config) as sess:
        loop(sess)
def evaluate(*,
             flow_constructor,
             seed,
             restore_checkpoint,
             total_bs=1024,
             iw_samples=4096,
             dtype=tf.float32,
             dataset='cifar10',
             samples_filename=None):
    """Restore a checkpoint and evaluate it on the validation data.

    All arguments are keyword-only.

    Args:
        flow_constructor: Callable returning ``(dequant_flow, flow)``.
        seed: Base seed; each rank seeds with
            ``hvd.rank() + hvd.size() * seed``.
        restore_checkpoint: Checkpoint path (``~`` is expanded).
        total_bs: Batch size summed over all ranks; must be divisible by
            ``hvd.size()``.
        iw_samples: Number of log-ratio samples per validation example in
            the importance-weighted evaluation; must be divisible by
            ``total_bs``.
        dtype: TensorFlow dtype of the placeholder and the loaded data.
        dataset: Dataset name forwarded to ``load_data``.
        samples_filename: If truthy, gathered samples are tiled into one
            image and saved to this path by the root rank.

    The function prints its results and does not return a value. After
    restoring, it optionally saves samples, checks that all ranks see the
    same validation data, then runs the standard evaluation followed by
    the importance-weighted evaluation.
    """
    hvd, MPI, is_root, mpi_average = setup_horovod()

    restore_checkpoint = os.path.expanduser(restore_checkpoint)

    # Seeding and logging setup
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()
    assert iw_samples % total_bs == 0

    if is_root:
        print('===== EVALUATING {} ({} IW samples) ====='.format(
            restore_checkpoint, iw_samples))

    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)
    # Multiplier applied to losses before they are reported as "bpd":
    # 1 / (ln 2 * number of elements per image).
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))

    # Build graph
    if is_root:
        print('Building graph')
    dequant_flow, flow = flow_constructor()
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    # This is a fake training graph. Just used to mimic flow_training, so we can load from the saver
    build_forward(x=x_sym,
                  dequant_flow=dequant_flow,
                  flow=flow,
                  flow_kwargs=dict(vcfg=VarConfig(init=False,
                                                  ema=None,
                                                  dtype=dtype),
                                   dropout_p=0,
                                   verbose=is_root)
                  # note dropout is 0: it doesn't matter
                  )

    # EMA
    params = tf.trainable_variables()
    if is_root:
        print('Parameters',
              sum(np.prod(p.get_shape().as_list()) for p in params))
    ema = tf.train.ExponentialMovingAverage(
        decay=0.9999999999999)  # ema turned off
    # NOTE(review): this op is never run in this function; presumably
    # ema.apply(params) is called only so the averaged variables exist in
    # the graph for the Saver -- verify.
    maintain_averages_op = tf.group(ema.apply(params))

    # Validation and sampling (with EMA)
    if is_root:
        print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0,
                           verbose=is_root)
    val_dequant_x_sym, val_y_sym, val_loss_sym, val_logratio_sym = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=val_flow_kwargs)

    # Log-ratios from all ranks, concatenated by Horovod.
    allgathered_val_logratios_sym = hvd.allgather(val_logratio_sym)

    # for debugging invertibility
    val_inverr_sym = tf.reduce_max(
        tf.abs(val_dequant_x_sym -
               flow.inverse(val_y_sym, **val_flow_kwargs)[0]))

    if is_root:
        print('===== Sampling graph =====')
    samples_sym, _ = flow.inverse(
        tf.random_normal(val_y_sym.shape.as_list(), dtype=dtype),
        **val_flow_kwargs)
    allgathered_samples_sym = hvd.allgather(tf.to_float(samples_sym))

    # The validation/sampling graphs must not have created new trainables.
    assert len(tf.trainable_variables()) == len(params)

    def run_iw_eval(sess):
        """Importance-weighted evaluation, one validation example at a time."""
        if is_root:
            print('Running IW eval with {} samples...'.format(iw_samples))
        # Go through one example at a time
        all_val_losses = []
        for i_example in (trange if is_root else range)(len(data_val)):
            # take this single example and tile it
            batch_x = np.tile(data_val[i_example, None, ...],
                              (local_bs, 1, 1, 1))
            # repeatedly evaluate logd for the IWAE bound
            # (iw_samples // total_bs runs; the assert below checks that
            # this yields exactly iw_samples values)
            batch_logratios = np.concatenate([
                sess.run(allgathered_val_logratios_sym, {x_sym: batch_x})
                for _ in range(iw_samples // total_bs)
            ]).astype(np.float64)
            assert batch_logratios.shape == (iw_samples, )
            # log [1/n \sum_i exp(r_i)] = log [exp(-b) 1/n \sum_i exp(r_i + b)] = -b + log [1/n \sum_i exp(r_i + b)]
            # Here b = -shift, with shift the maximum log-ratio, so the
            # largest exponent evaluated is 0.
            shift = batch_logratios.max()
            all_val_losses.append(
                -bpd_scale_factor *
                (shift + np.log(np.mean(np.exp(batch_logratios - shift)))))
            if i_example % 100 == 0 and is_root:
                print(i_example, np.mean(all_val_losses))
        if is_root:
            print(f'Final ({len(data_val)}):', np.mean(all_val_losses))

    def run_standard_eval(sess):
        """Single-sample evaluation of this rank's shard, averaged over ranks."""
        if is_root:
            print('Running standard eval...')
        # Standard validation (single sample)
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses, shard_inverrs = zip(*[
            sess.run([val_loss_sym, val_inverr_sym], {x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        inv_err, _ = mpi_average(shard_inverrs)
        if is_root:
            for k, v in [
                ('val_bpd', bpd_scale_factor * val_loss),
                ('val_inverr', inv_err),
                ('num_val_examples', total_count * local_bs),
            ]:
                print(k, v)

    def run_sampling_only(sess):
        """Draw samples on every rank; root tiles and saves them as an image."""
        samples = sess.run(allgathered_samples_sym)
        # # warmup a few times
        # for _ in range(10):
        #     sess.run(allgathered_samples_sym)
        # # start timing
        # trials = 100
        # tstart = time.time()
        # for _ in range(trials):
        #     samples = sess.run(allgathered_samples_sym)
        # sample_time = (time.time() - tstart) / trials
        if is_root:
            from PIL import Image
            Image.fromarray(
                tile_imgs(np.clip(samples, 0, 255).astype(
                    np.uint8))).save(samples_filename)
            print('Saved {} samples to {}'.format(len(samples),
                                                  samples_filename))
            # print('Sampled in {} seconds'.format(sample_time))

    # Run
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    with tf.Session(config=config) as sess:
        if is_root:
            print('Initializing')
        sess.run(tf.global_variables_initializer())
        # Restore from checkpoint
        # Only root reads the checkpoint; the broadcast below copies the
        # variables from rank 0 to the other ranks.
        if is_root:
            print('Restoring checkpoint:', restore_checkpoint)
            saver = tf.train.Saver()
            saver.restore(sess, restore_checkpoint)
            print('Broadcasting initial parameters')
        sess.run(hvd.broadcast_global_variables(0))
        sess.graph.finalize()

        if samples_filename:
            run_sampling_only(sess)

        # Make sure data is the same on all MPI processes
        tmp_inds = [0, 183, 3, 6, 20, 88]
        check_batch = np.ascontiguousarray(data_val[tmp_inds])
        # Receive buffer is only allocated on root (one slot per rank).
        gathered_batches = np.zeros(
            (hvd.size(), *check_batch.shape),
            check_batch.dtype) if is_root else None
        MPI.COMM_WORLD.Gather(check_batch, gathered_batches, root=0)
        if is_root:
            assert all(
                np.allclose(check_batch, b)
                for b in gathered_batches), 'data must be in the same order!'
            print('data ordering ok')

        # Run validation
        run_standard_eval(sess)
        run_iw_eval(sess)
def allgather(backend, value, name):
    """Allgather ``value`` through Horovod and evaluate the result.

    ``value`` is wrapped in a TensorFlow constant called ``name``, passed to
    ``hvd.allgather``, and the resulting op is handed to ``_eval`` together
    with ``backend``; whatever ``_eval`` returns is returned.
    """
    constant = tf.constant(value, name=name)
    gather_op = hvd.allgather(constant)
    return _eval(backend, gather_op)
def allgather(backend, value, name):
    """Allgather ``value`` through Horovod and run it in the backend session.

    ``value`` is wrapped in a TensorFlow constant called ``name`` and passed
    to ``hvd.allgather``; the op is then run in the session returned by
    ``backend.get_session()`` and the run result is returned.
    """
    constant = tf.constant(value, name=name)
    gather_op = hvd.allgather(constant)
    # The session is fetched only after the op has been created.
    session = backend.get_session()
    return session.run(gather_op)