def _do_all_reduce_pack_test(self, tt):
  """Check that packing small tensors leaves all-reduce results unchanged.

  Builds the same all-reduce twice in a fresh graph: once directly on the
  tower gradients, and once on gradients that were passed through
  `allreduce.pack_small_tensors` before the reduction and through
  `allreduce.unpack_small_tensors` after it. Both are then evaluated and
  compared.

  Args:
    tt: Test-case descriptor; `tt.num_devices` and `tt.in_shapes` are read.
  """
  with ops.Graph().as_default():
    tower_grads, consts, _, _ = self._init_tensors(
        tt.num_devices, tt.in_shapes)
    dev_prefixes = ['/job:localhost']
    gpu_indices = range(tt.num_devices)
    assert len(gpu_indices) == len(tower_grads)

    def build_all_reduce(grads):
      # Identical settings for both reductions, so that packing is the only
      # difference between the two graphs being compared.
      return allreduce.sum_gradients_all_reduce(
          dev_prefixes, grads, 1, 'xring', 1, gpu_indices,
          agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)

    plain_reduce = build_all_reduce(tower_grads)
    packed_grads, packing = allreduce.pack_small_tensors(
        tower_grads, 100, 100)
    packed_reduce = build_all_reduce(packed_grads)
    round_trip = allreduce.unpack_small_tensors(packed_reduce, packing)

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      plain_values = sess.run(plain_reduce)
      round_trip_values = sess.run(round_trip)
      # NOTE(review): the device loop starts at 1, so device 0 is never
      # compared -- confirm whether that is intentional.
      for dev in range(1, tt.num_devices):
        for idx in range(len(tt.in_shapes)):
          reduced = plain_values[dev][idx][0]
          # Each reduced value is expected to be num_devices times the
          # first tower's constant.
          expected = tt.num_devices * consts[0][idx]
          self.assertTrue(np.allclose(reduced, expected))
          self.assertTrue(
              np.array_equal(reduced, round_trip_values[dev][idx][0]))
def preprocess_device_grads(self, device_grads):
  """Reduce gradients across towers, one all-reduce spec entry at a time.

  Every entry of `self._all_reduce_spec` claims part of the gradients that
  are still unreduced: a negative `limit` claims everything that is left,
  otherwise `allreduce.split_grads_by_size` decides the split. Each
  non-empty part is reduced with that entry's `alg` and `shards`, and the
  per-device results are appended to what earlier entries produced.

  Args:
    device_grads: Per-tower lists whose first element unpacks as a
      (gradient, variable) pair.

  Returns:
    A tuple `(full_device_set, aggregated_grads)`: the device of the first
    gradient of every tower, and the accumulated reduction results.
  """
  pending = device_grads
  aggregated_grads = []
  for spec in self._all_reduce_spec:
    if spec.limit < 0:
      # Negative limit: this entry takes every remaining gradient.
      selected, pending = pending, []
    else:
      selected, pending = allreduce.split_grads_by_size(spec.limit, pending)
    if not selected:
      continue

    reduced = allreduce.sum_gradients_all_reduce(
        self._all_reduce_device_prefixes,
        selected,
        self._num_workers,
        spec.alg,
        spec.shards,
        self.benchmark_cnn.gpu_indices,
        agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
        agg_small_grads_max_group=self._agg_small_grads_max_group)

    if not aggregated_grads:
      aggregated_grads = reduced
      continue
    assert len(aggregated_grads) == len(reduced)
    # Index assignment keeps the original in-place `+=` semantics.
    for dev_idx in range(len(aggregated_grads)):
      aggregated_grads[dev_idx] += reduced[dev_idx]

  # Every gradient must have been claimed by some spec entry.
  assert not pending

  first_pairs = (tower[0] for tower in device_grads)
  full_device_set = [grad.device for grad, _ in first_pairs]
  return (full_device_set, aggregated_grads)
def preprocess_device_grads(self, device_grads):
  """Aggregate per-tower gradients and pair them with the device list.

  With an all-reduce spec configured, the gradients are handed to
  `allreduce.sum_gradients_all_reduce`. Otherwise they are aggregated with
  `aggregate_gradients_using_copy_with_device_selection`, which also sets
  `self.grad_has_inf_nan`, and every tower receives the aggregated
  gradients paired with its own variables.

  Args:
    device_grads: Per-tower lists of (gradient, variable) pairs.

  Returns:
    A tuple `(self.benchmark_cnn.devices, aggregated_device_grads)`.
  """
  if not self._all_reduce_spec:
    aggregate = (
        variable_mgr_util.aggregate_gradients_using_copy_with_device_selection)
    agg_grads, self.grad_has_inf_nan = aggregate(
        self.benchmark_cnn,
        device_grads,
        use_mean=False,
        check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)
    # Keep each tower's own variables, but swap in the aggregated gradient.
    per_tower = [
        [(g, v) for (_, v), (g, _) in zip(tower, agg_grads)]
        for tower in device_grads
    ]
    return self.benchmark_cnn.devices, per_tower

  spec = self._all_reduce_spec
  reduced = allreduce.sum_gradients_all_reduce(
      ['/job:localhost'],
      device_grads,
      1,
      spec.alg,
      spec.shards,
      self.benchmark_cnn.gpu_indices,
      agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
      agg_small_grads_max_group=self._agg_small_grads_max_group)
  return self.benchmark_cnn.devices, reduced
def preprocess_device_grads(self, device_grads):
  """Reduce gradients with the 'collective' all-reduce setting.

  Args:
    device_grads: Per-tower lists whose first element unpacks as a
      (gradient, variable) pair.

  Returns:
    A tuple `(full_device_set, reduced_grads)`: the device of the first
    gradient of every tower, and the result of the all-reduce call (one
    entry per tower, as asserted below).
  """
  reduced_grads = allreduce.sum_gradients_all_reduce(
      self._single_session,
      self._all_reduce_device_prefixes,
      device_grads,
      self._num_workers,
      'collective',
      1,  # Fixed value passed where other call sites use spec_tuple.shards.
      self.benchmark_cnn.gpu_indices,
      allreduce_merge_scope=self._allreduce_merge_scope)
  assert len(reduced_grads) == len(device_grads)

  first_pairs = (tower[0] for tower in device_grads)
  full_device_set = [grad.device for grad, _ in first_pairs]
  return (full_device_set, reduced_grads)
def _aggregate_grads(self, device_grads):
  """Sum every gradient over all towers.

  Three strategies are available, tried in this order: all-reduce when
  `self._all_reduce_spec` is set, hierarchical copy when the
  `hierarchical_copy` param is set, and copy with device selection
  otherwise. The two copy-based strategies also set
  `self.grad_has_inf_nan`.

  Args:
    device_grads: List of lists of (gradient, variable) tuples.
      device_grads[t][g] = (gradient, variable), where t is the tower index
      and g is the index of the gradient-variable pair.

  Returns:
    List of lists of (gradient, variable) tuples in the same form as
    `device_grads`, with each gradient summed over the towers.
  """
  if self._all_reduce_spec:
    # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
    # gradient aggregation code, since gradient aggregation is doing an all
    # reduce. Currently, we do gradient repacking in two different places.
    spec = self._all_reduce_spec
    return allreduce.sum_gradients_all_reduce(
        ['/job:localhost'],
        device_grads,
        1,
        spec.alg,
        spec.shards,
        self.benchmark_cnn.gpu_indices,
        agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
        agg_small_grads_max_group=self._agg_small_grads_max_group)

  if self.benchmark_cnn.params.hierarchical_copy:
    summed, self.grad_has_inf_nan = (
        variable_mgr_util.aggregate_gradients_using_hierarchical_copy(
            self.benchmark_cnn,
            device_grads,
            use_mean=False,
            check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
    return summed

  aggregate = (
      variable_mgr_util.aggregate_gradients_using_copy_with_device_selection)
  agg_grads, self.grad_has_inf_nan = aggregate(
      self.benchmark_cnn,
      device_grads,
      use_mean=False,
      check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)
  # Every tower keeps its own variables but gets the aggregated gradients.
  return [
      [(g, v) for (_, v), (g, _) in zip(tower, agg_grads)]
      for tower in device_grads
  ]
def _do_batch_all_reduce(self, all_device_tensors):
  """All-reduce plain tensors by routing them through the gradient API.

  `allreduce.sum_gradients_all_reduce` is called with (gradient, variable)
  pairs, so every tensor is paired with a `None` variable before the call
  and the tensor is extracted from each returned pair afterwards.

  Args:
    all_device_tensors: Per-device iterables of tensors.

  Returns:
    A list of lists holding the first element of every pair returned by the
    all-reduce call.
  """
  # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
  # gradient aggregation code, since gradient aggregation is doing an all
  # reduce. Currently, we do gradient repacking in two different places.
  # TODO(reedwm): Change the allreduce code to reduce tensors instead of
  # tower_grads.
  tower_grads = []
  for device_tensors in all_device_tensors:
    tower_grads.append([(tensor, None) for tensor in device_tensors])

  spec = self._all_reduce_spec
  reduced = allreduce.sum_gradients_all_reduce(
      ['/job:localhost'],
      tower_grads,
      1,
      spec.alg,
      spec.shards,
      self._gpu_indices,
      agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
      agg_small_grads_max_group=self._agg_small_grads_max_group)

  return [[tensor for tensor, _ in pairs] for pairs in reduced]
def preprocess_device_grads(self, device_grads):
  """Aggregate per-tower gradients and return them with the device list.

  One of four strategies is used:
    * `self._all_reduce_spec` is set: `allreduce.sum_gradients_all_reduce`.
    * `hierarchical_copy` param is off: copy with device selection.
    * `gradient_repacking` param is 0: hierarchical copy on the gradients
      as given.
    * otherwise: each tower's gradients are flattened, concatenated and
      split into `gradient_repacking` packs, the packs are aggregated with
      hierarchical copy, and the result is split and reshaped back.

  All strategies except all-reduce also set `self.grad_has_inf_nan`. In the
  repacking strategy, when the `variable_consistency` param is 'relaxed',
  the concatenated gradients go through a `StagingArea` and one warm-up op
  per tower is appended to `self._warmup_ops`.

  Args:
    device_grads: Per-tower lists of (gradient, variable) pairs.

  Returns:
    A tuple `(self.benchmark_cnn.devices, aggregated_device_grads)`.
  """
  if self._all_reduce_spec:
    aggregated_device_grads = allreduce.sum_gradients_all_reduce(
        ['/job:localhost'],
        device_grads,
        1,
        self._all_reduce_spec.alg,
        self._all_reduce_spec.shards,
        self.benchmark_cnn.gpu_indices,
        agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
        agg_small_grads_max_group=self._agg_small_grads_max_group)
  else:
    if not self.benchmark_cnn.params.hierarchical_copy:
      agg_grads, self.grad_has_inf_nan = (
          variable_mgr_util.
          aggregate_gradients_using_copy_with_device_selection(
              self.benchmark_cnn,
              device_grads,
              use_mean=False,
              check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
      # Every tower keeps its own variables but gets the aggregated grads.
      aggregated_device_grads = []
      for arr in device_grads:
        aggregated_device_grads.append(
            [(g, v) for (_, v), (g, _) in zip(arr, agg_grads)])
    elif self.benchmark_cnn.params.gradient_repacking == 0:
      aggregated_device_grads, self.grad_has_inf_nan = (
          variable_mgr_util.aggregate_gradients_using_hierarchical_copy(
              self.benchmark_cnn,
              device_grads,
              use_mean=False,
              check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
    else:
      # Per-tower bookkeeping, filled in the packing loop below and consumed
      # in the same order by the unpacking loop.
      device_grad_packs = []
      all_tower_shapes = []
      all_tower_sizes = []
      compact_gradient = self.benchmark_cnn.params.compact_gradient_transfer
      use_fp16 = self.benchmark_cnn.params.use_fp16
      variable_consistency = self.benchmark_cnn.params.variable_consistency
      deferred_gradient = (variable_consistency == 'relaxed')
      # One entry per tower: the staging put op, or None when gradients are
      # not deferred.
      gradient_put_ops = []
      for tower_grads_and_vars in device_grads:
        with tf.colocate_with(tower_grads_and_vars[0][0]):
          # Flatten all the grads.
          flat_grads = [tf.reshape(g, [-1]) for g, _ in tower_grads_and_vars]
          # Remember the original shape of all the grads.
          tower_shapes = [tf.shape(g) for g, _ in tower_grads_and_vars]
          # Remember the original sizes of all the grads.
          tower_sizes = [tf.size(g) for g, _ in tower_grads_and_vars]
          # Concat all the flat grads into a big flat tensor.
          concat_grads = tf.concat(flat_grads, 0)
          # Recorded before the optional fp16 cast so the unpacking loop can
          # cast back. NOTE(review): only the last tower's dtype survives the
          # loop -- presumably all towers share a dtype; confirm.
          grads_dtype = concat_grads.dtype
          if use_fp16 and compact_gradient:
            concat_grads = tf.cast(concat_grads, tf.float16)

          # With deferred-gradients, place the gradients in a staging area.
          if deferred_gradient:
            total_var_size = sum(
                [v.shape.num_elements() for _, v in tower_grads_and_vars])
            gradient_stage = data_flow_ops.StagingArea([concat_grads.dtype])
            # Push the concat-gradients into the staging area.
            gradient_put_op = gradient_stage.put([concat_grads])
            gradient_put_ops.append(gradient_put_op)
            # Push an empty set of gradients into the staging area.
            warmup_op = gradient_stage.put(
                [tf.zeros([total_var_size], dtype=concat_grads.dtype)])
            self._warmup_ops.append(warmup_op)
            # Fetch the next set of gradients to use.
            concat_grads = gradient_stage.get()
            concat_grads = tf.reshape(concat_grads, [-1])
          else:
            gradient_put_ops.append(None)

          # Split the big tensor into num_splits packs. In cases where the
          # total size is not divisible by num_splits, the last pack gets
          # more elements.
          # TODO(zhengxq): it is possible to optimize away the additional
          # data movement by copying along the original variable boundary.
          # TODO(zhengxq): it is also possible to optimize away all the concat
          # as well.
          num_splits = self.benchmark_cnn.params.gradient_repacking
          total_grad_size = tf.size(concat_grads)
          split_size = total_grad_size // num_splits
          split_size_last = total_grad_size - split_size * (num_splits - 1)
          split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
          grad_packs = tf.split(concat_grads, split_sizes)

          # Ready to aggregate the repacked gradients, with fake variables.
          # TODO(zhengxq): It is hacky to have to use fake variables.
          # We should remove the need for variables in
          # aggregate_gradients_using*.
          # NOTE(review): under Python 3, zip() yields a one-shot iterator;
          # this is fine only if the aggregation helper walks each entry
          # once -- confirm.
          device_grad_packs.append(zip(grad_packs, [None] * num_splits))
          all_tower_shapes.append(tower_shapes)
          all_tower_sizes.append(tower_sizes)

      # The actual aggregation of the repacked gradients. Note that they are
      # sharded among different aggregation trees. So it is important to
      # strike the balance on num_splits.
      summed_device_grad_packs, self.grad_has_inf_nan = (
          variable_mgr_util.aggregate_gradients_using_hierarchical_copy(
              self.benchmark_cnn,
              device_grad_packs,
              use_mean=False,
              check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))

      aggregated_device_grads = []
      # pylint: disable=line-too-long
      for (summed_tower_grad_packs, tower_grads_and_vars, tower_shapes, tower_sizes, gradient_put_op) in zip(
          summed_device_grad_packs, device_grads, all_tower_shapes, all_tower_sizes, gradient_put_ops):
        # pylint: enable=line-too-long
        # Reverse the packing operations in the previous steps. Form the
        # summed gradients back into their original shapes.
        with tf.colocate_with(summed_tower_grad_packs[0][0]):
          # Form a list of the summed grad packs. (This rebinds the name
          # `device_grad_packs`; the packed inputs built above have already
          # been handed to the aggregation call.)
          device_grad_packs = [g for g, _ in summed_tower_grad_packs]
          # Concat them back into a big flat tensor.
          device_grads_concat = tf.concat(device_grad_packs, 0)
          # Cast back to the dtype recorded before the optional fp16 cast.
          device_grads_concat = tf.cast(device_grads_concat, grads_dtype)
          # Split the tensors back into their original sizes.
          grads_with_sizes = tf.split(device_grads_concat, tower_sizes)
          # Reshape the tensors back into their original shapes.
          grads_with_shapes = [
              tf.reshape(grad, shape)
              for shape, grad in zip(tower_shapes, grads_with_sizes)
          ]
          # With deferred gradient, add a dependency on the put_op so we
          # don't have a race condition with the update operation that
          # follows. Also it triggers the gradient_put_op. Otherwise, the
          # caller has to explicitly pump it.
          if deferred_gradient:
            assert gradient_put_op
            grads_with_shapes = [
                control_flow_ops.with_dependencies([gradient_put_op], g)
                for g in grads_with_shapes
            ]
          # Form the list with the original list of variables.
          summed_tower_grads = [
              (g, v)
              for g, (_, v) in zip(grads_with_shapes, tower_grads_and_vars)
          ]
          aggregated_device_grads.append(summed_tower_grads)
  return self.benchmark_cnn.devices, aggregated_device_grads
def preprocess_device_grads(self, device_grads):
  """Aggregate per-tower gradients and return them with the device list.

  Strategies, tried in this order:
    1. `self._all_reduce_spec` is set: `allreduce.sum_gradients_all_reduce`.
    2. `hierarchical_copy` param is off: copy with device selection.
    3. `gradient_repacking` param is 0: hierarchical copy on the gradients
       as given.
    4. Otherwise: each tower's gradients are flattened, concatenated and
       split into `gradient_repacking` packs, the packs are aggregated with
       hierarchical copy, and the result is split and reshaped back.

  Strategies 2-4 also set `self.grad_has_inf_nan`.

  Args:
    device_grads: Per-tower lists of (gradient, variable) pairs.

  Returns:
    A tuple `(self.benchmark_cnn.devices, aggregated_device_grads)`.
  """
  if self._all_reduce_spec:
    spec = self._all_reduce_spec
    reduced = allreduce.sum_gradients_all_reduce(
        ['/job:localhost'],
        device_grads,
        1,
        spec.alg,
        spec.shards,
        self.benchmark_cnn.gpu_indices,
        agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
        agg_small_grads_max_group=self._agg_small_grads_max_group)
    return self.benchmark_cnn.devices, reduced

  bench = self.benchmark_cnn
  params = bench.params
  hierarchical_copy = (
      variable_mgr_util.aggregate_gradients_using_hierarchical_copy)

  if not params.hierarchical_copy:
    copy_with_selection = (
        variable_mgr_util.aggregate_gradients_using_copy_with_device_selection)
    agg_grads, self.grad_has_inf_nan = copy_with_selection(
        bench,
        device_grads,
        use_mean=False,
        check_inf_nan=bench.enable_auto_loss_scale)
    # Every tower keeps its own variables but gets the aggregated gradients.
    per_tower = [
        [(g, v) for (_, v), (g, _) in zip(tower, agg_grads)]
        for tower in device_grads
    ]
    return bench.devices, per_tower

  if params.gradient_repacking == 0:
    summed, self.grad_has_inf_nan = hierarchical_copy(
        bench,
        device_grads,
        use_mean=False,
        check_inf_nan=bench.enable_auto_loss_scale)
    return bench.devices, summed

  # Repacking path.
  compact_transfer = params.compact_gradient_transfer
  fp16_enabled = params.use_fp16
  num_splits = params.gradient_repacking

  packed_towers = []
  shapes_per_tower = []
  sizes_per_tower = []
  for tower in device_grads:
    with tf.colocate_with(tower[0][0]):
      grads = [g for g, _ in tower]
      # Flatten each gradient, remembering its shape and size so the
      # packing can be undone after aggregation.
      flattened = [tf.reshape(g, [-1]) for g in grads]
      shapes = [tf.shape(g) for g in grads]
      sizes = [tf.size(g) for g in grads]
      merged = tf.concat(flattened, 0)
      # Recorded before the optional fp16 cast; used to cast back later.
      original_dtype = merged.dtype
      if fp16_enabled and compact_transfer:
        merged = tf.cast(merged, tf.float16)

      # Split into num_splits packs; the last pack absorbs the remainder
      # when the total size is not divisible by num_splits.
      # TODO(zhengxq): it is possible to optimize away the additional
      # data movement by copying along the original variable boundary.
      # TODO(zhengxq): it is also possible to optimize away all the concat
      # as well.
      total = tf.size(merged)
      base = total // num_splits
      last = total - base * (num_splits - 1)
      packs = tf.split(merged, [base] * (num_splits - 1) + [last])

      # The aggregation helper takes (gradient, variable) pairs, so each
      # pack is paired with a placeholder None variable.
      # TODO(zhengxq): It is hacky to have to use fake variables.
      # We should remove the need for variables in
      # aggregate_gradients_using*.
      packed_towers.append(zip(packs, [None] * num_splits))
      shapes_per_tower.append(shapes)
      sizes_per_tower.append(sizes)

  # The actual aggregation of the repacked gradients. The packs are sharded
  # among different aggregation trees, so num_splits is a balance to strike.
  summed_packs, self.grad_has_inf_nan = hierarchical_copy(
      bench,
      packed_towers,
      use_mean=False,
      check_inf_nan=bench.enable_auto_loss_scale)

  restored = []
  for summed, tower, shapes, sizes in zip(
      summed_packs, device_grads, shapes_per_tower, sizes_per_tower):
    # Undo the packing: concat, cast back, split by size, reshape.
    with tf.colocate_with(summed[0][0]):
      merged = tf.concat([g for g, _ in summed], 0)
      merged = tf.cast(merged, original_dtype)
      pieces = tf.split(merged, sizes)
      reshaped = [
          tf.reshape(piece, shape) for shape, piece in zip(shapes, pieces)
      ]
      # Re-attach each tower's original variables.
      restored.append([(g, v) for g, (_, v) in zip(reshaped, tower)])
  return bench.devices, restored