def test_potentially_deadlocked_send_recv_pairs(barrier_fence_fixture,
                                                comm_split_fixture,
                                                P_x_ranks, P_x_shape,
                                                P_w_ranks, P_w_shape):

    import torch

    from distdl.backends.mpi.partition import MPIPartition
    from distdl.nn.broadcast import Broadcast

    # `use_cuda` is expected to be defined at module scope in the test file,
    # e.g. use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Isolate the minimum needed ranks
    base_comm, active = comm_split_fixture
    if not active:
        return
    P_world = MPIPartition(base_comm)

    # Create the partitions
    P_x_base = P_world.create_partition_inclusive(P_x_ranks)
    P_x = P_x_base.create_cartesian_topology_partition(P_x_shape)

    P_w_base = P_world.create_partition_inclusive(P_w_ranks)
    P_w = P_w_base.create_cartesian_topology_partition(P_w_shape)

    layer = Broadcast(P_x, P_w)  # noqa F841
    layer = layer.to(device)

    P_world.deactivate()
    P_x_base.deactivate()
    P_x.deactivate()
    P_w_base.deactivate()
    P_w.deactivate()
def __init__(self, P_x, P_y, P_w, in_features, out_features, bias=True):

    super(DistributedLinear, self).__init__()

    # P_x ~ 1 X P_fi
    self.P_x = P_x
    # P_y ~ 1 X P_fo
    self.P_y = P_y
    # P_w ~ P_fo X P_fi
    self.P_w = P_w

    self.bias = bias

    self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)

    if self.P_w.active:
        local_in_features = compute_subshape(P_w.shape[1], P_w.index[1], in_features)
        local_out_features = compute_subshape(P_w.shape[0], P_w.index[0], out_features)
        # On column 0, use the specified bias, otherwise no bias to
        # prevent double counting
        bias = self.bias if (self.P_w.index[-1] == 0) else False
        self.sublinear = torch.nn.Linear(local_in_features[0],
                                         local_out_features[0],
                                         bias=bias)

    self.y_sum_reduce = SumReduce(self.P_w, self.P_y,
                                  transpose_src=True,
                                  preserve_batch=True)
def test_broadcast_adjoint(barrier_fence_fixture,
                           comm_split_fixture,
                           P_x_ranks, P_x_shape,
                           P_y_ranks, P_y_shape,
                           x_global_shape,
                           transpose_src):

    import numpy as np
    import torch

    from distdl.backends.mpi.partition import MPIPartition
    from distdl.nn.broadcast import Broadcast
    from distdl.utilities.torch import zero_volume_tensor

    # Isolate the minimum needed ranks
    base_comm, active = comm_split_fixture
    if not active:
        return
    P_world = MPIPartition(base_comm)

    # Create the partitions
    P_x_base = P_world.create_partition_inclusive(P_x_ranks)
    P_x = P_x_base.create_cartesian_topology_partition(P_x_shape)

    P_y_base = P_world.create_partition_inclusive(P_y_ranks)
    P_y = P_y_base.create_cartesian_topology_partition(P_y_shape)

    # TODO #93: Change this to create a subtensor so we test when local
    # tensors have different shape.  Then, the output size will also be
    # different, which we will have to get from `y` itself.
    x_local_shape = np.asarray(x_global_shape)

    layer = Broadcast(P_x, P_y, transpose_src=transpose_src, preserve_batch=False)

    x = zero_volume_tensor()
    if P_x.active:
        x = torch.Tensor(np.random.randn(*x_local_shape))
    x.requires_grad = True

    dy = zero_volume_tensor()
    if P_y.active:
        # Adjoint Input
        dy = torch.Tensor(np.random.randn(*x_local_shape))

    # y = F @ x
    y = layer(x)

    # dx = F* @ dy
    y.backward(dy)
    dx = x.grad

    x = x.detach()
    dx = dx.detach()
    dy = dy.detach()
    y = y.detach()

    check_adjoint_test_tight(P_world, x, dx, y, dy)
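# A short note on what the adjoint test above verifies.  Broadcast is a linear
# operator F, so it must satisfy <F x, dy> = <x, F* dy>.  The helper
# check_adjoint_test_tight (provided by the test suite) is expected to compare
# the globally reduced inner products; a hedged sketch of the local pieces:
#
#     local contribution to <F x, dy>:   (y * dy).sum()
#     local contribution to <x, F* dy>:  (x * dx).sum()
#
# After summing these contributions over P_world, the two values should agree
# to within a tolerance set by the floating-point precision of the tensors.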
def __init__(self, P_x, P_y, P_w, in_features, out_features, bias=True):

    super(DistributedLinear, self).__init__()

    # P_x ~ 1 X P_fi
    self.P_x = P_x
    # P_y ~ 1 X P_fo
    self.P_y = P_y
    # P_w ~ P_fo X P_fi
    self.P_w = P_w

    # Bias flag
    self.bias = bias

    # Broadcast layer in the x-tensor
    self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)

    # Each worker in P_w computes its own portion of the weight tensor and then
    # stores its own PyTorch Linear layer.  Only the 0th column of the tensor
    # also stores a bias.
    if self.P_w.active:
        local_in_features = compute_subshape(P_w.shape[1], P_w.index[1], in_features)
        local_out_features = compute_subshape(P_w.shape[0], P_w.index[0], out_features)
        # On column 0, use the specified bias, otherwise no bias to
        # prevent double counting
        bias = self.bias if (self.P_w.index[-1] == 0) else False
        self.sublinear = torch.nn.Linear(local_in_features[0],
                                         local_out_features[0],
                                         bias=bias)

    # Sum-reduce layer to get the y-tensor
    self.y_sum_reduce = SumReduce(self.P_w, self.P_y,
                                  transpose_src=True,
                                  preserve_batch=True)
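# A minimal sketch, assuming the sublayers built above, of how the forward
# pass of DistributedLinear could be composed.  This is illustrative, not
# necessarily the library's exact forward() implementation: the input is
# broadcast from P_x onto the weight partition P_w, each active worker applies
# its local torch.nn.Linear shard, and the partial products are sum-reduced
# onto P_y.
def forward(self, input):
    if not (self.P_x.active or self.P_y.active or self.P_w.active):
        return input.clone()

    # Replicate the local input-feature blocks across the output-feature
    # dimension of P_w.
    x = self.x_broadcast(input)

    # Apply the local shard of the weight matrix (and, on column 0, the bias).
    if self.P_w.active:
        x = self.sublinear(x)

    # Partial products over the input-feature dimension are summed onto P_y.
    return self.y_sum_reduce(x)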
def __init__(self, P_x, P_y, P_w,
             in_channels=1, out_channels=1,
             bias=True,
             *args, **kwargs):

    super(DistributedGeneralConvBase, self).__init__()

    # P_x is 1 x P_ci x P_d-1 x ... x P_0
    self.P_x = P_x
    # P_y is 1 x P_co x P_d-1 x ... x P_0
    self.P_y = P_y
    # P_w is P_co x P_ci x P_d-1 x ... x P_0
    self.P_w = P_w

    self.P_union = self._distdl_backend.Partition()
    if not (self.P_x.active or self.P_y.active or self.P_w.active):
        return

    # This guarantees that P_union rank 0 has the kernel size, stride,
    # padding, and dilation factors
    P_union = P_w.create_partition_union(P_x)
    P_union = P_union.create_partition_union(P_y)
    self.P_union = P_union

    P_w_shape = None
    if P_union.rank == 0:
        P_w_shape = np.array(P_w.shape, dtype=np.int)
    P_w_shape = P_union.broadcast_data(P_w_shape, root=0)

    P_co = P_w_shape[0]
    P_ci = P_w_shape[1]
    P_channels = [P_co, P_ci]

    P_x_new_shape = []
    if self.P_x.active:
        if np.any(P_x.shape[2:] != P_w_shape[2:]):
            raise ValueError("Spatial components of P_x and P_w must match.")
        if P_w_shape[1] != P_x.shape[1]:
            raise ValueError("Index 2 of P_w dimension must match input channel partition.")
        P_x_new_shape = list(P_x.shape)
        P_x_new_shape.insert(1, 1)
        # Currently a hack, removing the batch dimension because P_w does
        # not have one.  This is OK because we assume there are no partitions
        # in the batch dimension.
        P_x_new_shape = np.asarray(P_x_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_x to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_x = self.P_x.create_cartesian_topology_partition(P_x_new_shape)

    P_y_new_shape = []
    if self.P_y.active:
        if np.any(P_y.shape[2:] != P_w_shape[2:]):
            raise ValueError("Spatial components of P_y and P_w must match.")
        if P_w_shape[0] != P_y.shape[1]:
            raise ValueError("Index 1 of P_w dimension must match output channel partition.")
        P_y_new_shape = list(P_y.shape)
        P_y_new_shape.insert(2, 1)
        # Currently a hack, removing the batch dimension because P_w does
        # not have one.  This is OK because we assume there are no partitions
        # in the batch dimension.
        P_y_new_shape = np.asarray(P_y_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_y to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_y = self.P_y.create_cartesian_topology_partition(P_y_new_shape)

    P_spatial = P_w_shape[2:]

    self.serial = False
    if self.P_w.size == 1:
        self.serial = True
        self.conv_layer = self.TorchConvType(*args, **kwargs)
        return

    self.receives_weight = False
    self.stores_weight = False
    self.receives_bias = False
    self.stores_bias = False

    # Determine P_r, initialize weights there
    if self.P_w.active:
        # All of P_w always receives the weight
        self.receives_weight = True

        # This subset is taken to be the origin of the spatial component
        w_root_subset = []
        for i, c in enumerate(range_index(P_w.shape)):
            c = np.asarray(c)
            # Find the P_co x P_ci x 1 x ... x 1 subset to store the weights
            if np.all(c[2:] == 0):
                w_root_subset.append(i)

        self.P_wr_base = self.P_w.create_partition_inclusive(w_root_subset)
        # ones are needed so the broadcast will work
        self.P_wr = self.P_wr_base.create_cartesian_topology_partition([P_co, P_ci] + [1]*len(P_spatial))
        self.stores_weight = self.P_wr.active

        b_subset = []
        for i, c in enumerate(range_index(P_w.shape)):
            c = np.asarray(c)
            # Find the P_co x 1 x P_0 x ... x P_D-1 subset that needs biases
            # in its calculation.  This is everywhere that the input channels
            # is rank 0.
            if c[1] == 0:
                b_subset.append(i)

        self.P_b_base = self.P_w.create_partition_inclusive(b_subset)
        self.P_b = self.P_b_base.create_cartesian_topology_partition([P_co] + [1] + list(P_spatial))
        self.receives_bias = self.P_b.active and bias

        # Now find the subset of _that_ which actually stores the learnable
        # parameter.
        b_root_subset = []
        for i, c in enumerate(range_index(P_w.shape)):
            c = np.asarray(c)
            # Find the P_co x 1 x 1 x ... x 1 subset to store the biases
            if np.all(c[1:] == 0):
                b_root_subset.append(i)

        self.P_br_base = self.P_w.create_partition_inclusive(b_root_subset)
        # ones are needed so the broadcast will work
        self.P_br = self.P_br_base.create_cartesian_topology_partition([P_co] + [1] + [1]*len(P_spatial))
        self.stores_bias = self.P_br.active and bias

        # Correct the input arguments based on local properties
        local_kwargs = {}
        local_kwargs.update(kwargs)

        # Do this before checking serial so that the layer works properly
        # in the serial case
        local_channels = compute_subshape(P_channels, P_w.index[0:2], [out_channels, in_channels])
        local_out_channels, local_in_channels = local_channels
        local_kwargs["in_channels"] = local_in_channels
        local_kwargs["out_channels"] = local_out_channels
        local_kwargs["bias"] = self.receives_bias

        self.conv_layer = self.TorchConvType(*args, **local_kwargs)

        # If we store the weight it is a learnable parameter iff it is
        # learnable by default in the layer, which it is.
        if self.stores_weight:
            self._weight = torch.nn.Parameter(self.conv_layer.weight.detach())
        else:
            self._weight = zero_volume_tensor()
        # This always exists so we can copy the property
        self._weight.requires_grad = self.conv_layer.weight.requires_grad

        # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
        new_weight = self.conv_layer.weight.detach() * 0
        new_weight.requires_grad = self.conv_layer.weight.requires_grad
        del self.conv_layer.weight
        self.conv_layer.weight = new_weight

        # If we store the bias, it is a learnable parameter iff it is
        # learnable by default in the layer, which is only true if it
        # exists.
        if self.stores_bias:
            self._bias = torch.nn.Parameter(self.conv_layer.bias.detach())
        else:
            self._bias = zero_volume_tensor()
        # This does not always exist, but when it does we can copy the
        # property.
        if self.receives_bias:
            self._bias.requires_grad = self.conv_layer.bias.requires_grad

            # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
            new_bias = self.conv_layer.bias.detach() * 0
            new_bias.requires_grad = self.conv_layer.bias.requires_grad
            del self.conv_layer.bias
            self.conv_layer.bias = new_bias

    # Now we need to share the kernel structure.  The size of the kernel
    # is always the spatial dimensions.
    self.conv_kernel_size = None
    self.conv_stride = None
    self.conv_padding = None
    self.conv_dilation = None
    if P_union.rank == 0:
        self.conv_kernel_size = np.array(self.conv_layer.kernel_size, dtype=np.int)
        self.conv_stride = np.array(self.conv_layer.stride, dtype=np.int)
        self.conv_padding = np.array(self.conv_layer.padding, dtype=np.int)
        self.conv_dilation = np.array(self.conv_layer.dilation, dtype=np.int)
    self.conv_kernel_size = P_union.broadcast_data(self.conv_kernel_size, root=0)
    self.conv_stride = P_union.broadcast_data(self.conv_stride, root=0)
    self.conv_padding = P_union.broadcast_data(self.conv_padding, root=0)
    self.conv_dilation = P_union.broadcast_data(self.conv_dilation, root=0)

    # We need the halo shape, and other info, to fully populate the pad,
    # halo exchange, and unpad layers.  For pad and unpad, we defer their
    # construction to the pre-forward hook.
    self.pad_layer = None
    self.unpad_layer = None

    # We need to be able to remove some data from the input to the conv
    # layer.
    self.needed_slices = None

    # For the halo layer we also defer construction, so that we can have
    # the halo shape for the input.  The halo will allocate its own
    # buffers, but it needs this information at construction to be able
    # to do this in the pre-forward hook.
    self.halo_layer = None

    # Variables for tracking input changes and buffer construction
    self._distdl_is_setup = False
    self._input_shape = None
    self._input_requires_grad = None

    if P_w.active:
        self.w_broadcast = Broadcast(self.P_wr, self.P_w, preserve_batch=False)

        if self.receives_bias or self.stores_bias:
            self.b_broadcast = Broadcast(self.P_br, self.P_b, preserve_batch=False)

    self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)
    self.y_sum_reduce = SumReduce(self.P_w, self.P_y, preserve_batch=True)
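# A hedged sketch, not the library's actual forward(), of how the sublayers
# constructed above could compose at run time.  It assumes the deferred
# pad_layer, halo_layer, and needed_slices have already been built by the
# pre-forward hook.
def forward(self, input):
    if self.serial:
        return self.conv_layer(input)

    # Re-distribute the learnable tensors from their root subsets to all of P_w.
    if self.P_w.active:
        self.conv_layer.weight = self.w_broadcast(self._weight)
        if self.receives_bias or self.stores_bias:
            self.conv_layer.bias = self.b_broadcast(self._bias)

    x = input
    if self.P_x.active:
        # Apply global padding, exchange halos with neighboring workers, and
        # drop any entries the local convolution does not need.
        x = self.pad_layer(x)
        x = self.halo_layer(x)
        x = x[self.needed_slices]

    # Broadcast the input onto P_w, convolve locally, then sum-reduce the
    # partial outputs (over the input-channel dimension) onto P_y.
    x = self.x_broadcast(x)
    if self.P_w.active:
        x = self.conv_layer(x)
    return self.y_sum_reduce(x)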
def __init__(self, P_x, P_y, P_w,
             in_channels,
             out_channels,
             kernel_size,
             stride=1,
             padding=0,
             padding_mode='zeros',
             dilation=1,
             groups=1,
             bias=True,
             *args, **kwargs):

    super(DistributedChannelConvBase, self).__init__()

    # P_x is 1 x P_ci x 1 x ... x 1
    self.P_x = P_x
    # P_y is 1 x P_co x 1 x ... x 1
    self.P_y = P_y
    # P_w is P_co x P_ci x 1 x ... x 1
    self.P_w = P_w

    # Even inactive workers need some partition union
    P_union = self._distdl_backend.Partition()
    if not (self.P_x.active or self.P_y.active or self.P_w.active):
        return

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = self._expand_parameter(kernel_size)
    self.stride = self._expand_parameter(stride)
    self.padding = self._expand_parameter(padding)
    self.padding_mode = padding_mode
    self.dilation = self._expand_parameter(dilation)
    self.groups = groups
    self.use_bias = bias

    # This guarantees that P_union rank 0 has the kernel size, stride,
    # padding, and dilation factors
    P_union_temp = P_w.create_partition_union(P_x)
    P_union = P_union_temp.create_partition_union(P_y)

    # Ensure that all workers have the full size and structure of P_w
    P_w_shape = None
    if P_union.rank == 0:
        P_w_shape = np.array(P_w.shape, dtype=np.int)
    P_w_shape = P_union.broadcast_data(P_w_shape, root=0)

    # Release the temporary resources
    P_union_temp.deactivate()
    P_union.deactivate()

    P_co = P_w_shape[0]
    P_ci = P_w_shape[1]
    P_channels = [P_co, P_ci]

    # Ensure that P_x and P_w are correctly aligned.  We also produce a
    # new P_x that is shaped like 1 x P_ci x 1 x ... x 1, to assist with
    # broadcasts.
    P_x_new_shape = []
    if self.P_x.active:
        if np.any(P_x.shape[2:] != P_w_shape[2:]):
            raise ValueError("Spatial components of P_x and P_w must match.")
        if np.any(P_x.shape[2:] != np.ones(len(P_x.shape[2:]))):
            raise ValueError("Spatial components of P_x must be 1 x ... x 1.")
        if P_w_shape[1] != P_x.shape[1]:
            raise ValueError("Index 2 of P_w dimension must match input channel partition.")
        P_x_new_shape = list(P_x.shape)
        P_x_new_shape.insert(1, 1)
        # Currently a hack, removing the batch dimension because P_w does
        # not have one.  This is OK because we assume there are no partitions
        # in the batch dimension.
        P_x_new_shape = np.asarray(P_x_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_x to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_x = self.P_x.create_cartesian_topology_partition(P_x_new_shape)

    # Ensure that P_y and P_w are correctly aligned.  We also produce a
    # new P_y that is shaped like P_co x 1 x 1 x ... x 1, to assist with
    # broadcasts.
    P_y_new_shape = []
    if self.P_y.active:
        if np.any(P_y.shape[2:] != P_w_shape[2:]):
            raise ValueError("Spatial components of P_y and P_w must match.")
        if np.any(P_y.shape[2:] != np.ones(len(P_y.shape[2:]))):
            raise ValueError("Spatial components of P_y must be 1 x ... x 1.")
        if P_w_shape[0] != P_y.shape[1]:
            raise ValueError("Index 1 of P_w dimension must match output channel partition.")
        P_y_new_shape = list(P_y.shape)
        P_y_new_shape.insert(2, 1)
        # Currently a hack, removing the batch dimension because P_w does
        # not have one.  This is OK because we assume there are no partitions
        # in the batch dimension.
        P_y_new_shape = np.asarray(P_y_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_y to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_y = self.P_y.create_cartesian_topology_partition(P_y_new_shape)

    self.serial = self.P_w.size == 1
    if self.serial:
        self.conv_layer = self.TorchConvType(in_channels=in_channels,
                                             out_channels=out_channels,
                                             kernel_size=self.kernel_size,
                                             stride=self.stride,
                                             padding=self.padding,
                                             padding_mode=self.padding_mode,
                                             dilation=self.dilation,
                                             groups=self.groups,
                                             bias=self.use_bias)
        self.weight = self.conv_layer.weight
        self.bias = self.conv_layer.bias
        return

    # Flag if the global bias is set
    self.global_bias = bias

    # Flags if current worker stores (part of) the bias locally.
    self.stores_bias = False

    if self.P_w.active:
        # Let the P_co column store the bias if it is to be used
        self.stores_bias = self.P_w.index[1] == 0 and self.use_bias

        # Correct the input arguments based on local properties
        # This ensures that the in and out channels are correctly shared.
        local_co, local_ci = compute_subshape(P_channels, P_w.index[0:2], [out_channels, in_channels])

        self.conv_layer = self.TorchConvType(in_channels=local_ci,
                                             out_channels=local_co,
                                             kernel_size=self.kernel_size,
                                             stride=self.stride,
                                             padding=self.padding,
                                             padding_mode=self.padding_mode,
                                             dilation=self.dilation,
                                             groups=groups,
                                             bias=self.stores_bias)

    # Workers in P_w alias the conv layer to get their weight and perhaps
    # biases.  Every other worker doesn't have a weight or bias.
    if self.P_w.active:
        self.weight = self.conv_layer.weight
        if self.stores_bias:
            self.bias = self.conv_layer.bias
        else:
            if self.use_bias:
                self.register_buffer('bias', zero_volume_tensor())
            else:
                self.register_buffer('bias', None)
    else:
        self.register_buffer('weight', zero_volume_tensor())
        if self.use_bias:
            self.register_buffer('bias', zero_volume_tensor())
        else:
            self.register_buffer('bias', None)

    # Variables for tracking input changes and buffer construction
    self._distdl_is_setup = False
    self._input_tensor_structure = TensorStructure()

    self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)
    self.y_sum_reduce = SumReduce(self.P_w, self.P_y, preserve_batch=True)
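# A brief usage sketch for the channel-partitioned case.  The 6-rank layout,
# the partition shapes, and the concrete subclass named at the end are
# illustrative assumptions, chosen to satisfy the alignment checks above for a
# 2-d convolution with P_ci = 2 and P_co = 3.
import numpy as np
from mpi4py import MPI

from distdl.backends.mpi.partition import MPIPartition

P_world = MPIPartition(MPI.COMM_WORLD)  # assumes at least 6 ranks

# P_x is 1 x P_ci x 1 x 1
P_x_base = P_world.create_partition_inclusive(np.arange(0, 2))
P_x = P_x_base.create_cartesian_topology_partition([1, 2, 1, 1])

# P_y is 1 x P_co x 1 x 1
P_y_base = P_world.create_partition_inclusive(np.arange(0, 3))
P_y = P_y_base.create_cartesian_topology_partition([1, 3, 1, 1])

# P_w is P_co x P_ci x 1 x 1
P_w_base = P_world.create_partition_inclusive(np.arange(0, 6))
P_w = P_w_base.create_cartesian_topology_partition([3, 2, 1, 1])

# These partitions would then be handed to a concrete 2-d subclass of
# DistributedChannelConvBase, e.g.
#     conv = DistributedChannelConv2d(P_x, P_y, P_w,
#                                     in_channels=8, out_channels=16,
#                                     kernel_size=3)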
def __init__(self, P_x,
             in_channels,
             out_channels,
             kernel_size,
             stride=1,
             padding=0,
             padding_mode='zeros',
             dilation=1,
             groups=1,
             bias=True,
             buffer_manager=None):

    super(DistributedFeatureConvBase, self).__init__()

    # P_x is 1 x 1 x P_d-1 x ... x P_0
    self.P_x = P_x

    # Back-end specific buffer manager for economic buffer allocation
    if buffer_manager is None:
        buffer_manager = self._distdl_backend.BufferManager()
    elif type(buffer_manager) is not self._distdl_backend.BufferManager:
        raise ValueError("Buffer manager type does not match backend.")
    self.buffer_manager = buffer_manager

    if not self.P_x.active:
        return

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = self._expand_parameter(kernel_size)
    self.stride = self._expand_parameter(stride)
    self.padding = self._expand_parameter(padding)
    self.padding_mode = padding_mode
    self.dilation = self._expand_parameter(dilation)
    self.groups = groups
    self.use_bias = bias

    self.serial = self.P_x.size == 1

    if self.serial:
        self.conv_layer = self.TorchConvType(in_channels=in_channels,
                                             out_channels=out_channels,
                                             kernel_size=self.kernel_size,
                                             stride=self.stride,
                                             padding=self.padding,
                                             padding_mode=self.padding_mode,
                                             dilation=self.dilation,
                                             groups=self.groups,
                                             bias=self.use_bias)
        self.weight = self.conv_layer.weight
        self.bias = self.conv_layer.bias
    else:
        self.conv_layer = self.TorchConvType(in_channels=in_channels,
                                             out_channels=out_channels,
                                             kernel_size=self.kernel_size,
                                             stride=self.stride,
                                             padding=0,
                                             padding_mode='zeros',
                                             dilation=self.dilation,
                                             groups=groups,
                                             bias=bias)

    if self.serial:
        return

    dims = len(self.P_x.shape)

    # We will be using global padding to compute local padding,
    # so expand it to a numpy array
    global_padding = np.pad(self.padding,
                            pad_width=(dims - len(self.padding), 0),
                            mode='constant',
                            constant_values=0)
    self.global_padding = global_padding

    pad_left_right = self.global_padding.reshape((dims, 1)) + np.zeros((dims, 2), dtype=np.int)
    self.local_padding = self._compute_local_padding(pad_left_right)

    # Weights and biases partition
    P_wb = self.P_x.create_partition_inclusive([0])
    self.P_wb_cart = P_wb.create_cartesian_topology_partition([1])

    # Release temporary resources
    P_wb.deactivate()

    # We want only the root rank of the broadcast to have a weight and a
    # bias parameter.  Every other rank gets a zero-volume tensor.
    if self.P_wb_cart.active:
        self.weight = torch.nn.Parameter(self.conv_layer.weight.detach())
        if self.conv_layer.bias is not None:
            self.bias = torch.nn.Parameter(self.conv_layer.bias.detach())
        else:
            self.register_buffer('bias', None)
    else:
        self.register_buffer('weight', zero_volume_tensor())
        if self.conv_layer.bias is not None:
            self.register_buffer('bias', zero_volume_tensor())
        else:
            self.register_buffer('bias', None)

    self.weight.requires_grad = self.conv_layer.weight.requires_grad
    if self.conv_layer.bias is not None:
        self.bias.requires_grad = self.conv_layer.bias.requires_grad

    # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
    new_weight = self.conv_layer.weight.detach() * 0
    new_weight.requires_grad = self.conv_layer.weight.requires_grad
    del self.conv_layer.weight
    self.conv_layer.weight = new_weight

    if self.conv_layer.bias is not None:
        new_bias = self.conv_layer.bias.detach() * 0
        new_bias.requires_grad = self.conv_layer.bias.requires_grad
        del self.conv_layer.bias
        self.conv_layer.bias = new_bias

    self.w_broadcast = Broadcast(self.P_wb_cart, self.P_x, preserve_batch=False)

    if self.conv_layer.bias is not None:
        self.b_broadcast = Broadcast(self.P_wb_cart, self.P_x, preserve_batch=False)

    # We need to be able to remove some data from the input to the conv
    # layer.
    self.needed_slices = None

    # For the halo layer we also defer construction, so that we can have
    # the halo shape for the input.  The halo will allocate its own
    # buffers, but it needs this information at construction to be able
    # to do this in the pre-forward hook.
    self.halo_layer = None

    # Variables for tracking input changes and buffer construction
    self._distdl_is_setup = False
    self._input_tensor_structure = TensorStructure()
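# A hedged sketch of the run-time composition the construction above implies
# for the feature-distributed case; the actual layer wires this together with
# pre-forward hooks and may differ in detail.  The single worker in P_wb_cart
# broadcasts the learnable weight (and bias) to every worker in P_x, each
# worker halo-exchanges its feature subdomain, and then runs the local
# convolution.  No sum-reduce is needed because channels are not split.
def forward(self, input):
    if self.serial:
        return self.conv_layer(input)

    # Every worker receives a copy of the learnable tensors from the root.
    self.conv_layer.weight = self.w_broadcast(self.weight)
    if self.conv_layer.bias is not None:
        self.conv_layer.bias = self.b_broadcast(self.bias)

    # (Application of self.local_padding to the input is omitted in this
    # sketch.)  Exchange halos with neighboring workers, then drop entries
    # the local convolution does not need.
    x = input
    x = self.halo_layer(x)
    x = x[self.needed_slices]

    return self.conv_layer(x)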
def __init__(self, P_x, P_y, P_w,
             in_channels,
             out_channels,
             kernel_size,
             stride=1,
             padding=0,
             padding_mode='zeros',
             dilation=1,
             groups=1,
             bias=True,
             buffer_manager=None):

    super(DistributedGeneralConvBase, self).__init__()

    # P_x is 1 x P_ci x P_d-1 x ... x P_0
    self.P_x = P_x
    # P_y is 1 x P_co x P_d-1 x ... x P_0
    self.P_y = P_y
    # P_w is P_co x P_ci x P_d-1 x ... x P_0
    self.P_w = P_w

    # Back-end specific buffer manager for economic buffer allocation
    if buffer_manager is None:
        buffer_manager = self._distdl_backend.BufferManager()
    elif type(buffer_manager) is not self._distdl_backend.BufferManager:
        raise ValueError("Buffer manager type does not match backend.")
    self.buffer_manager = buffer_manager

    # Even inactive workers need some partition union
    self.P_union = self._distdl_backend.Partition()
    if not (self.P_x.active or self.P_y.active or self.P_w.active):
        return

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = self._expand_parameter(kernel_size)
    self.stride = self._expand_parameter(stride)
    self.padding = self._expand_parameter(padding)
    self.padding_mode = padding_mode
    self.dilation = self._expand_parameter(dilation)
    self.groups = groups
    self.use_bias = bias

    # This guarantees that P_union rank 0 has the kernel size, stride,
    # padding, and dilation factors
    P_union_temp = P_w.create_partition_union(P_x)
    self.P_union = P_union_temp.create_partition_union(P_y)

    # Release the temporary resources
    P_union_temp.deactivate()

    # Ensure that all workers have the full size and structure of P_w
    P_w_shape = None
    if self.P_union.rank == 0:
        P_w_shape = np.array(P_w.shape, dtype=np.int)
    P_w_shape = self.P_union.broadcast_data(P_w_shape, root=0)

    P_co = P_w_shape[0]
    P_ci = P_w_shape[1]
    P_channels = [P_co, P_ci]

    # Ensure that P_x and P_w are correctly aligned.  We also produce a
    # new P_x that is shaped like 1 x P_ci x P_d-1 x ... x P_0, to assist
    # with broadcasts.
    P_x_new_shape = []
    if self.P_x.active:
        if np.any(P_x.shape[2:] != P_w_shape[2:]):
            raise ValueError("Spatial components of P_x and P_w must match.")
        if P_w_shape[1] != P_x.shape[1]:
            raise ValueError("Index 2 of P_w dimension must match input channel partition.")
        P_x_new_shape = list(P_x.shape)
        P_x_new_shape.insert(1, 1)
        # Currently a hack, removing the batch dimension because P_w does
        # not have one.  This is OK because we assume there are no partitions
        # in the batch dimension.
        P_x_new_shape = np.asarray(P_x_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_x to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_x = self.P_x.create_cartesian_topology_partition(P_x_new_shape)

    # Ensure that P_y and P_w are correctly aligned.  We also produce a
    # new P_y that is shaped like P_co x 1 x P_d-1 x ... x P_0, to assist
    # with broadcasts.
    P_y_new_shape = []
    if self.P_y.active:
        if np.any(P_y.shape[2:] != P_w_shape[2:]):
            raise ValueError("Spatial components of P_y and P_w must match.")
        if P_w_shape[0] != P_y.shape[1]:
            raise ValueError("Index 1 of P_w dimension must match output channel partition.")
        P_y_new_shape = list(P_y.shape)
        P_y_new_shape.insert(2, 1)
        # Currently a hack, removing the batch dimension because P_w does
        # not have one.  This is OK because we assume there are no partitions
        # in the batch dimension.
        P_y_new_shape = np.asarray(P_y_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_y to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_y = self.P_y.create_cartesian_topology_partition(P_y_new_shape)

    P_spatial = P_w_shape[2:]

    self.serial = self.P_w.size == 1
    if self.serial:
        self.conv_layer = self.TorchConvType(in_channels=in_channels,
                                             out_channels=out_channels,
                                             kernel_size=self.kernel_size,
                                             stride=self.stride,
                                             padding=self.padding,
                                             padding_mode=self.padding_mode,
                                             dilation=self.dilation,
                                             groups=self.groups,
                                             bias=self.use_bias)
        self.weight = self.conv_layer.weight
        self.bias = self.conv_layer.bias
        return

    # Need to figure out any padding necessary to handle global padding.
    # This is only on the input tensor.  The convolution will not use
    # any implicit padding, so the work partition does not need it.
    if self.P_x.active:
        dims = len(self.P_x.shape)

        # We will be using global padding to compute local padding,
        # so expand it to a numpy array
        global_padding = np.pad(self.padding,
                                pad_width=(dims-len(self.padding), 0),
                                mode='constant',
                                constant_values=0)
        self.global_padding = global_padding

        pad_left_right = self.global_padding.reshape((dims, 1)) + np.zeros((dims, 2), dtype=np.int)
        self.local_padding = self._compute_local_padding(pad_left_right)

    # Workers can either store the learnable weights and bias, or they
    # need copies of it.
    self.receives_weight = False
    self.stores_weight = False
    self.receives_bias = False
    self.stores_bias = False

    # Determine root partitions, initialize weights there
    if self.P_w.active:
        # All of P_w always receives the weight
        self.receives_weight = True

        # This subset is taken to be the origin of the spatial component
        w_root_subset = []
        for i, c in enumerate(range_index(P_w.shape)):
            c = np.asarray(c)
            # Find the P_co x P_ci x 1 x ... x 1 subset to store the weights
            if np.all(c[2:] == 0):
                w_root_subset.append(i)

        P_wr_base = self.P_w.create_partition_inclusive(w_root_subset)
        # ones are needed so the broadcast will work
        self.P_wr = P_wr_base.create_cartesian_topology_partition([P_co, P_ci] + [1]*len(P_spatial))
        self.stores_weight = self.P_wr.active

        # Release temporary resources
        P_wr_base.deactivate()

        b_subset = []
        for i, c in enumerate(range_index(P_w.shape)):
            c = np.asarray(c)
            # Find the P_co x 1 x P_0 x ... x P_D-1 subset that needs
            # biases in its calculation.  This is everywhere that the input
            # channels is rank 0.
            if c[1] == 0:
                b_subset.append(i)

        P_b_base = self.P_w.create_partition_inclusive(b_subset)
        self.P_b = P_b_base.create_cartesian_topology_partition([P_co] + [1] + list(P_spatial))
        self.receives_bias = self.P_b.active and self.use_bias

        # Release temporary resources
        P_b_base.deactivate()

        # Now find the subset of _that_ which actually stores the
        # learnable parameter.
        b_root_subset = []
        for i, c in enumerate(range_index(P_w.shape)):
            c = np.asarray(c)
            # Find the P_co x 1 x 1 x ... x 1 subset to store the biases
            if np.all(c[1:] == 0):
                b_root_subset.append(i)

        P_br_base = self.P_w.create_partition_inclusive(b_root_subset)
        # ones are needed so the broadcast will work
        self.P_br = P_br_base.create_cartesian_topology_partition([P_co] + [1] + [1]*len(P_spatial))
        self.stores_bias = self.P_br.active and self.use_bias

        # Release temporary resources
        P_br_base.deactivate()

        # Correct the input arguments based on local properties
        # This ensures that the in and out channels are correctly shared.
        local_co, local_ci = compute_subshape(P_channels, P_w.index[0:2], [out_channels, in_channels])

        self.conv_layer = self.TorchConvType(in_channels=local_ci,
                                             out_channels=local_co,
                                             kernel_size=self.kernel_size,
                                             stride=self.stride,
                                             padding=0,
                                             padding_mode='zeros',
                                             dilation=self.dilation,
                                             groups=groups,
                                             bias=self.receives_bias)

        # If we store the weight it is a learnable parameter iff it is
        # learnable by default in the layer, which it is.
        if self.stores_weight:
            self.weight = torch.nn.Parameter(self.conv_layer.weight.detach())
        else:
            self.register_buffer('weight', zero_volume_tensor())
        # This always exists so we can copy the property
        self.weight.requires_grad = self.conv_layer.weight.requires_grad

        # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
        new_weight = self.conv_layer.weight.detach() * 0
        new_weight.requires_grad = self.conv_layer.weight.requires_grad
        del self.conv_layer.weight
        self.conv_layer.weight = new_weight

        # If we store the bias, it is a learnable parameter iff it is
        # learnable by default in the layer, which is only true if it
        # exists.
        if self.stores_bias:
            self.bias = torch.nn.Parameter(self.conv_layer.bias.detach())
        else:
            if self.use_bias:
                self.register_buffer('bias', zero_volume_tensor())
            else:
                self.register_buffer('bias', None)

        # This does not always exist, but when it does we can copy the
        # property.
        if self.receives_bias:
            self.bias.requires_grad = self.conv_layer.bias.requires_grad

            # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
            new_bias = self.conv_layer.bias.detach() * 0
            new_bias.requires_grad = self.conv_layer.bias.requires_grad
            del self.conv_layer.bias
            self.conv_layer.bias = new_bias

    else:
        # Workers not in P_w don't have a weight or bias.
        self.register_buffer('weight', zero_volume_tensor())
        if self.use_bias:
            self.register_buffer('bias', zero_volume_tensor())
        else:
            self.register_buffer('bias', None)

    # Now we need to share the kernel structure.  The size of the kernel
    # is always the spatial dimensions.
    self.conv_kernel_size = None
    self.conv_stride = None
    self.conv_padding = None
    self.conv_dilation = None

    # By construction, rank 0 of the union should always have all of this
    # information, because it will always construct a local conv layer.  We
    # rely on the local conv layer to properly fill out this information
    # from the defaults.  This info is required for all workers on the
    # input and output partitions because it is needed to construct the
    # halos.  Rank 0 in the union shares it with everyone.
    if self.P_union.rank == 0:
        self.conv_kernel_size = np.array(self.conv_layer.kernel_size, dtype=np.int)
        self.conv_stride = np.array(self.conv_layer.stride, dtype=np.int)
        self.conv_padding = np.array(self.conv_layer.padding, dtype=np.int)
        self.conv_dilation = np.array(self.conv_layer.dilation, dtype=np.int)
    self.conv_kernel_size = self.P_union.broadcast_data(self.conv_kernel_size, root=0)
    self.conv_stride = self.P_union.broadcast_data(self.conv_stride, root=0)
    self.conv_padding = self.P_union.broadcast_data(self.conv_padding, root=0)
    self.conv_dilation = self.P_union.broadcast_data(self.conv_dilation, root=0)

    # We need to be able to remove some data from the input to the conv
    # layer but again need to defer.
    self.needed_slices = None

    # For the halo layer we also defer construction, so that we can have
    # the halo shape for the input.  The halo will allocate its own
    # buffers, but it needs this information at construction to be able
    # to do this in the pre-forward hook.
    self.halo_layer = None

    # Variables for tracking input changes and buffer construction
    self._distdl_is_setup = False
    self._input_tensor_structure = TensorStructure()

    # Some layers, those that require no information about the input
    # tensor to setup, can be built now.
    if P_w.active:
        self.w_broadcast = Broadcast(self.P_wr, self.P_w, preserve_batch=False)

        if self.receives_bias or self.stores_bias:
            self.b_broadcast = Broadcast(self.P_br, self.P_b, preserve_batch=False)

    self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)
    self.y_sum_reduce = SumReduce(self.P_w, self.P_y, preserve_batch=True)
def __init__(self, P_x, num_features,
             eps=1e-05,
             momentum=0.1,
             affine=True,
             track_running_stats=True):

    super(DistributedBatchNorm, self).__init__()

    self.num_dimensions = len(P_x.shape)
    if self.num_dimensions < 2:
        raise ValueError('Number of dimensions of P_x should be at least 2.')

    self.num_features = num_features
    self.eps = eps
    self.momentum = momentum
    self.affine = affine
    self.track_running_stats = track_running_stats
    self.inputs_seen = 0

    # Determine the size of the local trainable parameters (this is a bit of a hack)
    possible_input_shape = P_x.shape.tolist()
    possible_input_shape[1] = num_features
    start_index = compute_start_index(P_x.shape, P_x.index, possible_input_shape)
    stop_index = compute_stop_index(P_x.shape, P_x.index, possible_input_shape)
    self.local_num_features = stop_index[1] - start_index[1]

    internal_data_shape = [1] * self.num_dimensions
    internal_data_shape[1] = self.local_num_features
    internal_partition_shape = [1] * self.num_dimensions
    internal_partition_shape[1] = P_x.shape[1]

    # Decide which workers will be used to store sum and affine parameters
    index = [0] * self.num_dimensions
    index[1] = slice(0, P_x.shape[1])
    index = tuple(index)
    storage_workers = worker_layout(P_x.shape)[index].tolist()

    self.P_x = P_x

    P_sum_base = P_x.create_partition_inclusive(storage_workers)
    self.P_sum = P_sum_base.create_cartesian_topology_partition(internal_partition_shape)

    # Release temporary resources
    P_sum_base.deactivate()

    if self.track_running_stats:
        self.register_buffer('running_mean', torch.zeros(internal_data_shape))
        self.register_buffer('running_var', torch.ones(internal_data_shape))
    else:
        self.running_mean = None
        self.running_var = None

    self.sr = SumReduce(P_x, self.P_sum)
    self.bc = Broadcast(self.P_sum, P_x)
    self.bc_affine = Broadcast(self.P_sum, P_x)

    if self.affine:
        if self.P_sum.active:
            self.gamma = torch.nn.Parameter(torch.ones(internal_data_shape))
            self.beta = torch.nn.Parameter(torch.zeros(internal_data_shape))
        else:
            self.register_buffer('gamma', zero_volume_tensor(requires_grad=True))
            self.register_buffer('beta', zero_volume_tensor(requires_grad=True))
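# A hedged sketch, using the sr/bc pair constructed above, of how a
# feature-wise global mean could be computed during training.  The actual
# layer's statistics pipeline (running stats, variance, count tracking) is
# more involved; `global_count` here is an illustrative assumption for the
# total number of contributing elements per feature.
def _global_mean_sketch(self, x, global_count):
    # Sum local contributions over every dimension except the feature
    # dimension (dimension 1), keeping dimensions for broadcasting.
    reduce_dims = tuple(d for d in range(self.num_dimensions) if d != 1)
    local_sum = x.sum(dim=reduce_dims, keepdim=True)

    # Sum-reduce the partial sums onto P_sum, then broadcast the totals back
    # to every worker in P_x so all workers normalize consistently.
    total = self.bc(self.sr(local_sum))
    return total / global_count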
def __init__(self, P_x, *args, **kwargs):

    super(DistributedConvBase, self).__init__()

    self.P_x = P_x

    if not self.P_x.active:
        return

    # Do this before checking serial so that the layer works properly
    # in the serial case
    self.conv_layer = self.TorchConvType(*args, **kwargs)

    self.serial = False
    if self.P_x.size == 1:
        self.serial = True
        return

    # Weights and biases partition
    self.P_wb = self.P_x.create_partition_inclusive([0])
    self.P_wb_cart = self.P_wb.create_cartesian_topology_partition([1])

    # We want only the root rank of the broadcast to have a weight and a
    # bias parameter.  Every other rank gets a zero-volume tensor.
    if self.P_wb_cart.active:
        self.weight = torch.nn.Parameter(self.conv_layer.weight.detach())
        if self.conv_layer.bias is not None:
            self.bias = torch.nn.Parameter(self.conv_layer.bias.detach())
    else:
        self.weight = zero_volume_tensor()
        if self.conv_layer.bias is not None:
            self.bias = zero_volume_tensor()

    self.weight.requires_grad = self.conv_layer.weight.requires_grad
    if self.conv_layer.bias is not None:
        self.bias.requires_grad = self.conv_layer.bias.requires_grad

    # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
    new_weight = self.conv_layer.weight.detach() * 0
    new_weight.requires_grad = self.conv_layer.weight.requires_grad
    del self.conv_layer.weight
    self.conv_layer.weight = new_weight

    if self.conv_layer.bias is not None:
        new_bias = self.conv_layer.bias.detach() * 0
        new_bias.requires_grad = self.conv_layer.bias.requires_grad
        del self.conv_layer.bias
        self.conv_layer.bias = new_bias

    self.w_broadcast = Broadcast(self.P_wb_cart, self.P_x, preserve_batch=False)

    if self.conv_layer.bias is not None:
        self.b_broadcast = Broadcast(self.P_wb_cart, self.P_x, preserve_batch=False)

    # We need the halo shape, and other info, to fully populate the pad,
    # halo exchange, and unpad layers.  For pad and unpad, we defer their
    # construction to the pre-forward hook.
    self.pad_layer = None
    self.unpad_layer = None

    # We need to be able to remove some data from the input to the conv
    # layer.
    self.needed_slices = None

    # For the halo layer we also defer construction, so that we can have
    # the halo shape for the input.  The halo will allocate its own
    # buffers, but it needs this information at construction to be able
    # to do this in the pre-forward hook.
    self.halo_layer = None

    # Variables for tracking input changes and buffer construction
    self._distdl_is_setup = False
    self._input_shape = None
    self._input_requires_grad = None
#   [ 1 1 ]
#   -------
#   [ 2 2 ]
#   [ 2 2 ] ]
x = zero_volume_tensor()
if P_x.active:
    x_local_shape = slicing.compute_subshape(P_x.shape,
                                             P_x.index,
                                             x_global_shape)
    x = np.zeros(x_local_shape) + P_x.rank + 1
    x = torch.from_numpy(x)
x.requires_grad = True

print(f"rank {P_world.rank}; index {P_x.index}; value {x}")

# Here we broadcast the columns (axis 1), along the rows.
all_reduce_cols = Broadcast(P_x, P_y, preserve_batch=False)

#
# Output tensor will be (on a 2 x 3 partition):
# [ [ 1 1 | 1 1 | 1 1 ]
#   [ 1 1 | 1 1 | 1 1 ]
#   [ 1 1 | 1 1 | 1 1 ]
#   -------------------------
#   [ 2 2 | 2 2 | 2 2 ]
#   [ 2 2 | 2 2 | 2 2 ]
#   [ 2 2 | 2 2 | 2 2 ] ]
y = all_reduce_cols(x)

print(f"rank {P_world.rank}; index {P_x.index}; value {y}")

# Setup the adjoint input tensor.  Any worker in P_y will generate its part of
# the adjoint input tensor.  Any worker not in P_y will have a zero-volume
def test_broadcast_dtype(barrier_fence_fixture,
                         comm_split_fixture,
                         dtype,
                         test_backward,
                         P_x_ranks, P_x_shape,
                         P_y_ranks, P_y_shape,
                         x_global_shape,
                         transpose_src):

    import numpy as np
    import torch

    from distdl.backends.mpi.partition import MPIPartition
    from distdl.nn.broadcast import Broadcast
    from distdl.utilities.torch import zero_volume_tensor

    device = torch.device('cuda' if use_cuda else 'cpu')

    # Isolate the minimum needed ranks
    base_comm, active = comm_split_fixture
    if not active:
        return
    P_world = MPIPartition(base_comm)

    # Create the partitions
    P_x_base = P_world.create_partition_inclusive(P_x_ranks)
    P_x = P_x_base.create_cartesian_topology_partition(P_x_shape)

    P_y_base = P_world.create_partition_inclusive(P_y_ranks)
    P_y = P_y_base.create_cartesian_topology_partition(P_y_shape)

    # TODO #93: Change this to create a subtensor so we test when local
    # tensors have different shape.  Then, the output size will also be
    # different, which we will have to get from `y` itself.
    x_local_shape = np.asarray(x_global_shape)

    layer = Broadcast(P_x, P_y, transpose_src=transpose_src, preserve_batch=False)
    layer = layer.to(device)

    x = zero_volume_tensor(device=device)
    if P_x.active:
        x = 10 * torch.randn(*x_local_shape).to(dtype)
        x = x.to(device)
    x.requires_grad = test_backward

    # y = F @ x
    y = layer(x)

    # If we are not in the output partition, there is no data to test the type
    # against.
    if P_y.active:
        assert y.dtype == dtype

    if test_backward:
        dy = zero_volume_tensor(device=device)
        if P_y.active:
            # Adjoint Input
            dy = 10 * torch.randn(*x_local_shape).to(dtype)
            dy = dy.to(device)

        # dx = F* @ dy
        y.backward(dy)
        dx = x.grad

        if P_x.active:
            assert dx.dtype == dtype

    P_world.deactivate()
    P_x_base.deactivate()
    P_x.deactivate()
    P_y_base.deactivate()
    P_y.deactivate()