def assemble_global_tensor_structure(local_tensor_structure, P_in, P_out=None):

    global_tensor_structure = TensorStructure()
    global_tensor_shape = None
    intID_dtype = None
    requires_grad_int = None

    if P_in.active:

        # Assemble the global shape: along each partition dimension, the
        # global extent is the sum of the local extents.
        global_tensor_shape = np.zeros(P_in.dim, dtype=int)
        for i in range(P_in.dim):

            keep = [False] * P_in.dim
            keep[i] = True

            P_sub = P_in.create_cartesian_subtopology_partition(keep)

            v0 = np.atleast_1d(int(local_tensor_structure.shape[i]))
            v1 = np.zeros(1, dtype=int)
            P_sub._comm.Allreduce(v0, v1, op=MPI.SUM)
            global_tensor_shape[i] = v1[0]

            # Free the subtopology resources
            P_sub.deactivate()

        # Get a communicable integer representing the dtype
        intID_dtype = torch_to_intID_dtype_dict[local_tensor_structure.dtype]
        intID_dtype = np.array([intID_dtype], dtype=int)

        # Encode the boolean requires_grad flag as a communicable integer
        requires_grad_int = np.array([-1], dtype=int)
        requires_grad_int[0] = 1 if local_tensor_structure.requires_grad else 0

        global_tensor_structure.shape = global_tensor_shape
        global_tensor_structure.dtype = local_tensor_structure.dtype
        global_tensor_structure.requires_grad = local_tensor_structure.requires_grad

    if P_out is not None and P_out.active:

        # Share the shape
        global_tensor_structure.shape = P_out.broadcast_data(
            global_tensor_shape, P_data=P_in)

        # Share the dtype
        intID_dtype = P_out.broadcast_data(intID_dtype, P_data=P_in)
        global_tensor_structure.dtype = intID_to_torch_dtype_dict[intID_dtype[0]]

        # Share the requires_grad status
        requires_grad_int = P_out.broadcast_data(requires_grad_int, P_data=P_in)
        global_tensor_structure.requires_grad = bool(requires_grad_int[0])

    return global_tensor_structure
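# --- Illustration (not part of DistDL): a minimal standalone sketch of the
# per-dimension reduction pattern used above, built directly on mpi4py.  The
# Cartcomm.Sub calls play the role of P_in.create_cartesian_subtopology_partition,
# and the function name assemble_global_shape_sketch is hypothetical.
# Run with, e.g., `mpiexec -n 4 python sketch.py`.
import numpy as np
from mpi4py import MPI


def assemble_global_shape_sketch(local_shape):
    comm = MPI.COMM_WORLD
    dims = MPI.Compute_dims(comm.Get_size(), len(local_shape))
    cart = comm.Create_cart(dims)

    global_shape = np.zeros(len(local_shape), dtype=np.int64)
    for i in range(len(local_shape)):
        # Keep only dimension i of the Cartesian topology.
        keep = [False] * len(local_shape)
        keep[i] = True
        sub = cart.Sub(keep)

        # The global extent along dimension i is the sum of the local
        # extents over the workers that tile that dimension.
        v0 = np.atleast_1d(np.int64(local_shape[i]))
        v1 = np.zeros(1, dtype=np.int64)
        sub.Allreduce(v0, v1, op=MPI.SUM)
        global_shape[i] = v1[0]
        sub.Free()

    cart.Free()
    return global_shape


# On a 2x2 grid where every rank holds a (3, 5) subtensor, this prints
# [ 6 10] on every rank.
print(assemble_global_shape_sketch([3, 5]))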
def _distdl_module_setup(self, input): r"""Distributed (feature) pooling module setup function. This function is called every time something changes in the input tensor structure. It should not be called manually. Parameters ---------- input : Tuple of forward inputs. See `torch.nn.Module.register_forward_pre_hook` for more details. """ self._distdl_is_setup = True self._input_tensor_structure = TensorStructure(input[0]) if not self.P_x.active: return # To compute the halo regions and interpolation, we need the global # tensor shape. This is not available until when the input is # provided. global_input_tensor_structure = \ self._distdl_backend.assemble_global_tensor_structure(input[0], self.P_x) if self.size is None: global_output_tensor_shape = torch.as_tensor( global_input_tensor_structure.shape).to(torch.float64) global_output_tensor_shape[2:] *= self.scale_factor # I prefer ceil(), torch uses floor(), so we go with floor for consistency global_output_tensor_shape = torch.Size( torch.floor(global_output_tensor_shape).to(torch.int64)) else: if len(self.size) != len(global_input_tensor_structure.shape): raise ValueError( "Provided size does not match input tensor dimension.") global_output_tensor_shape = torch.Size(torch.as_tensor(self.size)) global_output_tensor_structure = TensorStructure() global_output_tensor_structure.shape = global_output_tensor_shape # Using that information, we can get there rest of the halo information exchange_info = self._compute_exchange_info( self.P_x, global_input_tensor_structure, global_output_tensor_structure, self.scale_factor, self.mode, self.align_corners) halo_shape = exchange_info[0] recv_buffer_shape = exchange_info[1] send_buffer_shape = exchange_info[2] needed_ranges = exchange_info[3] self.halo_shape = halo_shape # We can also set up part of the halo layer. self.halo_layer = HaloExchange(self.P_x, halo_shape, recv_buffer_shape, send_buffer_shape, buffer_manager=self.buffer_manager) # We have to select out the "unused" entries. Sometimes there can # be "negative" halos. 
self.needed_slices = assemble_slices(needed_ranges[:, 0], needed_ranges[:, 1]) # TODO #176: This block to compute the start and stop index of the # post-halo exchanged input can be cleaned up, as it is a duplicate of # calculation in the halo layer itself _slice = tuple([slice(i, i + 1) for i in self.P_x.index] + [slice(None)]) x_subtensor_shapes = compute_subtensor_shapes_balanced( global_input_tensor_structure, self.P_x.shape) x_subtensor_start_indices = compute_subtensor_start_indices( x_subtensor_shapes) x_subtensor_stop_indices = compute_subtensor_stop_indices( x_subtensor_shapes) x_start_index = torch.from_numpy( x_subtensor_start_indices[_slice].squeeze()) x_stop_index = torch.from_numpy( x_subtensor_stop_indices[_slice].squeeze()) y_subtensor_shapes = compute_subtensor_shapes_balanced( global_output_tensor_structure, self.P_x.shape) y_subtensor_start_indices = compute_subtensor_start_indices( y_subtensor_shapes) y_subtensor_stop_indices = compute_subtensor_stop_indices( y_subtensor_shapes) y_start_index = torch.from_numpy( y_subtensor_start_indices[_slice].squeeze()) y_stop_index = torch.from_numpy( y_subtensor_stop_indices[_slice].squeeze()) x_start_index = self._compute_needed_start( y_start_index, global_input_tensor_structure.shape, global_output_tensor_structure.shape, self.scale_factor, self.mode, self.align_corners) x_stop_index = self._compute_needed_stop( y_stop_index - 1, global_input_tensor_structure.shape, global_output_tensor_structure.shape, self.scale_factor, self.mode, self.align_corners) self.interp_layer = Interpolate(x_start_index, x_stop_index, global_input_tensor_structure.shape, y_start_index, y_stop_index, global_output_tensor_structure.shape, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners)
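# --- Illustration (not part of DistDL): the global output-shape rule used
# in the setup above, isolated as a plain-torch sketch.  Feature dimensions
# (index 2 onward) are scaled and floored, matching torch's convention for
# interpolation output sizes; batch and channel dimensions are untouched.
import torch

global_input_shape = torch.as_tensor([8, 3, 10, 10]).to(torch.float64)
scale_factor = 1.5

global_output_shape = global_input_shape.clone()
global_output_shape[2:] *= scale_factor
global_output_shape = torch.Size(torch.floor(global_output_shape).to(torch.int64))

print(global_output_shape)  # torch.Size([8, 3, 15, 15])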
def broadcast_tensor_structure(input_tensor_structure, P_send, P_recv):

    output_tensor_structure = TensorStructure()

    if not P_send.active and not P_recv.active:
        return output_tensor_structure

    requests = []

    if P_send.active:
        # Share the torch dtype code, converted to an int.
        intID_dtype = torch_to_intID_dtype_dict[input_tensor_structure.dtype]
        send_intID_dtype = np.array([intID_dtype], dtype=int)
        req = P_send._comm.Iallreduce(MPI.IN_PLACE, send_intID_dtype, op=MPI.MAX)
        requests.append(req)

        # Need to send non-Python types, so convert the boolean temporarily
        rg_int_send = np.array([-1], dtype=int)
        rg_int_send[0] = 1 if input_tensor_structure.requires_grad else 0
        req = P_send._comm.Iallreduce(MPI.IN_PLACE, rg_int_send, op=MPI.MAX)
        requests.append(req)

        # Sending processes know the shape, so they can send a copy of the
        # data.  We will ignore this variable later.
        send_tensor_dim = np.array([len(input_tensor_structure.shape)], dtype=int)
        req = P_send._comm.Iallreduce(MPI.IN_PLACE, send_tensor_dim, op=MPI.MAX)
        requests.append(req)

        # Similarly, sending processes know the tensor shape, so they can
        # send a copy of it, but we will not use that copy for our actual
        # return value.
        send_tensor_shape = np.array(input_tensor_structure.shape, dtype=int)
        req = P_send._comm.Iallreduce(MPI.IN_PLACE, send_tensor_shape, op=MPI.MAX)
        requests.append(req)

    # If the process is a receiving process, but doesn't already know the
    # data because it is the _same_ sending process, then we receive the
    # results.  If it is a receiving process that sent data to a different
    # set of processes, we still have to complete the receive, even though
    # later we will not use that data.
    if (P_send != P_recv) and P_recv.active:

        # Everyone needs to receive these two values, but we don't need
        # them for future communication in this function, so we can defer
        # receiving the data.
        recv_intID_dtype = np.array([-1], dtype=int)
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE, recv_intID_dtype, op=MPI.MAX)
        requests.append(req)

        rg_int_recv = np.array([-1], dtype=int)
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE, rg_int_recv, op=MPI.MAX)
        requests.append(req)

        # We need this value for the next communication, so we have to wait
        # for it to complete before moving on.
        recv_tensor_dim = np.array([-1], dtype=int)
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE, recv_tensor_dim, op=MPI.MAX)
        req.Wait()

        recv_tensor_shape = np.zeros(recv_tensor_dim, dtype=int)
        recv_tensor_shape[:] = -1
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE, recv_tensor_shape, op=MPI.MAX)
        requests.append(req)

    # Make sure all requests, including the final recv all-reduce, complete
    # before receiving processes can actually copy the data out.
    MPI.Request.Waitall(requests)

    # Wait until the communication is complete to set these values.  Only
    # receiving ranks that do not have the data originally should enter here.
    if P_recv.active and (P_send != P_recv):
        output_tensor_structure.shape = torch.Size(recv_tensor_shape)
        output_tensor_structure.dtype = intID_to_torch_dtype_dict[recv_intID_dtype[0]]
        output_tensor_structure.requires_grad = bool(rg_int_recv[0])
    elif P_send == P_recv:
        output_tensor_structure.shape = input_tensor_structure.shape
        output_tensor_structure.dtype = input_tensor_structure.dtype
        output_tensor_structure.requires_grad = input_tensor_structure.requires_grad

    # Finally, every active worker should have valid data.  Any sending rank
    # created it from input data.  Any receiving-only rank used what it was
    # given.
    return output_tensor_structure
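# --- Illustration (not part of DistDL): the in-place MAX all-reduce trick
# used above, in a standalone sketch.  Ranks that know a metadata value
# contribute it; ranks that do not contribute -1.  Since every real value
# is non-negative, MAX leaves the known value on every rank.  Run with,
# e.g., `mpiexec -n 4 python sketch.py`.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Pretend rank 0 is the sending worker and 7 is its dtype code.
value = np.array([7 if rank == 0 else -1], dtype=np.int64)

req = comm.Iallreduce(MPI.IN_PLACE, value, op=MPI.MAX)
req.Wait()

print(f"rank {rank}: value = {value[0]}")  # 7 on every rank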
def _distdl_module_setup(self, input):
    r"""Distributed (feature) convolution module setup function.

    This function is called every time something changes in the input
    tensor structure.  It should not be called manually.

    Parameters
    ----------
    input :
        Tuple of forward inputs.  See
        `torch.nn.Module.register_forward_pre_hook` for more details.

    """

    self._distdl_is_setup = True
    self._input_tensor_structure = TensorStructure(input[0])

    if not self.P_x.active:
        return

    if self.serial:
        return

    # Compute global and local shapes with padding
    x_global_structure = \
        self._distdl_backend.assemble_global_tensor_structure(input[0], self.P_x)
    x_local_structure = TensorStructure(input[0])
    x_global_shape = x_global_structure.shape
    x_local_shape = x_local_structure.shape
    x_global_shape_after_pad = x_global_shape + 2 * self.global_padding
    x_local_shape_after_pad = x_local_shape + np.sum(self.local_padding,
                                                     axis=1, keepdims=False)
    x_local_structure_after_pad = TensorStructure(input[0])
    x_local_structure_after_pad.shape = x_local_shape_after_pad

    # We need to compute the halos with respect to the explicit padding.
    # So, we assume the padding is already added, then compute the halo
    # regions.
    compute_subtensor_shapes_unbalanced = \
        self._distdl_backend.tensor_decomposition.compute_subtensor_shapes_unbalanced
    subtensor_shapes = \
        compute_subtensor_shapes_unbalanced(x_local_structure_after_pad, self.P_x)

    # Using that information, we can get the rest of the halo information
    exchange_info = self._compute_exchange_info(x_global_shape_after_pad,
                                                self.kernel_size,
                                                self.stride,
                                                self._expand_parameter(0),
                                                self.dilation,
                                                self.P_x.active,
                                                self.P_x.shape,
                                                self.P_x.index,
                                                subtensor_shapes=subtensor_shapes)
    halo_shape = exchange_info[0]
    recv_buffer_shape = exchange_info[1]
    send_buffer_shape = exchange_info[2]
    needed_ranges = exchange_info[3]

    self.halo_shape = halo_shape

    # We can also set up part of the halo layer.
    self.halo_layer = HaloExchange(self.P_x,
                                   halo_shape,
                                   recv_buffer_shape,
                                   send_buffer_shape,
                                   buffer_manager=self.buffer_manager)

    # We have to select out the "unused" entries.  Sometimes there can
    # be "negative" halos.
    self.needed_slices = assemble_slices(needed_ranges[:, 0],
                                         needed_ranges[:, 1])
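# --- Illustration (not part of DistDL): a back-of-the-envelope check of the
# halo widths that _compute_exchange_info generalizes to arbitrary stride,
# padding, dilation, and unbalanced subtensors.  For stride 1 and a centered
# kernel, each worker needs dilation * (kernel_size - 1) // 2 entries from
# each neighbor; the helper name simple_halo_width is hypothetical.
import torch
import torch.nn.functional as F


def simple_halo_width(kernel_size, dilation=1):
    # Per-side halo for a stride-1, centered 1-D kernel.
    return dilation * (kernel_size - 1) // 2


# Convolving this worker's slice of a tensor, padded with its halo,
# reproduces that worker's slice of the full convolution.
x = torch.arange(16, dtype=torch.float64).reshape(1, 1, 16)
w = torch.ones(1, 1, 3, dtype=torch.float64)

full = F.conv1d(x, w)                    # length 14
halo = simple_halo_width(3)              # 1
left = F.conv1d(x[..., :8 + halo], w)    # this worker's 8 entries + halo
assert torch.equal(left, full[..., :left.shape[-1]])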