Example No. 1
def assemble_global_tensor_structure(local_tensor_structure, P_in, P_out=None):

    global_tensor_structure = TensorStructure()
    global_tensor_shape = None
    intID_dtype = None
    requires_grad_int = None

    if P_in.active:

        # Assemble the global shape
        global_tensor_shape = np.zeros(P_in.dim, dtype=int)
        for i in range(P_in.dim):

            keep = [False] * P_in.dim
            keep[i] = True

            P_sub = P_in.create_cartesian_subtopology_partition(keep)

            v0 = np.atleast_1d(int(local_tensor_structure.shape[i]))
            v1 = np.zeros(1, dtype=int)
            P_sub._comm.Allreduce(v0, v1, op=MPI.SUM)
            global_tensor_shape[i] = v1[0]

            # Free the subtopology resources
            P_sub.deactivate()

        # Get a communicable integer representing the dtype
        intID_dtype = torch_to_intID_dtype_dict[local_tensor_structure.dtype]
        intID_dtype = np.array([intID_dtype], dtype=int)

        requires_grad_int = np.array([-1], dtype=int)
        requires_grad_int[0] = 1 if local_tensor_structure.requires_grad else 0

        global_tensor_structure.shape = global_tensor_shape
        global_tensor_structure.dtype = local_tensor_structure.dtype
        global_tensor_structure.requires_grad = local_tensor_structure.requires_grad

    if P_out is not None and P_out.active:
        # Share the shape
        global_tensor_structure.shape = P_out.broadcast_data(
            global_tensor_shape, P_data=P_in)

        # Share the dtype
        intID_dtype = P_out.broadcast_data(intID_dtype, P_data=P_in)
        global_tensor_structure.dtype = intID_to_torch_dtype_dict[
            intID_dtype[0]]

        # Share the requires_grad status
        requires_grad_int = P_out.broadcast_data(requires_grad_int,
                                                 P_data=P_in)
        global_tensor_structure.requires_grad = bool(requires_grad_int[0])

    return global_tensor_structure
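
A minimal, MPI-free sketch of the shape-assembly idea above: along each dimension, the global extent is the sum of the local extents held by the workers on that dimension's one-dimensional subtopology. The names assemble_global_shape, local_shapes, and partition_shape below are illustrative stand-ins, not part of the DistDL API.

import numpy as np

def assemble_global_shape(local_shapes, partition_shape):
    # local_shapes: array of shape partition_shape + (dim,), holding each
    # worker's local tensor shape; returns the global tensor shape.
    dim = len(partition_shape)
    global_shape = np.zeros(dim, dtype=np.int64)
    for i in range(dim):
        # Walk along dimension i from the origin worker; every worker on
        # that line contributes its local extent in dimension i.
        index = [0] * dim
        for k in range(partition_shape[i]):
            index[i] = k
            global_shape[i] += local_shapes[tuple(index)][i]
    return global_shape

# Example: a 2 x 3 worker grid over a 10 x 9 tensor, split 5+5 by 3+3+3.
local_shapes = np.zeros((2, 3, 2), dtype=np.int64)
local_shapes[..., 0] = 5   # every worker holds 5 rows
local_shapes[..., 1] = 3   # every worker holds 3 columns
print(assemble_global_shape(local_shapes, (2, 3)))  # -> [10  9]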
Example No. 2
    def _distdl_module_setup(self, input):
        r"""Distributed (feature) upsampling module setup function.

        This function is called every time something changes in the input
        tensor structure.  It should not be called manually.

        Parameters
        ----------
        input :
            Tuple of forward inputs.  See
            `torch.nn.Module.register_forward_pre_hook` for more details.

        """

        self._distdl_is_setup = True
        self._input_tensor_structure = TensorStructure(input[0])

        if not self.P_x.active:
            return

        # To compute the halo regions and interpolation, we need the global
        # tensor shape.  This is not available until the input is
        # provided.
        global_input_tensor_structure = \
            self._distdl_backend.assemble_global_tensor_structure(input[0], self.P_x)

        if self.size is None:
            global_output_tensor_shape = torch.as_tensor(
                global_input_tensor_structure.shape).to(torch.float64)
            global_output_tensor_shape[2:] *= self.scale_factor

            # I prefer ceil(), but torch uses floor(), so we go with floor for consistency
            global_output_tensor_shape = torch.Size(
                torch.floor(global_output_tensor_shape).to(torch.int64))
        else:
            if len(self.size) != len(global_input_tensor_structure.shape):
                raise ValueError(
                    "Provided size does not match input tensor dimension.")
            global_output_tensor_shape = torch.Size(torch.as_tensor(self.size))
        global_output_tensor_structure = TensorStructure()
        global_output_tensor_structure.shape = global_output_tensor_shape

        # Using that information, we can get the rest of the halo information
        exchange_info = self._compute_exchange_info(
            self.P_x, global_input_tensor_structure,
            global_output_tensor_structure, self.scale_factor, self.mode,
            self.align_corners)
        halo_shape = exchange_info[0]
        recv_buffer_shape = exchange_info[1]
        send_buffer_shape = exchange_info[2]
        needed_ranges = exchange_info[3]

        self.halo_shape = halo_shape

        # We can also set up part of the halo layer.
        self.halo_layer = HaloExchange(self.P_x,
                                       halo_shape,
                                       recv_buffer_shape,
                                       send_buffer_shape,
                                       buffer_manager=self.buffer_manager)

        # We have to select out the "unused" entries.  Sometimes there can
        # be "negative" halos.
        self.needed_slices = assemble_slices(needed_ranges[:, 0],
                                             needed_ranges[:, 1])

        # TODO #176: This block to compute the start and stop index of the
        # post-halo exchanged input can be cleaned up, as it is a duplicate of
        # the calculation in the halo layer itself
        _slice = tuple([slice(i, i + 1)
                        for i in self.P_x.index] + [slice(None)])

        x_subtensor_shapes = compute_subtensor_shapes_balanced(
            global_input_tensor_structure, self.P_x.shape)
        x_subtensor_start_indices = compute_subtensor_start_indices(
            x_subtensor_shapes)
        x_subtensor_stop_indices = compute_subtensor_stop_indices(
            x_subtensor_shapes)

        x_start_index = torch.from_numpy(
            x_subtensor_start_indices[_slice].squeeze())
        x_stop_index = torch.from_numpy(
            x_subtensor_stop_indices[_slice].squeeze())

        y_subtensor_shapes = compute_subtensor_shapes_balanced(
            global_output_tensor_structure, self.P_x.shape)
        y_subtensor_start_indices = compute_subtensor_start_indices(
            y_subtensor_shapes)
        y_subtensor_stop_indices = compute_subtensor_stop_indices(
            y_subtensor_shapes)

        y_start_index = torch.from_numpy(
            y_subtensor_start_indices[_slice].squeeze())
        y_stop_index = torch.from_numpy(
            y_subtensor_stop_indices[_slice].squeeze())

        x_start_index = self._compute_needed_start(
            y_start_index, global_input_tensor_structure.shape,
            global_output_tensor_structure.shape, self.scale_factor, self.mode,
            self.align_corners)

        x_stop_index = self._compute_needed_stop(
            y_stop_index - 1, global_input_tensor_structure.shape,
            global_output_tensor_structure.shape, self.scale_factor, self.mode,
            self.align_corners)

        self.interp_layer = Interpolate(x_start_index,
                                        x_stop_index,
                                        global_input_tensor_structure.shape,
                                        y_start_index,
                                        y_stop_index,
                                        global_output_tensor_structure.shape,
                                        scale_factor=self.scale_factor,
                                        mode=self.mode,
                                        align_corners=self.align_corners)
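
When size is not given, the global output shape above is obtained by scaling the feature dimensions (index 2 onward) by scale_factor and flooring, following torch's convention. Below is a small, self-contained sketch of that rule; scaled_output_shape is a hypothetical helper, not part of the module's API.

import torch

def scaled_output_shape(global_input_shape, scale_factor):
    # Scale only the feature dimensions (index 2 and onward), then floor,
    # mirroring the floor-based rule used in the setup code above.
    shape = torch.as_tensor(global_input_shape).to(torch.float64)
    shape[2:] *= scale_factor
    return torch.Size(torch.floor(shape).to(torch.int64).tolist())

print(scaled_output_shape((1, 3, 7, 7), 1.5))  # -> torch.Size([1, 3, 10, 10])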
Example No. 3
def broadcast_tensor_structure(input_tensor_structure, P_send, P_recv):

    output_tensor_structure = TensorStructure()

    if not P_send.active and not P_recv.active:
        return output_tensor_structure

    requests = []

    if P_send.active:
        # Share the torch dtype code, converted to an int.
        intID_dtype = torch_to_intID_dtype_dict[input_tensor_structure.dtype]
        send_intID_dtype = np.array([intID_dtype], dtype=int)
        req = P_send._comm.Iallreduce(MPI.IN_PLACE,
                                      send_intID_dtype,
                                      op=MPI.MAX)
        requests.append(req)

        # Need to send non-Python types, so convert the boolean temporarily
        rg_int_send = np.array([-1], dtype=int)
        rg_int_send[0] = 1 if input_tensor_structure.requires_grad else 0
        req = P_send._comm.Iallreduce(MPI.IN_PLACE, rg_int_send, op=MPI.MAX)
        requests.append(req)

        # Sending processes know the tensor dimension, so they can send a
        # copy of it.  We will ignore this variable later.
        send_tensor_dim = np.array([len(input_tensor_structure.shape)],
                                   dtype=int)
        req = P_send._comm.Iallreduce(MPI.IN_PLACE,
                                      send_tensor_dim,
                                      op=MPI.MAX)
        requests.append(req)

        # Similarly, sending processes know the tensor shape, so they can send
        # a copy of it, but we will not use that copy for our actual return
        # value.
        send_tensor_shape = np.array(input_tensor_structure.shape,
                                     dtype=int)
        req = P_send._comm.Iallreduce(MPI.IN_PLACE,
                                      send_tensor_shape,
                                      op=MPI.MAX)
        requests.append(req)

    # If this is a receiving process that does not already know the data
    # (i.e., it is not also the sending process), then it receives the
    # results here.  A receiving process that also sent data to a different
    # set of processes must still complete the receive, even though it will
    # not use that data later.
    if (P_send != P_recv) and P_recv.active:

        # Everyone needs to receive these two values, but we don't need them
        # for future communication in this function so we can defer receiving
        # the data.
        recv_intID_dtype = np.array([-1], dtype=int)
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE,
                                      recv_intID_dtype,
                                      op=MPI.MAX)
        requests.append(req)

        rg_int_recv = np.array([-1], dtype=int)
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE, rg_int_recv, op=MPI.MAX)
        requests.append(req)

        # We need this value for the next communication, so we have to wait
        # for it to complete before moving on.
        recv_tensor_dim = np.array([-1], dtype=int)
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE,
                                      recv_tensor_dim,
                                      op=MPI.MAX)
        req.Wait()

        recv_tensor_shape = np.zeros(recv_tensor_dim, dtype=int)
        recv_tensor_shape[:] = -1
        req = P_recv._comm.Iallreduce(MPI.IN_PLACE,
                                      recv_tensor_shape,
                                      op=MPI.MAX)
        requests.append(req)

    # Make sure all requests, including the final recv allreduce, complete
    # before receiving processes can actually copy the data out.
    MPI.Request.Waitall(requests)

    # Wait until the communication is complete to set these values.  Only
    # receiving ranks that do not have the data originally should enter here.
    if P_recv.active and (P_send != P_recv):
        output_tensor_structure.shape = torch.Size(recv_tensor_shape)
        output_tensor_structure.dtype = intID_to_torch_dtype_dict[
            recv_intID_dtype[0]]
        output_tensor_structure.requires_grad = bool(rg_int_recv[0])

    elif P_send == P_recv:
        output_tensor_structure.shape = input_tensor_structure.shape
        output_tensor_structure.dtype = input_tensor_structure.dtype
        output_tensor_structure.requires_grad = input_tensor_structure.requires_grad

    # Finally, every active worker should have valid data.  Any sending rank
    # created it from input data.  Any receiving _only_ rank used what it was
    # given.
    return output_tensor_structure
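
The communication above relies on a simple trick: an in-place allreduce with MPI.MAX behaves like a broadcast when the receiving ranks contribute a sentinel (-1) that any real value dominates. Below is a rough, MPI-free sketch of that idea; the names allreduce_max, sender, and receivers are illustrative only.

import numpy as np

def allreduce_max(contributions):
    # Every rank ends up with the element-wise maximum over all buffers.
    return np.maximum.reduce(contributions)

sender = np.array([4], dtype=np.int64)        # rank that knows the dtype code
receivers = [np.array([-1], dtype=np.int64)   # ranks starting with the sentinel
             for _ in range(3)]
print(allreduce_max([sender, *receivers]))    # -> [4]; everyone gets the sender's value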
Example No. 4
    def _distdl_module_setup(self, input):
        r"""Distributed (feature) convolution module setup function.

        This function is called every time something changes in the input
        tensor structure.  It should not be called manually.

        Parameters
        ----------
        input :
            Tuple of forward inputs.  See
            `torch.nn.Module.register_forward_pre_hook` for more details.

        """

        self._distdl_is_setup = True
        self._input_tensor_structure = TensorStructure(input[0])

        if not self.P_x.active:
            return

        if self.serial:
            return

        # Compute global and local shapes with padding
        x_global_structure = \
            self._distdl_backend.assemble_global_tensor_structure(input[0], self.P_x)
        x_local_structure = TensorStructure(input[0])
        x_global_shape = x_global_structure.shape
        x_local_shape = x_local_structure.shape
        x_global_shape_after_pad = x_global_shape + 2 * self.global_padding
        x_local_shape_after_pad = x_local_shape + np.sum(
            self.local_padding, axis=1, keepdims=False)
        x_local_structure_after_pad = TensorStructure(input[0])
        x_local_structure_after_pad.shape = x_local_shape_after_pad

        # We need to compute the halos with respect to the explicit padding.
        # So, we assume the padding is already added, then compute the halo regions.
        compute_subtensor_shapes_unbalanced = \
            self._distdl_backend.tensor_decomposition.compute_subtensor_shapes_unbalanced
        subtensor_shapes = \
            compute_subtensor_shapes_unbalanced(x_local_structure_after_pad, self.P_x)

        # Using that information, we can get the rest of the halo information
        exchange_info = self._compute_exchange_info(
            x_global_shape_after_pad,
            self.kernel_size,
            self.stride,
            self._expand_parameter(0),
            self.dilation,
            self.P_x.active,
            self.P_x.shape,
            self.P_x.index,
            subtensor_shapes=subtensor_shapes)
        halo_shape = exchange_info[0]
        recv_buffer_shape = exchange_info[1]
        send_buffer_shape = exchange_info[2]
        needed_ranges = exchange_info[3]

        self.halo_shape = halo_shape

        # We can also set up part of the halo layer.
        self.halo_layer = HaloExchange(self.P_x,
                                       halo_shape,
                                       recv_buffer_shape,
                                       send_buffer_shape,
                                       buffer_manager=self.buffer_manager)

        # We have to select out the "unused" entries.  Sometimes there can
        # be "negative" halos.
        self.needed_slices = assemble_slices(needed_ranges[:, 0],
                                             needed_ranges[:, 1])
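
A minimal sketch of what the "needed slices" trimming amounts to: the per-dimension [start, stop) ranges become a tuple of slices applied to the post-exchange local tensor. build_slices is a hypothetical stand-in for assemble_slices, shown only for illustration.

import torch

def build_slices(starts, stops):
    # One slice per dimension; negative halos simply shrink the kept range.
    return tuple(slice(int(a), int(b)) for a, b in zip(starts, stops))

x = torch.arange(5 * 6).reshape(5, 6)
needed = build_slices([1, 0], [4, 6])   # keep rows 1..3 and all columns
print(x[needed].shape)                  # -> torch.Size([3, 6])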