Example #1
def test_potentially_deadlocked_send_recv_pairs(barrier_fence_fixture,
                                                comm_split_fixture, P_x_ranks,
                                                P_x_shape, P_w_ranks,
                                                P_w_shape):

    import torch

    from distdl.backends.mpi.partition import MPIPartition
    from distdl.nn.broadcast import Broadcast

    # `use_cuda` is assumed to be a module-level flag in the test configuration.
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Isolate the minimum needed ranks
    base_comm, active = comm_split_fixture
    if not active:
        return
    P_world = MPIPartition(base_comm)

    # Create the partitions
    P_x_base = P_world.create_partition_inclusive(P_x_ranks)
    P_x = P_x_base.create_cartesian_topology_partition(P_x_shape)

    P_w_base = P_world.create_partition_inclusive(P_w_ranks)
    P_w = P_w_base.create_cartesian_topology_partition(P_w_shape)

    layer = Broadcast(P_x, P_w)  # noqa F841
    layer = layer.to(device)

    P_world.deactivate()
    P_x_base.deactivate()
    P_x.deactivate()
    P_w_base.deactivate()
    P_w.deactivate()
Example #2
    def __init__(self, P_x, P_y, P_w, in_features, out_features, bias=True):

        super(DistributedLinear, self).__init__()

        # P_x ~ 1 X P_fi
        self.P_x = P_x
        # P_y ~ 1 X P_fo
        self.P_y = P_y
        # P_w ~ P_fo X P_fi
        self.P_w = P_w

        self.bias = bias

        self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)

        if self.P_w.active:
            local_in_features = compute_subshape(P_w.shape[1], P_w.index[1],
                                                 in_features)
            local_out_features = compute_subshape(P_w.shape[0], P_w.index[0],
                                                  out_features)
            # On column 0, use the specified bias, otherwise no bias to
            # prevent double counting
            bias = self.bias if (self.P_w.index[-1] == 0) else False
            self.sublinear = torch.nn.Linear(local_in_features[0],
                                             local_out_features[0],
                                             bias=bias)

        self.y_sum_reduce = SumReduce(self.P_w,
                                      self.P_y,
                                      transpose_src=True,
                                      preserve_batch=True)
Example #3
def test_broadcast_adjoint(barrier_fence_fixture, comm_split_fixture,
                           P_x_ranks, P_x_shape, P_y_ranks, P_y_shape,
                           x_global_shape, transpose_src):

    import numpy as np
    import torch

    from distdl.backends.mpi.partition import MPIPartition
    from distdl.nn.broadcast import Broadcast
    from distdl.utilities.torch import zero_volume_tensor

    # Isolate the minimum needed ranks
    base_comm, active = comm_split_fixture
    if not active:
        return
    P_world = MPIPartition(base_comm)

    # Create the partitions
    P_x_base = P_world.create_partition_inclusive(P_x_ranks)
    P_x = P_x_base.create_cartesian_topology_partition(P_x_shape)

    P_y_base = P_world.create_partition_inclusive(P_y_ranks)
    P_y = P_y_base.create_cartesian_topology_partition(P_y_shape)

    # TODO #93: Change this to create a subtensor so we test when local tensors
    # have different shape.  Then, the output size will also be different, which
    # we will have to get from `y` itself.
    x_local_shape = np.asarray(x_global_shape)

    layer = Broadcast(P_x,
                      P_y,
                      transpose_src=transpose_src,
                      preserve_batch=False)

    x = zero_volume_tensor()
    if P_x.active:
        x = torch.Tensor(np.random.randn(*x_local_shape))
    x.requires_grad = True

    dy = zero_volume_tensor()
    if P_y.active:
        # Adjoint Input
        dy = torch.Tensor(np.random.randn(*x_local_shape))

    # y = F @ x
    y = layer(x)

    # dx = F* @ dy
    y.backward(dy)
    dx = x.grad

    x = x.detach()
    dx = dx.detach()
    dy = dy.detach()
    y = y.detach()

    check_adjoint_test_tight(P_world, x, dx, y, dy)
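
The two comments above, "y = F @ x" and "dx = F* @ dy", state the adjoint identity that check_adjoint_test_tight is presumed to verify: <y, dy> must equal <x, dx> up to floating-point error. A minimal single-process sketch of that identity for a broadcast/sum pair, using plain NumPy (the sizes and names are illustrative, not DistDL API):

import numpy as np

# Forward map of a broadcast: copy x to k workers.  Its adjoint is the sum
# over the k copies.  These arrays stand in for the distributed tensors.
n, k = 4, 3
x = np.random.randn(n)          # lives on P_x
dy = np.random.randn(k, n)      # adjoint input, one row per worker in P_y

y = np.tile(x, (k, 1))          # y  = F  @ x   (broadcast)
dx = dy.sum(axis=0)             # dx = F* @ dy  (sum-reduce)

# Adjoint identity: <y, dy> == <x, dx> up to round-off
assert np.isclose((y * dy).sum(), (x * dx).sum())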
Example #4
    def __init__(self, P_x, P_y, P_w, in_features, out_features, bias=True):

        super(DistributedLinear, self).__init__()

        # P_x ~ 1 X P_fi
        self.P_x = P_x
        # P_y ~ 1 X P_fo
        self.P_y = P_y
        # P_w ~ P_fo X P_fi
        self.P_w = P_w

        # Bias flag
        self.bias = bias

        # Broadcast layer in the x-tensor
        self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)

        # Each worker in P_w computes its own portion of the weight tensor and
        # then stores its own PyTorch Linear layer.  Only the 0th column of the
        # weight partition also stores a bias.
        if self.P_w.active:
            local_in_features = compute_subshape(P_w.shape[1], P_w.index[1],
                                                 in_features)
            local_out_features = compute_subshape(P_w.shape[0], P_w.index[0],
                                                  out_features)
            # On column 0, use the specified bias, otherwise no bias to
            # prevent double counting
            bias = self.bias if (self.P_w.index[-1] == 0) else False
            self.sublinear = torch.nn.Linear(local_in_features[0],
                                             local_out_features[0],
                                             bias=bias)

        # Sum-reduce layer to get the y-tensor
        self.y_sum_reduce = SumReduce(self.P_w,
                                      self.P_y,
                                      transpose_src=True,
                                      preserve_batch=True)
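
The comments above fix the partition shapes this layer expects: P_x ~ 1 x P_fi, P_y ~ 1 x P_fo, and P_w ~ P_fo x P_fi. A construction sketch following the partition-building pattern of the test examples in this listing; it assumes an MPI run with at least 4 ranks and that DistributedLinear is importable from distdl.nn.linear (the rank lists and sizes are illustrative):

from mpi4py import MPI

from distdl.backends.mpi.partition import MPIPartition
from distdl.nn.linear import DistributedLinear  # assumed import path

P_world = MPIPartition(MPI.COMM_WORLD)

# P_fi = P_fo = 2: P_x ~ [1, 2], P_y ~ [1, 2], P_w ~ [2, 2]
P_x_base = P_world.create_partition_inclusive([0, 1])
P_x = P_x_base.create_cartesian_topology_partition([1, 2])

P_y_base = P_world.create_partition_inclusive([2, 3])
P_y = P_y_base.create_cartesian_topology_partition([1, 2])

P_w_base = P_world.create_partition_inclusive([0, 1, 2, 3])
P_w = P_w_base.create_cartesian_topology_partition([2, 2])

layer = DistributedLinear(P_x, P_y, P_w, in_features=16, out_features=8)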
Example #5
    def __init__(self,
                 P_x,
                 P_y,
                 P_w,
                 in_channels=1,
                 out_channels=1,
                 bias=True,
                 *args,
                 **kwargs):

        super(DistributedGeneralConvBase, self).__init__()

        # P_x is 1    x P_ci x P_d-1 x ... x P_0
        self.P_x = P_x
        # P_y is 1    x P_co x P_d-1 x ... x P_0
        self.P_y = P_y
        # P_w is P_co x P_ci x P_d-1 x ... x P_0
        self.P_w = P_w

        self.P_union = self._distdl_backend.Partition()
        if not (self.P_x.active or self.P_y.active or self.P_w.active):
            return

        # This guarantees that P_union rank 0 has the kernel size, stride,
        # padding, and dilation factors
        P_union = P_w.create_partition_union(P_x)
        P_union = P_union.create_partition_union(P_y)
        self.P_union = P_union

        P_w_shape = None
        if P_union.rank == 0:
            P_w_shape = np.array(P_w.shape, dtype=int)
        P_w_shape = P_union.broadcast_data(P_w_shape, root=0)

        P_co = P_w_shape[0]
        P_ci = P_w_shape[1]
        P_channels = [P_co, P_ci]

        P_x_new_shape = []
        if self.P_x.active:
            if (np.any(P_x.shape[2:] != P_w_shape[2:])):
                raise ValueError(
                    "Spatial components of P_x and P_w must match.")
            if P_w_shape[1] != P_x.shape[1]:
                raise ValueError(
                    "Index 2 of P_w dimension must match input channel partition."
                )
            P_x_new_shape = list(P_x.shape)
            P_x_new_shape.insert(1, 1)
            # Currently a hack, removing the batch dimension because P_w does
            # not have one. This is OK because we assume there are no partitions
            # in the batch dimension.
            P_x_new_shape = np.asarray(P_x_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_x to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_x = self.P_x.create_cartesian_topology_partition(P_x_new_shape)

        P_y_new_shape = []
        if self.P_y.active:
            if (np.any(P_y.shape[2:] != P_w_shape[2:])):
                raise ValueError(
                    "Spatial components of P_y and P_w must match.")
            if P_w_shape[0] != P_y.shape[1]:
                raise ValueError(
                    "Index 1 of P_w dimension must match output channel partition."
                )
            P_y_new_shape = list(P_y.shape)
            P_y_new_shape.insert(2, 1)
            # Currently a hack, removing the batch dimension because P_w does
            # not have one. This is OK because we assume there are no partitions
            # in the batch dimension.
            P_y_new_shape = np.asarray(P_y_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_y to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_y = self.P_y.create_cartesian_topology_partition(P_y_new_shape)

        P_spatial = P_w_shape[2:]

        self.serial = False
        if self.P_w.size == 1:
            self.serial = True
            self.conv_layer = self.TorchConvType(*args, **kwargs)
            return

        self.receives_weight = False
        self.stores_weight = False
        self.receives_bias = False
        self.stores_bias = False

        # Determine P_r, initialize weights there
        if self.P_w.active:
            # All of P_w always receives the weight
            self.receives_weight = True

            # This subset is taken to be the origin of the spatial component
            w_root_subset = []
            for i, c in enumerate(range_index(P_w.shape)):
                c = np.asarray(c)
                # Find the P_co x P_ci x 1 x ... x 1 subset to store the weights
                if np.all(c[2:] == 0):
                    w_root_subset.append(i)

            self.P_wr_base = self.P_w.create_partition_inclusive(w_root_subset)
            # ones are needed so the broadcast will work
            self.P_wr = self.P_wr_base.create_cartesian_topology_partition(
                [P_co, P_ci] + [1] * len(P_spatial))
            self.stores_weight = self.P_wr.active

            b_subset = []
            for i, c in enumerate(range_index(P_w.shape)):
                c = np.asarray(c)
                # Find the P_co x 1 x P_0 x ... x P_D-1 subset that needs
                # biases in its calculation.  This is everywhere that the
                # input-channel index is 0.
                if c[1] == 0:
                    b_subset.append(i)

            self.P_b_base = self.P_w.create_partition_inclusive(b_subset)
            self.P_b = self.P_b_base.create_cartesian_topology_partition(
                [P_co] + [1] + list(P_spatial))
            self.receives_bias = self.P_b.active and bias

            # Now find the subset of _that_ which actually stores the learnable parameter.
            b_root_subset = []
            for i, c in enumerate(range_index(P_w.shape)):
                c = np.asarray(c)
                # Find the P_co x 1 x 1 x ... x 1 subset to store the biases
                if np.all(c[1:] == 0):
                    b_root_subset.append(i)

            self.P_br_base = self.P_w.create_partition_inclusive(b_root_subset)
            # ones are needed so the broadcast will work
            self.P_br = self.P_br_base.create_cartesian_topology_partition(
                [P_co] + [1] + [1] * len(P_spatial))
            self.stores_bias = self.P_br.active and bias

            # Correct the input arguments based on local properties
            local_kwargs = {}
            local_kwargs.update(kwargs)

            # Do this before checking serial so that the layer works properly
            # in the serial case
            local_channels = compute_subshape(P_channels, P_w.index[0:2],
                                              [out_channels, in_channels])
            local_out_channels, local_in_channels = local_channels
            local_kwargs["in_channels"] = local_in_channels
            local_kwargs["out_channels"] = local_out_channels

            local_kwargs["bias"] = self.receives_bias
            self.conv_layer = self.TorchConvType(*args, **local_kwargs)

            # If we store the weight it is a learnable parameter iff it is
            # learnable by default in the layer, which it is.
            if self.stores_weight:
                self._weight = torch.nn.Parameter(
                    self.conv_layer.weight.detach())
            else:
                self._weight = zero_volume_tensor()
            # This always exists so we can copy the property
            self._weight.requires_grad = self.conv_layer.weight.requires_grad

            # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
            new_weight = self.conv_layer.weight.detach() * 0
            new_weight.requires_grad = self.conv_layer.weight.requires_grad
            del self.conv_layer.weight
            self.conv_layer.weight = new_weight

            # If we store the bias, it is a learnable parameter iff it is
            # learnable by default in the layer, which is only true if it
            # exists.
            if self.stores_bias:
                self._bias = torch.nn.Parameter(self.conv_layer.bias.detach())
            else:
                self._bias = zero_volume_tensor()
            # This does not always exist, but when it does we can copy the
            # property.
            if self.receives_bias:
                self._bias.requires_grad = self.conv_layer.bias.requires_grad

                # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
                new_bias = self.conv_layer.bias.detach() * 0
                new_bias.requires_grad = self.conv_layer.bias.requires_grad
                del self.conv_layer.bias
                self.conv_layer.bias = new_bias

        # Now we need to share the kernel structure.  The size of the kernel
        # is always the spatial dimensions.
        self.conv_kernel_size = None
        self.conv_stride = None
        self.conv_padding = None
        self.conv_dilation = None
        if P_union.rank == 0:
            self.conv_kernel_size = np.array(self.conv_layer.kernel_size,
                                             dtype=int)
            self.conv_stride = np.array(self.conv_layer.stride, dtype=int)
            self.conv_padding = np.array(self.conv_layer.padding, dtype=int)
            self.conv_dilation = np.array(self.conv_layer.dilation,
                                          dtype=int)
        self.conv_kernel_size = P_union.broadcast_data(self.conv_kernel_size,
                                                       root=0)
        self.conv_stride = P_union.broadcast_data(self.conv_stride, root=0)
        self.conv_padding = P_union.broadcast_data(self.conv_padding, root=0)
        self.conv_dilation = P_union.broadcast_data(self.conv_dilation, root=0)

        # We need the halo shape, and other info, to fully populate the pad,
        # halo exchange, and unpad layers.  For pad and unpad, we defer their
        # construction to the pre-forward hook.

        self.pad_layer = None
        self.unpad_layer = None

        # We need to be able to remove some data from the input to the conv
        # layer.
        self.needed_slices = None

        # For the halo layer we also defer construction, so that we can have
        # the halo shape for the input.  The halo will allocate its own
        # buffers, but it needs this information at construction to be able
        # to do this in the pre-forward hook.

        self.halo_layer = None

        # Variables for tracking input changes and buffer construction
        self._distdl_is_setup = False
        self._input_shape = None
        self._input_requires_grad = None

        if P_w.active:
            self.w_broadcast = Broadcast(self.P_wr,
                                         self.P_w,
                                         preserve_batch=False)

        if self.receives_bias or self.stores_bias:
            self.b_broadcast = Broadcast(self.P_br,
                                         self.P_b,
                                         preserve_batch=False)

        self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)
        self.y_sum_reduce = SumReduce(self.P_w, self.P_y, preserve_batch=True)
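
The weight- and bias-root subsets above are built by enumerating the Cartesian indices of P_w and selecting workers by their channel and spatial indices. A stand-alone sketch of that selection, using itertools.product in place of DistDL's range_index (which is presumed to enumerate indices in the same row-major order); the P_w shape is illustrative:

from itertools import product

import numpy as np

# Illustrative P_w: P_co=2, P_ci=3, and a 2 x 2 spatial decomposition (24 workers).
P_w_shape = [2, 3, 2, 2]

w_root_subset, b_subset, b_root_subset = [], [], []
for i, c in enumerate(product(*[range(n) for n in P_w_shape])):
    c = np.asarray(c)
    if np.all(c[2:] == 0):      # P_co x P_ci x 1 x ... x 1: stores the weights
        w_root_subset.append(i)
    if c[1] == 0:               # input-channel index 0: used in bias calculation
        b_subset.append(i)
    if np.all(c[1:] == 0):      # P_co x 1 x 1 x ... x 1: stores the bias
        b_root_subset.append(i)

print(w_root_subset)   # [0, 4, 8, 12, 16, 20] -> one worker per (co, ci) pair
print(b_subset)        # the whole first input-channel "column" of P_w
print(b_root_subset)   # [0, 12] -> one worker per output-channel block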
Example #6
    def __init__(self,
                 P_x,
                 P_y,
                 P_w,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 padding_mode='zeros',
                 dilation=1,
                 groups=1,
                 bias=True,
                 *args,
                 **kwargs):

        super(DistributedChannelConvBase, self).__init__()

        # P_x is 1    x P_ci x 1 x ... x 1
        self.P_x = P_x
        # P_y is 1    x P_co x 1 x ... x 1
        self.P_y = P_y
        # P_w is P_co x P_ci x 1 x ... x 1
        self.P_w = P_w

        # Even inactive workers need some partition union
        P_union = self._distdl_backend.Partition()

        if not (self.P_x.active or self.P_y.active or self.P_w.active):
            return

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = self._expand_parameter(kernel_size)
        self.stride = self._expand_parameter(stride)
        self.padding = self._expand_parameter(padding)
        self.padding_mode = padding_mode
        self.dilation = self._expand_parameter(dilation)
        self.groups = groups
        self.use_bias = bias

        # This guarantees that P_union rank 0 has the kernel size, stride,
        # padding, and dilation factors
        P_union_temp = P_w.create_partition_union(P_x)
        P_union = P_union_temp.create_partition_union(P_y)

        # Ensure that all workers have the full size and structure of P_w
        P_w_shape = None
        if P_union.rank == 0:
            P_w_shape = np.array(P_w.shape, dtype=int)
        P_w_shape = P_union.broadcast_data(P_w_shape, root=0)

        # Release the temporary resources
        P_union_temp.deactivate()
        P_union.deactivate()

        P_co = P_w_shape[0]
        P_ci = P_w_shape[1]
        P_channels = [P_co, P_ci]

        # Ensure that P_x and P_w are correctly aligned.  We also produce a
        # new P_x that is shaped like 1 x P_ci x 1 x ... x 1, to assist with
        # broadcasts.
        P_x_new_shape = []
        if self.P_x.active:
            if (np.any(P_x.shape[2:] != P_w_shape[2:])):
                raise ValueError(
                    "Spatial components of P_x and P_w must match.")
            if (np.any(P_x.shape[2:] != np.ones(len(P_x.shape[2:])))):
                raise ValueError(
                    "Spatial components of P_x must be 1 x ... x 1.")
            if P_w_shape[1] != P_x.shape[1]:
                raise ValueError(
                    "Index 2 of P_w dimension must match input channel partition."
                )
            P_x_new_shape = list(P_x.shape)
            P_x_new_shape.insert(1, 1)
            # Currently a hack, removing the batch dimension because P_w does
            # not have one. This is OK because we assume there are no partitions
            # in the batch dimension.
            P_x_new_shape = np.asarray(P_x_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_x to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_x = self.P_x.create_cartesian_topology_partition(P_x_new_shape)

        # Ensure that P_y and P_w are correctly aligned.  We also produce a
        # new P_y that is shaped like P_co x 1 x 1 x ... x 1, to assist with
        # broadcasts.
        P_y_new_shape = []
        if self.P_y.active:
            if (np.any(P_y.shape[2:] != P_w_shape[2:])):
                raise ValueError(
                    "Spatial components of P_y and P_w must match.")
            if (np.any(P_y.shape[2:] != np.ones(len(P_y.shape[2:])))):
                raise ValueError(
                    "Spatial components of P_y must be 1 x ... x 1.")
            if P_w_shape[0] != P_y.shape[1]:
                raise ValueError(
                    "Index 1 of P_w dimension must match output channel partition."
                )
            P_y_new_shape = list(P_y.shape)
            P_y_new_shape.insert(2, 1)
            # Currently a hack, removing the batch dimension because P_w does
            # not have one. This is OK because we assume there are no partitions
            # in the batch dimension.
            P_y_new_shape = np.asarray(P_y_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_y to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_y = self.P_y.create_cartesian_topology_partition(P_y_new_shape)

        self.serial = self.P_w.size == 1

        if self.serial:
            self.conv_layer = self.TorchConvType(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                padding_mode=self.padding_mode,
                dilation=self.dilation,
                groups=self.groups,
                bias=self.use_bias)
            self.weight = self.conv_layer.weight
            self.bias = self.conv_layer.bias
            return

        # Flag if the global bias is set
        self.global_bias = bias

        # Flags if current worker stores (part of) the bias locally.
        self.stores_bias = False

        if self.P_w.active:

            # Let the P_co column store the bias if it is to be used
            self.stores_bias = self.P_w.index[1] == 0 and self.use_bias

            # Correct the input arguments based on local properties
            # This ensures that the in and out channels are correctly shared.
            local_co, local_ci = compute_subshape(P_channels, P_w.index[0:2],
                                                  [out_channels, in_channels])
            self.conv_layer = self.TorchConvType(
                in_channels=local_ci,
                out_channels=local_co,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                padding_mode=self.padding_mode,
                dilation=self.dilation,
                groups=groups,
                bias=self.stores_bias)

        # Workers in P_w alias the conv layer to get their weight and perhaps
        # biases.  Every other worker doesn't have a weight or bias.
        if self.P_w.active:
            self.weight = self.conv_layer.weight
            if self.stores_bias:
                self.bias = self.conv_layer.bias
            else:
                if self.use_bias:
                    self.register_buffer('bias', zero_volume_tensor())
                else:
                    self.register_buffer('bias', None)
        else:
            self.register_buffer('weight', zero_volume_tensor())
            if self.use_bias:
                self.register_buffer('bias', zero_volume_tensor())
            else:
                self.register_buffer('bias', None)

        # Variables for tracking input changes and buffer construction
        self._distdl_is_setup = False
        self._input_tensor_structure = TensorStructure()

        self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)
        self.y_sum_reduce = SumReduce(self.P_w, self.P_y, preserve_batch=True)
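
The compute_subshape calls above split the global channel counts across the P_co x P_ci grid of P_w. A small sketch of the balanced block split it is presumed to perform, written out directly so the arithmetic is visible (the function name and sizes are illustrative, not DistDL API):

def balanced_subshape(num_workers, index, global_size):
    # Split global_size into num_workers nearly equal blocks; the first
    # (global_size % num_workers) blocks each get one extra element.
    base, rem = divmod(global_size, num_workers)
    return base + (1 if index < rem else 0)

# out_channels=10 over P_co=3 rows, in_channels=7 over P_ci=2 columns
print([balanced_subshape(3, i, 10) for i in range(3)])  # [4, 3, 3]
print([balanced_subshape(2, i, 7) for i in range(2)])   # [4, 3]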
Example #7
    def __init__(self,
                 P_x,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 padding_mode='zeros',
                 dilation=1,
                 groups=1,
                 bias=True,
                 buffer_manager=None):

        super(DistributedFeatureConvBase, self).__init__()

        # P_x is 1 x 1 x P_d-1 x ... x P_0
        self.P_x = P_x

        # Back-end specific buffer manager for economic buffer allocation
        if buffer_manager is None:
            buffer_manager = self._distdl_backend.BufferManager()
        elif type(buffer_manager) is not self._distdl_backend.BufferManager:
            raise ValueError("Buffer manager type does not match backend.")
        self.buffer_manager = buffer_manager

        if not self.P_x.active:
            return

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = self._expand_parameter(kernel_size)
        self.stride = self._expand_parameter(stride)
        self.padding = self._expand_parameter(padding)
        self.padding_mode = padding_mode
        self.dilation = self._expand_parameter(dilation)
        self.groups = groups
        self.use_bias = bias

        self.serial = self.P_x.size == 1

        if self.serial:
            self.conv_layer = self.TorchConvType(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                padding_mode=self.padding_mode,
                dilation=self.dilation,
                groups=self.groups,
                bias=self.use_bias)
            self.weight = self.conv_layer.weight
            self.bias = self.conv_layer.bias
        else:
            self.conv_layer = self.TorchConvType(in_channels=in_channels,
                                                 out_channels=out_channels,
                                                 kernel_size=self.kernel_size,
                                                 stride=self.stride,
                                                 padding=0,
                                                 padding_mode='zeros',
                                                 dilation=self.dilation,
                                                 groups=groups,
                                                 bias=bias)

        if self.serial:
            return

        dims = len(self.P_x.shape)

        # We will be using global padding to compute local padding,
        # so expand it to a numpy array
        global_padding = np.pad(self.padding,
                                pad_width=(dims - len(self.padding), 0),
                                mode='constant',
                                constant_values=0)
        self.global_padding = global_padding

        pad_left_right = self.global_padding.reshape((dims, 1)) + np.zeros(
            (dims, 2), dtype=int)
        self.local_padding = self._compute_local_padding(pad_left_right)

        # Weights and biases partition
        P_wb = self.P_x.create_partition_inclusive([0])
        self.P_wb_cart = P_wb.create_cartesian_topology_partition([1])

        # Release temporary resources
        P_wb.deactivate()

        # We want only the root rank of the broadcast to have a weight and a
        # bias parameter. Every other rank gets a zero-volume tensor.
        if self.P_wb_cart.active:
            self.weight = torch.nn.Parameter(self.conv_layer.weight.detach())

            if self.conv_layer.bias is not None:
                self.bias = torch.nn.Parameter(self.conv_layer.bias.detach())
            else:
                self.register_buffer('bias', None)
        else:
            self.register_buffer('weight', zero_volume_tensor())

            if self.conv_layer.bias is not None:
                self.register_buffer('bias', zero_volume_tensor())
            else:
                self.register_buffer('bias', None)

        self.weight.requires_grad = self.conv_layer.weight.requires_grad

        if self.conv_layer.bias is not None:
            self.bias.requires_grad = self.conv_layer.bias.requires_grad

        # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
        new_weight = self.conv_layer.weight.detach() * 0
        new_weight.requires_grad = self.conv_layer.weight.requires_grad
        del self.conv_layer.weight
        self.conv_layer.weight = new_weight

        if self.conv_layer.bias is not None:
            new_bias = self.conv_layer.bias.detach() * 0
            new_bias.requires_grad = self.conv_layer.bias.requires_grad
            del self.conv_layer.bias
            self.conv_layer.bias = new_bias

        self.w_broadcast = Broadcast(self.P_wb_cart,
                                     self.P_x,
                                     preserve_batch=False)

        if self.conv_layer.bias is not None:
            self.b_broadcast = Broadcast(self.P_wb_cart,
                                         self.P_x,
                                         preserve_batch=False)

        # We need to be able to remove some data from the input to the conv
        # layer.
        self.needed_slices = None

        # For the halo layer we also defer construction, so that we can have
        # the halo shape for the input.  The halo will allocate its own
        # buffers, but it needs this information at construction to be able
        # to do this in the pre-forward hook.
        self.halo_layer = None

        # Variables for tracking input changes and buffer construction
        self._distdl_is_setup = False
        self._input_tensor_structure = TensorStructure()
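
The padding bookkeeping above left-pads the user's spatial padding out to the full number of partition dimensions and then expands it into a (dims, 2) array of per-dimension left/right pad widths before it is handed to _compute_local_padding. A NumPy-only sketch of that expansion (the shapes are illustrative):

import numpy as np

dims = 4                       # e.g. a partition shaped 1 x 1 x P_h x P_w
padding = np.asarray([1, 2])   # user-specified spatial padding (h, w)

# Left-pad with zeros so the batch and channel dimensions receive no padding.
global_padding = np.pad(padding,
                        pad_width=(dims - len(padding), 0),
                        mode='constant',
                        constant_values=0)
print(global_padding)          # [0 0 1 2]

# Expand to a (dims, 2) array of (left, right) pad widths per dimension.
pad_left_right = global_padding.reshape((dims, 1)) + np.zeros((dims, 2), dtype=int)
print(pad_left_right)          # [[0 0], [0 0], [1 1], [2 2]]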
Example #8
    def __init__(self, P_x, P_y, P_w,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 padding_mode='zeros',
                 dilation=1,
                 groups=1,
                 bias=True,
                 buffer_manager=None):

        super(DistributedGeneralConvBase, self).__init__()

        # P_x is 1    x P_ci x P_d-1 x ... x P_0
        self.P_x = P_x
        # P_y is 1    x P_co x P_d-1 x ... x P_0
        self.P_y = P_y
        # P_w is P_co x P_ci x P_d-1 x ... x P_0
        self.P_w = P_w

        # Back-end specific buffer manager for economic buffer allocation
        if buffer_manager is None:
            buffer_manager = self._distdl_backend.BufferManager()
        elif type(buffer_manager) is not self._distdl_backend.BufferManager:
            raise ValueError("Buffer manager type does not match backend.")
        self.buffer_manager = buffer_manager

        # Even inactive workers need some partition union
        self.P_union = self._distdl_backend.Partition()
        if not (self.P_x.active or
                self.P_y.active or
                self.P_w.active):
            return

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = self._expand_parameter(kernel_size)
        self.stride = self._expand_parameter(stride)
        self.padding = self._expand_parameter(padding)
        self.padding_mode = padding_mode
        self.dilation = self._expand_parameter(dilation)
        self.groups = groups
        self.use_bias = bias

        # This guarantees that P_union rank 0 has the kernel size, stride,
        # padding, and dilation factors
        P_union_temp = P_w.create_partition_union(P_x)
        self.P_union = P_union_temp.create_partition_union(P_y)

        # Release the temporary resources
        P_union_temp.deactivate()

        # Ensure that all workers have the full size and structure of P_w
        P_w_shape = None
        if self.P_union.rank == 0:
            P_w_shape = np.array(P_w.shape, dtype=int)
        P_w_shape = self.P_union.broadcast_data(P_w_shape, root=0)

        P_co = P_w_shape[0]
        P_ci = P_w_shape[1]
        P_channels = [P_co, P_ci]

        # Ensure that P_x and P_w are correctly aligned.  We also produce a
        # new P_x that is shaped like 1 x P_ci x P_d-1 x ... x P_0, to assist
        # with broadcasts.
        P_x_new_shape = []
        if self.P_x.active:
            if(np.any(P_x.shape[2:] != P_w_shape[2:])):
                raise ValueError("Spatial components of P_x and P_w must match.")
            if P_w_shape[1] != P_x.shape[1]:
                raise ValueError("Index 2 of P_w dimension must match input channel partition.")
            P_x_new_shape = list(P_x.shape)
            P_x_new_shape.insert(1, 1)
            # Currently a hack, removing the batch dimension because P_w does
            # not have one. This is OK because we assume there are no partitions
            # in the batch dimension.
            P_x_new_shape = np.asarray(P_x_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_x to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_x = self.P_x.create_cartesian_topology_partition(P_x_new_shape)

        # Ensure that P_y and P_w are correctly aligned.  We also produce a
        # new P_y that is shaped like P_co x 1 x P_d-1 x ... x P_0, to assist
        # with broadcasts.
        P_y_new_shape = []
        if self.P_y.active:
            if(np.any(P_y.shape[2:] != P_w_shape[2:])):
                raise ValueError("Spatial components of P_y and P_w must match.")
            if P_w_shape[0] != P_y.shape[1]:
                raise ValueError("Index 1 of P_w dimension must match output channel partition.")
            P_y_new_shape = list(P_y.shape)
            P_y_new_shape.insert(2, 1)
            # Currently a hack, removing the batch dimension because P_w does
            # not have one. This is OK because we assume there are no partitions
            # in the batch dimension.
            P_y_new_shape = np.asarray(P_y_new_shape[1:], dtype=int)

        # For the purposes of this layer, we re-cast P_y to have the extra
        # dimension.  This has no impact outside of the layer or on the results.
        self.P_y = self.P_y.create_cartesian_topology_partition(P_y_new_shape)

        P_spatial = P_w_shape[2:]

        self.serial = self.P_w.size == 1

        if self.serial:
            self.conv_layer = self.TorchConvType(in_channels=in_channels,
                                                 out_channels=out_channels,
                                                 kernel_size=self.kernel_size,
                                                 stride=self.stride,
                                                 padding=self.padding,
                                                 padding_mode=self.padding_mode,
                                                 dilation=self.dilation,
                                                 groups=self.groups,
                                                 bias=self.use_bias)
            self.weight = self.conv_layer.weight
            self.bias = self.conv_layer.bias
            return

        # Need to figure out any padding necessary to handle global padding.
        # This is only on the input tensor.  The convolution will not use
        # any implicit padding, so the work partition does not need it.
        if self.P_x.active:
            dims = len(self.P_x.shape)

            # We will be using global padding to compute local padding,
            # so expand it to a numpy array
            global_padding = np.pad(self.padding,
                                    pad_width=(dims-len(self.padding), 0),
                                    mode='constant',
                                    constant_values=0)
            self.global_padding = global_padding

            pad_left_right = self.global_padding.reshape((dims, 1)) + np.zeros((dims, 2), dtype=int)
            self.local_padding = self._compute_local_padding(pad_left_right)

        # Workers can either store the learnable weights and bias, or they
        # need copies of it.
        self.receives_weight = False
        self.stores_weight = False
        self.receives_bias = False
        self.stores_bias = False

        # Determine root partitions, initialize weights there
        if self.P_w.active:
            # All of P_w always receives the weight
            self.receives_weight = True

            # This subset is taken to be the origin of the spatial component
            w_root_subset = []
            for i, c in enumerate(range_index(P_w.shape)):
                c = np.asarray(c)
                # Find the P_co x P_ci x 1 x ... x 1 subset to store the weights
                if np.all(c[2:] == 0):
                    w_root_subset.append(i)

            P_wr_base = self.P_w.create_partition_inclusive(w_root_subset)
            # ones are needed so the broadcast will work
            self.P_wr = P_wr_base.create_cartesian_topology_partition([P_co, P_ci] + [1]*len(P_spatial))
            self.stores_weight = self.P_wr.active

            # Release temporary resources
            P_wr_base.deactivate()

            b_subset = []
            for i, c in enumerate(range_index(P_w.shape)):
                c = np.asarray(c)
                # Find the P_co x 1 x P_0 x ... x P_D-1 subset that needs
                # biases in its calculation.  This is everywhere that the
                # input-channel index is 0.
                if c[1] == 0:
                    b_subset.append(i)

            P_b_base = self.P_w.create_partition_inclusive(b_subset)
            self.P_b = P_b_base.create_cartesian_topology_partition([P_co] + [1] + list(P_spatial))
            self.receives_bias = self.P_b.active and self.use_bias

            # Release temporary resources
            P_b_base.deactivate()

            # Now find the subset of _that_ which actually stores the
            # learnable parameter.
            b_root_subset = []
            for i, c in enumerate(range_index(P_w.shape)):
                c = np.asarray(c)
                # Find the P_co x 1 x 1 x ... x 1 subset to store the biases
                if np.all(c[1:] == 0):
                    b_root_subset.append(i)

            P_br_base = self.P_w.create_partition_inclusive(b_root_subset)
            # ones are needed so the broadcast will work
            self.P_br = P_br_base.create_cartesian_topology_partition([P_co] + [1] + [1]*len(P_spatial))
            self.stores_bias = self.P_br.active and self.use_bias

            # Release temporary resources
            P_br_base.deactivate()

            # Correct the input arguments based on local properties
            # This ensures that the in and out channels are correctly shared.
            local_co, local_ci = compute_subshape(P_channels,
                                                  P_w.index[0:2],
                                                  [out_channels, in_channels])
            self.conv_layer = self.TorchConvType(in_channels=local_ci,
                                                 out_channels=local_co,
                                                 kernel_size=self.kernel_size,
                                                 stride=self.stride,
                                                 padding=0,
                                                 padding_mode='zeros',
                                                 dilation=self.dilation,
                                                 groups=groups,
                                                 bias=self.receives_bias)

            # If we store the weight it is a learnable parameter iff it is
            # learnable by default in the layer, which it is.
            if self.stores_weight:
                self.weight = torch.nn.Parameter(self.conv_layer.weight.detach())
            else:
                self.register_buffer('weight', zero_volume_tensor())
            # This always exists so we can copy the property
            self.weight.requires_grad = self.conv_layer.weight.requires_grad

            # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
            new_weight = self.conv_layer.weight.detach() * 0
            new_weight.requires_grad = self.conv_layer.weight.requires_grad
            del self.conv_layer.weight
            self.conv_layer.weight = new_weight

            # If we store the bias, it is a learnable parameter iff it is
            # learnable by default in the layer, which is only true if it
            # exists.
            if self.stores_bias:
                self.bias = torch.nn.Parameter(self.conv_layer.bias.detach())
            else:
                if self.use_bias:
                    self.register_buffer('bias', zero_volume_tensor())
                else:
                    self.register_buffer('bias', None)
            # This does not always exist, but when it does we can copy the
            # property.
            if self.receives_bias:
                self.bias.requires_grad = self.conv_layer.bias.requires_grad

                # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
                new_bias = self.conv_layer.bias.detach() * 0
                new_bias.requires_grad = self.conv_layer.bias.requires_grad
                del self.conv_layer.bias
                self.conv_layer.bias = new_bias

        else:
            # Workers not in P_w don't have a weight or bias.
            self.register_buffer('weight', zero_volume_tensor())
            if self.use_bias:
                self.register_buffer('bias', zero_volume_tensor())
            else:
                self.register_buffer('bias', None)

        # Now we need to share the kernel structure.  The size of the kernel
        # is always the spatial dimensions.
        self.conv_kernel_size = None
        self.conv_stride = None
        self.conv_padding = None
        self.conv_dilation = None

        # By construction, rank 0 of the union should always have all of this
        # information, because it will always construct a local conv layer. We
        # rely on the local conv layer to properly fill out this information
        # from the defaults.  This info is required for all workers on the
        # input and output partitions because it is needed to construct the
        # halos.  Rank 0 in the union shares it with everyone.
        if self.P_union.rank == 0:
            self.conv_kernel_size = np.array(self.conv_layer.kernel_size, dtype=int)
            self.conv_stride = np.array(self.conv_layer.stride, dtype=int)
            self.conv_padding = np.array(self.conv_layer.padding, dtype=int)
            self.conv_dilation = np.array(self.conv_layer.dilation, dtype=int)
        self.conv_kernel_size = self.P_union.broadcast_data(self.conv_kernel_size, root=0)
        self.conv_stride = self.P_union.broadcast_data(self.conv_stride, root=0)
        self.conv_padding = self.P_union.broadcast_data(self.conv_padding, root=0)
        self.conv_dilation = self.P_union.broadcast_data(self.conv_dilation, root=0)

        # We need to be able to remove some data from the input to the conv
        # layer but again need to defer.
        self.needed_slices = None

        # For the halo layer we also defer construction, so that we can have
        # the halo shape for the input.  The halo will allocate its own
        # buffers, but it needs this information at construction to be able
        # to do this in the pre-forward hook.
        self.halo_layer = None

        # Variables for tracking input changes and buffer construction
        self._distdl_is_setup = False
        self._input_tensor_structure = TensorStructure()

        # Some layers, those that require no information about the input
        # tensor to setup, can be built now.
        if P_w.active:
            self.w_broadcast = Broadcast(self.P_wr, self.P_w, preserve_batch=False)

        if self.receives_bias or self.stores_bias:
            self.b_broadcast = Broadcast(self.P_br, self.P_b, preserve_batch=False)

        self.x_broadcast = Broadcast(self.P_x, self.P_w, preserve_batch=True)
        self.y_sum_reduce = SumReduce(self.P_w, self.P_y, preserve_batch=True)
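
Several of these layers detach the local convolution's weight, keep the learnable Parameter only on the root partition, and replace the module attribute with a plain zeroed tensor so that a broadcast value can be assigned during the forward pass (see the linked discuss.pytorch.org thread). A minimal stand-alone sketch of that swap on an ordinary torch.nn.Linear; this is illustrative only, not DistDL code:

import torch

lin = torch.nn.Linear(3, 3)

# The learnable copy lives outside the wrapped module (on the root workers).
weight = torch.nn.Parameter(lin.weight.detach())

# Replace the module's registered Parameter with a plain tensor of the same
# shape so a broadcast value can be written into it later.
new_weight = lin.weight.detach() * 0
new_weight.requires_grad = lin.weight.requires_grad
del lin.weight               # removes the registered Parameter
lin.weight = new_weight      # now an ordinary tensor attribute

# In a forward pass, the broadcast copy would be assigned before calling lin;
# here a trivial identity stands in for Broadcast(...)(weight).
lin.weight = weight * 1
y = lin(torch.randn(2, 3))
y.sum().backward()
print(weight.grad.shape)     # gradients flow back to the learnable copy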
Example #9
    def __init__(self,
                 P_x,
                 num_features,
                 eps=1e-05,
                 momentum=0.1,
                 affine=True,
                 track_running_stats=True):
        super(DistributedBatchNorm, self).__init__()
        self.num_dimensions = len(P_x.shape)
        if self.num_dimensions < 2:
            raise ValueError(
                'Number of dimensions of P_x should be at least 2.')
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        self.inputs_seen = 0

        # Determine the size of the local trainable parameters (this is a bit of a hack)
        possible_input_shape = P_x.shape.tolist()
        possible_input_shape[1] = num_features
        start_index = compute_start_index(P_x.shape, P_x.index,
                                          possible_input_shape)
        stop_index = compute_stop_index(P_x.shape, P_x.index,
                                        possible_input_shape)
        self.local_num_features = stop_index[1] - start_index[1]

        internal_data_shape = [1] * self.num_dimensions
        internal_data_shape[1] = self.local_num_features
        internal_partition_shape = [1] * self.num_dimensions
        internal_partition_shape[1] = P_x.shape[1]

        # Decide which workers will be used to store sum and affine parameters
        index = [0] * self.num_dimensions
        index[1] = slice(0, P_x.shape[1])
        index = tuple(index)
        storage_workers = worker_layout(P_x.shape)[index].tolist()

        self.P_x = P_x
        P_sum_base = P_x.create_partition_inclusive(storage_workers)
        self.P_sum = P_sum_base.create_cartesian_topology_partition(
            internal_partition_shape)

        # Release temporary resources
        P_sum_base.deactivate()

        if self.track_running_stats:
            self.register_buffer('running_mean',
                                 torch.zeros(internal_data_shape))
            self.register_buffer('running_var',
                                 torch.ones(internal_data_shape))
        else:
            self.running_mean = None
            self.running_var = None

        self.sr = SumReduce(P_x, self.P_sum)
        self.bc = Broadcast(self.P_sum, P_x)
        self.bc_affine = Broadcast(self.P_sum, P_x)

        if self.affine:
            if self.P_sum.active:
                self.gamma = torch.nn.Parameter(
                    torch.ones(internal_data_shape))
                self.beta = torch.nn.Parameter(
                    torch.zeros(internal_data_shape))
            else:
                self.register_buffer('gamma',
                                     zero_volume_tensor(requires_grad=True))
                self.register_buffer('beta',
                                     zero_volume_tensor(requires_grad=True))
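
The SumReduce/Broadcast pair set up above follows the usual distributed batch-norm pattern: every worker contributes local partial sums, the storage workers in P_sum accumulate them, and the accumulated statistics are broadcast back. A NumPy-only sketch of the arithmetic for a global mean; the "workers" are simulated with a Python list and nothing here is DistDL API:

import numpy as np

# Simulate 3 workers, each holding a shard of the same feature channel.
shards = [np.random.randn(5) for _ in range(3)]

# Local step: each worker computes its partial sum and element count.
partial_sums = [s.sum() for s in shards]
partial_counts = [s.size for s in shards]

# SumReduce to P_sum: accumulate the partials on the storage workers.
total_sum = sum(partial_sums)
total_count = sum(partial_counts)

# Broadcast from P_sum: every worker receives the global mean.
global_mean = total_sum / total_count

assert np.isclose(global_mean, np.concatenate(shards).mean())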
Example #10
    def __init__(self, P_x, *args, **kwargs):

        super(DistributedConvBase, self).__init__()

        self.P_x = P_x

        if not self.P_x.active:
            return

        # Do this before checking serial so that the layer works properly
        # in the serial case
        self.conv_layer = self.TorchConvType(*args, **kwargs)

        self.serial = False
        if self.P_x.size == 1:
            self.serial = True
            return

        # Weights and biases partition
        self.P_wb = self.P_x.create_partition_inclusive([0])
        self.P_wb_cart = self.P_wb.create_cartesian_topology_partition([1])

        # We want only the root rank of the broadcast to have a weight and a
        # bias parameter.  Every other rank gets a zero-volume tensor.
        if self.P_wb_cart.active:
            self.weight = torch.nn.Parameter(self.conv_layer.weight.detach())

            if self.conv_layer.bias is not None:
                self.bias = torch.nn.Parameter(self.conv_layer.bias.detach())

        else:
            self.weight = zero_volume_tensor()

            if self.conv_layer.bias is not None:
                self.bias = zero_volume_tensor()

        self.weight.requires_grad = self.conv_layer.weight.requires_grad

        if self.conv_layer.bias is not None:
            self.bias.requires_grad = self.conv_layer.bias.requires_grad

        # https://discuss.pytorch.org/t/assign-parameters-to-nn-module-and-have-grad-fn-track-it/62677/2
        new_weight = self.conv_layer.weight.detach() * 0
        new_weight.requires_grad = self.conv_layer.weight.requires_grad
        del self.conv_layer.weight
        self.conv_layer.weight = new_weight

        if self.conv_layer.bias is not None:
            new_bias = self.conv_layer.bias.detach() * 0
            new_bias.requires_grad = self.conv_layer.bias.requires_grad
            del self.conv_layer.bias
            self.conv_layer.bias = new_bias

        self.w_broadcast = Broadcast(self.P_wb_cart,
                                     self.P_x,
                                     preserve_batch=False)

        if self.conv_layer.bias is not None:
            self.b_broadcast = Broadcast(self.P_wb_cart,
                                         self.P_x,
                                         preserve_batch=False)

        # We need the halo shape, and other info, to fully populate the pad,
        # halo exchange, and unpad layers.  For pad and unpad, we defer their
        # construction to the pre-forward hook.

        self.pad_layer = None
        self.unpad_layer = None

        # We need to be able to remove some data from the input to the conv
        # layer.
        self.needed_slices = None

        # For the halo layer we also defer construction, so that we can have
        # the halo shape for the input.  The halo will allocate its own
        # buffers, but it needs this information at construction to be able
        # to do this in the pre-forward hook.

        self.halo_layer = None

        # Variables for tracking input changes and buffer construction
        self._distdl_is_setup = False
        self._input_shape = None
        self._input_requires_grad = None
Example #11
#   [ 1 1 ]
#   -------
#   [ 2 2 ]
#   [ 2 2 ] ]
x = zero_volume_tensor()
if P_x.active:
    x_local_shape = slicing.compute_subshape(P_x.shape,
                                             P_x.index,
                                             x_global_shape)
    x = np.zeros(x_local_shape) + P_x.rank + 1
    x = torch.from_numpy(x)
x.requires_grad = True
print(f"rank {P_world.rank}; index {P_x.index}; value {x}")

# Here we broadcast along the column axis (axis 1): each row partition's data
# is copied to every column partition in that row.
all_reduce_cols = Broadcast(P_x, P_y, preserve_batch=False)
#
# Output tensor will be (on a 2 x 3 partition):
# [ [ 1 1 | 1 1 | 1 1 ]
#   [ 1 1 | 1 1 | 1 1 ]
#   [ 1 1 | 1 1 | 1 1 ]
#   -------------------------
#   [ 2 2 | 2 2 | 2 2 ]
#   [ 2 2 | 2 2 | 2 2 ]
#   [ 2 2 | 2 2 | 2 2 ] ]
y = all_reduce_cols(x)

print(f"rank {P_world.rank}; index {P_x.index}; value {y}")

# Set up the adjoint input tensor.  Any worker in P_y will generate its part of
# the adjoint input tensor.  Any worker not in P_y will have a zero-volume
# tensor.
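
The snippet stops while the adjoint input is being set up. A rough continuation, following the adjoint-test pattern of Examples #3 and #12; sizing dy from the local y is an assumption made for this sketch:

dy = zero_volume_tensor()
if P_y.active:
    # Shape the adjoint input like the local output block.
    dy = np.zeros(y.shape) + P_world.rank + 1
    dy = torch.from_numpy(dy)

# dx = F* @ dy: the adjoint of the broadcast sums the copies back onto P_x.
y.backward(dy)
dx = x.grad

print(f"rank {P_world.rank}; index {P_x.index}; value {dx}")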
Example #12
def test_broadcast_dtype(barrier_fence_fixture, comm_split_fixture, dtype,
                         test_backward, P_x_ranks, P_x_shape, P_y_ranks,
                         P_y_shape, x_global_shape, transpose_src):

    import numpy as np
    import torch

    from distdl.backends.mpi.partition import MPIPartition
    from distdl.nn.broadcast import Broadcast
    from distdl.utilities.torch import zero_volume_tensor

    # `use_cuda` is assumed to be a module-level flag in the test configuration.
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Isolate the minimum needed ranks
    base_comm, active = comm_split_fixture
    if not active:
        return
    P_world = MPIPartition(base_comm)

    # Create the partitions
    P_x_base = P_world.create_partition_inclusive(P_x_ranks)
    P_x = P_x_base.create_cartesian_topology_partition(P_x_shape)

    P_y_base = P_world.create_partition_inclusive(P_y_ranks)
    P_y = P_y_base.create_cartesian_topology_partition(P_y_shape)

    # TODO #93: Change this to create a subtensor so we test when local tensors
    # have different shape.  Then, the output size will also be different, which
    # we will have to get from `y` itself.
    x_local_shape = np.asarray(x_global_shape)

    layer = Broadcast(P_x,
                      P_y,
                      transpose_src=transpose_src,
                      preserve_batch=False)
    layer = layer.to(device)

    x = zero_volume_tensor(device=device)
    if P_x.active:
        x = 10 * torch.randn(*x_local_shape).to(dtype)
        x = x.to(device)

    x.requires_grad = test_backward

    # y = F @ x
    y = layer(x)

    # If we are not in the output partition, there is no data to test the type
    # against.
    if P_y.active:
        assert y.dtype == dtype

    if test_backward:
        dy = zero_volume_tensor(device=device)
        if P_y.active:
            # Adjoint Input
            dy = 10 * torch.randn(*x_local_shape).to(dtype)
            dy = dy.to(device)

        # dx = F* @ dy
        y.backward(dy)
        dx = x.grad

        if P_x.active:
            assert dx.dtype == dtype

    P_world.deactivate()
    P_x_base.deactivate()
    P_x.deactivate()
    P_y_base.deactivate()
    P_y.deactivate()