Example #1
def unpack2bits(array, size):
    decode = cupy2torch(cupy.unpackbits(torch2cupy(array)))
    first = decode[:size]
    second = decode[size:2 * size]
    second[first > 0] = 2

    return second
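Examples #1 and #9 call `torch2cupy`/`cupy2torch` helpers that are not shown (Examples #6-#8 use method versions of the same idea). A minimal sketch of what such helpers could look like, assuming zero-copy exchange over DLPack (CuPy >= 10 and PyTorch >= 1.10; the function names come from the snippets, not from either library):

import cupy
import torch

def torch2cupy(tensor):
    # Zero-copy view of a CUDA torch tensor as a cupy array via DLPack.
    return cupy.from_dlpack(tensor)

def cupy2torch(array):
    # Zero-copy view of a cupy array as a torch tensor via DLPack.
    return torch.from_dlpack(array)

With these in place, `unpack2bits` above merges two packed bitmaps of length `size`: it unpacks both, keeps the second, and overwrites with 2 every position where the first bitmap is set, yielding values in {0, 1, 2}.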
Example #2
def memory_leak_check(dict1):
    # Fraction of the peak CUDA memory that is currently allocated.
    memory_percentage = (torch.cuda.memory_allocated() /
                         torch.cuda.max_memory_allocated())
    print(f"Memory used: {memory_percentage:.3f}")

    dict2 = {}

    # Read the device id from the first cupy array in the input dict
    first_cupy = dict1[list(dict1.keys())[0]]

    # Unpack each packed cupy array in dict1 and convert it to a torch
    # tensor on the same device
    for key in dict1.keys():
        dict2[key] = torch.as_tensor(cp.unpackbits(dict1[key]),
                                     device=f'cuda:{first_cupy.device.id}')

    return dict2
Example #3
def unpackbits(a):
    # CPU tensors go through NumPy; CUDA tensors go through CuPy on the
    # tensor's own device.
    if a.device == torch.device('cpu'):
        return torch.from_numpy(np.unpackbits(a))
    with cp.cuda.Device(a.device.index):
        return to_pt(cp.unpackbits(to_cp(a)))
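A possible call pattern for the dispatching version above, assuming `to_cp`/`to_pt` are DLPack conversion helpers like the sketch under Example #1. Each uint8 byte expands to 8 bits, most significant bit first:

packed = torch.tensor([0b10110001], dtype=torch.uint8)
bits = unpackbits(packed)        # CPU path via np.unpackbits
print(bits.tolist())             # [1, 0, 1, 1, 0, 0, 0, 1]

if torch.cuda.is_available():
    bits = unpackbits(packed.cuda())  # GPU path via cp.unpackbits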
Example #4
    def time_unpackbits_axis1(self):
        np.unpackbits(self.d2, axis=1)
Example #5
    def time_unpackbits(self):
        np.unpackbits(self.d)
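Examples #4 and #5 are timing methods in the style of NumPy's asv benchmark suite; the `self.d` and `self.d2` inputs come from a `setup` method that is not shown. A plausible reconstruction (the array shapes are assumptions, not the originals):

import numpy as np

class UnpackBits:
    def setup(self):
        # uint8 inputs for the timing methods above
        self.d = np.ones(10000, dtype=np.uint8)          # 1-D input
        self.d2 = np.ones((1024, 1024), dtype=np.uint8)  # 2-D input for axis=1

asv discovers the `time_*` methods and calls `setup` before timing each one.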
Example #6
    def Compressed_Allreduce(self, buffer_m: torch.Tensor, worker_error,
                             server_error, rank, world_size, comm, local_rank):

        all_start_time = time.time()
        original_size = buffer_m.numel()
        cupy.cuda.Device(local_rank).use()

        if torch.numel(buffer_m) != torch.numel(worker_error):
            empty_tensor = torch.zeros(torch.numel(worker_error) -
                                       torch.numel(buffer_m),
                                       device=buffer_m.device)
            buffer_m = torch.cat([buffer_m, empty_tensor])

        buffer_m.add_(worker_error)
        worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
        sign_buffer_m = buffer_m.sign().add_(1).bool()
        sign_buffer_m = sign_buffer_m.float()
        sign_buffer_m.add_(-0.5).mul_(2.0)
        worker_error.set_((buffer_m - worker_scale * sign_buffer_m))
        sign_buffer_m = None

        compensated_buffer_m = buffer_m
        compensated_buffer_m.sign_()
        compensated_buffer_m = compensated_buffer_m.add_(1).bool()
        cupy_worker_scale = self.torch2cupy(worker_scale)
        cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m)
        compensated_buffer_m = None

        cupy_sign_list_packed = self.compress_by_chunk(
            cupy_compensated_buffer_m, world_size)
        cupy_compensated_buffer_m = None

        cupy_recvbuf_sign = cupy.zeros(
            [world_size, cupy_sign_list_packed[rank].size],
            dtype=cupy_sign_list_packed[0].dtype)
        cupy_recvbuf_scale = cupy.zeros([world_size, 1],
                                        dtype=cupy_worker_scale.dtype)

        # Communication Phase 1
        gather_start = time.time()
        if self.cuda_aware:
            gather_cuda(rank, world_size, comm, cupy_sign_list_packed,
                        cupy_recvbuf_sign, cupy_worker_scale,
                        cupy_recvbuf_scale)
        else:
            (cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale,
             cupy_recvbuf_scale) = gather_host(
                 rank, world_size, comm, cupy_sign_list_packed,
                 cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale)
        gather_end = time.time()

        cupy_unpacked_sign = (cupy.unpackbits(
            cupy_recvbuf_sign.flatten())).reshape(world_size, -1)
        cupy_recvbuf_sign = None
        unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float()
        cupy_unpacked_sign = None
        unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0)
        worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size)
        compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0)
        unpacked_sign = None

        compensated_server_m.add_(server_error)
        server_scale = torch.norm(compensated_server_m) / np.sqrt(
            compensated_server_m.numel())
        sign_server_m = compensated_server_m.sign().add_(1).bool()
        sign_server_m = sign_server_m.float()
        sign_server_m.add_(-0.5).mul_(2.0)
        server_error.set_(compensated_server_m - server_scale * sign_server_m)
        sign_server_m = None

        compensated_server_m.sign_()
        compensated_server_m = compensated_server_m.add_(1).bool()
        cupy_server_scale = self.torch2cupy(server_scale)
        cupy_compensated_server_m = self.torch2cupy(compensated_server_m)
        compensated_server_m = None

        cupy_server_sign_packed = self.compress_by_chunk(
            cupy_compensated_server_m, 1)

        cupy_recvbuf_sign_server = cupy.zeros(
            [world_size, cupy_server_sign_packed[0].size],
            dtype=cupy_sign_list_packed[0].dtype)
        cupy_recvbuf_scale_server = cupy.zeros([world_size, 1],
                                               dtype=cupy_worker_scale.dtype)

        # Communication Phase 2
        if self.cuda_aware:
            allgather_cuda(comm, cupy_server_sign_packed[0],
                           cupy_recvbuf_sign_server, cupy_server_scale,
                           cupy_recvbuf_scale_server)
        else:
            (cupy_server_sign_packed[0], cupy_recvbuf_sign_server,
             cupy_server_scale, cupy_recvbuf_scale_server) = allgather_host(
                 comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server,
                 cupy_server_scale, cupy_recvbuf_scale_server)

        cupy_server_unpacked_sign = (cupy.unpackbits(
            cupy_recvbuf_sign_server.flatten())).reshape(world_size, -1)
        cupy_recvbuf_sign_server = None

        server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign)
        cupy_server_unpacked_sign = None

        server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(
            2.0)
        server_scale = self.cupy2torch(cupy_recvbuf_scale_server)
        buffer_m = server_unpacked_sign.mul_(
            server_scale).flatten()[0:original_size]

        return buffer_m
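The core trick shared by Examples #6 through #8 is 1-bit compression with error feedback: each worker transmits only the per-element signs (8 per byte after cupy.packbits) plus a single scalar scale, while worker_error/server_error accumulate whatever the quantization threw away so it can be re-added on the next step. The recurring add_(-0.5).mul_(2.0) idiom maps unpacked bits {0, 1} back onto {-1, +1}. A standalone round-trip sketch on a single device, without the communication (the helper name is hypothetical; assumes a flat float32 CUDA tensor):

import cupy
import numpy as np
import torch

def one_bit_roundtrip(buffer_m):
    # Compress: keep one scalar scale plus the per-element signs.
    scale = torch.norm(buffer_m) / np.sqrt(buffer_m.numel())
    signs = buffer_m.sign().add_(1).bool().to(torch.uint8)  # {-1,0,+1} -> {0,1}
    packed = cupy.packbits(cupy.from_dlpack(signs))         # 8 signs per byte

    # Decompress: unpack, map {0,1} back to {-1,+1}, rescale by the norm.
    bits = torch.from_dlpack(cupy.unpackbits(packed))[:buffer_m.numel()]
    return bits.float().add_(-0.5).mul_(2.0).mul_(scale)

For a float32 gradient this is roughly a 32x reduction in traffic; the error tensors compensate for the per-step loss over successive iterations.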
Example #7
    def compressed_allreduce(self,
                             buffer_m: torch.Tensor,
                             worker_error,
                             server_error,
                             local_rank):

        # all_start_time = time.time()
        original_shape = buffer_m.size()
        if len(original_shape) > 1:
            buffer_m = torch.flatten(buffer_m)
        original_size = buffer_m.numel()
        worker_error_size = worker_error.numel()
        cupy.cuda.Device(local_rank).use()

        if original_size != worker_error_size:
            empty_tensor = torch.zeros(worker_error_size - original_size,
                                       device=buffer_m.device)
            buffer_m = torch.cat([buffer_m, empty_tensor])

        buffer_m.add_(worker_error)
        worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
        worker_error.set_(buffer_m - worker_scale *
                          buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

        if self.bool_not_supported:
            cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
                self.size)
        else:
            cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()),
                self.size)
        cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)

        cupy_recvbuf_sign = cupy.zeros(
            [self.size,
             cupy_sign_list_packed[self.rank].size],
            dtype=cupy_sign_list_packed[0].dtype)
        # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype)

        sign_list_packed = [
            self.compression_backend.cupy2torch(cupy_sign_list_packed[idx])
            for idx in range(self.size)
        ]

        # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale)
        recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign)
        #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale)
        recvbuf_scale = [
            torch.zeros(1,
                        dtype=worker_scale.dtype,
                        device=torch.device(local_rank)) for i in range(self.size)
        ]

        # communication phase 1
        # gather_start = time.time()
        # Alltoall for sign
        dist.all_to_all_single(recvbuf_sign,
                               torch.stack(sign_list_packed),
                               group=self.world_group)
        # Allgather for scale
        dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group)

        # gather_end = time.time()

        # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None
        cupy_sign_list_packed = None

        cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign)
        #cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale))

        compensated_server_m = self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(
                self.size,
                -1)).float().add_(-0.5).mul_(2.0).mul_(
                    torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0)
        compensated_server_m.add_(server_error)
        server_scale = torch.norm(compensated_server_m) / np.sqrt(
            compensated_server_m.numel())
        server_error.set_(
            compensated_server_m - server_scale *
            compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

        # cupy_server_scale = self.compression_backend.torch2cupy(server_scale)

        if self.bool_not_supported:
            cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
                1)
        else:
            cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    compensated_server_m.sign_().add_(1).bool()),
                1)
        compensated_server_m = None

        cupy_recvbuf_sign_server = cupy.zeros(
            [self.size,
             cupy_server_sign_packed[0].size],
            dtype=cupy_recvbuf_sign.dtype)
        # cupy_recvbuf_sign, recvbuf_sign = None, None
        cupy_recvbuf_sign = None

        server_sign_packed = [
            self.compression_backend.cupy2torch(cupy_server_sign_packed[0])
        ]
        recvbuf_sign_server = [
            self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx])
            for idx in range(self.size)
        ]

        # server_scale = self.compression_backend.cupy2torch(cupy_server_scale)
        cupy_recvbuf_scale_server = cupy.zeros([self.size,
                                                1],
                                               dtype=cupy_worker_scale.dtype)
        # cupy_recvbuf_scale, recvbuf_scale = None, None

        recvbuf_scale_server = [
            self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx])
            for idx in range(self.size)
        ]

        # Communication Phase 2
        dist.all_gather(recvbuf_sign_server,
                        server_sign_packed[0],
                        group=self.world_group)
        dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group)

        cupy_server_sign_packed = None

        # need to convert from a tensor list to a single tensor
        # dist.all_gather only provides a tensor list as the recv/output buffer
        recvbuf_sign_server = torch.stack(recvbuf_sign_server)

        cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(
            recvbuf_sign_server)

        buffer_m.data.copy_(
            self.compression_backend.cupy2torch(
                (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
                    self.size,
                    -1)).float().add_(-0.5).mul_(2.0).mul_(
                        self.compression_backend.cupy2torch(
                            cupy_recvbuf_scale_server)).flatten().data)
        if original_size != worker_error_size:
            buffer_m = buffer_m[0:original_size]
        if len(original_shape) > 1:
            buffer_m = buffer_m.reshape(original_shape)

        return buffer_m
Example #8
    def compressed_allreduce(self, buffer_m: torch.Tensor, worker_error,
                             server_error, local_rank):

        all_start_time = time.time()
        original_shape = buffer_m.size()
        if len(original_shape) > 1:
            buffer_m = torch.flatten(buffer_m)
        original_size = buffer_m.numel()
        worker_error_size = worker_error.numel()
        cupy.cuda.Device(local_rank).use()

        if original_size != worker_error_size:
            empty_tensor = torch.zeros(worker_error_size - original_size,
                                       device=buffer_m.device)
            buffer_m = torch.cat([buffer_m, empty_tensor])

        buffer_m.add_(worker_error)
        worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
        worker_error.set_(
            buffer_m - worker_scale *
            buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

        cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                buffer_m.sign_().add_(1).bool()), self.size)
        cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)

        cupy_recvbuf_sign = cupy.zeros(
            [self.size, cupy_sign_list_packed[self.rank].size],
            dtype=cupy_sign_list_packed[0].dtype)
        cupy_recvbuf_scale = cupy.zeros([self.size, 1],
                                        dtype=cupy_worker_scale.dtype)

        # Communication Phase 1
        gather_start = time.time()
        if self.cuda_aware:
            self.gather_cuda(self.rank, self.size, self.comm,
                             cupy_sign_list_packed, cupy_recvbuf_sign,
                             cupy_worker_scale, cupy_recvbuf_scale)
        else:
            _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(
                self.rank, self.size, self.comm, cupy_sign_list_packed,
                cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale)
        gather_end = time.time()

        # cupy_sign_list_packed, cupy_worker_scale, worker_scale = None, None, None
        cupy_sign_list_packed = None

        compensated_server_m = self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign.flatten())
             ).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                 self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(
                     1 / self.size)).sum(0)
        compensated_server_m.add_(server_error)
        server_scale = torch.norm(compensated_server_m) / np.sqrt(
            compensated_server_m.numel())
        server_error.set_(compensated_server_m -
                          server_scale * compensated_server_m.sign().add_(
                              1).bool().float().add_(-0.5).mul_(2.0))

        cupy_server_scale = self.compression_backend.torch2cupy(server_scale)

        cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                compensated_server_m.sign_().add_(1).bool()), 1)
        compensated_server_m = None

        cupy_recvbuf_sign_server = cupy.zeros(
            [self.size, cupy_server_sign_packed[0].size],
            dtype=cupy_recvbuf_sign.dtype)
        cupy_recvbuf_scale_server = cupy.zeros([self.size, 1],
                                               dtype=cupy_recvbuf_scale.dtype)
        # cupy_recvbuf_sign, cupy_recvbuf_scale = None, None
        cupy_recvbuf_sign = None

        # Communication Phase 2
        if self.cuda_aware:
            self.allgather_cuda(self.comm, cupy_server_sign_packed[0],
                                cupy_recvbuf_sign_server, cupy_server_scale,
                                cupy_recvbuf_scale_server)
        else:
            _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host(
                self.comm, cupy_server_sign_packed[0],
                cupy_recvbuf_sign_server, cupy_server_scale,
                cupy_recvbuf_scale_server)

        # cupy_server_sign_packed, cupy_server_scale, server_scale = None, None, None
        cupy_server_sign_packed = None

        buffer_m.data.copy_(
            self.compression_backend.cupy2torch(
                (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
                    self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                        self.compression_backend.cupy2torch(
                            cupy_recvbuf_scale_server)).flatten().data)
        if original_size != worker_error_size:
            buffer_m = buffer_m[0:original_size]
        if len(original_shape) > 1:
            buffer_m = buffer_m.reshape(original_shape)

        # cupy_recvbuf_sign_server, cupy_recvbuf_scale_server = None, None

        return buffer_m
Example #9
def unpackbits(array, size):
    return cupy2torch(cupy.unpackbits(torch2cupy(array))[:size])
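Because packing pads to a byte boundary, cupy.unpackbits always returns a multiple of 8 bits; the size argument trims that padding back off. A usage sketch, reusing the hypothetical DLPack helpers from Example #1:

flags = torch.tensor([1, 0, 1, 1, 0], dtype=torch.uint8, device='cuda')
packed = cupy2torch(cupy.packbits(torch2cupy(flags)))  # 5 flags + 3 pad bits in 1 byte
restored = unpackbits(packed, size=5)                  # tensor([1, 0, 1, 1, 0], ...)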