Example 1
 def get_ops(self, L):
     # Build the block-sparse kernels for sequence length L once and cache them:
     # constructing MatMul pre-computes the look-up tables used by the GPU kernels.
     if L not in MultiheadAttention.ops:
         sparse_dot_sdd_nt = torch_blocksparse.MatMul(self.layout, self.block, 'sdd', trans_a=False, trans_b=True)
         sparse_dot_dsd_nn = torch_blocksparse.MatMul(self.layout, self.block, 'dsd', trans_a=False, trans_b=False)
         sparse_softmax = torch_blocksparse.Softmax(self.layout, self.block)
         MultiheadAttention.ops[L] = (sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax)
     return MultiheadAttention.ops[L]
Example 2
def run_bench_mm(Z,
                 H,
                 M,
                 N,
                 K,
                 rho,
                 mode,
                 trans_a,
                 trans_b,
                 block,
                 dtype,
                 layout=None,
                 repeat=10):
    x, w, dy, shape, layout = init_inputs(Z, H, M, N, K, rho, mode, trans_a,
                                          trans_b, block, dtype, layout)
    op = torch_blocksparse.MatMul(layout,
                                  block,
                                  mode,
                                  trans_a=trans_a,
                                  trans_b=trans_b)
    time = bench(lambda: op(x, w), repeat)
    gflops = {
        'sdd': 2 * Z * K * float(layout.sum()) * block * block * 1e-9,
        'dsd': 2 * Z * N * float(layout.sum()) * block * block * 1e-9,
        'dds': 2 * Z * M * float(layout.sum()) * block * block * 1e-9
    }[mode]
    return gflops / time
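
For reference, here is a hedged sketch of how this benchmark entry point might be invoked; the argument values below are illustrative assumptions, and init_inputs/bench are helpers from the same benchmark script that are not shown here.

import torch

# Illustrative call only: every argument value here is an assumption,
# not taken from the original benchmark script.
perf = run_bench_mm(Z=32, H=2, M=512, N=512, K=512,
                    rho=0.5,       # controls the sparsity of the random layout (exact meaning depends on init_inputs)
                    mode='sdd',    # Sparse = Dense x Dense
                    trans_a=False,
                    trans_b=True,
                    block=16,
                    dtype=torch.float16)
print(perf)  # FLOP throughput: the GFLOP count computed above divided by the measured time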
Example 3
 def get_ops(self, L):
     if L not in self.__class__.ops:
         sparsity = self.sparsity
         layout = self.__class__._make_layout(
             self.num_attention_heads_per_partition, L // sparsity.block,
             sparsity.mode, sparsity.stride // sparsity.block,
             sparsity.unidirectional, sparsity.numverts, sparsity.vertsize)
         sparse_dot_sdd_nt = torch_blocksparse.MatMul(layout,
                                                      sparsity.block,
                                                      'sdd',
                                                      trans_a=False,
                                                      trans_b=True)
         sparse_dot_dsd_nn = torch_blocksparse.MatMul(layout,
                                                      sparsity.block,
                                                      'dsd',
                                                      trans_a=False,
                                                      trans_b=False)
         sparse_softmax = torch_blocksparse.Softmax(layout, sparsity.block)
         self.__class__.ops[L] = (sparse_dot_sdd_nt, sparse_dot_dsd_nn,
                                  sparse_softmax)
     return self.__class__.ops[L]
Example 4
    def get_ops(self, H, L):
        if L not in DeepSpeedSparseSelfAttention.ops:
            spConfig = self.sparsity_config

            num_blocks = L // spConfig.block
            if num_blocks != L / spConfig.block:
                raise ValueError(
                    f'Sequence length {L} must be divisible by block size {spConfig.block}'
                )

            block_stride = spConfig.stride // spConfig.block
            if block_stride != spConfig.stride / spConfig.block:
                raise ValueError(
                    f'Stride {spConfig.stride} must be divisible by block size {spConfig.block}'
                )

            layout = DeepSpeedSparseSelfAttention._make_layout(
                H, num_blocks, spConfig.mode, block_stride, spConfig.attention,
                spConfig.numverts, spConfig.vertsize)

            sparse_dot_sdd_nt = torch_blocksparse.MatMul(layout,
                                                         spConfig.block,
                                                         'sdd',
                                                         trans_a=False,
                                                         trans_b=True)

            sparse_dot_dsd_nn = torch_blocksparse.MatMul(layout,
                                                         spConfig.block,
                                                         'dsd',
                                                         trans_a=False,
                                                         trans_b=False)

            sparse_softmax = torch_blocksparse.Softmax(layout, spConfig.block)

            DeepSpeedSparseSelfAttention.ops[L] = (sparse_dot_sdd_nt,
                                                   sparse_dot_dsd_nn,
                                                   sparse_softmax)
        return DeepSpeedSparseSelfAttention.ops[L]
Example 5
 def get_ops(self, L):
     if L not in MultiheadAttention.ops:
         sparsity = self.sparsity
         layout = MultiheadAttention._make_layout(
             self.num_heads, L // sparsity.block, sparsity.mode,
             sparsity.stride // sparsity.block, sparsity.unidirectional,
             sparsity.numverts, sparsity.vertsize)
         sparse_dot_sdd_nt = torch_blocksparse.MatMul(layout,
                                                      sparsity.block,
                                                      'sdd',
                                                      trans_a=False,
                                                      trans_b=True)
         sparse_dot_dsd_nn = torch_blocksparse.MatMul(layout,
                                                      sparsity.block,
                                                      'dsd',
                                                      trans_a=False,
                                                      trans_b=False)
         sparse_softmax = torch_blocksparse.Softmax(layout, sparsity.block)
         MultiheadAttention.ops[L] = (sparse_dot_sdd_nt, sparse_dot_dsd_nn,
                                      sparse_softmax)
     return MultiheadAttention.ops[L]
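
For context, the cached triple returned by get_ops is typically consumed along these lines in the attention forward pass; this is a hedged sketch, and the tensor names (query, key, value) are illustrative rather than the module's actual code.

# Hedged sketch (not the module's actual forward pass): using the cached ops.
sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax = self.get_ops(L)
attn_scores = sparse_dot_sdd_nt(query, key)     # block-sparse query @ key^T (trans_b=True)
attn_probs = sparse_softmax(attn_scores)        # softmax over the non-zero blocks only
context = sparse_dot_dsd_nn(attn_probs, value)  # block-sparse probs @ value -> dense output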
Example 6
def run_mm_triton(x, w, mode, trans_a, trans_b, layout, block, dy):
    x = dense_to_sparse(x, layout, block) if mode == 'dsd' else x
    w = dense_to_sparse(w, layout, block) if mode == 'dds' else w
    dy = dense_to_sparse(dy, layout, block) if mode == 'sdd' else dy
    op = torch_blocksparse.MatMul(layout,
                                  block,
                                  mode,
                                  trans_a=trans_a,
                                  trans_b=trans_b)
    x.retain_grad()
    w.retain_grad()
    y = op(x, w)
    y.backward(dy)
    dx = x.grad.clone()
    dw = w.grad.clone()
    x.grad.zero_()
    return y, dx, dw
Example 7
import torch
import torch_blocksparse

# Z: non-sparse batch dimension
# H: sparse batch dimension
# M: row dimension
# N: column dimension
# K: reduction (inner) dimension
Z, H, M, N, K = 4, 2, 256, 512, 384
a = torch.rand((Z, H, K, M), dtype=torch.float32).cuda()
b = torch.rand((Z, H, K, N), dtype=torch.float32).cuda()
# create sparsity layout
block = 16
layout = torch.randint(0, 2, (H, M // block, N // block))
# create object for Sparse = trans(Dense) x Dense (sdd)
# there is some overhead here, as it pre-computes the look-up
# tables needed internally by the GPU kernels
dot = torch_blocksparse.MatMul(layout,
                               block,
                               'sdd',
                               trans_a=True,
                               trans_b=False)
c = dot(a, b)
# create object for Sparse = softmax(Sparse)
softmax = torch_blocksparse.Softmax(layout, block)
d = softmax(c)
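
The sparse result can then be multiplied back against a dense operand with a 'dsd' MatMul, mirroring the attention pattern in the get_ops examples above; the value tensor v and its trailing dimension below are illustrative additions, not part of the original snippet.

# continue the example: Dense = Sparse x Dense (dsd)
# v and its trailing dimension (64) are assumptions added for illustration
v = torch.rand((Z, H, N, 64), dtype=torch.float32).cuda()
dot_dsd = torch_blocksparse.MatMul(layout, block, 'dsd', trans_a=False, trans_b=False)
e = dot_dsd(d, v)  # dense output of shape (Z, H, M, 64)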