def make_sdd_lut(layout, block, dtype, device):
    """Build the look-up tables used by the SDD block-sparse matmul.

    The layout is handed to ``libtriton.superblock``, which yields
    ``(size, nnz)`` pairs; every pair produces one look-up table.

    Args:
        layout: Tensor with at least three dimensions (``shape[0]``,
            ``shape[1]`` and ``shape[2]`` are read). It is converted to
            int32 before use. Presumably a per-head block mask -- TODO
            confirm against the caller.
        block: Block size; the starting super-block width is divided by it.
        dtype: Unused; kept so existing callers keep working.
        device: Device the resulting look-up tables are moved to.

    Returns:
        A tuple ``(luts, None, widths, packs)`` where ``luts`` is a list of
        int32 tensors with four columns, ``widths`` holds
        ``rows // (size * size)`` for each table and ``packs`` holds the
        matching ``size`` values. The second slot is always ``None``.
    """
    # Blocks of 16 or less start from a narrower super-block.
    start_width = (128 if block > 16 else 32) // block
    # ``superblock`` only receives a raw address plus three extents, so the
    # buffer has to be a dense int32 array.  Its results come back as NumPy
    # arrays, so it presumably reads the address from host memory; both
    # ``cpu()`` and ``contiguous()`` are no-ops for a dense CPU layout.
    layout = layout.type(torch.int32).cpu().contiguous()
    segmented = libtriton.superblock(
        layout.data_ptr(),
        layout.shape[0],
        layout.shape[1],
        layout.shape[2],
        start_width,
    )
    luts, widths, packs = [], [], []
    for size, nnz in segmented:
        # One row of four values per entry.
        nnz = nnz.reshape(-1, 4)
        width = nnz.shape[0] // (size * size)
        luts.append(torch.from_numpy(nnz).type(torch.int32).to(device))
        widths.append(width)
        packs.append(size)
    # No locks are created here; ``None`` fills that slot of the result.
    return luts, None, widths, packs
def make_sdd_lut(layout, block, dtype, device):
    """Build the look-up tables used by the SDD block-sparse matmul.

    ``libtriton.superblock`` is called on the int32 layout and yields
    ``(size, nnz)`` pairs; each pair is turned into one look-up table.

    Args:
        layout: Tensor with at least three dimensions (the first three
            extents are passed to ``superblock``). Converted to int32.
        block: Block size; the starting super-block width is
            ``128 // block``.
        dtype: Unused; kept for backward compatibility of the signature.
        device: Device the look-up tables are moved to.

    Returns:
        ``(luts, None, widths, packs)``: a list of four-column int32
        tensors, a ``None`` placeholder, the per-table widths
        (``rows // (size * size)``) and the per-table ``size`` values.
    """
    start_width = 128 // block
    # Only a raw address and the extents are passed down, so the data must
    # be dense.  The results are NumPy arrays, so the address is presumably
    # dereferenced on the host; for a dense CPU layout both extra calls
    # return the tensor unchanged.
    layout = layout.type(torch.int32).cpu().contiguous()
    heads, rows, cols = layout.shape[0], layout.shape[1], layout.shape[2]
    superblocks = libtriton.superblock(
        layout.data_ptr(), heads, rows, cols, start_width
    )
    luts, widths, packs = [], [], []
    for size, nnz in superblocks:
        # View the entries as rows of four values.
        nnz = nnz.reshape(-1, 4)
        width = nnz.shape[0] // (size * size)
        luts.append(torch.from_numpy(nnz).type(torch.int32).to(device))
        widths.append(width)
        packs.append(size)
    # Locks are not created; the second element stays ``None``.
    return luts, None, widths, packs
def make_sdd_lut(layout, block, dtype, device):
    """Build flattened look-up tables for the SDD block-sparse matmul.

    The int32 layout is passed to ``libtriton.superblock``, which yields
    ``(size, nnz)`` pairs. For each pair the first four columns of ``nnz``
    are interleaved row by row into a flat int32 tensor on ``device``.

    Args:
        layout: Layout tensor; converted to int32 before use.
        block: Block size; the starting width is ``128 // block``.
        dtype: Unused.
        device: Device the look-up tables are moved to.

    Returns:
        ``(luts, None, widths, packs)``: the flat tables, a ``None``
        placeholder, ``rows // (size * size)`` per table, and the ``size``
        of each table.
    """
    first_width = 128 // block
    groups = libtriton.superblock(layout.type(torch.int32), first_width)
    luts = []
    widths = []
    packs = []
    for pack, entries in groups:
        per_pack = entries.shape[0] // (pack * pack)
        # Take columns 0..3 and lay them out row-major in one flat vector.
        columns = tuple(entries[:, k] for k in range(4))
        flat = torch.stack(columns, dim=1).view(-1).contiguous()
        luts.append(flat.type(torch.int32).to(device))
        widths.append(per_pack)
        packs.append(pack)
    # The second slot of the result is a placeholder and is always None.
    return luts, None, widths, packs