def ctor(self): code = pccm.FunctionCode() code.arg("is_cpu", "bool") code.arg("key_itemsize, value_itemsize", "int") code.arg("keys_data", "tv::Tensor") code.arg("values_data", "tv::Tensor") code.arg("stream", "std::uintptr_t", "0") code.ctor_init("is_cpu", "is_cpu") code.ctor_init("keys_data", "keys_data") code.ctor_init("values_data", "values_data") code.ctor_init("key_itemsize_", "key_itemsize") code.ctor_init("value_itemsize_", "value_itemsize") code.ctor_init("insert_count_", "0") code.raw(f""" TV_ASSERT_RT_ERR(key_itemsize == 4 || key_itemsize == 8, "key_itemsize must be 4 or 8"); TV_ASSERT_RT_ERR(value_itemsize == 4 || value_itemsize == 8, "value_itemsize must be 4 or 8"); if (!is_cpu){{ TV_ASSERT_RT_ERR(!keys_data.empty() && !values_data.empty(), "key and value must not empty"); TV_ASSERT_RT_ERR(keys_data.dim(0) == values_data.dim(0), "key and value must have same size"); TV_ASSERT_RT_ERR(key_itemsize == keys_data.itemsize(), "key_itemsize must equal to key_data"); TV_ASSERT_RT_ERR(value_itemsize == values_data.itemsize(), "value_itemsize must equal to values_data"); // clear cuda table here. clear(stream); }} """) if CUMM_CPU_ONLY_BUILD: code.raw( f"TV_ASSERT_RT_ERR(is_cpu, \"spconv not built with CUDA\");") return code
def voxel_empty_fill_mean(self): code = pccm.FunctionCode() code.arg("voxels", f"{self.dtype} *") code.arg("num_per_voxel", f"int *") code.arg("num_voxels", f"int") code.arg("num_points_per_voxel", f"int") code.arg("num_voxel_features", f"int") code.raw(f""" int voxel_stride = num_points_per_voxel * num_voxel_features; for (int i : tv::KernelLoopX<int>(num_voxels)){{ int count = min(num_points_per_voxel, num_per_voxel[i]); num_per_voxel[i] = count; for (int j = 0; j < num_voxel_features; ++j){{ auto voxel_ptr = voxels + i * voxel_stride + j; {self.dtype} sum_val = 0; for (int k = 0; k < count; ++k){{ sum_val += voxel_ptr[0]; voxel_ptr += num_voxel_features; }} sum_val = count == 0 ? 0 : sum_val / count; for (int k = count; k < num_points_per_voxel; ++k){{ voxel_ptr[0] = sum_val; voxel_ptr += num_voxel_features; }} }} }} """) return code
def generate_subm_conv_inds_cpu(self): code = pccm.FunctionCode() code.arg("indices", "tv::Tensor") code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor") code.arg("batch_size", "int") code.arg("input_dims", f"std::vector<int>") code.arg("ksize, dilation", f"std::vector<int>") code.raw(f""" int ndim = indices.dim(1) - 1; TV_ASSERT_RT_ERR(input_dims.size() == ndim && ksize.size() == ndim && dilation.size() == ndim, "your params size not equal to ndim", ndim); """) for ndim in self.ndims: code.raw(f""" if (ndim == {ndim}){{ tv::array<int, {ndim}> input_dims_; tv::array<int, {ndim}> ksize_, dilation_; for (int i = 0; i < {ndim}; ++i){{ input_dims_[i] = input_dims[i]; ksize_[i] = ksize[i]; dilation_[i] = dilation[i]; }} return SpconvIndicesCPU{ndim}D::generate_subm_conv_inds(indices, indice_pairs, out_inds, indice_num_per_loc, batch_size, input_dims_, ksize_, dilation_); }} """) code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") return code.ret("int")
def scatter_add(self): code = pccm.FunctionCode() code.arg("out", "tv::Tensor") code.arg("in", "tv::Tensor") code.arg("inds", "tv::Tensor") code.raw(f""" // tv::check_shape(inds, {{in.dim(0)}}); auto nhot = inds.dim(0); int channel = in.dim(1); tv::dispatch<float, double>(out.dtype(), [&](auto I){{ using T = TV_DECLTYPE(I); auto indices_data = inds.data_ptr<const int>(); const T *buffer_data = in.data_ptr<const T>(); T *features_data = out.data_ptr<T>(); const T *buf = in.data_ptr<const T>(); T *out_ptr = out.data_ptr<T>(); tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{ for (int i = begin; i < end; i += step) {{ buf = buffer_data + i * channel; out_ptr = features_data + indices_data[i] * channel; for (int j = 0; j < channel; ++j) {{ out_ptr[j] = out_ptr[j] + buf[j]; }} }} }}); }}); """) return code
def assign_arange_(self): """ this function assign "arange(NumItem)" to table values. useful in "unique-like" operations. unlike insert/query, this method only support i32/i64/u32/u64 for value. count must be u32/u64. """ code = pccm.FunctionCode() if not CUMM_CPU_ONLY_BUILD: code.add_dependency(TensorViewHashKernel) code.arg("count", "tv::Tensor") code.arg("stream", "std::uintptr_t", "0") with code.if_("is_cpu"): map_name = "cpu_map" for k_type, v_type in self.cpu_map_storage_select( "key_itemsize_", "value_itemsize_", map_name, code): code.raw(f""" {v_type} index = 0; for (auto it = {map_name}.begin(); it != {map_name}.end(); ++it){{ it.value() = index; ++index; }} """) if not CUMM_CPU_ONLY_BUILD: with code.else_(): code.raw(f""" TV_ASSERT_RT_ERR(count.device() == 0, "count must be cuda"); auto custream = reinterpret_cast<cudaStream_t>(stream); """) for k_items in _dispatch_ints(code, [4, 8], "keys_data.itemsize()"): code.raw(f""" using K = tv::hash::itemsize_to_unsigned_t<{k_items}>; constexpr K kEmptyKey = std::numeric_limits<K>::max(); auto count_ptr = count.data_ptr<K>(); K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data()); """) val_dtypes = [ dtypes.int32, dtypes.int64, dtypes.uint32, dtypes.uint64 ] for v_dtype in _dispatch(code, val_dtypes, "values_data.dtype()"): code.raw(f""" using V = {v_dtype}; V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data()); using table_t = tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>, kEmptyKey, false>; table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0)); tv::cuda::Launch launcher(table.size(), custream); launcher(tv::hash::assign_arange_split<table_t, K>, table, count_ptr); """) else: code.raw(f""" TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda"); """) return code
def gather(self): code = pccm.FunctionCode() code.arg("out", "tv::Tensor") code.arg("in", "tv::Tensor") code.arg("inds", "tv::Tensor") code.raw(f""" // tv::check_shape(inds, {{out.dim(0)}}); auto nhot = inds.dim(0); int channel = in.dim(1); tv::dispatch<float, double>(out.dtype(), [&](auto I){{ auto indices_data = inds.data_ptr<const int>(); using T = TV_DECLTYPE(I); T *buffer_data = out.data_ptr<T>(); const T *features_data = in.data_ptr<const T>(); tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{ for (int i = begin; i < end; i += step) {{ std::memcpy(buffer_data + i * channel, features_data + indices_data[i] * channel, sizeof(T) * channel); }} }}); }}); """) return code
def forward_kernel(self): code = pccm.FunctionCode() code.targ("T") code.arg("out_features", f"T*") code.arg("in_features", f"const T*") code.arg("out_indices", "const int*") code.arg("in_indices", "const int*") code.arg("size", "int") code.arg("num_features", "int") code.raw(f""" for (int i : tv::KernelLoopY<int>(size)) {{ int in_idx = in_indices[i]; int out_idx = out_indices[i]; auto in_ptr = in_features + in_idx * num_features; auto out_ptr = out_features + out_idx * num_features; for (int j : tv::KernelLoopX<int>(num_features)) {{ auto in = in_ptr[j]; auto out = out_ptr[j]; if (in > out){{ out_ptr[j] = in; }} }} }} """) return code
def backward_kernel(self): code = pccm.FunctionCode() code.targ("T") code.arg("out_features", f"const T*") code.arg("in_features", f"const T*") code.arg("dout_features", f"const T*") code.arg("din_features", f"T*") code.arg("out_indices", "const int*") code.arg("in_indices", "const int*") code.arg("size", "int") code.arg("num_features", "int") code.raw(f""" for (int i : tv::KernelLoopY<int>(size)) {{ int in_idx_offset = in_indices[i] * num_features; int out_idx_offset = out_indices[i] * num_features; auto in_ptr = in_features + in_idx_offset; auto out_ptr = out_features + out_idx_offset; auto din_ptr = din_features + in_idx_offset; auto dout_ptr = dout_features + out_idx_offset; for (int j : tv::KernelLoopX<int>(num_features)) {{ auto in = in_ptr[j]; auto out = out_ptr[j]; if (in == out){{ din_ptr[j] = din_ptr[j] + dout_ptr[j]; }} }} }} """) return code
def calc_meta_data(self): code = pccm.FunctionCode() code.arg("vsize_xyz", f"std::array<float, {self.ndim}>") code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>") code.raw(f""" return Point2VoxelCommon::calc_meta_data(vsize_xyz, coors_range_xyz); """) return code.ret(self.p2v_c.calc_meta_ret)
def sort_1d_by_key_split_allocator(self): code = pccm.FunctionCode() if CUMM_CPU_ONLY_BUILD: return code.make_invalid() code.arg("data", "tv::Tensor") code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>") code.arg("mask", "tv::Tensor") code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()") code.arg("stream", "std::uintptr_t", "0", pyanno="int") code.arg("mask_output", "bool", "false") code.code_after_include = f""" template <typename T> struct MaskedElementComp {{ T mask_; TV_HOST_DEVICE_INLINE T operator()(const T &x, const T &y) const {{ return (x & mask_) < (y & mask_); }} }}; template <typename T> __global__ void mask_input(T* inp, T mask, int size){{ for (int i : tv::KernelLoopX<int>(size)){{ inp[i] &= mask; }} }} """ code.add_dependency(CustomThrustLib, TensorViewKernel) code.add_param_class("cudakers", CudaCommonKernel()) code.raw(f""" ThrustCustomAllocatorV2 allocator{{alloc_func}}; cudaStream_t stream_cu = reinterpret_cast<cudaStream_t>(stream); // auto timer = tv::CudaContextTimer<>(); if (indices.empty()){{ indices = tv::empty({{data.dim(0)}}, tv::int32, 0); }} tv::cuda::Launch launcher(data.dim(0), stream_cu); launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0)); tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{ using T = TV_DECLTYPE(I); auto masks_ptr = mask.data_ptr<T>(); MaskedElementComp<T> op_comp{{masks_ptr[0]}}; thrust::device_ptr<T> ptr_tr(data.data_ptr<T>()); thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>()); // auto thrust_ctx = thrust::cuda::par.on(stream_cu); auto ctx2 = thrust::cuda::par(allocator).on(stream_cu); thrust::sort_by_key(ctx2, ptr_tr, ptr_tr + data.dim(0), ptr_k, op_comp); if (mask_output){{ launcher(mask_input<T>, data.data_ptr<T>(), masks_ptr[0], data.dim(0)); }} }}); // tv::ssprint("SORT_BY_KEY_MASKED", timer.report() / 1000.0); return indices; """) return code.ret("tv::Tensor")
def count_bits(self): code = pccm.FunctionCode() if CUMM_CPU_ONLY_BUILD: return code.make_invalid() code.add_dependency(TensorViewKernel) code.arg("a", "tv::Tensor") code.code_after_include = f""" __global__ void count_bits_kernel_64(const uint64_t* data, int32_t* out, int size){{ for (int i : tv::KernelLoopX<int>(size)){{ out[i] = __popcll(reinterpret_cast<const unsigned long long*>(data)[i]); }} }} __global__ void count_bits_kernel(const uint32_t* data, int32_t* out, int size){{ for (int i : tv::KernelLoopX<int>(size)){{ out[i] = __popc(data[i]); }} }} int numberOfSetBits(uint32_t i) {{ // https://stackoverflow.com/questions/109023/how-to-count-the-number-of-set-bits-in-a-32-bit-integer // Java: use int, and use >>> instead of >>. Or use Integer.bitCount() // C or C++: use uint32_t i = i - ((i >> 1) & 0x55555555); // add pairs of bits i = (i & 0x33333333) + ((i >> 2) & 0x33333333); // quads i = (i + (i >> 4)) & 0x0F0F0F0F; // groups of 8 return (i * 0x01010101) >> 24; // horizontal sum of bytes }} int numberOfSetBits(uint64_t i) {{ return numberOfSetBits(uint32_t(i)) + numberOfSetBits(uint32_t(i >> 32)); }} """ code.raw(f""" tv::Tensor res(a.shape(), tv::int32, a.device()); tv::dispatch<uint32_t, uint64_t>(a.dtype(), [&](auto I){{ auto res_ptr = res.data_ptr<int>(); using T = TV_DECLTYPE(I); auto a_ptr = a.data_ptr<const T>(); if (a.device() == -1){{ for (int i = 0; i < a.size(); ++i){{ res_ptr[i] = numberOfSetBits(a_ptr[i]); }} }}else{{ tv::cuda::Launch launcher(a.size()); tv::if_constexpr<std::is_same<T, uint64_t>::value>([=](auto _)mutable{{ launcher(_(count_bits_kernel_64), a_ptr, res_ptr, int(a.size())); }}, [=](auto _)mutable{{ launcher(_(count_bits_kernel), a_ptr, res_ptr, int(a.size())); }}); }} }}); return res; """) return code.ret("tv::Tensor")
def non_max_suppression_cpu(self): code = pccm.FunctionCode() code.arg("boxes, order", "tv::Tensor") code.arg("thresh", "float") code.arg("eps", "float", "0") code.raw(f""" auto ndets = boxes.dim(0); std::vector<int> keep(ndets); tv::dispatch<float, double>(boxes.dtype(), [&](auto I1){{ using DType = TV_DECLTYPE(I1); auto boxes_r = boxes.tview<const DType, 2>(); tv::dispatch<int, int64_t, uint32_t, uint64_t>(order.dtype(), [&](auto I2){{ using T2 = TV_DECLTYPE(I2); auto order_r = order.tview<const T2, 1>(); std::vector<DType> areas; for (int i = 0; i < ndets; ++i){{ areas[i] = (boxes_r(i, 2) - boxes_r(i, 0) + eps) * (boxes_r(i, 3) - boxes_r(i, 1) + eps); }} std::vector<int> suppressed(ndets, 0); int i, j; DType xx1, xx2, w, h, inter, ovr; for (int _i = 0; _i < ndets; ++_i) {{ i = order_r(_i); if (suppressed[i] == 1) continue; keep.push_back(i); for (int _j = _i + 1; _j < ndets; ++_j) {{ j = order_r(_j); if (suppressed[j] == 1) continue; xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2)); xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0)); w = xx2 - xx1 + eps; if (w > 0) {{ xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3)); xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1)); h = xx2 - xx1 + eps; if (h > 0) {{ inter = w * h; ovr = inter / (areas[i] + areas[j] - inter); if (ovr >= thresh) suppressed[j] = 1; }} }} }} }} }}); }}); return keep; """) return code.ret("std::vector<int>")
def get_grid_size(self): code = pccm.FunctionCode() code.raw(f""" std::array<int, {self.ndim}> res; for (int i = 0; i < {self.ndim}; ++i){{ res[i] = grid_size[i]; }} return res; """) return code.ret(f"std::array<int, {self.ndim}>")
def scatter_add_cpu(self): code = pccm.FunctionCode() code.arg("out", "tv::Tensor") code.arg("inp", "tv::Tensor") code.arg("inds", "tv::Tensor") code.add_dependency(GatherCPU) code.raw(f""" return GatherCPU::scatter_add(out, inp, inds); """) return code
def limit_num_per_voxel_value(self): code = pccm.FunctionCode() code.arg("num_per_voxel", f"int *") code.arg("num_voxels, num_points_per_voxel", f"int") code.raw(f""" for (int i : tv::KernelLoopX<int>(num_voxels)){{ int count = min(num_points_per_voxel, num_per_voxel[i]); num_per_voxel[i] = count; }} """) return code
def maxpool_forward_cpu(self): code = pccm.FunctionCode() code.arg("out", "tv::Tensor") code.arg("inp", "tv::Tensor") code.arg("out_inds", "tv::Tensor") code.arg("in_inds", "tv::Tensor") code.add_dependency(IndiceMaxPoolCPU) code.raw(f""" return IndiceMaxPoolCPU::forward(out, inp, out_inds, in_inds); """) return code
def maxpool_implicit_gemm_forward(self): code = pccm.FunctionCode() if CUMM_CPU_ONLY_BUILD: return code.make_invalid() code.arg("out", "tv::Tensor") code.arg("inp", "tv::Tensor") code.arg("inds", "tv::Tensor") code.arg("stream", "std::uintptr_t", "0", pyanno="int") code.add_dependency(IndiceMaxPool) code.raw(f""" return IndiceMaxPool::forward_implicit_gemm(out, inp, inds, stream); """) return code
def array2tvarray(self): code = pccm.FunctionCode() code.targ("T") code.nontype_targ("N", "size_t") code.arg("arr", "std::array<T, N>") code.raw(f""" tv::array<T, N> tarr; for (int i = 0; i < N; ++i){{ tarr[i] = arr[i]; }} return tarr; """) return code.ret("tv::array<T, N>")
def point_to_voxel_empty_mean(self): code = pccm.FunctionCode() code.arg("points", "tv::Tensor") code.arg("clear_voxels", "bool", "true") code.raw(f""" tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, -1); return point_to_voxel_empty_mean_static(points, voxels, indices, num_per_voxel, densehashdata, points_voxel_id, tvarray2array(vsize), tvarray2array(grid_size), tvarray2array(grid_stride), tvarray2array(coors_range), clear_voxels); """) return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
def allocate(self): code = pccm.FunctionCode() code.arg("num_bytes", "std::ptrdiff_t") code.ret("char*") code.raw(f""" if (alloc_func){{ char* result = reinterpret_cast<char*>(alloc_func(num_bytes)); return result; }} else{{ TV_THROW_RT_ERR("set alloc function first."); }} """) return code
def generate_conv_inds_stage1_5(self): code = pccm.FunctionCode() code.arg("indice_pairs_uniq", "tv::Tensor") code.arg("ndim", "int") code.arg("uniq_size", "int64_t") code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int") if CUMM_CPU_ONLY_BUILD: return code.make_invalid() for ndim in self.ndims: code.raw(f""" if (ndim == {ndim}){{ return SpconvIndices{ndim}D::generate_conv_inds_stage1_5(indice_pairs_uniq, uniq_size, stream_int); }} """) code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") return code.ret("int")
def calc_meta_data(self): code = pccm.FunctionCode() code.arg("vsize_xyz", f"std::array<float, {self.ndim}>") code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>") code.raw(f""" std::array<float, {self.ndim}> vsize; std::array<int, {self.ndim}> grid_size, grid_stride; std::array<float, {self.ndim * 2}> coors_range; """) if self.zyx: code.raw(f""" for (int i = 0; i < {self.ndim}; ++i){{ vsize[{self.ndim - 1} - i] = vsize_xyz[i]; coors_range[{self.ndim - 1} - i] = coors_range_xyz[i]; coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}]; }} """) else: code.raw(f""" for (int i = 0; i < {self.ndim}; ++i){{ vsize[i] = vsize_xyz[i]; coors_range[i] = coors_range_xyz[i]; coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}]; }} """) code.raw(f""" int64_t prod = 1; for (size_t i = 0; i < {self.ndim}; ++i) {{ grid_size[i] = std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]); }} for (int i = {self.ndim} - 1; i >= 0; --i) {{ grid_stride[i] = prod; prod *= grid_size[i]; }} return std::make_tuple(vsize, grid_size, grid_stride, coors_range); """) ret_str = f"std::array<int, {self.ndim}>" retf_str = f"std::array<float, {self.ndim}>" retf2_str = f"std::array<float, {self.ndim * 2}>" return code.ret( f"std::tuple<{retf_str}, {ret_str}, {ret_str}, {retf2_str}>")
def point2voxel_cpu(self): code = pccm.FunctionCode() code.arg("points", "tv::Tensor") code.arg("voxels, indices, num_per_voxel, densehashdata, pc_voxel_id", "tv::Tensor") code.arg("vsize", f"std::vector<float>") code.arg("grid_size, grid_stride", f"std::vector<int>") code.arg("coors_range", f"std::vector<float>") code.arg("empty_mean", "bool", "false") code.arg("clear_voxels", "bool", "true") code.raw(f""" int ndim = vsize.size(); TV_ASSERT_RT_ERR(vsize.size() == ndim && grid_stride.size() == ndim && coors_range.size() == ndim * 2 && grid_size.size() == ndim, "your params size not equal to ndim", ndim); // voxels: [] """) for ndim in self.ndims: code.raw(f""" if (ndim == {ndim}){{ std::array<float, {ndim}> vsize_; std::array<int, {ndim}> grid_size_, grid_stride_; std::array<float, {ndim * 2}> coors_range_; for (int i = 0; i < {ndim}; ++i){{ vsize_[i] = vsize[i]; grid_size_[i] = grid_size[i]; grid_stride_[i] = grid_stride[i]; coors_range_[i] = coors_range[i]; coors_range_[i + {ndim}] = coors_range[i + {ndim}]; }} if (empty_mean){{ return Point2Voxel{ndim}DCPU::point_to_voxel_empty_mean_static(points, voxels, indices, num_per_voxel, densehashdata, pc_voxel_id, vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels); }} else{{ return Point2Voxel{ndim}DCPU::point_to_voxel_static(points, voxels, indices, num_per_voxel, densehashdata, pc_voxel_id, vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels); }} }} """) code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
def size_cpu(self): """ this function can only be used to get cpu hash table size. """ code = pccm.FunctionCode() code.raw(f""" int64_t res = -1; TV_ASSERT_RT_ERR(is_cpu, "size_cpu can only be used in cpu hash table"); """) with code.if_("is_cpu"): map_name = "cpu_map" for _ in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code): code.raw(f""" res = {map_name}.size(); """) code.raw(f"return res;") return code.ret("int64_t")
def generate_voxel(self): code = pccm.FunctionCode() code.targ("TTable") code.arg("table", "TTable") code.arg("points", f"{self.dtype} const*") code.arg("points_indice_data", f"const int64_t*") code.arg("voxels", f"{self.dtype} *") code.arg("num_per_voxel", f"int *") code.arg("points_voxel_id", f"int64_t*") code.arg("point_stride", f"int") code.arg("max_points_per_voxel", f"int") code.arg("max_voxels", f"int") code.arg("vsize", f"tv::array<float, {self.ndim}>") code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>") code.arg("grid_bound", f"tv::array<int, {self.ndim}>") code.arg("grid_stride", f"tv::array<int, {self.ndim}>") code.arg("num_points", f"int") # TODO add backward? code.raw(f""" int voxel_stride0 = point_stride * max_points_per_voxel; for (int i : tv::KernelLoopX<int>(num_points)){{ int64_t prod = points_indice_data[i]; int voxel_id = -1; if (prod != -1){{ auto voxel_index_pair = table.lookup(prod); if (!voxel_index_pair.empty() && voxel_index_pair.second < max_voxels) {{ voxel_id = voxel_index_pair.second; int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1); if (old < max_points_per_voxel) {{ for (int j = 0; j < point_stride; ++j) {{ voxels[voxel_index_pair.second * voxel_stride0 + old * point_stride + j] = points[i * point_stride + j]; }} }} }} }} points_voxel_id[i] = voxel_id; }} """) return code
def backward_implicit_gemm_kernel(self): code = pccm.FunctionCode() code.targ("T") code.arg("out_features", f"const T*") code.arg("in_features", f"const T*") code.arg("dout_features", f"const T*") code.arg("din_features", f"T*") code.arg("indices_bwd", "const int*") code.arg("num_features", "int") code.arg("RS", "int") code.arg("num_indices", "int") code.raw(f""" for (int i : tv::KernelLoopY<int>(num_indices)) {{ auto in_ptr = in_features + i * num_features; auto din_ptr = din_features + i * num_features; for (int j : tv::KernelLoopX<int>(num_features)) {{ auto indices_ptr = indices_bwd + i; int out_idx = indices_ptr[0]; T in = in_ptr[j]; T sum_val = T(0); // if idx invalid, we only need to ensure in not equal to out. T out = out_idx != -1 ? out_features[out_idx * num_features + j] : T(0); T dout = out_idx != -1 ? dout_features[out_idx * num_features + j] : T(0); bool valid = in == out && out_idx != -1; sum_val = valid ? sum_val + dout : sum_val; indices_ptr += num_indices; for (int k = 1; k < RS; ++k){{ out_idx = indices_ptr[0]; out = out_idx != -1 ? out_features[out_idx * num_features + j] : T(0); dout = out_idx != -1 ? dout_features[out_idx * num_features + j] : T(0); valid = in == out && out_idx != -1; sum_val = valid ? sum_val + dout : sum_val; indices_ptr += num_indices; }} din_ptr[j] = sum_val; }} }} """) return code
def sort_1d_by_key(self): code = pccm.FunctionCode() if CUMM_CPU_ONLY_BUILD: return code.make_invalid() code.arg("data", "tv::Tensor") code.arg("indices", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()") code.arg("stream", "std::uintptr_t", "0", pyanno="int") code.code_after_include = f""" template <typename T> struct SmallOrEqualTo {{ TV_HOST_DEVICE_INLINE T operator()(const T &x, const T &y) const {{ return x < y; }} }}; template <typename T> __global__ void mask_input(T* inp, T mask, int size){{ for (int i : tv::KernelLoopX<int>(size)){{ inp[i] &= mask; }} }} """ code.add_dependency(ThrustLib, TensorViewKernel) code.add_param_class("cudakers", CudaCommonKernel()) code.raw(f""" cudaStream_t stream_cu = reinterpret_cast<cudaStream_t>(stream); if (indices.empty()){{ indices = tv::empty({{data.dim(0)}}, tv::int32, 0); }} tv::cuda::Launch launcher(data.dim(0), stream_cu); launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0)); auto timer = tv::CUDATimer(); tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{ using T = TV_DECLTYPE(I); thrust::device_ptr<T> ptr_tr(data.data_ptr<T>()); thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>()); auto thrust_ctx = thrust::cuda::par.on(stream_cu); thrust::stable_sort_by_key(thrust_ctx, ptr_tr, ptr_tr + data.dim(0), ptr_k, SmallOrEqualTo<uint32_t>()); }}); tv::ssprint("SORT BY KEY TIME", data.dim(0), timer.report() / 1000.0); return indices; """) return code.ret("tv::Tensor")
def generate_conv_inds_mask_stage2(self): code = pccm.FunctionCode() if CUMM_CPU_ONLY_BUILD: return code.make_invalid() code.arg("indices, hashdata", "tv::Tensor") code.arg( "indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds", "tv::Tensor") code.arg("mask_fwd, mask_bwd", "tv::Tensor") code.arg("num_out_act", "int") code.arg("batch_size", "int") code.arg("output_dims, input_dims", f"std::vector<int>") code.arg("ksize, stride, padding, dilation", f"std::vector<int>") code.arg("transposed", f"bool", "false") code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int") code.raw(f""" int ndim = indices.dim(1) - 1; TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim && ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim && padding.size() == ndim, "your params size not equal to ndim", ndim); """) for ndim in self.ndims: code.raw(f""" if (ndim == {ndim}){{ tv::array<int, {ndim}> output_dims_, input_dims_; tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_; for (int i = 0; i < {ndim}; ++i){{ output_dims_[i] = output_dims[i]; input_dims_[i] = input_dims[i]; ksize_[i] = ksize[i]; stride_[i] = stride[i]; padding_[i] = padding[i]; dilation_[i] = dilation[i]; }} return SpconvIndices{ndim}D::generate_conv_inds_stage2_mask(indices, hashdata, indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds, mask_fwd, mask_bwd, num_out_act, batch_size, output_dims_, input_dims_, ksize_, stride_, padding_, dilation_, transposed, stream_int); }} """) code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") return code.ret("int")
def clear(self): """ in this function, if values is empty, it will be assigned to zero. """ code = pccm.FunctionCode() if not CUMM_CPU_ONLY_BUILD: code.add_dependency(TensorViewHashKernel) code.arg("stream", "std::uintptr_t", "0") with code.if_("is_cpu"): code.raw(f""" if (is_cpu){{ map_4_4.clear(); map_4_8.clear(); map_8_4.clear(); map_8_8.clear(); return; }} """) if not CUMM_CPU_ONLY_BUILD: with code.else_(): code.raw(f""" auto custream = reinterpret_cast<cudaStream_t>(stream); """) for k_items in _dispatch_ints(code, [4, 8], "keys_data.itemsize()"): code.raw(f""" using K = tv::hash::itemsize_to_unsigned_t<{k_items}>; constexpr K kEmptyKey = std::numeric_limits<K>::max(); K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data()); """) for v_items in _dispatch_ints(code, [4, 8], "values_data.itemsize()"): code.raw(f""" using V = tv::hash::itemsize_to_unsigned_t<{v_items}>; V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data()); using table_t = tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>, kEmptyKey, false>; table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0)); tv::cuda::Launch launcher(table.size(), custream); launcher(tv::hash::clear_table_split<table_t>, table); """) return code
def ctor(self): code = pccm.FunctionCode() code.arg("vsize_xyz", f"std::array<float, {self.ndim}>") code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>") code.arg("num_point_features", f"int") code.arg("max_num_voxels, max_num_points_per_voxel", f"int") if self.zyx: code.raw(f""" for (int i = 0; i < {self.ndim}; ++i){{ vsize[{self.ndim - 1} - i] = vsize_xyz[i]; coors_range[{self.ndim - 1} - i] = coors_range_xyz[i]; coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}]; }} """) else: code.raw(f""" for (int i = 0; i < {self.ndim}; ++i){{ vsize[i] = vsize_xyz[i]; coors_range[i] = coors_range_xyz[i]; coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}]; }} """) code.raw(f""" int64_t prod = 1; for (size_t i = 0; i < {self.ndim}; ++i) {{ grid_size[i] = std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]); }} for (int i = {self.ndim} - 1; i >= 0; --i) {{ grid_stride[i] = prod; prod *= grid_size[i]; }} voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, -1); indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, -1); num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, -1); tv::TensorShape grid_shape(grid_size.data(), grid_size.data() + {self.ndim}); densehashdata = tv::zeros(grid_shape, tv::int32, -1); auto densehashdata_ptr = densehashdata.data_ptr<int>(); for (int i= 0; i < densehashdata.size(); ++i){{ densehashdata_ptr[i] = -1; }} """) return code