def _load_torch_gpu_allocator_cpp_extension(verbosity, is_rocm_pytorch):
    gpu_identifier = "hip" if is_rocm_pytorch else "cuda"
    gpu_allocator_header = "HIPCachingAllocator" if is_rocm_pytorch else "CUDACachingAllocator"
    torch_gpu_allocator_addresses_cpp_source = f'''
        #include <torch/extension.h>
        #include <c10/{gpu_identifier}/{gpu_allocator_header}.h>

        size_t gpu_caching_allocator_raw_alloc_address() {{
            return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_alloc);
        }}

        size_t gpu_caching_allocator_raw_delete_address() {{
            return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_delete);
        }}
    '''

    return load_inline(name='inline_extension',
                       cpp_sources=[torch_gpu_allocator_addresses_cpp_source],
                       extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
                       functions=['gpu_caching_allocator_raw_alloc_address',
                                  'gpu_caching_allocator_raw_delete_address'],
                       verbose=verbosity,
                       with_cuda=True)
def re_build_func_torch():
    mod = cpp_extension.load_inline(
        name=self.module_name,
        cpp_sources=self.source,
        functions=self.functions,
        verbose=True,
        build_directory=self.path,
        is_python_module=True,
    )
    return mod
def _load_torch_allocator_cpp_extension(verbosity):
    torch_cuda_allocator_addresses_cpp_source = """
    #include <torch/extension.h>
    #include <c10/cuda/CUDACachingAllocator.h>

    size_t cuda_caching_allocator_raw_alloc_address() {
        return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_alloc);
    }

    size_t cuda_caching_allocator_raw_delete_address() {
        return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_delete);
    }
    """

    return load_inline(name='inline_extension',
                       cpp_sources=[torch_cuda_allocator_addresses_cpp_source],
                       functions=['cuda_caching_allocator_raw_alloc_address',
                                  'cuda_caching_allocator_raw_delete_address'],
                       verbose=verbosity < Verbosity.WARNING,
                       with_cuda=True)
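# Usage sketch for either allocator loader above (not from the original code):
# the extensions return plain integer addresses, which the standard `ctypes`
# module can turn back into callables. This assumes c10's signatures are
# `void* raw_alloc(size_t)` and `void raw_delete(void*)`; treat it as an
# illustration, not a supported API.
import ctypes

ext = _load_torch_allocator_cpp_extension(verbosity)
raw_alloc = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_size_t)(
    ext.cuda_caching_allocator_raw_alloc_address())
raw_delete = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
    ext.cuda_caching_allocator_raw_delete_address())

buf = raw_alloc(1024)  # 1 KiB served by PyTorch's CUDA caching allocator
raw_delete(buf)        # returned to the cache rather than cudaFree'd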
        scratch[current][0] = (int)h;
        scratch[current][1] = (int)mm;
        scratch[current][2] = (int)nn;
        current++;
      }
      m = firstm;
      n = firstn;
    }
  }
  ret.push_back({max_width, scratch.slice(0, 0, current).clone()});
}

ret_t load_balance(at::Tensor layout) {
  ret_t ret;
  at::Tensor scratch = at::empty({layout.sum().item<int>(), 3}, layout.dtype());
  for (int max_width = 4; max_width > 0; max_width /= 2)
    segment_blocks(layout, scratch, max_width, ret);
  return ret;
}
'''

block = 16
L = 256
stride = 128
layout = torch_blocksparse.MultiheadAttention._make_layout(1, L // block, 'fixed', stride // block, True, 1, 1)
module = load_inline(name='load_balance', cpp_sources=[source], functions=['load_balance'])
balanced = module.load_balance(layout)
print(layout)
print(balanced)
torch::Tensor batched_dot_mul_sum_v1(
    const torch::Tensor& a,
    const torch::Tensor& b) {
  return a.mul(b).sum(-1);
}
"""

# PyTorch makes it easy to test our C++ implementations by providing a utility
# to JIT compile C++ source into Python extensions:
import os

from torch.utils import cpp_extension

cpp_lib = cpp_extension.load_inline(
    name='cpp_lib',
    cpp_sources=batched_dot_src,
    extra_cflags=['-O3'],
    extra_include_paths=[
        # `load_inline` needs to know where to find Pybind11 headers.
        os.path.join(os.getenv('CONDA_PREFIX'), 'include')
    ],
    functions=['batched_dot_mul_sum_v0', 'batched_dot_mul_sum_v1']
)

# `load_inline` will create a shared object that is loaded into Python. When we collect
# instruction counts Timer will create a subprocess, so we need to re-import it. The
# import process is slightly more complicated for C extensions, but that's all we're
# doing here.
module_import_str = f"""\
# https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
import importlib.util
spec = importlib.util.spec_from_file_location("cpp_lib", {repr(cpp_lib.__file__)})
cpp_lib = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cpp_lib)"""
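# A follow-on sketch (not from the original): `module_import_str` is intended to
# be dropped into a Timer's `setup` string so the measurement subprocess can
# re-import the compiled extension. The statement and tensor shape here are
# illustrative assumptions.
from torch.utils.benchmark import Timer

timer = Timer(
    stmt="cpp_lib.batched_dot_mul_sum_v1(x, x)",
    setup=f"{module_import_str}\nimport torch\nx = torch.randn(64, 256)",
)
print(timer.blocked_autorange())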
import torch
from torch.utils.cpp_extension import load_inline

from .pair_wise_distance_cuda_source import source

print("compiling cuda source of the 'pair_wise_distance' function...")
print(
    "NOTE: to skip this JIT compilation step, move the source into a .cu file and "
    "build it ahead of time, following https://pytorch.org/tutorials/advanced/cpp_extension.html"
)
pair_wise_distance_cuda = load_inline("pair_wise_distance", cpp_sources="", cuda_sources=source)
print("done")


class PairwiseDistFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, pixel_features, spixel_features, init_spixel_indices,
                num_spixels_width, num_spixels_height):
        ctx.num_spixels_width = num_spixels_width
        ctx.num_spixels_height = num_spixels_height
        output = pixel_features.new(pixel_features.shape[0], 9, pixel_features.shape[-1]).zero_()
        ctx.save_for_backward(pixel_features, spixel_features, init_spixel_indices)
        return pair_wise_distance_cuda.forward(
            pixel_features.contiguous(), spixel_features.contiguous(),
            init_spixel_indices.contiguous(), output,
            ctx.num_spixels_width, ctx.num_spixels_height)
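# A hedged usage sketch (not from the original). Shapes are inferred from
# `forward` above, which treats pixel_features as (batch, channels, n_pixels)
# and allocates a (batch, 9, n_pixels) output; the index dtype and the 16x16
# superpixel grid are assumptions about the kernel's contract.
if torch.cuda.is_available():
    pixel_features = torch.randn(1, 20, 64 * 64, device='cuda')
    spixel_features = torch.randn(1, 20, 16 * 16, device='cuda')
    init_spixel_indices = torch.zeros(1, 64 * 64, dtype=torch.long, device='cuda')
    dist = PairwiseDistFunction.apply(
        pixel_features, spixel_features, init_spixel_indices, 16, 16)
    print(dist.shape)  # expected: torch.Size([1, 9, 4096])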
def _load_aten_op_executor_cpp_extension(verbosity, is_rocm_pytorch):
    aten_op_executor_cpp_source = """
#include <torch/torch.h>
#include <ATen/DLConvertor.h>
#include <unordered_map>
#include <tuple>
#include <vector>

class ATenOperatorCache {
 public:
  static ATenOperatorCache& Instance() {
    static ATenOperatorCache instance;
    return instance;
  }

  std::shared_ptr<torch::jit::Operator> GetOperator(const std::string& op_name) {
    if (ops_.find(op_name) == ops_.end()) {
      auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString(op_name));
      TORCH_INTERNAL_ASSERT(ops.size() == 1);
      ops_[op_name] = ops.front();
    }
    return ops_.at(op_name);
  }

 private:
  ATenOperatorCache() = default;
  std::unordered_map<std::string, std::shared_ptr<torch::jit::Operator>> ops_;
};

// Some arguments of a backward operator do not come directly from the forward
// operator's inputs or outputs, but need some processing. Since we cannot build
// such processing into the ONNX graph for now, we put the processing code here
// when needed. Take embedding_backward as an example:
//   embedding_backward(grad, indices, weight.size(0), padding_idx, scale_grad_by_freq, sparse)
// The 3rd argument (index 2) is weight.size(0), so we add that processing here.
using TensorTransformFunc = std::function<c10::IValue(const at::Tensor&)>;
static const TensorTransformFunc embedding_num_weights = [](const at::Tensor& tensor) {
  return c10::IValue(tensor.size(0));
};

static const std::unordered_map<std::string, std::unordered_map<size_t, TensorTransformFunc>>
    TENSOR_TRANSFORM_FUNCS = {
        {"aten::embedding_backward", {{2, embedding_num_weights}}},
};

template <typename T>
void SetIValueArguments(const std::vector<std::tuple<size_t, T>>& raw_arguments,
                        std::vector<c10::IValue>& ivalue_arguments) {
  for (size_t i = 0; i < raw_arguments.size(); i++) {
    size_t index = std::get<0>(raw_arguments[i]);
    TORCH_INTERNAL_ASSERT(index < ivalue_arguments.size());
    ivalue_arguments[index] = c10::IValue(std::get<1>(raw_arguments[i]));
  }
}

// TODO: Add more argument types, such as list type.
std::vector<DLManagedTensor*> ExecuteATenOperator(
    const char* op_name,
    const std::vector<std::tuple<size_t, DLManagedTensor*>>& tensor_arguments,
    const std::vector<std::tuple<size_t, int64_t>>& int_arguments,
    const std::vector<std::tuple<size_t, float>>& float_arguments,
    const std::vector<std::tuple<size_t, bool>>& bool_arguments) {
  std::string op_name_str(op_name);
  std::shared_ptr<torch::jit::Operator> op =
      ATenOperatorCache::Instance().GetOperator(op_name_str);

  // TODO: need to handle optional arguments and arguments with default values.
  std::vector<c10::IValue> arguments;
  arguments.resize(op->schema().arguments().size());
  for (size_t i = 0; i < tensor_arguments.size(); i++) {
    size_t index = std::get<0>(tensor_arguments[i]);
    at::Tensor tensor = at::fromDLPack(std::get<1>(tensor_arguments[i]));
    bool has_transform_func = false;
    if (TENSOR_TRANSFORM_FUNCS.find(op_name_str) != TENSOR_TRANSFORM_FUNCS.end()) {
      const auto& transform_funcs = TENSOR_TRANSFORM_FUNCS.at(op_name_str);
      if (transform_funcs.find(index) != transform_funcs.end()) {
        arguments[index] = transform_funcs.at(index)(tensor);
        has_transform_func = true;
      }
    }

    if (!has_transform_func) {
      arguments[index] = c10::IValue(tensor);
    }
  }

  SetIValueArguments<int64_t>(int_arguments, arguments);
  SetIValueArguments<float>(float_arguments, arguments);
  SetIValueArguments<bool>(bool_arguments, arguments);

  torch::jit::Stack stack;
  for (size_t i = 0; i < arguments.size(); i++) {
    torch::jit::push(stack, arguments[i]);
  }

  op->getOperation()(&stack);
  // TODO: need to handle multiple-tensor outputs.
  at::Tensor output;
  torch::jit::pop(stack, output);
  std::vector<DLManagedTensor*> result;
  result.emplace_back(at::toDLPack(output));
  return result;
}

size_t execute_aten_operator_address() {
  return reinterpret_cast<size_t>(&ExecuteATenOperator);
}
"""

    aten_op_executor_cpp_extension = load_inline(
        name='inline_extension_aten_op_executor',
        cpp_sources=[aten_op_executor_cpp_source],
        extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
        functions=['execute_aten_operator_address'],
        verbose=verbosity,
        with_cuda=True)
    C.register_aten_op_executor(
        str(aten_op_executor_cpp_extension.execute_aten_operator_address()))
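# The executor above moves tensors across the Python/C++ boundary as DLPack
# capsules. The same round-trip can be demonstrated purely in Python with the
# public torch.utils.dlpack API (a sketch, not part of the original code):
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack

t = torch.arange(4.0)
capsule = to_dlpack(t)     # what a caller would hand in as a DLManagedTensor*
t2 = from_dlpack(capsule)  # what at::fromDLPack reconstructs on the C++ side
assert torch.equal(t, t2)  # same storage, zero-copy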
# NOTE: inspect the current parallel backend with
#   print(th.__config__.parallel_info())
# ref: https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py
flags = ['-DAT_PARALLEL_OPENMP', '-fopenmp']
# flags = ['-DAT_PARALLEL_NATIVE_TBB']
# flags = ['-DAT_PARALLEL_NATIVE']

__srcpath = os.path.join(os.path.dirname(__file__), '_srckernel.cc')
with open(__srcpath, 'rt') as f:
    srckernel = f.read()
SRC = load_inline('SRC', srckernel,
                  functions=[
                      'ShortRangeRankingCorrelation',
                      'BatchShortRangeRankingCorrelation',
                  ],
                  extra_cflags=flags + ['-O2'],
                  extra_ldflags=flags,
                  verbose=True)


def BatchNearsightRankCorr(X, y, r):
    X = X.cpu()
    y = y.cpu()
    r = r.cpu()
    scores = SRC.BatchShortRangeRankingCorrelation(X, y, r)
    return scores.cpu().numpy().astype(np.float64)


def NearsightRankCorr(x, y, r):
    x = x.cpu()
    y = y.cpu()
    r = r.cpu()
    import cupy
except BaseException as e:
    logging.info(f'spikingjelly.activation_based.spike_op: {e}')
    cupy = None

try:
    logging.warning(
        'spikingjelly.activation_based.spike_op: trying to use '
        '`torch.utils.cpp_extension.load_inline` to load cudnn functions.'
    )
    logging.warning(
        f'If it hangs, please try to delete the torch_extensions cache directory. '
        f'(In most cases, the directory is {torch.utils.cpp_extension._get_build_directory("", False)}.)'
    )
    cpp_wrapper = load_inline(
        name='cpp_wrapper',
        cpp_sources='using namespace at;',
        functions=[
            'cudnn_convolution_backward',
            'cudnn_convolution_backward_input',
            'cudnn_convolution_backward_weight'
        ],
        with_cuda=True)
except BaseException as e:
    logging.info(f'spikingjelly.activation_based.spike_op: {e}')
    cpp_wrapper = None

'''
aten/src/ATen/native/cudnn/ConvPlaceholders.cpp

at::Tensor cudnn_convolution(
    const at::Tensor& input, const at::Tensor& weight,
    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
    int64_t groups, bool benchmark, bool deterministic, bool allow_tf32)

There are two overloaded C++ functions named `cudnn_convolution`, so we need to
use an alternative syntax (an explicit function-pointer cast) to select one
overload.
#!/usr/bin/env python
# Verify that PyTorch can JIT compile C++ extensions.
# This requires at least Ninja and a working C++ compiler, preferably GCC.
#
# Heavily based on the PyTorch tutorial for C++ extensions.
# Author: Alexander Grund (TU Dresden)

from torch.utils.cpp_extension import load_inline

cpp_source = "torch::Tensor test_func(torch::Tensor x) { return x; }"
module = load_inline(name='inline_extension', cpp_sources=cpp_source, functions=['test_func'])
assert module
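# Optional follow-up smoke test (not part of the original script; tensor values
# are arbitrary): the identity function compiled above should hand back exactly
# the tensor it was given.
import torch

x = torch.ones(3)
assert torch.equal(module.test_func(x), x)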
#torch::Tensor myfunc(torch::Tensor z) {
#  auto s = torch::ones_like(z);
#  return s + z;
#}
#'''
#
#mod = load_inline('mymod', source, functions=['myfunc'], verbose=True)
#x = th.rand(10)
#print(x)
#print(mod.myfunc(x))

with open("_srckernel.cc", 'rt') as f:
    srckernel = f.read()
SRC = load_inline('SRC', srckernel, functions=['ShortRangeRankingCorrelation'], verbose=True)


def BatchShortRangeRankingCorrelation(X, y, r):
    scores = np.zeros(X.shape[0])
    for (i, srt) in enumerate(X):
        scores[i] = SRC.ShortRangeRankingCorrelation(srt, y, r)
    return scores


import time

import rich

c = rich.get_console()

for i in range(100):
    for cansee in (5, 50, 1000):
        for k in (5, 10, 25):
# load_inline will automatically search /usr/include, but not conda include.
extra_include_paths: List[str] = []
conda_prefix = os.getenv("CONDA_PREFIX")
if conda_prefix is not None:
    extra_include_paths = [os.path.join(conda_prefix, "include")]

bindings = load_inline(
    name="callgrind_bindings",
    cpp_sources=textwrap.dedent("""
    #include <valgrind/callgrind.h>

    bool _valgrind_supported_platform() {
    #if defined(NVALGRIND)
        return false;
    #else
        return true;
    #endif
    }

    void _valgrind_toggle() {
    #if defined(NVALGRIND)
        TORCH_CHECK(false, "Valgrind is not supported.");
    #else
        CALLGRIND_TOGGLE_COLLECT;
    #endif
    }
    """),
    extra_include_paths=extra_include_paths,
    functions=["_valgrind_supported_platform", "_valgrind_toggle"],
)
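# A usage sketch (not from the original): toggling only affects counts when the
# process runs under `valgrind --tool=callgrind --collect-atstart=no`; the
# workload line is a placeholder.
if bindings._valgrind_supported_platform():
    bindings._valgrind_toggle()  # begin collecting callgrind events
    pass                         # ... workload under measurement ...
    bindings._valgrind_toggle()  # stop collecting
else:
    print("callgrind bindings were built without Valgrind headers (NVALGRIND)")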