def cuda_helper(): helper_code = """ #include <dace/dace.h> extern "C" { int host_to_gpu(void* gpu, void* host, size_t size) { auto result = cudaMemcpy(gpu, host, size, cudaMemcpyHostToDevice); DACE_CUDA_CHECK(cudaGetLastError()); DACE_CUDA_CHECK(cudaDeviceSynchronize()); return result; } } """ program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") build_folder = dace.Config.get('default_build_folder') BUILD_PATH = os.path.join(build_folder, "cuda_helper") compiler.generate_program_folder(None, [program, dummy_cuda_target], BUILD_PATH) compiler.configure_and_compile(BUILD_PATH) checker_dll = compiled_sdfg.ReloadableDLL( compiler.get_binary_name(BUILD_PATH, "cuda_helper"), "cuda_helper") class CudaHelper: def __init__(self): self.dll = checker_dll checker_dll.load() self._host_to_gpu = checker_dll.get_symbol("host_to_gpu") self._host_to_gpu.restype = ctypes.c_int def __del__(self): self.dll.unload() def host_to_gpu(self, gpu_ptr: int, numpy_array: np.ndarray): size = ctypes.sizeof( dtypes._FFI_CTYPES[numpy_array.dtype.type]) * numpy_array.size result = ctypes.c_int( self._host_to_gpu( ctypes.c_void_p(gpu_ptr), ctypes.c_void_p( numpy_array.__array_interface__["data"][0]), ctypes.c_size_t(size))) if result.value != 0: raise ValueError("host_to_gpu returned nonzero result!") return CudaHelper()
def build_checker(): if hasattr(build_checker, "dll"): return build_checker.dll checker_code_path = os.path.join( os.path.dirname(inspect.getfile(daceml.onnx)), "include", "op_checker.h") with open(checker_code_path, "r") as f: checker_code = f.read() program = codeobject.CodeObject("onnx_op_checker", checker_code, "cpp", targets.cpu.CPUCodeGen, "ONNXOpChecker", environments={"ONNXRuntime"}) BUILD_PATH = os.path.join('.dacecache', "onnx_op_checker") compiler.generate_program_folder(None, [program], BUILD_PATH) compiler.configure_and_compile(BUILD_PATH) checker_dll = ctypes.CDLL( compiler.get_binary_name(BUILD_PATH, "onnx_op_checker")) build_checker.dll = checker_dll return checker_dll
def unparse_tasklet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, node: nodes.Node, function_stream: prettycode.CodeIOStream, callsite_stream: prettycode.CodeIOStream): # extract data state = sdfg.nodes()[state_id] tasklet = node # construct variables paths unique_name: str = "{}_{}_{}_{}".format(tasklet.name, sdfg.sdfg_id, sdfg.node_id(state), state.node_id(tasklet)) # Collect all of the input and output connectors into buses and scalars buses = {} scalars = {} for edge in state.in_edges(tasklet): arr = sdfg.arrays[edge.src.data] # catch symbolic (compile time variables) check_issymbolic([ tasklet.in_connectors[edge.dst_conn].veclen, tasklet.in_connectors[edge.dst_conn].bytes ], sdfg) # extract parameters vec_len = int( symbolic.evaluate(tasklet.in_connectors[edge.dst_conn].veclen, sdfg.constants)) total_size = int( symbolic.evaluate(tasklet.in_connectors[edge.dst_conn].bytes, sdfg.constants)) if isinstance(arr, data.Array): if self.hardware_target: raise NotImplementedError( 'Array input for hardware* not implemented') else: buses[edge.dst_conn] = (False, total_size, vec_len) elif isinstance(arr, data.Stream): buses[edge.dst_conn] = (False, total_size, vec_len) elif isinstance(arr, data.Scalar): scalars[edge.dst_conn] = (False, total_size * 8) for edge in state.out_edges(tasklet): arr = sdfg.arrays[edge.dst.data] # catch symbolic (compile time variables) check_issymbolic([ tasklet.out_connectors[edge.src_conn].veclen, tasklet.out_connectors[edge.src_conn].bytes ], sdfg) # extract parameters vec_len = int( symbolic.evaluate(tasklet.out_connectors[edge.src_conn].veclen, sdfg.constants)) total_size = int( symbolic.evaluate(tasklet.out_connectors[edge.src_conn].bytes, sdfg.constants)) if isinstance(arr, data.Array): if self.hardware_target: raise NotImplementedError( 'Array input for hardware* not implemented') else: buses[edge.src_conn] = (True, total_size, vec_len) elif isinstance(arr, data.Stream): buses[edge.src_conn] = (True, total_size, vec_len) elif isinstance(arr, data.Scalar): print('Scalar output not implemented') # generate system verilog module components parameter_string: str = self.generate_rtl_parameters(sdfg.constants) inputs, outputs = self.generate_rtl_inputs_outputs(buses, scalars) # create rtl code object (that is later written to file) self.code_objects.append( codeobject.CodeObject( name="{}".format(unique_name), code=RTLCodeGen.RTL_HEADER.format(name=unique_name, parameters=parameter_string, inputs="\n".join(inputs), outputs="\n".join(outputs)) + tasklet.code.code + RTLCodeGen.RTL_FOOTER, language="sv", target=RTLCodeGen, title="rtl", target_type="{}".format(unique_name), additional_compiler_kwargs="", linkable=True, environments=None)) if self.hardware_target: if self.vendor == 'xilinx': rtllib_config = { "name": unique_name, "buses": { name: ('m_axis' if is_output else 's_axis', vec_len) for name, (is_output, _, vec_len) in buses.items() }, "params": { "scalars": { name: total_size for name, (_, total_size) in scalars.items() }, "memory": {} }, "ip_cores": tasklet.ip_cores if isinstance( tasklet, nodes.RTLTasklet) else {}, } self.code_objects.append( codeobject.CodeObject(name=f"{unique_name}_control", code=rtllib_control(rtllib_config), language="v", target=RTLCodeGen, title="rtl", target_type="{}".format(unique_name), additional_compiler_kwargs="", linkable=True, environments=None)) self.code_objects.append( codeobject.CodeObject(name=f"{unique_name}_top", code=rtllib_top(rtllib_config), language="v", target=RTLCodeGen, title="rtl", target_type="{}".format(unique_name), additional_compiler_kwargs="", linkable=True, environments=None)) self.code_objects.append( codeobject.CodeObject(name=f"{unique_name}_package", code=rtllib_package(rtllib_config), language="tcl", target=RTLCodeGen, title="rtl", target_type="scripts", additional_compiler_kwargs="", linkable=True, environments=None)) self.code_objects.append( codeobject.CodeObject(name=f"{unique_name}_synth", code=rtllib_synth(rtllib_config), language="tcl", target=RTLCodeGen, title="rtl", target_type="scripts", additional_compiler_kwargs="", linkable=True, environments=None)) else: # self.vendor != "xilinx" raise NotImplementedError( 'Only RTL codegen for Xilinx is implemented') else: # not hardware_target # generate verilator simulation cpp code components inputs, outputs = self.generate_cpp_inputs_outputs(tasklet) valid_zeros, ready_zeros = self.generate_cpp_zero_inits(tasklet) vector_init = self.generate_cpp_vector_init(tasklet) num_elements = self.generate_cpp_num_elements(tasklet) internal_state_str, internal_state_var = self.generate_cpp_internal_state( tasklet) read_input_hs = self.generate_input_hs(tasklet) feed_elements = self.generate_feeding(tasklet, inputs) in_ptrs, out_ptrs = self.generate_ptrs(tasklet) export_elements = self.generate_exporting(tasklet, outputs) write_output_hs = self.generate_write_output_hs(tasklet) hs_flags = self.generate_hs_flags(tasklet) input_hs_toggle = self.generate_input_hs_toggle(tasklet) output_hs_toggle = self.generate_output_hs_toggle(tasklet) running_condition = self.generate_running_condition(tasklet) # add header code to stream if not self.cpp_general_header_added: sdfg.append_global_code( cpp_code=RTLCodeGen.CPP_GENERAL_HEADER_TEMPLATE.format( debug_include="// generic includes\n#include <iostream>" if self.verilator_debug else "")) self.cpp_general_header_added = True sdfg.append_global_code( cpp_code=RTLCodeGen.CPP_MODEL_HEADER_TEMPLATE.format( name=unique_name)) # add main cpp code to stream callsite_stream.write(contents=RTLCodeGen.CPP_MAIN_TEMPLATE.format( name=unique_name, inputs=inputs, outputs=outputs, num_elements=str.join('\n', num_elements), vector_init=vector_init, valid_zeros=str.join('\n', valid_zeros), ready_zeros=str.join('\n', ready_zeros), read_input_hs=str.join('\n', read_input_hs), feed_elements=str.join('\n', feed_elements), in_ptrs=str.join('\n', in_ptrs), out_ptrs=str.join('\n', out_ptrs), export_elements=str.join('\n', export_elements), write_output_hs=str.join('\n', write_output_hs), hs_flags=str.join('\n', hs_flags), input_hs_toggle=str.join('\n', input_hs_toggle), output_hs_toggle=str.join('\n', output_hs_toggle), running_condition=str.join(' && ', running_condition), internal_state_str=internal_state_str, internal_state_var=internal_state_var, debug_sim_start="std::cout << \"SIM {name} START\" << std::endl;" if self.verilator_debug else "", debug_internal_state=""" // report internal state VL_PRINTF("[t=%lu] ap_aclk=%u ap_areset=%u valid_i=%u ready_i=%u valid_o=%u ready_o=%u \\n", main_time, model->ap_aclk, model->ap_areset, model->valid_i, model->ready_i, model->valid_o, model->ready_o); VL_PRINTF("{internal_state_str}\\n", {internal_state_var}); std::cout << std::flush; """.format(internal_state_str=internal_state_str, internal_state_var=internal_state_var) if self.verilator_debug else "", debug_sim_end="std::cout << \"SIM {name} END\" << std::endl;" if self.verilator_debug else ""), sdfg=sdfg, state_id=state_id, node_id=node)
def unparse_tasklet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, node: nodes.Node, function_stream: prettycode.CodeIOStream, callsite_stream: prettycode.CodeIOStream): # extract data state = sdfg.nodes()[state_id] tasklet = node # construct variables paths unique_name: str = "top_{}_{}_{}".format(sdfg.sdfg_id, sdfg.node_id(state), state.node_id(tasklet)) # generate system verilog module components parameter_string: str = self.generate_rtl_parameters(sdfg.constants) inputs, outputs = self.generate_rtl_inputs_outputs(sdfg, tasklet) # create rtl code object (that is later written to file) self.code_objects.append( codeobject.CodeObject( name="{}".format(unique_name), code=RTLCodeGen.RTL_HEADER.format(name=unique_name, parameters=parameter_string, inputs="\n".join(inputs), outputs="\n".join(outputs)) + tasklet.code.code + RTLCodeGen.RTL_FOOTER, language="sv", target=RTLCodeGen, title="rtl", target_type="", additional_compiler_kwargs="", linkable=True, environments=None)) # generate verilator simulation cpp code components inputs, outputs = self.generate_cpp_inputs_outputs(tasklet) vector_init = self.generate_cpp_vector_init(tasklet) num_elements = self.generate_cpp_num_elements() internal_state_str, internal_state_var = self.generate_cpp_internal_state( tasklet) # add header code to stream if not self.cpp_general_header_added: sdfg.append_global_code( cpp_code=RTLCodeGen.CPP_GENERAL_HEADER_TEMPLATE.format( debug_include="// generic includes\n#include <iostream>" if self.verilator_debug else "")) self.cpp_general_header_added = True sdfg.append_global_code( cpp_code=RTLCodeGen.CPP_MODEL_HEADER_TEMPLATE.format( name=unique_name)) # add main cpp code to stream callsite_stream.write(contents=RTLCodeGen.CPP_MAIN_TEMPLATE.format( name=unique_name, inputs=inputs, outputs=outputs, num_elements=num_elements, vector_init=vector_init, internal_state_str=internal_state_str, internal_state_var=internal_state_var, debug_sim_start="std::cout << \"SIM {name} START\" << std::endl;" if self.verilator_debug else "", debug_feed_element="std::cout << \"feed new element\" << std::endl;" if self.verilator_debug else "", debug_export_element="std::cout << \"export element\" << std::endl;" if self.verilator_debug else "", debug_internal_state=""" // report internal state VL_PRINTF("[t=%lu] clk_i=%u rst_i=%u valid_i=%u ready_i=%u valid_o=%u ready_o=%u \\n", main_time, model->clk_i, model->rst_i, model->valid_i, model->ready_i, model->valid_o, model->ready_o); VL_PRINTF("{internal_state_str}\\n", {internal_state_var}); std::cout << std::flush; """.format(internal_state_str=internal_state_str, internal_state_var=internal_state_var) if self.verilator_debug else "", debug_read_input_hs= "std::cout << \"remove read_input_hs flag\" << std::endl;" if self.verilator_debug else "", debug_output_hs= "std::cout << \"remove write_output_hs flag\" << std::endl;" if self.verilator_debug else "", debug_sim_end="std::cout << \"SIM {name} END\" << std::endl;" if self.verilator_debug else ""), sdfg=sdfg, state_id=state_id, node_id=node)