def test_scalar_output_ptr_access(): sdfg = dace.SDFG("scalptrtest") state = sdfg.add_state() sdfg.add_scalar("scal", dace.float64, transient=True, storage=dace.dtypes.StorageType.GPU_Global) sdfg.add_array("__return", [1], dace.float64) tasklet = state.add_tasklet( "write", {}, {"outp": dace.pointer(dace.float64)}, """ double a = 5; cudaMemcpyAsync(outp, &a, 1 * sizeof(double), cudaMemcpyHostToDevice, __state->gpu_context->streams[0]); """, language=dace.dtypes.Language.CPP, ) access_scal = state.add_access("scal") write_unsqueezed = state.add_write("__return") state.add_edge(tasklet, "outp", access_scal, None, sdfg.make_array_memlet("scal")) state.add_edge(access_scal, None, write_unsqueezed, None, sdfg.make_array_memlet("scal")) ret = sdfg() assert np.allclose(ret, 5)
def _JoinedStr(self, t, infer_type=False): # JoinedStr(expr* values) self.write("f'''", infer_type) for value in t.values: if isinstance(value, ast.Str): self.write(value.s, infer_type) else: self.dispatch(value, infer_type) self.write("'''", infer_type) return dace.pointer(dace.int8) if infer_type else None
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() localcode = CodeIOStream() # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) # Obtain some SDFG-related information input_memlet = input_edge.data output_memlet = output_edge.data if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Try to obtain the number of threads in the block, or use the default # configuration block_threads = devicelevel_block_size(sdfg, state, node) if block_threads is not None: block_threads = functools.reduce(lambda a, b: a * b, block_threads, 1) # Checks if block_threads is None: raise ValueError('Block-wide GPU reduction must occur within' ' a GPU kernel') if issymbolic(block_threads, sdfg.constants): raise ValueError('Block size has to be constant for block-wide ' 'reduction (got %s)' % str(block_threads)) if (node.axes is not None and len(node.axes) < input_dims): raise ValueError( 'Only full reduction is supported for block-wide reduce,' ' please use the pure expansion') if (input_data.storage != dtypes.StorageType.Register or output_data.storage != dtypes.StorageType.Register): raise ValueError( 'Block-wise reduction only supports GPU register inputs ' 'and outputs') if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES: raise ValueError('%s block reduction not supported' % redtype) credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] if redtype == dtypes.ReductionType.Custom: redop = '__reduce_%s()' % idstr else: redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type) # Allocate shared memory for block reduce localcode.write(""" typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id}; __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id}; """.format(id=idstr, type=output_data.dtype.ctype, numthreads=block_threads)) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = cpp_array_expr(sdfg, output_memlet) localcode.write(""" {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop}); """.format(id=idstr, redop=redop, input=input_memlet.data, output=output)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) output_dims = len(output_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() cuda_initcode = CodeIOStream() cuda_exitcode = CodeIOStream() host_globalcode = CodeIOStream() host_localcode = CodeIOStream() output_memlet = output_edge.data # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Obtain some SDFG-related information input_memlet = input_edge.data reduce_shape = input_memlet.subset.bounding_box_size() num_items = ' * '.join(symstr(s) for s in reduce_shape) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = (output_memlet.data + ' + ' + cpp_array_expr(sdfg, output_memlet, with_brackets=False)) input_dims = input_memlet.subset.dims() output_dims = output_memlet.subset.data_dims() reduce_all_axes = (node.axes is None or len(node.axes) == input_dims) if reduce_all_axes: reduce_last_axes = False else: reduce_last_axes = sorted(node.axes) == list( range(input_dims - len(node.axes), input_dims)) if (not reduce_all_axes) and (not reduce_last_axes): raise NotImplementedError( 'Multiple axis reductions not supported on GPUs. Please use ' 'the pure expansion or make reduce axes the last in the array.' ) # Verify that data is on the GPU if input_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Input of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') if output_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Output of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') # Determine reduction type kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype] if redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES else 'Reduce') # Create temp memory for this GPU cuda_globalcode.write( """ void *__cub_storage_{sdfg}_{state}_{node} = NULL; size_t __cub_ssize_{sdfg}_{state}_{node} = 0; """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) if reduce_all_axes: reduce_type = 'DeviceReduce' reduce_range = num_items reduce_range_def = 'size_t num_items' reduce_range_use = 'num_items' reduce_range_call = num_items elif reduce_last_axes: num_reduce_axes = len(node.axes) not_reduce_axes = reduce_shape[:-num_reduce_axes] reduce_axes = reduce_shape[-num_reduce_axes:] num_segments = ' * '.join([symstr(s) for s in not_reduce_axes]) segment_size = ' * '.join([symstr(s) for s in reduce_axes]) reduce_type = 'DeviceSegmentedReduce' iterator = 'dace::stridedIterator({size})'.format( size=segment_size) reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments, it=iterator) reduce_range_def = 'size_t num_segments, size_t segment_size' iterator_use = 'dace::stridedIterator(segment_size)' reduce_range_use = 'num_segments, {it}, {it} + 1'.format( it=iterator_use) reduce_range_call = '%s, %s' % (num_segments, segment_size) # Call CUB to get the storage size, allocate and free it cuda_initcode.write( """ cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node}, ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop}); cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node}); """.format(sdfg=sdfg.name, state=state_id, node=node_id, reduce_type=reduce_type, reduce_range=reduce_range, redop=reduce_op, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, kname=kname), sdfg, state_id, node) cuda_exitcode.write( 'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format( sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) # Write reduction function definition cuda_globalcode.write(""" DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream) {{ cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id}, input, output, {reduce_range_use}{redop}, stream); }} """.format(id=idstr, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, reduce_type=reduce_type, reduce_range_def=reduce_range_def, reduce_range_use=reduce_range_use, kname=kname, redop=reduce_op)) # Write reduction function definition in caller file host_globalcode.write( """ DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); """.format(id=idstr, reduce_range_def=reduce_range_def, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype), sdfg, state_id, node) # Call reduction function where necessary host_localcode.write( '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);' .format(id=idstr, input=input, output=output, reduce_range_call=reduce_range_call)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, host_localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(host_globalcode.getvalue()) sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda') sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) inedge: graph.MultiConnectorEdge = state.in_edges(node)[0] outedge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(inedge.data.subset) output_dims = len(outedge.data.subset) input_data = sdfg.arrays[inedge.data.data] output_data = sdfg.arrays[outedge.data.data] # Get reduction type for OpenMP redtype = detect_reduction_type(node.wcr, openmp=True) if redtype not in ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP: raise ValueError('Reduction type not supported for "%s"' % node.wcr) omptype, expr = ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP[redtype] # Standardize axes axes = node.axes if node.axes else [i for i in range(input_dims)] outer_loops = len(axes) != input_dims # Create OpenMP clause if outer_loops: code = '#pragma omp parallel for collapse({cdim})\n'.format( cdim=output_dims) else: code = '' from dace.codegen.targets.cpp import sym2cpp # Output loops out_offset = [] if outer_loops: for i, sz in enumerate(outedge.data.subset.size()): code += 'for (int _o{i} = 0; _o{i} < {sz}; ++_o{i}) {{\n'.format( i=i, sz=sym2cpp(sz)) out_offset.append('_o%d * %s' % (i, sym2cpp(output_data.strides[i]))) else: out_offset.append('0') outexpr = '_out[%s]' % ' + '.join(out_offset) # Write identity value first if node.identity is not None: code += '%s = %s;\n' % (outexpr, node.identity) # Reduction OpenMP clause code += '#pragma omp parallel for collapse({cdim}) ' \ 'reduction({rtype}: {oexpr})\n'.format(cdim=len(axes), rtype=omptype, oexpr=outexpr) # Reduction loops for i, axis in enumerate(sorted(axes)): sz = sym2cpp(inedge.data.subset.size()[axis]) code += 'for (int _i{i} = 0; _i{i} < {sz}; ++_i{i}) {{\n'.format( i=i, sz=sz) # Prepare input offset expression in_offset = [] ictr, octr = 0, 0 for i in range(input_dims): if i in axes: result = '_i%d' % ictr ictr += 1 else: result = '_o%d' % octr octr += 1 in_offset.append('%s * %s' % (result, sym2cpp(input_data.strides[i]))) in_offset = ' + '.join(in_offset) # Reduction expression code += expr.format(i='_in[%s]' % in_offset, o=outexpr) code += '\n' # Closing braces code += '}\n' * len(axes) if outer_loops: code += '}\n' * output_dims # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, code, language=dace.Language.CPP) # Rename outer connectors and add to node inedge._dst_conn = '_in' outedge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def _Str(self, tree, infer_type=False): result = tree.s self._write_constant(result, infer_type) return dace.pointer(dace.int8) if infer_type else None
def keyword_none(A: dace.float32[N], B: dace.float32[N], C: dace.pointer(dace.int32)): if C is None: B[:] = A[:]
import ctypes import dace import numpy as np csrmatrix = dace.struct( 'csr', # CSR Matrix definition type rows=dace.int32, cols=dace.int32, nnz=dace.int32, data=(dace.pointer(dace.float32), 'nnz'), rowsp1=dace.int32, rowptr=(dace.pointer(dace.int32), 'rowsp1'), colind=(dace.pointer(dace.int32), 'nnz')) sdfg = dace.SDFG('addone') state = sdfg.add_state() sdfg.add_array('sparsemats_in', [5], dtype=csrmatrix) sdfg.add_array('sparsemats_out', [5], dtype=csrmatrix) ome, omx = state.add_map('matrices', dict(i='0:5')) tasklet = state.add_tasklet('addone', {'mat_in'}, {'mat_out'}, ''' for (int j = 0; j < mat_in.nnz; ++j) { mat_out.data[j] = mat_in.data[j] + 1.0f; } ''', language=dace.Language.CPP) matr = state.add_read('sparsemats_in') matw = state.add_write('sparsemats_out') state.add_memlet_path(matr, ome,
def emit_setup_code_for_ortvalue(node: nd.CodeNode, parameter_name: str, edge_connector_name: str, desc: dt.Data, required_copy: Optional[dtypes.StorageType], is_input: bool, ort_value_name: str, connector_dict: dict) -> str: """ Emit the code that creates the OrtValue for a parameter. Also set the connector types on the parent node. :param node: the parent node that we are expanding :param parameter_name: the onnx name of the parameter. :param edge_connector_name: the name of the edge connector to the tasklet. :param desc: the dace input descriptor connected to this parameter. :param required_copy: the ``StorageType`` to copy to for this parameter, if a copy is required. :param is_input: whether the parameter is an input. :param ort_value_name: the name for the ort_value. :param connector_dict: either the input connector or output connector dict for the expanded node, depending on whether this is an input or an output. :return: the code that creates the OrtValue for ``parameter_name``. """ parent_connector_dict = node.in_connectors if is_input else node.out_connectors input_output_string = "input" if is_input else "output" code = "" if required_copy is not None: storage = required_copy else: storage = desc.storage if storage in [dtypes.StorageType.Default, dtypes.StorageType.CPU_Heap]: mem_info = "__state->ort_cpu_mem_info" elif storage is dtypes.StorageType.GPU_Global: mem_info = "__state->ort_cuda_mem_info" elif storage is dtypes.StorageType.CPU_Pinned: mem_info = "__state->ort_cuda_pinned_mem_info" else: raise ValueError( "Unsupported storage type {} for input to ONNX node".format( desc.storage)) if isinstance(desc, dt.Scalar): on_gpu = storage is dtypes.StorageType.GPU_Global code += """ OrtValue* {ort_value_name}; __ort_check_status(__state->ort_api, __state->ort_api->CreateTensorWithDataAsOrtValue( {mem_info}, {maybe_ref}{edge_connector_name}, {data_size} * sizeof({ctype}), nullptr, 0, ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str}, &{ort_value_name} )); """.format(mem_info=mem_info, edge_connector_name=edge_connector_name, data_size=reduce(lambda x, y: x * y, desc.shape), ctype=desc.dtype.ctype, type_str=typeclass_to_onnx_str(desc.dtype).upper(), ort_value_name=ort_value_name, maybe_ref="" if on_gpu else "&") if on_gpu: connector_dict[edge_connector_name] = dace.pointer(desc.dtype) parent_connector_dict[parameter_name] = dace.pointer(desc.dtype) else: connector_dict[edge_connector_name] = desc.dtype parent_connector_dict[parameter_name] = desc.dtype elif isinstance(desc, dt.Array): # setup dims array code += """ int64_t {input_output_string}_{parameter_name}_dims[{dims_size}] = {{{dims}}}; """.format(input_output_string=input_output_string, parameter_name=parameter_name, dims_size=len(desc.shape), dims=", ".join(str(s) for s in desc.shape)) data = "const_cast < void * > (reinterpret_cast < const void * > ({}))".format( edge_connector_name) code += """ OrtValue* {ort_value_name}; __ort_check_status(__state->ort_api, __state->ort_api->CreateTensorWithDataAsOrtValue( {mem_info}, {data}, {data_size} * sizeof({ctype}), {input_output_string}_{parameter_name}_dims, {dims_size}, ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str}, &{ort_value_name} )); """.format(input_output_string=input_output_string, data=data, mem_info=mem_info, parameter_name=parameter_name, data_size=reduce(lambda x, y: x * y, desc.shape), ctype=desc.dtype.ctype, dims_size=len(desc.shape), type_str=typeclass_to_onnx_str(desc.dtype).upper(), ort_value_name=ort_value_name) connector_dict[edge_connector_name] = dace.pointer(desc.dtype) parent_connector_dict[parameter_name] = dace.pointer(desc.dtype) else: raise NotImplementedError( "Data-descriptor type {} not supported for ONNX nodes".format( type(desc))) return code
import ctypes import dace import numpy as np csrmatrix = dace.struct( 'csr', # CSR Matrix definition type rows=dace.int32, cols=dace.int32, nnz=dace.int32, data=(dace.pointer(dace.float32), 'nnz'), rowsp1=dace.int32, rowptr=(dace.pointer(dace.int32), 'rowsp1'), colind=(dace.pointer(dace.int32), 'nnz')) sdfg = dace.SDFG('addone') state = sdfg.add_state() sdfg.add_array('sparsemats_in', [5], dtype=csrmatrix) sdfg.add_array('sparsemats_out', [5], dtype=csrmatrix) ome, omx = state.add_map('matrices', dict(i='0:5')) tasklet = state.add_tasklet('addone', {'mat_in'}, {'mat_out': dace.pointer(csrmatrix)}, ''' for (int j = 0; j < mat_in.nnz; ++j) { mat_out->data[j] = mat_in.data[j] + 1.0f; } ''', language=dace.Language.CPP) matr = state.add_read('sparsemats_in') matw = state.add_write('sparsemats_out') state.add_memlet_path(matr,
def expansion(node, state: SDFGState, sdfg: SDFG): # Extract input and output array views (as generated by memlets) inputs, outputs = _get_inputs_and_outputs(sdfg, state, node) unique_id = "{}_{}_{}_{}".format(clean_onnx_name(node.name), sdfg.sdfg_id, sdfg.node_id(state), state.node_id(node)) _add_ort_init_code(sdfg) sdfg.append_global_code( "OrtExecutableKernel *__ort_kernel_{};\n".format(unique_id)) sdfg.append_global_code( "OrtExecutableKernelContext *__ort_context_{};\n".format( unique_id)) sdfg.append_init_code(""" {{ // Setup for {name} __ort_check_status(__ort_api->CreateExecutableKernelContext("{name}", "{op_type}", &__ort_context_{name})); """.format(name=unique_id, op_type=node.schema.name)) # check if ORT supports CUDA for this node ########################################## # Default: all parameters are on CPU if we execute using cpu outputs_on_host = [True for _ in range(len(outputs))] inputs_on_host = [True for _ in range(len(inputs))] actual_node_schedule = node.schedule if node.schedule == ScheduleType.CPU_Multicore or node.schedule == ScheduleType.Default: provider_index = 0 elif node.schedule == ScheduleType.GPU_Device: provider_index = 1 try: # the ith position indicates whether the ith output is in host memory inputs_on_host, outputs_on_host = check_op(sdfg, state, node, cuda=True) except ONNXOpValidationError as e: # fallback to CPU print("Falling back to CPU for node {}. Reason:\n{}".format( node.name, str(e))) provider_index = 0 actual_node_schedule = ScheduleType.Default else: raise NotImplementedError( "ORT expansion for schedule '{}' is not implemented".format( node.schedule)) # check if we need to insert device copies ########################################## # maps the connectors for which a copy will be required to the storage type required to be connected to the tasklet input_copy_required = defaultdict(dict) output_copy_required = defaultdict(dict) assert len( node.iter_outputs_in_onnx_order(state)) == len(outputs_on_host) assert len( node.iter_inputs_in_onnx_order(state)) == len(inputs_on_host) # check outputs for edge, output_on_host in zip(node.iter_outputs_in_onnx_order(state), outputs_on_host): # get the memlet for this output array = sdfg.arrays[edge.data.data] if output_on_host: is_device_mismatch = not can_access(ScheduleType.Default, array.storage) else: is_device_mismatch = not can_access(ScheduleType.GPU_Device, array.storage) if isinstance( array, dt.Scalar ) and actual_node_schedule == ScheduleType.GPU_Device: # ORT kernels expect scalars to be cudaMalloced. We will copy during expansion to enforce this is_device_mismatch = True output_copy_required[edge.src_conn]['copy_to_array'] = True if is_device_mismatch: # we need to insert a copy output_copy_required[edge.src_conn][ 'storage'] = StorageType.Default if output_on_host else StorageType.GPU_Global # check inputs (same thing again) for edge, input_on_host in zip(node.iter_inputs_in_onnx_order(state), inputs_on_host): array = sdfg.arrays[edge.data.data] if input_on_host: is_device_mismatch = not can_access(ScheduleType.Default, array.storage) else: is_device_mismatch = not can_access(ScheduleType.GPU_Device, array.storage) if isinstance( array, dt.Scalar ) and actual_node_schedule == ScheduleType.GPU_Device: # ORT kernels expect scalars to be cudaMalloced. We will copy during expansion to enforce this is_device_mismatch = True input_copy_required[edge.dst_conn]['copy_to_array'] = True if is_device_mismatch: # we need to insert a copy input_copy_required[edge.dst_conn][ 'storage'] = StorageType.Default if input_on_host else StorageType.GPU_Global # begin codegen ########################################## tasklet_setup_code = "" tasklet_code = "" tasklet_cleanup_code = "" reversed_onnx_dtype_map = { v: k for k, v in ONNX_DTYPES_TO_DACE_TYPE_CLASS.items() } # emit code for inputs and outputs ########################################## in_connectors = {} out_connectors = {} for edge, is_input in node.iter_edges(state): parameter_name = edge.dst_conn if is_input else edge.src_conn if len(output_copy_required) != 0 or len(input_copy_required) != 0: edge_connector_name = "_conn_" + parameter_name else: edge_connector_name = parameter_name input_output_string = "input" if is_input else "output" connector_dict = in_connectors if is_input else out_connectors memlet = edge.data desc = sdfg.arrays[memlet.data] sdfg.append_init_code(""" // Add parameter {parameter_name} __ort_check_status(__ort_api->ExecutableKernelContext_Add{input_output_string}(__ort_context_{id}, ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_string})); """.format(id=unique_id, type_string=reversed_onnx_dtype_map[desc.dtype].upper(), parameter_name=parameter_name, input_output_string=input_output_string.capitalize())) ort_value_name = "ort_value_{input_output_string}_{parameter_name}".format( input_output_string=input_output_string, parameter_name=parameter_name) copy_to_array = ( (parameter_name in output_copy_required and 'copy_to_array' in output_copy_required[parameter_name]) or (parameter_name in input_copy_required and 'copy_to_array' in input_copy_required[parameter_name])) if desc.storage == StorageType.Default: mem_info = "__ort_cpu_mem_info" elif desc.storage == StorageType.GPU_Global: mem_info = "__ort_cuda_mem_info" elif desc.storage == StorageType.CPU_Pinned: mem_info = "__ort_cuda_pinned_mem_info" else: raise ValueError( "Unsupported storage type {} for input to ONNX node". format(desc.storage)) if (isinstance(desc, dt.Scalar) and # when copying to array, the ort value is not a scalar but an array not copy_to_array): tasklet_setup_code += """ OrtValue* {ort_value_name}; __ort_check_status(__ort_api->CreateTensorWithDataAsOrtValue( {mem_info}, &{edge_connector_name}, {data_size} * sizeof({ctype}), nullptr, 0, ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str}, &{ort_value_name} )); """.format( input_output_string=input_output_string, mem_info=mem_info, edge_connector_name=edge_connector_name, data_size=reduce(lambda x, y: x * y, desc.shape), ctype=desc.dtype.ctype, type_str=reversed_onnx_dtype_map[desc.dtype].upper(), ort_value_name=ort_value_name) connector_dict[parameter_name] = None elif isinstance(desc, dt.Array) or copy_to_array: # when we copy a scalar to an array, that scalar ofc has shape [] dims = [] if copy_to_array else desc.shape # setup dims array tasklet_setup_code += """ int64_t {input_output_string}_{parameter_name}_dims[{dims_size}] = {{{dims}}}; """.format(input_output_string=input_output_string, parameter_name=parameter_name, dims_size=len(dims), dims=", ".join(str(s) for s in dims)) connector_dict[parameter_name] = dace.pointer(desc.dtype) data = "const_cast < void * > (reinterpret_cast < const void * > ({}))".format( edge_connector_name) tasklet_setup_code += """ OrtValue* {ort_value_name}; __ort_check_status(__ort_api->CreateTensorWithDataAsOrtValue( {mem_info}, {data}, {data_size} * sizeof({ctype}), {input_output_string}_{parameter_name}_dims, {dims_size}, ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str}, &{ort_value_name} )); """.format( input_output_string=input_output_string, data=data, mem_info=mem_info, parameter_name=parameter_name, data_size=reduce(lambda x, y: x * y, desc.shape), ctype=desc.dtype.ctype, dims_size=len(dims), type_str=reversed_onnx_dtype_map[desc.dtype].upper(), ort_value_name=ort_value_name) else: raise NotImplementedError( "Data-descriptor type {} not supported for ONNX nodes". format(type(desc))) tasklet_code += "__ort_check_status(__ort_api->ExecutableKernel_Set{input_output_string_capital}(" \ "__ort_kernel_{unique_id}, {position}, {ort_value_name}));\n".format( input_output_string_capital=input_output_string. capitalize(), ort_value_name=ort_value_name, unique_id=unique_id, position=get_position(node.schema, is_input, parameter_name)) tasklet_cleanup_code += "__ort_api->ReleaseValue(ort_value_{input_output_string}_{parameter_name});\n".format( input_output_string=input_output_string, parameter_name=parameter_name) sdfg.append_init_code("// Setup attributes\n") for name, attr in node.schema.attributes.items(): if hasattr(node, name): sdfg.append_init_code( _gen_attr_init_code("__ort_context_{}".format(unique_id), node.schema.attributes[name], getattr(node, name))) sdfg.prepend_exit_code( "__ort_api->ReleaseExecutableKernelContext(__ort_context_{});\n". format(unique_id)) sdfg.prepend_exit_code( "__ort_api->ReleaseExecutableKernel(__ort_kernel_{});\n".format( unique_id)) tasklet_code += 'fprintf(stderr, "Launching {}\\n");\n'.format( unique_id) tasklet_code += "__ort_check_status(__ort_api->ExecutableKernel_Compute(__ort_kernel_{}));\n".format( unique_id) sdfg.append_init_code( "__ort_check_status(__ort_api->CreateExecutableKernel(" "__ort_session, __ort_context_{id}, /*provider_index=*/{provider_index}, &__ort_kernel_{id}));\n" .format(provider_index=provider_index, id=unique_id)) sdfg.append_init_code( "}} // end setup for context_{}".format(unique_id)) tasklet_code = tasklet_setup_code + tasklet_code + tasklet_cleanup_code tasklet = nd.Tasklet('onnx_code', in_connectors, out_connectors, tasklet_code, language=dace.dtypes.Language.CPP) tasklet.environments = {"ONNXRuntime"} if len(output_copy_required) != 0 or len(input_copy_required) != 0: nsdfg = dace.SDFG("nested_{}".format(unique_id)) nstate = nsdfg.add_state() ntasklet = deepcopy(tasklet) # add a prefix to connectors to prevent shadowing of array names ntasklet.in_connectors = { "_conn_" + k: v for k, v in tasklet.in_connectors.items() } ntasklet.out_connectors = { "_conn_" + k: v for k, v in tasklet.out_connectors.items() } nstate.add_node(ntasklet) for edge, is_input in node.iter_edges(state): parameter_name = edge.dst_conn if is_input else edge.src_conn memlet = edge.data desc = sdfg.arrays[memlet.data] # add the original array original_desc = deepcopy(desc) original_desc.transient = False nsdfg.add_datadesc(parameter_name, original_desc) if not (isinstance(desc, dt.Array) or isinstance(desc, dt.Scalar)): raise ValueError( "Unsupported data type {} connected to an ONNX tasklet" .format(type(desc))) if parameter_name not in (input_copy_required if is_input else output_copy_required): if is_input: access = nstate.add_read(parameter_name) nstate.add_edge(access, None, ntasklet, "_conn_" + parameter_name, nsdfg.get_array_memlet(parameter_name)) else: access = nstate.add_write(parameter_name) nstate.add_edge(ntasklet, "_conn_" + parameter_name, access, None, nsdfg.get_array_memlet(parameter_name)) continue copy_options = input_copy_required[ parameter_name] if is_input else output_copy_required[ parameter_name] # add the copy of the descriptor if 'copy_to_array' in copy_options: copy_desc = dt.Array(shape=[1], dtype=desc.dtype) else: copy_desc = deepcopy(desc) copy_desc.transient = True copy_desc.storage = copy_options['storage'] nsdfg.add_datadesc("copy_" + memlet.data, copy_desc) nmemlet = deepcopy(memlet) nmemlet.data = "copy_" + nmemlet.data if is_input: access = nstate.add_read(parameter_name) access_copy = nstate.add_access("copy_" + memlet.data) nstate.add_edge( access, None, access_copy, None, nsdfg.get_array_memlet("copy_" + memlet.data)) nstate.add_edge(access_copy, None, ntasklet, "_conn_" + parameter_name, nmemlet) else: access = nstate.add_write(parameter_name) access_copy = nstate.add_access("copy_" + memlet.data) nstate.add_edge(ntasklet, "_conn_" + parameter_name, access_copy, None, nmemlet) nstate.add_edge( access_copy, None, access, None, nsdfg.get_array_memlet("copy_" + memlet.data)) return nsdfg else: return tasklet