def check_required_copies(
        node: nd.Node, state: SDFGState, sdfg: SDFG,
        outputs_on_host: List[bool], inputs_on_host: List[bool]
) -> Tuple[Dict[str, dtypes.StorageType], Dict[str, dtypes.StorageType]]:
    """ Check whether copies are required for all parameters.

        :param node: the node.
        :param state: the state.
        :param sdfg: the sdfg.
        :param outputs_on_host: boolean list, where the ith bool indicates whether the ith output should be on host.
        :param inputs_on_host: boolean list, where the ith bool indicates whether the ith input should be on host.
        :return: two dicts containing the storage type for each connector that requires a copy.
                 The first dict is for the inputs, the second is for the outputs.
    """
    # maps each connector for which a copy will be required to the storage type
    # required to be connected to the tasklet
    input_copy_required: Dict[str, dtypes.StorageType] = {}
    output_copy_required: Dict[str, dtypes.StorageType] = {}

    assert len(node.iter_outputs_in_onnx_order(state)) == len(outputs_on_host)
    assert len(node.iter_inputs_in_onnx_order(state)) == len(inputs_on_host)

    # check outputs
    for edge, output_on_host in zip(node.iter_outputs_in_onnx_order(state),
                                    outputs_on_host):
        # get the array that this output's memlet refers to
        array = sdfg.arrays[edge.data.data]
        if output_on_host:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.Default, array.storage)
        else:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.GPU_Device, array.storage)

        if is_device_mismatch:
            # we need to insert a copy
            storage = dtypes.StorageType.CPU_Heap if output_on_host else dtypes.StorageType.GPU_Global
            output_copy_required[edge.src_conn] = storage

    # check inputs (same procedure as outputs)
    for edge, input_on_host in zip(node.iter_inputs_in_onnx_order(state),
                                   inputs_on_host):
        array = sdfg.arrays[edge.data.data]
        if input_on_host:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.Default, array.storage)
        else:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.GPU_Device, array.storage)

        if is_device_mismatch:
            # we need to insert a copy
            storage = dtypes.StorageType.CPU_Heap if input_on_host else dtypes.StorageType.GPU_Global
            input_copy_required[edge.dst_conn] = storage

    return input_copy_required, output_copy_required
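# Illustrative sketch (not part of the module): one way to consume the dicts
# returned by check_required_copies is to stage each flagged input through a
# transient with the required storage. The helper name below is hypothetical;
# it assumes the module-level imports used elsewhere in this file (copy, dtypes).
def _stage_flagged_inputs_sketch(
        sdfg: SDFG, state: SDFGState, node: nd.Node,
        input_copy_required: Dict[str, dtypes.StorageType]):
    for conn, storage in input_copy_required.items():
        edge = next(e for e in state.in_edges(node) if e.dst_conn == conn)
        copy_desc = copy.deepcopy(sdfg.arrays[edge.data.data])
        copy_desc.transient = True
        copy_desc.storage = storage
        copy_name = sdfg.add_datadesc("copy_" + edge.data.data,
                                      copy_desc,
                                      find_new_name=True)
        access_copy = state.add_access(copy_name)
        # reroute src -> node as src -> copy -> node
        state.add_edge(edge.src, edge.src_conn, access_copy, None,
                       sdfg.make_array_memlet(edge.data.data))
        state.add_edge(access_copy, None, node, conn,
                       sdfg.make_array_memlet(copy_name))
        state.remove_edge(edge)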
def check_access(schedule: dtypes.ScheduleType, *descs: data.Data):
    """ Raise an error if the schedule cannot access all of the passed descriptors.

        :param schedule: the schedule.
        :param descs: the descriptors to check.
    """
    for desc in descs:
        if not dtypes.can_access(schedule, desc.storage):
            raise ValueError(
                f"Schedule mismatch: {schedule} cannot access {desc.storage}")
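# Minimal usage sketch: before expanding a GPU-scheduled node, assert that its
# operands are GPU-accessible. The array names "X" and "Y" are hypothetical.
def _example_check_gpu_access(sdfg: SDFG):
    # raises ValueError if either descriptor lives in host-only storage
    check_access(dtypes.ScheduleType.GPU_Device, sdfg.arrays["X"],
                 sdfg.arrays["Y"])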
def create_output_array(
        inferred_symbols: Dict[str, int],
        desc: dt.Data,
        use_torch=False,
        zeros: bool = False) -> Union[np.ndarray, torch.Tensor]:
    """ Create the array for an output. This is either a numpy array or a
        torch tensor depending on `use_torch`: when `use_torch` is True, the
        output will be a tensor, otherwise it will be a numpy array.

        :param inferred_symbols: the symbols inferred from `infer_symbols_from_shapes`.
        :param desc: the data descriptor for the array.
        :param use_torch: whether to return a torch tensor instead of a numpy array.
        :param zeros: if True, initialize with zeros, otherwise leave the memory uninitialized.
    """
    def eval_dim(dim):
        for sym in dim.free_symbols:
            dim = dim.subs(sym, inferred_symbols[sym.name])
        return dim

    if dtypes.can_access(dtypes.ScheduleType.CPU_Multicore, desc.storage):
        cuda = False
    elif dtypes.can_access(dtypes.ScheduleType.GPU_Default, desc.storage):
        cuda = True
    else:
        raise ValueError(f"Unsupported storage {desc.storage}")

    if cuda and not use_torch:
        raise ValueError("Got use_torch=False, but received a GPU descriptor")

    shape = [eval_dim(d) if type(d) is dace.symbol else d for d in desc.shape]

    if use_torch:
        # as_numpy_dtype doesn't seem to work for indexing into the dict
        tens = (torch.zeros if zeros else torch.empty)(
            *shape,
            dtype=numpy_to_torch_dtype_dict[getattr(np,
                                                    desc.dtype.to_string())])
        return tens.cuda() if cuda else tens
    else:
        return (np.zeros if zeros else np.empty)(shape,
                                                 dtype=getattr(
                                                     np,
                                                     desc.dtype.to_string()))
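# Usage sketch (illustrative, helper name hypothetical): allocate a buffer for
# every non-transient array of an SDFG after shape inference. Torch tensors are
# requested so that GPU descriptors are also handled.
def _allocate_outputs_sketch(
        sdfg: SDFG,
        inferred_symbols: Dict[str, int]) -> Dict[str, torch.Tensor]:
    return {
        name: create_output_array(inferred_symbols,
                                  desc,
                                  use_torch=True,
                                  zeros=True)
        for name, desc in sdfg.arrays.items() if not desc.transient
    }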
def apply(self, sdfg: dace.SDFG):
    # Extract the subgraph, execute it and insert an AccessNode for the result.
    # This method of execution is slow but simple. A better option would be to
    # call the ORT C API from a python object (like the OpChecker).
    parent: ONNXModel = sdfg._parent_onnx_model
    state = sdfg.nodes()[self.state_id]
    node = state.nodes()[self.subgraph[ConstantFolding._onnx_node]]
    log.debug(f"Applying constant folding: {node} in {state}")

    if isinstance(node, donnx.ONNXShape):
        # if we have a Shape node, replace it with a constant
        assert len(state.in_edges(node)) == 1
        shape_in_edge = state.in_edges(node)[0]
        assert shape_in_edge.dst_conn == "data"
        shape_desc = sdfg.arrays[shape_in_edge.src.data]

        constant_name = sdfg.temp_data_name()
        clean_constant_name = clean_onnx_name(constant_name)
        sdfg.add_array(clean_constant_name, (len(shape_desc.shape), ),
                       dace.int64)

        assert constant_name not in parent.clean_weights
        parent.weights[constant_name] = torch.from_numpy(
            np.array(shape_desc.shape, np.int64))

        assert len(state.out_edges(node)) == 1
        output_edge = state.out_edges(node)[0]
        access_shape = state.add_access(clean_constant_name)
        state.add_edge(access_shape, None, output_edge.dst,
                       output_edge.dst_conn,
                       sdfg.make_array_memlet(clean_constant_name))
    else:
        # otherwise compute the result of the op
        global UNIQUE_ID
        UNIQUE_ID += 1
        sub_sdfg = dace.SDFG("sub_sdfg_" + str(UNIQUE_ID))
        sub_state = sub_sdfg.add_state()

        node_copy = copy.deepcopy(node)
        sub_state.add_node(node_copy)

        inputs = {}
        for edge in state.in_edges(node):
            # we know from can_be_applied that all in edges are from AccessNodes
            assert (isinstance(edge.src, nd.AccessNode)
                    and hasattr(sdfg, "_parent_onnx_model")
                    and edge.src.data in sdfg._parent_onnx_model.clean_weights)

            desc = copy.deepcopy(sdfg.arrays[edge.data.data])
            desc.transient = False
            sub_sdfg.add_datadesc('array_' + edge.dst_conn, desc)

            input_value = sdfg._parent_onnx_model.clean_weights[edge.src.data]

            if len(input_value.shape) == 0:
                inputs['array_' + edge.dst_conn] = input_value.cpu().numpy()[()]
            else:
                inputs['array_' + edge.dst_conn] = input_value.clone()

            access = sub_state.add_access('array_' + edge.dst_conn)
            sub_state.add_edge(
                access, None, node_copy, edge.dst_conn,
                sub_sdfg.make_array_memlet('array_' + edge.dst_conn))

        outputs = {}
        for edge in state.out_edges(node):
            desc = copy.deepcopy(sdfg.arrays[edge.data.data])
            if isinstance(desc, dt.Scalar):
                # we need to copy to an array of size [1] so that we can
                # "return" the output from the sdfg
                desc.transient = True
                sub_sdfg.add_datadesc('scalar_array_' + edge.src_conn, desc)
                sub_sdfg.add_array('array_' + edge.src_conn, [1],
                                   desc.dtype,
                                   transient=False)

                access_scalar = sub_state.add_access('scalar_array_' +
                                                     edge.src_conn)
                access = sub_state.add_access('array_' + edge.src_conn)
                sub_state.add_edge(
                    node_copy, edge.src_conn, access_scalar, None,
                    sub_sdfg.make_array_memlet('scalar_array_' +
                                               edge.src_conn))
                sub_state.add_edge(
                    access_scalar, None, access, None,
                    sub_sdfg.make_array_memlet('array_' + edge.src_conn))
            else:
                desc.transient = False
                sub_sdfg.add_datadesc('array_' + edge.src_conn, desc)
                access = sub_state.add_access('array_' + edge.src_conn)
                sub_state.add_edge(
                    node_copy, edge.src_conn, access, None,
                    sub_sdfg.make_array_memlet('array_' + edge.src_conn))

            if len(desc.shape) == 0:
                empty_array = np.empty((1, ), desc.dtype.as_numpy_dtype())
            else:
                empty_array = np.empty(tuple(desc.shape),
                                       desc.dtype.as_numpy_dtype())

            empty_array = torch.from_numpy(empty_array)

            if desc.storage is dtypes.StorageType.GPU_Global:
                empty_array = empty_array.cuda()

            outputs['array_' + edge.src_conn] = empty_array

        sub_sdfg(**outputs, **inputs)

        for edge in state.out_edges(node):
            desc = copy.deepcopy(sdfg.arrays[edge.data.data])
            desc.transient = False
            output_value = outputs['array_' + edge.src_conn]

            constant_name = sdfg.temp_data_name()
            clean_constant_name = clean_onnx_name(constant_name)
            sdfg.add_datadesc(clean_constant_name, desc)

            assert constant_name not in parent.weights
            assert type(output_value) is torch.Tensor

            if not dtypes.can_access(dtypes.ScheduleType.CPU_Multicore,
                                     desc.storage):
                cpu_desc = copy.deepcopy(desc)
                cpu_desc.storage = dtypes.StorageType.CPU_Heap
                cpu_desc.transient = False
                desc.transient = True
                copy_in_name = sdfg.temp_data_name()
                clean_copy_in_name = clean_onnx_name(copy_in_name)
                sdfg.add_datadesc(clean_copy_in_name, cpu_desc)

                access_constant = state.add_access(clean_constant_name)
                state.add_edge(state.add_read(clean_copy_in_name), None,
                               access_constant, None,
                               sdfg.make_array_memlet(clean_copy_in_name))

                name_to_add = copy_in_name
            else:
                access_constant = state.add_read(clean_constant_name)
                name_to_add = constant_name

            if isinstance(desc, dt.Scalar):
                parent.weights[name_to_add] = output_value.reshape(())
            else:
                parent.weights[name_to_add] = output_value

            state.add_edge(access_constant, None, edge.dst, edge.dst_conn,
                           sdfg.make_array_memlet(clean_constant_name))

    # remove all now-useless nodes with a reverse BFS
    remove_node_and_computation(sdfg, state, node)
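# The cleanup above delegates to `remove_node_and_computation`. As a rough,
# illustrative approximation (not necessarily the library's exact
# implementation), it behaves like a reverse BFS from the folded node that
# deletes every predecessor whose outputs are no longer consumed:
from collections import deque

def _remove_node_and_computation_sketch(sdfg: dace.SDFG, state: SDFGState,
                                        node: nd.Node):
    queue = deque([node])
    while queue:
        current = queue.popleft()
        predecessors = {e.src for e in state.in_edges(current)}
        state.remove_node(current)
        for pred in predecessors:
            # dead if nothing consumes its outputs anymore
            if len(state.out_edges(pred)) == 0:
                queue.append(pred)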
def expansion(node, state: SDFGState, sdfg: SDFG):
    # Extract input and output array views (as generated by memlets)
    inputs, outputs = _get_inputs_and_outputs(sdfg, state, node)

    unique_id = "{}_{}_{}_{}".format(clean_onnx_name(node.name),
                                     sdfg.sdfg_id, sdfg.node_id(state),
                                     state.node_id(node))
    _add_ort_init_code(sdfg)

    sdfg.append_global_code(
        "OrtExecutableKernel *__ort_kernel_{};\n".format(unique_id))
    sdfg.append_global_code(
        "OrtExecutableKernelContext *__ort_context_{};\n".format(unique_id))

    sdfg.append_init_code("""
    {{
    // Setup for {name}
    __ort_check_status(__ort_api->CreateExecutableKernelContext("{name}", "{op_type}", &__ort_context_{name}));
    """.format(name=unique_id, op_type=node.schema.name))

    # check if ORT supports CUDA for this node
    ##########################################

    # Default: all parameters are on CPU if we execute using cpu
    outputs_on_host = [True for _ in range(len(outputs))]
    inputs_on_host = [True for _ in range(len(inputs))]

    actual_node_schedule = node.schedule
    if node.schedule == ScheduleType.CPU_Multicore or node.schedule == ScheduleType.Default:
        provider_index = 0
    elif node.schedule == ScheduleType.GPU_Device:
        provider_index = 1
        try:
            # the ith entry indicates whether the ith input/output should be
            # in host memory
            inputs_on_host, outputs_on_host = check_op(sdfg,
                                                       state,
                                                       node,
                                                       cuda=True)
        except ONNXOpValidationError as e:
            # fallback to CPU
            print("Falling back to CPU for node {}. Reason:\n{}".format(
                node.name, str(e)))
            provider_index = 0
            actual_node_schedule = ScheduleType.Default
    else:
        raise NotImplementedError(
            "ORT expansion for schedule '{}' is not implemented".format(
                node.schedule))

    # check if we need to insert device copies
    ##########################################

    # maps each connector for which a copy will be required to the copy
    # options (including the storage type required to be connected to the
    # tasklet)
    input_copy_required = defaultdict(dict)
    output_copy_required = defaultdict(dict)

    assert len(node.iter_outputs_in_onnx_order(state)) == len(outputs_on_host)
    assert len(node.iter_inputs_in_onnx_order(state)) == len(inputs_on_host)

    # check outputs
    for edge, output_on_host in zip(node.iter_outputs_in_onnx_order(state),
                                    outputs_on_host):
        # get the array that this output's memlet refers to
        array = sdfg.arrays[edge.data.data]
        if output_on_host:
            is_device_mismatch = not can_access(ScheduleType.Default,
                                                array.storage)
        else:
            is_device_mismatch = not can_access(ScheduleType.GPU_Device,
                                                array.storage)

        if isinstance(array, dt.Scalar
                      ) and actual_node_schedule == ScheduleType.GPU_Device:
            # ORT kernels expect scalars to be cudaMalloced. We will copy
            # during expansion to enforce this
            is_device_mismatch = True
            output_copy_required[edge.src_conn]['copy_to_array'] = True

        if is_device_mismatch:
            # we need to insert a copy
            output_copy_required[edge.src_conn][
                'storage'] = StorageType.Default if output_on_host else StorageType.GPU_Global

    # check inputs (same procedure as outputs)
    for edge, input_on_host in zip(node.iter_inputs_in_onnx_order(state),
                                   inputs_on_host):
        array = sdfg.arrays[edge.data.data]
        if input_on_host:
            is_device_mismatch = not can_access(ScheduleType.Default,
                                                array.storage)
        else:
            is_device_mismatch = not can_access(ScheduleType.GPU_Device,
                                                array.storage)

        if isinstance(array, dt.Scalar
                      ) and actual_node_schedule == ScheduleType.GPU_Device:
            # ORT kernels expect scalars to be cudaMalloced. We will copy
            # during expansion to enforce this
            is_device_mismatch = True
            input_copy_required[edge.dst_conn]['copy_to_array'] = True

        if is_device_mismatch:
            # we need to insert a copy
            input_copy_required[edge.dst_conn][
                'storage'] = StorageType.Default if input_on_host else StorageType.GPU_Global

    # begin codegen
    ##########################################
    tasklet_setup_code = ""
    tasklet_code = ""
    tasklet_cleanup_code = ""

    reversed_onnx_dtype_map = {
        v: k
        for k, v in ONNX_DTYPES_TO_DACE_TYPE_CLASS.items()
    }

    # emit code for inputs and outputs
    ##########################################
    in_connectors = {}
    out_connectors = {}
    for edge, is_input in node.iter_edges(state):

        parameter_name = edge.dst_conn if is_input else edge.src_conn

        if len(output_copy_required) != 0 or len(input_copy_required) != 0:
            edge_connector_name = "_conn_" + parameter_name
        else:
            edge_connector_name = parameter_name

        input_output_string = "input" if is_input else "output"
        connector_dict = in_connectors if is_input else out_connectors
        memlet = edge.data
        desc = sdfg.arrays[memlet.data]
        sdfg.append_init_code("""
        // Add parameter {parameter_name}
        __ort_check_status(__ort_api->ExecutableKernelContext_Add{input_output_string}(__ort_context_{id}, ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_string}));
        """.format(id=unique_id,
                   type_string=reversed_onnx_dtype_map[desc.dtype].upper(),
                   parameter_name=parameter_name,
                   input_output_string=input_output_string.capitalize()))

        ort_value_name = "ort_value_{input_output_string}_{parameter_name}".format(
            input_output_string=input_output_string,
            parameter_name=parameter_name)

        copy_to_array = (
            (parameter_name in output_copy_required
             and 'copy_to_array' in output_copy_required[parameter_name]) or
            (parameter_name in input_copy_required
             and 'copy_to_array' in input_copy_required[parameter_name]))

        if desc.storage == StorageType.Default:
            mem_info = "__ort_cpu_mem_info"
        elif desc.storage == StorageType.GPU_Global:
            mem_info = "__ort_cuda_mem_info"
        elif desc.storage == StorageType.CPU_Pinned:
            mem_info = "__ort_cuda_pinned_mem_info"
        else:
            raise ValueError(
                "Unsupported storage type {} for input to ONNX node".format(
                    desc.storage))

        if (isinstance(desc, dt.Scalar) and
                # when copying to array, the ort value is not a scalar but an array
                not copy_to_array):

            tasklet_setup_code += """
            OrtValue* {ort_value_name};
            __ort_check_status(__ort_api->CreateTensorWithDataAsOrtValue(
                {mem_info},
                &{edge_connector_name},
                {data_size} * sizeof({ctype}),
                nullptr,
                0,
                ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str},
                &{ort_value_name}
            ));
            """.format(
                input_output_string=input_output_string,
                mem_info=mem_info,
                edge_connector_name=edge_connector_name,
                data_size=reduce(lambda x, y: x * y, desc.shape),
                ctype=desc.dtype.ctype,
                type_str=reversed_onnx_dtype_map[desc.dtype].upper(),
                ort_value_name=ort_value_name)

            connector_dict[parameter_name] = None

        elif isinstance(desc, dt.Array) or copy_to_array:

            # when we copy a scalar to an array, that scalar of course has shape []
            dims = [] if copy_to_array else desc.shape

            # setup dims array
            tasklet_setup_code += """
            int64_t {input_output_string}_{parameter_name}_dims[{dims_size}] = {{{dims}}};
            """.format(input_output_string=input_output_string,
                       parameter_name=parameter_name,
                       dims_size=len(dims),
                       dims=", ".join(str(s) for s in dims))

            connector_dict[parameter_name] = dace.pointer(desc.dtype)
            data = "const_cast<void*>(reinterpret_cast<const void*>({}))".format(
                edge_connector_name)

            tasklet_setup_code += """
            OrtValue* {ort_value_name};
            __ort_check_status(__ort_api->CreateTensorWithDataAsOrtValue(
                {mem_info},
                {data},
                {data_size} * sizeof({ctype}),
                {input_output_string}_{parameter_name}_dims,
                {dims_size},
                ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str},
                &{ort_value_name}
            ));
            """.format(
                input_output_string=input_output_string,
                data=data,
                mem_info=mem_info,
                parameter_name=parameter_name,
                data_size=reduce(lambda x, y: x * y, desc.shape),
                ctype=desc.dtype.ctype,
                dims_size=len(dims),
                type_str=reversed_onnx_dtype_map[desc.dtype].upper(),
                ort_value_name=ort_value_name)
        else:
            raise NotImplementedError(
                "Data-descriptor type {} not supported for ONNX nodes".format(
                    type(desc)))

        tasklet_code += "__ort_check_status(__ort_api->ExecutableKernel_Set{input_output_string_capital}(" \
                        "__ort_kernel_{unique_id}, {position}, {ort_value_name}));\n".format(
            input_output_string_capital=input_output_string.capitalize(),
            ort_value_name=ort_value_name,
            unique_id=unique_id,
            position=get_position(node.schema, is_input, parameter_name))

        tasklet_cleanup_code += "__ort_api->ReleaseValue(ort_value_{input_output_string}_{parameter_name});\n".format(
            input_output_string=input_output_string,
            parameter_name=parameter_name)

    sdfg.append_init_code("// Setup attributes\n")

    for name, attr in node.schema.attributes.items():
        if hasattr(node, name):
            sdfg.append_init_code(
                _gen_attr_init_code("__ort_context_{}".format(unique_id),
                                    node.schema.attributes[name],
                                    getattr(node, name)))

    sdfg.prepend_exit_code(
        "__ort_api->ReleaseExecutableKernelContext(__ort_context_{});\n".
        format(unique_id))
    sdfg.prepend_exit_code(
        "__ort_api->ReleaseExecutableKernel(__ort_kernel_{});\n".format(
            unique_id))

    tasklet_code += 'fprintf(stderr, "Launching {}\\n");\n'.format(unique_id)
    tasklet_code += "__ort_check_status(__ort_api->ExecutableKernel_Compute(__ort_kernel_{}));\n".format(
        unique_id)

    sdfg.append_init_code(
        "__ort_check_status(__ort_api->CreateExecutableKernel("
        "__ort_session, __ort_context_{id}, /*provider_index=*/{provider_index}, &__ort_kernel_{id}));\n"
        .format(provider_index=provider_index, id=unique_id))
    sdfg.append_init_code("}} // end setup for context_{}".format(unique_id))

    tasklet_code = tasklet_setup_code + tasklet_code + tasklet_cleanup_code
    tasklet = nd.Tasklet('onnx_code',
                         in_connectors,
                         out_connectors,
                         tasklet_code,
                         language=dace.dtypes.Language.CPP)
    tasklet.environments = {"ONNXRuntime"}

    if len(output_copy_required) != 0 or len(input_copy_required) != 0:
        nsdfg = dace.SDFG("nested_{}".format(unique_id))
        nstate = nsdfg.add_state()
        ntasklet = deepcopy(tasklet)

        # add a prefix to connectors to prevent shadowing of array names
        ntasklet.in_connectors = {
            "_conn_" + k: v
            for k, v in tasklet.in_connectors.items()
        }
        ntasklet.out_connectors = {
            "_conn_" + k: v
            for k, v in tasklet.out_connectors.items()
        }

        nstate.add_node(ntasklet)

        for edge, is_input in node.iter_edges(state):
            parameter_name = edge.dst_conn if is_input else edge.src_conn

            memlet = edge.data
            desc = sdfg.arrays[memlet.data]

            # add the original array
            original_desc = deepcopy(desc)
            original_desc.transient = False
            nsdfg.add_datadesc(parameter_name, original_desc)
            if not (isinstance(desc, dt.Array)
                    or isinstance(desc, dt.Scalar)):
                raise ValueError(
                    "Unsupported data type {} connected to an ONNX tasklet".
                    format(type(desc)))

            if parameter_name not in (input_copy_required
                                      if is_input else output_copy_required):
                if is_input:
                    access = nstate.add_read(parameter_name)
                    nstate.add_edge(access, None, ntasklet,
                                    "_conn_" + parameter_name,
                                    nsdfg.get_array_memlet(parameter_name))
                else:
                    access = nstate.add_write(parameter_name)
                    nstate.add_edge(ntasklet, "_conn_" + parameter_name,
                                    access, None,
                                    nsdfg.get_array_memlet(parameter_name))
                continue

            copy_options = input_copy_required[
                parameter_name] if is_input else output_copy_required[
                    parameter_name]

            # add the copy of the descriptor
            if 'copy_to_array' in copy_options:
                copy_desc = dt.Array(shape=[1], dtype=desc.dtype)
            else:
                copy_desc = deepcopy(desc)

            copy_desc.transient = True
            copy_desc.storage = copy_options['storage']
            nsdfg.add_datadesc("copy_" + memlet.data, copy_desc)

            nmemlet = deepcopy(memlet)
            nmemlet.data = "copy_" + nmemlet.data
            if is_input:
                access = nstate.add_read(parameter_name)
                access_copy = nstate.add_access("copy_" + memlet.data)
                nstate.add_edge(access, None, access_copy, None,
                                nsdfg.get_array_memlet("copy_" + memlet.data))
                nstate.add_edge(access_copy, None, ntasklet,
                                "_conn_" + parameter_name, nmemlet)
            else:
                access = nstate.add_write(parameter_name)
                access_copy = nstate.add_access("copy_" + memlet.data)
                nstate.add_edge(ntasklet, "_conn_" + parameter_name,
                                access_copy, None, nmemlet)
                nstate.add_edge(access_copy, None, access, None,
                                nsdfg.get_array_memlet("copy_" + memlet.data))

        return nsdfg
    else:
        return tasklet
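# For reference, the copy-requirement bookkeeping built in the expansion maps
# each connector to an options dict. Hypothetical example values:
_example_copy_required = {
    # plain device copy: 'storage' selects the storage type of the transient
    # that is wired to the tasklet
    "X": {"storage": StorageType.GPU_Global},
    # GPU scalar: staged through a one-element array, since ORT kernels expect
    # scalars to be cudaMalloced
    "alpha": {"copy_to_array": True, "storage": StorageType.GPU_Global},
}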