Example #1
0
def check_required_copies(
    node: nd.Node, state: SDFGState, sdfg: SDFG, outputs_on_host: List[bool],
    inputs_on_host: List[bool]
) -> Tuple[Dict[str, dtypes.StorageType], Dict[str, dtypes.StorageType]]:
    """ Check whether copies are required for all parameters.
        :param node: the node.
        :param state: the state.
        :param sdfg: the sdfg.
        :param outputs_on_host: boolean list, where the ith bool indicates if the ith output should be on host.
        :param inputs_on_host: boolean list, where the ith bool indicates if the ith input should be on host.
        :return: two dicts containing storage types for each of the connectors that require copies. The first
                 dict is for the inputs, the second is for the outputs.
    """

    # maps the connectors for which a copy will be required to the storage type required to be connected to the tasklet
    input_copy_required: Dict[str, dtypes.StorageType] = {}
    output_copy_required: Dict[str, dtypes.StorageType] = {}

    assert len(node.iter_outputs_in_onnx_order(state)) == len(outputs_on_host)
    assert len(node.iter_inputs_in_onnx_order(state)) == len(inputs_on_host)

    # check outputs
    for edge, output_on_host in zip(node.iter_outputs_in_onnx_order(state),
                                    outputs_on_host):
        # get the memlet for this output
        array = sdfg.arrays[edge.data.data]

        if output_on_host:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.Default, array.storage)
        else:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.GPU_Device, array.storage)

        if is_device_mismatch:
            # we need to insert a copy
            storage = dtypes.StorageType.CPU_Heap if output_on_host else dtypes.StorageType.GPU_Global
            output_copy_required[edge.src_conn] = storage

    # check inputs (same thing again)
    for edge, input_on_host in zip(node.iter_inputs_in_onnx_order(state),
                                   inputs_on_host):
        array = sdfg.arrays[edge.data.data]

        if input_on_host:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.Default, array.storage)
        else:
            is_device_mismatch = not dtypes.can_access(
                dtypes.ScheduleType.GPU_Device, array.storage)

        if is_device_mismatch:
            # we need to insert a copy
            storage = dtypes.StorageType.CPU_Heap if input_on_host else dtypes.StorageType.GPU_Global
            input_copy_required[edge.dst_conn] = storage

    return input_copy_required, output_copy_required
Example #2
0
def check_access(schedule: dtypes.ScheduleType, *descs: data.Data):
    """ If schedule cannot access all passed descriptors, through an error.

        :param schedule: the schedule.
        :param descs: the descriptors to check.
    """
    for desc in descs:
        if not dtypes.can_access(schedule, desc.storage):
            raise ValueError(
                f"Schedule mismatch: {schedule} cannot access {desc.storage}")
Example #3
0
def create_output_array(
        inferred_symbols: Dict[str, int],
        desc: dt.Data,
        use_torch=False,
        zeros: bool = False) -> Union[np.ndarray, torch.tensor]:
    """ Create the array for an output. This is either a numpy array or a torch tensor depending on `use_torch`

        When `self.force_torch_outputs` is True, the outputs will be tensors. Otherwise, the outputs will be tensors
        :param inferred_symbols: the symbols inferred from `infer_symbols_from_shapes`.
        :param desc: the data descriptor for the array
        :param use_torch: whether to return a numpy array or a torch tensor.
        :param zeros: if true init with zeros else empty.
    """
    def eval_dim(dim):
        for sym in dim.free_symbols:
            dim = dim.subs(sym, inferred_symbols[sym.name])
        return dim

    if dtypes.can_access(dtypes.ScheduleType.CPU_Multicore, desc.storage):
        cuda = False
    elif dtypes.can_access(dtypes.ScheduleType.GPU_Default, desc.storage):
        cuda = True
    else:
        raise ValueError(f"Unsupported storage {desc.storage}")

    if cuda and not use_torch:
        raise ValueError("Got use_torch=False, but received a GPU descriptor")

    shape = [eval_dim(d) if type(d) is dace.symbol else d for d in desc.shape]
    if use_torch:
        # as_numpy_dtype doesn't seem to work for indexing into the dict
        tens = (torch.zeros if zeros else torch.empty)(
            *shape,
            dtype=numpy_to_torch_dtype_dict[getattr(np,
                                                    desc.dtype.to_string())])
        return tens.cuda() if cuda else tens
    else:
        return (np.zeros if zeros else np.empty)(shape,
                                                 dtype=getattr(
                                                     np,
                                                     desc.dtype.to_string()))
Example #4
0
    def apply(self, sdfg: dace.SDFG):
        # Extract the subgraph, execute it and insert an AccessNode to the result
        # this method of execution is slow but simple. A better option would be to call the ORT
        # C API from a python object (like the OpChecker).

        parent: ONNXModel = sdfg._parent_onnx_model
        state = sdfg.nodes()[self.state_id]
        node = state.nodes()[self.subgraph[ConstantFolding._onnx_node]]
        log.debug(f"Applying constant folding: {node} in {state}")

        if isinstance(node, donnx.ONNXShape):
            # if we have a shape node, replace it with a constant
            assert len(state.in_edges(node)) == 1
            shape_in_edge = state.in_edges(node)[0]
            assert shape_in_edge.dst_conn == "data"
            shape_desc = sdfg.arrays[shape_in_edge.src.data]

            constant_name = sdfg.temp_data_name()
            clean_constant_name = clean_onnx_name(constant_name)
            sdfg.add_array(clean_constant_name, (len(shape_desc.shape), ),
                           dace.int64)

            assert constant_name not in parent.clean_weights
            parent.weights[constant_name] = torch.from_numpy(
                np.array(shape_desc.shape, np.int64))

            assert len(state.out_edges(node)) == 1
            output_edge = state.out_edges(node)[0]
            access_shape = state.add_access(clean_constant_name)
            state.add_edge(access_shape, None, output_edge.dst,
                           output_edge.dst_conn,
                           sdfg.make_array_memlet(clean_constant_name))
        else:
            # otherwise compute the result of the op
            global UNIQUE_ID
            UNIQUE_ID += 1
            sub_sdfg = dace.SDFG("sub_sdfg_" + str(UNIQUE_ID))
            sub_state = sub_sdfg.add_state()

            node_copy = copy.deepcopy(node)
            sub_state.add_node(node_copy)

            inputs = {}
            for edge in state.in_edges(node):
                # we know from can_be_applied that all in edges are from AccessNodes
                assert (isinstance(edge.src, nd.AccessNode)
                        and hasattr(sdfg, "_parent_onnx_model") and
                        edge.src.data in sdfg._parent_onnx_model.clean_weights)

                desc = copy.deepcopy(sdfg.arrays[edge.data.data])
                desc.transient = False
                sub_sdfg.add_datadesc('array_' + edge.dst_conn, desc)

                input_value = sdfg._parent_onnx_model.clean_weights[
                    edge.src.data]

                if len(input_value.shape) == 0:
                    inputs['array_' +
                           edge.dst_conn] = input_value.cpu().numpy()[()]
                else:
                    inputs['array_' + edge.dst_conn] = input_value.clone()

                access = sub_state.add_access('array_' + edge.dst_conn)
                sub_state.add_edge(
                    access, None, node_copy, edge.dst_conn,
                    sub_sdfg.make_array_memlet('array_' + edge.dst_conn))

            outputs = {}
            for edge in state.out_edges(node):
                desc = copy.deepcopy(sdfg.arrays[edge.data.data])
                if isinstance(desc, dt.Scalar):
                    # we need to copy to an array of size [1] so that we can "return" the output from the sdfg
                    desc.transient = True
                    sub_sdfg.add_datadesc('scalar_array_' + edge.src_conn,
                                          desc)
                    sub_sdfg.add_array('array_' + edge.src_conn, [1],
                                       desc.dtype,
                                       transient=False)

                    access_scalar = sub_state.add_access('scalar_array_' +
                                                         edge.src_conn)
                    access = sub_state.add_access('array_' + edge.src_conn)
                    sub_state.add_edge(
                        node_copy, edge.src_conn, access_scalar, None,
                        sub_sdfg.make_array_memlet('scalar_array_' +
                                                   edge.src_conn))

                    sub_state.add_edge(
                        access_scalar, None, access, None,
                        sub_sdfg.make_array_memlet('array_' + edge.src_conn))
                else:
                    desc.transient = False
                    sub_sdfg.add_datadesc('array_' + edge.src_conn, desc)
                    access = sub_state.add_access('array_' + edge.src_conn)
                    sub_state.add_edge(
                        node_copy, edge.src_conn, access, None,
                        sub_sdfg.make_array_memlet('array_' + edge.src_conn))

                if len(desc.shape) == 0:
                    empty_array = np.empty((1, ), desc.dtype.as_numpy_dtype())
                else:
                    empty_array = np.empty(tuple(desc.shape),
                                           desc.dtype.as_numpy_dtype())

                empty_array = torch.from_numpy(empty_array)

                if desc.storage is dtypes.StorageType.GPU_Global:
                    empty_array = empty_array.cuda()

                outputs['array_' + edge.src_conn] = empty_array

            sub_sdfg(**outputs, **inputs)

            for edge in state.out_edges(node):
                desc = copy.deepcopy(sdfg.arrays[edge.data.data])
                desc.transient = False
                output_value = outputs['array_' + edge.src_conn]

                constant_name = sdfg.temp_data_name()
                clean_constant_name = clean_onnx_name(constant_name)
                sdfg.add_datadesc(clean_constant_name, desc)

                assert constant_name not in parent.weights
                assert type(output_value) is torch.Tensor

                if not dtypes.can_access(dtypes.ScheduleType.CPU_Multicore,
                                         desc.storage):
                    cpu_desc = copy.deepcopy(desc)
                    cpu_desc.storage = dtypes.StorageType.CPU_Heap
                    cpu_desc.transient = False
                    desc.transient = True
                    copy_in_name = sdfg.temp_data_name()
                    clean_copy_in_name = clean_onnx_name(copy_in_name)
                    sdfg.add_datadesc(clean_copy_in_name, cpu_desc)

                    access_constant = state.add_access(clean_constant_name)
                    state.add_edge(state.add_read(clean_copy_in_name), None,
                                   access_constant, None,
                                   sdfg.make_array_memlet(clean_copy_in_name))

                    name_to_add = copy_in_name
                else:
                    access_constant = state.add_read(clean_constant_name)
                    name_to_add = constant_name

                if isinstance(desc, dt.Scalar):
                    parent.weights[name_to_add] = output_value.reshape(())
                else:
                    parent.weights[name_to_add] = output_value

                state.add_edge(access_constant, None, edge.dst, edge.dst_conn,
                               sdfg.make_array_memlet(clean_constant_name))

        # remove all now useless nodes with a reverse BFS
        remove_node_and_computation(sdfg, state, node)
Example #5
0
    def expansion(node, state: SDFGState, sdfg: SDFG):
        # Extract input and output array views (as generated by memlets)
        inputs, outputs = _get_inputs_and_outputs(sdfg, state, node)

        unique_id = "{}_{}_{}_{}".format(clean_onnx_name(node.name),
                                         sdfg.sdfg_id, sdfg.node_id(state),
                                         state.node_id(node))
        _add_ort_init_code(sdfg)

        sdfg.append_global_code(
            "OrtExecutableKernel *__ort_kernel_{};\n".format(unique_id))
        sdfg.append_global_code(
            "OrtExecutableKernelContext *__ort_context_{};\n".format(
                unique_id))

        sdfg.append_init_code("""
        {{
        // Setup for {name}
        __ort_check_status(__ort_api->CreateExecutableKernelContext("{name}", "{op_type}", &__ort_context_{name}));
        """.format(name=unique_id, op_type=node.schema.name))

        # check if ORT supports CUDA for this node
        ##########################################

        # Default: all parameters are on CPU if we execute using cpu
        outputs_on_host = [True for _ in range(len(outputs))]
        inputs_on_host = [True for _ in range(len(inputs))]

        actual_node_schedule = node.schedule
        if node.schedule == ScheduleType.CPU_Multicore or node.schedule == ScheduleType.Default:
            provider_index = 0
        elif node.schedule == ScheduleType.GPU_Device:
            provider_index = 1
            try:
                # the ith position indicates whether the ith output is in host memory
                inputs_on_host, outputs_on_host = check_op(sdfg,
                                                           state,
                                                           node,
                                                           cuda=True)

            except ONNXOpValidationError as e:
                # fallback to CPU
                print("Falling back to CPU for node {}. Reason:\n{}".format(
                    node.name, str(e)))
                provider_index = 0
                actual_node_schedule = ScheduleType.Default
        else:
            raise NotImplementedError(
                "ORT expansion for schedule '{}' is not implemented".format(
                    node.schedule))

        # check if we need to insert device copies
        ##########################################

        # maps the connectors for which a copy will be required to the storage type required to be connected to the tasklet
        input_copy_required = defaultdict(dict)
        output_copy_required = defaultdict(dict)

        assert len(
            node.iter_outputs_in_onnx_order(state)) == len(outputs_on_host)
        assert len(
            node.iter_inputs_in_onnx_order(state)) == len(inputs_on_host)

        # check outputs
        for edge, output_on_host in zip(node.iter_outputs_in_onnx_order(state),
                                        outputs_on_host):
            # get the memlet for this output
            array = sdfg.arrays[edge.data.data]

            if output_on_host:
                is_device_mismatch = not can_access(ScheduleType.Default,
                                                    array.storage)
            else:
                is_device_mismatch = not can_access(ScheduleType.GPU_Device,
                                                    array.storage)

            if isinstance(
                    array, dt.Scalar
            ) and actual_node_schedule == ScheduleType.GPU_Device:
                # ORT kernels expect scalars to be cudaMalloced. We will copy during expansion to enforce this
                is_device_mismatch = True
                output_copy_required[edge.src_conn]['copy_to_array'] = True

            if is_device_mismatch:
                # we need to insert a copy
                output_copy_required[edge.src_conn][
                    'storage'] = StorageType.Default if output_on_host else StorageType.GPU_Global

        # check inputs (same thing again)
        for edge, input_on_host in zip(node.iter_inputs_in_onnx_order(state),
                                       inputs_on_host):
            array = sdfg.arrays[edge.data.data]

            if input_on_host:
                is_device_mismatch = not can_access(ScheduleType.Default,
                                                    array.storage)
            else:
                is_device_mismatch = not can_access(ScheduleType.GPU_Device,
                                                    array.storage)

            if isinstance(
                    array, dt.Scalar
            ) and actual_node_schedule == ScheduleType.GPU_Device:
                # ORT kernels expect scalars to be cudaMalloced. We will copy during expansion to enforce this
                is_device_mismatch = True
                input_copy_required[edge.dst_conn]['copy_to_array'] = True

            if is_device_mismatch:
                # we need to insert a copy
                input_copy_required[edge.dst_conn][
                    'storage'] = StorageType.Default if input_on_host else StorageType.GPU_Global

        # begin codegen
        ##########################################
        tasklet_setup_code = ""
        tasklet_code = ""
        tasklet_cleanup_code = ""

        reversed_onnx_dtype_map = {
            v: k
            for k, v in ONNX_DTYPES_TO_DACE_TYPE_CLASS.items()
        }

        # emit code for inputs and outputs
        ##########################################
        in_connectors = {}
        out_connectors = {}

        for edge, is_input in node.iter_edges(state):

            parameter_name = edge.dst_conn if is_input else edge.src_conn

            if len(output_copy_required) != 0 or len(input_copy_required) != 0:
                edge_connector_name = "_conn_" + parameter_name
            else:
                edge_connector_name = parameter_name

            input_output_string = "input" if is_input else "output"
            connector_dict = in_connectors if is_input else out_connectors
            memlet = edge.data
            desc = sdfg.arrays[memlet.data]
            sdfg.append_init_code("""
            // Add parameter {parameter_name}
            __ort_check_status(__ort_api->ExecutableKernelContext_Add{input_output_string}(__ort_context_{id}, ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_string}));
            """.format(id=unique_id,
                       type_string=reversed_onnx_dtype_map[desc.dtype].upper(),
                       parameter_name=parameter_name,
                       input_output_string=input_output_string.capitalize()))

            ort_value_name = "ort_value_{input_output_string}_{parameter_name}".format(
                input_output_string=input_output_string,
                parameter_name=parameter_name)

            copy_to_array = (
                (parameter_name in output_copy_required
                 and 'copy_to_array' in output_copy_required[parameter_name])
                or
                (parameter_name in input_copy_required
                 and 'copy_to_array' in input_copy_required[parameter_name]))
            if desc.storage == StorageType.Default:
                mem_info = "__ort_cpu_mem_info"
            elif desc.storage == StorageType.GPU_Global:
                mem_info = "__ort_cuda_mem_info"
            elif desc.storage == StorageType.CPU_Pinned:
                mem_info = "__ort_cuda_pinned_mem_info"
            else:
                raise ValueError(
                    "Unsupported storage type {} for input to ONNX node".
                    format(desc.storage))
            if (isinstance(desc, dt.Scalar) and
                    # when copying to array, the ort value is not a scalar but an array
                    not copy_to_array):

                tasklet_setup_code += """
                OrtValue* {ort_value_name};
                __ort_check_status(__ort_api->CreateTensorWithDataAsOrtValue(
                    {mem_info},
                    &{edge_connector_name},
                    {data_size} * sizeof({ctype}),
                    nullptr,
                    0,
                    ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str},
                    &{ort_value_name}
                ));
                """.format(
                    input_output_string=input_output_string,
                    mem_info=mem_info,
                    edge_connector_name=edge_connector_name,
                    data_size=reduce(lambda x, y: x * y, desc.shape),
                    ctype=desc.dtype.ctype,
                    type_str=reversed_onnx_dtype_map[desc.dtype].upper(),
                    ort_value_name=ort_value_name)
                connector_dict[parameter_name] = None

            elif isinstance(desc, dt.Array) or copy_to_array:

                # when we copy a scalar to an array, that scalar ofc has shape []
                dims = [] if copy_to_array else desc.shape

                # setup dims array
                tasklet_setup_code += """
                int64_t {input_output_string}_{parameter_name}_dims[{dims_size}] = {{{dims}}};
                """.format(input_output_string=input_output_string,
                           parameter_name=parameter_name,
                           dims_size=len(dims),
                           dims=", ".join(str(s) for s in dims))

                connector_dict[parameter_name] = dace.pointer(desc.dtype)
                data = "const_cast < void * > (reinterpret_cast < const void * > ({}))".format(
                    edge_connector_name)

                tasklet_setup_code += """
                OrtValue* {ort_value_name};
                __ort_check_status(__ort_api->CreateTensorWithDataAsOrtValue(
                    {mem_info},
                    {data},
                    {data_size} * sizeof({ctype}),
                    {input_output_string}_{parameter_name}_dims,
                    {dims_size},
                    ONNX_TENSOR_ELEMENT_DATA_TYPE_{type_str},
                    &{ort_value_name}
                ));
                """.format(
                    input_output_string=input_output_string,
                    data=data,
                    mem_info=mem_info,
                    parameter_name=parameter_name,
                    data_size=reduce(lambda x, y: x * y, desc.shape),
                    ctype=desc.dtype.ctype,
                    dims_size=len(dims),
                    type_str=reversed_onnx_dtype_map[desc.dtype].upper(),
                    ort_value_name=ort_value_name)
            else:
                raise NotImplementedError(
                    "Data-descriptor type {} not supported for ONNX nodes".
                    format(type(desc)))


            tasklet_code += "__ort_check_status(__ort_api->ExecutableKernel_Set{input_output_string_capital}(" \
                            "__ort_kernel_{unique_id}, {position}, {ort_value_name}));\n".format(
                input_output_string_capital=input_output_string.
                    capitalize(),
                ort_value_name=ort_value_name,
                unique_id=unique_id,
                position=get_position(node.schema, is_input,
                                      parameter_name))

            tasklet_cleanup_code += "__ort_api->ReleaseValue(ort_value_{input_output_string}_{parameter_name});\n".format(
                input_output_string=input_output_string,
                parameter_name=parameter_name)

        sdfg.append_init_code("// Setup attributes\n")

        for name, attr in node.schema.attributes.items():
            if hasattr(node, name):
                sdfg.append_init_code(
                    _gen_attr_init_code("__ort_context_{}".format(unique_id),
                                        node.schema.attributes[name],
                                        getattr(node, name)))

        sdfg.prepend_exit_code(
            "__ort_api->ReleaseExecutableKernelContext(__ort_context_{});\n".
            format(unique_id))
        sdfg.prepend_exit_code(
            "__ort_api->ReleaseExecutableKernel(__ort_kernel_{});\n".format(
                unique_id))

        tasklet_code += 'fprintf(stderr, "Launching {}\\n");\n'.format(
            unique_id)
        tasklet_code += "__ort_check_status(__ort_api->ExecutableKernel_Compute(__ort_kernel_{}));\n".format(
            unique_id)

        sdfg.append_init_code(
            "__ort_check_status(__ort_api->CreateExecutableKernel("
            "__ort_session, __ort_context_{id}, /*provider_index=*/{provider_index}, &__ort_kernel_{id}));\n"
            .format(provider_index=provider_index, id=unique_id))
        sdfg.append_init_code(
            "}} // end setup for context_{}".format(unique_id))

        tasklet_code = tasklet_setup_code + tasklet_code + tasklet_cleanup_code
        tasklet = nd.Tasklet('onnx_code',
                             in_connectors,
                             out_connectors,
                             tasklet_code,
                             language=dace.dtypes.Language.CPP)
        tasklet.environments = {"ONNXRuntime"}

        if len(output_copy_required) != 0 or len(input_copy_required) != 0:
            nsdfg = dace.SDFG("nested_{}".format(unique_id))
            nstate = nsdfg.add_state()
            ntasklet = deepcopy(tasklet)

            # add a prefix to connectors to prevent shadowing of array names
            ntasklet.in_connectors = {
                "_conn_" + k: v
                for k, v in tasklet.in_connectors.items()
            }
            ntasklet.out_connectors = {
                "_conn_" + k: v
                for k, v in tasklet.out_connectors.items()
            }

            nstate.add_node(ntasklet)

            for edge, is_input in node.iter_edges(state):
                parameter_name = edge.dst_conn if is_input else edge.src_conn

                memlet = edge.data
                desc = sdfg.arrays[memlet.data]

                # add the original array
                original_desc = deepcopy(desc)
                original_desc.transient = False
                nsdfg.add_datadesc(parameter_name, original_desc)
                if not (isinstance(desc, dt.Array)
                        or isinstance(desc, dt.Scalar)):
                    raise ValueError(
                        "Unsupported data type {} connected to an ONNX tasklet"
                        .format(type(desc)))

                if parameter_name not in (input_copy_required if is_input else
                                          output_copy_required):
                    if is_input:
                        access = nstate.add_read(parameter_name)
                        nstate.add_edge(access, None, ntasklet,
                                        "_conn_" + parameter_name,
                                        nsdfg.get_array_memlet(parameter_name))
                    else:
                        access = nstate.add_write(parameter_name)
                        nstate.add_edge(ntasklet, "_conn_" + parameter_name,
                                        access, None,
                                        nsdfg.get_array_memlet(parameter_name))
                    continue

                copy_options = input_copy_required[
                    parameter_name] if is_input else output_copy_required[
                        parameter_name]

                # add the copy of the descriptor
                if 'copy_to_array' in copy_options:
                    copy_desc = dt.Array(shape=[1], dtype=desc.dtype)
                else:
                    copy_desc = deepcopy(desc)

                copy_desc.transient = True
                copy_desc.storage = copy_options['storage']
                nsdfg.add_datadesc("copy_" + memlet.data, copy_desc)

                nmemlet = deepcopy(memlet)
                nmemlet.data = "copy_" + nmemlet.data
                if is_input:
                    access = nstate.add_read(parameter_name)
                    access_copy = nstate.add_access("copy_" + memlet.data)
                    nstate.add_edge(
                        access, None, access_copy, None,
                        nsdfg.get_array_memlet("copy_" + memlet.data))
                    nstate.add_edge(access_copy, None, ntasklet,
                                    "_conn_" + parameter_name, nmemlet)
                else:
                    access = nstate.add_write(parameter_name)
                    access_copy = nstate.add_access("copy_" + memlet.data)
                    nstate.add_edge(ntasklet, "_conn_" + parameter_name,
                                    access_copy, None, nmemlet)
                    nstate.add_edge(
                        access_copy, None, access, None,
                        nsdfg.get_array_memlet("copy_" + memlet.data))

            return nsdfg

        else:
            return tasklet