Ejemplo n.º 1
0
def test_dde_scope_reconnect():
    '''
    Corner case:
    map {
        tasklet(callback()) -> tasklet(do nothing)
    }
    expected map to stay connected
    '''
    sdfg = dace.SDFG('dde_scope_tester')
    sdfg.add_symbol('cb', dace.callback(dace.float64))
    sdfg.add_scalar('s', dace.float64, transient=True)

    state = sdfg.add_state()
    me, mx = state.add_map('doit', dict(i='0:2'))
    # Tasklet has a callback and cannot be removed
    t1 = state.add_tasklet('callback', {}, {'o'}, 'o = cb()', side_effects=True)
    # Tasklet has no output and thus can be removed
    t2 = state.add_tasklet('nothing', {'inp'}, {}, '')
    state.add_nedge(me, t1, dace.Memlet())
    state.add_edge(t1, 'o', t2, 'inp', dace.Memlet('s'))
    state.add_nedge(t2, mx, dace.Memlet())

    Pipeline([DeadDataflowElimination()]).apply_pass(sdfg, {})
    assert set(state.nodes()) == {me, t1, mx}
    sdfg.validate()
Ejemplo n.º 2
0
def test_invalid_callback():
    cb = dace.symbol('cb', dace.callback(dace.uint32[5]))

    @dace.program
    def shouldfail(out):
        with dace.tasklet:
            arr = cb()
            o = arr[1]
            o >> out

    with pytest.raises(DaceSyntaxError):
        oo = numpy.random.rand(10)
        shouldfail(oo)
Ejemplo n.º 3
0
def test_dce_callback_manual():
    sdfg = dace.SDFG('dce_cbman')
    sdfg.add_array('a', [20], dace.float64)
    sdfg.add_symbol('cb', dace.callback(None, dace.float64))
    state = sdfg.add_state()

    r = state.add_read('a')
    t = state.add_tasklet('callback', {'inp'}, {}, 'cb(inp)')
    state.add_edge(r, None, t, 'inp', dace.Memlet('a[0:20]'))

    sdfg.validate()
    Pipeline([DeadDataflowElimination()]).apply_pass(sdfg, {})
    assert set(state.nodes()) == {r, t}
    sdfg.validate()
Ejemplo n.º 4
0
def winograd_convolution(dace_session, tf_node):
    debugNodes = []
    state = dace_session.state
    add_cublas_cusolver(dace_session.graph)
    #############Add constants for transformation matrices###############
    dace_session.graph.add_constant('Btrans', bt)
    dace_session.graph.add_constant('B', b)
    bNode = 'B'
    bTransposeNode = 'Btrans'

    dace_session.graph.add_constant('G', g)
    dace_session.graph.add_constant('Gtrans', gt)
    gNode = 'G'
    gTransposeNode = 'Gtrans'

    dace_session.graph.add_constant('Atrans', at)
    dace_session.graph.add_constant('A', a)
    aNode = 'A'
    aTransposeNode = 'Atrans'

    inputNodes = []
    inputParams = []
    inputDims = []
    for _inp in tf_node.inputs:
        _node, _params, _dims = dace_session.create_and_add_input_node(_inp)
        inputNodes.append(_node)
        inputParams.append(_params)
        inputDims.append(_dims)
    # Manually add copy for kernel from CPU to GPU
    kernel_desc = inputNodes[1].desc(dace_session.graph)
    kernelGPU = state.add_transient(
        inputNodes[1].data + "GPU",
        shape=kernel_desc.shape,
        dtype=kernel_desc.dtype,
        lifetime=dtypes.AllocationLifetime.SDFG,
        storage=dace.StorageType.GPU_Global,
    )
    state.add_edge(
        inputNodes[1],
        None,
        kernelGPU,
        None,
        Memlet.from_array(inputNodes[1],
                          inputNodes[1].desc(dace_session.graph)),
    )
    inputNodes[1] = kernelGPU
    outputList = dace_session.create_and_add_output_node(tf_node)
    outputDims = dace_session.get_default_dims(tf_node.outputs[0])
    if str(tf_node.get_attr("padding"))[2:-1] == "SAME":
        paddedInput, paddedDims = dace_session.inputPadding(
            tf_node,
            inputNodes[0],
            inputNodes[0].desc(dace_session.graph),
            outputList[0].desc(dace_session.graph).shape[1],
            inputNodes[1].desc(dace_session.graph).shape[0],
            tf_node.get_attr("strides")[1],
            inputDims[0],
        )
        inputDims[0] = paddedDims
        inputNodes[0] = paddedInput
    outputShape = [int(_s) for _s in tf_node.outputs[0].shape]
    inputViewShape = [
        IMAGE_TILE_SIZE,
        IMAGE_TILE_SIZE,
        tf_node.inputs[0].shape[-1],
        outputShape[0] * ceil(outputShape[1] / OUTPUT_TILE_SIZE) *
        ceil(outputShape[2] / OUTPUT_TILE_SIZE),
    ]
    inputViewDims = ["0:" + str(_x) for _x in inputViewShape]
    ########Tiling the image#################################
    inputViewParams = [
        "i3%" + str(outputShape[0]),
        "(i3/" + str(outputShape[0]) + ")%"
        # + str(output_shape[0] * ceil(output_shape[1] / OUTPUT_TILE_SIZE))
        + str(ceil(outputShape[2] / OUTPUT_TILE_SIZE)) + "*" +
        str(OUTPUT_TILE_SIZE) + "+i0",
        # + str(
        #    ceil(output_shape[1] / OUTPUT_TILE_SIZE)
        #    * ceil(output_shape[2] / OUTPUT_TILE_SIZE)
        # ),
        "int_floor(i3,"
        # + str(ceil(output_shape[1] / OUTPUT_TILE_SIZE))
        + str(outputShape[0] * ceil(outputShape[2] / OUTPUT_TILE_SIZE)) +
        ")*" + str(OUTPUT_TILE_SIZE) + "+i1",
        "i2",
    ]
    inputView = state.add_transient(
        "V" + "_".join([str(_s) for _s in inputViewShape]),
        inputViewShape,
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_input_tile",
        dict(zip(inputParams[0], inputViewDims)),
    )
    tasklet = state.add_tasklet(
        string_builder(tf_node.name) + "_input_tile", {"j0"}, {"out"},
        "out = j0")
    dace_session.add_in_memlets([inputNodes[0]], mapEntry, tasklet,
                                [inputDims[0]], [inputViewParams])
    dace_session.add_out_memlets([inputView], mapExit, tasklet,
                                 [inputViewDims], [inputParams[0]])
    ##################Transforming all input tiles#########################
    #[TODO] try to re-use memory
    vNode = state.add_transient(
        "V_output" + "_".join([str(_s) for _s in inputViewShape]),
        inputViewShape,
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    vNode.setzero = True
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_input_txform",
        dict(zip(inputParams[0][0:2], inputViewDims[2:4])),
        dace.ScheduleType.GPU_Device,
    )
    intermediateResultNode = state.add_transient("BtI", bt.shape, dace.float32,
                                                 dace.StorageType.Register)
    intermediateResultNode.setzero = True
    state.add_edge(
        inputView,
        None,
        mapEntry,
        None,
        Memlet.simple(inputView, ",".join(inputViewDims)),
    )
    mm_small(
        state,
        bTransposeNode,
        inputView,
        intermediateResultNode,
        B_subset=[IMAGE_TILE_SIZE, IMAGE_TILE_SIZE],
        B_memlet=Memlet.simple(
            inputView, ",".join(inputViewDims[0:2] + inputParams[0][0:2])),
        map_entry=mapEntry,
        B_direct=False,
    )
    mm_small(
        state,
        intermediateResultNode,
        bNode,
        vNode,
        map_exit=mapExit,
        C_subset=[IMAGE_TILE_SIZE, IMAGE_TILE_SIZE],
        C_memlet=Memlet.simple(
            vNode,
            ",".join(inputViewDims[0:2] + inputParams[0][0:2]),
            wcr_str="lambda a,b: a+b",
            wcr_conflict=False,
        ),
        map_entry=mapEntry,
        A_direct=True,
    )
    state.add_edge(
        mapExit,
        None,
        vNode,
        None,
        Memlet.simple(
            vNode,
            ",".join(inputViewDims),
            wcr_str="lambda a,b: a+b",
            wcr_conflict=False,
        ),
    )
    #############Transforming the kernel###############################
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_kernel_txform",
        dict(zip(inputParams[1][0:2], inputDims[1][2:4])),
        dace.ScheduleType.GPU_Device,
    )
    intermediateResultNode = state.add_transient("GF", g.shape, dace.float32,
                                                 dace.StorageType.Register)
    intermediateResultNode.setzero = True
    processedKernelNode = state.add_transient(
        "U" + "_".join([
            str(_s) for _s in inputViewShape[0:2] +
            list(tf_node.inputs[1].shape[-1:-3:-1])
        ]),
        inputViewShape[0:2] + list(tf_node.inputs[1].shape[-1:-3:-1]),
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    processedKernelNode.setzero = True
    state.add_edge(
        inputNodes[1],
        None,
        mapEntry,
        None,
        dace.Memlet.from_array(inputNodes[1].data,
                               inputNodes[1].desc(dace_session.graph)),
    )

    mm_small(
        state,
        gNode,
        inputNodes[1],
        intermediateResultNode,
        map_entry=mapEntry,
        B_subset=tf_node.inputs[1].shape[0:2],
        B_memlet=Memlet.simple(
            inputNodes[1], ",".join(inputDims[1][0:2] + inputParams[1][0:2])),
        B_direct=False,
    )
    mm_small(
        state,
        intermediateResultNode,
        gTransposeNode,
        processedKernelNode,
        C_subset=[IMAGE_TILE_SIZE, IMAGE_TILE_SIZE],
        C_memlet=Memlet.simple(
            processedKernelNode,
            ",".join(inputViewDims[0:2] + [inputParams[0][1]] +
                     [inputParams[0][0]]),
            wcr_str="lambda a,b: a+b",
            wcr_conflict=False,
        ),
        map_entry=mapEntry,
        map_exit=mapExit,
        A_direct=True,
    )
    state.add_edge(
        mapExit,
        None,
        processedKernelNode,
        None,
        Memlet.simple(
            processedKernelNode.data,
            ",".join([
                "0:" + str(_s)
                for _s in processedKernelNode.desc(dace_session.graph).shape
            ]),
            wcr_str="lambda a,b: a+b",
            wcr_conflict=False,
        ),
    )
    mNode = state.add_transient(
        "m" + "_".join([
            str(_s) for _s in inputViewShape[0:2] +
            [tf_node.inputs[1].shape[-1], inputViewShape[-1]]
        ]),
        inputViewShape[0:2] +
        [tf_node.inputs[1].shape[-1], inputViewShape[-1]],
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    mNodeDims = ["0:" + str(_d) for _d in mNode.desc(dace_session.graph).shape]
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_eltwise_product",
        dict(zip(inputParams[0][0:2], inputViewDims[0:2])),
        dace.ScheduleType.Sequential,
    )
    state.add_edge(
        vNode,
        None,
        mapEntry,
        None,
        Memlet.from_array(vNode.data, vNode.desc(dace_session.graph)),
    )
    state.add_edge(
        processedKernelNode,
        None,
        mapEntry,
        None,
        Memlet.from_array(processedKernelNode.data,
                          processedKernelNode.desc(dace_session.graph)),
    )
    mm(
        state,
        vNode,
        processedKernelNode,
        mNode,
        A_subset=inputViewShape[2:4],
        A_memlet=Memlet.simple(
            vNode, ",".join(inputParams[0][0:2] + inputViewDims[-2:])),
        B_subset=tf_node.inputs[1].shape[-1:-3:-1],
        B_memlet=Memlet.simple(
            processedKernelNode,
            ",".join(
                inputParams[0][0:2] +
                ["0:" + str(_s) for _s in tf_node.inputs[1].shape[-1:-3:-1]]),
        ),
        C_subset=[tf_node.inputs[1].shape[-1], inputViewShape[-1]],
        C_memlet=Memlet.simple(mNode,
                               ",".join(inputParams[0][0:2] + mNodeDims[-2:])),
        map_entry=mapEntry,
        map_exit=mapExit,
        shadow_a=True,
        shadow_b=True,
    )
    state.add_edge(mapExit, None, mNode, None,
                   Memlet.simple(mNode, ",".join(mNodeDims)))
    #################OUTPUT TRANSFORMATION################################
    mapRange = [inputDims[1][-1]] + [inputViewDims[-1]]
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_output_txform",
        dict(zip(inputParams[0][0:2], mapRange)),
        dace.ScheduleType.GPU_Device,
    )
    intermediateResultNode = state.add_transient("AtM", at.shape, dace.float32,
                                                 dace.StorageType.Register)
    intermediateResultNode.setzero = True
    transformedOutputNode = state.add_transient(
        "inv_txformed_output" + "_".join([str(tf_node.inputs[1].shape[-1])] +
                                         [str(inputViewShape[-1])]),
        [OUTPUT_TILE_SIZE, OUTPUT_TILE_SIZE] + [tf_node.inputs[1].shape[-1]] +
        [inputViewShape[-1]],
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    transformedOutputNode.setzero = True
    state.add_edge(mNode, None, mapEntry, None,
                   Memlet.simple(mNode, ",".join(mNodeDims)))
    mm_small(
        state,
        aTransposeNode,
        mNode,
        intermediateResultNode,
        B_subset=inputViewShape[0:2],
        B_memlet=Memlet.simple(
            mNode, ",".join(inputViewDims[0:2] + inputParams[0][0:2])),
        map_entry=mapEntry,
        B_direct=False,
    )
    mm_small(
        state,
        intermediateResultNode,
        aNode,
        transformedOutputNode,
        C_subset=[OUTPUT_TILE_SIZE, OUTPUT_TILE_SIZE],
        C_memlet=Memlet.simple(
            transformedOutputNode,
            ",".join(
                ["0:" + str(OUTPUT_TILE_SIZE), "0:" + str(OUTPUT_TILE_SIZE)] +
                inputParams[0][0:2]),
            wcr_str="lambda a,b:a+b",
            wcr_conflict=False,
        ),
        map_entry=mapEntry,
        map_exit=mapExit,
        A_direct=True,
    )
    state.add_edge(
        mapExit,
        None,
        transformedOutputNode,
        None,
        Memlet.simple(
            transformedOutputNode.data,
            ",".join([
                "0:" + str(_s)
                for _s in transformedOutputNode.desc(dace_session.graph).shape
            ]),
            wcr_str="lambda a,b: a+b",
            wcr_conflict=False,
        ),
    )
    ###################Un-Tile the output to NHWC format###################
    outputParams = [
        "i3%" + str(outputShape[0]),
        "(i3/" + str(outputShape[0]) + ")%" +
        str(ceil(outputShape[2] / OUTPUT_TILE_SIZE)) + "*" +
        str(OUTPUT_TILE_SIZE) + "+i0",
        "int_floor(i3," +
        str(outputShape[0] * ceil(outputShape[2] / OUTPUT_TILE_SIZE)) + ")*" +
        str(OUTPUT_TILE_SIZE) + "+i1",
        "i2",
    ]
    mapRange = [
        "0:" + str(_s)
        for _s in transformedOutputNode.desc(dace_session.graph).shape
    ]
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_output_untile",
        dict(zip(inputParams[0], mapRange)),
    )
    tasklet = state.add_tasklet(
        string_builder(tf_node.name) + "_output_untile", {"j0"}, {"out"},
        "out = j0")
    dace_session.add_in_memlets([transformedOutputNode], mapEntry, tasklet,
                                [mapRange], [inputParams[0]])
    dace_session.add_out_memlets(outputList, mapExit, tasklet, [outputDims],
                                 [outputParams])
    ################# Debugging with callbacks #############
    taskletInputs = ["i" + str(index) for index in range(len(debugNodes))]
    callback_tasklet = state.add_tasklet(
        string_builder(tf_node.name) + "_printer",
        {*taskletInputs},
        {},
        string_builder(tf_node.name) + "_printer" + "(" +
        ",".join(taskletInputs) + ");",
        language=dace.dtypes.Language.CPP,
    )
    for _n, _conn in zip(debugNodes, taskletInputs):
        _n_cpu = state.add_transient(_n.data + "_cpucopy",
                                     _n.desc(dace_session.graph).shape,
                                     _n.desc(dace_session.graph).dtype,
                                     storage=dace.StorageType.CPU_Heap,
                                     lifetime=dtypes.AllocationLifetime.SDFG)
        state.add_edge(_n, None, _n_cpu, None,
                       Memlet.from_array(_n, _n.desc(dace_session.graph)))
        state.add_edge(
            _n_cpu,
            None,
            callback_tasklet,
            _conn,
            Memlet.from_array(_n_cpu, _n_cpu.desc(dace_session.graph)),
        )
    callback_input_types = []
    for somenode in debugNodes:
        callback_input_types.append(somenode.desc(dace_session.graph))
    dace_session.callbackFunctionDict[string_builder(tf_node.name) +
                                      "_printer"] = printer
    dace_session.callbackTypeDict[string_builder(tf_node.name) +
                                  "_printer"] = dace.data.Scalar(
                                      dace.callback(None,
                                                    *callback_input_types))
Ejemplo n.º 5
0
    return 42


def consumer(inp):
    assert inp == 6


def arraysquarer(outp_array, inp_array):
    import numpy as np
    np.copyto(outp_array, np.square(inp_array))


M = dace.symbolic.symbol('M')
N = dace.symbolic.symbol('N')
O = dace.symbolic.symbol('O')
giveandtake = dace.symbol('giveandtake', dace.callback(dace.uint32,
                                                       dace.uint32))
take = dace.symbol('take', dace.callback(None, dace.uint32))
give = dace.symbol('give', dace.callback(dace.uint32))
donothing = dace.symbol('donothing', dace.callback(None))


@dace.program
def callback_test(A: dace.uint32[2], B: dace.uint32[2]):
    @dace.map(_[0:2])
    def index(i):
        a << A[i]
        b >> B[i]
        b = giveandtake(a)
        take(a + 1)
        if give() != 42:
            donothing()
Ejemplo n.º 6
0

def arraysquarer(outp_array, inp_array):
    import numpy as np
    np.copyto(outp_array, np.square(inp_array))


M = dace.symbolic.symbol()
N = dace.symbolic.symbol()
O = dace.symbolic.symbol()


@dace.program(
    dace.uint32[2],
    dace.uint32[2],
    dace.callback(dace.uint32, dace.uint32),
    dace.callback(None, dace.uint32),
    dace.callback(dace.uint32),
    dace.callback(None),
)
def callback_test(A, B, giveandtake, take, give, donothing):
    @dace.map(_[0:2])
    def index(i):
        a << A[i]
        b >> B[i]
        b = giveandtake(a)
        take(a + 1)
        if give() != 42:
            donothing()