def test_dde_scope_reconnect():
    ''' Corner case: map { tasklet(callback()) -> tasklet(do nothing) };
        the map is expected to stay connected. '''
    sdfg = dace.SDFG('dde_scope_tester')
    sdfg.add_symbol('cb', dace.callback(dace.float64))
    sdfg.add_scalar('s', dace.float64, transient=True)
    state = sdfg.add_state()
    me, mx = state.add_map('doit', dict(i='0:2'))
    # Tasklet has a callback and cannot be removed
    t1 = state.add_tasklet('callback', {}, {'o'}, 'o = cb()', side_effects=True)
    # Tasklet has no output and thus can be removed
    t2 = state.add_tasklet('nothing', {'inp'}, {}, '')
    state.add_nedge(me, t1, dace.Memlet())
    state.add_edge(t1, 'o', t2, 'inp', dace.Memlet('s'))
    state.add_nedge(t2, mx, dace.Memlet())

    Pipeline([DeadDataflowElimination()]).apply_pass(sdfg, {})
    assert set(state.nodes()) == {me, t1, mx}
    sdfg.validate()
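
# Usage sketch (an assumption, not exercised by the test above): when the SDFG
# is invoked directly, a Python callable matching the declared signature
# (here () -> float64) can be bound to the 'cb' callback symbol as a keyword
# argument:
#
#     sdfg(cb=lambda: 1.0)
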
def test_invalid_callback():
    cb = dace.symbol('cb', dace.callback(dace.uint32[5]))

    @dace.program
    def shouldfail(out):
        with dace.tasklet:
            arr = cb()
            o = arr[1]
            o >> out

    with pytest.raises(DaceSyntaxError):
        oo = numpy.random.rand(10)
        shouldfail(oo)
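
# For contrast, a minimal sketch of the supported case (the 'cb_ok' symbol and
# 'shouldpass' program are hypothetical): a callback declared with a *scalar*
# return type parses fine in the same tasklet pattern:
#
#     cb_ok = dace.symbol('cb_ok', dace.callback(dace.uint32))
#
#     @dace.program
#     def shouldpass(out: dace.uint32[10]):
#         with dace.tasklet:
#             o = cb_ok()
#             o >> out[1]
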
def test_dce_callback_manual():
    sdfg = dace.SDFG('dce_cbman')
    sdfg.add_array('a', [20], dace.float64)
    sdfg.add_symbol('cb', dace.callback(None, dace.float64))
    state = sdfg.add_state()
    r = state.add_read('a')
    t = state.add_tasklet('callback', {'inp'}, {}, 'cb(inp)')
    state.add_edge(r, None, t, 'inp', dace.Memlet('a[0:20]'))
    sdfg.validate()

    Pipeline([DeadDataflowElimination()]).apply_pass(sdfg, {})
    assert set(state.nodes()) == {r, t}
    sdfg.validate()
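
# For contrast, a minimal sketch (hypothetical, not part of the test): a
# tasklet explicitly marked side-effect-free with no outputs would be a
# candidate for removal by the same pass, since nothing observes its result:
#
#     t2 = state.add_tasklet('pure', {'inp'}, {}, '', side_effects=False)
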
def winograd_convolution(dace_session, tf_node):
    debugNodes = []
    state = dace_session.state
    add_cublas_cusolver(dace_session.graph)

    ############# Add constants for transformation matrices ###############
    dace_session.graph.add_constant('Btrans', bt)
    dace_session.graph.add_constant('B', b)
    bNode = 'B'
    bTransposeNode = 'Btrans'
    dace_session.graph.add_constant('G', g)
    dace_session.graph.add_constant('Gtrans', gt)
    gNode = 'G'
    gTransposeNode = 'Gtrans'
    dace_session.graph.add_constant('Atrans', at)
    dace_session.graph.add_constant('A', a)
    aNode = 'A'
    aTransposeNode = 'Atrans'

    inputNodes = []
    inputParams = []
    inputDims = []
    for _inp in tf_node.inputs:
        _node, _params, _dims = dace_session.create_and_add_input_node(_inp)
        inputNodes.append(_node)
        inputParams.append(_params)
        inputDims.append(_dims)

    # Manually add copy for kernel from CPU to GPU
    kernel_desc = inputNodes[1].desc(dace_session.graph)
    kernelGPU = state.add_transient(
        inputNodes[1].data + "GPU",
        shape=kernel_desc.shape,
        dtype=kernel_desc.dtype,
        lifetime=dtypes.AllocationLifetime.SDFG,
        storage=dace.StorageType.GPU_Global,
    )
    state.add_edge(
        inputNodes[1],
        None,
        kernelGPU,
        None,
        Memlet.from_array(inputNodes[1].data,
                          inputNodes[1].desc(dace_session.graph)),
    )
    inputNodes[1] = kernelGPU

    outputList = dace_session.create_and_add_output_node(tf_node)
    outputDims = dace_session.get_default_dims(tf_node.outputs[0])
    if str(tf_node.get_attr("padding"))[2:-1] == "SAME":
        paddedInput, paddedDims = dace_session.inputPadding(
            tf_node,
            inputNodes[0],
            inputNodes[0].desc(dace_session.graph),
            outputList[0].desc(dace_session.graph).shape[1],
            inputNodes[1].desc(dace_session.graph).shape[0],
            tf_node.get_attr("strides")[1],
            inputDims[0],
        )
        inputDims[0] = paddedDims
        inputNodes[0] = paddedInput

    outputShape = [int(_s) for _s in tf_node.outputs[0].shape]
    inputViewShape = [
        IMAGE_TILE_SIZE,
        IMAGE_TILE_SIZE,
        tf_node.inputs[0].shape[-1],
        outputShape[0] * ceil(outputShape[1] / OUTPUT_TILE_SIZE) *
        ceil(outputShape[2] / OUTPUT_TILE_SIZE),
    ]
    inputViewDims = ["0:" + str(_x) for _x in inputViewShape]

    ######## Tiling the image #################################
    inputViewParams = [
        "i3%" + str(outputShape[0]),
        "(i3/" + str(outputShape[0]) + ")%"
        # + str(output_shape[0] * ceil(output_shape[1] / OUTPUT_TILE_SIZE))
        + str(ceil(outputShape[2] / OUTPUT_TILE_SIZE)) + "*" +
        str(OUTPUT_TILE_SIZE) + "+i0",
        # + str(
        #     ceil(output_shape[1] / OUTPUT_TILE_SIZE)
        #     * ceil(output_shape[2] / OUTPUT_TILE_SIZE)
        # ),
        "int_floor(i3,"
        # + str(ceil(output_shape[1] / OUTPUT_TILE_SIZE))
        + str(outputShape[0] * ceil(outputShape[2] / OUTPUT_TILE_SIZE)) +
        ")*" + str(OUTPUT_TILE_SIZE) + "+i1",
        "i2",
    ]
    inputView = state.add_transient(
        "V" + "_".join([str(_s) for _s in inputViewShape]),
        inputViewShape,
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_input_tile",
        dict(zip(inputParams[0], inputViewDims)),
    )
    tasklet = state.add_tasklet(
        string_builder(tf_node.name) + "_input_tile", {"j0"}, {"out"},
        "out = j0")
    dace_session.add_in_memlets([inputNodes[0]], mapEntry, tasklet,
                                [inputDims[0]], [inputViewParams])
    dace_session.add_out_memlets([inputView], mapExit, tasklet,
                                 [inputViewDims], [inputParams[0]])

    ################## Transforming all input tiles #########################
    # [TODO] try to re-use memory
    vNode = state.add_transient(
        "V_output" + "_".join([str(_s) for _s in inputViewShape]),
        inputViewShape,
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    vNode.setzero = True
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_input_txform",
        dict(zip(inputParams[0][0:2], inputViewDims[2:4])),
        dace.ScheduleType.GPU_Device,
    )
    intermediateResultNode = state.add_transient("BtI", bt.shape, dace.float32,
                                                 dace.StorageType.Register)
    intermediateResultNode.setzero = True
    state.add_edge(
        inputView,
        None,
        mapEntry,
        None,
        Memlet.simple(inputView, ",".join(inputViewDims)),
    )
    # BtI = Btrans @ input tile
    mm_small(
        state,
        bTransposeNode,
        inputView,
        intermediateResultNode,
        B_subset=[IMAGE_TILE_SIZE, IMAGE_TILE_SIZE],
        B_memlet=Memlet.simple(
            inputView, ",".join(inputViewDims[0:2] + inputParams[0][0:2])),
        map_entry=mapEntry,
        B_direct=False,
    )
    # V = BtI @ B
    mm_small(
        state,
        intermediateResultNode,
        bNode,
        vNode,
        map_exit=mapExit,
        C_subset=[IMAGE_TILE_SIZE, IMAGE_TILE_SIZE],
        C_memlet=Memlet.simple(
            vNode,
            ",".join(inputViewDims[0:2] + inputParams[0][0:2]),
            wcr_str="lambda a, b: a + b",
            wcr_conflict=False,
        ),
        map_entry=mapEntry,
        A_direct=True,
    )
    state.add_edge(
        mapExit,
        None,
        vNode,
        None,
        Memlet.simple(
            vNode,
            ",".join(inputViewDims),
            wcr_str="lambda a, b: a + b",
            wcr_conflict=False,
        ),
    )

    ############# Transforming the kernel ###############################
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_kernel_txform",
        dict(zip(inputParams[1][0:2], inputDims[1][2:4])),
        dace.ScheduleType.GPU_Device,
    )
    intermediateResultNode = state.add_transient("GF", g.shape, dace.float32,
                                                 dace.StorageType.Register)
    intermediateResultNode.setzero = True
    processedKernelNode = state.add_transient(
        "U" + "_".join([
            str(_s) for _s in inputViewShape[0:2] +
            list(tf_node.inputs[1].shape[-1:-3:-1])
        ]),
        inputViewShape[0:2] + list(tf_node.inputs[1].shape[-1:-3:-1]),
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    processedKernelNode.setzero = True
    state.add_edge(
        inputNodes[1],
        None,
        mapEntry,
        None,
        dace.Memlet.from_array(inputNodes[1].data,
                               inputNodes[1].desc(dace_session.graph)),
    )
    # GF = G @ filter
    mm_small(
        state,
        gNode,
        inputNodes[1],
        intermediateResultNode,
        map_entry=mapEntry,
        B_subset=tf_node.inputs[1].shape[0:2],
        B_memlet=Memlet.simple(
            inputNodes[1], ",".join(inputDims[1][0:2] + inputParams[1][0:2])),
        B_direct=False,
    )
    # U = GF @ Gtrans
    mm_small(
        state,
        intermediateResultNode,
        gTransposeNode,
        processedKernelNode,
        C_subset=[IMAGE_TILE_SIZE, IMAGE_TILE_SIZE],
        C_memlet=Memlet.simple(
            processedKernelNode,
            ",".join(inputViewDims[0:2] + [inputParams[0][1]] +
                     [inputParams[0][0]]),
            wcr_str="lambda a, b: a + b",
            wcr_conflict=False,
        ),
        map_entry=mapEntry,
        map_exit=mapExit,
        A_direct=True,
    )
    state.add_edge(
        mapExit,
        None,
        processedKernelNode,
        None,
        Memlet.simple(
            processedKernelNode.data,
            ",".join([
                "0:" + str(_s)
                for _s in processedKernelNode.desc(dace_session.graph).shape
            ]),
            wcr_str="lambda a, b: a + b",
            wcr_conflict=False,
        ),
    )

    mNode = state.add_transient(
        "m" + "_".join([
            str(_s) for _s in inputViewShape[0:2] +
            [tf_node.inputs[1].shape[-1], inputViewShape[-1]]
        ]),
        inputViewShape[0:2] + [tf_node.inputs[1].shape[-1], inputViewShape[-1]],
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    mNodeDims = ["0:" + str(_d) for _d in mNode.desc(dace_session.graph).shape]
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_eltwise_product",
        dict(zip(inputParams[0][0:2], inputViewDims[0:2])),
        dace.ScheduleType.Sequential,
    )
    state.add_edge(
        vNode,
        None,
        mapEntry,
        None,
        Memlet.from_array(vNode.data, vNode.desc(dace_session.graph)),
    )
    state.add_edge(
        processedKernelNode,
        None,
        mapEntry,
        None,
        Memlet.from_array(processedKernelNode.data,
                          processedKernelNode.desc(dace_session.graph)),
    )
    # M = V @ U, batched over the tile coordinates (i0, i1)
    mm(
        state,
        vNode,
        processedKernelNode,
        mNode,
        A_subset=inputViewShape[2:4],
        A_memlet=Memlet.simple(
            vNode, ",".join(inputParams[0][0:2] + inputViewDims[-2:])),
        B_subset=tf_node.inputs[1].shape[-1:-3:-1],
        B_memlet=Memlet.simple(
            processedKernelNode,
            ",".join(inputParams[0][0:2] +
                     ["0:" + str(_s)
                      for _s in tf_node.inputs[1].shape[-1:-3:-1]]),
        ),
        C_subset=[tf_node.inputs[1].shape[-1], inputViewShape[-1]],
        C_memlet=Memlet.simple(mNode,
                               ",".join(inputParams[0][0:2] + mNodeDims[-2:])),
        map_entry=mapEntry,
        map_exit=mapExit,
        shadow_a=True,
        shadow_b=True,
    )
    state.add_edge(mapExit, None, mNode, None,
                   Memlet.simple(mNode, ",".join(mNodeDims)))

    ################# OUTPUT TRANSFORMATION ################################
    mapRange = [inputDims[1][-1]] + [inputViewDims[-1]]
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_output_txform",
        dict(zip(inputParams[0][0:2], mapRange)),
        dace.ScheduleType.GPU_Device,
    )
    intermediateResultNode = state.add_transient("AtM", at.shape, dace.float32,
                                                 dace.StorageType.Register)
    intermediateResultNode.setzero = True
    transformedOutputNode = state.add_transient(
        "inv_txformed_output" +
        "_".join([str(tf_node.inputs[1].shape[-1])] +
                 [str(inputViewShape[-1])]),
        [OUTPUT_TILE_SIZE, OUTPUT_TILE_SIZE] + [tf_node.inputs[1].shape[-1]] +
        [inputViewShape[-1]],
        dace.float32,
        dace.StorageType.GPU_Global,
    )
    transformedOutputNode.setzero = True
    state.add_edge(mNode, None, mapEntry, None,
                   Memlet.simple(mNode, ",".join(mNodeDims)))
    # AtM = Atrans @ M
    mm_small(
        state,
        aTransposeNode,
        mNode,
        intermediateResultNode,
        B_subset=inputViewShape[0:2],
        B_memlet=Memlet.simple(
            mNode, ",".join(inputViewDims[0:2] + inputParams[0][0:2])),
        map_entry=mapEntry,
        B_direct=False,
    )
    # output tile = AtM @ A
    mm_small(
        state,
        intermediateResultNode,
        aNode,
        transformedOutputNode,
        C_subset=[OUTPUT_TILE_SIZE, OUTPUT_TILE_SIZE],
        C_memlet=Memlet.simple(
            transformedOutputNode,
            ",".join(["0:" + str(OUTPUT_TILE_SIZE),
                      "0:" + str(OUTPUT_TILE_SIZE)] + inputParams[0][0:2]),
            wcr_str="lambda a, b: a + b",
            wcr_conflict=False,
        ),
        map_entry=mapEntry,
        map_exit=mapExit,
        A_direct=True,
    )
    state.add_edge(
        mapExit,
        None,
        transformedOutputNode,
        None,
        Memlet.simple(
            transformedOutputNode.data,
            ",".join([
                "0:" + str(_s)
                for _s in transformedOutputNode.desc(dace_session.graph).shape
            ]),
            wcr_str="lambda a, b: a + b",
            wcr_conflict=False,
        ),
    )

    ################### Un-Tile the output to NHWC format ###################
    outputParams = [
        "i3%" + str(outputShape[0]),
        "(i3/" + str(outputShape[0]) + ")%" +
        str(ceil(outputShape[2] / OUTPUT_TILE_SIZE)) + "*" +
        str(OUTPUT_TILE_SIZE) + "+i0",
        "int_floor(i3," +
        str(outputShape[0] * ceil(outputShape[2] / OUTPUT_TILE_SIZE)) + ")*" +
        str(OUTPUT_TILE_SIZE) + "+i1",
        "i2",
    ]
    mapRange = [
        "0:" + str(_s)
        for _s in transformedOutputNode.desc(dace_session.graph).shape
    ]
    mapEntry, mapExit = state.add_map(
        string_builder(tf_node.name) + "_output_untile",
        dict(zip(inputParams[0], mapRange)),
    )
    tasklet = state.add_tasklet(
        string_builder(tf_node.name) + "_output_untile", {"j0"}, {"out"},
        "out = j0")
    dace_session.add_in_memlets([transformedOutputNode], mapEntry, tasklet,
                                [mapRange], [inputParams[0]])
    dace_session.add_out_memlets(outputList, mapExit, tasklet, [outputDims],
                                 [outputParams])

    ################# Debugging with callbacks #############
    taskletInputs = ["i" + str(index) for index in range(len(debugNodes))]
    callback_tasklet = state.add_tasklet(
        string_builder(tf_node.name) + "_printer",
        {*taskletInputs},
        {},
        string_builder(tf_node.name) + "_printer" + "(" +
        ",".join(taskletInputs) + ");",
        language=dace.dtypes.Language.CPP,
    )
    for _n, _conn in zip(debugNodes, taskletInputs):
        _n_cpu = state.add_transient(_n.data + "_cpucopy",
                                     _n.desc(dace_session.graph).shape,
                                     _n.desc(dace_session.graph).dtype,
                                     storage=dace.StorageType.CPU_Heap,
                                     lifetime=dtypes.AllocationLifetime.SDFG)
        state.add_edge(_n, None, _n_cpu, None,
                       Memlet.from_array(_n.data,
                                         _n.desc(dace_session.graph)))
        state.add_edge(
            _n_cpu,
            None,
            callback_tasklet,
            _conn,
            Memlet.from_array(_n_cpu.data,
                              _n_cpu.desc(dace_session.graph)),
        )
    callback_input_types = []
    for somenode in debugNodes:
        callback_input_types.append(somenode.desc(dace_session.graph))
    dace_session.callbackFunctionDict[string_builder(tf_node.name) +
                                      "_printer"] = printer
    dace_session.callbackTypeDict[string_builder(tf_node.name) +
                                  "_printer"] = dace.data.Scalar(
                                      dace.callback(None,
                                                    *callback_input_types))
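

# Reference for the transformation pipeline built above: Winograd F(2x2, 3x3)
# computes an output tile as Y = A^T [ (G f G^T) * (B^T d B) ] A, where d is a
# 4x4 input tile, f a 3x3 filter, and '*' the elementwise product. Below is a
# minimal numpy sketch, under the assumption that the module-level constants
# bt/b, g/gt, at/a are the standard F(2x2, 3x3) matrices; the function name is
# illustrative only.
def _winograd_f2x2_3x3_reference(d, f):
    import numpy as np
    Bt = np.array([[1, 0, -1, 0], [0, 1, 1, 0], [0, -1, 1, 0], [0, 1, 0, -1]],
                  dtype=np.float32)
    G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]],
                 dtype=np.float32)
    At = np.array([[1, 1, 1, 0], [0, 1, -1, -1]], dtype=np.float32)
    U = G @ f @ G.T       # kernel transform ("_kernel_txform" map)
    V = Bt @ d @ Bt.T     # input transform ("_input_txform" map)
    M = U * V             # elementwise product ("_eltwise_product" map)
    return At @ M @ At.T  # output transform ("_output_txform" map), 2x2 tile
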
    return 42


def consumer(inp):
    assert inp == 6


def arraysquarer(outp_array, inp_array):
    import numpy as np
    np.copyto(outp_array, np.square(inp_array))


M = dace.symbolic.symbol('M')
N = dace.symbolic.symbol('N')
O = dace.symbolic.symbol('O')

giveandtake = dace.symbol('giveandtake',
                          dace.callback(dace.uint32, dace.uint32))
take = dace.symbol('take', dace.callback(None, dace.uint32))
give = dace.symbol('give', dace.callback(dace.uint32))
donothing = dace.symbol('donothing', dace.callback(None))


@dace.program
def callback_test(A: dace.uint32[2], B: dace.uint32[2]):
    @dace.map(_[0:2])
    def index(i):
        a << A[i]
        b >> B[i]
        b = giveandtake(a)
        take(a + 1)
        if give() != 42:
            donothing()
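
# A minimal invocation sketch (assumed callables matching the declared callback
# signatures; the helper name is illustrative and not part of the original):
def _callback_test_usage_sketch():
    import numpy as np
    A = np.full(2, 5, dtype=np.uint32)
    B = np.zeros(2, dtype=np.uint32)
    callback_test(A, B,
                  giveandtake=lambda x: x * 2,  # uint32 -> uint32
                  take=consumer,     # a = 5, so take(a + 1) satisfies consumer's assert
                  give=lambda: 42,   # give() == 42, so donothing() is never called
                  donothing=lambda: None)
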
def arraysquarer(outp_array, inp_array):
    import numpy as np
    np.copyto(outp_array, np.square(inp_array))


M = dace.symbolic.symbol()
N = dace.symbolic.symbol()
O = dace.symbolic.symbol()


@dace.program(
    dace.uint32[2],
    dace.uint32[2],
    dace.callback(dace.uint32, dace.uint32),
    dace.callback(None, dace.uint32),
    dace.callback(dace.uint32),
    dace.callback(None),
)
def callback_test(A, B, giveandtake, take, give, donothing):
    @dace.map(_[0:2])
    def index(i):
        a << A[i]
        b >> B[i]
        b = giveandtake(a)
        take(a + 1)
        if give() != 42:
            donothing()
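
# With the explicit-signature form above, the callbacks are passed positionally
# when the program is called (a sketch with assumed callables, mirroring the
# keyword-argument variant):
#
#     callback_test(A, B, lambda x: x * 2, consumer, lambda: 42, lambda: None)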