def dace_sum(X_in: dace.float32[N], X_out: dace.float32[1]):
    """Sum-reduce ``X_in`` into the single-element array ``X_out``.

    The work is delegated to ``dace.reduce`` with an addition operator and
    ``identity=0``; nothing is returned, the result is written to ``X_out``.
    """
    dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        X_in,
        X_out,
        identity=0,
    )
def program(A: dace.float32[M, N]):
    """Return the max-reduction of ``A`` along ``axis=1``.

    The reduction is performed by ``dace.reduce`` with ``max`` as the
    combining operator and ``identity=0``.
    """
    # NOTE(review): the max-reduction is given identity=0, so a row whose
    # values are all negative would presumably come out as 0 -- confirm
    # that callers only pass non-negative data.
    return dace.reduce(
        lambda lhs, rhs: max(lhs, rhs),
        A,
        axis=1,
        identity=0,
    )
def subgraph_fusion_complex(A: dace.float64[N], B: dace.float64[M], C: dace.float64[O],
                            out1: dace.float64[N, M], out2: dace.float64[1],
                            out3: dace.float64[N, M, O]):
    """Chain of maps, one reduction and a lone tasklet over three 1-D inputs.

    Args:
        A: Input vector of length ``N``.
        B: Input vector of length ``M``.
        C: Input vector of length ``O``.
        out1: ``[N, M]`` output; receives ``t2 * t2 * A[i] + A[i]``.
        out2: Single-element output; receives ``tmp3[0, 0, 0] * 42``.
        out3: ``[N, M, O]`` output; receives ``tmp3 + tmp1``.

    Intermediate values (see the per-step comments below):
        tmp1[i, j, k] = A[i] + B[j] + C[k] + 42
        t1            = sum of tmp1 along axis 2
        t2[i, j]      = A[i] + B[j] + 42
        tmp3[i, j, k] = t1[i, j] + t2[i, j] + C[k]
    """
    # Intermediate buffers.
    # NOTE(review): tmp2, tmp4, tmp5 and t3 are allocated here but never
    # referenced again in this function -- confirm whether they are needed.
    tmp1 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp2 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp3 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp4 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp5 = np.ndarray([N, M, O], dtype=dace.float64)

    t1 = np.ndarray([N, M], dtype=dace.float64)
    t2 = np.ndarray([N, M], dtype=dace.float64)
    t3 = np.ndarray([N, M], dtype=dace.float64)

    # Step 1: two chained tasklets per (i, j, k), linked through the
    # one-element buffer `tp` declared inside the map body.
    for i, j, k in dace.map[0:N, 0:M, 0:O]:
        tp = np.ndarray([1], dtype=dace.float64)
        with dace.tasklet:
            in1 << A[i]
            in2 << B[j]
            in3 << C[k]
            out >> tp

            out = in1 + in2 + in3

        with dace.tasklet:
            in1 << tp
            out >> tmp1[i, j, k]

            out = in1 + 42

    # Step 2: sum tmp1 along its last axis into t1.
    dace.reduce(lambda a, b: a + b, tmp1, t1, axis=2, identity=0)

    # Step 3: t2[i, j] = A[i] + B[j] + 42.
    for i, j in dace.map[0:N, 0:M]:
        with dace.tasklet:
            in1 << A[i]
            in2 << B[j]
            out >> t2[i, j]

            out = in1 + in2 + 42

    # Step 4: first output, computed from t2 and A only.
    for i, j in dace.map[0:N, 0:M]:
        with dace.tasklet:
            in1 << t2[i, j]
            in2 << A[i]
            out >> out1[i, j]

            out = in1 * in1 * in2 + in2

    # Step 5: combine the reduced values (t1), t2 and C into tmp3.
    for i, j, k in dace.map[0:N, 0:M, 0:O]:
        with dace.tasklet:
            in1 << t1[i, j]
            in2 << t2[i, j]
            in3 << C[k]
            out >> tmp3[i, j, k]

            out = in1 + in2 + in3

    # Step 6: third output is the element-wise sum of tmp3 and tmp1.
    for i, j, k in dace.map[0:N, 0:M, 0:O]:
        with dace.tasklet:
            in1 << tmp3[i, j, k]
            in2 << tmp1[i, j, k]
            out >> out3[i, j, k]

            out = in1 + in2

    # Step 7: stand-alone tasklet (not inside any map) that reads a single
    # element of tmp3 and writes the scalar output.
    @dace.tasklet
    def fun():
        in1 << tmp3[0, 0, 0]
        out >> out2

        out = in1 * 42
def dace_softmax(X_in: dace.float32[N], X_out: dace.float32[N]):
    """Write the softmax of ``X_in`` into ``X_out``.

    The steps are: max-reduce ``X_in``, exponentiate the shifted input into
    ``X_out``, sum-reduce ``X_out`` (identity 0), then divide ``X_out`` by
    that sum in place.
    """
    # Shift by the maximum before exponentiating.
    tmp_max = dace.reduce(
        lambda lhs, rhs: max(lhs, rhs),
        X_in,
    )
    X_out[:] = exp(X_in - tmp_max)

    # Normalise by the sum of the exponentials.
    tmp_sum = dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        X_out,
        identity=0,
    )
    X_out[:] /= tmp_sum
# NOTE: this function's name shadows the builtin ``sum`` for the rest of the
# module; it is kept because the name is the block's public interface.
def sum(A: dace.float32[N], out: dace.float32[1]):
    """Sum-reduce ``A`` into the single-element array ``out``.

    Uses ``dace.reduce`` with an addition operator and ``identity=0``.
    """
    dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        A,
        out,
        identity=0,
    )
def reduction_test_1(A: dace.float64[M, N], B: dace.float64[M, N], C: dace.float64[N]):
    """Store in ``C`` the sum along axis 0 of ``2 * A + B``.

    The element-wise expression is first materialised in an ``[M, N]``
    float64 buffer, which is then reduced with ``dace.reduce`` (no explicit
    identity is passed).
    """
    # Element-wise stage.
    tmp = np.ndarray(shape=[M, N], dtype=np.float64)
    tmp[:] = 2 * A[:] + B[:]

    # Reduction stage: collapse the first axis.
    C[:] = dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        tmp,
        axis=0,
    )
def own_fft(x, y):
    """Radix-``R`` FFT of ``x`` written into ``y`` (per the name and comments).

    The routine is expressed with explicit DaCe maps/tasklets. ``R``, ``N``
    and ``K`` are not defined in this function and must come from the
    enclosing scope.

    Args:
        x: Input, read element-wise as ``x[i]`` for ``i`` in ``0:N``.
        y: Output, written element-wise as ``y[i]`` for ``i`` in ``0:N``.
            It is also used as the working buffer between loop iterations.

    Outline:
        1. Build the ``R x R`` DFT matrix ``dft_mat``.
        2. Copy ``x`` into ``y``.
        3. Precompute per-stage powers of ``R`` (``r_i``, ``r_i_1``,
           ``r_k_1``, ``r_k_i_1``).
        4. For each of the ``K`` stages: permute ``y``, multiply by twiddle
           factors, multiply by ``dft_mat`` and sum-reduce, then copy the
           result back into ``y``.

    NOTE(review): presumably ``N == R ** K`` -- confirm against the caller.
    """
    dtype = dace.complex64
    # Generate radix dft matrix
    dft_mat = dace.define_local([R, R], dtype=dtype)

    # dft_mat[i, j] = exp(-2*pi*1j * i * j / R), with pi given as a literal.
    @dace.map(_[0:R, 0:R])
    def dft_mat_gen(i, j):
        omega >> dft_mat[i, j]

        omega = exp(-dace.complex64(0, 2 * 3.14159265359 * i * j / R))

    # NOTE(review): `tmp` is not referenced anywhere else in this function --
    # confirm whether it can be dropped.
    tmp = dace.define_local([N], dtype=dtype)

    # Seed the working buffer: y[i] = x[i].
    @dace.map(_[0:N])
    def move_x_to_y(i):
        x_in << x[i]
        y_out >> y[i]

        y_out = x_in

    # Calculate indices
    r_i = dace.define_local([K], dtype=dace.int64)
    r_i_1 = dace.define_local([K], dtype=dace.int64)
    r_k_1 = dace.define_local([K], dtype=dace.int64)
    r_k_i_1 = dace.define_local([K], dtype=dace.int64)

    # Per-stage powers of the radix:
    #   r_i[i]     = R ** i
    #   r_i_1[i]   = R ** (i + 1)
    #   r_k_i_1[i] = R ** (K - i - 1)
    #   r_k_1[i]   = R ** (K - 1)   (the same value for every i)
    @dace.map(_[0:K])
    def calc_index(i):
        # Permutations
        r_i_o >> r_i[i]
        r_i_1_o >> r_i_1[i]
        r_k_1_o >> r_k_1[i]
        r_k_i_1_o >> r_k_i_1[i]

        r_i_o = R ** i
        r_i_1_o = R ** (i + 1)
        r_k_i_1_o = R ** (K - i - 1)
        r_k_1_o = R ** (K - 1)

    # Main Stockham loop
    for i in range(K):
        # STRIDE PERMUTATION
        # Element (jj, ii, kk) of y is moved to position (ii, jj, kk) of
        # tmp_perm, i.e. the two leading index components are swapped.
        tmp_perm = dace.define_local([N], dtype=dtype)

        @dace.map(_[0:R, 0:r_i[i], 0:r_k_i_1[i]])
        def permute(ii, jj, kk):
            r_k_i_1_in << r_k_i_1[i]
            r_i_in << r_i[i]
            y_in << y[r_k_i_1_in * (jj * R + ii) + kk]
            tmp_out >> tmp_perm[r_k_i_1_in * (ii * r_i_in + jj) + kk]

            tmp_out = y_in

        # ---------------------------------------------------------------------
        # TWIDDLE FACTOR MULTIPLICATION
        # D holds exp(-2*pi*1j * ii * jj / R**(i+1)) at the same flattened
        # position that `permute` wrote to.
        D = dace.define_local([N], dtype=dace.complex64)

        @dace.map(_[0:R, 0:r_i[i], 0:r_k_i_1[i]])
        def generate_twiddles(ii, jj, kk):
            r_i_1_in << r_i_1[i]
            r_i_in << r_i[i]
            r_k_i_1_in << r_k_i_1[i]
            twiddle_o >> D[r_k_i_1_in * (ii * r_i_in + jj) + kk]

            twiddle_o = exp(dace.complex64(0, -2 * 3.14159265359 * ii * jj / r_i_1_in))

        # Element-wise product of the permuted data and the twiddle factors.
        tmp_twid = dace.define_local([N], dtype=dtype)

        @dace.map(_[0:N])
        def twiddle_multiplication(ii):
            tmp_in << tmp_perm[ii]
            D_in << D[ii]
            tmp_out >> tmp_twid[ii]

            tmp_out = tmp_in * D_in

        # ---------------------------------------------------------------------
        # Vector DFT multiplication
        # The individual products dft_mat[jj, kk] * tmp_twid[...] are
        # scattered into an N x N buffer and summed per row afterwards.
        # NOTE(review): this map writes only R entries of each tmp_y row, but
        # the reduction below sums entire rows; the remaining entries
        # presumably need to be zero -- confirm how define_local initialises
        # memory.
        tmp_y = dace.define_local([N, N], dtype=dace.complex64)

        @dace.map(_[0:r_k_1[i], 0:R, 0:R])
        def tensormult(ii, jj, kk):
            r_k_1_in << r_k_1[i]
            dft_in << dft_mat[jj, kk]
            tmp_in << tmp_twid[ii + r_k_1_in * kk]
            tmp_y_out >> tmp_y[ii + r_k_1_in * jj, ii + r_k_1_in * kk]

            tmp_y_out = dft_in * tmp_in

        # Row-wise sum of tmp_y gives this stage's result.
        tmp_red = dace.define_local([N], dtype=dtype)
        dace.reduce(lambda a, b: a + b, tmp_y, tmp_red, axis=1, identity=0)

        # Copy the stage result back into y for the next iteration.
        # The map parameter `i` reuses the name of the loop variable above.
        @dace.map(_[0:N])
        def move_to_y(i):
            tmp_in << tmp_red[i]
            y_out >> y[i]

            y_out = tmp_in
def dace_max(X_in: dace.float32[N], X_out: dace.float32[1]):
    """Max-reduce ``X_in`` into the single-element array ``X_out``.

    Uses ``dace.reduce`` with ``max`` as the combining operator; no identity
    value is passed.
    """
    dace.reduce(
        lambda lhs, rhs: max(lhs, rhs),
        X_in,
        X_out,
    )
def customreduction(A: dace.float32[20], out: dace.float32[1]):
    """Reduce the 20-element array ``A`` into ``out`` with a hand-written operator.

    The operator keeps the first operand when it is strictly smaller than
    the second and the second operand otherwise; the reduction is given
    ``identity=9999999``.
    """
    dace.reduce(
        lambda lhs, rhs: lhs if lhs < rhs else rhs,
        A,
        out,
        identity=9999999,
    )
def sGPU_reduction_library(A: dtype[N], sumA: dtype[1]):
    """Sum-reduce ``A`` into the single-element array ``sumA``.

    Uses ``dace.reduce`` with an addition operator and ``identity=0``. The
    element type ``dtype`` is defined outside this function.
    """
    dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        A,
        sumA,
        identity=0,
    )