Beispiel #1
0
def dace_sum(X_in: dace.float32[N], X_out: dace.float32[1]):
    # Sum-reduction over X_in: the binary lambda adds two values, the
    # one-element X_out is handed to dace.reduce right after the input, and
    # 0 is given as the identity of the addition.
    dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        X_in,
        X_out,
        identity=0,
    )
def program(A: dace.float32[M, N]):
    # Max-reduction of the 2-D array A along axis 1; the reduced result is
    # returned to the caller.
    # NOTE(review): identity=0 is paired with a max here. If the identity
    # seeds the result, a row holding only negative values would come out as
    # 0 -- confirm that this is intended.
    return dace.reduce(
        lambda lhs, rhs: max(lhs, rhs),
        A,
        axis=1,
        identity=0,
    )
Beispiel #3
0
def subgraph_fusion_complex(A: dace.float64[N], B: dace.float64[M],
                            C: dace.float64[O], out1: dace.float64[N, M],
                            out2: dace.float64[1], out3: dace.float64[N, M,
                                                                      O]):
    # Fills out1 (N x M), out2 (one element) and out3 (N x M x O) from the 1-D
    # inputs A, B and C using five maps, one reduction and a final tasklet.
    # Inside a tasklet, `name << data` declares an input and `name >> data`
    # declares an output (DaCe explicit-dataflow syntax).

    # 3-D scratch arrays. Only tmp1 and tmp3 are used below; tmp2, tmp4 and
    # tmp5 are declared but never referenced again in this function.
    tmp1 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp2 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp3 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp4 = np.ndarray([N, M, O], dtype=dace.float64)
    tmp5 = np.ndarray([N, M, O], dtype=dace.float64)

    # 2-D scratch arrays. t1 and t2 are used below; t3 is never referenced.
    t1 = np.ndarray([N, M], dtype=dace.float64)
    t2 = np.ndarray([N, M], dtype=dace.float64)
    t3 = np.ndarray([N, M], dtype=dace.float64)

    # Map 1: tmp1[i, j, k] = A[i] + B[j] + C[k] + 42, computed by two chained
    # tasklets that communicate through the one-element scratch value tp.
    for i, j, k in dace.map[0:N, 0:M, 0:O]:
        tp = np.ndarray([1], dtype=dace.float64)
        with dace.tasklet:
            in1 << A[i]
            in2 << B[j]
            in3 << C[k]
            out >> tp

            out = in1 + in2 + in3

        with dace.tasklet:
            in1 << tp
            out >> tmp1[i, j, k]

            out = in1 + 42

    # Sum tmp1 along its last axis (axis=2) into the 2-D array t1.
    dace.reduce(lambda a, b: a + b, tmp1, t1, axis=2, identity=0)

    # Map 2: t2[i, j] = A[i] + B[j] + 42.
    for i, j in dace.map[0:N, 0:M]:
        with dace.tasklet:
            in1 << A[i]
            in2 << B[j]
            out >> t2[i, j]
            out = in1 + in2 + 42

    # Map 3: out1[i, j] = t2[i, j]**2 * A[i] + A[i].
    for i, j in dace.map[0:N, 0:M]:
        with dace.tasklet:
            in1 << t2[i, j]
            in2 << A[i]
            out >> out1[i, j]

            out = in1 * in1 * in2 + in2

    # Map 4: tmp3[i, j, k] = t1[i, j] + t2[i, j] + C[k].
    for i, j, k in dace.map[0:N, 0:M, 0:O]:
        with dace.tasklet:
            in1 << t1[i, j]
            in2 << t2[i, j]
            in3 << C[k]
            out >> tmp3[i, j, k]

            out = in1 + in2 + in3

    # Map 5: out3[i, j, k] = tmp3[i, j, k] + tmp1[i, j, k].
    for i, j, k in dace.map[0:N, 0:M, 0:O]:
        with dace.tasklet:
            in1 << tmp3[i, j, k]
            in2 << tmp1[i, j, k]
            out >> out3[i, j, k]

            out = in1 + in2

    # Stand-alone tasklet: out2 = tmp3[0, 0, 0] * 42.
    @dace.tasklet
    def fun():
        in1 << tmp3[0, 0, 0]
        out >> out2

        out = in1 * 42
Beispiel #4
0
def dace_softmax(X_in: dace.float32[N], X_out: dace.float32[N]):
    # Softmax of X_in written into X_out: subtract the maximum, exponentiate,
    # then divide by the sum of the exponentials.

    # Maximum of X_in.
    # NOTE(review): unlike the sum below, this reduction passes no identity;
    # confirm how dace.reduce initializes its result in that case.
    tmp_max = dace.reduce(lambda a, b: max(a, b), X_in)
    # Shifting by the maximum keeps every exponent argument <= 0.
    X_out[:] = exp(X_in - tmp_max)
    # Sum of the exponentials, used as the normalization factor.
    tmp_sum = dace.reduce(lambda a, b: a + b, X_out, identity=0)
    X_out[:] /= tmp_sum
Beispiel #5
0
def sum(A: dace.float32[N], out: dace.float32[1]):
    # Sum-reduction over A with the one-element `out` passed right after the
    # input and 0 as the identity of the addition.
    # NOTE: the function name hides Python's builtin `sum` in this module.
    dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        A,
        out,
        identity=0,
    )
Beispiel #6
0
def reduction_test_1(A: dace.float64[M, N], B: dace.float64[M, N],
                     C: dace.float64[N]):
    # Computes 2 * A + B element-wise into a scratch array and stores its
    # sum-reduction along axis 0 in C.

    # M x N scratch array holding the element-wise expression.
    tmp = np.ndarray(shape=[M, N], dtype=np.float64)
    tmp[:] = 2 * A[:] + B[:]
    # Reducing the first axis of an [M, N] array leaves N values, matching C.
    # NOTE(review): no identity is passed to this reduction -- confirm how the
    # result is initialized.
    C[:] = dace.reduce(lambda a, b: a + b, tmp, axis=0)
Beispiel #7
0
def own_fft(x, y):
    # Radix-R FFT of x written into y, organised as K passes of a Stockham
    # loop (permute, multiply by twiddle factors, multiply by the R x R DFT
    # matrix). R, K and N are symbols defined outside this function.
    # NOTE(review): the index arithmetic below presumably assumes N == R**K --
    # confirm against the caller.
    dtype = dace.complex64
    
    # Generate radix dft matrix: dft_mat[i, j] = exp(-2*pi*1j * i * j / R)
    dft_mat = dace.define_local([R, R], dtype=dtype)
    @dace.map(_[0:R, 0:R])
    def dft_mat_gen(i, j):
        omega >> dft_mat[i, j]
        omega = exp(-dace.complex64(0, 2 * 3.14159265359 * i * j / R))
        
    # NOTE(review): `tmp` is declared here but never referenced again in this
    # function.
    tmp = dace.define_local([N], dtype=dtype)
    # Copy the input into y; every pass of the main loop reads and rewrites y.
    @dace.map(_[0:N])
    def move_x_to_y(i):
        x_in << x[i]
        y_out >> y[i]
        
        y_out = x_in
        
    # Calculate indices: per-pass powers of R, tabulated once for all K passes
    r_i = dace.define_local([K], dtype=dace.int64)
    r_i_1 = dace.define_local([K], dtype=dace.int64)
    r_k_1 = dace.define_local([K], dtype=dace.int64) 
    r_k_i_1 = dace.define_local([K], dtype=dace.int64)

    
    @dace.map(_[0:K])
    def calc_index(i):
        # Permutations
        r_i_o >> r_i[i]
        r_i_1_o >> r_i_1[i]
        r_k_1_o >> r_k_1[i]
        r_k_i_1_o >> r_k_i_1[i]
        
        # r_i[i] = R**i, r_i_1[i] = R**(i+1), r_k_i_1[i] = R**(K-i-1).
        # r_k_1[i] = R**(K-1) does not depend on i, so every slot is equal.
        r_i_o = R ** i
        r_i_1_o = R ** (i + 1)
        r_k_i_1_o = R ** (K - i - 1)
        r_k_1_o = R ** (K - 1)        

    # Main Stockham loop
    for i in range(K):
        # STRIDE PERMUTATION
        # Element (jj * R + ii) of y, in blocks of r_k_i_1[i] entries, moves
        # to position (ii * r_i[i] + jj) of tmp_perm.
        tmp_perm = dace.define_local([N], dtype=dtype)
        @dace.map(_[0:R, 0:r_i[i], 0:r_k_i_1[i]])
        def permute(ii, jj, kk):
            r_k_i_1_in << r_k_i_1[i]
            r_i_in << r_i[i]
            y_in << y[r_k_i_1_in * (jj * R + ii) + kk]
            tmp_out >> tmp_perm[r_k_i_1_in * (ii * r_i_in + jj) + kk]
    
            tmp_out = y_in
            
        # ---------------------------------------------------------------------
        # TWIDDLE FACTOR MULTIPLICATION
        # D holds exp(-2*pi*1j * ii * jj / R**(i+1)), stored with the same
        # index layout as tmp_perm.
        D = dace.define_local([N], dtype=dace.complex64)
        @dace.map(_[0:R, 0:r_i[i], 0:r_k_i_1[i]])
        def generate_twiddles(ii, jj, kk):
            r_i_1_in << r_i_1[i]
            r_i_in << r_i[i]
            r_k_i_1_in << r_k_i_1[i]
            twiddle_o >> D[r_k_i_1_in * (ii * r_i_in + jj) + kk]
            twiddle_o = exp(dace.complex64(0, -2 * 3.14159265359 * ii * jj / r_i_1_in))
            
        # Element-wise product of the permuted data with the twiddle factors.
        tmp_twid = dace.define_local([N], dtype=dtype)
        @dace.map(_[0:N])
        def twiddle_multiplication(ii):
            tmp_in << tmp_perm[ii]
            D_in << D[ii]
            tmp_out >> tmp_twid[ii]
            
            tmp_out = tmp_in * D_in

        # ---------------------------------------------------------------------
        # Vector DFT multiplication
        # The products dft_mat[jj, kk] * tmp_twid[ii + r_k_1 * kk] are stored
        # in the N x N array tmp_y and then summed along axis 1 into tmp_red.
        # NOTE(review): the map writes only the entries
        # (ii + r_k_1 * jj, ii + r_k_1 * kk), while the reduction reads whole
        # rows; nothing in this function initializes the remaining entries of
        # tmp_y -- confirm they are zero.
        tmp_y = dace.define_local([N, N], dtype=dace.complex64)
        @dace.map(_[0:r_k_1[i], 0:R, 0:R])
        def tensormult(ii, jj, kk):
            r_k_1_in << r_k_1[i]
            dft_in << dft_mat[jj, kk]
            tmp_in << tmp_twid[ii + r_k_1_in * kk]
            tmp_y_out >> tmp_y[ii + r_k_1_in * jj, ii + r_k_1_in * kk]

            tmp_y_out = dft_in * tmp_in
            
        tmp_red = dace.define_local([N], dtype=dtype)
        dace.reduce(lambda a, b: a + b, tmp_y, tmp_red, axis=1, identity=0)
        
        # Write this pass's result back to y for the next pass.
        # NOTE(review): the map parameter `i` has the same name as the
        # enclosing loop variable.
        @dace.map(_[0:N])
        def move_to_y(i):
            tmp_in << tmp_red[i]
            y_out >> y[i]
            
            y_out = tmp_in
Beispiel #8
0
def dace_max(X_in: dace.float32[N], X_out: dace.float32[1]):
    # Max-reduction over X_in with the one-element X_out passed right after
    # the input. No identity is supplied to dace.reduce here.
    dace.reduce(
        lambda lhs, rhs: max(lhs, rhs),
        X_in,
        X_out,
    )
Beispiel #9
0
def customreduction(A: dace.float32[20], out: dace.float32[1]):
    # Reduction over the 20-element A with a hand-written binary lambda that
    # keeps its first operand when it is strictly smaller and the second
    # operand otherwise; 9999999 is passed as the identity.
    dace.reduce(
        lambda lhs, rhs: lhs if lhs < rhs else rhs,
        A,
        out,
        identity=9999999,
    )
def sGPU_reduction_library(A: dtype[N], sumA: dtype[1]):
    # Sum-reduction over A with the one-element sumA passed right after the
    # input and 0 as the identity of the addition. `dtype` and `N` come from
    # the enclosing module.
    dace.reduce(
        lambda lhs, rhs: lhs + rhs,
        A,
        sumA,
        identity=0,
    )