Example #1
def compute_softdtw_backward_cuda(D, R, inv_gamma, bandwidth, max_i, max_j, n_passes, E):
    k = cuda.blockIdx.x
    tid = cuda.threadIdx.x

    # The indexing logic is the same as in the forward kernel, but the
    # anti-diagonal sweep progresses backwards
    I = tid

    for p in range(n_passes):
        # Reverse the order to make the loop go backward
        rev_p = n_passes - p - 1

        # convert tid to I, J, then i, j
        J = max(0, min(rev_p - tid, max_j - 1))

        i = I + 1
        j = J + 1

        # Only compute if element[i, j] is on the current anti-diagonal, and also is within bounds
        if I + J == rev_p and (I < max_i and J < max_j):

            if math.isinf(R[k, i, j]):
                R[k, i, j] = -math.inf

            # Don't compute if outside bandwidth
            if not (abs(i - j) > bandwidth > 0):
                a = math.exp((R[k, i + 1, j] - R[k, i, j] - D[k, i + 1, j]) * inv_gamma)
                b = math.exp((R[k, i, j + 1] - R[k, i, j] - D[k, i, j + 1]) * inv_gamma)
                c = math.exp((R[k, i + 1, j + 1] - R[k, i, j] - D[k, i + 1, j + 1]) * inv_gamma)
                E[k, i, j] = E[k, i + 1, j] * a + E[k, i, j + 1] * b + E[k, i + 1, j + 1] * c

        # Wait for other threads in this block
        cuda.syncthreads()


def compute_softdtw_cuda(D, gamma, warp, bandwidth, max_i, max_j, n_passes, R):

    b = cuda.blockIdx.x
    tid = cuda.threadIdx.x

    I = tid

    inv_gamma = 1.0 / gamma

    for p in range(n_passes):

        J = max(0, min(p - tid, max_j - 1))

        i = I + 1
        j = J + 1

        if I + J == p and (I < max_i and J < max_j):
            if not (abs(i - j) > bandwidth > 0):
                r0 = -(R[b, i - 1, j - 1] + D[b, i - 1, j - 1]) * inv_gamma
                r1 = -(R[b, i - 1, j] + D[b, i - 1, j] + warp) * inv_gamma
                r2 = -(R[b, i, j - 1] + D[b, i, j - 1] + warp) * inv_gamma
                rmax = max(max(r0, r1), r2)
                rsum = (math.exp(r0 - rmax) + math.exp(r1 - rmax) +
                        math.exp(r2 - rmax))
                softmin = -gamma * (math.log(rsum) + rmax)
                R[b, i, j] = softmin

        # Wait for other threads in this block
        cuda.syncthreads()
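
The forward kernel above reduces its three candidate costs with a soft-min evaluated as a max-shifted log-sum-exp, which keeps math.exp from overflowing once the terms are scaled by 1 / gamma (the second soft-DTW forward kernel later in this listing uses the same step). The CPU sketch below shows that step in isolation; the helper name and test values are purely illustrative and not part of the original code.

import math


def softmin3(r0, r1, r2, gamma):
    # Soft minimum of three values: -gamma * log(sum_i exp(-r_i / gamma)),
    # evaluated stably by shifting with the maximum of the scaled terms.
    a, b, c = -r0 / gamma, -r1 / gamma, -r2 / gamma
    rmax = max(a, b, c)
    rsum = math.exp(a - rmax) + math.exp(b - rmax) + math.exp(c - rmax)
    return -gamma * (math.log(rsum) + rmax)


print(softmin3(1.0, 2.0, 3.0, 0.01))  # ~1.0: small gamma approaches the hard minimum
print(softmin3(1.0, 2.0, 3.0, 1.0))   # ~0.59: larger gamma smooths below the minimum
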
Example #3
def compute_sig_kernel_batch_varpar_from_increments_cuda(
        M_inc, len_x, len_y, n_anti_diagonals, M_sol, solver=0):
    """
    We start from a list of pairs of paths [(x^1,y^1), ..., (x^n, y^n)]
    M_inc: a 3-tensor with M_inc[i, j, k] = <x^i_j, y^i_k>.
    n_anti_diagonals = 2 * max(len_x, len_y) - 1
    M_sol: a 3-tensor storing the solutions of the PDEs.
    """

    # Each block corresponds to a pair (x_i,y_i).
    block_id = cuda.blockIdx.x
    # Each thread works on a node of a diagonal.
    thread_id = cuda.threadIdx.x

    I = thread_id

    # Go over each anti-diagonal. Only process threads that fall on the current anti-diagonal
    for p in range(n_anti_diagonals):

        # The index is actually 'p - thread_id' but need to force it in-bounds
        J = max(0, min(p - thread_id, len_y - 1))

        # For simplicity, we define i, j which start from 1 (offset from I, J)
        i = I + 1
        j = J + 1

        # Only compute if element[i, j] is on the current anti-diagonal
        if I + J == p and (I < len_x and J < len_y):

            inc = M_inc[block_id, i - 1, j - 1]

            k_01 = M_sol[block_id, i - 1, j]
            k_10 = M_sol[block_id, i, j - 1]
            k_00 = M_sol[block_id, i - 1, j - 1]

            # vanilla scheme
            if solver == 0:
                M_sol[block_id, i, j] = k_01 + k_10 + k_00 * (inc - 1.)

            # explicit scheme
            elif solver == 1:
                M_sol[block_id, i, j] = (
                    (k_01 + k_10) * (1. + 0.5 * inc + (1. / 12) * inc**2) -
                    k_00 * (1. - (1. / 12) * inc**2))

            # implicit scheme
            else:
                # M_sol[block_id, i, j] = k_01+k_10-k_00 + ((0.5*inc)/(1.-0.25*inc))*(k_01+k_10)
                M_sol[block_id, i, j] = (
                    k_01 + k_10 - k_00 +
                    (math.exp(0.5 * inc) - 1.) * (k_01 + k_10))

        # Wait for other threads in this block
        cuda.syncthreads()
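
The kernel above advances one anti-diagonal of the PDE grid per pass, with one CUDA block per path pair and one thread per grid row, in the Numba cuda.jit style. The host-side driver below is only a sketch: the increment construction, the array shapes, and the boundary initialisation are assumptions made for illustration, not code from the original project.

import numpy as np
from numba import cuda

# Hypothetical data: n pairs of paths of length L with d channels.
n, L, d = 8, 32, 3
x = np.random.randn(n, L, d)
y = np.random.randn(n, L, d)

# Per the function name, M_inc is assumed to hold inner products of path
# increments: M_inc[i, j, k] = <x^i_{j+1} - x^i_j, y^i_{k+1} - y^i_k>.
M_inc = np.einsum('ijd,ikd->ijk', np.diff(x, axis=1), np.diff(y, axis=1))

len_x = len_y = L - 1
n_anti_diagonals = 2 * max(len_x, len_y) - 1

# PDE solution grid with the boundary condition k(0, .) = k(., 0) = 1.
M_sol = np.zeros((n, len_x + 1, len_y + 1))
M_sol[:, 0, :] = 1.0
M_sol[:, :, 0] = 1.0

d_inc, d_sol = cuda.to_device(M_inc), cuda.to_device(M_sol)
# Compile the plain function above as a CUDA kernel and launch it with one
# block per pair and one thread per grid row (vanilla scheme, solver=0).
kernel = cuda.jit(compute_sig_kernel_batch_varpar_from_increments_cuda)
kernel[n, max(len_x, len_y)](d_inc, len_x, len_y, n_anti_diagonals, d_sol, 0)

# Each pair's kernel value ends up in the bottom-right corner of its grid.
sig_kernels = d_sol.copy_to_host()[:, -1, -1]
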
Example #4
def compute_sig_kernel_Gram_mat_varpar_from_increments_cuda(
        M_inc, len_x, len_y, n_anti_diagonals, M_sol, solver=0):

    block_id_x = cuda.blockIdx.x
    block_id_y = cuda.blockIdx.y

    # Each thread works on a node of a diagonal.
    thread_id = cuda.threadIdx.x

    I = thread_id

    # Go over each anti-diagonal. Only process threads that fall on the current anti-diagonal
    for p in range(n_anti_diagonals):

        # The index is actually 'p - thread_id' but need to force it in-bounds
        J = max(0, min(p - thread_id, len_y - 1))

        # For simplicity, we define i, j which start from 1 (offset from I, J)
        i = I + 1
        j = J + 1

        # Only compute if element[i, j] is on the current anti-diagonal
        if I + J == p and (I < len_x and J < len_y):

            inc = M_inc[block_id_x, block_id_y, i - 1, j - 1]

            k_01 = M_sol[block_id_x, block_id_y, i - 1, j]
            k_10 = M_sol[block_id_x, block_id_y, i, j - 1]
            k_00 = M_sol[block_id_x, block_id_y, i - 1, j - 1]

            # vanilla scheme
            if solver == 0:
                M_sol[block_id_x, block_id_y, i, j] = k_01 + k_10 + k_00 * (inc - 1.)

            # explicit scheme
            elif solver == 1:
                M_sol[block_id_x, block_id_y, i, j] = (
                    (k_01 + k_10) * (1. + 0.5 * inc + (1. / 12) * inc**2) -
                    k_00 * (1. - (1. / 12) * inc**2))

            # implicit scheme
            else:
                # M_sol[block_id_x, block_id_y, i, j] = k_01+k_10-k_00 + ((0.5*inc)/(1.-0.25*inc))*(k_01+k_10)
                M_sol[block_id_x, block_id_y, i, j] = (
                    k_01 + k_10 - k_00 +
                    (math.exp(0.5 * inc) - 1.) * (k_01 + k_10))

        # Wait for other threads in this block
        cuda.syncthreads()


# ===========================================================================================================
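
The Gram-matrix variant runs the same anti-diagonal sweep but reads both cuda.blockIdx.x and cuda.blockIdx.y, so it is meant to be launched on a 2D grid with one block per pair of paths (x_a, y_b). The sketch below mirrors the batch driver above; all names and shapes are illustrative assumptions.

import numpy as np
from numba import cuda

# Hypothetical data: n x-paths against m y-paths, each of length L with d channels.
n, m, L, d = 8, 4, 32, 3
x = np.random.randn(n, L, d)
y = np.random.randn(m, L, d)

# 4-tensor of increment inner products, one (len_x, len_y) slab per pair (x_a, y_b).
M_inc = np.einsum('ajd,bkd->abjk', np.diff(x, axis=1), np.diff(y, axis=1))
len_x = len_y = L - 1
n_anti_diagonals = 2 * max(len_x, len_y) - 1

# Solution grids with the same boundary condition as in the batch case.
M_sol = np.zeros((n, m, len_x + 1, len_y + 1))
M_sol[:, :, 0, :] = 1.0
M_sol[:, :, :, 0] = 1.0

d_inc, d_sol = cuda.to_device(M_inc), cuda.to_device(M_sol)
kernel = cuda.jit(compute_sig_kernel_Gram_mat_varpar_from_increments_cuda)
# 2D grid of blocks, one per (x_a, y_b); each thread owns one row of the grid.
kernel[(n, m), max(len_x, len_y)](d_inc, len_x, len_y, n_anti_diagonals, d_sol, 0)

gram = d_sol.copy_to_host()[:, :, -1, -1]  # n x m Gram matrix of signature kernels
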
Example #5
def compute_softdtw_cuda(D, gamma, bandwidth, max_i, max_j, n_passes, R):
    """
    :param max_i, max_j: the lengths of the two input sequences (both are assumed to be the same size)
    :param n_passes: 2 * max(max_i, max_j) - 1 (the number of anti-diagonals)
    """
    # Each block processes one pair of examples
    b = cuda.blockIdx.x
    # We have as many threads as seq_len, because the maximum number of threads
    # we need equals the number of elements on the largest anti-diagonal
    tid = cuda.threadIdx.x

    # Compute I, J, the indices from [0, seq_len)

    # The row index is always the same as tid
    I = tid

    inv_gamma = 1.0 / gamma

    # Go over each anti-diagonal. Only process threads that fall on the current anti-diagonal
    for p in range(n_passes):

        # The index is actually 'p - tid' but need to force it in-bounds
        J = max(0, min(p - tid, max_j - 1))

        # For simplicity, we define i, j which start from 1 (offset from I, J)
        i = I + 1
        j = J + 1

        # Only compute if element[i, j] is on the current anti-diagonal, and also is within bounds
        if I + J == p and (I < max_i and J < max_j):
            # Don't compute if outside bandwidth
            if not (abs(i - j) > bandwidth > 0):
                r0 = -R[b, i - 1, j - 1] * inv_gamma
                r1 = -R[b, i - 1, j] * inv_gamma
                r2 = -R[b, i, j - 1] * inv_gamma
                rmax = max(max(r0, r1), r2)
                rsum = (math.exp(r0 - rmax) + math.exp(r1 - rmax) +
                        math.exp(r2 - rmax))
                softmin = -gamma * (math.log(rsum) + rmax)
                R[b, i, j] = D[b, i - 1, j - 1] + softmin

        # Wait for other threads in this block
        cuda.syncthreads()
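
The forward kernel above fills a padded recursion table R one anti-diagonal per pass, and the soft-DTW value of each pair ends up at R[b, max_i, max_j]. The host-side sketch below is illustrative only: the batch of distance matrices, the padding convention, and the parameter values are assumptions, not code from the original project.

import numpy as np
from numba import cuda

# Hypothetical batch of B pairwise-distance matrices for sequences of length N and M.
B, N, M = 4, 16, 16
D = np.random.rand(B, N, M)

# Recursion table padded with an extra border, filled with +inf except the origin.
R = np.full((B, N + 2, M + 2), np.inf)
R[:, 0, 0] = 0.0

gamma, bandwidth = 1.0, 0.0  # bandwidth = 0 disables the Sakoe-Chiba band
n_passes = 2 * max(N, M) - 1

d_D, d_R = cuda.to_device(D), cuda.to_device(R)
kernel = cuda.jit(compute_softdtw_cuda)
# One block per example, one thread per row of the largest anti-diagonal.
kernel[B, max(N, M)](d_D, gamma, bandwidth, N, M, n_passes, d_R)

sdtw = d_R.copy_to_host()[:, N, M]  # soft-DTW value for each pair in the batch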


def compute_softdtw_backward_cuda(D, R, inv_gamma, warp, bandwidth, max_i,
                                  max_j, n_passes, E, G):
    k = cuda.blockIdx.x
    tid = cuda.threadIdx.x

    I = tid

    for p in range(n_passes):
        rev_p = n_passes - p - 1

        J = max(0, min(rev_p - tid, max_j - 1))

        i = I + 1
        j = J + 1

        if I + J == rev_p and (I < max_i and J < max_j):

            if math.isinf(R[k, i, j]):
                R[k, i, j] = -math.inf

            if not (abs(i - j) > bandwidth > 0):
                a = math.exp((R[k, i + 1, j] - R[k, i, j] - D[k, i, j] - warp) * inv_gamma)
                b = math.exp((R[k, i, j + 1] - R[k, i, j] - D[k, i, j] - warp) * inv_gamma)
                c = math.exp((R[k, i + 1, j + 1] - R[k, i, j] - D[k, i, j]) * inv_gamma)
                E[k, i, j] = E[k, i + 1, j] * a + E[k, i, j + 1] * b + E[k, i + 1, j + 1] * c
                G[k, i, j] = E[k, i + 1, j] + E[k, i, j + 1] + E[k, i + 1, j + 1]

        cuda.syncthreads()