def compute_softdtw_backward_cuda(D, R, inv_gamma, bandwidth, max_i, max_j, n_passes, E):
    """
    Backward soft-DTW sweep: fill E by visiting the anti-diagonals in reverse order.

    One block handles one entry along the first axis of D, R and E; one thread
    handles one row. The arrays are indexed from [1, 1] up to
    [max_i + 1, max_j + 1] in their last two axes, so they must be at least that large.

    :param D: cost array, read at the three successor cells of [i, j]
    :param R: accumulated-cost array; a visited cell holding an infinite value is
              overwritten with -inf
    :param inv_gamma: factor applied to every exponent (presumably 1 / gamma -- confirm with caller)
    :param bandwidth: when > 0, cells with |i - j| > bandwidth are left untouched in E
    :param max_i: number of rows to process; threads with index >= max_i do no work
    :param max_j: number of columns to process
    :param n_passes: number of anti-diagonals to sweep
    :param E: output array, written at [block, i, j]
    """
    batch = cuda.blockIdx.x
    row = cuda.threadIdx.x

    # 1-based row index; it never changes for a given thread.
    i = row + 1

    # Walk the anti-diagonals from the last one down to the first.
    for diag in range(n_passes - 1, -1, -1):
        # The column on this diagonal is 'diag - row', clamped so that the
        # index stays inside [0, max_j - 1].
        col = max(0, min(diag - row, max_j - 1))
        j = col + 1

        # If the clamp moved the column, the cell is not on this diagonal.
        on_diagonal = row + col == diag
        in_bounds = row < max_i and col < max_j

        if on_diagonal and in_bounds:
            # Any infinite entry is normalised to -inf before it is used.
            if math.isinf(R[batch, i, j]):
                R[batch, i, j] = -math.inf

            # The band restriction only applies for a positive bandwidth.
            outside_band = abs(i - j) > bandwidth and bandwidth > 0
            if not outside_band:
                here = R[batch, i, j]
                w_down = math.exp(
                    (R[batch, i + 1, j] - here - D[batch, i + 1, j]) * inv_gamma)
                w_right = math.exp(
                    (R[batch, i, j + 1] - here - D[batch, i, j + 1]) * inv_gamma)
                w_diag = math.exp(
                    (R[batch, i + 1, j + 1] - here - D[batch, i + 1, j + 1]) * inv_gamma)
                E[batch, i, j] = (E[batch, i + 1, j] * w_down
                                  + E[batch, i, j + 1] * w_right
                                  + E[batch, i + 1, j + 1] * w_diag)

        # Every thread of the block must finish this diagonal before the next one starts.
        cuda.syncthreads()
def compute_softdtw_cuda(D, gamma, warp, bandwidth, max_i, max_j, n_passes, R):
    """
    Forward soft-DTW sweep with a warp penalty, one anti-diagonal at a time.

    One block handles one entry along the first axis of D and R; one thread
    handles one row. Each cell R[i, j] becomes the soft-minimum of three
    candidates, each being a predecessor's R value plus the D value at that
    same predecessor index; the two non-diagonal candidates additionally pay `warp`.

    :param D: cost array, read at [i - 1, j - 1], [i - 1, j] and [i, j - 1]
    :param gamma: scale of the soft-minimum, softmin(x) = -gamma * log(sum(exp(-x / gamma)))
    :param warp: extra cost added to the [i - 1, j] and [i, j - 1] candidates
    :param bandwidth: when > 0, cells with |i - j| > bandwidth are left untouched
    :param max_i: number of rows to process; threads with index >= max_i do no work
    :param max_j: number of columns to process
    :param n_passes: number of anti-diagonals to sweep
    :param R: output array, written at [block, i, j] with i, j starting from 1
    """
    batch = cuda.blockIdx.x
    row = cuda.threadIdx.x
    inv_gamma = 1.0 / gamma

    # 1-based row index; it never changes for a given thread.
    i = row + 1

    for diag in range(n_passes):
        # The column on this diagonal is 'diag - row', clamped to stay in-bounds.
        col = max(0, min(diag - row, max_j - 1))
        j = col + 1

        on_diagonal = row + col == diag
        in_bounds = row < max_i and col < max_j
        # The band restriction only applies for a positive bandwidth.
        outside_band = abs(i - j) > bandwidth and bandwidth > 0

        if on_diagonal and in_bounds and not outside_band:
            # Negated, scaled candidates (row == i - 1 and col == j - 1).
            cand_diag = -(R[batch, row, col] + D[batch, row, col]) * inv_gamma
            cand_up = -(R[batch, row, j] + D[batch, row, j] + warp) * inv_gamma
            cand_left = -(R[batch, i, col] + D[batch, i, col] + warp) * inv_gamma

            # Subtract the largest candidate before exponentiating.
            peak = max(max(cand_diag, cand_up), cand_left)
            exp_diag = math.exp(cand_diag - peak)
            exp_up = math.exp(cand_up - peak)
            exp_left = math.exp(cand_left - peak)
            total = exp_diag + exp_up + exp_left

            R[batch, i, j] = -gamma * (math.log(total) + peak)

        # Every thread of the block must finish this diagonal before the next one starts.
        cuda.syncthreads()
def compute_sig_kernel_batch_varpar_from_increments_cuda(
        M_inc, len_x, len_y, n_anti_diagonals, M_sol, solver=0):
    """
    Solve the signature-kernel PDE for a batch of path pairs [(x^1,y^1), ..., (x^n, y^n)].

    Each block handles one pair (x^i, y^i) and each thread handles one row of
    the solution grid; the grid is filled one anti-diagonal at a time.

    M_inc: a 3-tensor with M_inc[i,j,k] = <x^i_j, y^i_k>.
    len_x, len_y: number of rows / columns of the grid to fill.
    n_anti_diagonals: 2 * max(len_x, len_y) - 1.
    M_sol: a 3-tensor storing the solutions of the PDEs; cell [i, j] (1-based)
        is written from cells [i-1, j], [i, j-1] and [i-1, j-1].
    solver: 0 selects the vanilla scheme, 1 the explicit scheme, any other
        value the implicit scheme.
    """
    pair = cuda.blockIdx.x
    row = cuda.threadIdx.x

    # 1-based row index in M_sol; it never changes for a given thread.
    i = row + 1

    # Sweep the anti-diagonals; a thread only works when it owns a cell on the current one.
    for diag in range(n_anti_diagonals):
        # The column is 'diag - row', clamped so that the index stays in-bounds.
        col = max(0, min(diag - row, len_y - 1))
        j = col + 1

        on_diagonal = row + col == diag
        in_bounds = row < len_x and col < len_y

        if on_diagonal and in_bounds:
            # row == i - 1 and col == j - 1 are the 0-based indices into M_inc.
            inc = M_inc[pair, row, col]

            above = M_sol[pair, row, j]
            left = M_sol[pair, i, col]
            corner = M_sol[pair, row, col]
            edge_sum = above + left

            if solver == 0:
                # vanilla scheme
                M_sol[pair, i, j] = edge_sum + corner * (inc - 1.)
            elif solver == 1:
                # explicit scheme
                inc_sq_12 = (1. / 12) * inc**2
                M_sol[pair, i, j] = edge_sum * (
                    1. + 0.5 * inc + inc_sq_12) - corner * (1. - inc_sq_12)
            else:
                # implicit scheme; a disabled alternative used the factor
                # (0.5*inc)/(1.-0.25*inc) in place of exp(0.5*inc) - 1.
                M_sol[pair, i, j] = edge_sum - corner + (
                    math.exp(0.5 * inc) - 1.) * edge_sum

        # Every thread of the block must finish this diagonal before the next one starts.
        cuda.syncthreads()
def compute_sig_kernel_Gram_mat_varpar_from_increments_cuda(
        M_inc, len_x, len_y, n_anti_diagonals, M_sol, solver=0):
    """
    Solve the signature-kernel PDE for every entry of a Gram matrix.

    The block's (x, y) indices select the first two axes of M_inc and M_sol;
    each thread handles one row of that entry's solution grid, which is filled
    one anti-diagonal at a time.

    M_inc: a 4-tensor of increments, read at [bx, by, i-1, j-1].
    len_x, len_y: number of rows / columns of the grid to fill.
    n_anti_diagonals: number of anti-diagonals to sweep.
    M_sol: a 4-tensor storing the solutions; cell [i, j] (1-based) is written
        from cells [i-1, j], [i, j-1] and [i-1, j-1].
    solver: 0 selects the vanilla scheme, 1 the explicit scheme, any other
        value the implicit scheme.
    """
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    row = cuda.threadIdx.x

    # 1-based row index in M_sol; it never changes for a given thread.
    i = row + 1

    # Sweep the anti-diagonals; a thread only works when it owns a cell on the current one.
    for diag in range(n_anti_diagonals):
        # The column is 'diag - row', clamped so that the index stays in-bounds.
        col = max(0, min(diag - row, len_y - 1))
        j = col + 1

        on_diagonal = row + col == diag
        in_bounds = row < len_x and col < len_y

        if on_diagonal and in_bounds:
            # row == i - 1 and col == j - 1 are the 0-based indices into M_inc.
            inc = M_inc[bx, by, row, col]

            above = M_sol[bx, by, row, j]
            left = M_sol[bx, by, i, col]
            corner = M_sol[bx, by, row, col]
            edge_sum = above + left

            if solver == 0:
                # vanilla scheme
                M_sol[bx, by, i, j] = edge_sum + corner * (inc - 1.)
            elif solver == 1:
                # explicit scheme
                inc_sq_12 = (1. / 12) * inc**2
                M_sol[bx, by, i, j] = edge_sum * (
                    1. + 0.5 * inc + inc_sq_12) - corner * (1. - inc_sq_12)
            else:
                # implicit scheme; a disabled alternative used the factor
                # (0.5*inc)/(1.-0.25*inc) in place of exp(0.5*inc) - 1.
                M_sol[bx, by, i, j] = edge_sum - corner + (
                    math.exp(0.5 * inc) - 1.) * edge_sum

        # Every thread of the block must finish this diagonal before the next one starts.
        cuda.syncthreads()


# ===========================================================================================================
def compute_softdtw_cuda(D, gamma, bandwidth, max_i, max_j, n_passes, R):
    """
    Forward soft-DTW sweep, one anti-diagonal at a time.

    One block handles one entry along the first axis of D and R; one thread
    handles one row. Each cell R[i, j] (1-based) is set to D[i - 1, j - 1] plus
    the soft-minimum of R[i - 1, j - 1], R[i - 1, j] and R[i, j - 1].

    :param D: cost array, read at [block, i - 1, j - 1]
    :param gamma: scale of the soft-minimum, softmin(x) = -gamma * log(sum(exp(-x / gamma)))
    :param bandwidth: when > 0, cells with |i - j| > bandwidth are left untouched
    :param max_i: number of rows to process; threads with index >= max_i do no work
    :param max_j: number of columns to process
    :param n_passes: number of anti-diagonals to sweep
    :param R: output array, written at [block, i, j] with i, j starting from 1
    """
    batch = cuda.blockIdx.x
    row = cuda.threadIdx.x
    inv_gamma = 1.0 / gamma

    # 1-based row index; it never changes for a given thread.
    i = row + 1

    # Sweep the anti-diagonals; a thread only works when it owns a cell on the current one.
    for diag in range(n_passes):
        # The column is 'diag - row', clamped so that the index stays in-bounds.
        col = max(0, min(diag - row, max_j - 1))
        j = col + 1

        on_diagonal = row + col == diag
        in_bounds = row < max_i and col < max_j
        # The band restriction only applies for a positive bandwidth.
        outside_band = abs(i - j) > bandwidth and bandwidth > 0

        if on_diagonal and in_bounds and not outside_band:
            # Negated, scaled predecessors (row == i - 1 and col == j - 1).
            cand_diag = -R[batch, row, col] * inv_gamma
            cand_up = -R[batch, row, j] * inv_gamma
            cand_left = -R[batch, i, col] * inv_gamma

            # Subtract the largest candidate before exponentiating.
            peak = max(max(cand_diag, cand_up), cand_left)
            exp_diag = math.exp(cand_diag - peak)
            exp_up = math.exp(cand_up - peak)
            exp_left = math.exp(cand_left - peak)
            total = exp_diag + exp_up + exp_left

            softmin = -gamma * (math.log(total) + peak)
            R[batch, i, j] = D[batch, row, col] + softmin

        # Every thread of the block must finish this diagonal before the next one starts.
        cuda.syncthreads()
def compute_softdtw_backward_cuda(D, R, inv_gamma, warp, bandwidth, max_i, max_j, n_passes, E, G):
    """
    Backward soft-DTW sweep with a warp penalty: fill E and G in reverse anti-diagonal order.

    One block handles one entry along the first axis of the arrays; one thread
    handles one row. R and E are indexed up to [max_i + 1, max_j + 1] in their
    last two axes, so they must be at least that large.

    :param D: cost array, read at [block, i, j]
    :param R: accumulated-cost array; a visited cell holding an infinite value is
              overwritten with -inf
    :param inv_gamma: factor applied to every exponent (presumably 1 / gamma -- confirm with caller)
    :param warp: extra cost subtracted in the exponents of the two non-diagonal successors
    :param bandwidth: when > 0, cells with |i - j| > bandwidth are left untouched in E and G
    :param max_i: number of rows to process; threads with index >= max_i do no work
    :param max_j: number of columns to process
    :param n_passes: number of anti-diagonals to sweep
    :param E: output array; E[i, j] is the weighted sum of its three successors' E values
    :param G: output array; G[i, j] is the unweighted sum of the same three E values
    """
    batch = cuda.blockIdx.x
    row = cuda.threadIdx.x

    # 1-based row index; it never changes for a given thread.
    i = row + 1

    # Walk the anti-diagonals from the last one down to the first.
    for diag in range(n_passes - 1, -1, -1):
        # The column is 'diag - row', clamped so that the index stays in-bounds.
        col = max(0, min(diag - row, max_j - 1))
        j = col + 1

        on_diagonal = row + col == diag
        in_bounds = row < max_i and col < max_j

        if on_diagonal and in_bounds:
            # Any infinite entry is normalised to -inf before it is used.
            if math.isinf(R[batch, i, j]):
                R[batch, i, j] = -math.inf

            # The band restriction only applies for a positive bandwidth.
            outside_band = abs(i - j) > bandwidth and bandwidth > 0
            if not outside_band:
                here = R[batch, i, j]
                cost = D[batch, i, j]

                w_down = math.exp(
                    (R[batch, i + 1, j] - here - cost - warp) * inv_gamma)
                w_right = math.exp(
                    (R[batch, i, j + 1] - here - cost - warp) * inv_gamma)
                w_diag = math.exp(
                    (R[batch, i + 1, j + 1] - here - cost) * inv_gamma)

                E[batch, i, j] = (E[batch, i + 1, j] * w_down
                                  + E[batch, i, j + 1] * w_right
                                  + E[batch, i + 1, j + 1] * w_diag)
                G[batch, i, j] = (E[batch, i + 1, j]
                                  + E[batch, i, j + 1]
                                  + E[batch, i + 1, j + 1])

        # Every thread of the block must finish this diagonal before the next one starts.
        cuda.syncthreads()