def plan_bcm2_threshold_diagonal1(queue, delta, weights, max_weight, tag=None):
    """Plan a kernel that cancels weight updates that would exceed a bound.

    For every element ``(i, j)`` of every matrix ``k``, the kernel sets
    ``delta[k][i, j]`` to zero when
    ``fabs(weights[k][i, j] + delta[k][i, j]) > max_weight[k]``.
    All other entries of ``delta`` are left unchanged.

    Parameters
    ----------
    queue
        Command queue; its ``context`` is used to build the program and it
        is handed to the returned ``Plan``.
    delta
        Ragged array of matrices holding the pending updates. Modified in
        place by the kernel. Columns must have unit stride.
    weights
        Ragged array of matrices with the same length, shapes and ctype as
        ``delta``. Only read by the kernel. Columns must have unit stride.
    max_weight
        Device array holding one bound per matrix.
    tag
        Forwarded to ``Plan``.

    Returns
    -------
    Plan
        The plan wrapping the compiled kernel.
    """
    # NOTE(review): despite the "diagonal1" suffix, the kernel below does not
    # treat diagonal elements specially -- confirm the intended behaviour.
    N = len(delta)

    # The kernel walks `weights` using delta's shapes and assumes unit column
    # stride for both operands, so validate both (as the sibling planners do).
    assert len(weights) == N
    assert max_weight.size == N
    for arr in (delta, weights):  # matrices
        assert (arr.stride1s == 1).all()
    assert (weights.shape0s == delta.shape0s).all()
    assert (weights.shape1s == delta.shape1s).all()
    assert weights.ctype == delta.ctype

    text = """
    __kernel void bcm2_threshold_diagonal1(
        __global const int *shape0s,
        __global const int *shape1s,
        __global const int *delta_stride0s,
        __global const int *delta_starts,
        __global ${type} *delta_data,
        __global const int *weights_stride0s,
        __global const int *weights_starts,
        __global const ${type} *weights_data,
        __global const ${type} *max_weights
    )
    {
        const int ij = get_global_id(0);
        const int k = get_global_id(1);

        const int shape0 = shape0s[k];
        const int shape1 = shape1s[k];
        const int i = ij / shape1;
        const int j = ij % shape1;

        __global ${type} *delta = delta_data + delta_starts[k];
        __global const ${type} *weights = weights_data + weights_starts[k];
        const ${type} max_weight = max_weights[k];

        // Dimension 0 is sized for the largest matrix, so work-items past
        // the end of matrix k must not touch memory.
        if (i < shape0) {
            const int delta_idx = i*delta_stride0s[k] + j;
            if (fabs(weights[i*weights_stride0s[k] + j] + delta[delta_idx])
                    > max_weight) {
                delta[delta_idx] = 0;
            }
        }
    }
    """

    textconf = dict(type=delta.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    # Order must match the kernel's parameter list.
    full_args = (
        delta.cl_shape0s, delta.cl_shape1s,
        delta.cl_stride0s, delta.cl_starts, delta.cl_buf,
        weights.cl_stride0s, weights.cl_starts, weights.cl_buf,
        max_weight,
    )
    _fn = cl.Program(queue.context, text).build().bcm2_threshold_diagonal1
    _fn.set_args(*[arr.data for arr in full_args])

    lsize = None
    # One work-item per element of the largest matrix, for each of N matrices.
    gsize = (delta.sizes.max(), N)
    plan = Plan(queue, _fn, gsize, lsize=lsize,
                name="cl_bcm2_threshold_diagonal1", tag=tag)
    plan.full_args = full_args  # prevent garbage-collection
    plan.flops_per_call = 4 * delta.sizes.sum()
    plan.bw_per_call = (delta.nbytes + weights.nbytes + max_weight.nbytes)
    return plan
def plan_stp(queue, calcium, resources, weights, delta, alpha, init_weights,
             tag=None):
    """Plan a kernel that computes the STP weight update into ``delta``.

    For every element ``(i, j)`` of every matrix ``k`` the kernel writes::

        delta[k][i, j] = ((calcium[k][i] * resources[k][i] / 0.2)
                          * init_weights[k][i, j]) - weights[k][i, j]

    Parameters
    ----------
    queue
        Command queue; its ``context`` is used to build the program and it
        is handed to the returned ``Plan``.
    calcium, resources
        Ragged arrays of column vectors (``shape1s == 1``), indexed by the
        row ``i`` of the matrix element being written.
    weights, init_weights
        Ragged arrays of matrices with the same shapes as ``delta`` and unit
        column stride. Only read by the kernel.
    delta
        Ragged array of matrices with unit column stride. Overwritten by the
        kernel.
    alpha
        Device array with one entry per matrix. It is bound as a kernel
        argument and counted in ``bw_per_call``, but the kernel expression
        does not use it.
    tag
        Forwarded to ``Plan``.

    Returns
    -------
    Plan
        The plan wrapping the compiled kernel.
    """
    assert (len(calcium) == len(resources) == len(weights) == len(delta)
            == alpha.size == len(init_weights))
    N = len(calcium)

    for arr in (calcium, resources):  # vectors
        assert (arr.shape1s == 1).all()
    for arr in (delta, weights, init_weights):  # matrices
        assert (arr.stride1s == 1).all()

    # NOTE(review): the two checks below were already disabled. The kernel
    # indexes both `calcium` and `resources` by row `i`, which agrees with the
    # first check but not with the second -- confirm the intended layout
    # before re-enabling either.
    # assert (resources.shape0s == weights.shape0s).all()
    # assert (calcium.shape0s == weights.shape1s).all()
    assert (weights.shape0s == delta.shape0s).all()
    assert (weights.shape1s == delta.shape1s).all()
    assert (weights.shape0s == init_weights.shape0s).all()
    assert (weights.shape1s == init_weights.shape1s).all()

    assert (calcium.ctype == resources.ctype == weights.ctype == delta.ctype
            == alpha.ctype == init_weights.ctype)

    text = """
    __kernel void stp(
        __global const int *shape0s,
        __global const int *shape1s,
        __global const int *calcium_stride0s,
        __global const int *calcium_starts,
        __global const ${type} *calcium_data,
        __global const int *resources_stride0s,
        __global const int *resources_starts,
        __global const ${type} *resources_data,
        __global const int *weights_stride0s,
        __global const int *weights_starts,
        __global const ${type} *weights_data,
        __global const int *delta_stride0s,
        __global const int *delta_starts,
        __global ${type} *delta_data,
        __global const ${type} *alphas,
        __global const int *init_weights_stride0s,
        __global const int *init_weights_starts,
        __global const ${type} *init_weights_data
    )
    {
        const int ij = get_global_id(0);
        const int k = get_global_id(1);

        const int shape0 = shape0s[k];
        const int shape1 = shape1s[k];
        const int i = ij / shape1;
        const int j = ij % shape1;

        // Dimension 0 is sized for the largest matrix. Work-items past the
        // end of matrix k must not load anything either: their indices fall
        // outside the operands of matrix k, so every global read stays
        // inside this guard.
        if (i < shape0) {
            __global ${type} *delta = delta_data + delta_starts[k];
            const ${type} calcium = calcium_data[
                calcium_starts[k] + i*calcium_stride0s[k]];
            const ${type} resources = resources_data[
                resources_starts[k] + i*resources_stride0s[k]];
            const ${type} weight = weights_data[
                weights_starts[k] + i*weights_stride0s[k] + j];
            const ${type} init_weights = init_weights_data[
                init_weights_starts[k] + i*init_weights_stride0s[k] + j];

            // alphas is bound for interface compatibility but is not part
            // of the update rule.
            delta[i*delta_stride0s[k] + j] =
                ((calcium*resources/0.2)*init_weights)-weight;
        }
    }
    """

    textconf = dict(type=calcium.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    # Order must match the kernel's parameter list.
    full_args = (
        delta.cl_shape0s, delta.cl_shape1s,
        calcium.cl_stride0s, calcium.cl_starts, calcium.cl_buf,
        resources.cl_stride0s, resources.cl_starts, resources.cl_buf,
        weights.cl_stride0s, weights.cl_starts, weights.cl_buf,
        delta.cl_stride0s, delta.cl_starts, delta.cl_buf,
        alpha,
        init_weights.cl_stride0s, init_weights.cl_starts, init_weights.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().stp
    _fn.set_args(*[arr.data for arr in full_args])

    lsize = None
    # One work-item per element of the largest matrix, for each of N matrices.
    gsize = (delta.sizes.max(), N)
    plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_stp", tag=tag)
    plan.full_args = full_args  # prevent garbage-collection
    plan.flops_per_call = 6 * delta.sizes.sum()
    plan.bw_per_call = (calcium.nbytes + resources.nbytes + weights.nbytes
                        + delta.nbytes + alpha.nbytes + init_weights.nbytes)
    return plan
def plan_bcm2(queue, pre, post, theta, delta, alpha, tag=None):
    """Plan a kernel that computes the BCM weight update into ``delta``.

    For every element ``(i, j)`` of every matrix ``k`` the kernel writes::

        delta[k][i, j] = alpha[k] * post[k][i] * (post[k][i] - theta[k][i])
                         * pre[k][j]

    Parameters
    ----------
    queue
        Command queue; its ``context`` is used to build the program and it
        is handed to the returned ``Plan``.
    pre
        Ragged array of column vectors; ``pre.shape0s`` must equal
        ``delta.shape1s``.
    post, theta
        Ragged arrays of column vectors; their ``shape0s`` must equal
        ``delta.shape0s``.
    delta
        Ragged array of matrices with unit column stride. Overwritten by the
        kernel.
    alpha
        Device array with one scale factor per matrix.
    tag
        Forwarded to ``Plan``.

    Returns
    -------
    Plan
        The plan wrapping the compiled kernel.

    Notes
    -----
    An earlier, disabled variant of this kernel additionally took
    ``weights`` and ``max_weight``, zeroed the diagonal of ``delta`` and
    cancelled updates for which ``fabs(weights + delta) > max_weight``.
    The thresholding step is available separately through
    ``plan_bcm2_threshold_diagonal1``.
    """
    assert len(pre) == len(post) == len(theta) == len(delta) == alpha.size
    N = len(pre)

    for arr in (pre, post, theta):  # vectors
        assert (arr.shape1s == 1).all()
    for arr in (delta,):  # matrices
        assert (arr.stride1s == 1).all()

    assert (post.shape0s == delta.shape0s).all()
    assert (pre.shape0s == delta.shape1s).all()
    assert (post.shape0s == theta.shape0s).all()

    assert (pre.ctype == post.ctype == theta.ctype == delta.ctype
            == alpha.ctype)

    text = """
    __kernel void bcm2(
        __global const int *shape0s,
        __global const int *shape1s,
        __global const int *pre_stride0s,
        __global const int *pre_starts,
        __global const ${type} *pre_data,
        __global const int *post_stride0s,
        __global const int *post_starts,
        __global const ${type} *post_data,
        __global const int *theta_stride0s,
        __global const int *theta_starts,
        __global const ${type} *theta_data,
        __global const int *delta_stride0s,
        __global const int *delta_starts,
        __global ${type} *delta_data,
        __global const ${type} *alphas
    )
    {
        const int ij = get_global_id(0);
        const int k = get_global_id(1);

        const int shape0 = shape0s[k];
        const int shape1 = shape1s[k];
        const int i = ij / shape1;
        const int j = ij % shape1;

        // Dimension 0 is sized for the largest matrix. Work-items past the
        // end of matrix k must not load anything either: their row index
        // falls outside post/theta of matrix k, so every global read stays
        // inside this guard.
        if (i < shape0) {
            __global ${type} *delta = delta_data + delta_starts[k];
            const ${type} pre = pre_data[pre_starts[k] + j*pre_stride0s[k]];
            const ${type} post = post_data[
                post_starts[k] + i*post_stride0s[k]];
            const ${type} theta = theta_data[
                theta_starts[k] + i*theta_stride0s[k]];
            const ${type} alpha = alphas[k];

            delta[i*delta_stride0s[k] + j] =
                alpha * post * (post - theta) * pre;
        }
    }
    """

    textconf = dict(type=pre.ctype)
    text = as_ascii(Template(text, output_encoding='ascii').render(**textconf))

    # Order must match the kernel's parameter list.
    full_args = (
        delta.cl_shape0s, delta.cl_shape1s,
        pre.cl_stride0s, pre.cl_starts, pre.cl_buf,
        post.cl_stride0s, post.cl_starts, post.cl_buf,
        theta.cl_stride0s, theta.cl_starts, theta.cl_buf,
        delta.cl_stride0s, delta.cl_starts, delta.cl_buf,
        alpha,
    )
    _fn = cl.Program(queue.context, text).build().bcm2
    _fn.set_args(*[arr.data for arr in full_args])

    lsize = None
    # One work-item per element of the largest matrix, for each of N matrices.
    gsize = (delta.sizes.max(), N)
    plan = Plan(queue, _fn, gsize, lsize=lsize, name="cl_bcm2", tag=tag)
    plan.full_args = full_args  # prevent garbage-collection
    plan.flops_per_call = 4 * delta.sizes.sum()
    plan.bw_per_call = (pre.nbytes + post.nbytes + theta.nbytes
                        + delta.nbytes + alpha.nbytes)
    return plan
def plan_sparse_dot_inc(queue, A_indices, A_indptr, A_data, X, Y, inc=False, tag=None):
    """Plan a sparse matrix-vector product, ``Y = A * X`` or ``Y += A * X``.

    The matrix ``A`` is given by three flat arrays: row ``r`` owns the
    entries ``A_indptr[r] .. A_indptr[r + 1] - 1`` of ``A_data``, and
    ``A_indices`` holds the column of each of those entries. One work-item
    is launched per row of ``Y``.

    Parameters
    ----------
    A_indices, A_indptr : PyOpenCL array
        Contiguous one-dimensional ``int`` arrays with the column indices
        and the per-row offsets described above.
    A_data : PyOpenCL array
        Contiguous one-dimensional array of the matrix values; same size as
        ``A_indices`` and same ctype as ``X`` and ``Y``.
    X, Y : CLRaggedArrays of length 1
        Input and output vectors.
    inc : bool
        If True the product is added to ``Y``; if False ``Y`` is zeroed
        first and then accumulated into.
    tag
        Forwarded to ``Plan``.

    Returns
    -------
    Plan
        The plan wrapping the compiled kernel.

    Raises
    ------
    NotImplementedError
        If ``X`` or ``Y`` is not a unit-stride column vector.

    Notes
    -----
    This function crashes when there are >10M nonzero weights. A potential
    solution would be some way to tell each work item to do multiple rows.
    """
    assert len(X) == len(Y) == 1

    for ragged in (X, Y):
        assert (ragged.stride1s == 1).all()
        is_vector = (ragged.shape1s == 1).all() and (ragged.stride0s == 1).all()
        if not is_vector:
            raise NotImplementedError(
                "OCL SparseDot only supports matrix-vector currently, not matrix-matrix"
            )

    for flat in (A_indices, A_indptr, A_data):
        assert len(flat.shape) == 1
        assert flat.strides[0] == flat.dtype.itemsize  # contiguous

    assert A_indices.size == A_data.size

    assert A_data.ctype == X.ctype == Y.ctype
    assert A_indices.ctype == A_indptr.ctype == "int"

    kern = """
    __kernel void sparsedot_inc(
        __global const int *A_indices,
        __global const int *A_indptr,
        __global const ${dtype} *A_data,
        __global const int *Xstarts,
        __global const ${dtype} *Xdata,
        __global const int *Ystarts,
        __global ${dtype} *Ydata
    )
    {
        // n can later be used to keep track of multiple arrays
        const int n = 0;
        const int irow = get_global_id(0);

        __global const ${dtype} *x = Xdata + Xstarts[n];
        __global ${dtype} *y = Ydata + Ystarts[n];

    %if not inc:
        y[irow] = 0;
    %endif

        const int end = A_indptr[irow + 1];
        for (int k = A_indptr[irow]; k < end; k++) {
            y[irow] += A_data[k] * x[A_indices[k]];
        }
    }
    """
    render_vars = {
        "dtype": A_data.ctype,
        "IndType": A_indices.ctype,
        "inc": inc,
    }
    source = as_ascii(Template(kern, output_encoding="ascii").render(**render_vars))

    # Order must match the kernel's parameter list.
    full_args = (
        A_indices.base_data,
        A_indptr.base_data,
        A_data.base_data,
        X.cl_starts.data,
        X.cl_buf.data,
        Y.cl_starts.data,
        Y.cl_buf.data,
    )
    program = cl.Program(queue.context, source).build()
    _fn = program.sparsedot_inc
    _fn.set_args(*full_args)

    n_rows = Y.sizes[0]
    n_cols = X.sizes[0]
    nonzeros = A_data.size

    plan = Plan(
        queue,
        _fn,
        (n_rows, 1),  # this only works for a single operation
        lsize=None,
        name="cl_sparsedot",
        tag=tag,
    )
    plan.full_args = full_args  # prevent garbage-collection
    plan.flops_per_call = 2 * nonzeros
    plan.bw_per_call = A_data.nbytes * 3 + A_indices.nbytes + A_indptr.nbytes
    plan.description = "groups: %d; shape: (%d, %d); nonzeros: %d" % (
        1,
        n_rows,
        n_cols,
        nonzeros,
    )
    return plan