def c_support_code_apply(self, node, nodename):
    """Return the CUDA source of the forward ``TimCropper`` kernel.

    The generated text consists of two ``#include`` lines, whatever
    ``common.weightpass_defn`` returns, and a ``__global__`` kernel named
    ``TimCropper_<nodename>``.  For every spatial axis the kernel reads a
    window ``[a, b)`` (indexed by ``i0 * ndim_spatial + axis``), asserts
    that it lies inside ``x_dims``, walks the window with nested loops
    while accumulating ``weight * x`` into ``result``, and finally stores
    ``result`` in ``y``.

    Parameters:
        node: the apply node; not used by this method.
        nodename: unique suffix used to name the kernel and passed on to
            ``common.weightpass_defn``.

    Returns:
        str: the kernel source with all ``$placeholders`` substituted.

    Raises:
        KeyError: if the template contains a placeholder that is missing
            from the substitution mapping (``Template.substitute``).

    NOTE(review): ``a`` and ``b`` are used by the kernel but are not in
    the ``"x y W"`` list handed to ``common.defn_arguments`` -- presumably
    the helper or the caller declares them; confirm.
    """
    from string import Template

    function_name = "TimCropper_%s" % nodename
    ndim_spatial = len(self.patch_shape)

    arguments = common.defn_arguments("x y W".split())
    threadindex = common.threadindex(ndim_spatial, "y_dims")
    weightpass_defn = common.weightpass_defn(nodename, self.patch_shape,
                                             grad=False)

    # Each preprocessor directive must sit on its own line, so the header
    # is written out as a real multi-line literal.
    strings = ["""
#include <stdio.h>
#include <sys/time.h>

$weightpass_defn

__global__ void $function_name($arguments) {
    $threadindex
"""]

    # Per spatial axis: fetch the window bounds and sanity-check them.
    for i in range(ndim_spatial):
        strings.append("""
    int a%(i2)s = a[i0 * $ndim_spatial + %(i)s],
        b%(i2)s = b[i0 * $ndim_spatial + %(i)s];
    assert(0 <= a%(i2)s);
    assert(a%(i2)s <= b%(i2)s);
    assert(b%(i2)s <= x_dims[%(i2)s]);
""" % dict(i=i, i2=2 + i))

    # loop over input pixel indices i{2, 3, ...}V
    # compute start of input memory
    # NOTE: assumes x, W contiguous
    # FIXME: the a, b windows are not contiguous, so this is all wrong.
    # must adjust for a, b in both x and W indices
    strings.append(
        "const float* x_pointer = x + i0 * x_strides[0] + i1 * x_strides[1];")
    strings.append(
        "const float* W_pointer = W + i0 * W_strides[0] + i1 * W_strides[1];")
    strings.append("float result = 0.0f;")

    for i in range(ndim_spatial):
        strings.append("""
    x_pointer += a%(i)s;
    for (int i%(i)sV = a%(i)s; i%(i)sV < b%(i)s; ++i%(i)sV) {
""" % dict(i=2 + i))

    # The weight of a pixel is the product of its ndim_spatial per-axis
    # factors.  Read them through W_pointer, which is advanced once per
    # pixel below; dereferencing W itself would reuse the very first
    # factors of the buffer for every pixel.
    weight = " * ".join("*(W_pointer + %i)" % j for j in range(ndim_spatial))
    strings.append("result += $weight * (*x_pointer);")
    strings.append("++x_pointer;")
    strings.append("W_pointer += $ndim_spatial;")
    # close the nested window loops
    strings.extend("}" * ndim_spatial)

    # store result at output memory location
    strings.append(
        "size_t y_index = i0 * y_strides[0] + i1 * y_strides[1] + %s;"
        % " + ".join("i%(i)sv * y_strides[%(i)s]" % dict(i=2 + i)
                     for i in range(ndim_spatial)))
    strings.append("assert(y_index < y_size);")
    strings.append("y[y_index] = result;")
    # close the kernel body
    strings.append("}")

    # Substituted values are inserted verbatim (they are not re-scanned
    # for placeholders), so only the names that appear in the literal
    # text above are needed.
    return Template("\n".join(strings)).substitute(
        weightpass_defn=weightpass_defn,
        function_name=function_name,
        arguments=arguments,
        threadindex=threadindex,
        ndim_spatial=ndim_spatial,
        weight=weight,
    )
def c_support_code_apply(self, node, nodename):
    """Return the CUDA source of the forward ``TimCropper`` kernel.

    The generated text consists of two ``#include`` lines, whatever
    ``common.weightpass_defn`` returns, and a ``__global__`` kernel named
    ``TimCropper_<nodename>``.  For every spatial axis the kernel reads a
    window ``[a, b)`` (indexed by ``i0 * ndim_spatial + axis``), asserts
    that it lies inside ``x_dims``, walks the window with nested loops
    while accumulating ``weight * x`` into ``result``, and finally stores
    ``result`` in ``y``.

    Parameters:
        node: the apply node; not used by this method.
        nodename: unique suffix used to name the kernel and passed on to
            ``common.weightpass_defn``.

    Returns:
        str: the kernel source with all ``$placeholders`` substituted.

    Raises:
        KeyError: if the template contains a placeholder that is missing
            from the substitution mapping (``Template.substitute``).

    NOTE(review): ``a`` and ``b`` are used by the kernel but are not in
    the ``"x y W"`` list handed to ``common.defn_arguments`` -- presumably
    the helper or the caller declares them; confirm.
    """
    from string import Template

    function_name = "TimCropper_%s" % nodename
    ndim_spatial = len(self.patch_shape)

    arguments = common.defn_arguments("x y W".split())
    threadindex = common.threadindex(ndim_spatial, "y_dims")
    weightpass_defn = common.weightpass_defn(nodename, self.patch_shape,
                                             grad=False)

    # Each preprocessor directive must sit on its own line, so the header
    # is written out as a real multi-line literal.
    strings = ["""
#include <stdio.h>
#include <sys/time.h>

$weightpass_defn

__global__ void $function_name($arguments) {
    $threadindex
"""]

    # Per spatial axis: fetch the window bounds and sanity-check them.
    for i in range(ndim_spatial):
        strings.append("""
    int a%(i2)s = a[i0 * $ndim_spatial + %(i)s],
        b%(i2)s = b[i0 * $ndim_spatial + %(i)s];
    assert(0 <= a%(i2)s);
    assert(a%(i2)s <= b%(i2)s);
    assert(b%(i2)s <= x_dims[%(i2)s]);
""" % dict(i=i, i2=2 + i))

    # loop over input pixel indices i{2, 3, ...}V
    # compute start of input memory
    # NOTE: assumes x, W contiguous
    # FIXME: the a, b windows are not contiguous, so this is all wrong.
    # must adjust for a, b in both x and W indices
    strings.append(
        "const float* x_pointer = x + i0 * x_strides[0] + i1 * x_strides[1];")
    strings.append(
        "const float* W_pointer = W + i0 * W_strides[0] + i1 * W_strides[1];")
    strings.append("float result = 0.0f;")

    for i in range(ndim_spatial):
        strings.append("""
    x_pointer += a%(i)s;
    for (int i%(i)sV = a%(i)s; i%(i)sV < b%(i)s; ++i%(i)sV) {
""" % dict(i=2 + i))

    # The weight of a pixel is the product of its ndim_spatial per-axis
    # factors.  Read them through W_pointer, which is advanced once per
    # pixel below; dereferencing W itself would reuse the very first
    # factors of the buffer for every pixel.
    weight = " * ".join("*(W_pointer + %i)" % j for j in range(ndim_spatial))
    strings.append("result += $weight * (*x_pointer);")
    strings.append("++x_pointer;")
    strings.append("W_pointer += $ndim_spatial;")
    # close the nested window loops
    strings.extend("}" * ndim_spatial)

    # store result at output memory location
    strings.append(
        "size_t y_index = i0 * y_strides[0] + i1 * y_strides[1] + %s;"
        % " + ".join("i%(i)sv * y_strides[%(i)s]" % dict(i=2 + i)
                     for i in range(ndim_spatial)))
    strings.append("assert(y_index < y_size);")
    strings.append("y[y_index] = result;")
    # close the kernel body
    strings.append("}")

    # Substituted values are inserted verbatim (they are not re-scanned
    # for placeholders), so only the names that appear in the literal
    # text above are needed.
    return Template("\n".join(strings)).substitute(
        weightpass_defn=weightpass_defn,
        function_name=function_name,
        arguments=arguments,
        threadindex=threadindex,
        ndim_spatial=ndim_spatial,
        weight=weight,
    )
def c_support_code_apply(self, node, nodename):
    """Return the CUDA source of the ``TimCropperGrad`` kernel.

    The generated text is an ``#include``, the output of
    ``common.weightfunction`` (named ``TimCropperGrad_<nodename>_weight``)
    and a ``__global__`` kernel ``TimCropperGrad_<nodename>``.  The kernel
    takes size/pointer/dims/strides for ``dydl``, ``dyds``, ``dCdy`` and
    ``x`` (the latter two ``const``) followed by the ``a``, ``b``, ``l``
    and ``s`` pointers.  It zeroes the ``dydl``/``dyds`` entries of the
    current output element, then loops over the input window ``[a, b)``
    of every spatial axis, calling the weight function at each loop level
    and accumulating ``weight * x[x_index]`` into those entries.

    Parameters:
        node: the apply node; not used by this method.
        nodename: unique suffix used to name the kernel.

    Returns:
        str: the kernel source with all ``$placeholders`` substituted.

    NOTE(review): the text returned by ``common.weightfunction`` becomes
    part of the template, so any ``$name`` it contains is looked up in
    this function's locals -- presumably the reason ``sqrt2pi`` exists.
    Do not rename or drop locals without checking that helper.
    """
    import math
    from string import Template

    function_name = "TimCropperGrad_%s" % nodename
    weight_function_name = "%s_weight" % function_name
    ndim_spatial = len(self.patch_shape)
    ndim_total = 2 + ndim_spatial
    sqrt2pi = math.sqrt(2 * math.pi)

    strings = [
        """
#include <stdio.h>
""",
        common.weightfunction(name=weight_function_name, grad=True),
    ]

    # Kernel parameter list: a (size, pointer, dims, strides) quadruple
    # per strided array; only x and dCdy are read-only.
    read_only = "x dCdy".split()
    arguments = []
    for var in "dydl dyds dCdy x".split():
        qualifier = "const " if var in read_only else ""
        arguments += [
            "const size_t %s_size" % var,
            "%sfloat* %s" % (qualifier, var),
            "const int* %s_dims" % var,
            "const int* %s_strides" % var,
        ]
    # a, b, l, s are contiguous
    arguments.extend("const float* %s" % var for var in "abls")
    arguments = ", \n".join(arguments)

    threadindex = common.threadindex(ndim_spatial, "dCdy_dims")

    # The zero-initialisation and the accumulation address exactly the
    # same output elements, so both snippets are generated side by side.
    initializations = []
    computations = []
    for var in "ls":
        dy = "dyd" + var
        dw = "dwd" + var
        # Offset of the current patch pixel inside dy; one more subscript
        # (the spatial axis j) is appended below to select the element.
        rest = " + ".join(
            "i%(i)sv * %(dy)s_strides[%(i)s]" % {"dy": dy, "i": 2 + i}
            for i in range(ndim_spatial))
        output_indexish = (
            "i0 * %(dy)s_strides[0] + i1 * %(dy)s_strides[1] + %(rest)s"
            % {"dy": dy, "rest": rest})
        for j in range(ndim_spatial):
            index = (
                "%(output_indexish)s + %(j)s * %(dy)s_strides[$ndim_total]"
                % {"output_indexish": output_indexish, "dy": dy, "j": j})
            # start every output element at zero
            initializations.append("""
    assert(%(index)s < %(dy)s_size);
    %(dy)s[%(index)s] = 0.0f;
""" % {"dy": dy, "index": index})
            # Product rule: axis j contributes its derivative factor, all
            # other axes their plain weight, e.g. for dy/dl with 3 axes
            #   j = 0: dwdl2 * w3 * w4
            #   j = 1: w2 * dwdl3 * w4
            #   j = 2: w2 * w3 * dwdl4
            # and likewise for dy/ds.
            weight = " * ".join(
                (dw if i == j else "w") + str(2 + i)
                for i in range(ndim_spatial))
            computations.append("""
    assert(%(index)s < %(dy)s_size);
    %(dy)s[%(index)s] += %(weight)s * x[x_index];
""" % {"dy": dy, "weight": weight, "index": index})

    strings.append("""
__global__ void $function_name($arguments) {
    $threadindex
""")
    strings.append("\n".join(initializations))

    # Per spatial axis: window bounds, the l/s values, bounds checks and
    # the weight/derivative accumulators filled in by the weight function.
    for i in range(ndim_spatial):
        strings.append("""
    const int a%(i2)s = a[i0 * $ndim_spatial + %(i)s],
              b%(i2)s = b[i0 * $ndim_spatial + %(i)s];
    const float l%(i2)s = l[i0 * $ndim_spatial + %(i)s],
                s%(i2)s = s[i0 * $ndim_spatial + %(i)s];
    assert(0 <= a%(i2)s);
    assert(a%(i2)s <= b%(i2)s);
    assert(b%(i2)s <= x_dims[%(i2)s]);
    float w%(i2)s = 0, dwdl%(i2)s = 0, dwds%(i2)s = 0;
""" % {"i": i, "i2": 2 + i})

    # loop over input pixel indices i{2, 3, ...}V
    for i, patch_dim in enumerate(self.patch_shape):
        strings.append("""
    for (int i%(i)sV = a%(i)s; i%(i)sV < b%(i)s; ++i%(i)sV) {
        $weight_function_name(w%(i)s, dwdl%(i)s, dwds%(i)s, %(patch_dim)s, i%(i)sv, i%(i)sV, l%(i)s, s%(i)s);
""" % {"i": 2 + i, "patch_dim": patch_dim})

    # compute input memory location
    x_offset = " + ".join(
        "i%(i)sV * x_strides[%(i)s]" % {"i": 2 + i}
        for i in range(ndim_spatial))
    strings.append(
        "size_t x_index = i0 * x_strides[0] + i1 * x_strides[1] + %s;"
        % x_offset)
    strings.append("assert(x_index < x_size);")
    strings.append("\n".join(computations))
    # close the nested window loops plus the kernel body
    strings.extend(["}"] * (ndim_spatial + 1))

    return Template("\n".join(strings)).substitute(locals())
def c_support_code_apply(self, node, nodename):
    """Return the CUDA source of the ``TimCropperGrad`` kernel.

    The generated text is an ``#include``, the output of
    ``common.weightfunction`` (named ``TimCropperGrad_<nodename>_weight``)
    and a ``__global__`` kernel ``TimCropperGrad_<nodename>``.  The kernel
    takes size/pointer/dims/strides for ``dydl``, ``dyds``, ``dCdy`` and
    ``x`` (the latter two ``const``) followed by the ``a``, ``b``, ``l``
    and ``s`` pointers.  It zeroes the ``dydl``/``dyds`` entries of the
    current output element, then loops over the input window ``[a, b)``
    of every spatial axis, calling the weight function at each loop level
    and accumulating ``weight * x[x_index]`` into those entries.

    Parameters:
        node: the apply node; not used by this method.
        nodename: unique suffix used to name the kernel.

    Returns:
        str: the kernel source with all ``$placeholders`` substituted.

    NOTE(review): the text returned by ``common.weightfunction`` becomes
    part of the template, so any ``$name`` it contains is looked up in
    this function's locals -- presumably the reason ``sqrt2pi`` exists.
    Do not rename or drop locals without checking that helper.
    """
    import math
    from string import Template

    function_name = "TimCropperGrad_%s" % nodename
    weight_function_name = "%s_weight" % function_name
    ndim_spatial = len(self.patch_shape)
    ndim_total = 2 + ndim_spatial
    sqrt2pi = math.sqrt(2 * math.pi)

    strings = [
        """
#include <stdio.h>
""",
        common.weightfunction(name=weight_function_name, grad=True),
    ]

    # Kernel parameter list: a (size, pointer, dims, strides) quadruple
    # per strided array; only x and dCdy are read-only.
    read_only = "x dCdy".split()
    arguments = []
    for var in "dydl dyds dCdy x".split():
        qualifier = "const " if var in read_only else ""
        arguments += [
            "const size_t %s_size" % var,
            "%sfloat* %s" % (qualifier, var),
            "const int* %s_dims" % var,
            "const int* %s_strides" % var,
        ]
    # a, b, l, s are contiguous
    arguments.extend("const float* %s" % var for var in "abls")
    arguments = ", \n".join(arguments)

    threadindex = common.threadindex(ndim_spatial, "dCdy_dims")

    # The zero-initialisation and the accumulation address exactly the
    # same output elements, so both snippets are generated side by side.
    initializations = []
    computations = []
    for var in "ls":
        dy = "dyd" + var
        dw = "dwd" + var
        # Offset of the current patch pixel inside dy; one more subscript
        # (the spatial axis j) is appended below to select the element.
        rest = " + ".join(
            "i%(i)sv * %(dy)s_strides[%(i)s]" % {"dy": dy, "i": 2 + i}
            for i in range(ndim_spatial))
        output_indexish = (
            "i0 * %(dy)s_strides[0] + i1 * %(dy)s_strides[1] + %(rest)s"
            % {"dy": dy, "rest": rest})
        for j in range(ndim_spatial):
            index = (
                "%(output_indexish)s + %(j)s * %(dy)s_strides[$ndim_total]"
                % {"output_indexish": output_indexish, "dy": dy, "j": j})
            # start every output element at zero
            initializations.append("""
    assert(%(index)s < %(dy)s_size);
    %(dy)s[%(index)s] = 0.0f;
""" % {"dy": dy, "index": index})
            # Product rule: axis j contributes its derivative factor, all
            # other axes their plain weight, e.g. for dy/dl with 3 axes
            #   j = 0: dwdl2 * w3 * w4
            #   j = 1: w2 * dwdl3 * w4
            #   j = 2: w2 * w3 * dwdl4
            # and likewise for dy/ds.
            weight = " * ".join(
                (dw if i == j else "w") + str(2 + i)
                for i in range(ndim_spatial))
            computations.append("""
    assert(%(index)s < %(dy)s_size);
    %(dy)s[%(index)s] += %(weight)s * x[x_index];
""" % {"dy": dy, "weight": weight, "index": index})

    strings.append("""
__global__ void $function_name($arguments) {
    $threadindex
""")
    strings.append("\n".join(initializations))

    # Per spatial axis: window bounds, the l/s values, bounds checks and
    # the weight/derivative accumulators filled in by the weight function.
    for i in range(ndim_spatial):
        strings.append("""
    const int a%(i2)s = a[i0 * $ndim_spatial + %(i)s],
              b%(i2)s = b[i0 * $ndim_spatial + %(i)s];
    const float l%(i2)s = l[i0 * $ndim_spatial + %(i)s],
                s%(i2)s = s[i0 * $ndim_spatial + %(i)s];
    assert(0 <= a%(i2)s);
    assert(a%(i2)s <= b%(i2)s);
    assert(b%(i2)s <= x_dims[%(i2)s]);
    float w%(i2)s = 0, dwdl%(i2)s = 0, dwds%(i2)s = 0;
""" % {"i": i, "i2": 2 + i})

    # loop over input pixel indices i{2, 3, ...}V
    for i, patch_dim in enumerate(self.patch_shape):
        strings.append("""
    for (int i%(i)sV = a%(i)s; i%(i)sV < b%(i)s; ++i%(i)sV) {
        $weight_function_name(w%(i)s, dwdl%(i)s, dwds%(i)s, %(patch_dim)s, i%(i)sv, i%(i)sV, l%(i)s, s%(i)s);
""" % {"i": 2 + i, "patch_dim": patch_dim})

    # compute input memory location
    x_offset = " + ".join(
        "i%(i)sV * x_strides[%(i)s]" % {"i": 2 + i}
        for i in range(ndim_spatial))
    strings.append(
        "size_t x_index = i0 * x_strides[0] + i1 * x_strides[1] + %s;"
        % x_offset)
    strings.append("assert(x_index < x_size);")
    strings.append("\n".join(computations))
    # close the nested window loops plus the kernel body
    strings.extend(["}"] * (ndim_spatial + 1))

    return Template("\n".join(strings)).substitute(locals())