def logistic_derivative(context, activations, delta, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = delta

    key = (logistic_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('delta', Annotation(activations, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            ${activations.ctype} a = ${activations.load_same};
            ${delta.ctype} d = ${delta.load_same};
            d = d*a*(1.0f - a);
            ${dest.store_same}(d);
            """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
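# A minimal usage sketch (an assumption, not part of the original module): the
# kernels here only require a ``context`` object exposing a ``kernel_cache``
# dict and a reikna ``thread``, so a SimpleNamespace is enough for testing.
def _example_logistic_derivative():
    from types import SimpleNamespace
    from reikna.cluda import any_api

    thread = any_api().Thread.create()
    context = SimpleNamespace(kernel_cache={}, thread=thread)

    activations = thread.to_device(
        numpy.random.uniform(0.0, 1.0, (4, 8)).astype(numpy.float32))
    delta = thread.to_device(numpy.ones((4, 8), dtype=numpy.float32))

    # Multiplies delta by a * (1 - a) in place (dest defaults to delta).
    logistic_derivative(context, activations, delta)
    return delta.get()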
def renormalize_kernel(ctx, array, norm, constraint):
    kernel_cache, thread = ctx.kernel_cache, ctx.thread
    constraint = numpy.float32(constraint)

    key = (renormalize_kernel, array.shape, norm.shape, thread._context)
    if key not in kernel_cache:
        comp = PureParallel(
            [
                Parameter('array', Annotation(array, 'io')),
                Parameter('norm', Annotation(norm, 'i')),
                Parameter('constraint', Annotation(constraint))
            ],
            """
            // Renormalize if necessary
            float n = ${norm.load_idx}(${idxs[1]});
            float c = ${constraint};
            if ( n > c ) {
                float a = ${array.load_same};
                a = a * c / n;
                ${array.store_same}(a);
            }
            """,
            guiding_array='array')
        kernel_cache[key] = comp.compile(thread)

    kernel_cache[key](array, norm, constraint)
def classification_delta_kernel(ctx, outputs, targets, deltas):
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    assert outputs.shape[0] == targets.shape[0] == deltas.shape[0]
    assert len(targets.shape) == 1
    assert targets.dtype == numpy.int32
    assert outputs.shape[1] == deltas.shape[1]

    key = (classification_delta_kernel, outputs.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('outputs', Annotation(outputs, 'i')),
                Parameter('targets', Annotation(targets, 'i')),
                Parameter('deltas', Annotation(deltas, 'o'))
            ],
            """
            ${outputs.ctype} out = ${outputs.load_same};
            SIZE_T t = ${targets.load_idx}(${idxs[0]});
            SIZE_T idx = ${idxs[1]};
            ${deltas.ctype} d;
            if (t == idx) {
                d = 1.0f - out;
            } else {
                d = -out;
            }
            ${deltas.store_same}(d);
            """,
            guiding_array='deltas')
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](outputs, targets, deltas)
def linear(context, activations, bias, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = activations

    key = (linear, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert activations.shape[1] == bias.shape[0]
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            ${activations.ctype} a = ${activations.load_same};
            ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});
            a += b;
            ${dest.store_same}(a);
            """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, bias, dest)

    return dest
def softplus(self, activations, bias, dest=None):
    kernel_cache, thread = self.kernel_cache, self.thread

    if dest is None:
        dest = activations

    key = (self.softplus, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert activations.shape[1] == bias.shape[0]
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            ${activations.ctype} a = ${activations.load_same};
            ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});
            a += b;
            a = min(max(-45.0f, a), 45.0f);
            a = log(1.0f + exp(a));
            ${dest.store_same}(a);
            """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, bias, dest)

    return dest
def nan_to_zeros(self, array, dest=None):
    kernel_cache, thread = self.kernel_cache, self.thread

    if dest is None:
        dest = array

    key = (self.nan_to_zeros, array.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('array', Annotation(array, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            ${array.ctype} a = ${array.load_same};
            if (isnan(a)) {
                ${dest.store_same}(0.0f);
            } else {
                // Copy the value through, so dest is fully defined even
                // when it is a different array than the input.
                ${dest.store_same}(a);
            }
            """,
            guiding_array='array')
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](array, dest)

    return dest
def sub(self, mat1, mat2, dest):
    """ Subtract mat2 from mat1.

    ATTENTION: if either value is NaN, the result will be zero.
    """
    kernel_cache = self.kernel_cache
    thread = self.thread

    key = (self.sub, mat1.dtype, mat1.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert mat1.shape == mat2.shape == dest.shape
        kernel_delta_output = PureParallel(
            [
                Parameter('mat1', Annotation(mat1, 'i')),
                Parameter('mat2', Annotation(mat2, 'i')),
                Parameter('dest', Annotation(dest, 'o'))
            ],
            """
            // Elementwise subtraction; NaN inputs yield zero
            ${mat1.ctype} m1 = ${mat1.load_same};
            ${mat2.ctype} m2 = ${mat2.load_same};
            if (isnan(m1) || isnan(m2)) {
                ${dest.store_same}(0.0f);
            } else {
                ${dest.ctype} d = m1 - m2;
                ${dest.store_same}(d);
            }
            """,
            guiding_array='dest')
        kernel_cache[key] = kernel_delta_output.compile(thread)

    kernel_cache[key](mat1, mat2, dest)
def add(self, mat1, mat2, dest):
    kernel_cache = self.kernel_cache
    thread = self.thread

    key = (self.add, mat1.dtype, mat1.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert mat1.shape == mat2.shape == dest.shape
        kernel_delta_output = PureParallel(
            [
                Parameter('mat1', Annotation(mat1, 'i')),
                Parameter('mat2', Annotation(mat2, 'i')),
                Parameter('dest', Annotation(dest, 'o'))
            ],
            """
            // Elementwise addition
            ${mat1.ctype} m1 = ${mat1.load_same};
            ${mat2.ctype} m2 = ${mat2.load_same};
            ${dest.ctype} d = m1 + m2;
            ${dest.store_same}(d);
            """,
            guiding_array='dest')
        kernel_cache[key] = kernel_delta_output.compile(thread)

    kernel_cache[key](mat1, mat2, dest)
def softplus_derivative(self, activations, delta, dest=None):
    kernel_cache, thread = self.kernel_cache, self.thread

    if dest is None:
        dest = delta

    key = (self.softplus_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('delta', Annotation(activations, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            ${activations.ctype} a = ${activations.load_same};
            ${delta.ctype} d = ${delta.load_same};

            // The softplus function has already been applied to the
            // activations, so we need to apply the inverse of softplus
            // chained with the logistic function.
            // Note: the logistic function is the derivative of softplus.
            a = min(max(-45.0f, a), 45.0f);
            a = 1.0f / (1.0f / (exp(a) - 1.0f) + 1.0f);
            d = d*a;
            ${dest.store_same}(d);
            """,
            guiding_array='activations')
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
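# A quick NumPy sanity check (illustrative, not part of the original module):
# since ``activations`` already holds softplus(x), the kernel recovers
# logistic(x) = d/dx softplus(x) via the inverse-softplus identity
# 1 / (1 / (exp(a) - 1) + 1) = 1 / (1 + exp(-x)).
def _check_softplus_derivative_identity():
    x = numpy.linspace(-5.0, 5.0, 101)
    a = numpy.log1p(numpy.exp(x))                         # softplus(x)
    logistic = 1.0 / (1.0 + numpy.exp(-x))                # derivative of softplus
    recovered = 1.0 / (1.0 / (numpy.exp(a) - 1.0) + 1.0)  # what the kernel computes
    assert numpy.allclose(recovered, logistic)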
def convolve2d_propagation(ctx, array, weights, dest):
    """ The output is the valid discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_propagation, weights.shape, array.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        channels, filters, owidth, oheight = (
            weights.shape[0], weights.shape[1], dest.shape[1], dest.shape[2])

        render_kwds = {
            'w0': weights.shape[2],
            'w1': weights.shape[3],
            'a0': array.shape[2],
            'a1': array.shape[3],
            'off0': int(weights.shape[2] - 1),
            'off1': int(weights.shape[3] - 1)
        }

        kernel_conv = PureParallel(
            [
                Parameter('array', Annotation(array, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('dest', Annotation(dest, 'o'))
            ],
            """
            // Array dimensions:
            //   array  : (number, channels, width, height)
            //   weights: (channels, filters, fwidth, fheight)
            //   dest   : (number, channels, filters, owidth, oheight)
            float a = 0.0f;
            SIZE_T x, y, i, j;
            const SIZE_T number = ${idxs[0]};
            const SIZE_T channel = ${idxs[1]};
            const SIZE_T filter = ${idxs[2]};
            const SIZE_T xout = ${idxs[3]};
            const SIZE_T yout = ${idxs[4]};

            for (i=0; i < ${w0}; i++){
                for (j=0; j < ${w1}; j++){
                    x = xout - i + ${off0};
                    y = yout - j + ${off1};
                    a += ${array.load_idx}(number, channel, x, y)
                         * ${weights.load_idx}(channel, filter, i, j);
                }
            }
            ${dest.store_same}(a);
            """,
            guiding_array='dest',
            render_kwds=render_kwds)
        kernel_cache[key] = kernel_conv.compile(thread, fast_math=True)

    # run convolution
    kernel_cache[key](array, weights, dest)

    return dest
def class_errors(ctx, expected, actual, errors):
    """ expected: int32, actual: float32, errors: int32 """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (class_errors, expected.shape)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        assert expected.shape == errors.shape
        # one error flag per example, one neuron per class
        assert expected.shape == (actual.shape[0], )
        # expected holds the index of the class and has to be an integer
        assert actual.dtype == numpy.float32
        assert expected.dtype == numpy.int32
        assert errors.dtype == numpy.int32

        kernel = PureParallel(
            [
                Parameter('expected', Annotation(expected, 'i')),
                Parameter('actual', Annotation(actual, 'i')),
                Parameter('errors', Annotation(errors, 'o'))
            ],
            """
            SIZE_T expected = ${expected.load_idx}(${idxs[0]});
            float maximum = 0.0f;
            float value;
            SIZE_T maxindex = 0;
            SIZE_T tl = ${target_length};

            // calculate argmax
            for(SIZE_T j=0; j < tl; j++) {
                value = ${actual.load_idx}(${idxs[0]}, j);
                if (value > maximum) {
                    maximum = value;
                    maxindex = j;
                }
            }

            // If the confidence is too low, count it as an error
            if (maximum < (1.0f / ${target_length}.0f + 0.001f)) {
                ${errors.store_same}(1);
                return;
            }

            // compare argmax with the expected class
            if (maxindex != expected) {
                ${errors.store_same}(1);
            } else {
                ${errors.store_same}(0);
            }
            """,
            guiding_array='expected',
            render_kwds={'target_length': numpy.int32(actual.shape[1])})
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](expected, actual, errors)
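# A NumPy reference for ``class_errors`` (illustrative helper, not part of the
# original module); it assumes non-negative (e.g. softmax) outputs, matching
# the kernel's initialisation of the running maximum with 0.
def _class_errors_reference(expected, actual):
    num_classes = actual.shape[1]
    confidences = actual.max(axis=1)
    predictions = actual.argmax(axis=1)
    # an example counts as an error if the winning confidence is not above
    # chance level, or if the predicted class differs from the expected one
    low_confidence = confidences < (1.0 / num_classes + 0.001)
    wrong = predictions != expected
    return (low_confidence | wrong).astype(numpy.int32)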
def _process_kernel_arguments(self, args):
    """
    Scan through kernel arguments passed by the user, check types,
    and wrap ad hoc values if necessary.

    Does not change the plan state.
    """
    processed_args = []
    adhoc_idgen = IdGen('_adhoc')
    adhoc_values = {}

    for arg in args:
        if not isinstance(arg, KernelArgument):
            if hasattr(arg, 'shape') and hasattr(arg, 'dtype'):
                if len(arg.shape) > 0:
                    raise ValueError("Arrays are not allowed as ad hoc arguments")

                # Not creating a new persistent scalar with _scalar(),
                # because the kernel compilation may fail,
                # in which case we would have to roll back the plan state.
                # These arguments are local to this kernel anyway,
                # so there is no need to register them in the plan.
                name = self._translator(adhoc_idgen())
                adhoc_values[name] = arg
                annotation = Annotation(Type(arg.dtype))
                arg = KernelArgument(name, annotation.type)
            else:
                raise TypeError("Unknown argument type: " + str(type(arg)))
        else:
            annotation = self._get_annotation(arg.name)

        processed_args.append(Parameter(arg.name, annotation))

    return processed_args, adhoc_values
def softmax(ctx, activations, bias, dest=None):
    """ Softmax activation function """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    if dest is None:
        dest = activations

    key = (softmax, activations.shape)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        kernel_softmax = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            float x;
            float b;
            float s = 0.0f;
            SIZE_T tl = ${target_length};

            // exponentiate the biased activations (clamped for stability)
            for(SIZE_T j=0; j < tl; j++) {
                x = ${activations.load_idx}(${idxs[0]}, j);
                b = ${bias.load_idx}(j);
                x += b;
                x = exp(min(max(x, -45.0f), 45.0f));
                ${dest.store_idx}(${idxs[0]}, j, x);
                s += x;
            }

            // divide by the sum
            for(SIZE_T j=0; j < tl; j++) {
                x = ${dest.load_idx}(${idxs[0]}, j);
                x /= s;
                ${dest.store_idx}(${idxs[0]}, j, x);
            }
            """,
            guiding_array=(activations.shape[0], ),
            render_kwds={'target_length': numpy.int32(activations.shape[1])})
        kernel_cache[key] = kernel_softmax.compile(thread)

    kernel_cache[key](activations, bias, dest)
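# A NumPy reference of the same computation (illustrative helper, not part of
# the original module), handy for checking the kernel output on small inputs.
def _softmax_reference(activations, bias):
    x = numpy.clip(activations + bias[numpy.newaxis, :], -45.0, 45.0)
    e = numpy.exp(x)
    return e / e.sum(axis=1, keepdims=True)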
def persistent_array(self, arr):
    """
    Adds a persistent GPU array to the plan, and returns the corresponding
    :py:class:`KernelArgument`.
    """
    name = self._translator(self._persistent_value_idgen())
    ann = Annotation(arr, 'i')
    self._internal_annotations[name] = ann
    self._persistent_values[name] = self._thread.to_device(arr)
    return KernelArgument(name, ann.type)
def _scalar(self, val):
    """
    Adds a persistent scalar to the plan, and returns the corresponding
    :py:class:`KernelArgument`.
    """
    name = self._translator(self._persistent_value_idgen())
    ann = Annotation(val)
    self._internal_annotations[name] = ann
    self._persistent_values[name] = ann.type(val)
    return KernelArgument(name, ann.type)
def _connect(self, ntr):
    # At this point we assume that ``ntr`` describes a valid connection.
    # All sanity checks are performed in ``connect()``.

    for tr_param in ntr.trf.signature.parameters.values():
        node_name = ntr.node_from_tr[tr_param.name]

        if node_name == ntr.connector_node_name:
            ann = self.leaf_parameters[node_name].annotation
            if ann.input and ann.output:
                # splitting the 'io' leaf
                updated_role = 'i' if ntr.output else 'o'

                # Since it is an array parameter, we do not need to worry
                # about preserving the default value (it can't have one).
                self.leaf_parameters[node_name] = Parameter(
                    node_name, Annotation(ann.type, role=updated_role))
            else:
                # 'i' or 'o' leaf is hidden by the transformation
                del self.leaf_parameters[node_name]
        else:
            if (node_name in self.leaf_parameters
                    and self.leaf_parameters[node_name].annotation.array):
                ann = self.leaf_parameters[node_name].annotation
                if (ann.input and ntr.output) or (ann.output and not ntr.output):
                    # Joining 'i' and 'o' paths into an 'io' leaf.
                    # Since it is an array parameter, we do not need to worry
                    # about preserving the default value (it can't have one).
                    self.leaf_parameters[node_name] = Parameter(
                        node_name, Annotation(ann.type, role='io'))
            else:
                self.leaf_parameters[node_name] = tr_param.rename(node_name)

        if node_name not in self.nodes:
            self.nodes[node_name] = Node()

    self.nodes[ntr.connector_node_name] = self.nodes[
        ntr.connector_node_name].connect(ntr)
def temp_array(self, shape, dtype, strides=None):
    """
    Adds a temporary GPU array to the plan, and returns the corresponding
    :py:class:`KernelArgument`.

    Temporary arrays can share physical memory, but in such a way that
    their contents are guaranteed to persist between the first and the last
    use in a kernel during the execution of the plan.
    """
    name = self._translator(self._temp_array_idgen())
    ann = Annotation(Type(dtype, shape=shape, strides=strides), 'io')
    self._internal_annotations[name] = ann
    self._temp_arrays.add(name)
    return KernelArgument(name, ann.type)
def dropout(ctx, mat, rand, probability):
    kernel_cache = ctx.kernel_cache
    probability = numpy.float32(probability)
    thread = ctx.thread

    key = (dropout, mat.dtype, mat.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('mat', Annotation(mat, 'o')),
                Parameter('rand', Annotation(mat, 'i')),
                Parameter('probability', Annotation(probability))
            ],
            """
            ${rand.ctype} r = ${rand.load_same};
            if (r < ${probability}) {
                ${mat.store_same}(0.0f);
            }
            """,
            guiding_array='mat')
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](mat, rand, probability)
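# Usage sketch for ``dropout`` (an assumption, not part of the original
# module): ``rand`` is expected to hold uniform [0, 1) samples with the same
# shape and dtype as ``mat``; entries whose draw falls below ``probability``
# are zeroed in place.
def _example_dropout(ctx, mat, probability=0.5):
    rand_host = numpy.random.uniform(0.0, 1.0, mat.shape).astype(mat.dtype)
    rand = ctx.thread.to_device(rand_host)
    dropout(ctx, mat, rand, probability)
    return mat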
def scale(self, mat, scalar):
    kernel_cache = self.kernel_cache
    scalar = numpy.float32(scalar)
    thread = self.thread

    key = (self.scale, mat.dtype, mat.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('mat', Annotation(mat, 'io')),
                Parameter('scalar', Annotation(scalar))
            ],
            """
            // Scale the matrix in place
            ${mat.ctype} m = ${mat.load_same};
            ${mat.ctype} s = ${scalar};
            m *= s;
            ${mat.store_same}(m);
            """,
            guiding_array='mat')
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](mat, scalar)
def copy_minibatch(self, array, indices, minibatch):
    kernel_cache, thread = self.kernel_cache, self.thread

    key = (self.copy_minibatch, minibatch.dtype, minibatch.shape, array.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert minibatch.shape[0] == indices.shape[0]
        assert indices.dtype == numpy.int32
        dimensions = numpy.int32(len(array.shape))

        kernel = PureParallel(
            [
                Parameter('array', Annotation(array, 'i')),
                Parameter('indices', Annotation(indices, 'i')),
                Parameter('minibatch', Annotation(minibatch, 'o'))
            ],
            """
            SIZE_T idx = ${indices.load_idx}(${idxs[0]});
            %if dimensions == 2:
            ${minibatch.store_same}(${array.load_idx}(idx, ${idxs[1]}));
            %elif dimensions == 3:
            ${minibatch.store_same}(${array.load_idx}(idx, ${idxs[1]}, ${idxs[2]}));
            %else:
            ${minibatch.store_same}(${array.load_idx}(idx));
            %endif
            """,
            guiding_array='minibatch',
            render_kwds=dict(dimensions=dimensions))

        log.info(array.shape)
        log.info(indices.shape)
        log.info(minibatch.shape)
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](array, indices, minibatch)
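# Usage sketch for ``copy_minibatch`` (illustrative; ``ctx`` is assumed to be
# the same kind of object as ``self`` above, exposing ``kernel_cache`` and
# ``thread``): gather ``batch_size`` randomly chosen rows of ``dataset`` into
# a preallocated device array.
def _example_copy_minibatch(ctx, dataset, batch_size=128):
    idx_host = numpy.random.randint(0, dataset.shape[0], batch_size).astype(numpy.int32)
    indices = ctx.thread.to_device(idx_host)
    minibatch = ctx.thread.array((batch_size,) + dataset.shape[1:], dataset.dtype)
    copy_minibatch(ctx, dataset, indices, minibatch)
    return minibatch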
def lwta(ctx, mat, lwta_size):
    kernel_cache = ctx.kernel_cache
    lwta_size = numpy.int32(lwta_size)
    thread = ctx.thread

    key = (lwta, mat.dtype, mat.shape, lwta_size)
    if key not in kernel_cache:
        num_units = mat.shape[1]
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [Parameter('mat', Annotation(mat, 'io'))],
            """
            SIZE_T this_idx = ${idxs[1]};
            SIZE_T group_size = ${lwta_size};

            // only the first thread per group computes anything
            if (this_idx % group_size == 0) {
                SIZE_T argmax = ${idxs[1]};
                SIZE_T candidate_idx;
                ${mat.ctype} ma = ${mat.load_same};
                ${mat.ctype} candidate_value;

                // first pass: find the argmax within the group
                for (SIZE_T i=1; i < group_size; i++) {
                    candidate_idx = this_idx + i;
                    if (candidate_idx >= ${num_units}) break;
                    candidate_value = ${mat.load_idx}(${idxs[0]}, candidate_idx);
                    if ( candidate_value > ma) {
                        ma = candidate_value;
                        argmax = candidate_idx;
                    }
                }

                // second pass: zero all except argmax
                for (SIZE_T i=0; i < group_size; i++) {
                    candidate_idx = this_idx + i;
                    if (candidate_idx >= ${num_units}) break;
                    if ( candidate_idx != argmax ) {
                        ${mat.store_idx}(${idxs[0]}, candidate_idx, 0.0f);
                    }
                }
            }
            """,
            guiding_array='mat',
            render_kwds=dict(lwta_size=lwta_size, num_units=num_units))
        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](mat)
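# A NumPy reference for ``lwta`` (illustrative helper, not part of the original
# module), operating on a host ndarray: within every consecutive group of
# ``lwta_size`` columns of a row, only the maximum survives and the other
# entries are set to zero.
def _lwta_reference(mat, lwta_size):
    out = numpy.array(mat, dtype=numpy.float32, copy=True)
    for start in range(0, out.shape[1], lwta_size):
        block = out[:, start:start + lwta_size]
        winners = block.argmax(axis=1)
        mask = numpy.zeros_like(block)
        mask[numpy.arange(block.shape[0]), winners] = 1.0
        out[:, start:start + lwta_size] = block * mask
    return out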
def __init__(self, root_parameters):
    # Preserve order of initial root parameters.
    # These can repeat.
    self.root_names = []

    # Keeping whole parameters, because we want to preserve the default values (if any).
    self.root_parameters = {}

    self.nodes = {}  # all nodes of the tree
    self.leaf_parameters = {}  # nodes available for connection

    for param in root_parameters:
        self.root_names.append(param.name)

        if (param.name in self.root_parameters
                and param != self.root_parameters[param.name]):
            # Could be an 'io' parameter used for separate 'i' and 'o' parameters
            # in a nested computation.
            # Need to check types and merge.
            new_ann = param.annotation
            old_param = self.root_parameters[param.name]
            old_ann = old_param.annotation

            # FIXME: Not sure when these can be raised
            assert old_ann.type == new_ann.type
            assert old_param.default == param.default

            # Given the old_param != param, the only possible combinations of roles are
            # 'i' and 'o', 'i' and 'io', 'o' and 'io'.
            # In all cases the resulting role is 'io'.
            new_param = Parameter(
                param.name, Annotation(new_ann.type, 'io'), default=param.default)
            self.root_parameters[param.name] = new_param
            self.leaf_parameters[param.name] = new_param
        else:
            self.nodes[param.name] = Node()
            self.root_parameters[param.name] = param
            self.leaf_parameters[param.name] = param
def convolve2d_gradient(ctx, prev_deltas, deltas, gradient_intermediate):
    """ The output is the full discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_gradient, prev_deltas.shape, deltas.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        # Extract shapes from the arrays
        n, channels, p_width, p_height = prev_deltas.shape
        n_1, filters, d_width, d_height = deltas.shape
        n, d_width_1, d_height_1, channels_1, filters_1, f_width, f_height = \
            gradient_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(prev_deltas, deltas, 'gradient')
        assert expected_shape == gradient_intermediate.shape
        assert d_width_1 == d_width
        assert d_height_1 == d_height

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('prev_deltas', Annotation(prev_deltas, 'i')),
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('gradient_intermediate',
                          Annotation(gradient_intermediate, 'o'))
            ],
            """
            const SIZE_T number = ${idxs[0]};
            const SIZE_T dx = ${idxs[1]};
            const SIZE_T dy = ${idxs[2]};
            const SIZE_T channel = ${idxs[3]};
            const SIZE_T filter = ${idxs[4]};
            const SIZE_T fx = ${idxs[5]};
            const SIZE_T fy = ${idxs[6]};

            // The weight gradient at weight position (fx, fy) is defined by the sum
            //
            //     (deltas * prev_deltas[fx:d_width+fx, fy:fy+d_height]).sum()
            //
            // Alternatively, we can store all delta positions and sum in a
            // separate kernel - this is what we do now.
            float g = ${deltas.load_idx}(number, filter, dx, dy)
                      * ${prev_deltas.load_idx}(number, channel, dx+fx, dy+fy);

            ${gradient_intermediate.store_same}(g);
            """,
            guiding_array='gradient_intermediate',
            render_kwds=render_kwds)
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](prev_deltas, deltas, gradient_intermediate)

    return gradient_intermediate
def convolve2d_backprop(ctx, deltas, weights, deltas_intermediate):
    """ The output is the full discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_backprop, deltas.shape, weights.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        # Extract shapes from the arrays
        channels, filters, f_width, f_height = weights.shape
        n_1, filters_1, d_width, d_height = deltas.shape
        n, channels_1, filters_2, p_width, p_height = deltas_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_2 == filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(deltas, weights, 'backprop')
        assert expected_shape == deltas_intermediate.shape

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('deltas_intermediate',
                          Annotation(deltas_intermediate, 'o'))
            ],
            """
            float d = 0.0f;
            SIZE_T x, y, i, j, fi, fj;
            const SIZE_T number = ${idxs[0]};
            const SIZE_T channel = ${idxs[1]};
            const SIZE_T filter = ${idxs[2]};
            const SIZE_T xout = ${idxs[3]};
            const SIZE_T yout = ${idxs[4]};

            for (i=0; i < ${f_width}; i++){
                for (j=0; j < ${f_height}; j++){
                    x = xout - i;
                    if (x < 0) continue;
                    if (x >= ${d_width}) continue;
                    y = yout - j;
                    if (y < 0) continue;
                    if (y >= ${d_height}) continue;

                    // access the weights in flipped order!
                    fi = ${f_width} - i - 1;
                    fj = ${f_height} - j - 1;
                    d += ${deltas.load_idx}(number, channel, x, y)
                         * ${weights.load_idx}(channel, filter, fi, fj);
                }
            }
            ${deltas_intermediate.store_same}(d);
            """,
            guiding_array='deltas_intermediate',
            render_kwds=render_kwds)
        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](deltas, weights, deltas_intermediate)

    return deltas_intermediate
def sarprop_kernel(ctx, weights, gradient, last_gradient, step_sizes, noise, parameters):
    """ SARPROP update kernel """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    assert weights.shape == gradient.shape == last_gradient.shape == step_sizes.shape

    key = (sarprop_kernel, weights.shape, thread._context) + tuple(parameters.values())
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('weights', Annotation(weights, 'io')),
                Parameter('gradient', Annotation(gradient, 'i')),
                Parameter('last_gradient', Annotation(last_gradient, 'io')),
                Parameter('step_sizes', Annotation(step_sizes, 'io')),
                Parameter('noise', Annotation(step_sizes, 'i'))
            ],
            """
            ${weights.ctype} w = ${weights.load_same};
            ${gradient.ctype} g = ${gradient.load_same};
            ${last_gradient.ctype} lg = ${last_gradient.load_same};
            ${step_sizes.ctype} s = ${step_sizes.load_same};
            ${noise.ctype} n = ${noise.load_same};
            n = fabs(n);

            // Adapt step size
            if (g * lg > 0.0f) {
                s = min(${reward_factor}f * s, ${max_step_size}f);

                // Apply update
                if (g < 0.0f) {
                    w = w - s*n;
                }
                if (g > 0.0f) {
                    w = w + s*n;
                }
            } else {
                // punish step size
                s = max(${punish_factor}f * s, ${min_step_size}f);
            }

            // If l1 weight decay is greater than zero, apply it
            % if l1_decay > 0.0:
            if (w > 0.0f) {
                w = max(0.0f, w - ${l1_decay}f);
            }
            if (w < 0.0f) {
                w = min(0.0f, w + ${l1_decay}f);
            }
            % endif

            // If l2 weight decay is greater than zero, apply it
            % if l2_decay > 0.0:
            w *= ${1.0 - l2_decay}f;
            % endif

            // Save last gradient
            lg = g;

            ${weights.store_same}(w);
            ${last_gradient.store_same}(lg);
            ${step_sizes.store_same}(s);
            """,
            guiding_array='weights',
            render_kwds=parameters)
        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](weights, gradient, last_gradient, step_sizes, noise)
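# The ``parameters`` mapping is rendered straight into the kernel source, so it
# must provide every name used in the template; an ordered mapping keeps the
# cache key stable. The values below are illustrative assumptions, not defaults
# taken from the original code.
from collections import OrderedDict

SARPROP_PARAMETERS_EXAMPLE = OrderedDict([
    ('reward_factor', 1.2),
    ('punish_factor', 0.5),
    ('min_step_size', 1e-6),
    ('max_step_size', 1.0),
    ('l1_decay', 0.0),
    ('l2_decay', 0.0),
])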