# Collected reikna GPU kernels.
# NOTE: the imports below are a best-effort reconstruction for reikna >= 0.7;
# FFTFactory, get_output_shape, get_test_array_like and diff_is_negligible are
# project-local helpers assumed to be provided elsewhere.
import logging

import numpy
import numpy as np

from reikna.algorithms import PureParallel
from reikna.cluda import functions
from reikna.core import Annotation, Parameter, Transformation, Type

log = logging.getLogger(__name__)


def classification_delta_kernel(ctx, outputs, targets, deltas):
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    assert outputs.shape[0] == targets.shape[0] == deltas.shape[0]
    assert len(targets.shape) == 1
    assert targets.dtype == numpy.int32
    assert outputs.shape[1] == deltas.shape[1]

    key = (classification_delta_kernel, outputs.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('outputs', Annotation(outputs, 'i')),
                Parameter('targets', Annotation(targets, 'i')),
                Parameter('deltas', Annotation(deltas, 'o'))
            ],
            """
            ${outputs.ctype} out = ${outputs.load_same};
            SIZE_T t = ${targets.load_idx}(${idxs[0]});
            SIZE_T idx = ${idxs[1]};
            ${deltas.ctype} d;
            if (t == idx) {
                d = 1.0f - out;
            } else {
                d = -out;
            }
            ${deltas.store_same}(d);
            """,
            guiding_array='deltas')

        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](outputs, targets, deltas)
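# A minimal driver sketch: the functions in this file expect a `ctx` object
# carrying a reikna Thread and a dict used as a kernel cache. The
# SimpleNamespace below is purely illustrative (not part of the original
# code); it runs classification_delta_kernel and checks the result against a
# NumPy reference. Later examples in this file reuse `ctx` and `thr`.
from types import SimpleNamespace

from reikna.cluda import any_api

thr = any_api().Thread.create()
ctx = SimpleNamespace(kernel_cache={}, thread=thr)

n, classes = 8, 10
outputs = thr.to_device(numpy.random.rand(n, classes).astype(numpy.float32))
targets = thr.to_device(numpy.random.randint(0, classes, n).astype(numpy.int32))
deltas = thr.empty_like(outputs)

classification_delta_kernel(ctx, outputs, targets, deltas)

# Reference: delta = onehot(target) - output
ref = -outputs.get()
ref[numpy.arange(n), targets.get()] += 1.0
assert numpy.allclose(deltas.get(), ref, atol=1e-6)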
def logistic_derivative(context, activations, delta, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = delta

    key = (logistic_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                # was annotated with `activations`; `delta` has the same
                # shape and dtype, so the annotation is taken from it directly
                Parameter('delta', Annotation(delta, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            ${activations.ctype} a = ${activations.load_same};
            ${delta.ctype} d = ${delta.load_same};
            d = d*a*(1.0f - a);
            ${dest.store_same}(d);
            """,
            guiding_array='activations')

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
def logistic(context, activations, bias, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = activations

    key = (logistic, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert activations.shape[1] == bias.shape[0]
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            ${activations.ctype} a = ${activations.load_same};
            ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});
            a += b;
            a = min(max(-45.0f, a), 45.0f);
            a = 1.0f / (1.0f + exp(-a));
            ${dest.store_same}(a);
            """,
            guiding_array='activations')

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, bias, dest)
    return dest
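# Hedged reference check for `logistic` (reuses the illustrative ctx/thr from
# above): the kernel should match the NumPy sigmoid of (a + b) with the same
# +/-45 clamp. Note dest defaults to activations, so the call is in-place.
acts = numpy.random.randn(4, 6).astype(numpy.float32)
bias = numpy.random.randn(6).astype(numpy.float32)
acts_dev = thr.to_device(acts)

out = logistic(ctx, acts_dev, thr.to_device(bias)).get()
z = numpy.clip(acts + bias, -45.0, 45.0)
assert numpy.allclose(out, 1.0 / (1.0 + numpy.exp(-z)), atol=1e-6)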
def test_guiding_output(thr):
    N = 1000
    dtype = numpy.float32

    p = PureParallel(
        [
            Parameter('output', Annotation(Type(dtype, shape=N), 'o')),
            Parameter('input', Annotation(Type(dtype, shape=(2, N)), 'i'))
        ],
        """
        float t1 = ${input.load_idx}(0, ${idxs[0]});
        float t2 = ${input.load_idx}(1, ${idxs[0]});
        ${output.store_idx}(${idxs[0]}, t1 + t2);
        """,
        guiding_array='output')

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev)

    res_ref = a[0] + a[1]
    assert diff_is_negligible(res_dev.get(), res_ref)
def class_errors(ctx, expected, actual, errors):
    """ expected int32, actual float, errors int32 """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (class_errors, expected.shape)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        assert expected.shape == errors.shape        # one error flag per sample
        assert expected.shape == (actual.shape[0],)  # `expected` holds the index of the class
        assert actual.dtype == numpy.float32
        assert expected.dtype == numpy.int32         # target should be an integer
        assert errors.dtype == numpy.int32

        kernel = PureParallel(
            [
                Parameter('expected', Annotation(expected, 'i')),
                Parameter('actual', Annotation(actual, 'i')),
                Parameter('errors', Annotation(errors, 'o'))
            ],
            """
            SIZE_T expected = ${expected.load_idx}(${idxs[0]});
            float maximum = 0.0f;
            float value;
            SIZE_T maxindex = 0;
            SIZE_T tl = ${target_length};

            // calculate argmax
            for (SIZE_T j = 0; j < tl; j++) {
                value = ${actual.load_idx}(${idxs[0]}, j);
                if (value > maximum) {
                    maximum = value;
                    maxindex = j;
                }
            }

            // If the confidence is too low, count the sample as an error
            if (maximum < (1.0f / ${target_length}.0f + 0.001f)) {
                ${errors.store_same}(1);
                return;
            }

            // compare argmax with the expected class
            if (maxindex != expected) {
                ${errors.store_same}(1);
            } else {
                ${errors.store_same}(0);
            }
            """,
            guiding_array='expected',
            render_kwds={'target_length': numpy.int32(actual.shape[1])})

        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](expected, actual, errors)
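# Hedged NumPy reference for `class_errors` (same illustrative ctx/thr as
# above): a row counts as an error when its argmax disagrees with the expected
# class, or when the winning activation is not above chance level.
actual = numpy.random.rand(16, 10).astype(numpy.float32)
expected = numpy.random.randint(0, 10, 16).astype(numpy.int32)
errors_dev = thr.to_device(numpy.zeros(16, numpy.int32))

class_errors(ctx, thr.to_device(expected), thr.to_device(actual), errors_dev)

low_conf = actual.max(axis=1) < (1.0 / 10 + 0.001)
err_ref = (actual.argmax(axis=1) != expected) | low_conf
assert (errors_dev.get() == err_ref.astype(numpy.int32)).all()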
def convolve2d_propagation(ctx, array, weights, dest):
    """ The output is the valid discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_propagation, weights.shape, array.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        render_kwds = {
            'w0': weights.shape[2],
            'w1': weights.shape[3],
            'a0': array.shape[2],
            'a1': array.shape[3],
            'off0': int(weights.shape[2] - 1),
            'off1': int(weights.shape[3] - 1)
        }
        kernel_conv = PureParallel(
            [
                Parameter('array', Annotation(array, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('dest', Annotation(dest, 'o'))
            ],
            """
            // Array dimensions:
            //   array  : (number, channels, width, height)
            //   weights: (channels, filters, fwidth, fheight)
            //   dest   : (number, channels, filters, owidth, oheight)
            float a = 0.0f;
            SIZE_T x, y, i, j;
            const SIZE_T number = ${idxs[0]};
            const SIZE_T channel = ${idxs[1]};
            const SIZE_T filter = ${idxs[2]};
            const SIZE_T xout = ${idxs[3]};
            const SIZE_T yout = ${idxs[4]};
            for (i = 0; i < ${w0}; i++){
                for (j = 0; j < ${w1}; j++){
                    // offset indexing flips the filter, making this a
                    // convolution rather than a correlation
                    x = xout - i + ${off0};
                    y = yout - j + ${off1};
                    a += ${array.load_idx}(number, channel, x, y)
                       * ${weights.load_idx}(channel, filter, i, j);
                }
            }
            ${dest.store_same}(a);
            """,
            guiding_array='dest',
            render_kwds=render_kwds)

        kernel_cache[key] = kernel_conv.compile(thread, fast_math=True)

    # run convolution
    kernel_cache[key](array, weights, dest)
    return dest
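# Hedged SciPy reference check for convolve2d_propagation (reuses the
# illustrative ctx/thr from above; scipy is not used by the original code).
# Each (number, channel, filter) slice should equal the 'valid' 2D convolution
# of the corresponding array and weight slices.
from scipy.signal import convolve2d

arr = numpy.random.rand(2, 3, 8, 8).astype(numpy.float32)  # (number, channels, w, h)
wts = numpy.random.rand(3, 4, 3, 3).astype(numpy.float32)  # (channels, filters, fw, fh)
dst = thr.array((2, 3, 4, 6, 6), numpy.float32)            # (number, channels, filters, ow, oh)

convolve2d_propagation(ctx, thr.to_device(arr), thr.to_device(wts), dst)

res = dst.get()
for num in range(2):
    for c in range(3):
        for f in range(4):
            conv_ref = convolve2d(arr[num, c], wts[c, f], mode='valid')
            assert numpy.allclose(res[num, c, f], conv_ref, atol=1e-4)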
def softmax(ctx, activations, bias, dest=None):
    """ Softmax Activation Function """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    if dest is None:
        dest = activations

    key = (softmax, activations.shape)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        # One thread per sample; each thread normalizes its whole row.
        kernel_softmax = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            float x;
            float b;
            float s = 0.0f;
            SIZE_T tl = ${target_length};

            // exponentiate (with clamping) and accumulate the row sum
            for (SIZE_T j = 0; j < tl; j++) {
                x = ${activations.load_idx}(${idxs[0]}, j);
                b = ${bias.load_idx}(j);
                x += b;
                x = exp(min(max(x, -45.0f), 45.0f));
                ${dest.store_idx}(${idxs[0]}, j, x);
                s += x;
            }

            // divide by sum
            for (SIZE_T j = 0; j < tl; j++) {
                x = ${dest.load_idx}(${idxs[0]}, j);
                x /= s;
                ${dest.store_idx}(${idxs[0]}, j, x);
            }
            """,
            guiding_array=(activations.shape[0],),
            render_kwds={'target_length': numpy.int32(activations.shape[1])})

        kernel_cache[key] = kernel_softmax.compile(thread)

    kernel_cache[key](activations, bias, dest)
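# Hedged NumPy reference for `softmax` (same illustrative setup as above).
# The kernel normalizes exp-clamped logits, so the comparison applies the
# same +/-45 clamp. With dest omitted, the call overwrites activations.
sm_acts = numpy.random.randn(5, 7).astype(numpy.float32)
sm_bias = numpy.random.randn(7).astype(numpy.float32)
sm_dev = thr.to_device(sm_acts)

softmax(ctx, sm_dev, thr.to_device(sm_bias))

e = numpy.exp(numpy.clip(sm_acts + sm_bias, -45.0, 45.0))
assert numpy.allclose(sm_dev.get(), e / e.sum(axis=1, keepdims=True), atol=1e-5)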
def test_zero_length_shape(thr):
    dtype = numpy.float32

    p = PureParallel(
        [
            Parameter('output', Annotation(Type(dtype, shape=tuple()), 'o')),
            Parameter('input', Annotation(Type(dtype, shape=tuple()), 'i'))
        ],
        """
        float t = ${input.load_idx}();
        ${output.store_idx}(t * 2);
        """,
        guiding_array=tuple())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev)

    res_ref = (a * 2).astype(dtype)
    assert diff_is_negligible(res_dev.get(), res_ref)
def get_procs(thr, N):
    fft = FFTFactory.create(thr, (N,), compile_=False)

    unimod_trans = Transformation(
        [
            Parameter('output', Annotation(Type(np.complex128, N), 'o')),
            Parameter('input', Annotation(Type(np.complex128, N), 'i'))
        ],
        """
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_same};
        if (idx > ${N}/2) {
            val.x = 0.0;
            val.y = 0.0;
            ${output.store_same}(val);
        } else
            ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        """,
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float64), N=N))
    fft.parameter.output.connect(unimod_trans, unimod_trans.input,
                                 uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)

    mag_square = PureParallel(
        [
            Parameter('output', Annotation(Type(np.complex128, N), 'o')),
            Parameter('input', Annotation(Type(np.complex128, N), 'i'))
        ],
        '''
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_idx}(idx);
        val.x = val.x*val.x + val.y*val.y;
        val.y = 0;
        ${output.store_idx}(idx, val);
        ''')
    mag_square = mag_square.compile(thr)

    apply_mask = PureParallel(
        [
            Parameter('output', Annotation(Type(np.complex128, N), 'o')),
            Parameter('origin', Annotation(Type(np.complex128, N), 'i')),
            Parameter('mask', Annotation(Type(np.double, N), 'i'))
        ],
        '''
        VSIZE_T idx = ${idxs[0]};
        ${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx)));
        ''',
        render_kwds=dict(mul=functions.mul(np.complex128, np.double)))
    apply_mask = apply_mask.compile(thr)

    combine_mag_phi = PureParallel(
        [
            Parameter('output', Annotation(Type(np.complex128, N), 'o')),
            Parameter('mag_square', Annotation(Type(np.complex128, N), 'i')),
            Parameter('phase', Annotation(Type(np.complex128, N), 'i'))
        ],
        '''
        VSIZE_T idx = ${idxs[0]};
        double r = ${mag_square.load_idx}(idx).x;
        r = r < 0.0 ? 0.0 : ${pow}(r, 0.5);
        double2 v = ${phase.load_idx}(idx);
        double angle = atan2(v.y, v.x);
        ${output.store_idx}(idx, ${polar}(r, angle));
        ''',
        render_kwds=dict(pow=functions.pow(np.double),
                         polar=functions.polar(np.double)))
    combine_mag_phi = combine_mag_phi.compile(thr)

    return fft_unimod, mag_square, apply_mask, combine_mag_phi
def convolve2d_gradient(ctx, prev_deltas, deltas, gradient_intermediate):
    """ Stores the per-position products of the weight gradient; the sum over
    delta positions is performed by a separate reduction kernel. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_gradient, prev_deltas.shape, deltas.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        # Extract shapes from the arrays
        n, channels, p_width, p_height = prev_deltas.shape
        n_1, filters, d_width, d_height = deltas.shape
        (n, d_width_1, d_height_1, channels_1, filters_1,
         f_width, f_height) = gradient_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(prev_deltas, deltas, 'gradient')
        assert expected_shape == gradient_intermediate.shape
        assert d_width_1 == d_width
        assert d_height_1 == d_height

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('prev_deltas', Annotation(prev_deltas, 'i')),
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('gradient_intermediate',
                          Annotation(gradient_intermediate, 'o'))
            ],
            """
            const SIZE_T number = ${idxs[0]};
            const SIZE_T dx = ${idxs[1]};
            const SIZE_T dy = ${idxs[2]};
            const SIZE_T channel = ${idxs[3]};
            const SIZE_T filter = ${idxs[4]};
            const SIZE_T fx = ${idxs[5]};
            const SIZE_T fy = ${idxs[6]};

            // The weight gradient at filter position (fx, fy) is the sum
            //
            //   (deltas * prev_deltas[fx:fx+d_width, fy:fy+d_height]).sum()
            //
            // Instead of summing here, we store the product for every delta
            // position and reduce in a separate kernel.
            float g = ${deltas.load_idx}(number, filter, dx, dy)
                    * ${prev_deltas.load_idx}(number, channel, dx+fx, dy+fy);
            ${gradient_intermediate.store_same}(g);
            """,
            guiding_array='gradient_intermediate',
            render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](prev_deltas, deltas, gradient_intermediate)
    return gradient_intermediate
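# Hedged NumPy sketch of what convolve2d_gradient stores (illustrative only,
# not part of the original code): every (dx, dy) contribution is kept, and
# the actual weight gradient would then be the sum of this intermediate over
# the number, dx and dy axes.
def gradient_intermediate_ref(prev_deltas, deltas, f_width, f_height):
    num, chans, _, _ = prev_deltas.shape
    _, filts, d_w, d_h = deltas.shape
    out = numpy.zeros((num, d_w, d_h, chans, filts, f_width, f_height),
                      numpy.float32)
    for fx in range(f_width):
        for fy in range(f_height):
            window = prev_deltas[:, :, fx:fx + d_w, fy:fy + d_h]
            # out[n, dx, dy, c, f, fx, fy] = deltas[n, f, dx, dy] * window[n, c, dx, dy]
            out[..., fx, fy] = numpy.einsum('nfxy,ncxy->nxycf', deltas, window)
    return out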
def convolve2d_backprop(ctx, deltas, weights, deltas_intermediate):
    """ The output is the full discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_backprop, deltas.shape, weights.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        # Extract shapes from the arrays
        channels, filters, f_width, f_height = weights.shape
        n_1, filters_1, d_width, d_height = deltas.shape
        n, channels_1, filters_2, p_width, p_height = deltas_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_2 == filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(deltas, weights, 'backprop')
        assert expected_shape == deltas_intermediate.shape

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('deltas_intermediate',
                          Annotation(deltas_intermediate, 'o'))
            ],
            """
            float d = 0.0f;
            SIZE_T x, y, i, j, fi, fj;
            const SIZE_T number = ${idxs[0]};
            const SIZE_T channel = ${idxs[1]};
            const SIZE_T filter = ${idxs[2]};
            const SIZE_T xout = ${idxs[3]};
            const SIZE_T yout = ${idxs[4]};
            for (i = 0; i < ${f_width}; i++){
                for (j = 0; j < ${f_height}; j++){
                    x = xout - i;
                    if (x < 0) continue;
                    if (x >= ${d_width}) continue;
                    y = yout - j;
                    if (y < 0) continue;
                    if (y >= ${d_height}) continue;
                    // access weights in flipped order!
                    fi = ${f_width} - i - 1;
                    fj = ${f_height} - j - 1;
                    // deltas live on the filter axis (shape: n, filters, ...),
                    // so they are indexed by `filter`, not `channel`
                    d += ${deltas.load_idx}(number, filter, x, y)
                       * ${weights.load_idx}(channel, filter, fi, fj);
                }
            }
            ${deltas_intermediate.store_same}(d);
            """,
            guiding_array='deltas_intermediate',
            render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](deltas, weights, deltas_intermediate)
    return deltas_intermediate
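# Hedged SciPy reference for convolve2d_backprop (illustrative; reuses the
# scipy import from the propagation check above and assumes the project-local
# get_output_shape helper is available): each (channel, filter) slice of the
# intermediate should be the full convolution of that filter's deltas with the
# flipped kernel.
bp_d = numpy.random.rand(1, 2, 4, 4).astype(numpy.float32)  # (n, filters, dw, dh)
bp_w = numpy.random.rand(3, 2, 3, 3).astype(numpy.float32)  # (channels, filters, fw, fh)
bp_inter = thr.array((1, 3, 2, 6, 6), numpy.float32)        # (n, channels, filters, pw, ph)

convolve2d_backprop(ctx, thr.to_device(bp_d), thr.to_device(bp_w), bp_inter)

bp_res = bp_inter.get()
for c in range(3):
    for f in range(2):
        bp_ref = convolve2d(bp_d[0, f], bp_w[c, f][::-1, ::-1], mode='full')
        assert numpy.allclose(bp_res[0, c, f], bp_ref, atol=1e-4)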