def __init__(self, input_shape : nc.TensorShape, tiles, is_add_to_output): self.info = info = nc.info.InfoTile(input_shape, tiles) self.forward_krn = nc.CLKernel(global_shape=(info.output_shape.size,), kernel_text=f""" {ph.define_axes_accessor('I', input_shape)} {ph.define_axes_accessor('O', info.output_shape)} __kernel void impl(__global float* O, __global const float* I) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')} O[gid] {'+=' if is_add_to_output else '='} I[I_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})]; }} """) self.backward_krn = nc.CLKernel(global_shape=(input_shape.size,), kernel_text=f""" {ph.define_axes_accessor('I', input_shape)} {ph.define_axes_accessor('O', info.output_shape)} __kernel void impl(__global float* dI, __global const float* dO ,{','.join([ f' long i{i}_offset' for i in range(input_shape.rank) ]) } ) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')} {';'.join([ f'i{i} += i{i}_offset' for i in range(input_shape.rank) ]) }; dI[gid] += dO[O_idx({ph.axes_seq_enum('i', input_shape.rank)})]; }} """)
def __init__(self, ElementWiseOpKernel_cls, ElementWiseOpKernel_args, input_shape, is_add_to_output): self.output_shape = input_shape self.kernel = ElementWiseOpKernel_cls(*ElementWiseOpKernel_args) self.forward_krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" __kernel void impl(__global float* O_t, __global const float* I_t) {{ size_t idx = get_global_id(0); float I = I_t[idx]; float O = 0.0; {self.kernel.get_forward_kernel_text()} O_t[idx] {'+=' if is_add_to_output else '='} O; }} """) self.backward_krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" __kernel void impl(__global float* dI_t, __global const float* I_t, __global const float* O_t, __global const float* dO_t) {{ size_t idx = get_global_id(0); float I = I_t[idx]; float O = O_t[idx]; float dO = dO_t[idx]; float dI = 0.0; {self.kernel.get_backward_kernel_text()} dI_t[idx] += dI; }} """)
def __init__(self, op_type, input_shape : nc.TensorShape, axes : nc.TensorAxes, keepdims=False): self.op_type = op_type self.info = info = nc.info.InfoReduction(input_shape, axes, keepdims) # Determine transpose order for intermediate tensor, where reduction axes will be at the end self.intermediate_transpose_axes = info.output_axes + info.reduction_axes self.intermediate_shape = nc.info.InfoTranspose(input_shape, self.intermediate_transpose_axes).output_shape # slices argument to fetch processed tensor from zero indexes self.inter_slices = ( slice(None,None,None), ) * info.output_axes.rank + (0,) * info.reduction_axes.rank # COLS are reduction axes, ROWS are remaining axes rows_rank = info.output_axes.rank self.ROWS = ROWS = self.intermediate_shape[:rows_rank].size self.COLS = COLS = self.intermediate_shape[rows_rank:].size # Number of stages to operate COLS n_stages = (COLS-1).bit_length() self.forward_krn_shapes = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ] self.forward_krn_stage_cols = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ] self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage ) ) for stage in range(n_stages) ] self.krn_I_shape = (input_shape.size,) if op_type == 'mean': self.forward_krn = _ReduceOp.forward_krns['sum'] else: self.forward_krn = _ReduceOp.forward_krns[op_type] if op_type in ['sum', 'mean']: self.backward_krn = nc.CLKernel(f""" {ph.define_axes_accessor('I', input_shape)} {ph.define_axes_accessor('O', info.output_shape_keepdims)} __kernel void impl(__global float* dI, __global const float* dO) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')} dI[gid] += dO[O_idx_mod({ph.axes_seq_enum('i', input_shape.rank )})] {f'/ {COLS}' if op_type == 'mean' else ''}; }} """) elif op_type in ['min', 'max']: self.backward_krn = nc.CLKernel(f""" {ph.define_axes_accessor('I', input_shape)} {ph.define_axes_accessor('O', info.output_shape_keepdims)} __kernel void impl(__global float* dI, __global const float* I, __global const float* dO, __global const float* O, __global const float* OLock) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')} size_t o_idx = O_idx({ph.axes_seq_enum('i', info.output_shape_keepdims.rank, zero_axes=info.reduction_axes)}); if (I[gid] == O[o_idx]) dI[gid] += (atomic_inc( (volatile __global int*) &OLock[o_idx]) == 0) //1 if we are first * dO[o_idx]; }} """)
def __init__(self, tensors_list, momentum=0.0, nesterov=False, lr=0.001, lr_decay=0.0, lr_dropout=0.0, clipnorm=0.0): super().__init__(tensors_list, lr=lr, lr_decay=lr_decay, lr_dropout=lr_dropout, clipnorm=clipnorm, saveables=['_accs']) self.momentum = momentum self.nesterov = nesterov _accs = {} for t in self.get_trainable_tensors(): _accs[t.get_name()] = nn.Tensor_zeros_like(t) self._accs = _accs self._update_acc_krn = nc.CLKernel(f""" {self._get_lr_kernel_common_text()} __kernel void impl (__global float* A, __global const float* G {self._get_lr_kernel_args_text()} ) {{ size_t gid = get_global_id(0); {self._get_lr_kernel_text()} A[gid] = {momentum}*A[gid] - lr*G[gid]; }} """) self._update_t_krn = nc.CLKernel(f""" {self._get_lr_kernel_common_text()} #define NESTEROV {int(nesterov)} __kernel void impl (__global float* V, __global const float* G, __global const float* A {self._get_lr_kernel_args_text()} ) {{ size_t gid = get_global_id(0); {self._get_lr_kernel_text()} #if NESTEROV==1 V[gid] += {momentum}*A[gid] - lr*G[gid]; #else V[gid] += A[gid]; #endif }} """)
def __init__(self, tensors_list, beta_1=0.9, beta_2=0.999, lr=0.001, lr_decay=0.0, lr_dropout=0.0, clipnorm=0.0): super().__init__(tensors_list, lr=lr, lr_decay=lr_decay, lr_dropout=lr_dropout, clipnorm=clipnorm, saveables=['_ms', '_vs']) self.beta_1 = beta_1 self.beta_2 = beta_2 _ms, _vs = {}, {} for t in self.get_trainable_tensors(): _ms[t.get_name()] = nn.Tensor_zeros_like(t) _vs[t.get_name()] = nn.Tensor_zeros_like(t) self._ms, self._vs = _ms, _vs self._update_ms_krn = nc.CLKernel(f""" __kernel void impl(__global float* M, __global const float* G) {{ size_t gid = get_global_id(0); M[gid] = {beta_1}*M[gid] + (1.0 - {beta_1})*G[gid]; }} """) self._update_vs_krn = nc.CLKernel(f""" __kernel void impl(__global float* V, __global const float* G) {{ size_t gid = get_global_id(0); float g = G[gid]; V[gid] = {beta_2}*V[gid] + (1.0 - {beta_2})*g*g; }} """) self._update_t_krn = nc.CLKernel(f""" {self._get_lr_kernel_common_text()} __kernel void impl (__global float* T, __global const float* M, __global const float* V {self._get_lr_kernel_args_text()} ) {{ size_t gid = get_global_id(0); {self._get_lr_kernel_text()} T[gid] += -lr*M[gid] / ( sqrt(V[gid]) + 1e-7 ); }} """)
def __init__(self, input_shape: nc.TensorShape, rate, seed, is_add_to_output): if rate < 0 or rate >= 1.0: raise ValueError(f'rate must be in range [0 .. 1.0)') self.rate = rate if seed is None: seed = np.random.randint(2147483648) self.seed = seed self.output_shape = input_shape self.krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" {ph.include_hash()} {ph.define_axes_accessor('I', input_shape)} {ph.define_axes_accessor('O', self.output_shape)} __kernel void impl(__global float* O, __global const float* I, uint seed) {{ size_t gid = get_global_id(0); float value = 0.0; if ( hash_float_uint(seed+gid) <= {rate} ) value = I[gid] * ( 1.0 / ( 1.0 - {rate} ) ); O[gid] {'+=' if is_add_to_output else '='} value; }}""")
def __init__(self, input_shape: nc.TensorShape, size, is_add_to_output): N, IC, IH, IW = input_shape OC = IC OH = IH * size OW = IW * size self.output_shape = output_shape = nc.TensorShape((N, OC, OH, OW)) common_kernel_text = f""" {ph.define_axes_accessor('I', input_shape, 'NCHW')} {ph.define_axes_accessor('O', output_shape, 'NCHW')} """ self.O_forward_krn = nc.CLKernel(global_shape=(output_shape.size, ), kernel_text=f""" {common_kernel_text} #define SIZE {size} __kernel void impl(__global float* O, __global const float* I) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', 'NCHW', 'gid')} O[gid] {'+=' if is_add_to_output else '='} I[I_idx(on,oc,oh / SIZE,ow / SIZE)]; }} """) self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" {common_kernel_text} #define SIZE {size} __kernel void impl(__global float* dI, __global const float* dO) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', 'NCHW', 'gid')} float v = 0.0; for (int y=0; y<SIZE; ++y) for (int x=0; x<SIZE; ++x) v += dO[O_idx(in,ic,ih*SIZE+y,iw*SIZE+x)]; dI[gid] += v; }} """)
def __init__(self, tensors_list, rho=0.9, lr=0.001, lr_decay=0.0, lr_dropout=0.0, clipnorm=0.0): super().__init__(tensors_list, lr=lr, lr_decay=lr_decay, lr_dropout=lr_dropout, clipnorm=clipnorm, saveables=['_accs']) _accs = {} for t in self.get_trainable_tensors(): _accs[t.get_name()] = nn.Tensor_zeros_like(t) self._accs = _accs self.rho = rho self._update_acc_krn = nc.CLKernel(f""" __kernel void impl (__global float* A, __global const float* G) {{ size_t gid = get_global_id(0); float g = G[gid]; A[gid] = {rho} * A[gid] + (1.0 - {rho}) * g * g; }} """) self._update_t_krn = nc.CLKernel(f""" {self._get_lr_kernel_common_text()} __kernel void impl (__global float* V, __global const float* G, __global const float* A {self._get_lr_kernel_args_text()} ) {{ size_t gid = get_global_id(0); {self._get_lr_kernel_text()} V[gid] += -lr * G[gid] / ( sqrt(A[gid]) + 1e-7 ); }} """)
def __init__(self, low=0.0, high=1.0): super().__init__() self.low = low self.high = high self.krn = nc.CLKernel(kernel_text=f""" {ph.include_hash()} __kernel void impl(__global float* O, uint seed) {{ size_t gid = get_global_id(0); O[gid] = hash_float_uint(gid+seed)*({high}-({low}))+({low}); }} """)
def __init__(self, input_shape : nc.TensorShape, axes_order : nc.TensorAxes, is_add_to_output): self.axes_order = axes_order self.info = info = nc.info.InfoTranspose(input_shape, axes_order) self.forward_krn = nc.CLKernel(global_shape=(input_shape.size,), kernel_text=f""" {ph.define_axes_accessor('I', input_shape)} {ph.define_axes_accessor('O', info.output_shape)} __kernel void impl(__global const float* I, __global float* O) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')} O[O_idx({ph.axes_order_enum('i', axes_order )})] {'+=' if is_add_to_output else '='} I[gid]; }}""")
def __init__(self, input_shapes, axis, is_add_to_output): self.info = info = nc.info.InfoConcat(input_shapes, axis) self.forward_krn = nc.CLKernel(f""" {ph.define_axes_accessor('I', info.output_shape )} {ph.define_axes_accessor('O', info.output_shape )} #undef I{info.axis} __kernel void impl(__global float* O, __global const float* I, long axis_offset, long I{info.axis}) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', info.output_shape.rank, 'gid')} i{info.axis} += axis_offset; O[O_idx({ph.axes_seq_enum('i', info.output_shape.rank)})] {'+=' if is_add_to_output else '='} I[gid]; }} """)
def __init__(self, input_shape: nc.TensorShape, axis, stack_count, is_add_to_output): self.info = info = nc.info.InfoStack(input_shape, axis, stack_count) self.forward_krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" {ph.define_axes_accessor('I', input_shape )} {ph.define_axes_accessor('O', info.output_shape )} __kernel void impl(__global float* O, __global const float* I, long n) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')} O[O_idx({ph.axes_seq_enum('i', input_shape.rank, new_axis=('n', info.axis))})] {'+=' if is_add_to_output else '='} I[gid]; }} """)
def __init__(self, mean=0.0, stddev=1.0): super().__init__() self.krn = nc.CLKernel(kernel_text=f""" {ph.include_constants_pi()} {ph.include_hash()} __kernel void impl(__global float* O, uint seed) {{ size_t gid = get_global_id(0); float2 rnd = hash_float2_uint(gid+seed); float rnd_normal = sqrt(-2*log(rnd.x))*cos(2*PI_F*rnd.y); O[gid] = {mean} + rnd_normal*{stddev}; }} """)
class _MeshGrid2D(Initializer): krn = nc.CLKernel(kernel_text=f""" __kernel void impl(__global float* O, float h_start, float h_step , float w_start, float w_step, uint OH, uint OW, uint OD) {{ size_t gid = get_global_id(0); size_t od = gid % OD; gid /= OD; size_t ow = gid % OW; gid /= OW; size_t oh = gid % OH; size_t oc = gid / OH; gid = get_global_id(0); float v = 1.0; if (od == 0) v = h_start+oh*h_step; else if (od == 1) v = w_start+ow*w_step; O[gid] = v; }} """) def __init__(self, h_start, h_stop, w_start, w_stop): super().__init__() self.h_start = h_start self.h_stop = h_stop self.w_start = w_start self.w_stop = w_stop def initialize_CLBuffer(self, cl_buffer, tensor_shape: nc.TensorShape): if tensor_shape.rank != 4: raise ValueError(f'tensor_shape.rank must == 4') OC, OH, OW, OD = tensor_shape if OD != 3: raise ValueError(f'D {OD} must == 3') if OH > 1: h_step = (self.h_stop - self.h_start) / (OH - 1) else: h_step = 0 if OW > 1: w_step = (self.w_stop - self.w_start) / (OW - 1) else: w_step = 0 cl_buffer.device.run(_MeshGrid2D.krn, cl_buffer, np.float32(self.h_start), np.float32(h_step), np.float32(self.w_start), np.float32(w_step), np.uint32(OH), np.uint32(OW), np.uint32(OD), global_shape=(OC * OH * OW * OD, )) def __str__(self): return f'MeshGrid2D'
def __init__(self, input_shape: nc.TensorShape, coords_shape: nc.TensorShape): N, IC, IH, IW = input_shape if coords_shape.rank not in [3, 4, 5]: raise ValueError( f'Coords shape rank must be 3(HWD) or 4(CHWD) or 5(NCHWD)') KN, KC = 1, 1 if coords_shape.rank == 5: KN, KC, KH, KW, KD = coords_shape elif coords_shape.rank == 4: KC, KH, KW, KD = coords_shape elif coords_shape.rank == 3: KH, KW, KD = coords_shape self.coords_N_tile = 1 self.coords_C_tile = 1 if KN != N: if KN == 1: self.coords_N_tile = N else: raise ValueError( f'Coords output batch {KN} does not match tensor input batch {N}.' ) if KC != IC: if KC == 1: self.coords_C_tile = IC else: raise ValueError( f'Coords output channels {KC} does not match tensor input channels {IC}.' ) if KD != 2: raise ValueError(f'Coords D {KD} channels must be == 2 (x,y)') self.output_shape = output_shape = nc.TensorShape((N, IC, KH, KW)) common_kernel_text = f""" {ph.define_axes_accessor('I', input_shape, 'NCHW')} {ph.define_axes_accessor('O', output_shape, 'NCHW')} """ self.O_forward_krn = nc.CLKernel(global_shape=(output_shape.size, ), kernel_text=f""" {common_kernel_text} {ph.define_axes_accessor('K', (KN,KC,KH,KW), 'NCHW')} __kernel void impl(__global float* O, __global const float* I, __global const float2* K) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', 'NCHW', 'gid')} float v = 0.0; float2 xys = K[K_idx_mod(on,oc,oh,ow)]; for (int ih=0; ih < IH; ++ih) {{ float ys_mod = max(0.0, 1.0-fabs(xys.y-ih)); if (ys_mod != 0.0) for (int iw=0; iw < IW; ++iw) {{ float xs_mod = max(0.0, 1.0-fabs(xys.x-iw)); if (xs_mod != 0.0) v += xs_mod*ys_mod*I[I_idx(on,oc,ih,iw)]; }} }} O[gid] = v; }} """) self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" {common_kernel_text} {ph.define_axes_accessor('K', (KN,KC,KH,KW), 'NCHW')} __kernel void impl(__global float* dI, __global const float2* K, __global float* dO) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', 'NCHW', 'gid')} float v = 0.0; for (int oh=0; oh < OH; ++oh) for (int ow=0; ow < OW; ++ow) {{ float2 xys = K[K_idx_mod(in,ic,oh,ow)]; float ys_mod = max(0.0, 1.0-fabs(xys.y-ih)); if (ys_mod != 0.0) {{ float xs_mod = max(0.0, 1.0-fabs(xys.x-iw)); if (xs_mod != 0.0) v += xs_mod*ys_mod*dO[O_idx(in,ic,oh,ow)]; }} }} dI[gid] += v; }} """) self.dK_krn = nc.CLKernel(global_shape=(N * IC * KH * KW, ), kernel_text=f""" {common_kernel_text} {ph.define_axes_accessor('K', (N,IC,KH,KW), 'NCHW')} __kernel void impl(__global float2* dK, __global const float* I, __global const float2* K, __global float* dO) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('k', 'NCHW', 'gid')} float dk_x = 0.0; float dk_y = 0.0; float2 xys = K[gid]; for (int ih=0; ih < IH; ++ih) {{ {{ float ys_mod = max(0.0, 1.0-fabs(xys.y-ih)); if (ys_mod != 0.0) for (int iw=0; iw < IW; ++iw) if ( fabs(iw-xys.x) < 1.0 ) {{ float xs_mod = 1.0*(iw >= xys.x)-1.0*(iw < xys.x); dk_x += xs_mod*ys_mod*I[I_idx(kn,kc,ih,iw)] * dO[O_idx(kn,kc,kh,kw)]; }} }} if (fabs(ih-xys.y) < 1.0) {{ float ys_mod = 1.0*(ih >= xys.y)-1.0*(ih < xys.y); for (int iw=0; iw < IW; ++iw) {{ float xs_mod = max(0.0, 1.0-fabs(xys.x-iw)); if (xs_mod != 0.0) dk_y += xs_mod*ys_mod*I[I_idx(kn,kc,ih,iw)] * dO[O_idx(kn,kc,kh,kw)]; }} }} }} dK[gid] += (float2)(dk_x, dk_y); }} """)
def __init__(self, N, C, H, W, OH, OW, KH, KW, PADL, PADT, DILATION, STRIDE, sshape, is_transpose, input_data_format, is_add_to_output): if input_data_format not in ['NCHW', 'NHWC']: raise ValueError(f'Unknown input_data_format {input_data_format}.') d = {'N': N, 'C': C, 'H': OH, 'W': OW, 'J': KH, 'I': KW} sshape = sshape.upper() output_shape = [1] O_shape = [] O_shape_size = 1 O_shape_axes = '' for symbol in sshape: if symbol == '_': output_shape.append(1) else: value = d.get(symbol, None) if value is None: raise ValueError( f'Unknown symbol {symbol}. Valid symbols: _{list(d.keys())}' ) if value == -1: raise ValueError(f'Duplicate symbol {symbol}') output_shape[-1] *= value O_shape.append(value) O_shape_size *= value O_shape_axes += symbol d[symbol] = -1 self.output_shape = tuple(output_shape) O_shape = tuple(O_shape) input_shape = (N, C, H, W) if input_data_format == 'NCHW' else (N, H, W, C) if not is_transpose: self.krn = nc.CLKernel(global_shape=(O_shape_size, ), kernel_text=f""" #define STRIDE {STRIDE} #define DILATION {DILATION} #define PADT {PADT} #define PADL {PADL} {ph.define_axes_accessor('I', input_shape, input_data_format)} {ph.define_axes_accessor('O', O_shape, O_shape_axes)} __kernel void impl(__global float* O, __global const float* I) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', O_shape_axes, 'gid')} #define ic oc #define in on float value = 0.0; int ih = -PADT + oj*DILATION + oh*STRIDE; if (ih >= 0 & ih < IH) {{ int iw = -PADL + oi*DILATION + ow*STRIDE; if (iw >= 0 & iw < IW) value = I[I_idx({ph.axes_order_enum('i', input_data_format)})]; }} O[O_idx({ph.axes_order_enum('o', O_shape_axes)})] {'+=' if is_add_to_output else '='} value; }} """) else: self.krn = nc.CLKernel(global_shape=(O_shape_size, ), kernel_text=f""" #define STRIDE {STRIDE} #define DILATION {DILATION} #define PADT {PADT} #define PADL {PADL} {ph.define_axes_accessor('I', input_shape, input_data_format)} {ph.define_axes_accessor('O', O_shape, O_shape_axes)} __kernel void impl(__global float* O, __global const float* I) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', O_shape_axes, 'gid')} #define ic oc #define in on float value = 0.0; int ih = ( PADT + oh - oj*DILATION ) / STRIDE; if (ih >= 0 & ih < IH & (oh == -PADT + oj*DILATION + ih*STRIDE) ) {{ int iw = ( PADL + ow - oi*DILATION ) / STRIDE; if (iw >= 0 & iw < IW & (ow == -PADL + oi*DILATION + iw*STRIDE) ) value = I[I_idx(in,ic,ih,iw)]; }} O[O_idx({ph.axes_order_enum('o', O_shape_axes)})] {'+=' if is_add_to_output else '='} value; }} """)
def __init__(self, input_shape: nc.TensorShape, slices, is_add_to_output): # Validate slices argument for given shape. new_slices = [] before_ellipsis = None for s in slices: if s is Ellipsis: before_ellipsis = new_slices new_slices = [] continue elif s is not None and not isinstance(s, (int, tuple)): raise ValueError( f'unknown slice argument {s} of type {s.__class__}') new_slices.append(s) if before_ellipsis is not None: # Process Ellipsis separator new_slices_n_axes = sum([1 for x in new_slices if x != None]) before_ellipsis_n_axes = sum( [1 for x in before_ellipsis if x != None]) # Expand slices by filling intermediate (None,None,None) for each remaining axis new_slices = before_ellipsis + \ [(None,None,None)]*max(0, input_shape.rank-before_ellipsis_n_axes-new_slices_n_axes) + \ new_slices new_slices_n_axes = sum([1 for x in new_slices if x != None]) if new_slices_n_axes > input_shape.rank: raise ValueError('slices arguments more than shape axes') elif new_slices_n_axes < input_shape.rank: # Fill remaining axes new_slices += [(None, None, None) ] * (input_shape.rank - new_slices_n_axes) slices = tuple(new_slices) # Compute shapes output_is_reshaped = True # Flag determines that output_tensor # can be just reshaped without any computation output_shape = [] # output tensor shape output_shape_krn = [ ] # output shape used in kernel, must match input shape input_axes_begin_step = [] # begin,step ints for every input shape i_axis = 0 for v in slices: if v is None: # None is new axis # We can add unlimited number of (1,) axes at any place of shape output_shape.append(1) continue i_axis_size = input_shape[i_axis] i_axis += 1 if isinstance(v, int): if v < 0: v += i_axis_size if v < 0 or v >= i_axis_size: raise ValueError( f'index {v} is out of bounds for axis {i_axis} with size {i_axis_size}' ) b, e, s = v, v, 1 else: b, e, s = v # Fix begin, end, step values if s is None: s = 1 if s == 0: raise ValueError('slice step cannot be zero') if b is None: b = 0 if s > 0 else i_axis_size - 1 if e is None: e = i_axis_size if s > 0 else -1 elif e < 0: e += i_axis_size if b < 0: b += i_axis_size if s > 0: b = np.clip(b, 0, i_axis_size) e = np.clip(e, 0, i_axis_size) else: b = np.clip(b, 0, i_axis_size - 1) e = np.clip(e, -1, i_axis_size) if i_axis_size != 1 and not (b == 0 and e == i_axis_size and s == 1): # Such params of axis slice will change input, thus output cannot be just reshaped input output_is_reshaped = False # Compute output_axis_size based on begin,end,step output_axis_size = max(0, math.ceil((e - b) / s)) if output_axis_size >= 1: # >= 1 : select range of indexes, axis will remain output_shape.append(output_axis_size) # ^ othwerwise axis will be supressed # output_shape to use in kernel, must match rank of input shape output_shape_krn.append(max(1, output_axis_size)) # for every output_shape_krn axis # we have exact begin,step values to fetch value from input input_axes_begin_step.append((b, s)) output_shape_krn = nc.TensorShape(output_shape_krn) self.output_is_reshaped = output_is_reshaped self.output_shape = nc.TensorShape(output_shape) self.forward_krn = nc.CLKernel(global_shape=(output_shape_krn.size, ), kernel_text=f""" {ph.define_axes_accessor('I', input_shape )} {ph.define_axes_sizes('O', output_shape_krn )} __kernel void impl(__global const float* I, __global float* O) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', output_shape_krn.rank, 'gid')} {''.join( f'size_t i{i} = {b} + o{i} * {s};' for i, (b,s) in enumerate(input_axes_begin_step) ) } O[get_global_id(0)] {'+=' if is_add_to_output else '='} I[I_idx({ph.axes_seq_enum('i', input_shape.rank)})]; }} """) self.backward_krn = nc.CLKernel(global_shape=(output_shape_krn.size, ), kernel_text=f""" {ph.define_axes_accessor('I', input_shape )} {ph.define_axes_sizes('O', output_shape_krn )} __kernel void impl(__global float* dI, __global const float* O) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', output_shape_krn.rank, 'gid')} {''.join( f'size_t i{i} = {b} + o{i} * {s};' for i, (b,s) in enumerate(input_axes_begin_step) ) } dI[I_idx({ph.axes_seq_enum('i', input_shape.rank)})] += O[get_global_id(0)]; }} """)
def __init__(self, input_shape : nc.TensorShape, kernel_shape : nc.TensorShape, stride, dilation, padding): if kernel_shape.rank != 3: raise ValueError(f'Kernel shape rank must be == 3') N,IC,IH,IW = input_shape KI,KH,KW = kernel_shape if KI != IC: raise ValueError(f'Kernel input channels {KI} does not match tensor input channels {IC}.') ci = nc.info.InfoConv2D(IH, IW, KH, KW, stride, dilation, padding) OC, OH, OW = IC, ci.OH, ci.OW self.output_shape = output_shape = nc.TensorShape( (N, OC, OH, OW) ) self.OC_1_1_NxOHxOW = (OC,1,1,N*OH*OW) self.KI_KH_KW_NxOHxOW = (KI,KH,KW,N*OH*OW) common_kernel_text = f""" {ph.define_axes_accessor('I', input_shape, 'NCHW')} {ph.define_axes_accessor('O', output_shape, 'NCHW')} {ph.define_axes_accessor('K', kernel_shape, 'IHW')} #define PADL {ci.PADL} #define PADT {ci.PADT} #define STRIDE {stride} #define DILATION {dilation} """ self.O_depthwise_krn = nc.CLKernel(global_shape=(output_shape.size,), kernel_text=f""" {common_kernel_text} __kernel void impl(__global float* O, __global const float* I, __global const float* K) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', 'NCHW', 'gid')} float v = 0.0; for (int kh=0; kh<KH; ++kh) {{ int ih = -PADT + kh*DILATION + oh*STRIDE; if (ih >= 0 & ih < IH) for (int kw=0; kw<KW; ++kw) {{ int iw = -PADL + kw*DILATION + ow*STRIDE; if (iw >= 0 & iw < IW) v += I[I_idx(on,oc,ih,iw)]*K[K_idx(oc,kh,kw)]; }} }} O[gid] = v; }} """) self.dI_depthwise_krn = nc.CLKernel(global_shape=(input_shape.size,), kernel_text=f""" {common_kernel_text} __kernel void impl(__global float* dI, __global const float* K, __global const float* dO) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', 'NCHW', 'gid')} float v = 0.0; for (int kh=0; kh<KH; ++kh) {{ int oh = (PADT + ih - kh*DILATION ) / STRIDE; if (oh >= 0 & oh < OH) for (int kw=0; kw<KW; ++kw) {{ int ow = (PADL + iw - kw*DILATION ) / STRIDE; if (ow >= 0 & ow < OW & iw == (-PADL + kw*DILATION + ow*STRIDE) & ih == (-PADT + kh*DILATION + oh*STRIDE) ) v += dO[O_idx(in,ic,oh,ow)]*K[K_idx(ic,kh,kw)]; }} }} dI[gid] += v; }} """) self.im2col = lambda x: nc.op.unfold2D(x, N, IC, IH, IW, OH, OW, KH, KW, ci.PADL, ci.PADT, dilation, stride, 'CJI_NHW', is_transpose=False)
def __init__(self, a_shape, b_shape, is_add_to_output): if a_shape.rank != b_shape.rank: raise ValueError( f'Ranks are not equal. {a_shape.rank} != {b_shape.rank}') rank = a_shape.rank if rank < 2: raise ValueError('Tensors rank must be at least 2.') K, M = a_shape[-2], a_shape[-1] N, B_COLS = b_shape[-2], b_shape[-1] if K != B_COLS: raise ValueError('A_ROWS != B_COLS') BATCH = a_shape[0:-2].size B_BATCH = b_shape[0:-2].size if BATCH != B_BATCH: raise ValueError( f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}' ) if rank == 2: self.output_shape = output_shape = nc.TensorShape((N, M)) else: self.output_shape = output_shape = nc.TensorShape(a_shape[:-2] + (N, M)) self.M = M self.N = N self.K = K # Determining optimal tile widths for MW in [16, 8, 4, 2, 1]: if M % MW == 0: break for KW in [8, 4, 2, 1]: if N % KW == 0 and K % KW == 0: break NW = KW self.forward_krn = nc.CLKernel(global_shape=(M // MW, N // NW, BATCH), kernel_text=f""" #define K {K} #define N {N} #define MW {MW} // M tile Width #define NW {NW} // N tile Width -- NW & KW should be the same ! #define KW {KW} // K tile Width #define MT {M//MW} // MT is max for 'mt' (M tile count) #define KT {K//KW} // KT is max for 'kt' (K tile count) #define floatMW { f'float{MW}' if MW != 1 else 'float'} #define floatKW { f'float{KW}' if KW != 1 else 'float'} __kernel void GeMM(const __global floatMW* restrict A, const __global floatKW* restrict B, __global floatMW* C) {{ size_t mt = get_global_id(0); //global M-tile id size_t nc = get_global_id(1); //global N-tile id size_t batch = get_global_id(2); float AT[KW][MW]; // sub tiles float BT[NW][KW]; float CT[NW][MW]; #pragma unroll for (uint i=0; i<NW*MW; ++i) // zero CT tile ((float*) CT)[i] = 0.0; for (uint kt=0; kt<KT; ++kt) // iterate over K-dim tiles {{ #pragma unroll for (uint k=0; k<KW; ++k) // every k-element inside K-dim tile *( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats #pragma unroll for (uint n=0; n<NW; ++n) // every n-element inside N-dim tile *( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats #pragma unroll for (uint k=0; k<KW; ++k) #pragma unroll for (uint n=0; n<NW; ++n) // sub tiles multiplication #pragma unroll for (uint m=0; m<MW; ++m) CT[n][m] += AT[k][m] * BT[n][k]; }} #pragma unroll for (uint n=0; n<NW; ++n) C[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='} *( (floatMW*) CT[n]); }}""")
def __init__(self, op_type, input_shape: nc.TensorShape, pool_size, stride, padding): if op_type not in ['avg', 'min', 'max']: raise ValueError(f'unknown op_type {op_type}') if pool_size < 2: raise ValueError(f'pool_size {pool_size} must be at least 2') self.op_type = op_type N, IC, IH, IW = input_shape ci = nc.info.InfoConv2D(IH, IW, pool_size, pool_size, stride, 1, padding) OC, OH, OW = IC, ci.OH, ci.OW self.output_shape = output_shape = nc.TensorShape((N, OC, OH, OW)) common_kernel_text = f""" {ph.define_axes_accessor('I', input_shape, 'NCHW')} {ph.define_axes_accessor('O', output_shape, 'NCHW')} #define PADL {ci.PADL} #define PADT {ci.PADT} #define POOL_SIZE {pool_size} #define STRIDE {stride} """ self.O_forward_krn = nc.CLKernel(global_shape=(output_shape.size, ), kernel_text=f""" {common_kernel_text} __kernel void impl(__global float* O, __global const float* I) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', 'NCHW', 'gid')} { {'avg' : 'float v = 0.0; int v_count = 0;', 'max' : 'float v = -INFINITY;', 'min' : 'float v = INFINITY;', }[op_type] } for (int ph=0; ph<POOL_SIZE; ++ph) for (int pw=0; pw<POOL_SIZE; ++pw) {{ int ih = -PADT + ph + oh*STRIDE; int iw = -PADL + pw + ow*STRIDE; if (iw >= 0 & ih >= 0 & iw < IW & ih < IH) {{ { {'avg' : 'v += I[I_idx(on,oc,ih,iw)]; ++v_count;', 'max' : 'v = fmax(v, I[I_idx(on,oc,ih,iw)]);', 'min' : 'v = fmin(v, I[I_idx(on,oc,ih,iw)]);', }[op_type] } }} }} { {'avg' : 'if (v_count != 0) v /= v_count;', 'max' : 'if (v == -INFINITY) v = 0.0;', 'min' : 'if (v == INFINITY) v = 0.0;', }[op_type] } O[gid] = v; }} """) if op_type == 'avg': self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" {common_kernel_text} __kernel void impl(__global float* dI, __global const float* dO) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', 'NCHW', 'gid')} float v = 0.0; for (int ph=0; ph<POOL_SIZE; ++ph) for (int pw=0; pw<POOL_SIZE; ++pw) {{ int oh = (PADT + ih - ph ) / STRIDE; int ow = (PADL + iw - pw ) / STRIDE; if (ow >= 0 & oh >= 0 & ow < OW & oh < OH & iw == (-PADL + pw + ow*STRIDE) & ih == (-PADT + ph + oh*STRIDE) ) {{ int d=0; for (int dph=0; dph<POOL_SIZE; ++dph) for (int dpw=0; dpw<POOL_SIZE; ++dpw) {{ int dih = -PADT + dph + oh*STRIDE; int diw = -PADL + dpw + ow*STRIDE; d += (diw >= 0 & dih >= 0 & diw < IW & dih < IH); }} v += dO[O_idx(in,ic,oh,ow)] / d; }} }} dI[gid] += v; }} """) elif op_type in ['min', 'max']: # Implementation is different from tensorflow in case when the same values exist in reduction axes. # Example tf : 3 4 5 5 , max = 5, gradients : 0 0 0.5 0.5 # Example litenn : 3 4 5 5 , max = 5, gradients : 0 0 1 0 # or gradients : 0 0 0 1 - depends on which GPU thread will be first ! self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ), kernel_text=f""" {common_kernel_text} __kernel void impl(__global float* dI, __global const float* I, __global const float* dO, __global const float* O) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('i', 'NCHW', 'gid')} float v = 0.0; // Iterate over all O pixels, where 'I' have contribution for (int ph=0; ph<POOL_SIZE; ++ph) for (int pw=0; pw<POOL_SIZE; ++pw) {{ int oh = (PADT + ih - ph ) / STRIDE; int ow = (PADL + iw - pw ) / STRIDE; if (ow >= 0 & oh >= 0 & ow < OW & oh < OH & iw == (-PADL + pw + ow*STRIDE) & ih == (-PADT + ph + oh*STRIDE) ) {{ //Now we have oh,ow where ih,iw have contribution float Ov = O[O_idx(in,ic,oh,ow)]; //Iterate in those I pixels, which were used to produce O //to determine first min/max match for (int dphw=0; dphw < POOL_SIZE*POOL_SIZE; ++dphw) {{ int dih = -PADT + (dphw / POOL_SIZE) + oh*STRIDE; int diw = -PADL + (dphw % POOL_SIZE) + ow*STRIDE; if (diw >= 0 & dih >= 0 & diw < IW & dih < IH & I[I_idx(in,ic,dih,diw)] == Ov) {{ // Match I==O if (dih == ih & diw == iw) // but add gradient only if current ih/iw index match dih/diw v += dO[O_idx(in,ic,oh,ow)]; break; }} }} }} }} dI[gid] += v; }} """)
def __init__(self, DualWiseOpKernel_cls, DualWiseOpKernel_args, a_shape: nc.TensorShape, b_shape: nc.TensorShape, is_add_to_output): self.kernel = DualWiseOpKernel_cls(*DualWiseOpKernel_args) self.info = info = nc.info.InfoBroadcast(a_shape, b_shape) # Implement kernel. Process of both broadcasted shapes using index modulus accessor. self.forward_krn = nc.CLKernel(global_shape=(info.output_shape.size, ), kernel_text=f""" {ph.define_axes_accessor('A', info.a_br_shape )} {ph.define_axes_accessor('B', info.b_br_shape )} {ph.define_axes_accessor('O', info.output_shape )} __kernel void impl(__global float* O_t, __global const float* A_t, __global const float* B_t) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')} float A = A_t[A_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})]; float B = B_t[B_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})]; float O = 0.0; {self.kernel.get_forward_kernel_text()} O_t[get_global_id(0)] {'+=' if is_add_to_output else '='} O; }} """) self.backward_A_krn = nc.CLKernel( global_shape=(info.output_shape.size, ), kernel_text=f""" {ph.define_axes_accessor('A', info.a_br_shape)} {ph.define_axes_accessor('B', info.b_br_shape)} {ph.define_axes_accessor('O', info.output_shape)} __kernel void impl(__global float* dA_t, __global const float* A_t, __global const float* B_t, __global const float* O_t, __global const float* dO_t) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')} float A = A_t[A_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})]; float B = B_t[B_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})]; float O = O_t[gid]; float dO = dO_t[gid]; float dA = 0.0; {self.kernel.get_backward_A_kernel_text()} dA_t[gid] += dA; }} """) self.backward_B_krn = nc.CLKernel( global_shape=(info.output_shape.size, ), kernel_text=f""" {ph.define_axes_accessor('A', info.a_br_shape)} {ph.define_axes_accessor('B', info.b_br_shape)} {ph.define_axes_accessor('O', info.output_shape)} __kernel void impl(__global float* dB_t, __global const float* A_t, __global const float* B_t, __global const float* O_t, __global const float* dO_t) {{ size_t gid = get_global_id(0); {ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')} float A = A_t[A_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})]; float B = B_t[B_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})]; float O = O_t[gid]; float dO = dO_t[gid]; float dB = 0.0; {self.kernel.get_backward_B_kernel_text()} dB_t[gid] += dB; }} """)
class BatchNorm2D(nn.Module): """ Batch Normalization 2D module. arguments in_ch int input channels Don't forget to call most parent Module.set_training(bool) References [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167) """ def __init__(self, in_ch, momentum=0.99): self.in_ch = in_ch if momentum < 0 or momentum > 1.0: raise ValueError( f'momentum {momentum} must be in range [0 .. 1.0]') self.momentum = momentum self.gamma = nn.Tensor((in_ch, ), init=nn.initializer.Scalar(1.0)) self.beta = nn.Tensor((in_ch, ), init=nn.initializer.Scalar(0.0)) self.running_mean = nn.Tensor((in_ch, ), init=nn.initializer.Scalar(0.0)) self.running_var = nn.Tensor((in_ch, ), init=nn.initializer.Scalar(1.0)) super().__init__( saveables=['gamma', 'beta', 'running_mean', 'running_var'], trainables=['gamma', 'beta']) def forward(self, x, **kwargs): if self.is_training(): mean, var = nn.moments(x, axes=(0, 2, 3), keepdims=True) BatchNorm2D.upd_krn.run(self.running_mean, self.running_var, mean, var, np.float32(self.momentum), global_shape=(self.in_ch, )) else: mean = self.running_mean.reshape((1, -1, 1, 1)) var = self.running_var.reshape((1, -1, 1, 1)) x = (x - mean) / (nn.sqrt(var) + 1e-5) x = x * self.gamma.reshape( (1,-1,1,1) ) \ + self.beta.reshape( (1,-1,1,1) ) return x def __str__(self): return f"{self.__class__.__name__} : in_ch:{self.in_ch}" def __repr__(self): return self.__str__() upd_krn = nc.CLKernel(kernel_text=""" __kernel void asd( __global float* RM, __global float* RV, __global const float* M, __global const float* V, float momentum) {{ size_t gid = get_global_id(0); float rm = RM[gid]; float m = M[gid]; float rv = RV[gid]; float v = V[gid]; RM[gid] = rm*momentum + m*(1.0-momentum); RV[gid] = rv*momentum + v*(1.0-momentum); }} """)