Esempi in Python per CLKernel, esempi in Python per litenn.core.CLKernel

Esempio n. 1

0

Mostra file

File: tile.py Progetto: JHP4911/litenn

    def __init__(self, input_shape : nc.TensorShape, tiles, is_add_to_output):
        self.info = info = nc.info.InfoTile(input_shape, tiles)

        self.forward_krn = nc.CLKernel(global_shape=(info.output_shape.size,), kernel_text=f"""
{ph.define_axes_accessor('I', input_shape)}
{ph.define_axes_accessor('O', info.output_shape)}
__kernel void impl(__global float* O, __global const float* I)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')}
    O[gid] {'+=' if is_add_to_output else '='}
           I[I_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})];
}}
""")
        self.backward_krn = nc.CLKernel(global_shape=(input_shape.size,), kernel_text=f"""
{ph.define_axes_accessor('I', input_shape)}
{ph.define_axes_accessor('O', info.output_shape)}
__kernel void impl(__global float* dI, __global const float* dO
                    ,{','.join([ f' long i{i}_offset' for i in range(input_shape.rank) ]) } )
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')}
    {';'.join([ f'i{i} += i{i}_offset' for i in range(input_shape.rank) ]) };
    dI[gid] += dO[O_idx({ph.axes_seq_enum('i', input_shape.rank)})];
}}
""")

Esempio n. 2

0

Mostra file

    def __init__(self, ElementWiseOpKernel_cls, ElementWiseOpKernel_args,
                 input_shape, is_add_to_output):
        self.output_shape = input_shape
        self.kernel = ElementWiseOpKernel_cls(*ElementWiseOpKernel_args)

        self.forward_krn = nc.CLKernel(global_shape=(input_shape.size, ),
                                       kernel_text=f"""
__kernel void impl(__global float* O_t, __global const float* I_t)
{{
    size_t idx = get_global_id(0);
    float I = I_t[idx];
    float O = 0.0;
    {self.kernel.get_forward_kernel_text()}
    O_t[idx] {'+=' if is_add_to_output else '='} O;
}}
""")

        self.backward_krn = nc.CLKernel(global_shape=(input_shape.size, ),
                                        kernel_text=f"""
__kernel void impl(__global float* dI_t, __global const float* I_t, __global const float* O_t, __global const float* dO_t)
{{
    size_t idx = get_global_id(0);
    float I = I_t[idx];
    float O = O_t[idx];
    float dO = dO_t[idx];
    float dI = 0.0;
    {self.kernel.get_backward_kernel_text()}
    dI_t[idx] += dI;
}}
""")

Esempio n. 3

0

Mostra file

File: reduce.py Progetto: JHP4911/litenn

    def __init__(self, op_type, input_shape : nc.TensorShape, axes : nc.TensorAxes, keepdims=False):
        self.op_type = op_type
        self.info = info = nc.info.InfoReduction(input_shape, axes, keepdims)

        # Determine transpose order for intermediate tensor, where reduction axes will be at the end
        self.intermediate_transpose_axes = info.output_axes + info.reduction_axes
        self.intermediate_shape = nc.info.InfoTranspose(input_shape, self.intermediate_transpose_axes).output_shape

        # slices argument to fetch processed tensor from zero indexes
        self.inter_slices = ( slice(None,None,None), ) * info.output_axes.rank + (0,) * info.reduction_axes.rank

        # COLS are reduction axes, ROWS are remaining axes
        rows_rank = info.output_axes.rank
        self.ROWS = ROWS = self.intermediate_shape[:rows_rank].size
        self.COLS = COLS = self.intermediate_shape[rows_rank:].size

        # Number of stages to operate COLS
        n_stages = (COLS-1).bit_length()
        self.forward_krn_shapes           = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ]
        self.forward_krn_stage_cols       = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ]
        self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage   ) ) for stage in range(n_stages) ]

        self.krn_I_shape = (input_shape.size,)


        if op_type == 'mean':
            self.forward_krn = _ReduceOp.forward_krns['sum']
        else:
            self.forward_krn = _ReduceOp.forward_krns[op_type]

        if op_type in ['sum', 'mean']:
            self.backward_krn = nc.CLKernel(f"""
{ph.define_axes_accessor('I', input_shape)}
{ph.define_axes_accessor('O', info.output_shape_keepdims)}
__kernel void impl(__global float* dI, __global const float* dO)
{{
size_t gid = get_global_id(0);
{ph.axes_idxs_from_var('i', input_shape.rank, 'gid')}
dI[gid] += dO[O_idx_mod({ph.axes_seq_enum('i', input_shape.rank )})]
                        {f'/ {COLS}' if op_type == 'mean' else ''};
}}
""")
        elif op_type in ['min', 'max']:
            self.backward_krn = nc.CLKernel(f"""
{ph.define_axes_accessor('I', input_shape)}
{ph.define_axes_accessor('O', info.output_shape_keepdims)}
__kernel void impl(__global float* dI, __global const float* I, __global const float* dO, __global const float* O, __global const float* OLock)
{{
size_t gid = get_global_id(0);
{ph.axes_idxs_from_var('i', input_shape.rank, 'gid')}

size_t o_idx = O_idx({ph.axes_seq_enum('i', info.output_shape_keepdims.rank, zero_axes=info.reduction_axes)});

if (I[gid] == O[o_idx])
    dI[gid] += (atomic_inc( (volatile __global int*) &OLock[o_idx]) == 0) //1 if we are first
               * dO[o_idx];
}}
""")

Esempio n. 4

0

Mostra file

File: SGD.py Progetto: JHP4911/litenn

    def __init__(self,
                 tensors_list,
                 momentum=0.0,
                 nesterov=False,
                 lr=0.001,
                 lr_decay=0.0,
                 lr_dropout=0.0,
                 clipnorm=0.0):
        super().__init__(tensors_list,
                         lr=lr,
                         lr_decay=lr_decay,
                         lr_dropout=lr_dropout,
                         clipnorm=clipnorm,
                         saveables=['_accs'])

        self.momentum = momentum
        self.nesterov = nesterov

        _accs = {}
        for t in self.get_trainable_tensors():
            _accs[t.get_name()] = nn.Tensor_zeros_like(t)
        self._accs = _accs

        self._update_acc_krn = nc.CLKernel(f"""
{self._get_lr_kernel_common_text()}


__kernel void impl (__global float* A, __global const float* G {self._get_lr_kernel_args_text()} )
{{
    size_t gid = get_global_id(0);

    {self._get_lr_kernel_text()}
    
    A[gid] = {momentum}*A[gid] - lr*G[gid];    
}}
""")

        self._update_t_krn = nc.CLKernel(f"""
{self._get_lr_kernel_common_text()}

#define NESTEROV {int(nesterov)}
__kernel void impl (__global float* V, __global const float* G, __global const float* A
                    {self._get_lr_kernel_args_text()} )
{{
    size_t gid = get_global_id(0);
    
    {self._get_lr_kernel_text()}
    
#if NESTEROV==1
    V[gid] += {momentum}*A[gid] - lr*G[gid];
#else
    V[gid] += A[gid];
#endif

}}
""")

Esempio n. 5

0

Mostra file

    def __init__(self,
                 tensors_list,
                 beta_1=0.9,
                 beta_2=0.999,
                 lr=0.001,
                 lr_decay=0.0,
                 lr_dropout=0.0,
                 clipnorm=0.0):
        super().__init__(tensors_list,
                         lr=lr,
                         lr_decay=lr_decay,
                         lr_dropout=lr_dropout,
                         clipnorm=clipnorm,
                         saveables=['_ms', '_vs'])

        self.beta_1 = beta_1
        self.beta_2 = beta_2

        _ms, _vs = {}, {}
        for t in self.get_trainable_tensors():
            _ms[t.get_name()] = nn.Tensor_zeros_like(t)
            _vs[t.get_name()] = nn.Tensor_zeros_like(t)
        self._ms, self._vs = _ms, _vs

        self._update_ms_krn = nc.CLKernel(f"""
__kernel void impl(__global float* M, __global const float* G)
{{
    size_t gid = get_global_id(0);
    M[gid] = {beta_1}*M[gid] + (1.0 - {beta_1})*G[gid];
}}
""")
        self._update_vs_krn = nc.CLKernel(f"""
__kernel void impl(__global float* V, __global const float* G)
{{
    size_t gid = get_global_id(0);
    float g = G[gid];
    V[gid] = {beta_2}*V[gid] + (1.0 - {beta_2})*g*g;
}}
""")

        self._update_t_krn = nc.CLKernel(f"""
{self._get_lr_kernel_common_text()}


__kernel void impl (__global float* T, __global const float* M, __global const float* V
                    {self._get_lr_kernel_args_text()} )
{{
    size_t gid = get_global_id(0);

    {self._get_lr_kernel_text()}

    T[gid] += -lr*M[gid] / ( sqrt(V[gid]) + 1e-7 );
}}
""")

Esempio n. 6

0

Mostra file

File: dropout.py Progetto: JHP4911/litenn

    def __init__(self, input_shape: nc.TensorShape, rate, seed,
                 is_add_to_output):
        if rate < 0 or rate >= 1.0:
            raise ValueError(f'rate must be in range [0 .. 1.0)')

        self.rate = rate
        if seed is None:
            seed = np.random.randint(2147483648)
        self.seed = seed

        self.output_shape = input_shape

        self.krn = nc.CLKernel(global_shape=(input_shape.size, ),
                               kernel_text=f"""
{ph.include_hash()}

{ph.define_axes_accessor('I', input_shape)}
{ph.define_axes_accessor('O', self.output_shape)}

__kernel void impl(__global float* O, __global const float* I, uint seed)
{{
    size_t gid = get_global_id(0);

    float value = 0.0;
    if ( hash_float_uint(seed+gid) <= {rate} )
        value = I[gid] * ( 1.0 / ( 1.0 - {rate} ) );
        
    O[gid] {'+=' if is_add_to_output else '='} value;
}}""")

Esempio n. 7

0

Mostra file

File: resize2D_nearest.py Progetto: JHP4911/litenn

    def __init__(self, input_shape: nc.TensorShape, size, is_add_to_output):
        N, IC, IH, IW = input_shape
        OC = IC
        OH = IH * size
        OW = IW * size

        self.output_shape = output_shape = nc.TensorShape((N, OC, OH, OW))

        common_kernel_text = f"""
{ph.define_axes_accessor('I', input_shape, 'NCHW')}
{ph.define_axes_accessor('O', output_shape, 'NCHW')}
"""

        self.O_forward_krn = nc.CLKernel(global_shape=(output_shape.size, ),
                                         kernel_text=f"""
{common_kernel_text}

#define SIZE {size}

__kernel void impl(__global float* O, __global const float* I)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('o', 'NCHW', 'gid')}

    O[gid] {'+=' if is_add_to_output else '='} I[I_idx(on,oc,oh / SIZE,ow / SIZE)];
}}
""")
        self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ),
                                  kernel_text=f"""
{common_kernel_text}

#define SIZE {size}

__kernel void impl(__global float* dI, __global const float* dO)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', 'NCHW', 'gid')}
    
    float v = 0.0;
    for (int y=0; y<SIZE; ++y)
    for (int x=0; x<SIZE; ++x)    
        v += dO[O_idx(in,ic,ih*SIZE+y,iw*SIZE+x)];
    
    dI[gid] += v;
}}
""")

Esempio n. 8

0

Mostra file

    def __init__(self,
                 tensors_list,
                 rho=0.9,
                 lr=0.001,
                 lr_decay=0.0,
                 lr_dropout=0.0,
                 clipnorm=0.0):
        super().__init__(tensors_list,
                         lr=lr,
                         lr_decay=lr_decay,
                         lr_dropout=lr_dropout,
                         clipnorm=clipnorm,
                         saveables=['_accs'])

        _accs = {}
        for t in self.get_trainable_tensors():
            _accs[t.get_name()] = nn.Tensor_zeros_like(t)
        self._accs = _accs
        self.rho = rho

        self._update_acc_krn = nc.CLKernel(f"""
__kernel void impl (__global float* A, __global const float* G)
{{
    size_t gid = get_global_id(0);
    float g = G[gid];
    A[gid] = {rho} * A[gid] + (1.0 - {rho}) * g * g;
}}
""")

        self._update_t_krn = nc.CLKernel(f"""
{self._get_lr_kernel_common_text()}
__kernel void impl (__global float* V, __global const float* G, __global const float* A
                    {self._get_lr_kernel_args_text()} )
{{
    size_t gid = get_global_id(0);
    
    {self._get_lr_kernel_text()}
    
    V[gid] += -lr * G[gid] / ( sqrt(A[gid]) + 1e-7 );
}}
""")

Esempio n. 9

0

Mostra file

File: RandomUniform.py Progetto: JHP4911/litenn

    def __init__(self, low=0.0, high=1.0):
        super().__init__()
        self.low = low
        self.high = high
        self.krn = nc.CLKernel(kernel_text=f"""
{ph.include_hash()}

__kernel void impl(__global float* O, uint seed)
{{
    size_t gid = get_global_id(0);
    O[gid] = hash_float_uint(gid+seed)*({high}-({low}))+({low});
}}
""")

Esempio n. 10

0

Mostra file

    def __init__(self, input_shape : nc.TensorShape, axes_order : nc.TensorAxes, is_add_to_output):
        self.axes_order = axes_order
        self.info = info = nc.info.InfoTranspose(input_shape, axes_order)
   
        self.forward_krn = nc.CLKernel(global_shape=(input_shape.size,), kernel_text=f"""
{ph.define_axes_accessor('I', input_shape)}
{ph.define_axes_accessor('O', info.output_shape)}
__kernel void impl(__global const float* I, __global float* O)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')}
    O[O_idx({ph.axes_order_enum('i', axes_order )})]
    {'+=' if is_add_to_output else '='} I[gid];
}}""")

Esempio n. 11

0

Mostra file

File: concat.py Progetto: JHP4911/litenn

    def __init__(self, input_shapes, axis, is_add_to_output):
        self.info = info = nc.info.InfoConcat(input_shapes, axis)
        self.forward_krn = nc.CLKernel(f"""
{ph.define_axes_accessor('I', info.output_shape )}
{ph.define_axes_accessor('O', info.output_shape )}
#undef I{info.axis}
__kernel void impl(__global float* O, __global const float* I, long axis_offset, long I{info.axis})
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', info.output_shape.rank, 'gid')}
    i{info.axis} += axis_offset;
    O[O_idx({ph.axes_seq_enum('i', info.output_shape.rank)})]
    {'+=' if is_add_to_output else '='} I[gid];
}}
""")

Esempio n. 12

0

Mostra file

    def __init__(self, input_shape: nc.TensorShape, axis, stack_count,
                 is_add_to_output):
        self.info = info = nc.info.InfoStack(input_shape, axis, stack_count)

        self.forward_krn = nc.CLKernel(global_shape=(input_shape.size, ),
                                       kernel_text=f"""
{ph.define_axes_accessor('I', input_shape )}
{ph.define_axes_accessor('O', info.output_shape )}
__kernel void impl(__global float* O, __global const float* I, long n)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', input_shape.rank, 'gid')}
    O[O_idx({ph.axes_seq_enum('i', input_shape.rank, new_axis=('n', info.axis))})]
    {'+=' if is_add_to_output else '='} I[gid];
}}
""")

Esempio n. 13

0

Mostra file

    def __init__(self, mean=0.0, stddev=1.0):
        super().__init__()
        self.krn = nc.CLKernel(kernel_text=f"""
{ph.include_constants_pi()}
{ph.include_hash()}

__kernel void impl(__global float* O, uint seed)
{{
    size_t gid = get_global_id(0);
    
    float2 rnd = hash_float2_uint(gid+seed);
    
    float rnd_normal = sqrt(-2*log(rnd.x))*cos(2*PI_F*rnd.y);
    
    O[gid] = {mean} + rnd_normal*{stddev};
}}
""")

Esempio n. 14

0

Mostra file

class _MeshGrid2D(Initializer):
    krn = nc.CLKernel(kernel_text=f"""
__kernel void impl(__global float* O, float h_start, float h_step
                                    , float w_start, float w_step,
                                     uint OH, uint OW, uint OD)
{{
    size_t gid = get_global_id(0);
    
    size_t od = gid % OD; gid /= OD;
    size_t ow = gid % OW; gid /= OW;
    size_t oh = gid % OH;
    size_t oc = gid / OH; gid = get_global_id(0);
    
    float v = 1.0;
    if (od == 0)
        v = h_start+oh*h_step;
    else 
    if (od == 1)
        v = w_start+ow*w_step;
        
        
    O[gid] = v;
}}
""")

    def __init__(self, h_start, h_stop, w_start, w_stop):
        super().__init__()
        self.h_start = h_start
        self.h_stop = h_stop
        self.w_start = w_start
        self.w_stop = w_stop

    def initialize_CLBuffer(self, cl_buffer, tensor_shape: nc.TensorShape):
        if tensor_shape.rank != 4:
            raise ValueError(f'tensor_shape.rank must == 4')

        OC, OH, OW, OD = tensor_shape
        if OD != 3:
            raise ValueError(f'D {OD} must == 3')

        if OH > 1:
            h_step = (self.h_stop - self.h_start) / (OH - 1)
        else:
            h_step = 0

        if OW > 1:
            w_step = (self.w_stop - self.w_start) / (OW - 1)
        else:
            w_step = 0

        cl_buffer.device.run(_MeshGrid2D.krn,
                             cl_buffer,
                             np.float32(self.h_start),
                             np.float32(h_step),
                             np.float32(self.w_start),
                             np.float32(w_step),
                             np.uint32(OH),
                             np.uint32(OW),
                             np.uint32(OD),
                             global_shape=(OC * OH * OW * OD, ))

    def __str__(self):
        return f'MeshGrid2D'

Esempio n. 15

0

Mostra file

    def __init__(self, input_shape: nc.TensorShape,
                 coords_shape: nc.TensorShape):
        N, IC, IH, IW = input_shape

        if coords_shape.rank not in [3, 4, 5]:
            raise ValueError(
                f'Coords shape rank must be 3(HWD) or 4(CHWD) or 5(NCHWD)')
        KN, KC = 1, 1

        if coords_shape.rank == 5:
            KN, KC, KH, KW, KD = coords_shape
        elif coords_shape.rank == 4:
            KC, KH, KW, KD = coords_shape
        elif coords_shape.rank == 3:
            KH, KW, KD = coords_shape

        self.coords_N_tile = 1
        self.coords_C_tile = 1

        if KN != N:
            if KN == 1:
                self.coords_N_tile = N
            else:
                raise ValueError(
                    f'Coords output batch {KN} does not match tensor input batch {N}.'
                )

        if KC != IC:
            if KC == 1:
                self.coords_C_tile = IC
            else:
                raise ValueError(
                    f'Coords output channels {KC} does not match tensor input channels {IC}.'
                )

        if KD != 2:
            raise ValueError(f'Coords D {KD} channels must be == 2 (x,y)')

        self.output_shape = output_shape = nc.TensorShape((N, IC, KH, KW))

        common_kernel_text = f"""
{ph.define_axes_accessor('I', input_shape, 'NCHW')}
{ph.define_axes_accessor('O', output_shape, 'NCHW')}
"""
        self.O_forward_krn = nc.CLKernel(global_shape=(output_shape.size, ),
                                         kernel_text=f"""
{common_kernel_text}
{ph.define_axes_accessor('K', (KN,KC,KH,KW), 'NCHW')}
__kernel void impl(__global float* O, __global const float* I, __global const float2* K)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('o', 'NCHW', 'gid')}

    float v = 0.0;

    float2 xys = K[K_idx_mod(on,oc,oh,ow)];

    for (int ih=0; ih < IH; ++ih)
    {{
        float ys_mod = max(0.0, 1.0-fabs(xys.y-ih));
        if (ys_mod != 0.0)
        for (int iw=0; iw < IW; ++iw)
        {{
            float xs_mod = max(0.0, 1.0-fabs(xys.x-iw));
            if (xs_mod != 0.0)
                v += xs_mod*ys_mod*I[I_idx(on,oc,ih,iw)];
        }}
    }}

    O[gid] = v;
}}
""")

        self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ),
                                  kernel_text=f"""
{common_kernel_text}
{ph.define_axes_accessor('K', (KN,KC,KH,KW), 'NCHW')}
__kernel void impl(__global float* dI, __global const float2* K, __global float* dO)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', 'NCHW', 'gid')}

    float v = 0.0;

    for (int oh=0; oh < OH; ++oh)
    for (int ow=0; ow < OW; ++ow)
    {{
        float2 xys = K[K_idx_mod(in,ic,oh,ow)];

        float ys_mod = max(0.0, 1.0-fabs(xys.y-ih));
        if (ys_mod != 0.0)
        {{
            float xs_mod = max(0.0, 1.0-fabs(xys.x-iw));
            if (xs_mod != 0.0)
                v += xs_mod*ys_mod*dO[O_idx(in,ic,oh,ow)];
        }}
    }}

    dI[gid] += v;
}}
""")

        self.dK_krn = nc.CLKernel(global_shape=(N * IC * KH * KW, ),
                                  kernel_text=f"""
{common_kernel_text}
{ph.define_axes_accessor('K', (N,IC,KH,KW), 'NCHW')}
__kernel void impl(__global float2* dK, __global const float* I, __global const float2* K, __global float* dO)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('k', 'NCHW', 'gid')}

    float dk_x = 0.0;
    float dk_y = 0.0;

    float2 xys = K[gid];

    for (int ih=0; ih < IH; ++ih)
    {{
        {{
            float ys_mod = max(0.0, 1.0-fabs(xys.y-ih));

            if (ys_mod != 0.0)
            for (int iw=0; iw < IW; ++iw)
            if ( fabs(iw-xys.x) < 1.0 )
            {{
                float xs_mod = 1.0*(iw >= xys.x)-1.0*(iw < xys.x);
                dk_x += xs_mod*ys_mod*I[I_idx(kn,kc,ih,iw)] * dO[O_idx(kn,kc,kh,kw)];
            }}
        }}

        if (fabs(ih-xys.y) < 1.0)
        {{
            float ys_mod = 1.0*(ih >= xys.y)-1.0*(ih < xys.y);
            for (int iw=0; iw < IW; ++iw)
            {{
                float xs_mod = max(0.0, 1.0-fabs(xys.x-iw));
                if (xs_mod != 0.0)
                    dk_y += xs_mod*ys_mod*I[I_idx(kn,kc,ih,iw)] * dO[O_idx(kn,kc,kh,kw)];
            }}
        }}
    }}
    dK[gid] += (float2)(dk_x, dk_y);
}}
""")

Esempio n. 16

0

Mostra file

File: unfold2D.py Progetto: JHP4911/litenn

    def __init__(self, N, C, H, W, OH, OW, KH, KW, PADL, PADT, DILATION,
                 STRIDE, sshape, is_transpose, input_data_format,
                 is_add_to_output):
        if input_data_format not in ['NCHW', 'NHWC']:
            raise ValueError(f'Unknown input_data_format {input_data_format}.')

        d = {'N': N, 'C': C, 'H': OH, 'W': OW, 'J': KH, 'I': KW}
        sshape = sshape.upper()

        output_shape = [1]
        O_shape = []
        O_shape_size = 1
        O_shape_axes = ''
        for symbol in sshape:
            if symbol == '_':
                output_shape.append(1)
            else:
                value = d.get(symbol, None)
                if value is None:
                    raise ValueError(
                        f'Unknown symbol {symbol}. Valid symbols: _{list(d.keys())}'
                    )
                if value == -1:
                    raise ValueError(f'Duplicate symbol {symbol}')
                output_shape[-1] *= value
                O_shape.append(value)
                O_shape_size *= value
                O_shape_axes += symbol
                d[symbol] = -1

        self.output_shape = tuple(output_shape)
        O_shape = tuple(O_shape)

        input_shape = (N, C, H, W) if input_data_format == 'NCHW' else (N, H,
                                                                        W, C)

        if not is_transpose:
            self.krn = nc.CLKernel(global_shape=(O_shape_size, ),
                                   kernel_text=f"""
#define STRIDE {STRIDE}
#define DILATION {DILATION}
#define PADT {PADT}
#define PADL {PADL}

{ph.define_axes_accessor('I', input_shape, input_data_format)}
{ph.define_axes_accessor('O', O_shape, O_shape_axes)}
__kernel void impl(__global float* O, __global const float* I)
{{  
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('o', O_shape_axes, 'gid')}
#define ic oc
#define in on
    float value = 0.0;
    int ih = -PADT + oj*DILATION + oh*STRIDE;
    if (ih >= 0 & ih < IH)
    {{
        int iw = -PADL + oi*DILATION + ow*STRIDE;
        if (iw >= 0 & iw < IW)
            value = I[I_idx({ph.axes_order_enum('i', input_data_format)})];
    }}

    O[O_idx({ph.axes_order_enum('o', O_shape_axes)})]
    {'+=' if is_add_to_output else '='} value;
}}
""")
        else:
            self.krn = nc.CLKernel(global_shape=(O_shape_size, ),
                                   kernel_text=f"""
#define STRIDE {STRIDE}
#define DILATION {DILATION}
#define PADT {PADT}
#define PADL {PADL}

{ph.define_axes_accessor('I', input_shape, input_data_format)}
{ph.define_axes_accessor('O', O_shape, O_shape_axes)}
__kernel void impl(__global float* O, __global const float* I)
{{  
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('o', O_shape_axes, 'gid')}
#define ic oc
#define in on
    float value = 0.0;
    int ih = ( PADT + oh - oj*DILATION ) / STRIDE;
    if (ih >= 0 & ih < IH & (oh == -PADT + oj*DILATION + ih*STRIDE) )
    {{
        int iw = ( PADL + ow - oi*DILATION ) / STRIDE;   
        if (iw >= 0 & iw < IW & (ow == -PADL + oi*DILATION + iw*STRIDE) )
            value = I[I_idx(in,ic,ih,iw)];
    }}
    O[O_idx({ph.axes_order_enum('o', O_shape_axes)})]
    {'+=' if is_add_to_output else '='} value;
}}
""")

Esempio n. 17

0

Mostra file

    def __init__(self, input_shape: nc.TensorShape, slices, is_add_to_output):

        # Validate slices argument for given shape.
        new_slices = []
        before_ellipsis = None

        for s in slices:
            if s is Ellipsis:
                before_ellipsis = new_slices
                new_slices = []
                continue
            elif s is not None and not isinstance(s, (int, tuple)):
                raise ValueError(
                    f'unknown slice argument {s} of type {s.__class__}')

            new_slices.append(s)

        if before_ellipsis is not None:
            # Process Ellipsis separator
            new_slices_n_axes = sum([1 for x in new_slices if x != None])
            before_ellipsis_n_axes = sum(
                [1 for x in before_ellipsis if x != None])

            # Expand slices by filling intermediate (None,None,None) for each remaining axis
            new_slices = before_ellipsis + \
                         [(None,None,None)]*max(0, input_shape.rank-before_ellipsis_n_axes-new_slices_n_axes) + \
                         new_slices

        new_slices_n_axes = sum([1 for x in new_slices if x != None])
        if new_slices_n_axes > input_shape.rank:
            raise ValueError('slices arguments more than shape axes')
        elif new_slices_n_axes < input_shape.rank:
            # Fill remaining axes
            new_slices += [(None, None, None)
                           ] * (input_shape.rank - new_slices_n_axes)

        slices = tuple(new_slices)

        # Compute shapes
        output_is_reshaped = True  # Flag determines that output_tensor
        # can be just reshaped without any computation
        output_shape = []  # output tensor shape
        output_shape_krn = [
        ]  # output shape used in kernel, must match input shape
        input_axes_begin_step = []  # begin,step ints for every input shape

        i_axis = 0
        for v in slices:
            if v is None:
                # None is new axis
                # We can add unlimited number of (1,) axes at any place of shape
                output_shape.append(1)
                continue

            i_axis_size = input_shape[i_axis]
            i_axis += 1

            if isinstance(v, int):
                if v < 0: v += i_axis_size
                if v < 0 or v >= i_axis_size:
                    raise ValueError(
                        f'index {v} is out of bounds for axis {i_axis} with size {i_axis_size}'
                    )
                b, e, s = v, v, 1
            else:
                b, e, s = v

            # Fix begin, end, step values
            if s is None: s = 1
            if s == 0:
                raise ValueError('slice step cannot be zero')

            if b is None: b = 0 if s > 0 else i_axis_size - 1
            if e is None: e = i_axis_size if s > 0 else -1
            elif e < 0: e += i_axis_size

            if b < 0: b += i_axis_size

            if s > 0:
                b = np.clip(b, 0, i_axis_size)
                e = np.clip(e, 0, i_axis_size)
            else:
                b = np.clip(b, 0, i_axis_size - 1)
                e = np.clip(e, -1, i_axis_size)

            if i_axis_size != 1 and not (b == 0 and e == i_axis_size
                                         and s == 1):
                # Such params of axis slice will change input, thus output cannot be just reshaped input
                output_is_reshaped = False

            # Compute output_axis_size based on begin,end,step
            output_axis_size = max(0, math.ceil((e - b) / s))

            if output_axis_size >= 1:
                # >= 1 : select range of indexes, axis will remain
                output_shape.append(output_axis_size)
            # ^ othwerwise axis will be supressed

            # output_shape to use in kernel, must match rank of input shape
            output_shape_krn.append(max(1, output_axis_size))

            # for every output_shape_krn axis
            # we have exact begin,step values to fetch value from input
            input_axes_begin_step.append((b, s))

        output_shape_krn = nc.TensorShape(output_shape_krn)
        self.output_is_reshaped = output_is_reshaped
        self.output_shape = nc.TensorShape(output_shape)

        self.forward_krn = nc.CLKernel(global_shape=(output_shape_krn.size, ),
                                       kernel_text=f"""
{ph.define_axes_accessor('I', input_shape )}
{ph.define_axes_sizes('O', output_shape_krn )}
__kernel void impl(__global const float* I, __global float* O)
{{
size_t gid = get_global_id(0);
{ph.axes_idxs_from_var('o', output_shape_krn.rank, 'gid')}
{''.join( f'size_t i{i} = {b} + o{i} * {s};' for i, (b,s) in enumerate(input_axes_begin_step)  )  }
O[get_global_id(0)] {'+=' if is_add_to_output else '='} I[I_idx({ph.axes_seq_enum('i', input_shape.rank)})];
}}
""")

        self.backward_krn = nc.CLKernel(global_shape=(output_shape_krn.size, ),
                                        kernel_text=f"""
{ph.define_axes_accessor('I', input_shape )}
{ph.define_axes_sizes('O', output_shape_krn )}
__kernel void impl(__global float* dI, __global const float* O)
{{
size_t gid = get_global_id(0);
{ph.axes_idxs_from_var('o', output_shape_krn.rank, 'gid')}
{''.join( f'size_t i{i} = {b} + o{i} * {s};' for i, (b,s) in enumerate(input_axes_begin_step)  )  }
dI[I_idx({ph.axes_seq_enum('i', input_shape.rank)})] += O[get_global_id(0)];
}}
""")

Esempio n. 18

0

Mostra file

File: depthwise_conv2D.py Progetto: JHP4911/litenn

    def __init__(self, input_shape : nc.TensorShape, kernel_shape : nc.TensorShape, stride, dilation, padding):
        if kernel_shape.rank != 3:
            raise ValueError(f'Kernel shape rank must be == 3')   
        N,IC,IH,IW = input_shape
        KI,KH,KW = kernel_shape      
        if KI != IC:
            raise ValueError(f'Kernel input channels {KI} does not match tensor input channels {IC}.')
        
        ci = nc.info.InfoConv2D(IH, IW, KH, KW, stride, dilation, padding)
        OC, OH, OW = IC, ci.OH, ci.OW
        self.output_shape = output_shape = nc.TensorShape( (N, OC, OH, OW) )

        self.OC_1_1_NxOHxOW = (OC,1,1,N*OH*OW)
        self.KI_KH_KW_NxOHxOW = (KI,KH,KW,N*OH*OW)

        common_kernel_text = f"""
{ph.define_axes_accessor('I', input_shape, 'NCHW')}
{ph.define_axes_accessor('O', output_shape, 'NCHW')}
{ph.define_axes_accessor('K', kernel_shape, 'IHW')}
#define PADL {ci.PADL}
#define PADT {ci.PADT}

#define STRIDE {stride}
#define DILATION {dilation}
"""
        self.O_depthwise_krn = nc.CLKernel(global_shape=(output_shape.size,), kernel_text=f"""
{common_kernel_text}
__kernel void impl(__global float* O, __global const float* I, __global const float* K)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('o', 'NCHW', 'gid')}

    float v = 0.0;
    for (int kh=0; kh<KH; ++kh)
    {{
        int ih = -PADT + kh*DILATION + oh*STRIDE;
        if (ih >= 0 & ih < IH)
            for (int kw=0; kw<KW; ++kw)
            {{
                int iw = -PADL + kw*DILATION + ow*STRIDE;
                if (iw >= 0 & iw < IW)
                    v += I[I_idx(on,oc,ih,iw)]*K[K_idx(oc,kh,kw)];
            }}
    }}
    O[gid] = v;
}}
""")

        self.dI_depthwise_krn = nc.CLKernel(global_shape=(input_shape.size,), kernel_text=f"""
{common_kernel_text}
__kernel void impl(__global float* dI, __global const float* K, __global const float* dO)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', 'NCHW', 'gid')}
    float v = 0.0;
    for (int kh=0; kh<KH; ++kh)
    {{
        int oh = (PADT + ih - kh*DILATION ) / STRIDE;
        if (oh >= 0 & oh < OH)
            for (int kw=0; kw<KW; ++kw)
            {{
                int ow = (PADL + iw - kw*DILATION ) / STRIDE;
                if (ow >= 0 & ow < OW
                    & iw == (-PADL + kw*DILATION + ow*STRIDE)
                    & ih == (-PADT + kh*DILATION + oh*STRIDE) )
                        v += dO[O_idx(in,ic,oh,ow)]*K[K_idx(ic,kh,kw)];
            }}
    }}
    dI[gid] += v;
}}
""")

        self.im2col  = lambda x: nc.op.unfold2D(x, N, IC, IH, IW, OH, OW, KH, KW, ci.PADL, ci.PADT, dilation, stride, 'CJI_NHW', is_transpose=False)

Esempio n. 19

0

Mostra file

    def __init__(self, a_shape, b_shape, is_add_to_output):
        if a_shape.rank != b_shape.rank:
            raise ValueError(
                f'Ranks are not equal. {a_shape.rank} != {b_shape.rank}')

        rank = a_shape.rank
        if rank < 2:
            raise ValueError('Tensors rank must be at least 2.')

        K, M = a_shape[-2], a_shape[-1]
        N, B_COLS = b_shape[-2], b_shape[-1]

        if K != B_COLS:
            raise ValueError('A_ROWS != B_COLS')

        BATCH = a_shape[0:-2].size
        B_BATCH = b_shape[0:-2].size

        if BATCH != B_BATCH:
            raise ValueError(
                f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}'
            )

        if rank == 2:
            self.output_shape = output_shape = nc.TensorShape((N, M))
        else:
            self.output_shape = output_shape = nc.TensorShape(a_shape[:-2] +
                                                              (N, M))

        self.M = M
        self.N = N
        self.K = K

        # Determining optimal tile widths
        for MW in [16, 8, 4, 2, 1]:
            if M % MW == 0:
                break
        for KW in [8, 4, 2, 1]:
            if N % KW == 0 and K % KW == 0:
                break
        NW = KW

        self.forward_krn = nc.CLKernel(global_shape=(M // MW, N // NW, BATCH),
                                       kernel_text=f"""
#define K {K}
#define N {N}
#define MW {MW}     // M tile Width
#define NW {NW}     // N tile Width  -- NW & KW should be the same !
#define KW {KW}     // K tile Width
#define MT {M//MW}  // MT is max for 'mt' (M tile count)
#define KT {K//KW}  // KT is max for 'kt' (K tile count)

#define floatMW { f'float{MW}' if MW != 1 else 'float'}
#define floatKW { f'float{KW}' if KW != 1 else 'float'}

__kernel void GeMM(const __global floatMW* restrict A, const __global floatKW* restrict B, __global floatMW* C)
{{
    size_t mt = get_global_id(0);    //global M-tile id
    size_t nc = get_global_id(1);    //global N-tile id
    size_t batch = get_global_id(2); 
        
    float AT[KW][MW]; // sub tiles
    float BT[NW][KW];
    float CT[NW][MW];

    #pragma unroll
    for (uint i=0; i<NW*MW; ++i) // zero CT tile
        ((float*) CT)[i] = 0.0;

    for (uint kt=0; kt<KT; ++kt)  // iterate over K-dim tiles
    {{
        #pragma unroll
        for (uint k=0; k<KW; ++k)  // every k-element inside K-dim tile
            *( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats

        #pragma unroll
        for (uint n=0; n<NW; ++n)  // every n-element inside N-dim tile
            *( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats

        #pragma unroll
        for (uint k=0; k<KW; ++k)
        #pragma unroll
        for (uint n=0; n<NW; ++n)  // sub tiles multiplication
        #pragma unroll
        for (uint m=0; m<MW; ++m)
            CT[n][m] += AT[k][m] * BT[n][k];
    }}

    #pragma unroll
    for (uint n=0; n<NW; ++n)
        C[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='}
                               *( (floatMW*) CT[n]);
}}""")

Esempio n. 20

0

Mostra file

    def __init__(self, op_type, input_shape: nc.TensorShape, pool_size, stride,
                 padding):
        if op_type not in ['avg', 'min', 'max']:
            raise ValueError(f'unknown op_type {op_type}')
        if pool_size < 2:
            raise ValueError(f'pool_size {pool_size} must be at least 2')
        self.op_type = op_type

        N, IC, IH, IW = input_shape
        ci = nc.info.InfoConv2D(IH, IW, pool_size, pool_size, stride, 1,
                                padding)
        OC, OH, OW = IC, ci.OH, ci.OW
        self.output_shape = output_shape = nc.TensorShape((N, OC, OH, OW))

        common_kernel_text = f"""
{ph.define_axes_accessor('I', input_shape, 'NCHW')}
{ph.define_axes_accessor('O', output_shape, 'NCHW')}

#define PADL {ci.PADL}
#define PADT {ci.PADT}

#define POOL_SIZE {pool_size}
#define STRIDE {stride}
"""

        self.O_forward_krn = nc.CLKernel(global_shape=(output_shape.size, ),
                                         kernel_text=f"""
{common_kernel_text}

__kernel void impl(__global float* O, __global const float* I)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('o', 'NCHW', 'gid')}

    { {'avg' : 'float v = 0.0; int v_count = 0;',
       'max' : 'float v = -INFINITY;',
       'min' : 'float v = INFINITY;',
      }[op_type] }

    for (int ph=0; ph<POOL_SIZE; ++ph)
    for (int pw=0; pw<POOL_SIZE; ++pw)
    {{
        int ih = -PADT + ph + oh*STRIDE;
        int iw = -PADL + pw + ow*STRIDE;
        if (iw >= 0 & ih >= 0 & iw < IW & ih < IH)
        {{
            { {'avg' : 'v +=        I[I_idx(on,oc,ih,iw)]; ++v_count;',
               'max' : 'v = fmax(v, I[I_idx(on,oc,ih,iw)]);',
               'min' : 'v = fmin(v, I[I_idx(on,oc,ih,iw)]);',
              }[op_type] }
        }}
    }}

    { {'avg' : 'if (v_count != 0) v /= v_count;',
       'max' : 'if (v == -INFINITY) v = 0.0;',
       'min' : 'if (v == INFINITY) v = 0.0;',
      }[op_type] }


    O[gid] = v;
}}
""")
        if op_type == 'avg':
            self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ),
                                      kernel_text=f"""
{common_kernel_text}
__kernel void impl(__global float* dI, __global const float* dO)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', 'NCHW', 'gid')}

    float v = 0.0;

    for (int ph=0; ph<POOL_SIZE; ++ph)
    for (int pw=0; pw<POOL_SIZE; ++pw)
    {{
        int oh = (PADT + ih - ph ) / STRIDE;
        int ow = (PADL + iw - pw ) / STRIDE;
        if (ow >= 0 & oh >= 0 & ow < OW & oh < OH
            & iw == (-PADL + pw + ow*STRIDE)
            & ih == (-PADT + ph + oh*STRIDE) )
        {{
            int d=0;
            for (int dph=0; dph<POOL_SIZE; ++dph)
            for (int dpw=0; dpw<POOL_SIZE; ++dpw)
            {{
                int dih = -PADT + dph + oh*STRIDE;
                int diw = -PADL + dpw + ow*STRIDE;
                d += (diw >= 0 & dih >= 0 & diw < IW & dih < IH);
            }}
            v += dO[O_idx(in,ic,oh,ow)] / d;
        }}
    }}

    dI[gid] += v;
}}
""")
        elif op_type in ['min', 'max']:
            # Implementation is different from tensorflow in case when the same values exist in reduction axes.
            # Example tf     : 3 4 5 5 , max = 5, gradients : 0 0 0.5 0.5
            # Example litenn : 3 4 5 5 , max = 5, gradients : 0 0   1   0
            #                                 or  gradients : 0 0   0   1 - depends on which GPU thread will be first !
            self.dI_krn = nc.CLKernel(global_shape=(input_shape.size, ),
                                      kernel_text=f"""
{common_kernel_text}
__kernel void impl(__global float* dI, __global const float* I, __global const float* dO, __global const float* O)
{{
    size_t gid = get_global_id(0);
    {ph.axes_idxs_from_var('i', 'NCHW', 'gid')}

    float v = 0.0;

    // Iterate over all O pixels, where 'I' have contribution
    for (int ph=0; ph<POOL_SIZE; ++ph)
    for (int pw=0; pw<POOL_SIZE; ++pw)
    {{
        int oh = (PADT + ih - ph ) / STRIDE;
        int ow = (PADL + iw - pw ) / STRIDE;
        if (ow >= 0 & oh >= 0 & ow < OW & oh < OH
            & iw == (-PADL + pw + ow*STRIDE)
            & ih == (-PADT + ph + oh*STRIDE) )
        {{
            //Now we have oh,ow where ih,iw have contribution

            float Ov = O[O_idx(in,ic,oh,ow)];

            //Iterate in those I pixels, which were used to produce O
            //to determine first min/max match
            for (int dphw=0; dphw < POOL_SIZE*POOL_SIZE; ++dphw)
            {{
                int dih = -PADT + (dphw / POOL_SIZE) + oh*STRIDE;
                int diw = -PADL + (dphw % POOL_SIZE) + ow*STRIDE;
                if (diw >= 0 & dih >= 0 & diw < IW & dih < IH &
                    I[I_idx(in,ic,dih,diw)] == Ov)
                {{
                    // Match I==O
                    if (dih == ih & diw == iw)
                        // but add gradient only if current ih/iw index match dih/diw
                        v += dO[O_idx(in,ic,oh,ow)];
                    break;
                }}
            }}
        }}
    }}

    dI[gid] += v;
}}
""")

Esempio n. 21

0

Mostra file

    def __init__(self, DualWiseOpKernel_cls, DualWiseOpKernel_args,
                 a_shape: nc.TensorShape, b_shape: nc.TensorShape,
                 is_add_to_output):
        self.kernel = DualWiseOpKernel_cls(*DualWiseOpKernel_args)
        self.info = info = nc.info.InfoBroadcast(a_shape, b_shape)

        # Implement kernel. Process of both broadcasted shapes using index modulus accessor.
        self.forward_krn = nc.CLKernel(global_shape=(info.output_shape.size, ),
                                       kernel_text=f"""
{ph.define_axes_accessor('A', info.a_br_shape )}
{ph.define_axes_accessor('B', info.b_br_shape )}
{ph.define_axes_accessor('O', info.output_shape )}
__kernel void impl(__global float* O_t, __global const float* A_t, __global const float* B_t)
{{
size_t gid = get_global_id(0);
{ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')}
float A = A_t[A_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})];
float B = B_t[B_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})];
float O = 0.0;
{self.kernel.get_forward_kernel_text()}
O_t[get_global_id(0)] {'+=' if is_add_to_output else '='} O;
}}
""")

        self.backward_A_krn = nc.CLKernel(
            global_shape=(info.output_shape.size, ),
            kernel_text=f"""
{ph.define_axes_accessor('A', info.a_br_shape)}
{ph.define_axes_accessor('B', info.b_br_shape)}
{ph.define_axes_accessor('O', info.output_shape)}
__kernel void impl(__global float* dA_t,
                   __global const float* A_t, __global const float* B_t,
                   __global const float* O_t, __global const float* dO_t)
{{
size_t gid = get_global_id(0);
{ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')}
float A = A_t[A_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})];
float B = B_t[B_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})];
float O = O_t[gid];
float dO = dO_t[gid];
float dA = 0.0;
{self.kernel.get_backward_A_kernel_text()}
dA_t[gid] += dA;
}}
""")
        self.backward_B_krn = nc.CLKernel(
            global_shape=(info.output_shape.size, ),
            kernel_text=f"""
{ph.define_axes_accessor('A', info.a_br_shape)}
{ph.define_axes_accessor('B', info.b_br_shape)}
{ph.define_axes_accessor('O', info.output_shape)}
__kernel void impl(__global float* dB_t,
                   __global const float* A_t, __global const float* B_t,
                   __global const float* O_t, __global const float* dO_t)
{{
size_t gid = get_global_id(0);
{ph.axes_idxs_from_var('o', info.output_shape.rank, 'gid')}
float A = A_t[A_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})];
float B = B_t[B_idx_mod({ph.axes_seq_enum('o', info.output_shape.rank)})];
float O = O_t[gid];
float dO = dO_t[gid];
float dB = 0.0;
{self.kernel.get_backward_B_kernel_text()}
dB_t[gid] += dB;
}}
""")

Esempio n. 22

0

Mostra file

File: BatchNorm2D.py Progetto: JHP4911/litenn

class BatchNorm2D(nn.Module):
    """
    Batch Normalization 2D module.

    arguments

        in_ch       int             input channels
    
    Don't forget to call most parent Module.set_training(bool)
    
    References
    
    [Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
    """
    def __init__(self, in_ch, momentum=0.99):
        self.in_ch = in_ch

        if momentum < 0 or momentum > 1.0:
            raise ValueError(
                f'momentum {momentum} must be in range [0 .. 1.0]')
        self.momentum = momentum

        self.gamma = nn.Tensor((in_ch, ), init=nn.initializer.Scalar(1.0))
        self.beta = nn.Tensor((in_ch, ), init=nn.initializer.Scalar(0.0))

        self.running_mean = nn.Tensor((in_ch, ),
                                      init=nn.initializer.Scalar(0.0))
        self.running_var = nn.Tensor((in_ch, ),
                                     init=nn.initializer.Scalar(1.0))

        super().__init__(
            saveables=['gamma', 'beta', 'running_mean', 'running_var'],
            trainables=['gamma', 'beta'])

    def forward(self, x, **kwargs):

        if self.is_training():
            mean, var = nn.moments(x, axes=(0, 2, 3), keepdims=True)

            BatchNorm2D.upd_krn.run(self.running_mean,
                                    self.running_var,
                                    mean,
                                    var,
                                    np.float32(self.momentum),
                                    global_shape=(self.in_ch, ))
        else:
            mean = self.running_mean.reshape((1, -1, 1, 1))
            var = self.running_var.reshape((1, -1, 1, 1))

        x = (x - mean) / (nn.sqrt(var) + 1e-5)

        x = x * self.gamma.reshape( (1,-1,1,1) ) \
              + self.beta.reshape( (1,-1,1,1) )
        return x

    def __str__(self):
        return f"{self.__class__.__name__} : in_ch:{self.in_ch}"

    def __repr__(self):
        return self.__str__()

    upd_krn = nc.CLKernel(kernel_text="""
__kernel void asd( __global float* RM, __global float* RV, __global const float* M, __global const float* V, float momentum)
{{
    size_t gid = get_global_id(0);
    
    float rm = RM[gid];
    float m = M[gid];
    float rv = RV[gid];
    float v = V[gid];
    
    RM[gid] = rm*momentum + m*(1.0-momentum);
    RV[gid] = rv*momentum + v*(1.0-momentum);
}}
""")