Example #1
    def forward_gpu(self, inputs):
        x, h_tm1 = inputs
        N = x.shape[0]
        
        #update gate
        u = cuda.empty((N,self.out_size),dtype=np.float32)
        cuk.dot(x, self.Wu, out=u, transb = 't')
        cuk.dotAdd(h_tm1, self.Vu, C=u, transb='t')
        #reset gate
        r = cuda.empty((N,self.out_size),dtype=np.float32)
        cuk.dot(x, self.Wr, out=r, transb = 't')
        cuk.dotAdd(h_tm1, self.Vr, C=r, transb='t')
        if not self.nobias:
            cuk.addVec2Mat(u, self.bu)
            cuk.addVec2Mat(r, self.br)
        self.u = cuk.sigmoid(x=u, out=u)
        self.r = cuk.sigmoid(x=r, out=r)
        
        #new memory
        HV = cuda.empty((N,self.out_size),dtype=np.float32)
        self.HV = cuk.dot(h_tm1, self.Vh, out=HV, transb='t')
        h_tilde = cuda.empty((N,self.out_size),dtype=np.float32)
        h_tilde = cuk.hadamard(r, self.HV, out=h_tilde)
        cuk.dotAdd(x, self.Wh, C=h_tilde, transb='t')
        if not self.nobias:
            cuk.addVec2Mat(h_tilde, self.bh)
        self.h_tilde = cuk.tanh(x=h_tilde, out=h_tilde)

        #hidden state
        h = cuda.empty((N,self.out_size),dtype=np.float32)
        self.h = cuk.gru_forward(u=u, h_tilde=h_tilde, h_tm1=h_tm1,
                                 out=h)
        
        return self.h,
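For reference, a minimal NumPy sketch of the GRU step the kernels above implement (this is my reading of the example, not code from its repository; the interpolation convention of cuk.gru_forward is an assumption):

import numpy as np

def _sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gru_forward_cpu(x, h_tm1, Wu, Vu, Wr, Vr, Wh, Vh, bu=0, br=0, bh=0):
    # update and reset gates
    u = _sigmoid(x.dot(Wu.T) + h_tm1.dot(Vu.T) + bu)
    r = _sigmoid(x.dot(Wr.T) + h_tm1.dot(Vr.T) + br)
    # candidate state: the reset gate scales only the recurrent contribution
    h_tilde = np.tanh(x.dot(Wh.T) + r * h_tm1.dot(Vh.T) + bh)
    # interpolate between previous state and candidate (convention assumed)
    return (1 - u) * h_tm1 + u * h_tilde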
Example #2
 def forward_gpu(self, inputs):
     x, targets = inputs
     N = x.shape[0]
     
     
     #Linear function
     z = cuda.empty((N,self.no_labels), dtype=np.float32)
     cuk.dot(x, self.W, out=z, transb='t')
     if not self.nobias:
         cuk.addVec2Mat(z, self.b)
     
     self.probs = z
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(z, 1, 1)
         libcudnn.cudnnSoftmaxForward(
             handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(z),
             0, desc.value, cudnn.get_ptr(self.probs))
     else:
         cuk.softmax(z, self.probs)
     
     if self.return_probs:
         return self.probs,
         
     if self.compute_loss:
         correct_probs = cuda.empty((N,),dtype=np.float32)
          cuk.getByIndex_LogAndClip(self.probs, targets, out=correct_probs)
         loss = -cuda.cumisc.sum(correct_probs, keepdims=True)/N
     else:
         loss = np.atleast_2d(np.array(np.nan,dtype=np.float32))
     
     return loss,
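A NumPy sketch of the softmax / negative log-likelihood the kernels above are assumed to compute (the 1e-8 clip mirrors getByIndex_LogAndClip):

import numpy as np

def softmax_cross_entropy_cpu(x, targets, W, b=None, return_probs=False):
    z = x.dot(W.T)                        # W has shape (no_labels, in_size)
    if b is not None:
        z += b
    z -= z.max(axis=1, keepdims=True)     # numerically stable softmax
    probs = np.exp(z)
    probs /= probs.sum(axis=1, keepdims=True)
    if return_probs:
        return probs
    N = x.shape[0]
    correct = np.clip(probs[np.arange(N), targets], 1e-8, 1.0)
    return -np.log(correct).sum() / N     # mean negative log-likelihood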
Example #3
    def forward_gpu(self, x):
        n, c, h, w = x[0].shape
        out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        out_c = self.W.shape[0]
        y = cuda.empty((n, out_c, out_h, out_w), dtype=self.dtype)
        if cuda.cudnn_enabled and self.use_cudnn:
            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x[0])
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(self.W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if self.b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(
                    self.b[None, :, None, None])

            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, algo)
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=self.dtype)

            one = ctypes.c_float(1)
            zero = ctypes.c_float(0)
            libcudnn.convolutionForward(
                handle, one, x_desc.value, x[0].data.ptr,
                self.filter_desc.value, self.W.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero, y_desc.value,
                y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                libcudnn.addTensor(
                    handle, libcudnn.CUDNN_ADD_SAME_C, one,
                    self.bias_desc.value, self.b.data.ptr, one, y_desc.value,
                    y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = self.col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            y_mats = y.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                y += self.b.reshape((1, out_c, 1, 1))

        return y,
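The non-cuDNN branch lowers the convolution to per-sample matrix products over im2col patches. A NumPy sketch of that step (a tensordot over the channel and kernel axes; the col layout is assumed to match im2col_gpu):

import numpy as np

def conv2d_from_col_cpu(col, W, b=None):
    # col: (n, c, kh, kw, out_h, out_w); W: (out_c, c, kh, kw)
    y = np.tensordot(col, W, axes=((1, 2, 3), (1, 2, 3)))  # (n, out_h, out_w, out_c)
    y = np.moveaxis(y, 3, 1)                               # (n, out_c, out_h, out_w)
    if b is not None:
        y += b.reshape(1, -1, 1, 1)
    return y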
Example #4
    def forward_gpu(self, x):
        n, c, h, w = x[0].shape
        out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        out_c = self.W.shape[0]
        y = cuda.empty((n, out_c, out_h, out_w), dtype=self.dtype)
        if cuda.cudnn_enabled and self.use_cudnn:
            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x[0])
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(self.W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if self.b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(
                    self.b[None, :, None, None])

            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, algo)
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=self.dtype)

            one = ctypes.c_float(1)
            zero = ctypes.c_float(0)
            libcudnn.convolutionForward(
                handle, one, x_desc.value, x[0].data.ptr,
                self.filter_desc.value, self.W.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero, y_desc.value,
                y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                libcudnn.addTensor(
                    handle, libcudnn.CUDNN_ADD_SAME_C, one,
                    self.bias_desc.value, self.b.data.ptr, one, y_desc.value,
                    y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = self.col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            y_mats = y.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                y += self.b.reshape((1, out_c, 1, 1))

        return y,
Example #5
 def backward_gpu(self, inputs, grad_outputs):
     y, z = inputs
     N = y.shape[0]
     gy = cuda.empty(y.shape)
     gz = cuda.empty(z.shape)
     cuda.culinalg.add_dot(self.z_centered, self.covariance, gy, transb="T", alpha=1.0 / N, beta=0.0)
     cuda.culinalg.add_dot(self.y_centered, self.covariance, gz, alpha=1.0 / N, beta=0.0)
     return gy, gz
Example #6
 def backward_gpu(self, x, gy):
     a, b = x
     batch_size = a.shape[0]
     ga = cuda.empty((batch_size,) + _as_mat(a[0]).shape)
     gb = cuda.empty((batch_size,) + _as_mat(b[0]).shape)
     _batch_matmul_gpu(
         gy[0], b, transb=not self.transb, transout=self.transa, out=ga)
     _batch_matmul_gpu(
         a, gy[0], transa=not self.transa, transout=self.transb, out=gb)
     ga = ga.reshape(a.shape)
     gb = gb.reshape(b.shape)
     return ga, gb
Example #7
 def backward_gpu(self, inputs, grad_outputs):
     y, z = inputs
     gcost, = grad_outputs
     N = y.shape[0]
     gy = cuda.empty(y.shape)
     gz = cuda.empty(z.shape)
     cuda.culinalg.add_dot(self.z_centered, self.covariance, gy,
                           transb='T', alpha=1./N, beta=0.)
     cuda.culinalg.add_dot(self.y_centered, self.covariance, gz,
                           alpha=1./N, beta=0.)
     gy = cuda.cumisc.multiply(gy, gcost)
     gz = cuda.cumisc.multiply(gz, gcost)
     return gy, gz
Example #8
    def forward_gpu(self, x):
        if cuda.cudnn_enabled and self.use_cudnn:
            return super(MaxPooling2D, self).forward_gpu(x)

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(
            h, self.kh, self.sy, self.ph, self.cover_all)
        y_w = conv.get_conv_outsize(
            w, self.kw, self.sx, self.pw, self.cover_all)
        y = cuda.empty((n, c, y_h, y_w), dtype=x[0].dtype)
        self.indexes = cuda.empty((n, c, y_h, y_w), dtype=numpy.int32)

        cuda.elementwise(
            'raw T in, int32 h, int32 w, int32 out_h, int32 out_w,'
            'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
            'T out, S indexes',
            '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
               int argmax_y = in_y_0;
               int argmax_x = in_x_0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   float v = in[x + offset_y];
                   if (maxval < v) {
                     maxval   = v;
                     argmax_y = y;
                     argmax_x = x;
                   }
                 }
               }
               out = maxval;

               int argmax_ky = argmax_y + ph - out_y * sy;
               int argmax_kx = argmax_x + pw - out_x * sx;
               indexes = argmax_kx + kw * argmax_ky;
            ''', 'max_pool_fwd')(x[0].reduced_view(),
                                 h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw,
                                 y, self.indexes)
        return y,
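A plain-NumPy sketch of the same pooling (assumed semantics: windows are clamped to the input, and indexes stores kernel-relative argmax offsets as in the kernel above):

import numpy as np

def max_pooling_2d_cpu(x, kh, kw, sy, sx, ph, pw, out_h, out_w):
    # x: (n, c, h, w) -> pooled output and argmax offsets, both (n, c, out_h, out_w)
    n, c, h, w = x.shape
    y = np.empty((n, c, out_h, out_w), dtype=x.dtype)
    indexes = np.empty((n, c, out_h, out_w), dtype=np.int32)
    for oy in range(out_h):
        y0, y1 = max(0, oy * sy - ph), min(h, oy * sy + kh - ph)
        for ox in range(out_w):
            x0, x1 = max(0, ox * sx - pw), min(w, ox * sx + kw - pw)
            window = x[:, :, y0:y1, x0:x1].reshape(n, c, -1)
            y[:, :, oy, ox] = window.max(axis=2)
            # convert the flat in-window argmax to kernel-relative offsets
            flat = window.argmax(axis=2)
            ay = y0 + flat // (x1 - x0) + ph - oy * sy
            ax = x0 + flat % (x1 - x0) + pw - ox * sx
            indexes[:, :, oy, ox] = ax + kw * ay
    return y, indexes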
Example #9
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        kh, kw = W.shape[2:]
        n, in_c, in_h, in_w = x.shape
        c = W.shape[1]  # out_c
        if self.outh is None:
            self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
        if self.outw is None:
            self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
        if len(inputs) == 3:
            b = inputs[2]
        if cuda.cudnn_enabled and self.use_cudnn:
            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y = cuda.empty((n, c, self.outh, self.outw), dtype=numpy.float32)
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if len(inputs) == 3:
                self.bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])

            one = numpy.array(1, dtype=x.dtype).ctypes
            zero = numpy.array(0, dtype=x.dtype).ctypes

            libcudnn.convolutionBackwardData(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, y_desc.value, y.data.ptr)
            if len(inputs) == 3:
                libcudnn.addTensor(
                    handle, libcudnn.CUDNN_ADD_SAME_C,
                    one.data, self.bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            W_mat = W.reshape(in_c, c * kh * kw)
            x_mats = x.reshape(n, in_c, in_h * in_w)
            gcol = cuda.empty((n, c, kh, kw, in_h,
                               in_w), dtype=numpy.float32)
            gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w)
            for i in moves.range(n):
                cuda.cupy.dot(W_mat.T, x_mats[i], gcol_mats[i])
            y = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw)
            if len(inputs) == 3:
                y += b.reshape(1, b.size, 1, 1)
        return y,
Example #10
    def forward_gpu(self, x):
        if cudnn.enabled and self.use_cudnn:
            return super(MaxPooling2D, self).forward_gpu(x)

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph,
                                    self.cover_all)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw,
                                    self.cover_all)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
        self.indexes = cuda.empty((n, c, y_h, y_w), dtype=numpy.int32)

        cuda.elementwise(
            '''
               float* out, int* indexes, const float* in,
               int h, int w, int out_h, int out_w,
               int kh, int kw, int sy, int sx, int ph, int pw
            ''', '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
               int argmax_y = in_y_0;
               int argmax_x = in_x_0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   float v = in[x + offset_y];
                   if (maxval < v) {
                     maxval   = v;
                     argmax_y = y;
                     argmax_x = x;
                   }
                 }
               }
               out[i] = maxval;

               int argmax_ky = argmax_y + ph - out_y * sy;
               int argmax_kx = argmax_x + pw - out_x * sx;
               indexes[i] = argmax_kx + kw * argmax_ky;
            ''',
            'max_pool_fwd')(y, self.indexes, x[0], h, w, y_h, y_w, self.kh,
                            self.kw, self.sy, self.sx, self.ph, self.pw)
        return y,
Example #11
    def forward_gpu(self, x):
        i_len, j_len = array.as_mat(x[0]).shape
        k_len = array.as_mat(x[1]).shape[1]
        l_len = self.W.shape[2]

        # When indices are enclosed with [], they are flattened
        # (i.e. linearized as a 1-D array)
        # ij->[ij]
        e1 = array.as_vec(x[0])
        # ik->[ik]
        e2 = array.as_vec(x[1])
        e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
        # '[ij],[ik]->[ijk]'
        cuda.elementwise(
            'float* y, float* e1, float* e2, int e1c, int e2c',
            '''
            int I = i / e1c / e2c;
            int J = (i - I * e1c * e2c) / e2c;
            int K = i % e2c;
            y[i] = e1[I * e1c + J] * e2[I * e2c + K];
            ''',
            'row_wise_outer_product')(
                e1e2, e1, e2, j_len, k_len)

        # [ijk]->i[jk]
        e1e2 = e1e2.reshape(i_len, j_len * k_len)

        # jkl->[jk]l
        W_mat = self.W.reshape(
            self.W.shape[0] * self.W.shape[1], self.W.shape[2])

        y = cuda.empty((i_len, l_len), dtype=numpy.float32)
        with cuda.using_cumisc():
            # 'i[jk],[jk]l->il'
            cuda.culinalg.dot(e1e2, W_mat, out=y)

        if not self.nobias:
            e1 = array.as_mat(x[0])
            e2 = array.as_mat(x[1])
            with cuda.using_cumisc():
                # ij,jl->il
                cuda.culinalg.add_dot(e1, self.V1, y)
                # ik,kl->il
                cuda.culinalg.add_dot(e2, self.V2, y)
            cuda.elementwise(
                'float* y, float* b, int n_channel',
                'y[i] += b[i % n_channel]',
                'linear_bias')(y, self.b, self.b.size)
        return y,
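The index bookkeeping above amounts to a single einsum; a NumPy sketch of the bilinear forward pass under the same parameter shapes:

import numpy as np

def bilinear_forward_cpu(e1, e2, W, V1=None, V2=None, b=None):
    # e1: (i, j), e2: (i, k), W: (j, k, l) -> y: (i, l)
    y = np.einsum('ij,ik,jkl->il', e1, e2, W)
    if V1 is not None:
        y += e1.dot(V1)      # optional linear term for e1
    if V2 is not None:
        y += e2.dot(V2)      # optional linear term for e2
    if b is not None:
        y += b               # optional bias, broadcast over rows
    return y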
Example #12
    def forward_gpu(self, inputs):
        x, t = inputs

        max_length = cuda.reduce(
            'int* t, int* begins', 'begins[t[i] + 1] - begins[t[i]]',
            'max(a,b)', '0', 'binary_hierarchical_softmax_max_length',
            numpy.int32
        )(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.empty((length,), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cuda.empty((length,), dtype=numpy.float32)
        cuda.elementwise(
            '''float* ls, float* wxy, const float* x, const float* w,
            const int* ts, const int* paths, const float* codes,
            const int* begins, int c, int max_length''',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              x = &x[ind * c];

              float wx = 0;
              for (int j = 0; j < c; ++j) {
                wx += w[node * c + j] * x[j];
              }
              wxy[i] = wx * codes[p];
              ls[i] = log(1 + exp(-wxy[i]));
            } else {
              ls[i] = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(ls, wxy, x, self.W, t, self.paths, self.codes, self.begins,
          n_in, max_length)
        self.max_length = max_length
        self.wxy = wxy
        return cuda.gpuarray.sum(ls),
Example #13
    def forward_gpu(self, inputs):
        x, t = inputs

        max_length = cuda.reduce(
            'int* t, int* begins', 'begins[t[i] + 1] - begins[t[i]]',
            'max(a,b)', '0', 'binary_hierarchical_softmax_max_length',
            numpy.int32
        )(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.empty((length,), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cuda.empty((length,), dtype=numpy.float32)
        cuda.elementwise(
            '''float* ls, float* wxy, const float* x, const float* w,
            const int* ts, const int* paths, const float* codes,
            const int* begins, int c, int max_length''',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              x = &x[ind * c];

              float wx = 0;
              for (int j = 0; j < c; ++j) {
                wx += w[node * c + j] * x[j];
              }
              wxy[i] = wx * codes[p];
              ls[i] = log(1 + exp(-wxy[i]));
            } else {
              ls[i] = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(ls, wxy, x, self.W, t, self.paths, self.codes, self.begins,
          n_in, max_length)
        self.max_length = max_length
        self.wxy = wxy
        return cuda.gpuarray.sum(ls),
Example #14
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape

    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        '''
           float* img, const float* col, int h, int w,
           int out_h, int out_w, int kh, int kw, int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               val += col[out_x + out_w * (out_y + out_h * (kx + kw * (ky + kh * c0)))];
             }
           }
           img[i] = val;
        ''', 'col2im')(img, col, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return img
Example #15
    def forward_gpu(self, x):
        if cuda.cudnn_enabled and self.use_cudnn:
            return super(AveragePooling2D, self).forward_gpu(x)

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
        coeff = 1. / (self.kh * self.kw)
        cuda.elementwise(
            'raw T in, int32 h, int32 w,'
            'int32 out_h, int32 out_w, int32 kh, int32 kw,'
            'int32 sy, int32 sx, int32 ph, int32 pw, T coeff', 'T out', '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float val = 0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   val += in[x + offset_y];
                 }
               }
               out = val * coeff;
            ''', 'avg_pool_fwd')(x[0].reduced_view(), h, w, y_h, y_w, self.kh,
                                 self.kw, self.sy, self.sx, self.ph, self.pw,
                                 coeff, y)
        return y,
Example #16
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)

    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw', 'T col',
        '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col = img[in_x + w * (in_y + h * c0)];
           } else {
             col = 0;
           }
        ''', 'im2col')(img.reduced_view(), h, w, out_h, out_w, kh, kw, sy, sx,
                       ph, pw, col)
    return col
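For comparison, a straightforward NumPy version of the same im2col lowering (a sketch for the non-cover_all case; out_h and out_w are taken as precomputed by get_conv_outsize):

import numpy as np

def im2col_cpu(img, kh, kw, sy, sx, ph, pw, out_h, out_w):
    # img: (n, c, h, w) -> col: (n, c, kh, kw, out_h, out_w)
    n, c, h, w = img.shape
    padded = np.pad(img, ((0, 0), (0, 0), (ph, ph), (pw, pw)), mode='constant')
    col = np.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    for ky in range(kh):
        for kx in range(kw):
            # a strided slice picks the (ky, kx) tap for every output position
            col[:, :, ky, kx, :, :] = padded[:, :, ky:ky + sy * out_h:sy,
                                             kx:kx + sx * out_w:sx]
    return col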
Example #17
def hotdot(a, indices, out=None, dont_add=False):
    """
    In:
        a: a pycuda gpuarray
        indices: hot indices a K-hot encoded matrix
    out:
        out: x.dot(a.T), where x is a K-hot encoded matrix 
    
    """
    H, D = a.shape
    N, K = indices.shape
    if N == 1:
        bdim, gdim = Get_bdim_and_gdimRowVec(H)
    elif H >= (N*4):
        bdim, gdim = Get_bdim_and_gdimSmallNBigM(N,H)
    else:
        bdim, gdim = Get_bdim_and_gdim2D(N,H)
    if dont_add:
        B = np.int32(1)
    else:
        B = np.int32(0)
        
    if out is None:
        out = cuda.empty((N,H), dtype=np.float32)
        B = np.int32(1)
    
    if K > 1:
        HotDot1_kernel.prepared_call(gdim, bdim,
                                a.gpudata, out.gpudata, indices.gpudata,
                np.int32(K), np.int32(N), np.int32(H), np.int32(D), np.int32(B))
    else:
        HotDot2_kernel.prepared_call(gdim, bdim,
                                a.gpudata, out.gpudata, indices.gpudata,
                        np.int32(N), np.int32(H), np.int32(D), np.int32(B))
    return out
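A NumPy sketch of what hotdot is assumed to compute (the dont_add/B accumulation flag of the kernels is not reproduced here):

import numpy as np

def hotdot_cpu(a, indices):
    # a: (H, D); indices: (N, K) holds the hot column positions of a K-hot matrix x
    # x.dot(a.T)[n, h] == sum over k of a[h, indices[n, k]]
    return a.T[indices].sum(axis=1)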
Example #18
    def forward_gpu(self, x):
        if cudnn.enabled and self.use_cudnn:
            return super(AveragePooling2D, self).forward_gpu(x)

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
        coeff = 1. / (self.kh * self.kw)

        cuda.elementwise(
            '''
               float* out, const float* in, int h, int w, int out_h, int out_w,
               int kh, int kw, int sy, int sx, int ph, int pw, float coeff
            ''', '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float val = 0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   val += in[x + offset_y];
                 }
               }
               out[i] = val * coeff;
            ''', 'avg_pool_fwd')(y, x[0], h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff)
        return y,
Example #19
    def backward_gpu(self, x, gy):
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gx = cuda.empty_like(x[0])
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxBackward(handle, _algorithm,
                                          _mode, 1, desc.value,
                                          cudnn.get_ptr(self.y), desc.value,
                                          cudnn.get_ptr(gy[0]), 0, desc.value,
                                          cudnn.get_ptr(gx))
        else:
            gx = self.y * gy[0]
            c = gx.shape[1]
            sum_ydy = cuda.empty((gx.shape[0], ), dtype=numpy.float32)
            cuda.elementwise(
                'float* sum_ydy, const float* ydy, int c', '''
                   const float* row = ydy + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   sum_ydy[i] = sum;
                ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
            cuda.elementwise(
                'float* gx, const float* y, const float* sum_ydy, int c',
                'gx[i] -= y[i] * sum_ydy[i / c]',
                'softmax_bwd_diff')(gx, self.y, sum_ydy, c)

        return gx,
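The non-cuDNN branch is the standard softmax Jacobian-vector product; a NumPy sketch of the same computation:

import numpy as np

def softmax_backward_cpu(y, gy):
    # y: softmax output, gy: upstream gradient, both (N, c)
    gx = y * gy
    gx -= y * gx.sum(axis=1, keepdims=True)
    return gx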
Example #20
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)

    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        '''
           float* col, const float* img, int h, int w,
           int out_h, int out_w, int kh, int kw, int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col[i] = img[in_x + w * (in_y + h * c0)];
           } else {
             col[i] = 0;
           }
        ''', 'im2col')(col, img, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return col
Example #21
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape

    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T img',
        '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           T val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               int k = out_y + out_h * (kx + kw * (ky + kh * c0));
               val += col[out_x + out_w * k];
             }
           }
           img = val;
        ''',
        'col2im')(col.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, img)
    return img
Example #22
    def forward_gpu(self, x):
        xshape = x[0].shape
        self.cdimx = xshape[self.axis]
        self.rdim = numpy.prod(xshape[self.axis + 1:])

        indices = self.indices_or_sections
        if isinstance(indices, collections.Iterable):
            indices = list(indices)
            indices.append(xshape[self.axis])
        else:
            if xshape[self.axis] % indices:
                raise ValueError(
                    'array split does not result in an equal division')
            indices = six.moves.range(
                indices, xshape[self.axis] + indices, indices)
        ys = []
        kernel = cuda.elementwise(
            _args, 'COPY(y[i] = x[idx])', 'split_fwd', preamble=_preamble)
        bi = 0
        for i in indices:
            i = min(i, xshape[self.axis])
            cdimy = max(0, i - bi)
            s = list(xshape)
            s[self.axis] = cdimy
            y = cuda.empty(s, dtype=x[0].dtype)
            if cdimy != 0:
                kernel(y, x[0], cdimy, self.cdimx, self.rdim, bi)
            bi = i
            ys.append(y)
        return tuple(ys)
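On the CPU this loop corresponds to numpy.split along self.axis; a short usage sketch of the assumed semantics:

import numpy as np

x = np.arange(24, dtype=np.float32).reshape(4, 6)
equal_parts = np.split(x, 3, axis=1)       # three (4, 2) blocks
at_indices = np.split(x, [1, 4], axis=1)   # shapes (4, 1), (4, 3), (4, 2)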
Example #23
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape

    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        '''
           float* img, const float* col, int h, int w,
           int out_h, int out_w, int kh, int kw, int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               val += col[out_x + out_w * (out_y + out_h * (kx + kw * (ky + kh * c0)))];
             }
           }
           img[i] = val;
        ''',
        'col2im')(img, col, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return img
Example #24
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)

    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        '''
           float* col, const float* img, int h, int w,
           int out_h, int out_w, int kh, int kw, int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col[i] = img[in_x + w * (in_y + h * c0)];
           } else {
             col[i] = 0;
           }
        ''',
        'im2col')(col, img, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return col
Example #25
    def forward_gpu(self, x):
        if cudnn.enabled and self.use_cudnn:
            return super(AveragePooling2D, self).forward_gpu(x)

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
        coeff = 1. / (self.kh * self.kw)

        cuda.elementwise(
            '''
               float* out, const float* in, int h, int w, int out_h, int out_w,
               int kh, int kw, int sy, int sx, int ph, int pw, float coeff
            ''', '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float val = 0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   val += in[x + offset_y];
                 }
               }
               out[i] = val * coeff;
            ''', 'avg_pool_fwd')(y, x[0], h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff)
        return y,
Example #26
    def forward_gpu(self, x):
        xshape = x[0].shape
        self.cdimx = xshape[self.axis]
        self.rdim = numpy.prod(xshape[self.axis + 1:])

        if isinstance(self.indices_or_sections, collections.Iterable):
            ind = list(self.indices_or_sections)
            ind.append(self.cdimx)
        else:
            sec = self.indices_or_sections
            if self.cdimx % sec:
                raise ValueError(
                    'array split does not result in an equal division')
            ind = numpy.arange(1, sec + 1) * (self.cdimx // sec)
        ys = []
        kernel = cuda.elementwise(
            _args, 'COPY(y[i] = x[idx])', 'split_fwd', preamble=_preamble)
        prev_i = 0
        for i in ind:
            cdimy = max(0, min(i, self.cdimx) - prev_i)
            s = list(xshape)
            s[self.axis] = cdimy
            y = cuda.empty(s, dtype=x[0].dtype)
            if cdimy == 0:
                raise ValueError('Not support if shape contains 0')
            kernel(y, x[0], cdimy, self.cdimx, self.rdim, prev_i)
            prev_i = i
            ys.append(y)
        return tuple(ys)
Example #27
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)

    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T col',
        '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col = img[in_x + w * (in_y + h * c0)];
           } else {
             col = 0;
           }
        ''',
        'im2col')(img.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, col)
    return col
Example #28
    def backward_gpu(self, x, gy):
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gx = cuda.empty_like(x[0])
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxBackward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(
                    self.y),
                desc.value, cudnn.get_ptr(gy[0]), 0, desc.value,
                cudnn.get_ptr(gx))
        else:
            gx = self.y * gy[0]
            c = gx.shape[1]
            sum_ydy = cuda.empty((gx.shape[0],), dtype=numpy.float32)
            cuda.elementwise(
                'float* sum_ydy, const float* ydy, int c',
                '''
                   const float* row = ydy + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   sum_ydy[i] = sum;
                ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
            cuda.elementwise(
                'float* gx, const float* y, const float* sum_ydy, int c',
                'gx[i] -= y[i] * sum_ydy[i / c]',
                'softmax_bwd_diff')(gx, self.y, sum_ydy, c)

        return gx,
Example #29
    def forward_gpu(self, x):
        xshape = x[0].shape
        self.cdimx = xshape[self.axis]
        self.rdim = numpy.prod(xshape[self.axis + 1:], dtype=int)

        if isinstance(self.indices_or_sections, collections.Iterable):
            ind = list(self.indices_or_sections)
            ind.append(self.cdimx)
        else:
            sec = self.indices_or_sections
            if self.cdimx % sec:
                raise ValueError(
                    'array split does not result in an equal division')
            ind = numpy.arange(1, sec + 1) * (self.cdimx // sec)
        ys = []
        kernel = cuda.elementwise(_args,
                                  'COPY(y[i] = x[idx])',
                                  'split_fwd',
                                  preamble=_preamble)
        prev_i = 0
        for i in ind:
            cdimy = max(0, min(i, self.cdimx) - prev_i)
            s = list(xshape)
            s[self.axis] = cdimy
            y = cuda.empty(tuple(s), dtype=x[0].dtype)
            if cdimy == 0:
                raise ValueError('Not support if shape contains 0')
            kernel(y, x[0], cdimy, self.cdimx, self.rdim, prev_i)
            prev_i = i
            ys.append(y)
        return tuple(ys)
Example #30
 def forward_gpu(self, x):
     a, b = x
     shape = self._output_shape(a, b)
     ret = cuda.empty(shape)
     _batch_matmul_gpu(a, b,
                       transa=self.transa, transb=self.transb, out=ret)
     return ret,
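A NumPy sketch of the assumed _batch_matmul_gpu semantics (a per-sample matrix product with optional transposes):

import numpy as np

def batch_matmul_cpu(a, b, transa=False, transb=False):
    # a: (batch, m, k), b: (batch, k, n) stacks of matrices
    if transa:
        a = a.transpose(0, 2, 1)
    if transb:
        b = b.transpose(0, 2, 1)
    return np.matmul(a, b)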
Example #31
 def forward_gpu(self, x):
     y = cuda.empty((x[0].size, self.W.shape[1]), dtype=numpy.float32)
     cuda.elementwise(
         'float* y, const float* W, const int* x, int n_out',
         'y[i] = W[x[i / n_out] * n_out + i % n_out]',
         'embed_id_fwd')(y, self.W, x[0], self.W.shape[1])
     return y,
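The kernel is a row gather from W; in NumPy it is plain fancy indexing (a sketch of the assumed behaviour):

import numpy as np

W = np.arange(12, dtype=np.float32).reshape(4, 3)   # (vocab_size, n_out)
ids = np.array([2, 0, 3], dtype=np.int32)           # token ids
y = W[ids]                                           # (3, 3): one embedding row per id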
Example #32
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape

    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw', 'T img',
        '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           T val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               int k = out_y + out_h * (kx + kw * (ky + kh * c0));
               val += col[out_x + out_w * k];
             }
           }
           img = val;
        ''', 'col2im')(col.reduced_view(), h, w, out_h, out_w, kh, kw, sy, sx,
                       ph, pw, img)
    return img
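col2im is the adjoint of im2col: every patch entry is added back into the image. A NumPy sketch of the same operation:

import numpy as np

def col2im_cpu(col, sy, sx, ph, pw, h, w):
    # col: (n, c, kh, kw, out_h, out_w) -> img: (n, c, h, w)
    n, c, kh, kw, out_h, out_w = col.shape
    img = np.zeros((n, c, h + 2 * ph + sy - 1, w + 2 * pw + sx - 1),
                   dtype=col.dtype)
    for ky in range(kh):
        for kx in range(kw):
            # accumulate each kernel tap onto its strided positions
            img[:, :, ky:ky + sy * out_h:sy,
                kx:kx + sx * out_w:sx] += col[:, :, ky, kx, :, :]
    return img[:, :, ph:ph + h, pw:pw + w]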
Example #33
    def forward_gpu(self, inputs):
        x, t = inputs
        max_length = cuda.reduce(
            'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
            'max(a, b)', 'out = a', '0',
            'binary_hierarchical_softmax_max_length')(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.empty((length,), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cuda.empty((length,), dtype=numpy.float32)
        cuda.elementwise(
            '''raw T x, raw T w, raw int32 ts, raw int32 paths,
            raw T codes, raw int32 begins, int32 c, int32 max_length''',
            'T ls, T wxy',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              T wx = 0;
              for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                wx += w[w_ind] * x[x_ind];
              }
              wxy = wx * codes[p];
              ls = log(1 + exp(-wxy));
            } else {
              ls = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(x, self.W, t, self.paths, self.codes, self.begins, n_in, max_length,
          ls, wxy)
        self.max_length = max_length
        self.wxy = wxy
        return ls.sum(),
Example #34
    def forward_gpu(self, x):
        n, out_c, out_h, out_w = x[0].shape
        c = self.W.shape[1]
        h = get_deconv_outsize(out_h, self.kh, self.sy, self.ph)
        w = get_deconv_outsize(out_w, self.kw, self.sx, self.pw)
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            x_desc = cudnn.get_tensor_desc(x[0], out_h, out_w)
            y = cuda.empty((n, c, h, w), dtype=numpy.float32)
            y_desc = cudnn.get_tensor_desc(y, h, w)

            self.filter_desc = cudnn.get_filter4d_desc(self.W)
            self.conv_desc = cudnn.get_conv2d_desc(
                (self.ph, self.pw), (self.sy, self.sx))
            if self.b is not None:
                self.bias_desc = cudnn.get_conv_bias_desc(self.b)

            libcudnn.cudnnConvolutionBackwardData(
                handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
                x_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
                0, y_desc.value, cudnn.get_ptr(y))
            if self.b is not None:
                libcudnn.cudnnAddTensor(
                    handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                    1, self.bias_desc.value, cudnn.get_ptr(self.b),
                    1, y_desc.value, cudnn.get_ptr(y))
        else:
            handle = cuda.get_cublas_handle()
            # TODO(beam2d): Use streams
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            x_mats = x[0].reshape(n, out_c, out_h * out_w)
            gcol = cuda.empty((n, c, self.kh, self.kw, out_h, out_w), dtype=numpy.float32)
            gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, x_mats[i], transa='T', handle=handle,
                                  out=gcol_mats[i])
            y = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w)
            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                cuda.elementwise(
                    'float* y, const float* b, int c, int hw',
                    'y[i] += b[i / hw % c]',
                    'conv_bias_fwd')(y, self.b, c, h * w)
        return y,
Example #35
    def forward_gpu(self, inputs):
        x, t = inputs
        max_length = cuda.reduce(
            'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
            'max(a, b)', 'out = a', '0',
            'binary_hierarchical_softmax_max_length')(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.empty((length, ), dtype=numpy.float32)
        n_in = x.shape[1]
        wxy = cuda.empty((length, ), dtype=numpy.float32)
        cuda.elementwise(
            '''raw T x, raw T w, raw int32 ts, raw int32 paths,
            raw T codes, raw int32 begins, int32 c, int32 max_length''',
            'T ls, T wxy', '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              T wx = 0;
              for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                wx += w[w_ind] * x[x_ind];
              }
              wxy = wx * codes[p];
              ls = log(1 + exp(-wxy));
            } else {
              ls = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward')(x, self.W, t, self.paths,
                                                   self.codes, self.begins,
                                                   n_in, max_length, ls, wxy)
        self.max_length = max_length
        self.wxy = wxy
        return ls.sum(),
Example #36
    def forward_gpu(self, x):
        i_len, j_len = array.as_mat(x[0]).shape
        k_len = array.as_mat(x[1]).shape[1]
        l_len = self.W.shape[2]

        # When indices are enclosed with [], they are flattened
        # (i.e. linearized as a 1-D array)
        # ij->[ij]
        e1 = array.as_vec(x[0])
        # ik->[ik]
        e2 = array.as_vec(x[1])
        e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
        # '[ij],[ik]->[ijk]'
        cuda.elementwise(
            'float* y, float* e1, float* e2, int e1c, int e2c', '''
            int I = i / e1c / e2c;
            int J = (i - I * e1c * e2c) / e2c;
            int K = i % e2c;
            y[i] = e1[I * e1c + J] * e2[I * e2c + K];
            ''', 'row_wise_outer_product')(e1e2, e1, e2, j_len, k_len)

        # [ijk]->i[jk]
        e1e2 = e1e2.reshape(i_len, j_len * k_len)

        # jkl->[jk]l
        W_mat = self.W.reshape(self.W.shape[0] * self.W.shape[1],
                               self.W.shape[2])

        y = cuda.empty((i_len, l_len), dtype=numpy.float32)
        with cuda.using_cumisc():
            # 'i[jk],[jk]l->il'
            cuda.culinalg.dot(e1e2, W_mat, out=y)

        if not self.nobias:
            e1 = array.as_mat(x[0])
            e2 = array.as_mat(x[1])
            with cuda.using_cumisc():
                # ij,jl->il
                cuda.culinalg.add_dot(e1, self.V1, y)
                # ik,kl->il
                cuda.culinalg.add_dot(e2, self.V2, y)
            cuda.elementwise('float* y, float* b, int n_channel',
                             'y[i] += b[i % n_channel]',
                             'linear_bias')(y, self.b, self.b.size)
        return y,
Example #37
 def backward_gpu(self, inputs, grads):
     x, = inputs
     g, = grads
     gx = cuda.empty(x.shape, numpy.float32)
     cuda.elementwise(
         'float* gx, const float* x, const float* g, float beta',
         'gx[i] = (1.f - 1.f / (1.f + __expf(beta * x[i]))) * g[i];',
         'softplus_backward')(gx, x, g, self.beta)
     return gx,
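A NumPy sketch of the same gradient: the derivative of softplus(x) = log(1 + exp(beta*x)) / beta is sigmoid(beta*x), written below in the same form as the kernel:

import numpy as np

def softplus_backward_cpu(x, g, beta=1.0):
    # d/dx softplus(x) = sigmoid(beta * x); multiply by the upstream gradient g
    return (1.0 - 1.0 / (1.0 + np.exp(beta * x))) * g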
Example #38
 def sample_gpu(self, shape):
     ps = cuda.empty(shape, numpy.float32)
     cuda.get_generator().fill_uniform(ps)
     vs = cuda.empty(shape, numpy.int32)
     cuda.elementwise(
         '''int* vs, const float* ps, const float* threshold,
         const int* values, int b''', '''
         float pb = ps[i] * b;
         int index = __float2int_rd(pb);
         // fill_uniform sometimes returns 1.0, so we need to check index
         if (index >= b) {
           index = 0;
         }
         int lr = threshold[index] < pb - index;
         vs[i] = values[index * 2 + lr];
         ''', 'walker_alias_sample')(vs, ps, self.threshold, self.values,
                                     len(self.threshold))
     return vs
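A NumPy sketch of the same Walker alias sampling step, assuming a precomputed (b,) threshold table and a (b, 2) values table laid out as in the kernel:

import numpy as np

def walker_alias_sample_cpu(threshold, values, shape, rng=np.random):
    b = len(threshold)
    p = rng.uniform(size=shape) * b
    index = p.astype(np.int32)
    index[index >= b] = 0          # uniform() can return exactly 1.0
    left_or_right = (threshold[index] < p - index).astype(np.int32)
    return values[index, left_or_right]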
Example #39
 def forward_gpu(self, inputs):
     x, = inputs
     y = cuda.empty(x.shape)
     cuda.elementwise(
         'float* y, const float* x, float beta, float beta_inv', '''
         float bx = beta * x[i];
         y[i] = (max(bx, 0.f) + log1pf(__expf(-fabsf(bx)))) * beta_inv;
         ''', 'softplus')(y, x, self.beta, self.beta_inv)
     return y,
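A NumPy sketch of the numerically stable softplus the kernel evaluates, y = log(1 + exp(beta*x)) / beta, via the max/log1p rearrangement:

import numpy as np

def softplus_cpu(x, beta=1.0):
    bx = beta * x
    # max(bx, 0) + log1p(exp(-|bx|)) equals log(1 + exp(bx)) without overflow
    return (np.maximum(bx, 0) + np.log1p(np.exp(-np.abs(bx)))) / beta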
Example #40
 def forward_gpu(self, x):
     x = _as_mat(x[0])
     y = cuda.empty((x.shape[0], self.W.shape[0]), dtype=x.dtype)
     with cuda.using_cumisc():
         cuda.culinalg.dot(x, self.W, transb='T', out=y)
     if self.b is not None:
         cuda.elementwise('float* y, float* b, int n_channel',
                          'y[i] += b[i % n_channel]',
                          'linear_bias')(y, self.b, self.b.size)
     return y,
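The cuBLAS call plus the bias kernel amount to an affine map; a NumPy sketch of the assumed semantics:

import numpy as np

def linear_forward_cpu(x, W, b=None):
    # x: (N, in_size), W: (out_size, in_size), b: (out_size,)
    y = x.dot(W.T)
    if b is not None:
        y += b        # broadcast over the batch dimension
    return y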
Example #41
 def backward_gpu(self, inputs, grads):
     x, = inputs
     g, = grads
     gx = cuda.empty(x.shape, numpy.float32)
     cuda.elementwise(
         'float* gx, const float* x, const float* g, float beta',
         'gx[i] = (1.f - 1.f / (1.f + __expf(beta * x[i]))) * g[i];',
         'softplus_backward'
     )(gx, x, g, self.beta)
     return gx,
Example #42
    def backward_gpu(self, x, gy):
        # x is a dummy variable, required only for compatibility with
        # pooling_2d.Pooling2D
        n, c, h, w = gy[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph,
                                    self.cover_all)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw,
                                    self.cover_all)
        y = cuda.empty((n, c, y_h, y_w), dtype=gy[0].dtype)
        gx = cuda.empty((n, c, y_h, y_w), dtype=x[0].dtype)
        cuda.elementwise(
            'raw T in, int32 h, int32 w, int32 out_h, int32 out_w,'
            'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
            'T out', '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
               int argmax_y = in_y_0;
               int argmax_x = in_x_0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   float v = in[x + offset_y];
                   if (maxval < v) {
                     maxval   = v;
                     argmax_y = y;
                     argmax_x = x;
                   }
                 }
               }
               out = maxval;

               int argmax_ky = argmax_y + ph - out_y * sy;
               int argmax_kx = argmax_x + pw - out_x * sx;
            ''', 'max_pool_fwd')(gy[0].reduced_view(), h, w, y_h, y_w, self.kh,
                                 self.kw, self.sy, self.sx, self.ph, self.pw,
                                 gx)
        return gx,
Example #43
 def forward_gpu(self, x):
     x = _as_mat(x[0])
     y = cuda.empty((x.shape[0], self.W.shape[0]), dtype=x.dtype)
     with cuda.using_cumisc():
         cuda.culinalg.dot(x, self.W, transb='T', out=y)
     if self.b is not None:
         cuda.elementwise(
             'float* y, float* b, int n_channel',
             'y[i] += b[i % n_channel]',
             'linear_bias')(y, self.b, self.b.size)
     return y,
Example #44
 def backward_gpu(self, inputs, grad_outputs):
     y, z = inputs
     gcost, = grad_outputs
     N = y.shape[0]
     gy = cuda.empty(y.shape)
     gz = cuda.empty(z.shape)
     cuda.culinalg.add_dot(self.z_centered,
                           self.covariance,
                           gy,
                           transb='T',
                           alpha=1. / N,
                           beta=0.)
     cuda.culinalg.add_dot(self.y_centered,
                           self.covariance,
                           gz,
                           alpha=1. / N,
                           beta=0.)
     gy = cuda.cumisc.multiply(gy, gcost)
     gz = cuda.cumisc.multiply(gz, gcost)
     return gy, gz
Example #45
    def forward_gpu(self, inputs):
        y, z = inputs

        # Center inputs
        y_mean = cuda.empty((1, y.shape[1]))
        z_mean = cuda.empty((1, z.shape[1]))
        cuda.cumisc.mean(y, axis=0, out=y_mean, keepdims=True)
        cuda.cumisc.mean(z, axis=0, out=z_mean, keepdims=True)
        self.y_centered = cuda.cumisc.subtract(y, y_mean)
        self.z_centered = cuda.cumisc.subtract(z, z_mean)

        # Calculate cross-covariance
        self.covariance = cuda.empty((y.shape[1], z.shape[1]))
        cuda.culinalg.add_dot(
            self.y_centered, self.z_centered, self.covariance, transa="T", alpha=1.0 / y.shape[0], beta=0.0
        )

        # Calculate cost
        cost = cuda.cumisc.sum(0.5 * self.covariance ** 2)
        return (cost,)
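A NumPy sketch of the cross-covariance penalty this forward pass is assumed to compute:

import numpy as np

def cross_covariance_cost_cpu(y, z):
    # y: (N, dy), z: (N, dz); returns 0.5 * squared Frobenius norm of Cov(y, z)
    y_centered = y - y.mean(axis=0, keepdims=True)
    z_centered = z - z.mean(axis=0, keepdims=True)
    covariance = y_centered.T.dot(z_centered) / y.shape[0]
    return 0.5 * (covariance ** 2).sum()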
Example #46
 def sample_gpu(self, shape):
     ps = cuda.empty(shape, numpy.float32)
     cuda.get_generator().fill_uniform(ps)
     vs = cuda.empty(shape, numpy.int32)
     cuda.elementwise(
         '''int* vs, const float* ps, const float* threshold,
         const int* values, int b''',
         '''
         float pb = ps[i] * b;
         int index = __float2int_rd(pb);
         // fill_uniform sometimes returns 1.0, so we need to check index
         if (index >= b) {
           index = 0;
         }
         int lr = threshold[index] < pb - index;
         vs[i] = values[index * 2 + lr];
         ''',
         'walker_alias_sample'
     )(vs, ps, self.threshold, self.values, len(self.threshold))
     return vs
Example #47
 def forward_gpu(self, inputs):
     x, = inputs
     y = cuda.empty(x.shape)
     cuda.elementwise(
         'float* y, const float* x, float beta, float beta_inv',
         '''
         float bx = beta * x[i];
         y[i] = (max(bx, 0.f) + log1pf(__expf(-fabsf(bx)))) * beta_inv;
         ''',
         'softplus'
     )(y, x, self.beta, self.beta_inv)
     return y,
Example #48
def _partial_reduce(x):
    global _one
    out_axis, sum_axis = x.shape
    one = _one
    if one is None or one.size < sum_axis:
        one = cuda.ones(sum_axis)
        _one = one
    one = one[:sum_axis]
    handle = cuda.get_cublas_handle()
    ret = cuda.empty(out_axis)
    cuda.cublas.cublasSgemv(handle, 't', sum_axis, out_axis,
                            numpy.float32(1.0), x.gpudata, sum_axis, one.gpudata,
                            1, numpy.float32(0.0), ret.gpudata, 1)
    return ret
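
The SGEMV against a vector of ones reduces each row of x, so on the CPU this whole helper amounts to a row sum (sketch only; the name is mine):

import numpy

def partial_reduce_cpu(x):
    # x has shape (out_axis, sum_axis); reduce over the second axis
    return x.sum(axis=1)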
Example No. 49
0
    def forward_gpu(self, inputs):
        mean, ln_var = inputs
        if self.eps is None:
            self.eps = cuda.empty(ln_var.shape, numpy.float32)
            cuda.get_generator().fill_normal(self.eps)

        noise = cuda.empty_like(ln_var)
        cuda.elementwise(
            'float* noise, const float* v, const float* e',
            'noise[i] = __expf(v[i] * 0.5f) * e[i];',
            'gaussian_forward'
        )(noise, ln_var, self.eps)
        self.noise = noise
        return mean + self.noise,
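
This is the reparameterization trick: a sample from N(mean, exp(ln_var)) is built from standard normal noise. A NumPy sketch (gaussian_forward_cpu is an illustrative name):

import numpy

def gaussian_forward_cpu(mean, ln_var, eps=None):
    if eps is None:
        eps = numpy.random.standard_normal(ln_var.shape).astype(numpy.float32)
    noise = numpy.exp(ln_var * 0.5) * eps   # std = exp(ln_var / 2)
    return mean + noise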
Example No. 52
0
    def forward_gpu(self, inputs):
        y, z = inputs

        # Center inputs
        y_mean = cuda.empty((1, y.shape[1]))
        z_mean = cuda.empty((1, z.shape[1]))
        cuda.cumisc.mean(y, axis=0, out=y_mean, keepdims=True)
        cuda.cumisc.mean(z, axis=0, out=z_mean, keepdims=True)
        self.y_centered = cuda.cumisc.subtract(y, y_mean)
        self.z_centered = cuda.cumisc.subtract(z, z_mean)

        # Calculate cross-covariance
        self.covariance = cuda.empty((y.shape[1], z.shape[1]))
        cuda.culinalg.add_dot(self.y_centered,
                              self.z_centered,
                              self.covariance,
                              transa='T',
                              alpha=1. / y.shape[0],
                              beta=0.)

        # Calculate cost
        cost = cuda.cumisc.sum(0.5 * self.covariance**2)
        return cost,
Example No. 53
0
def getByIndex_LogAndClip(probs, t, out=None):
    """
    This kernel takes an element in each row of probs at indices t, 
            and clips the output from 1e-8 to 1 and takes the log
    """
    N, M = probs.shape
    bdim, gdim = Get_bdim_and_gdim1D(N)
    if out is None:
        out = cuda.empty((N,1),dtype=np.float32)
             
    IndexAndClipAndLog_kernel.prepared_call(gdim, bdim,
                                probs.gpudata, t.gpudata, out.gpudata,
                                np.int32(N), np.int32(M))
    return out
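
A NumPy sketch of the same row-wise gather, clip and log (the name get_by_index_log_and_clip_cpu is mine):

import numpy

def get_by_index_log_and_clip_cpu(probs, t):
    picked = probs[numpy.arange(probs.shape[0]), t]   # probs[i, t[i]] per row
    return numpy.log(numpy.clip(picked, 1e-8, 1.0))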
Example No. 54
0
    def forward_gpu(self, x):
        y = cuda.empty_like(x[0])
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxForward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
                0, desc.value, cudnn.get_ptr(y))
            self.y = y
        else:
            maxes = cuda.empty((x[0].shape[0],), dtype=numpy.float32)
            c = x[0].shape[1]
            cuda.elementwise(
                'float* maxes, const float* x, int c',
                '''
                   const float* row = x + i * c;
                   float maxval = row[0];
                   for (int j = 1; j < c; ++j) {
                     if (maxval < row[j]) {
                       maxval = row[j];
                     }
                   }
                   maxes[i] = maxval;
                ''', 'softmax_rowmax')(maxes, x[0], c)
            cuda.elementwise(
                'float* y, const float* x, const float* maxes, int c',
                'y[i] = __expf(x[i] - maxes[i / c])',
                'softmax_exp')(y, x[0], maxes, c)
            coeff = maxes  # reuse memory
            cuda.elementwise(
                'float* coeff, const float* y, int c',
                '''
                   const float* row = y + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   coeff[i] = 1 / sum;
                ''', 'softmax_invrowsum')(coeff, y, c)
            cuda.elementwise(
                'float* y, const float* coeff, int c', 'y[i] *= coeff[i / c]',
                'softmax_rowmul')(y, coeff, c)
            self.y = y

        return y,
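
The non-cuDNN branch is the usual numerically stable softmax: subtract the row maximum, exponentiate, and normalize by the row sum. A compact NumPy sketch of the same computation (softmax_cpu is an illustrative name):

import numpy

def softmax_cpu(x):
    e = numpy.exp(x - x.max(axis=1, keepdims=True))  # shift for stability
    return e / e.sum(axis=1, keepdims=True)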
Example No. 55
0
    def forward_gpu(self, x):
        # Implementation using cudnn
        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph, self.cover_all)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw, self.cover_all)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)

        handle = cudnn.get_default_handle()
        pool_desc = self.create_pool_desc()
        x_desc = cudnn.get_tensor_desc(x[0], x[0].shape[2], x[0].shape[3])
        y_desc = cudnn.get_tensor_desc(y, y_h, y_w)

        libcudnn.cudnnPoolingForward(
            handle, pool_desc.value, 1, x_desc.value, cudnn.get_ptr(x[0]),
            0, y_desc.value, cudnn.get_ptr(y))
        self.y = y

        return y,
Example No. 56
0
 def forward_gpu(self, inputs):
     x, t = inputs
     fragments = cuda.empty((x.shape[0], ), dtype=numpy.int8)
     cuda.elementwise(
         'char* fragments, const float* x, const int* t, int c', '''
            x += i * c;
            float maxval = x[0];
            int   argmax = 0;
            for (int j = 1; j < c; ++j) {
              if (maxval < x[j]) {
                maxval = x[j];
                argmax = j;
              }
            }
            fragments[i] = argmax == t[i];
         ''', 'accuracy_fwd_map')(fragments, x, t, x.shape[1])
     y = cuda.gpuarray.sum(fragments, dtype=numpy.float32)
     y /= x.shape[0]
     return y,
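
The kernel marks each row whose argmax equals its target and then averages the marks. A one-line NumPy sketch of the same accuracy computation (accuracy_cpu is an illustrative name):

import numpy

def accuracy_cpu(x, t):
    return (x.argmax(axis=1) == t).mean(dtype=numpy.float32)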
Example No. 57
0
    def forward_gpu(self, xs):
        # TODO(beam2d): Unify the process into a single kernel.
        shape = list(xs[0].shape)
        for x in xs[1:]:
            shape[self.axis] += x.shape[self.axis]
        self.shape = shape

        y = cuda.empty(shape, dtype=xs[0].dtype)
        self.cdimy = y.shape[self.axis]
        self.rdim  = numpy.prod(shape[self.axis + 1:])

        coffset = 0
        kernel  = cuda.elementwise(
            _args, 'COPY(y[idx] = x[i])', 'concat_fwd', preamble=_preamble)
        for x in xs:
            cdimx = x.shape[self.axis]
            kernel(x, y, cdimx, self.cdimy, self.rdim, coffset)
            coffset += cdimx

        return y,
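
The kernel copies each input into its slice of the output along self.axis; on the CPU the same result is just a concatenation (sketch only):

import numpy

def concat_forward_cpu(xs, axis=1):
    return numpy.concatenate(xs, axis=axis)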
Example No. 58
0
    def forward_gpu(self, x):
        # Implementation using cudnn
        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(
            h, self.kh, self.sy, self.ph, self.cover_all)
        y_w = conv.get_conv_outsize(
            w, self.kw, self.sx, self.pw, self.cover_all)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)

        handle = cudnn.get_handle()
        pool_desc = self.create_pool_desc()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        y_desc = cudnn.create_tensor_descriptor(y)

        libcudnn.poolingForward(
            handle, pool_desc.value, ctypes.c_float(1), x_desc.value,
            x[0].data.ptr, ctypes.c_float(0), y_desc.value, y.data.ptr)
        self.y = y

        return y,
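
For reference, a naive NumPy sketch of the max pooling these cuDNN calls perform, restricted to the unpadded case (max_pooling_2d_cpu and the no-padding assumption are mine):

import numpy

def max_pooling_2d_cpu(x, kh, kw, sy, sx):
    # x is an NCHW array; no padding, cover_all=False
    n, c, h, w = x.shape
    y_h = (h - kh) // sy + 1
    y_w = (w - kw) // sx + 1
    y = numpy.empty((n, c, y_h, y_w), dtype=x.dtype)
    for i in range(y_h):
        for j in range(y_w):
            patch = x[:, :, i * sy:i * sy + kh, j * sx:j * sx + kw]
            y[:, :, i, j] = patch.max(axis=(2, 3))
    return y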