Example #1
def softmax(x, deriv=False):
    if deriv:
        return x * (1.0 - x)
    else:
        np_t = np.array([[0.0]])
        # skcuda.misc.max(x).get(np_t)
        # x = x - np_t.ravel()[0]
        gpu.sum(cm.exp(x)).get(np_t)

        return cm.exp(x) / np_t.ravel()[0]
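
For reference, the commented-out max subtraction above is what usually makes a GPU softmax numerically stable, since exp() overflows quickly in float32. A minimal, self-contained sketch of that pattern with plain PyCUDA (an illustration only, assuming a recent PyCUDA that provides gpuarray.max and gpuarray.sum):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

def softmax_stable(x):
    """Numerically stable softmax of a 1-D GPUArray (illustrative sketch only)."""
    m = float(gpuarray.max(x).get())        # shift by the max so exp() cannot overflow
    e = cumath.exp(x - m)
    return e / float(gpuarray.sum(e).get())

probs = softmax_stable(gpuarray.to_gpu(np.array([1.0, 2.0, 3.0], dtype=np.float32)))
print(probs.get())   # roughly [0.09, 0.24, 0.67]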
Example #2
 def compute_one(self, amp, r, buffer):
     cumath.exp(self.iqx * r.x() + self.iqxsq * r.z(), out=self._ex)
     cumath.exp(self.iqy * r.y() + self.iqysq * r.z(), out=self._ey)
     self._ex *= amp
     self.outer(self._ex,
                self._ey,
                buffer,
                np.int32(self.w),
                np.int32(self.h),
                block=self.block,
                grid=self.grid)
Example #3
        def thunk():
            alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
            x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
            x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
            Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
            Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
            Xtn = misc.sum(Xt, axis=1, keepdims=True)
            Xfn = misc.sum(Xf, axis=1, keepdims=True)
            Xt = misc.divide(Xt, Xtn)
            Xf = misc.divide(Xf, Xfn)
            w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
            dq = Xt - Xf
            qdw = dq / w
            t1 = misc.sum(x * qdw, axis=1)
            f = 2 * depth + self.base.n
            t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
            t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
            dalpha = t1 - t2 + t3
            del dq, t1, f, t2, t3

            iw = 1 / w
            S1 = misc.multiply(
                depth[:, None] * (self.base.n - 1) / self.base.n, iw)
            S2 = (self.base.n + depth[:, None]) / cumath.log(
                misc.sum(w, axis=1, keepdims=True))
            F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
            del w, iw, S1, S2

            cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                                  dtype=theano.config.floatX)
            dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
            dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
            for i in range(Xt.shape[0]):
                S1 = misc.multiply(Xt[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
                dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
                S1 = misc.multiply(Xf[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
                dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
            outputs[0][0] = dalpha.get()
            outputs[1][0] = dLq_t.get()
            outputs[2][0] = dLq_f.get()
            for v in node.outputs:
                compute_map[v][0] = True
Example #4
 def test_exp(self):
     """tests if the exp function works"""
     a = simplearray.array(100).fill_arange()/10        
     b = cumath.exp(a)
     
     for i in range(100):
         self.assert_(abs(math.exp(a[i]) - b[i]) < 1e-2)
Example #5
    def forward_gpu(self, x, temperature):
        """forward propagation in gpu mode"""

        # obtain z
        hx = np.concatenate((self.h, x))
        hx_gpu = gpu.to_gpu(hx.astype(np.float32))
        all_weights = np.concatenate((self.forget_w, self.sel_w, self.write_w, self.add_w))
        all_biases = np.concatenate((self.forget_b, self.sel_b, self.write_b, self.add_b))
        all_weights_gpu = gpu.to_gpu(all_weights.astype(np.float32))
        all_biases_gpu = gpu.to_gpu(all_biases.astype(np.float32))
        z = gpu.zeros((self.hidden_s * 4, 1), np.float32)
        self.kernel(all_weights_gpu, hx_gpu, z, grid=(self.num_block, 1, 1), block=(self.num_thread_per_block, 1, 1))
        z += all_biases_gpu

        # non-linearity
        z[:self.hidden_s * 3, :1] = 1.0 / (gpum.exp(-1 * z[:self.hidden_s * 3, :1]) + 1.0)
        z[self.hidden_s * 3:, :1] = 1.7159 * gpum.tanh(2.0 / 3.0 * z[self.hidden_s * 3:, :1])
        z_cpu = z.get()

        # update cell and hidden
        self.c = z_cpu[:self.hidden_s, :1] * self.c + \
                 z_cpu[self.hidden_s:self.hidden_s*2, :1] * z_cpu[self.hidden_s*3:, :1]
        self.h = z_cpu[self.hidden_s * 2: self.hidden_s * 3, :1] * Tanh(self.c)

        # output
        res = np.dot(self.weights, self.h) + self.biases
        return Softmax(res, temperature)
Example #6
def calculate_attenuation_gpu(projections_gpu, energy, p, pool):
    attenuation_gpu = gpuarray.zeros(projections_gpu[next(iter(projections_gpu))].shape, dtype=np.float32, allocator=pool.allocate)
    for mat in projections_gpu:
        # logger.debug(f'attenuating {mat}')
        attenuation_gpu = attenuation_gpu.mul_add(1.0, projections_gpu[mat], -get_absorbtion_coefs(energy, mat))
    attenuation_gpu = cumath.exp(attenuation_gpu) * energy * p
    return attenuation_gpu
Example #7
 def fprop(self, input, output):
   max = gpuarray.zeros((1, self.batchSize), dtype = np.float32)
   col_max_reduce(max, input)
   add_vec_to_cols(input, max, output, alpha = -1)
   gpu_copy_to(cumath.exp(output), output)
   sum = gpuarray.zeros(max.shape, dtype = np.float32)
   add_col_sum_to_vec(sum, output, alpha = 0)
   div_vec_to_cols(output, sum)
Example #8
def logsumexp(mat):
    max_dim = max_by_axis(mat, 1)
    tmp = add_vec_to_mat(mat, max_dim, 0, substract=True)

    tmp = cumath.exp(tmp)

    tmp = matrix_sum_out_axis(tmp, 1)
    tmp = cumath.log(tmp)
    max_dim += tmp
    return max_dim
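
For reference, this helper implements the standard log-sum-exp trick, logsumexp(x) = max(x) + log(sum(exp(x - max(x)))), which keeps the exponentials from overflowing. An illustrative NumPy-only sketch of the same identity (not taken from the project the snippet comes from):

import numpy as np

def logsumexp_ref(mat):
    """Row-wise log(sum(exp(mat))) computed stably by shifting with the row max."""
    m = mat.max(axis=1, keepdims=True)
    return m + np.log(np.exp(mat - m).sum(axis=1, keepdims=True))

mat = np.array([[1000.0, 1001.0], [0.0, 1.0]])
print(logsumexp_ref(mat))   # finite values; a naive log(sum(exp(mat))) would overflow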
Example #9
def logsumexp(mat):
    max_dim = max_by_axis(mat, 1)
    tmp = add_vec_to_mat(mat, max_dim, 0, substract=True)

    tmp = cumath.exp(tmp)
    
    tmp = matrix_sum_out_axis(tmp, 1)
    tmp = cumath.log(tmp)
    max_dim += tmp
    return max_dim
Example #10
def exp(d_a, mode=MathModes.ACC):
    if mode == MathModes.ACC:
        return cumath.exp(d_a)

    d_out = gpuarray.zeros_like(d_a)
    thread_size = min(d_a.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_a.size / float(thread_size))), 1)
    exp_fast_kernel(d_a, d_out, numpy.int32(d_a.size),
            block=(thread_size,1,1), grid=(block_size,1,1))
    return d_out
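
The exp_fast_kernel used in the non-accurate branch is defined elsewhere in that project and is not shown here. Purely as an illustration (a hypothetical stand-in, not the project's actual kernel), a reduced-accuracy exponential could be built with pycuda.elementwise.ElementwiseKernel and the CUDA __expf intrinsic; note that ElementwiseKernel picks its own launch configuration instead of the explicit block/grid sizing above:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

# Hypothetical fast (reduced-accuracy) elementwise exponential.
exp_fast = ElementwiseKernel(
    "float *x, float *out",
    "out[i] = __expf(x[i])",
    "exp_fast")

x = gpuarray.to_gpu(np.linspace(-4, 4, 16).astype(np.float32))
out = gpuarray.empty_like(x)
exp_fast(x, out)
print(out.get())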
Example #11
    def update_momentum(self, factor):
        """
	update_momentum - performs an update in momentum space

        Parameters
        ----------
 	factor - essentially the size of the time step dt

	Returns
        -------
        None.

	"""
        if not self.gpu:
            self.psi *= np.exp(-1j * factor * self.m * self.V)
        else:
            self.g_psi_hat[...] = -1.0j * factor * self.m * self.g_V
            cumath.exp(self.g_psi_hat, out=self.g_psi_hat)
            self.g_psi *= self.g_psi_hat
Example #12
 def thunk():
     alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
     x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
     x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
     Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
     Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
     Xtn = misc.sum(Xt, axis=1, keepdims=True)
     Xfn = misc.sum(Xf, axis=1, keepdims=True)
     Xt = misc.divide(Xt, Xtn)
     Xf = misc.divide(Xf, Xfn)
     w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
     wp = cumath.log(w)
     wpn = misc.sum(wp, axis=1, keepdims=True) / self.n
     wp = misc.subtract(wp, wpn)
     t1 = misc.sum(x * wp, axis=1)
     t2 = (self.n + depth) * cumath.log(misc.sum(w, axis=1))
     t3 = depth * wpn
     outputs[0][0] = misc.sum(t1 - t2 + t3).get()
     for v in node.outputs:
         compute_map[v][0] = True
Example #13
    def _fprop(self, X, X_space):
        A = scikits.cuda.linalg.dot(X, self.W)
        B, bias_space = self._b_space.broadcast(self.b, b=X_space.get_extent('b'))
        Y = cumath.exp(A + B)

        Z = scikits.cuda.linalg.dot(Y, self._sum_vector_classes)
        Z_space = bias_space.with_extents(w=1)
        Z, Z_space = Z_space.broadcast(Z, w=self.n_classes)

        Y /= Z

        return Y
Example #14
    def _fprop(self, X, X_space):
        A = scikits.cuda.linalg.dot(X, self.W)
        B, bias_space = self._b_space.broadcast(self.b,
                                                b=X_space.get_extent('b'))
        Y = cumath.exp(A + B)

        Z = scikits.cuda.linalg.dot(Y, self._sum_vector_classes)
        Z_space = bias_space.with_extents(w=1)
        Z, Z_space = Z_space.broadcast(Z, w=self.n_classes)

        Y /= Z

        return Y
Example #15
    def f(A_t, A_w, dz = delta_z):
        if f.delta_z != dz:
            f.w_exp = cumath.exp(-1j * dz/2. * w_op)
            f.t_exp = cumath.exp(-1j * dz * t_op)
            f.delta_z = dz
        
        ## Dispersion (I pass)
        f.A_t = A_t
        f.A_w = A_w
        
        #print A_w.get()[n_points/2],     
        prod(A_w, f.w_exp, A_w)
        #A_w = f.w_exp*A_w
        #print A_w.get()[n_points/2],
        ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g
        
        
        ## Constant potential term
        prod(f.A_t, f.t_exp, f.A_t)

        ## Nonlinear operator as intensity dependency
        if nlin != 0:
            f.A_t = f.A_t * cumath.exp(-1j * delta_z * nlin * f.A_t * f.A_t.conj())
        ## Additional nonlinear terms as a function t_nl_op(A(t),dt,z)
        if t_nl_op != None:
            f.A_t = f.A_t * cumath.exp(-1j * delta_z * t_nl_op(f.A_t, dt, z0+delta_z/2) )
        ## Apodization
        if apod:
            prod(f.A_t, apod_array, f.A_t)
            
        fft_g(f.A_t, f.A_w) ## Scale factor included in fft_g
        
        ## Dispersion (II pass)
        prod(f.A_w, f.w_exp, f.A_w)
        
        ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g
        
        
        return f.A_t, f.A_w
Example #16
    def shift_trev_freq(self):
        """
        GPU implementation of shift_trev_freq
        """
        t_rev = self.RFParams.t_rev[self.RFParams.counter[0]]
        dev_induced_voltage_f = bm.rfft(self.dev_mtw_memory, self.n_mtw_fft)
        dev_induced_voltage_f *= cm.exp(self.dev_omegaj_mtw * t_rev)

        self.dev_mtw_memory = get_gpuarray((self.n_mtw_memory, bm.precision.real_t, id(self), 'mtw_m'))
        dummy = bm.irfft(dev_induced_voltage_f, caller_id=id(self))
        gpu_copy_d2d(self.dev_mtw_memory, dummy,
                     range=range(0, self.n_mtw_memory))
        set_zero_real(self.dev_mtw_memory,
                      slice=slice(-int(self.buffer_size), None, None))
Example #17
    def shift_trev_freq(self):
        """
        Method to shift the induced voltage by a revolution period in the
        frequency domain
        """
        t_rev = self.RFParams.t_rev[self.RFParams.counter[0]]
        # Shift in frequency domain
        dev_induced_voltage_f = bm.rfft(self.dev_mtw_memory, self.n_mtw_fft)
        dev_induced_voltage_f *= cm.exp(self.dev_omegaj_mtw * t_rev)

        self.dev_mtw_memory = get_gpuarray(
            (self.n_mtw_memory, bm.precision.real_t, id(self), 'mtw_m'))
        dummy = bm.irfft(dev_induced_voltage_f, caller_id=id(self))
        gpu_copy_d2d(self.dev_mtw_memory,
                     dummy,
                     range=range(0, self.n_mtw_memory))
        set_zero_real(self.dev_mtw_memory,
                      slice=slice(-int(self.buffer_size), None, None))
Example #18
GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(max_tpb*8))

# Indexes are not contiguous

griddimx = int(nclmns / max_tpb) 
griddimy = int(nsamps) 
# One thread per sample-time
grd = (griddimx, griddimy, 1)
blk = (max_tpb,  1,        1)

maxes = np.array(all_l_rhots_gpu[:,0][1::nmodes].get()).astype(np.float64)
maxes_gpu = gpuarray.to_gpu(maxes)

GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8)
# ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 ***** 

''' 
 Marginalize over Time
'''

all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate 

GPU_nv_reduc(all_l_rhots_gpu) # sum over time 

lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu) # TIMES DELTA T FIXME




start = drv.Event()
end = drv.Event()

x = np.random.normal(size=N)

start.record()
dX = gpuarray.to_gpu(x)
end.record()
end.synchronize()
print "Transfer to GPU time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized exponentiation:"

start.record()
dexpX = cumath.exp(dX)
end.record()
end.synchronize()
print "GPU array calc time: %fs" % (start.time_till(end) * 1e-3)

start.record()
expX = np.exp(x)
end.record()
end.synchronize()
print "CPU calc time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized dot product/sum of squares:"

start.record()
gpuarray.dot(dX, dX)
end.record()
Example #20
def demosaick_gpu(img):
    img = gp.to_gpu(img)
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)

    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)
    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv 3
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)
    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    p2x = lg.dot(p2y, _wts.sout)

    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])

    im = p2im(p1.get())

    return im
x = np.random.normal(size = n)
x_short = np.random.normal(size = 8)

start.record()
dev_x = gpuarray.to_gpu(x)
dev_x_short = gpuarray.to_gpu(x_short)
end.record() 
end.synchronize()
print "Transfer to GPU time: %fs" %(start.time_till(end)*1e-3)


print "Timing vectorized exponentiation:"

start.record()
dev_expx_short = cumath.exp(dev_x_short)
end.record() 
end.synchronize()
print "GPU array calc time (initial): %fs" %(start.time_till(end)*1e-3)

start.record()
dev_expx = cumath.exp(dev_x)
end.record() 
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

start.record()
exp_x = np.exp(x)
end.record() 
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)
Example #22
    def bptt(self, data, temperature, length):
        """full back propagation through time"""
        loss = 0
        D_forget_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
        D_forget_b = np.zeros((self.hidden_s, 1))
        D_sel_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
        D_sel_b = np.zeros((self.hidden_s, 1))
        D_add_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
        D_add_b = np.zeros((self.hidden_s, 1))
        D_write_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
        D_write_b = np.zeros((self.hidden_s, 1))
        D_biases = np.zeros((self.output_s, 1))
        D_weights = np.zeros((self.output_s, self.hidden_s))
        hANDx = []
        forget_in = []
        sel_in = []
        add_in = []
        write_in = []
        c_hist = []
        h_hist = []
        forget_ = []
        sel_ = []
        add_ = []
        write_ = []
        prediction = []
        c_init = np.copy(self.c)

        E_over_write_next = np.zeros((1, self.hidden_s))
        E_over_c_next = np.zeros((1, self.hidden_s))

        # first forward propagation
        if self.gpu:  # in gpu mode

            all_weights = np.concatenate((self.forget_w, self.sel_w, self.write_w, self.add_w))
            all_biases = np.concatenate((self.forget_b, self.sel_b, self.write_b, self.add_b))
            all_weights_gpu = gpu.to_gpu(all_weights.astype(np.float32))
            all_biases_gpu = gpu.to_gpu(all_biases.astype(np.float32))
            z = gpu.zeros((self.hidden_s * 4, 1), np.float32)

            for i in range(length):
                x = data[i]

                # obtain z
                hx = np.concatenate((self.h, x))
                hANDx.append(hx)
                hx_gpu = gpu.to_gpu(hx.astype(np.float32))
                self.kernel(all_weights_gpu, hx_gpu, z, grid=(self.num_block, 1, 1),
                            block=(self.num_thread_per_block, 1, 1))
                z += all_biases_gpu
                z_cpu = z.get()
                forget_in.append(z_cpu[:self.hidden_s, :1])
                sel_in.append(z_cpu[self.hidden_s:self.hidden_s * 2, :1])
                write_in.append(z_cpu[self.hidden_s * 2:self.hidden_s * 3, :1])
                add_in.append(z_cpu[self.hidden_s * 3:, :1])

                # non-linearity
                z[:self.hidden_s * 3, :1] = 1.0 / (gpum.exp(-1 * z[:self.hidden_s * 3, :1]) + 1.0)
                z[self.hidden_s * 3:, :1] = 1.7159 * gpum.tanh(2 / 3.0 * z[self.hidden_s * 3:, :1])
                z_cpu = z.get()
                forget_.append(z_cpu[:self.hidden_s, :1])
                sel_.append(z_cpu[self.hidden_s:self.hidden_s * 2, :1])
                write_.append(z_cpu[self.hidden_s * 2:self.hidden_s * 3, :1])
                add_.append(z_cpu[self.hidden_s * 3:, :1])

                # update cell and hidden
                self.c = z_cpu[:self.hidden_s, :1] * self.c + z_cpu[self.hidden_s:self.hidden_s * 2, :1] \
                                * z_cpu[self.hidden_s * 3:, :1]
                self.h = z_cpu[self.hidden_s * 2: self.hidden_s * 3, :1] * Tanh(self.c)
                c_hist.append(self.c + 0)
                h_hist.append(self.h + 0)

                # output
                res = Softmax(np.dot(self.weights, self.h) + self.biases, temperature)
                prediction.append(res)
                loss += -np.log(res[np.argmax(data[i + 1]), 0])

        else:
            for i in range(length):
                x = data[i]
                info = np.concatenate((self.h, x), axis=0)
                hANDx.append(info)

                a = np.dot(self.forget_w, info) + self.forget_b
                forget_in.append(a)
                forget = Sigmoid(a)
                forget_.append(forget)

                a = np.dot(self.sel_w, info) + self.sel_b
                sel_in.append(a)
                select = Sigmoid(a)
                sel_.append(select)

                a = np.dot(self.add_w, info) + self.add_b
                add_in.append(a)
                add = Tanh(a)
                add_.append(add)

                self.c = self.c * forget + select * add
                a = np.dot(self.write_w, info) + self.write_b
                write_in.append(a)
                write = Sigmoid(a)
                write_.append(write)
                c_hist.append(np.copy(self.c))
                self.h = write * Tanh(self.c)
                h_hist.append(np.copy(self.h))

                a = np.dot(self.weights, self.h) + self.biases
                res = Softmax(a, temperature)
                prediction.append(res)
                loss += -np.log(res[np.argmax(data[i+1]), 0])

        # back propagation through time
        for i in range(length-1, -1, -1):

            # some variable
            hx_t = np.transpose(hANDx[i])

            # obtain current layer delta
            delta = prediction[i] - data[i+1]
            D_biases += delta
            D_weights += np.dot(delta, h_hist[i].T)

            # obtain E_over_h w.r.t. current layer delta
            delta_h = np.dot(delta.T, self.weights)

            # obtain E_over_h w.r.t. write gate
            if i == length-1:
                write_h = np.zeros((1, self.hidden_s))
            else:
                diag_sigmoid_grad = numpy.matlib.repmat(sigmoid_grad(write_in[i+1]), 1, self.hidden_s)
                write_w_part = self.write_w[:, :self.hidden_s]
                write_over_h = diag_sigmoid_grad * write_w_part
                write_h = np.dot(E_over_write_next, write_over_h)

            # obtain E_over_h w.r.t. memory cell
            if i == length-1:
                c_h = np.zeros((1, self.hidden_s))
            else:

                # part A: forget_over_h
                diag_sigmoid_grad = numpy.matlib.repmat(sigmoid_grad(forget_in[i+1]), 1, self.hidden_s)
                forget_w_part = self.forget_w[:, :self.hidden_s]
                forget_over_h = diag_sigmoid_grad * forget_w_part
                forget_over_h *= numpy.matlib.repmat(c_hist[i], 1, self.hidden_s)

                # part B: sel_over_h
                diag_sigmoid_grad = numpy.matlib.repmat(sigmoid_grad(sel_in[i+1]), 1, self.hidden_s)
                sel_w_part = self.sel_w[:, :self.hidden_s]
                sel_over_h = diag_sigmoid_grad * sel_w_part
                sel_over_h *= numpy.matlib.repmat(add_[i+1], 1, self.hidden_s)

                # part C: add_over_h
                diag_sigmoid_grad = numpy.matlib.repmat(sigmoid_grad(add_in[i+1]), 1, self.hidden_s)
                add_w_part = self.add_w[:, :self.hidden_s]
                add_over_h = diag_sigmoid_grad * add_w_part
                add_over_h *= numpy.matlib.repmat(sel_[i+1], 1, self.hidden_s)

                # finally c_h
                c_over_h = forget_over_h + sel_over_h + add_over_h
                c_h = np.dot(E_over_c_next, c_over_h)

            # obtain E_over_h and relevant gradients
            E_over_h = delta_h + write_h + c_h

            # write gate update
            update_write = E_over_h * np.transpose(Tanh(c_hist[i]))
            update_write *= np.transpose(sigmoid_grad(write_in[i]))
            D_write_b += update_write.T
            D_write_w += np.dot(update_write.T, hx_t)

            # memory cell update, with E_over_c recursively, and update E_over_c_next as well
            E_over_c = E_over_h * np.transpose(write_[i]) * np.transpose(tanh_grad(c_hist[i]))
            if i == length-1:
                E_over_c_next = E_over_c
            else:
                E_over_c += E_over_c_next * np.transpose(forget_[i+1])
                E_over_c_next = E_over_c

            # forget gate update
            if i == 0:
                c_last = c_init
            else:
                c_last = c_hist[i-1]
            update_forget = E_over_c * np.transpose(c_last) * np.transpose(sigmoid_grad(forget_in[i]))
            D_forget_b += update_forget.T
            D_forget_w += np.dot(update_forget.T, hx_t)

            # sel update
            update_sel = E_over_c * np.transpose(add_[i])
            update_sel *= np.transpose(sigmoid_grad(sel_in[i]))
            D_sel_b += update_sel.T
            D_sel_w += np.dot(update_sel.T, hx_t)

            # add update
            update_add = E_over_c * np.transpose(sel_[i])
            update_add *= np.transpose(tanh_grad(add_in[i]))
            D_add_b += update_add.T
            D_add_w += np.dot(update_add.T, hx_t)

            # update E_over_write
            E_over_write_next = E_over_h * np.transpose(Tanh(c_hist[i]))

        for each in [D_forget_w, D_forget_b, D_sel_w, D_sel_b, D_add_w, D_add_b,
                D_write_w, D_write_b, D_weights, D_biases]:
            np.clip(each, -30, 30, out=each)

        return D_forget_w, D_forget_b, D_sel_w, D_sel_b, D_add_w, D_add_b, \
                D_write_w, D_write_b, D_weights, D_biases, loss/(length+0.0)
Example #23
def softmax(mat):
    L = logsumexp(mat)
    return cumath.exp(add_vec_to_mat(mat, -L, inplace=True))
Example #24
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns, delta_t):
	# Recopy constants into device constant memory

	# **-- constants --**
	max_tpb = 1024 
	
	nmodes_gpu = mod.get_global("nmodes")[0]
	nsamps_gpu = mod.get_global("nsamps")[0]
	ntimes_gpu = mod.get_global("ntimes")[0]
	nclmns_gpu = mod.get_global("nclmns")[0]

	cuda.memcpy_htod(nmodes_gpu, np.array(nmodes, ndmin=1).astype(np.int32))
	cuda.memcpy_htod(nsamps_gpu, np.array(nsamps, ndmin=1).astype(np.int32))
	cuda.memcpy_htod(ntimes_gpu, np.array(ntimes, ndmin=1).astype(np.int32))
	cuda.memcpy_htod(nclmns_gpu, np.array(nclmns, ndmin=1).astype(np.int32))

	# Get GPU functions

	GPU_find_max_in_shrmem = mod.get_function("find_max_in_shrmem")
	GPU_nv_reduc = mod.get_function("nv_reduc")
	GPU_bcast_vec_to_matrix = mod.get_function("bcast_vec_to_matrix")

	def next_greater_power_of_2(x):
		return 2**(x-1).bit_length()

	griddimx = int(nclmns / max_tpb)
	griddimy = int(nsamps)
	# One thread per sample-time
	grd = (griddimx, griddimy, 1)
	blk = (max_tpb,  1,        1)

	print("Finding Maximum...\n")
	# Get the maxes
	GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(max_tpb*8))

	griddimy = int(nsamps)
	blokdimx = next_greater_power_of_2(griddimx) # Only need as many threads as we had blocks in x dimension
	grd = (1, griddimy, 1)
	blk = (blokdimx, 1, 1)
	# Second reduction - this works as long as we don't have rhoTS longer than 1024^2
	GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(blokdimx*8))
	
	# Collect the maxes through the host	
	maxes = np.array(all_l_rhots_gpu[:,0][nmodes-2::nmodes].get()).astype(np.float64)
	maxes_gpu = gpuarray.to_gpu(maxes)
	
	griddimx = int(nclmns / max_tpb)
	griddimy = int(nsamps)
	# One thread per sample-time
	grd = (griddimx, griddimy, 1)
	blk = (max_tpb,  1,        1)


	GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8)

	# Exponentiating a bunch of zeros creates a bunch of extra ones that we don't want in our
	# sum, so this is the number we need to subtract out to offset it
	padwidth = nclmns - ntimes


	all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate 

	print("Reducing final answer...\n")
	GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk, shared=max_tpb*8) # sum over time 

	griddimy = int(nsamps)
	blokdimx = next_greater_power_of_2(griddimx) # Only need as many threads as we had blocks in x dimension
	grd = (1, griddimy, 1)
	blk = (blokdimx, 1, 1)

	GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk, shared=blokdimx*8) # sum over time 

	lnL = (all_l_rhots_gpu[:,0][nmodes-1::nmodes].get() - padwidth).astype(np.float64)
	lnL_gpu = gpuarray.to_gpu(lnL)
	lnL_gpu = maxes_gpu + cumath.log(lnL_gpu*delta_t)

	return lnL_gpu.get()
Example #25
    for step in xrange(N_TIMESTEPS):
        # print step
        # Implementing split-step method
        # Update wavefunction and reservoir, record density
        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        psi_gpu *= kineticFactorHalf_gpu
        cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True)

        # currentDensity_gpu = abs(psi_gpu) ** 2
        # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2
        currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real
        # modSquared.prepared_call(grid, block, psi_gpu.gpudata,
        #                          currentDensity_gpu.gpudata, 1024)
        # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu)
        n_gpu *= cumath.exp(
            misc.add(-gammaRdt_gpu,
                     -misc.multiply(Rdt_gpu, currentDensity_gpu)))
        n_gpu += Pdt_gpu
        psi_gpu *= cumath.exp(
            misc.add(
                misc.add(
                    misc.multiply(expFactorPolFirst_gpu, n_gpu),
                    misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)),
                expFactorPolThird_gpu))

        #  psiNonlinear.prepared_call(grid, block, expFactorPolFirst,
        #                             expFactorPolSecond, expFactorPolThird,
        #                             psi_gpu.gpudata, n_gpu.gpudata,
        #                             currentDensity_gpu.gpudata, 1024)

        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
Example #26
    def batch_backward_gpu(self, X, Y, mini_batch_size):
        """the backward propagation algorithm to compute the gradient of the cost function, with a batched version
           for using CUDA-based gpu"""
        D_biases = [np.zeros(b.shape) for b in self.biases]
        D_weights = [np.zeros(w.shape) for w in self.weights]
        weights_gpu = [gpu.to_gpu(self.weights[i].astype(np.float32)) for i in range(self.num_layers - 1)]

        # feed-forward
        activation = gpu.to_gpu(X.astype(np.float32))
        activations = [activation]
        zs = []
        count = 0
        for b, w in zip(self.biases, self.weights):

            # gpu mode variables
            Z = gpu.zeros((self.sizes[count + 1], mini_batch_size), np.float32)
            kernel = self.kernel1[count]
            kernel(weights_gpu[count], activation, Z, grid=(self.num_block1[count], 1, 1),
                   block=(self.num_thread_per_block, 1, 1))
            Z += gpu.to_gpu(mat.repmat(b, 1, mini_batch_size).astype(np.float32))
            zs.append(Z)

            if count < self.num_layers - 2:
                if self.act == 0:
                    activation = 1.0 / (1.0 + gpum.exp(-1 * Z))
                    activations.append(activation)
                elif self.act == 1:
                    activation = 1.7159 * gpum.tanh(2.0 / 3.0 * Z)
                    activations.append(activation)
                else:
                    activation = gpu.zeros(Z.shape, np.float32)
                    kernel = self.kernel3[count]
                    kernel(Z, activation, grid=(self.num_block3[count], 1, 1), block=(self.num_thread_per_block, 1, 1))
                    activations.append(activation)
            else:
                activation = Softmax(Z.get())
                activations.append(activation)
            count += 1

        # backward
        # first the special case for output layer (due to softmax layer)
        delta = activations[-1] - Y
        D_biases[-1] = np.array([np.sum(delta, axis=1)]).T
        delta = gpu.to_gpu(delta.astype(np.float32))
        D_weights_gpu = gpu.zeros(D_weights[-1].shape, np.float32)
        kernel = self.kernel5[-1]

        # these are for handling the bug in PyCuda library regarding matrix transpose
        a_t = gpu.to_gpu(np.zeros((activations[-2].shape[1], activations[-2].shape[0]), np.float32)
                         + gpu.transpose(activations[-2]).get())

        # execute the kernel to update D_weights[-1]
        kernel(delta, a_t, D_weights_gpu, grid=(self.num_block5[-1], 1, 1),
               block=(self.num_thread_per_block, 1, 1))
        D_weights[-1] = D_weights_gpu.get()

        # then compute the derivative of other hidden layers, from large to small
        count = 0
        for l in range(2, self.num_layers):
            Z = zs[-l]
            if self.act == 0:
                grad = (1.0 / (1.0 + gpum.exp(-1 * Z))) * (1 - (1.0 / (1.0 + gpum.exp(-1 * Z))))
            elif self.act == 1:
                grad = 1.7159 * 2 / 3.0 * (1 - (gpum.tanh(2.0/3.0 * Z)) ** 2)
            else:
                grad = gpu.zeros(Z.shape, np.float32)
                kernel = self.kernel4[count]
                kernel(Z, grad, grid=(self.num_block4[count], 1, 1), block=(self.num_thread_per_block, 1, 1))

            product = gpu.zeros((self.weights[-l+1].shape[1], mini_batch_size), np.float32)
            kernel = self.kernel2[count]
            weights_t = gpu.to_gpu((np.zeros((weights_gpu[-l + 1].shape[1], weights_gpu[-l + 1].shape[0]))
                                    + self.weights[-l+1].T).astype(np.float32))
            kernel(weights_t, delta, product, grid=(self.num_block2[count], 1, 1),
                        block=(self.num_thread_per_block, 1, 1))
            delta = product * grad

            # for each weights and biases
            D_biases[-l] = np.array([np.sum(delta.get(), axis=1)]).T
            kernel = self.kernel5[-l]
            a_t = gpu.to_gpu(np.zeros((activations[-l-1].shape[1], activations[-l-1].shape[0]), np.float32)
                                    + gpu.transpose(activations[-l-1]).get())
            D_weights_gpu = gpu.zeros(D_weights[-l].shape, np.float32)
            kernel(delta, a_t, D_weights_gpu, grid=(self.num_block5[-l], 1, 1),
                   block=(self.num_thread_per_block, 1, 1))
            D_weights[-l] = D_weights_gpu.get()
            count += 1

        return D_biases, D_weights
Example #27
    for step in xrange(N_TIMESTEPS):
        # print step
        # Implementing split-step method
        # Update wavefunction and reservoir, record density
        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        psi_gpu *= kineticFactorHalf_gpu
        cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True)

        # currentDensity_gpu = abs(psi_gpu) ** 2
        # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2
        currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real
        # modSquared.prepared_call(grid, block, psi_gpu.gpudata,
        #                          currentDensity_gpu.gpudata, 1024)
        # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu)
        n_gpu *= cumath.exp(misc.add(- gammaRdt_gpu,
                                     - misc.multiply(Rdt_gpu, currentDensity_gpu)))
        n_gpu += Pdt_gpu
        psi_gpu *= cumath.exp(
            misc.add(
                misc.add(misc.multiply(expFactorPolFirst_gpu, n_gpu),
                         misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)),
                expFactorPolThird_gpu))

        #  psiNonlinear.prepared_call(grid, block, expFactorPolFirst,
        #                             expFactorPolSecond, expFactorPolThird,
        #                             psi_gpu.gpudata, n_gpu.gpudata,
        #                             currentDensity_gpu.gpudata, 1024)

        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        # record spectrum
        drv.memcpy_dtod(spectrum[step, :].gpudata, psi_gpu[N//2, :].gpudata,
Example #28
def softmax(mat):
    tmp = gpuarray.empty_like(mat)
    L = logsumexp(mat)
    tmp = add_vec_to_mat(mat, L, substract=True)
    tmp = cumath.exp(tmp)
    return tmp
start = drv.Event()
end = drv.Event()

x = np.random.normal(size = N)

start.record()
dX = gpuarray.to_gpu(x)
end.record() 
end.synchronize()
print "Transfer to GPU time: %fs" %(start.time_till(end)*1e-3)


print "Timing vectorized exponentiation:"

start.record()
dexpX = cumath.exp(dX)
end.record() 
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

start.record()
expX = np.exp(x)
end.record() 
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)

print "Timing vectorized dot product/sum of squares:"

start.record()
gpuarray.dot(dX,dX)
end.record() 
Example #30
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns,
                        delta_t):
    # Recopy constants into device constant memory

    # **-- constants --**
    max_tpb = 1024

    nmodes_gpu = mod.get_global("nmodes")[0]
    nsamps_gpu = mod.get_global("nsamps")[0]
    ntimes_gpu = mod.get_global("ntimes")[0]
    nclmns_gpu = mod.get_global("nclmns")[0]

    cuda.memcpy_htod(nmodes_gpu, np.array(nmodes, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(nsamps_gpu, np.array(nsamps, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(ntimes_gpu, np.array(ntimes, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(nclmns_gpu, np.array(nclmns, ndmin=1).astype(np.int32))

    # Get GPU functions

    GPU_find_max_in_shrmem = mod.get_function("find_max_in_shrmem")
    GPU_nv_reduc = mod.get_function("nv_reduc")
    GPU_bcast_vec_to_matrix = mod.get_function("bcast_vec_to_matrix")

    def next_greater_power_of_2(x):
        return 2**(x - 1).bit_length()

    griddimx = int(nclmns / max_tpb)
    griddimy = int(nsamps)
    # One thread per sample-time
    grd = (griddimx, griddimy, 1)
    blk = (max_tpb, 1, 1)

    print("Finding Maximum...\n")
    # Get the maxes
    GPU_find_max_in_shrmem(all_l_rhots_gpu,
                           grid=grd,
                           block=blk,
                           shared=int(max_tpb * 8))

    griddimy = int(nsamps)
    blokdimx = next_greater_power_of_2(
        griddimx)  # Only need as many threads as we had blocks in x dimension
    grd = (1, griddimy, 1)
    blk = (blokdimx, 1, 1)
    # Second reduction - this works as long as we don't have rhoTS longer than 1024^2
    GPU_find_max_in_shrmem(all_l_rhots_gpu,
                           grid=grd,
                           block=blk,
                           shared=int(blokdimx * 8))

    # Collect the maxes through the host
    maxes = np.array(all_l_rhots_gpu[:, 0][nmodes - 2::nmodes].get()).astype(
        np.float64)
    maxes_gpu = gpuarray.to_gpu(maxes)

    griddimx = int(nclmns / max_tpb)
    griddimy = int(nsamps)
    # One thread per sample-time
    grd = (griddimx, griddimy, 1)
    blk = (max_tpb, 1, 1)

    GPU_bcast_vec_to_matrix(all_l_rhots_gpu,
                            -maxes_gpu,
                            grid=grd,
                            block=blk,
                            shared=8)

    # Exponentiating a bunch of zeros creates a bunch of extra ones that we don't want in our
    # sum, so this is the number we need to subtract out to offset it
    padwidth = nclmns - ntimes

    all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)  # exponentiate

    print("Reducing final answer...\n")
    GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk,
                 shared=max_tpb * 8)  # sum over time

    griddimy = int(nsamps)
    blokdimx = next_greater_power_of_2(
        griddimx)  # Only need as many threads as we had blocks in x dimension
    grd = (1, griddimy, 1)
    blk = (blokdimx, 1, 1)

    GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk,
                 shared=blokdimx * 8)  # sum over time

    lnL = (all_l_rhots_gpu[:, 0][nmodes - 1::nmodes].get() - padwidth).astype(
        np.float64)
    lnL_gpu = gpuarray.to_gpu(lnL)
    lnL_gpu = maxes_gpu + cumath.log(lnL_gpu * delta_t)

    return lnL_gpu.get()
Example #31
def logsumexp(mat):
    max_dim = max_by_axis(mat, 1)
    tmp = add_vec_to_mat(mat, -max_dim, 0)
    L = max_dim + cumath.log(matrix_sum_out_axis(cumath.exp(tmp), 1))
    return L
Example #32
def softmax(mat):
    L = logsumexp(mat)
    return cumath.exp(add_vec_to_mat(mat, -L, inplace=True))
Example #33
 def exp(self):
     return CUDAArray(cumath.exp(self.arr))
Example #34
end = drv.Event()

x = np.random.normal(size=n)
x_short = np.random.normal(size=8)

start.record()
dev_x = gpuarray.to_gpu(x)
dev_x_short = gpuarray.to_gpu(x_short)
end.record()
end.synchronize()
print "Transfer to GPU time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized exponentiation:"

start.record()
dev_expx_short = cumath.exp(dev_x_short)
end.record()
end.synchronize()
print "GPU array calc time (initial): %fs" % (start.time_till(end) * 1e-3)

start.record()
dev_expx = cumath.exp(dev_x)
end.record()
end.synchronize()
print "GPU array calc time: %fs" % (start.time_till(end) * 1e-3)

start.record()
exp_x = np.exp(x)
end.record()
end.synchronize()
print "CPU calc time: %fs" % (start.time_till(end) * 1e-3)
Example #35
def logsumexp(mat):
    max_dim = max_by_axis(mat, 1)
    tmp = add_vec_to_mat(mat, -max_dim, 0)
    L = max_dim + cumath.log(matrix_sum_out_axis(cumath.exp(tmp), 1))
    return L
Example #36
for i in range(10):
    gpu_ind.set(indices[i])
    gpuarray.take(probs, gpu_ind, out=selected_probs)
    utils.scalar_sub(selected_probs, 1.0, selected_probs)
    gpuarray.multi_put([selected_probs], gpu_ind, out=[probs])

#print probs

t1 = time.clock()

for i in range(N):
    # get the softmax probs first
    utils.max(scores, 1, maxscores, maxscoreids)
    utils.sub_matvec(scores, maxscores, 0, deltas)
    cumath.exp(deltas, out=deltas)
    scm.sum(deltas, 1, sumdeltas)
    utils.div_matvec(deltas, sumdeltas, 0, probs)
#    probs.get(cpu_probs)
#    cpu_probs[np.arange(B), indices[i]] -= 1
#    probs.set(cpu_probs)
    gpu_ind.set(indices[i])
    gpuarray.take(probs, gpu_ind, out=selected_probs)
    utils.scalar_sub(selected_probs, 1.0, selected_probs)
    gpuarray.multi_put([selected_probs], gpu_ind, out=[probs])

t2 = time.clock()

#print probs

print 'tdiff = %.3f, per loop = %.6f, wps = %.3f' % ((t2-t1), (t2-t1)/N,
Example #37
for i in range(10):
    gpu_ind.set(indices[i])
    gpuarray.take(probs, gpu_ind, out=selected_probs)
    utils.scalar_sub(selected_probs, 1.0, selected_probs)
    gpuarray.multi_put([selected_probs], gpu_ind, out=[probs])

#print probs

t1 = time.clock()

for i in range(N):
    # get the softmax probs first
    utils.max(scores, 1, maxscores, maxscoreids)
    utils.sub_matvec(scores, maxscores, 0, deltas)
    cumath.exp(deltas, out=deltas)
    scm.sum(deltas, 1, sumdeltas)
    utils.div_matvec(deltas, sumdeltas, 0, probs)
    #    probs.get(cpu_probs)
    #    cpu_probs[np.arange(B), indices[i]] -= 1
    #    probs.set(cpu_probs)
    gpu_ind.set(indices[i])
    gpuarray.take(probs, gpu_ind, out=selected_probs)
    utils.scalar_sub(selected_probs, 1.0, selected_probs)
    gpuarray.multi_put([selected_probs], gpu_ind, out=[probs])

t2 = time.clock()

#print probs

print 'tdiff = %.3f, per loop = %.6f, wps = %.3f' % ((t2 - t1),
def register_multiple_images_subpix_cuda(stack, template):

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as drv
    import pycuda.cumath as cumath
    import skcuda.fft as cu_fft
    import skcuda.linalg as lin
    import skcuda.cublas as cub
    from numpy import pi, newaxis, floor
    import cmath
    from pycuda.elementwise import ElementwiseKernel
    from pycuda.compiler import SourceModule

    from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix
    from numpy.fft import fftshift, ifftshift
    fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1,
                                                   use_numpy_fft=False)

    mod = SourceModule("""
    #include <pycuda-complex.hpp>
   
    __global__ void load_convert(unsigned short *a, float *b,int f, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = f * imlen;
        if (idx <imlen)
        {
            b[idx] = (float)a[offset+idx];
        }
    }
        
    __global__ void convert_export(float *a, unsigned short *b,int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ;
        }
    }
        
    __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx] = x[idx] * y[idx];
        }
    }
        
    __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = x[idx]._M_re;
            y[idx]._M_im = -x[idx]._M_im;
        }
    }
        
        
    __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = 0;
            y[idx]._M_im = x[idx] * sx;
        }
    }
        
    __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen,  int nlargeh, int nh)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = imlenl*3/4;
        if (idx<imlen)
        {
            int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl;
            x[target_ind] = y[idx];
        }      
    
    }    
        
    __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx]._M_re = 0;
            z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg;
        }
    }
        
    __global__ void sub_float(float *x, float *y, float sv,  int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            x[idx] = y[idx]-sv;
        }
    }
        

    """)

    load_convert_kernel = mod.get_function('load_convert')
    convert_export_kernel = mod.get_function('convert_export')
    convert_multiply_kernel = mod.get_function('convert_multiply')
    multiply_float_kernel = mod.get_function('multiply_comp_float')
    transfer_array_kernel = mod.get_function('transfer_array')
    calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix')
    conj_kernel = mod.get_function('calc_conj')
    sub_float_kernel = mod.get_function('sub_float')

    Z = stack.shape[0]
    M = stack.shape[1]
    N = stack.shape[2]
    max_memsize = 4200000000

    imlen = M * N
    half_imlen = M * (N // 2 + 1)
    grid_dim = (64, int(imlen / (512 * 64)) + 1, 1)
    block_dim = (512, 1, 1)  #512 threads per block

    stack_bin = int(max_memsize / (M * N * stack.itemsize))
    stack_ite = int(Z / stack_bin) + 1

    usfac = 100  ## needs to be bigger than 10

    if not template.shape == stack.shape[1:]:
        raise ValueError("Images must have same shape.")

    if np.any(np.isnan(template)):
        template = template.copy()
        template[template != template] = 0
    if np.any(np.isnan(stack)):
        stack = stack.copy()
        stack[stack != stack] = 0

    mlarge = M * 2
    nlarge = N * 2

    t = time.time()

    plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64)
    plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32)
    plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32)
    cub_h = cub.cublasCreate()

    template_gpu = gpuarray.to_gpu(template.astype('float32'))
    source_gpu = gpuarray.empty((M, N), np.float32)
    ifft_gpu = gpuarray.empty((M, N), np.float32)
    result_gpu = gpuarray.empty((M, N), np.uint16)

    templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64)

    cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True)
    templatef_gpu = templatef_gpu.conj()

    move_list = np.zeros((Z, 2))

    largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64)
    largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32)
    imlenl = mlarge * (nlarge // 2 + 1)

    zoom_factor = 1.5
    dftshiftG = trunc(ceil(usfac * zoom_factor) / 2)
    #% Center of output array at dftshift+1
    upsample_dim = int(ceil(usfac * zoom_factor))

    term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)).
              T[:, newaxis]) / N  # fftfreq  # output points
    term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :]
    term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis]
    term2r = (ifftshift(np.arange(M, dtype='float')) -
              floor(M / 2))[newaxis, :]  # fftfreq
    term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) +
                                             1), :].astype('float32'))
    term2c_gpu = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu = gpuarray.to_gpu(term1r.astype('float32'))
    term2r_gpu = gpuarray.to_gpu(term2r.astype('float32'))
    term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32'))

    kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32)
    kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32)
    kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64)
    kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64)

    Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M))
    Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N))
    [Nc, Nr] = np.meshgrid(Nc, Nr)
    Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32'))
    Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32'))

    upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64)
    upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64)

    source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype)
    copy = drv.Memcpy3D()
    copy.set_src_host(stack.data)
    copy.set_dst_device(source_stack.gpudata)
    copy.width_in_bytes = copy.src_pitch = stack.strides[1]
    copy.src_height = copy.height = M

    for zb in range(stack_ite):

        zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z))
        copy.depth = len(zrange)
        copy.src_z = int(zrange[0])
        copy()

        for i in range(len(zrange)):

            t = zb * stack_bin + i
            load_convert_kernel(source_stack,
                                source_gpu.gpudata,
                                np.int32(i),
                                np.int32(imlen),
                                block=block_dim,
                                grid=grid_dim)
            cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True)

            multiply_float_kernel(sourcef_gpu,
                                  templatef_gpu,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            transfer_array_kernel(largearray1_gpu,
                                  prod_gpu1,
                                  np.int32(imlenl),
                                  np.int32(half_imlen),
                                  np.int32(nlarge // 2 + 1),
                                  np.int32(N // 2 + 1),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(largearray1_gpu,
                        largearray2_gpu,
                        plan_inverse_big,
                        scale=True)
            peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size,
                                       largearray2_gpu.gpudata, 1)
            rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape)

            md2 = trunc(mlarge / 2)
            nd2 = trunc(nlarge / 2)
            if rloc > md2:
                row_shift2 = rloc - mlarge
            else:
                row_shift2 = rloc
            if cloc > nd2:
                col_shift2 = cloc - nlarge
            else:
                col_shift2 = cloc
            row_shiftG = row_shift2 / 2.
            col_shiftG = col_shift2 / 2.

            # Initial shift estimate in upsampled grid

            row_shiftG0 = round(row_shiftG * usfac) / usfac
            col_shiftG0 = round(col_shiftG * usfac) / usfac
            # Matrix multiply DFT around the current shift estimate
            roffG = dftshiftG - row_shiftG0 * usfac
            coffG = dftshiftG - col_shiftG0 * usfac

            sub_float_kernel(term2c_gpu,
                             term2c_gpu_ori,
                             np.float32(coffG / usfac),
                             np.int32(term2c_gpu.size),
                             block=block_dim,
                             grid=grid_dim)
            sub_float_kernel(term1r_gpu,
                             term1r_gpu_ori,
                             np.float32(roffG),
                             np.int32(term1r_gpu.size),
                             block=block_dim,
                             grid=grid_dim)

            lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu)
            lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu)
            convert_multiply_kernel(kernc_gpu,
                                    kernc_gpuc,
                                    np.float32(-2 * pi),
                                    np.int32(kernc_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            convert_multiply_kernel(kernr_gpu,
                                    kernr_gpuc,
                                    np.float32(-2 * pi / (M * usfac)),
                                    np.int32(kernr_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(kernc_gpuc, out=kernc_gpuc)
            cumath.exp(kernr_gpuc, out=kernr_gpuc)

            conj_kernel(prod_gpu1,
                        prod_gpu2,
                        np.int32(half_imlen),
                        block=block_dim,
                        grid=grid_dim)

            lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1)
            lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2)

            CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2)
            rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape)
            CCGmax = CCG[rlocG, clocG]

            rlocG = rlocG - dftshiftG  #+ 1 # +1 # questionable/failed hack + 1;
            clocG = clocG - dftshiftG  #+ 1 # -1 # questionable/failed hack - 1;
            row_shiftG = row_shiftG0 + rlocG / usfac
            col_shiftG = col_shiftG0 + clocG / usfac

            diffphaseG = arctan2(imag(CCGmax), real(CCGmax))

            # Compute registered version of source stack
            calc_shiftmatrix_kernel(Nr_gpu,
                                    Nc_gpu,
                                    shiftmatrix,
                                    np.float32(row_shiftG * 2 * np.pi),
                                    np.float32(col_shiftG * 2 * np.pi),
                                    np.float32(diffphaseG),
                                    np.int32(half_imlen),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(shiftmatrix, out=shiftmatrix)
            multiply_float_kernel(sourcef_gpu,
                                  shiftmatrix,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse)
            convert_export_kernel(ifft_gpu,
                                  result_gpu,
                                  np.int32(imlen),
                                  block=block_dim,
                                  grid=grid_dim)

            move_list[t, :] = (row_shiftG, col_shiftG)
            stack[t, :, :] = result_gpu.get()

    cub.cublasDestroy(cub_h)
    return (stack, move_list)
Example #39
N = 100000

# --- Create random vectorson the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)
h_c = np.empty_like(h_a)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)

start.record()
d_c = (cumath.sqrt(cumath.fabs(d_a)) + cumath.exp(d_b))
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Processing time = %fs" % (secs))

h_c = d_c.get()

if np.all(abs(h_c - (np.sqrt(np.abs(h_a)) + np.exp(h_b))) < 1e-5):
    print("Test passed!")
else:
    print("Error!")

# --- Flush context printf buffer
cuda.Context.synchronize()
Example #40
 def compute_displace(self, amp, r, buffer):
     cumath.exp(self._iqx * r.x() + self._iqxz * r.z(), out=self._ex)
     cumath.exp(self._iqy * r.y() + self._iqyz * r.z(), out=self._ey)
     self._ey *= amp
     self.outer(self._ey, self._ex, buffer)
Example #41
                       grid=grd,
                       block=blk,
                       shared=int(max_tpb * 8))

# Indexes are not contiguous

griddimx = int(nclmns / max_tpb)
griddimy = int(nsamps)
# One thread per sample-time
grd = (griddimx, griddimy, 1)
blk = (max_tpb, 1, 1)

maxes = np.array(all_l_rhots_gpu[:, 0][1::nmodes].get()).astype(np.float64)
maxes_gpu = gpuarray.to_gpu(maxes)

GPU_bcast_vec_to_matrix(all_l_rhots_gpu,
                        -maxes_gpu,
                        grid=grd,
                        block=blk,
                        shared=8)
# ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 *****
''' 
 Marginalize over Time
'''

all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)  # exponentiate

GPU_nv_reduc(all_l_rhots_gpu)  # sum over time

lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu)  # TIMES DELTA T FIXME
Example #42
def softmax(mat):
    tmp = gpuarray.empty_like(mat)
    L = logsumexp(mat)
    tmp = add_vec_to_mat(mat, L, substract=True)
    tmp = cumath.exp(tmp)
    return tmp
Example #43
def sigmoid(x, deriv=False):
    if deriv:
        return x * (1.0 - x)
    else:
        return 1.0 / (1.0 + cm.exp(-x))
Example #44
def relu(x, deriv=False):
    if deriv:
        return 1.0 - cm.exp(-x)
    else:
        return gpu.maximum(x, 0)
Example #45
########

N = 10

# --- Create random vectorson the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)
h_c = np.empty_like(h_a, dtype=np.complex64)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)

d_c = d_a * cumath.exp(1j * d_b)

h_c = d_c.get()

if np.array_equal(h_c, h_a * np.exp(1j * h_b)):
    print("Test passed!")
else:
    print("Error!")

print(h_c)
print(h_a * np.exp(1j * h_b))

# --- Flush context printf buffer
cuda.Context.synchronize()
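
A side note on the equality check in the last example: comparing GPU results to CPU results with np.array_equal is fragile, because floating-point rounding can differ between the two paths. A tolerance-based comparison is the more common pattern; an alternative check using the same h_a, h_b, h_c defined above:

# Tolerance-based comparison of GPU and CPU results (variables as defined in Example #45).
if np.allclose(h_c, h_a * np.exp(1j * h_b), rtol=1e-5, atol=1e-6):
    print("Test passed!")
else:
    print("Error!")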