def calculate_H_gpu(X, W, P):
    WPW = la.add_diag(P, la.dot(W, W, "t", "n"))
    tmp = la.dot(W, la.inv(WPW, overwrite=True))
    H = la.dot(X, tmp, "n", "t")
    H = gpu.maximum(H, 0)
    H = to_unit_variance(H)
    return H, tmp
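
For reference, a minimal NumPy sketch of the same computation (my own addition, not part of the original code; it assumes la.dot(a, b, "t", "n") computes a.T dot b, la.add_diag(P, M) adds the vector P to the diagonal of M, and to_unit_variance rescales each column to unit variance):

import numpy as np

def calculate_H_numpy(X, W, P):
    # CPU sketch of calculate_H_gpu under the assumptions stated above.
    WPW = np.dot(W.T, W) + np.diag(P)              # (m, m)
    tmp = np.dot(W, np.linalg.inv(WPW))            # (k, m)
    H = np.dot(X, tmp.T)                           # (n, k)
    H = np.maximum(H, 0)                           # rectification
    H = H / (H.std(axis=0, keepdims=True) + 1e-9)  # stand-in for to_unit_variance
    return H, tmp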
 def backward(self, top, propagate_down, bottom):
     with pu.caffe_cuda_context():
         h = caffe.cublas_handle()
         import scikits.cuda.linalg as linalg
         top_diff = top[0].diff_as_pycuda_gpuarray()
         ts = [self.t1_, self.t2_]
         for i in xrange(len(bottom)):
             if not propagate_down[i]:
                 continue
             diff = bottom[i].diff_as_pycuda_gpuarray()
             data = bottom[(i + 1) % 2].data_as_pycuda_gpuarray()
              # The three transpose/swap conditions below are a bit involved
              # and may be hard to follow.
             swap = ts[i] ^ bool(i)
             t1 = ts[i]
             t2 = (not t1) ^ ts[(i + 1) % 2]
             for b in xrange(bottom[0].shape[0]):
                 x = top_diff[b]
                 y = data[b]
                 t1_, t2_ = t1, t2
                 if swap:
                     x, y = y, x
                     t1_, t2_ = t2_, t1_
                 linalg.dot(x, y,
                            transa=blas_trans(t1_), transb=blas_trans(t2_),
                            handle=h, out=diff[b])
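
As a side note (my own sketch, not part of the original layer), the swap/t1_/t2_ bookkeeping above encodes the usual matrix-product gradient identities; for the plain case where both bottoms are untransposed, they reduce to dA = dC.dot(B.T) and dB = (A.T).dot(dC), which a small NumPy check confirms:

import numpy as np

A = np.random.rand(3, 4)
B = np.random.rand(4, 5)
dC = np.random.rand(3, 5)                  # upstream gradient for C = A.dot(B)
dA = np.dot(dC, B.T)                       # what the loop computes for bottom[0]
dB = np.dot(A.T, dC)                       # what the loop computes for bottom[1]

# Finite-difference check of dA[0, 0] for the scalar loss L = sum(A.dot(B) * dC);
# L is linear in A, so the difference quotient is exact up to rounding.
eps = 1e-6
A_pert = A.copy()
A_pert[0, 0] += eps
num = (np.sum(np.dot(A_pert, B) * dC) - np.sum(np.dot(A, B) * dC)) / eps
assert abs(num - dA[0, 0]) < 1e-4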
Example no. 5
def multinomial_log_likelihood(softmax_vals, Y, one_n_trans, one_c):
    # add small amount to protect against log(0)
    small_val = 1e-9
    prod = Y * cumath.log(softmax_vals + small_val)
    prod = linalg.dot(one_n_trans, prod)
    prod = linalg.dot(prod, one_c)
    return prod.get()
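
A minimal CPU reference for what the two dot products compute (my own addition, assuming one_n_trans is a 1 x n row of ones and one_c a c x 1 column of ones, so pre- and post-multiplying simply sums over samples and classes):

import numpy as np

def multinomial_log_likelihood_cpu(softmax_vals, Y, small_val=1e-9):
    # Sum of Y * log(softmax + eps) over all samples and classes.
    return np.sum(Y * np.log(softmax_vals + small_val))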
Example no. 6
    def backprop(self, input_data, df_output, cache=None):
        """ Backpropagate through the hidden layer

        **Parameters:**

        input_data : ``GPUArray``
            Input data to compute activations for.

        df_output : ``GPUArray``
            Gradients with respect to the activations of this layer
            (received from the layer above).

        cache : list of ``GPUArray``
            Cache obtained from forward pass. If the cache is
            provided, then the activations are not recalculated.

        **Returns:**

        gradients : tuple of ``GPUArray``
            Gradients with respect to the weights and biases in the
            form ``(df_weights, df_biases)``.

        df_input : ``GPUArray``
            Gradients with respect to the input.
        """

        # Get cache if it wasn't provided
        if cache is None:
            cache = self.feed_forward(input_data,
                                      prediction=False)

        if len(cache) == 2:
            activations, dropout_mask = cache
        else:
            activations = cache[0]
            dropout_mask = None

        # Multiply the binary mask with the incoming gradients
        if self.dropout and dropout_mask is not None:
            apply_dropout_mask(df_output, dropout_mask)

        # Get gradient wrt activation function
        df_activations = self.df(activations)
        delta = df_activations * df_output

        # Gradient wrt weights
        df_W = linalg.dot(input_data, delta, transa='T')
        # Gradient wrt bias
        df_b = matrix_sum_out_axis(delta, 0)
        # Gradient wrt inputs
        df_input = linalg.dot(delta, self.W, transb='T')

        # L1 weight decay
        if self.l1_penalty_weight:
            df_W -= self.l1_penalty_weight * sign(self.W)

        # L2 weight decay
        if self.l2_penalty_weight:
            df_W -= self.l2_penalty_weight * self.W

        return (df_W, df_b), df_input
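
For orientation, a hedged NumPy sketch of the core gradients above (my own addition; it ignores dropout and the L1/L2 penalty terms and assumes df returns the elementwise derivative of the activation function evaluated at the activations):

import numpy as np

def hidden_backprop_cpu(input_data, df_output, activations, W, df):
    delta = df(activations) * df_output
    df_W = np.dot(input_data.T, delta)     # linalg.dot(input_data, delta, transa='T')
    df_b = delta.sum(axis=0)               # matrix_sum_out_axis(delta, 0)
    df_input = np.dot(delta, W.T)          # linalg.dot(delta, self.W, transb='T')
    return (df_W, df_b), df_input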
Example no. 7
    def updateGradient(self,
                       bp_signal,
                       inputs,
                       print_timing=False,
                       include_prior=True):
        if print_timing:
            print ''
            t0 = t.time()
            t_run = t.time()
        if self.magic_numbers:
            back_prop = bp_signal * 0.6667 / 1.7159 * (1.7159 -
                                                       (self.outputs) *
                                                       (1.7159 + self.outputs))
        else:
            back_prop = bp_signal * (1.0 - (self.outputs * self.outputs))

        if print_timing:
            t_bp = t.time() - t_run
            t_run = t.time()
        self.gW = linalg.dot(inputs, back_prop, transa='T')
        if print_timing:
            t_dot = t.time() - t_run
            t_run = t.time()
        #self.gW = linalg.dot(linalg.transpose(inputs),back_prop)
        ones = gpuarray.to_gpu(np.ones((1, self.N)).astype(self.precision))
        if print_timing:
            t_ones = t.time() - t_run
            t_run = t.time()
        self.gB = linalg.dot(ones, back_prop)
        if print_timing:
            t_biases = t.time() - t_run
            t_run = t.time()
        if include_prior:
            self.prior.updateWeightGradient(self.weights, self.gW)
            if print_timing:
                t_weights = t.time() - t_run
                t_run = t.time()
            self.prior.updateBiasGradient(self.biases, self.gB)
        if print_timing:
            t_prior = t.time() - t_run
            print 'Total time for gradient update in hidden layer ' + str(
                self.ID) + ' ' + str(t.time() - t0)
            print 'Time to calculate backprop in hidden layer ' + str(
                self.ID) + ' ' + str(t_bp)
            print 'Time to calculate gradient for weights in hidden layer ' + str(
                self.ID) + ' ' + str(t_dot)
            print 'Time to allocate ones vector in hidden layer ' + str(
                self.ID) + ' ' + str(t_ones)
            print 'Time to biases gradient in hidden layer ' + str(
                self.ID) + ' ' + str(t_biases)
            print 'Time for prior update in hidden layer ' + str(
                self.ID) + ' ' + str(t_prior)
        if self.ID > 0:
            return linalg.dot(back_prop, self.weights, transb='T')
            #return linalg.dot(back_prop,linalg.transpose(self.weights))
        else:
            return -1
Example no. 8
def eps_r(x, A1, A2, out, handle):
    out.fill(0)
    #tmp = garr.empty((A1[0].shape[0], x.shape[1]), dtype=A1[0].dtype)
    #tmp2 = garr.empty((tmp.shape[0], A2[0].shape[0]), dtype=A1[0].dtype)
    for s in range(len(A1)):
        tmp = cla.dot(A1[s], x, handle=handle)
        tmp2 = cla.dot(tmp, A2[s], transb='C', handle=handle)
        out += tmp2

    return out
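
A CPU sketch of the contraction performed here (my own addition): since transb='C' requests the conjugate transpose, eps_r accumulates the sum over s of A1[s] . x . A2[s]^H.

import numpy as np

def eps_r_cpu(x, A1, A2):
    # out = sum_s A1[s] . x . A2[s]^H   (H = conjugate transpose)
    out = np.zeros((A1[0].shape[0], A2[0].shape[0]), dtype=A1[0].dtype)
    for s in range(len(A1)):
        out += np.dot(np.dot(A1[s], x), A2[s].conj().T)
    return out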
Example no. 9
def eps_r(x, A1, A2, out, handle):
    out.fill(0)    
    #tmp = garr.empty((A1[0].shape[0], x.shape[1]), dtype=A1[0].dtype)
    #tmp2 = garr.empty((tmp.shape[0], A2[0].shape[0]), dtype=A1[0].dtype)
    for s in range(len(A1)):
        tmp = cla.dot(A1[s], x, handle=handle)
        tmp2 = cla.dot(tmp, A2[s], transb='C', handle=handle)
        out += tmp2
        
    return out
Example no. 10
 def forward(self, bottom, top):
     with pu.caffe_cuda_context():
         h = caffe.cublas_handle()
         import scikits.cuda.linalg as linalg
         mat1 = bottom[0].data_as_pycuda_gpuarray()
         mat2 = bottom[1].data_as_pycuda_gpuarray()
         mato = top[0].data_as_pycuda_gpuarray()
         for b in xrange(bottom[0].shape[0]):
             linalg.dot(mat1[b], mat2[b],
                        transa=blas_trans(self.t1_),
                        transb=blas_trans(self.t2_),
                        handle=h, out=mato[b])
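
The loop above is a batched matrix product; with both transpose flags off it matches a plain einsum, as in this small NumPy check (my own sketch, not from the original code):

import numpy as np

mat1 = np.random.rand(8, 3, 4).astype(np.float32)
mat2 = np.random.rand(8, 4, 5).astype(np.float32)
out = np.einsum('bij,bjk->bik', mat1, mat2)   # per-sample mat1[b].dot(mat2[b])
assert out.shape == (8, 3, 5)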
Example no. 11
 def test_dot_vector_complex128(self):
     a = np.asarray(np.random.rand(5), np.complex128)
     b = np.asarray(np.random.rand(5), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
     a = a.astype(np.complex128, order="F", copy=True)
     b = b.astype(np.complex128, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
Example no. 12
    def decompose(self):
        gcov = cla.dot(self._Y_gpu, self._Y_gpu, transa='C')
        ge_g, gh_g = np.linalg.eigh(gcov.get())
        I = np.argsort(ge_g)[::-1]
        ge_g, gh_g = np.sqrt(ge_g[I]), gh_g[:,I]
        # push the matrix back out
        gpueigs = gpuarray.to_gpu(gh_g)
        W_g = cla.dot(self._Y_gpu, gpueigs)
        # Unitize W_g - could be done on gpu to allow async returning
        W_g = W_g.get()
        W_g = W_g / np.sqrt(np.sum(W_g**2, axis=0))[np.newaxis, :]

        return W_g, ge_g, gh_g.T # Not sure whether the last one should be transposed
Example no. 13
 def test_dot_matrix_h_complex128(self):
     a = np.asarray(np.random.rand(2, 4) + 1j * np.random.rand(2, 4), np.complex128)
     b = np.asarray(np.random.rand(2, 2) + 1j * np.random.rand(2, 2), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, "c")
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
     a = a.astype(np.complex128, order="F", copy=True)
     b = b.astype(np.complex128, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, "c")
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
Example no. 14
 def test_dot_vector_complex128(self):
     a = np.asarray(np.random.rand(5), np.complex128)
     b = np.asarray(np.random.rand(5), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
     a = a.astype(np.complex128, order="F", copy=True)
     b = b.astype(np.complex128, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
Example no. 15
    def backprop(self, input_data, targets,
                 cache=None):
        """ Backpropagate through the logistic layer.

        **Parameters:**

        input_data : ``GPUArray``
            Input data to compute activations for.

        targets : ``GPUArray``
            The target values of the units.

        cache : list of ``GPUArray``
            Cache obtained from forward pass. If the cache is
            provided, then the activations are not recalculated.

        **Returns:**

        gradients : tuple of ``GPUArray``
            Gradients with respect to the weights and biases in the
            form ``(df_weights, df_biases)``.

        df_input : ``GPUArray``
            Gradients with respect to the input.
        """

        if cache is not None:
            activations = cache
        else:
            activations = self.feed_forward(input_data, prediction=False)

        delta = activations - targets
        nan_to_zeros(delta, delta)

        # Gradient wrt weights
        df_W = linalg.dot(input_data, delta, transa='T')
        # Gradient wrt bias
        df_b = matrix_sum_out_axis(delta, 0)

        # Gradient wrt input
        df_input = linalg.dot(delta, self.W, transb='T')

        # L1 penalty
        if self.l1_penalty_weight:
            df_W -= self.l1_penalty_weight * sign(self.W)

        # L2 penalty
        if self.l2_penalty_weight:
            df_W -= self.l2_penalty_weight * self.W

        return (df_W, df_b), df_input
 def forward(self, bottom, top):
     with pu.caffe_cuda_context():
         h = caffe.cublas_handle()
         import scikits.cuda.linalg as linalg
         mat1 = bottom[0].data_as_pycuda_gpuarray()
         mat2 = bottom[1].data_as_pycuda_gpuarray()
         mato = top[0].data_as_pycuda_gpuarray()
         for b in xrange(bottom[0].shape[0]):
             linalg.dot(mat1[b],
                        mat2[b],
                        transa=blas_trans(self.t1_),
                        transb=blas_trans(self.t2_),
                        handle=h,
                        out=mato[b])
Example no. 17
    def backprop(self, input_data, targets, cache=None):
        """ Backpropagate through the logistic layer.

        **Parameters:**

        input_data : ``GPUArray``
            Input data to compute activations for.

        targets : ``GPUArray``
            The target values of the units.

        cache : list of ``GPUArray``
            Cache obtained from forward pass. If the cache is
            provided, then the activations are not recalculated.

        **Returns:**

        gradients : tuple of ``GPUArray``
            Gradients with respect to the weights and biases in the
            form ``(df_weights, df_biases)``.

        df_input : ``GPUArray``
            Gradients with respect to the input.
        """

        if cache is not None:
            activations = cache
        else:
            activations = self.feed_forward(input_data, prediction=False)

        delta = activations - targets
        nan_to_zeros(delta, delta)

        # Gradient wrt weights
        df_W = linalg.dot(input_data, delta, transa='T')
        # Gradient wrt bias
        df_b = matrix_sum_out_axis(delta, 0)

        # Gradient wrt input
        df_input = linalg.dot(delta, self.W, transb='T')

        # L1 penalty
        if self.l1_penalty_weight:
            df_W -= self.l1_penalty_weight * sign(self.W)

        # L2 penalty
        if self.l2_penalty_weight:
            df_W -= self.l2_penalty_weight * self.W

        return (df_W, df_b), df_input
Example no. 18
    def backprop(self, input_data, df_output, cache=None):
        """ Backpropagate through the hidden layer

        Inputs:
        input_data
        df_output: the gradient wrt the output units
        cache (optional): cache object from the forward pass

        Output:
        df_W: gradient wrt the weights
        df_b: gradient wrt the bias
        df_input: gradient wrt the input

        """

        # Get cache if it wasn't provided
        if cache is None:
            cache = self.feed_forward(input_data,
                                      prediction=False)

        if len(cache) == 2:
            activations, dropout_mask = cache
        else:
            activations = cache[0]
            dropout_mask = None

        # Multiply the binary mask with the incoming gradients
        if self.dropout and dropout_mask is not None:
            apply_dropout_mask(df_output, dropout_mask)

        # Get gradient wrt activation function
        df_activations = self.df(activations)
        delta = df_activations * df_output

        # Gradient wrt weights
        df_W = linalg.dot(input_data, delta, transa='T')
        # Gradient wrt bias
        df_b = matrix_sum_out_axis(delta, 0)
        # Gradient wrt inputs
        df_input = linalg.dot(delta, self.W, transb='T')

        # L1 weight decay
        if self.l1_penalty_weight:
            df_W -= self.l1_penalty_weight * sign(self.W)

        # L2 weight decay
        if self.l2_penalty_weight:
            df_W -= self.l2_penalty_weight * self.W

        return (df_W, df_b), df_input
Example no. 19
 def test_dot_matrix_h_complex128(self):
     a = np.asarray(
         np.random.rand(2, 4) + 1j * np.random.rand(2, 4), np.complex128)
     b = np.asarray(
         np.random.rand(2, 2) + 1j * np.random.rand(2, 2), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
     a = a.astype(np.complex128, order="F", copy=True)
     b = b.astype(np.complex128, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
Example no. 20
 def test_dot_vector_float64(self):
     a = np.asarray(np.random.rand(5), np.float64)
     b = np.asarray(np.random.rand(5), np.float64)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
Example no. 21
 def test_dot_vector_complex128(self):
     a = np.asarray(np.random.rand(5), np.complex128)
     b = np.asarray(np.random.rand(5), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
Example no. 22
 def test_dot_matrix_float32(self):
     a = np.asarray(np.random.rand(4, 2), np.float32)
     b = np.asarray(np.random.rand(2, 2), np.float32)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c_gpu.get())
Example no. 23
        def thunk():
            x = inputs[0]
            y = inputs[1]

            # chop off the real/imag dimension
            input_shape_x = x[0].shape # (a, b, 2)
            input_shape_y = y[0].shape # (b, c, 2)

            output_shape = (input_shape_x[0], input_shape_y[1], 2) # (a, c, 2)

            input_x_pycuda = to_complex_gpuarray(x[0])
            input_y_pycuda = to_complex_gpuarray(y[0])

            # multistream experiment
            # print "DEBUG: Setting stream to %d" % current_stream[0]

            # prev_stream_obj = stream_pool[(current_stream[0] - 1) % num_streams]
            # print "PREV STREAM IS DONE?"
            # print prev_stream_obj.is_done()
            # print

            stream_obj = stream_pool[current_stream[0]]
            cublas.cublasSetStream(handle[0], stream_obj.handle)
            current_stream[0] += 1
            current_stream[0] %= num_streams
            # print "DEBUG: set next stream id to %d" % current_stream[0]

            output_pycuda = linalg.dot(input_x_pycuda, input_y_pycuda, handle=handle[0])

            outputs[0][0] = to_complex_cudandarray(output_pycuda)
Example no. 24
 def test_dot_matrix_t_complex128(self):
     a = np.asarray(np.random.rand(2, 4), np.complex128)
     b = np.asarray(np.random.rand(2, 2), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 't')
     assert np.allclose(np.dot(a.T, b), c_gpu.get())
Example no. 25
 def test_dot_vector_float64(self):
     a = np.asarray(np.random.rand(5), np.float64)
     b = np.asarray(np.random.rand(5), np.float64)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
Example no. 26
 def test_dot_vector_complex128(self):
     a = np.asarray(np.random.rand(5), np.complex128)
     b = np.asarray(np.random.rand(5), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
Example no. 27
 def test_dot_matrix_float32(self):
     a = np.asarray(np.random.rand(4, 2), np.float32)
     b = np.asarray(np.random.rand(2, 2), np.float32)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c_gpu.get())
Example no. 28
        def thunk():
            x = inputs[0]
            y = inputs[1]

            # chop off the real/imag dimension
            input_shape_x = x[0].shape  # (a, b, 2)
            input_shape_y = y[0].shape  # (b, c, 2)

            output_shape = (input_shape_x[0], input_shape_y[1], 2)  # (a, c, 2)

            input_x_pycuda = to_complex_gpuarray(x[0])
            input_y_pycuda = to_complex_gpuarray(y[0])

            # multistream experiment
            # print "DEBUG: Setting stream to %d" % current_stream[0]

            # prev_stream_obj = stream_pool[(current_stream[0] - 1) % num_streams]
            # print "PREV STREAM IS DONE?"
            # print prev_stream_obj.is_done()
            # print

            stream_obj = stream_pool[current_stream[0]]
            cublas.cublasSetStream(handle[0], stream_obj.handle)
            current_stream[0] += 1
            current_stream[0] %= num_streams
            # print "DEBUG: set next stream id to %d" % current_stream[0]

            output_pycuda = linalg.dot(input_x_pycuda,
                                       input_y_pycuda,
                                       handle=handle[0])

            outputs[0][0] = to_complex_cudandarray(output_pycuda)
Example no. 29
 def test_dot_matrix_h_complex128(self):
     a = np.asarray(np.random.rand(2, 4)+1j*np.random.rand(2, 4), np.complex128)
     b = np.asarray(np.random.rand(2, 2)+1j*np.random.rand(2, 2), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
Example no. 30
 def test_dot_matrix_t_complex64(self):
     a = np.asarray(np.random.rand(2, 4), np.complex64)
     b = np.asarray(np.random.rand(2, 2), np.complex64)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 't')
     assert np.allclose(np.dot(a.T, b), c_gpu.get())
Example no. 31
    def feed_forward(self, input_data, prediction=False):
        """ Propagate forward through the hidden layer.
        Inputs:
        input_data -- input from the previous layer
        prediction -- (bool) whether predicting or training

        Outputs:
        lin_activations
        activations

        If self.dropout = True and prediction=False:
        Output:
        lin_activations
        activations
        dropout_mask: binary mask of dropped units

        """

        activations = linalg.dot(input_data, self.W)
        activations = add_vec_to_mat(activations, self.b, inplace=True)

        self.f(activations)

        if self.dropout and prediction:
            activations *= .5

        if self.dropout and not prediction:
            dropout_mask = sample_dropout_mask(activations)
            return activations, dropout_mask

        return (activations,)
Example no. 32
    def feed_forward(self, input_data, prediction=False):
        """Propagate forward through the layer

        **Parameters:**

        input_data : ``GPUArray``
            Input data to compute activations for.

        prediction : bool, optional
            Whether to use prediction model. Only relevant when using
            dropout. If true, then weights are halved if the layer
            uses dropout.

        **Returns:**
        
        activations : ``GPUArray``
            The activations of the hidden units.
        """

        activations = linalg.dot(input_data, self.W)
        activations = add_vec_to_mat(activations, self.b, inplace=True)

        self.f(activations)

        if self.dropout and prediction:
            activations *= .5

        if self.dropout and not prediction:
            dropout_mask = sample_dropout_mask(activations)
            return activations, dropout_mask

        return (activations,)
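
A NumPy sketch of the forward pass above (my own addition; sample_dropout_mask is assumed to draw a Bernoulli(0.5) mask and zero out the corresponding activations, which is consistent with the 0.5 rescaling at prediction time):

import numpy as np

def feed_forward_cpu(input_data, W, b, f, dropout=False, prediction=False):
    activations = f(np.dot(input_data, W) + b)
    if dropout and prediction:
        activations *= 0.5                  # rescale at test time
    if dropout and not prediction:
        mask = np.random.rand(*activations.shape) > 0.5
        activations = activations * mask    # drop roughly half the units
        return activations, mask
    return (activations,)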
Example no. 33
 def updateGradient(self,
                    Y,
                    inputs,
                    print_timing=False,
                    include_prior=True):
     if print_timing:
         t0 = t.time()
         t_run = t.time()
     diff = Y - self.outputs
     if print_timing:
         t_diff = t.time() - t_run
         t_run = t.time()
     #self.gW = linalg.dot(linalg.transpose(inputs),diff)
     self.gW = linalg.dot(inputs, diff, transa='T')
     if print_timing:
         t_dot = t.time() - t_run
         t_run = t.time()
     ones = gpuarray.to_gpu(np.ones((1, self.N)).astype(self.precision))
     if print_timing:
         t_ones = t.time() - t_run
         t_run = t.time()
     bias_diff = Y - self.outputs
     self.gB = linalg.dot(ones, bias_diff)
     if print_timing:
         t_sum_bias = t.time() - t_run
         t_run = t.time()
     if print_timing:
         t1 = t.time()
         t0_prior = t.time()
     if include_prior:
         self.prior.updateWeightGradient(self.weights, self.gW)
         if print_timing:
             t_weights = t.time() - t_run
             t_run = t.time()
         self.prior.updateBiasGradient(self.biases, self.gB)
     if print_timing:
         t1_prior = t.time()
         print 'Total time for gradient update in softmax layer ' + str(t1 -
                                                                        t0)
         print 'Time for prior update in softmax layer ' + str(t1_prior -
                                                               t0_prior)
         print 'Time for Y-outputs ' + str(t_diff)
         print 'Time for inputs-diff dot-prod ' + str(t_dot)
         print 'Time to create ones vector ' + str(t_ones)
         print 'Time for diff-ones dot-prod ' + str(t_sum_bias)
     return linalg.dot(diff, self.weights, transb='T')
Example no. 34
File: ffnn.py Project: elnaaz/alex
    def cudasolve(self, A, b, tol=1e-4):
        """ Conjugate gradient solver for dense system of linear equations.

            Ax = b

            Returns: x = A^(-1)b
        """

        N = len(b)
        b = b.reshape((N,1))
        x = b.copy()
        # print 'A', A.shape
        # print 'b', b.shape
        # print 'x', x.shape
        r = b - culinalg.dot(A,x)
        # print 'r', r.shape
        p = r.copy()
        rsold = culinalg.dot(r,r, transa='T')[0][0].get()
        # print 'rsold', rsold
        for i in range(N):
            Ap = culinalg.dot(A,p)
            # print 'A', A.shape
            # print 'p', p.shape
            # print 'Ap', Ap.shape

            pAp = culinalg.dot(p, Ap, transa='T')[0][0].get()
            # print 'p^(T)Ap', pAp
            alpha = rsold / pAp
            # print 'alpha', alpha

            x += alpha*p
            # print 'x', x.shape
            r -= alpha*Ap
            rsnew = culinalg.dot(r,r, transa='T')[0][0].get()
            # print 'rsnew', math.sqrt(rsnew)

            if math.sqrt(rsnew) < tol:
                break
            else:
                p = r + (rsnew/rsold)*p
                rsold = rsnew

        print 'cudasolve> Iterations required on GPU:', i

        return x.reshape(N)
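
A hedged usage sketch (my own addition; solver stands for a hypothetical instance of the class that owns cudasolve, and A must be symmetric positive definite for conjugate gradients to converge):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import scikits.cuda.linalg as culinalg

culinalg.init()
M = np.random.rand(64, 64)
A_host = np.dot(M, M.T) + 64 * np.eye(64)        # SPD system matrix
b_host = np.random.rand(64)
A_gpu = gpuarray.to_gpu(A_host)
b_gpu = gpuarray.to_gpu(b_host)
x_gpu = solver.cudasolve(A_gpu, b_gpu, tol=1e-6)  # solver: hypothetical owning instance
print(np.allclose(np.dot(A_host, x_gpu.get()), b_host, atol=1e-4))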
Example no. 35
    def cudasolve(self, A, b, tol=1e-4):
        """ Conjugate gradient solver for dense system of linear equations.

            Ax = b

            Returns: x = A^(-1)b
        """

        N = len(b)
        b = b.reshape((N, 1))
        x = b.copy()
        # print 'A', A.shape
        # print 'b', b.shape
        # print 'x', x.shape
        r = b - culinalg.dot(A, x)
        # print 'r', r.shape
        p = r.copy()
        rsold = culinalg.dot(r, r, transa='T')[0][0].get()
        # print 'rsold', rsold
        for i in range(N):
            Ap = culinalg.dot(A, p)
            # print 'A', A.shape
            # print 'p', p.shape
            # print 'Ap', Ap.shape

            pAp = culinalg.dot(p, Ap, transa='T')[0][0].get()
            # print 'p^(T)Ap', pAp
            alpha = rsold / pAp
            # print 'alpha', alpha

            x += alpha * p
            # print 'x', x.shape
            r -= alpha * Ap
            rsnew = culinalg.dot(r, r, transa='T')[0][0].get()
            # print 'rsnew', math.sqrt(rsnew)

            if math.sqrt(rsnew) < tol:
                break
            else:
                p = r + (rsnew / rsold) * p
                rsold = rsnew

        print 'cudasolve> Iterations required on GPU:', i

        return x.reshape(N)
Example no. 36
 def updateGradient(self,bp_signal,inputs,print_timing=False,include_prior=True):
     if print_timing:
         print '' 
         t0 =  t.time()
         t_run = t.time()
     if self.magic_numbers:
         back_prop = bp_signal* 0.6667/1.7159 * (1.7159 - (self.outputs)*(1.7159 + self.outputs))
     else:
         back_prop = bp_signal*(1.0-(self.outputs*self.outputs))
     
     if print_timing:
         t_bp = t.time() - t_run
         t_run = t.time()
     self.gW = linalg.dot(inputs,back_prop,transa='T')
     if print_timing:
         t_dot = t.time() - t_run
         t_run = t.time()
     #self.gW = linalg.dot(linalg.transpose(inputs),back_prop)
     ones = gpuarray.to_gpu(np.ones((1,self.N)).astype(self.precision))
     if print_timing:
         t_ones = t.time() - t_run
         t_run = t.time()
     self.gB = linalg.dot(ones,back_prop)
     if print_timing:
         t_biases = t.time() - t_run
         t_run = t.time()
     if include_prior:
         self.prior.updateWeightGradient(self.weights,self.gW)
         if print_timing:
             t_weights = t.time() - t_run
             t_run = t.time()
         self.prior.updateBiasGradient(self.biases,self.gB)
     if print_timing:
         t_prior = t.time() - t_run
         print 'Total time for gradient update in hidden layer ' + str(self.ID) + ' '  + str(t.time()-t0)
         print 'Time to calculate backprop in hidden layer ' + str(self.ID) + ' '  + str(t_bp)
         print 'Time to calculate gradient for weights in hidden layer ' + str(self.ID) + ' '  + str(t_dot)
         print 'Time to allocate ones vector in hidden layer ' + str(self.ID) + ' '  + str(t_ones)
         print 'Time to biases gradient in hidden layer ' + str(self.ID) + ' '  + str(t_biases)
         print 'Time for prior update in hidden layer ' + str(self.ID) + ' '  + str(t_prior)
     if self.ID > 0:
         return linalg.dot(back_prop,self.weights,transb='T')
         #return linalg.dot(back_prop,linalg.transpose(self.weights))
     else:
         return -1            
Example no. 37
def calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh, handle=None):
    D = A[0].shape[1]
    Dm1 = A[0].shape[0]
    q = len(A)
    
    x = garr.zeros((Dm1, q * D - Dm1), dtype=A[0].dtype)
    x_part = garr.empty_like(x)
    x_subpart = garr.empty_like(A[0])
    
    if not (C is None and Kp1 is None):
        assert (not C is None) and (not Kp1 is None)
        x_part.fill(0)
        for s in range(q):
            x_subpart = eps_r(rp1, C[s], Ap1, x_subpart, handle) #~1st line
            
            x_subpart += cla.dot(A[s], Kp1, handle=handle) #~3rd line
    
            x_part += cla.dot(cla.dot(x_subpart, r_si, handle=handle), Vsh[s], handle=handle)

        x += cla.dot(lm1_s, x_part, handle=handle)

    if not lm2 is None:
        x_part.fill(0)
        for s in range(q):     #~2nd line
            x_subpart = eps_l(lm2, Am1, Cm1[s], x_subpart, handle)
            x_part += cla.dot(x_subpart, cla.dot(r_s, Vsh[s], handle=handle), handle=handle)
        x += cla.dot(lm1_si, x_part, handle=handle)
        
    return x
Example no. 38
 def test_dot_matrix_h_complex64(self):
     a = np.asarray(
         np.random.rand(2, 4) + 1j * np.random.rand(2, 4), np.complex64)
     b = np.asarray(
         np.random.rand(2, 2) + 1j * np.random.rand(2, 2), np.complex64)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
Example no. 39
 def _dot_matrix_tests(self, dtype, transa, transb):
     a = np.asarray(np.random.rand(4, 2), dtype)
     if transa == 'n':
         b = np.asarray(np.random.rand(2, 2), dtype)
     else:
         b = np.asarray(np.random.rand(4, 4), dtype)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
     aa = a if transa == 'n' else a.T
     bb = b if transb == 'n' else b.T
     assert np.allclose(np.dot(aa, bb), c_gpu.get())
     a = a.astype(dtype, order="F", copy=True)
     b = b.astype(dtype, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
     assert np.allclose(np.dot(aa, bb), c_gpu.get())
Example no. 40
 def _dot_matrix_tests(self, dtype, transa, transb):
     a = np.asarray(np.random.rand(4, 2), dtype)
     if transa == "n":
         b = np.asarray(np.random.rand(2, 2), dtype)
     else:
         b = np.asarray(np.random.rand(4, 4), dtype)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
     aa = a if transa == "n" else a.T
     bb = b if transb == "n" else b.T
     assert np.allclose(np.dot(aa, bb), c_gpu.get())
     a = a.astype(dtype, order="F", copy=True)
     b = b.astype(dtype, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
     assert np.allclose(np.dot(aa, bb), c_gpu.get())
Example no. 41
File: tps.py Project: zzz622848/lfd
def balance_matrix3_gpu(prob_nm,
                        max_iter,
                        row_priors,
                        col_priors,
                        outlierfrac,
                        r_N=None):
    if not lfd.registration._has_cuda:
        raise NotImplementedError("CUDA not installed")
    n, m = prob_nm.shape
    prob_NM = np.empty((n + 1, m + 1), 'f4')
    prob_NM[:n, :m] = prob_nm
    prob_NM[:n, m] = row_priors
    prob_NM[n, :m] = col_priors
    prob_NM[n, m] = np.sqrt(
        np.sum(row_priors) * np.sum(col_priors)
    )  # this can be weighted; a bigger weight means fewer outliers
    a_N = np.ones((n + 1), 'f4')
    a_N[n] = m * outlierfrac
    b_M = np.ones((m + 1), 'f4')
    b_M[m] = n * outlierfrac

    if r_N is None: r_N = np.ones((n + 1, 1), 'f4')

    prob_NM_gpu = gpuarray.empty((n + 1, m + 1), dtype=np.float32)
    prob_MN_gpu = gpuarray.empty((m + 1, n + 1), dtype=np.float32)
    r_N_gpu = gpuarray.empty((n + 1, 1), dtype=np.float32)
    c_M_gpu = gpuarray.empty((m + 1, 1), dtype=np.float32)
    prob_NM_gpu.set_async(prob_NM)
    prob_MN_gpu.set_async(prob_NM.T.copy())
    r_N_gpu.set_async(r_N)

    for _ in xrange(max_iter):
        culinalg.dot(prob_NM_gpu, r_N_gpu, transa='T', out=c_M_gpu)
        c_M_gpu.set_async(b_M[:, None] / c_M_gpu.get())
        culinalg.dot(prob_MN_gpu, c_M_gpu, transa='T', out=r_N_gpu)
        r_N_gpu.set_async(a_N[:, None] / r_N_gpu.get())

    r_N = r_N_gpu.get()
    c_M = c_M_gpu.get()
    prob_NM *= r_N
    prob_NM *= c_M.T

    return prob_NM[:n, :m].astype(np.float64), r_N, c_M
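
A CPU sketch of the balancing loop (my own addition): it is Sinkhorn-style alternating row/column rescaling of the augmented matrix, driving the row sums toward a_N and the column sums toward b_M.

import numpy as np

def balance_matrix_cpu(prob_NM, a_N, b_M, max_iter):
    # a_N, b_M: target row and column sums (1-D arrays)
    r_N = np.ones_like(a_N)
    for _ in range(max_iter):
        c_M = b_M / np.dot(prob_NM.T, r_N)
        r_N = a_N / np.dot(prob_NM, c_M)
    return prob_NM * r_N[:, None] * c_M[None, :], r_N, c_M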
Example no. 42
    def solve(self, wt_n, y_nd, bend_coef, f_res):
        if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
            raise RuntimeError(
                "The dimensions of y_nd doesn't match the dimensions of x_nd")
        if not y_nd.flags.c_contiguous:
            raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
        self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:, None] * self.QN)
        geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
        gemm(self.sqrtWQN_gpu,
             self.sqrtWQN_gpu,
             self.lhs_gpu,
             transa='T',
             alpha=1,
             beta=1)

        drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata,
                              self.rhs_gpu.nbytes)
        self.y_dnW_gpu.set_async(
            y_nd.T * wt_n)  # use transpose so that it is f_contiguous
        gemm(self.QN_gpu,
             self.y_dnW_gpu,
             self.rhs_gpu,
             transa='T',
             transb='T',
             alpha=1,
             beta=1)

        if lfd.registration._has_cula:
            culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
            z = self.rhs_gpu.get()
            culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
            theta = self.theta_gpu.get()
        else:  # if cula is not installed, perform the last two computations on the CPU
            z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
            theta = self.N.dot(z)
        f_res.update(self.x_nd,
                     y_nd,
                     bend_coef,
                     self.rot_coef,
                     wt_n,
                     theta,
                     N=self.N,
                     z=z)
Example no. 43
File: solver.py Project: amoliu/lfd
 def get_solver_mats(self, x_nd, rot_coef):
     n,d = x_nd.shape
     K_nn = tps.tps_kernel_matrix(x_nd)
     A = np.r_[np.zeros((d+1,d+1)), np.c_[np.ones((n,1)), x_nd]].T
     
     n_cnts = A.shape[0]    
     _u,_s,_vh = np.linalg.svd(A.T)
     N = _u[:,n_cnts:].copy()
     NR = (N[1:1+d,:].T * rot_coef).copy() # so that it is c-contiguous
     
     N_gpu = gpuarray.to_gpu(N[1+d:,:])
     K_gpu = gpuarray.to_gpu(K_nn)
     KN_gpu = culinalg.dot(K_gpu, N_gpu)
     QN = np.c_[np.ones((n, 1)), x_nd].dot(N[:1+d,:]) + KN_gpu.get()
     
     NKN_gpu = culinalg.dot(N_gpu, KN_gpu, transa='T')
     NKN = NKN_gpu.get()
     
     NRN = NR.dot(N[1:1+d,:])
     return N, QN, NKN, NRN, NR, K_nn
Example no. 44
    def get_solver_mats(self, x_nd, rot_coef):
        n, d = x_nd.shape
        K_nn = tps.tps_kernel_matrix(x_nd)
        A = np.r_[np.zeros((d + 1, d + 1)), np.c_[np.ones((n, 1)), x_nd]].T

        n_cnts = A.shape[0]
        _u, _s, _vh = np.linalg.svd(A.T)
        N = _u[:, n_cnts:].copy()
        NR = (N[1:1 + d, :].T * rot_coef).copy()  # so that it is c-contiguous

        N_gpu = gpuarray.to_gpu(N[1 + d:, :])
        K_gpu = gpuarray.to_gpu(K_nn)
        KN_gpu = culinalg.dot(K_gpu, N_gpu)
        QN = np.c_[np.ones((n, 1)), x_nd].dot(N[:1 + d, :]) + KN_gpu.get()

        NKN_gpu = culinalg.dot(N_gpu, KN_gpu, transa='T')
        NKN = NKN_gpu.get()

        NRN = NR.dot(N[1:1 + d, :])
        return N, QN, NKN, NRN, NR, K_nn
    def x_dot_YT(self, x, Y):
        x_size = x.shape[0]
        byte = np.float32(0).nbytes
        x.strides = (x_size * byte, byte)
        x.shape = (1, x_size)

        result = culinalg.dot(x, Y, transa = 'N', transb = 'T')
        
        x.strides = (byte,)
        x.shape = (x_size,)
        return result
Example no. 46
def dot(d_a, d_b, transa='N', transb='N', out=None):
    if out is None:
        if transa == 'T':
            out_x = d_a.shape[1]
        else:
            out_x = d_a.shape[0]
        if transb == 'T':
            out_y = d_b.shape[0]
        else:
            out_y = d_b.shape[1]
        out = gpuarray.empty((out_x, out_y), numpy.float32)
    return linalg.dot(d_a, d_b, transa=transa, transb=transb, handle=handle, out=out)
Example no. 47
    def solve(self, wt_n, y_nd, bend_coef, f_res):
        if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
            raise RuntimeError("The dimensions of y_nd doesn't match the dimensions of x_nd")
        if not y_nd.flags.c_contiguous:
            raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
        self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:,None] * self.QN)
        geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
        gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu, transa='T', alpha=1, beta=1)

        drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata, self.rhs_gpu.nbytes)
        self.y_dnW_gpu.set_async(y_nd.T * wt_n) # use transpose so that it is f_contiguous
        gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu, transa='T', transb='T', alpha=1, beta=1)
        
        if lfd.registration._has_cula:
            culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
            culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
            theta = self.theta_gpu.get()
        else: # if cula is not installed, perform the last two computations on the CPU
            z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
            theta = self.N.dot(z)
        f_res.set_ThinPlateSpline(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n, theta=theta)
Example no. 48
    def forward(self, bottom, top):
        """

        """
        with pu.caffe_cuda_context():
            h = caffe.cublas_handle()
            batch_size = bottom[0].shape[0]
            dim = bottom[0].count / bottom[0].shape[0]
            pred = bottom[0].data_as_pycuda_gpuarray()
            label = bottom[1].data_as_pycuda_gpuarray()
            mask = bottom[2].data_as_pycuda_gpuarray()
            # Use bottom[0,1].diff as temporary buffer
            diff = bottom[0].diff_as_pycuda_gpuarray()
            diff2 = bottom[1].diff_as_pycuda_gpuarray()
            # Compute diff
            self.k_masked_diff_(diff, pred, label, mask)
            self.k_squared_(diff, diff2)
            import scikits.cuda.linalg as linalg
            # This needs scikits.cuda 0.5.0a3 or later
            # (sudo) pip install scikits.cuda>=0.5.0a3
            linalg.dot(diff.reshape(batch_size, dim), self.multipier_sum_,
                       handle=h, out=self.diff_sum_)
            linalg.dot(diff2.reshape(batch_size, dim), self.multipier_sum_,
                       handle=h, out=self.diff2_sum_)
            linalg.dot(mask.reshape(batch_size, dim), self.multipier_sum_,
                       handle=h, out=self.mask_sum_)
            self.k_ensure_mask_sum_(self.mask_sum_)
            term1 = self.k_div_sum_(self.diff2_sum_, self.mask_sum_)
            term2 = self.k_div_squared_sum_(self.diff_sum_, self.mask_sum_)
            top[0].data[...] = (term1.get() - self.lambda_ * term2.get()) \
                / batch_size
Example no. 49
 def updateGradient(self,Y,inputs,print_timing=False,include_prior=True):
     if print_timing:
         t0 =  t.time()
         t_run = t.time()
     diff = Y-self.outputs
     if print_timing:
         t_diff = t.time() - t_run
         t_run = t.time()
     #self.gW = linalg.dot(linalg.transpose(inputs),diff)
     self.gW = linalg.dot(inputs,diff,transa='T')
     if print_timing:
         t_dot = t.time() - t_run
         t_run = t.time()
     ones = gpuarray.to_gpu(np.ones((1,self.N)).astype(self.precision))
     if print_timing:
         t_ones = t.time() - t_run
         t_run = t.time()
     bias_diff = Y - self.outputs
     self.gB = linalg.dot(ones,bias_diff)
     if print_timing:
         t_sum_bias = t.time() - t_run
         t_run = t.time()
     if print_timing:
         t1 = t.time()
         t0_prior = t.time()
     if include_prior:
         self.prior.updateWeightGradient(self.weights,self.gW)
         if print_timing:
             t_weights = t.time() - t_run
             t_run = t.time()
         self.prior.updateBiasGradient(self.biases,self.gB)
     if print_timing:
         t1_prior = t.time()
         print 'Total time for gradient update in softmax layer ' + str(t1-t0)
         print 'Time for prior update in softmax layer ' + str(t1_prior - t0_prior)
         print 'Time for Y-outputs ' + str(t_diff)
         print 'Time for inputs-diff dot-prod ' + str(t_dot)
         print 'Time to create ones vector ' + str(t_ones)
         print 'Time for diff-ones dot-prod ' + str(t_sum_bias)
     return linalg.dot(diff,self.weights,transb='T')
Example no. 50
    def backprop(self, input_data, targets,
                 cache=None):
        """ Backpropagate through the logistic layer

        Inputs:
        input_data
        targets
        get_df_input: (bool) whether to compute and return the
            gradient wrt the inputs
        return_cache: (bool) whether to return the cache
        cache: cache object from forward pass

        """

        if cache is not None:
            activations = cache
        else:
            activations = self.feed_forward(input_data, prediction=False)

        delta = activations - targets
        nan_to_zeros(delta, delta)

        # Gradient wrt weights
        df_W = linalg.dot(input_data, delta, transa='T')
        # Gradient wrt bias
        df_b = matrix_sum_out_axis(delta, 0)

        # Gradient wrt input
        df_input = linalg.dot(delta, self.W, transb='T')

        # L1 penalty
        if self.l1_penalty_weight:
            df_W -= self.l1_penalty_weight * sign(self.W)

        # L2 penalty
        if self.l2_penalty_weight:
            df_W -= self.l2_penalty_weight * self.W

        return (df_W, df_b), df_input
Example no. 51
def left_dot_col_major_gpu(m1, m2):
    out_gpu = gpuarray.GPUArray((m1.shape[0], m2.shape[1]), np.float64, allocator=cuda.mem_alloc, order='F')
    gpu_mem_avail = GPU_MAX_MEM - out_gpu.nbytes - m1.nbytes
    print("GPU MEM: " + str(GPU_MAX_MEM))
    print("GPU MEM AVAIL: " + str(gpu_mem_avail))
    print("M1 SIZE: " + str(m1.nbytes))
    print("OUT SIZE: " + str(out_gpu.nbytes))
    frags = 1
    while ((m2.nbytes / frags) > gpu_mem_avail):
        frags += 1
    m1_gpu = gpuarray.to_gpu(m1)
    subm2_gpu = None
    shift = 0
    for subm2 in np.array_split(m2, frags, axis=1):
        if subm2_gpu is not None:
            del subm2_gpu
        subm2_gpu = gpuarray.to_gpu(subm2)
        linalg.dot(m1_gpu, subm2_gpu, out=out_gpu[:,shift:shift+subm2.shape[1]])
        shift += subm2.shape[1]
    out = out_gpu.get()
    del m1_gpu
    del out_gpu
    return out
Example no. 52
    def f(mat, axis=0, cache_one_vector=True):
        assert mat.flags.c_contiguous
        N, M = mat.shape

        if axis == 0:
            vec_shape = (N, )
            try:
                ones = one_vector_cache[vec_shape]
            except KeyError:
                ones = gpuarray.empty(vec_shape, dtype=mat.dtype).fill(1.)
                if cache_one_vector: one_vector_cache[vec_shape] = ones
            target = linalg.dot(ones, mat).ravel()
        elif axis == 1:
            vec_shape = (M, 1)
            try:
                ones = one_vector_cache[vec_shape]
            except KeyError:
                ones = gpuarray.empty((M, 1), dtype=mat.dtype).fill(1.)
                if cache_one_vector: one_vector_cache[vec_shape] = ones
            target = linalg.dot(mat, ones).ravel()
        else:
            raise ValueError('axis must be 0 or 1')

        return target
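
In effect the function reduces a matrix along one axis by multiplying with a cached ones-vector; a quick NumPy check of the two branches (my own sketch):

import numpy as np

mat = np.random.rand(3, 4).astype(np.float32)
col_sums = np.dot(np.ones(3, dtype=mat.dtype), mat)                 # axis=0 branch
row_sums = np.dot(mat, np.ones((4, 1), dtype=mat.dtype)).ravel()    # axis=1 branch
assert np.allclose(col_sums, mat.sum(axis=0))
assert np.allclose(row_sums, mat.sum(axis=1))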
Example no. 53
        def thunk():
            x = inputs[0]
            y = inputs[1]

            # chop off the real/imag dimension
            input_shape_x = x[0].shape  # (a, b, 2)
            input_shape_y = y[0].shape  # (b, c, 2)

            output_shape = (input_shape_x[0], input_shape_y[1], 2)  # (a, c, 2)

            input_x_pycuda = to_complex_gpuarray(x[0])
            input_y_pycuda = to_complex_gpuarray(y[0])

            output_pycuda = linalg.dot(input_x_pycuda, input_y_pycuda)

            outputs[0][0] = to_complex_cudandarray(output_pycuda)
Example no. 54
def calc_x_G(Kp1,
             C,
             Cm1,
             rp1,
             lm2,
             Am1,
             A,
             Ap1,
             lm1_s,
             lm1_si,
             r_s,
             r_si,
             Vsh,
             handle=None):
    D = A[0].shape[1]
    Dm1 = A[0].shape[0]
    q = len(A)

    x = garr.zeros((Dm1, q * D - Dm1), dtype=A[0].dtype)
    x_part = garr.empty_like(x)
    x_subpart = garr.empty_like(A[0])

    if not (C is None and Kp1 is None):
        assert (not C is None) and (not Kp1 is None)
        x_part.fill(0)
        for s in range(q):
            x_subpart = eps_r(rp1, C[s], Ap1, x_subpart, handle)  #~1st line

            x_subpart += cla.dot(A[s], Kp1, handle=handle)  #~3rd line

            x_part += cla.dot(cla.dot(x_subpart, r_si, handle=handle),
                              Vsh[s],
                              handle=handle)

        x += cla.dot(lm1_s, x_part, handle=handle)

    if not lm2 is None:
        x_part.fill(0)
        for s in range(q):  #~2nd line
            x_subpart = eps_l(lm2, Am1, Cm1[s], x_subpart, handle)
            x_part += cla.dot(x_subpart,
                              cla.dot(r_s, Vsh[s], handle=handle),
                              handle=handle)
        x += cla.dot(lm1_si, x_part, handle=handle)

    return x
def train_rfn_gpu(X,
                  n_hidden,
                  n_iter,
                  learnrateW,
                  learnratePsi,
                  dropout_rate,
                  input_dropout_rate,
                  minPsi=0.1,
                  seed=32):
    k = n_hidden
    n, m = X.shape
    W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32)
    P = np.array([0.1] * m, dtype=np.float32)
    XXdiag = np.diag(np.dot(X.T, X) /
                     n).copy()  # explicit copy to avoid numpy 1.8 warning
    W = gpu.to_gpu(W, allocator=_mempool.allocate)
    P = gpu.to_gpu(P, allocator=_mempool.allocate)
    X = gpu.to_gpu(X, allocator=_mempool.allocate)
    XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate)
    I = la.eye(k, dtype=np.float32)

    init_rng(seed)
    t0 = time.time()
    for cur_iter in range(n_iter):
        H, tmp = calculate_H_gpu(X, W, P)
        if dropout_rate > 0:
            dropout(H, dropout_rate)
        Xtmp = X
        if input_dropout_rate > 0:
            Xtmp = X.copy()
            saltpepper_noise(Xtmp, input_dropout_rate)
        U = la.dot(Xtmp, H, "t", "n") / n
        S = la.dot(H, H, "t", "n") / n
        S += I
        S -= la.dot(tmp, W, "n", "t")
        Cii = la.dot(la.dot(W, S, "t") - 2 * U, W)

        Sinv = la.inv(S, overwrite=True)
        dW = la.dot(Sinv, U, "n", "t") - W
        dP = XXdiag + la.diag(Cii) - P

        W += learnrateW * dW
        P += learnratePsi * dP

        P = gpu.maximum(P, minPsi)
        if cur_iter % 25 == 0:
            print "iter %3d (elapsed time: %5.2fs)" % (cur_iter,
                                                       time.time() - t0)
    return W.get(), P.get()
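
A hedged usage sketch (my own addition; it assumes the surrounding module provides init_rng, dropout, saltpepper_noise, calculate_H_gpu, _mempool and the GPU linear-algebra wrappers imported as la and gpu):

import numpy as np

X = np.random.rand(1000, 50).astype(np.float32)
W, Psi = train_rfn_gpu(X, n_hidden=10, n_iter=100,
                       learnrateW=0.1, learnratePsi=0.1,
                       dropout_rate=0.0, input_dropout_rate=0.0)
print(W.shape, Psi.shape)   # expected: (10, 50) and (50,)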
Example no. 56
def Test():

    A = np.float32(np.random.randn(*(2000, 2000)))
    A = np.complex64(np.ones((2000, 2000)) + 1j * np.ones((2000, 2000)))
    AT = A.T.copy()

    A_32 = A  #np.float32(A)
    AT_32 = AT  #np.float32(AT)

    T = ClassTimeIt.ClassTimeIt()
    # create two random matrices and copy them to the GPU
    g_A0 = cm.CUDAMatrix(A)
    g_AT0 = cm.CUDAMatrix(AT)

    # perform calculations on the GPU
    P0 = cm.dot(g_AT0, g_A0).asarray()
    #d = cm.sum(axis = 0)
    T.timeit("GPU0")
    del (g_AT0, g_A0)
    #T.reinit()

    # copy d back to the host (CPU) and print

    g_A1 = gpuarray.to_gpu(A)
    g_AT1 = gpuarray.to_gpu(AT)
    #time.sleep(5)

    #T.timeit("tranf0")
    g_P1 = culinalg.dot(g_AT1, g_A1)

    P1 = g_P1.get()

    #T.timeit("tranf1")
    T.timeit("GPU1")

    np_P = np.dot(AT, A)
    T.timeit("np")
    #print g_P-np_P

    print(np.max(np_P - P0))
    print(np.max(np_P - P1))
Example no. 57
    def feed_forward(self, input_data, prediction=False):
        """Propagate forward through the layer.

        **Parameters:**

        input_data : ``GPUArray``
            Input data to compute activations for.

        prediction : bool, optional
            Whether to use prediction model. Only relevant when using
            dropout. If true, then weights are halved if the layer
            uses dropout.

        **Returns:**
        
        activations : ``GPUArray``
            The activations of the output units.
        """
        activations = linalg.dot(input_data, self.W)
        activations = add_vec_to_mat(activations, self.b, inplace=True)
        activations = softmax(activations)

        return activations
Example no. 58
    def forward(self, bottom, top):
        """

        """
        with pu.caffe_cuda_context():
            h = caffe.cublas_handle()
            batch_size = bottom[0].shape[0]
            dim = bottom[0].count / bottom[0].shape[0]
            pred = bottom[0].data_as_pycuda_gpuarray()
            label = bottom[1].data_as_pycuda_gpuarray()
            # Use bottom[0,1].diff as temporary buffer
            diff = bottom[0].diff_as_pycuda_gpuarray()
            diff2 = bottom[1].diff_as_pycuda_gpuarray()
            mask = bottom[0].diff_as_pycuda_gpuarray()
            # Compute diff
            self.k_masked_diff_(diff, pred, label)
            self.k_squared_(diff, diff2)
            import scikits.cuda.linalg as linalg
            # This needs scikits.cuda 0.5.0a3 or later
            # (sudo) pip install scikits.cuda>=0.5.0a3
            linalg.dot(diff.reshape(batch_size, dim),
                       self.multipier_sum_,
                       handle=h,
                       out=self.diff_sum_)
            linalg.dot(diff2.reshape(batch_size, dim),
                       self.multipier_sum_,
                       handle=h,
                       out=self.diff2_sum_)
            mask.fill(dtype(1.0))
            linalg.dot(mask.reshape(batch_size, dim),
                       self.multipier_sum_,
                       handle=h,
                       out=self.mask_sum_)
            self.k_ensure_mask_sum_(self.mask_sum_)
            term1 = self.k_div_sum_(self.diff2_sum_, self.mask_sum_)
            term2 = self.k_div_squared_sum_(self.diff_sum_, self.mask_sum_)
            top[0].data[...] = (term1.get() - self.lambda_ * term2.get()) \
                / batch_size