Code example #1
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = list(input_shape)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:],
                                   np.float32,
                                   np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
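These thunks are closures: inputs, outputs, plan and plan_input_shape are not defined inside thunk() but captured from an enclosing make_thunk method of a Theano Op. A minimal sketch of that enclosing scope, assuming the standard old-backend Theano Op interface (the exact wrapper varies between the projects listed below):

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # One single-element storage cell per variable; thunk() reads
        # inputs[i][0] and writes outputs[i][0] in place.
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # One-element lists let thunk() cache the cuFFT plan across calls
        # and rebuild it only when the input shape changes.
        plan = [None]
        plan_input_shape = [None]

        def thunk():
            ...  # body as in the example above

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk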
Code example #2
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
            output_shape[-1] = (output_shape[-1] - 1) * 2
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:],
                                   np.complex64,
                                   np.float32,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
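The shape arithmetic here inverts the forward transform's n // 2 + 1 rule, which only round-trips even-length signals. NumPy's rfft/irfft follow the same convention, so the relation can be checked on the CPU:

    import numpy as np

    x = np.random.rand(8).astype(np.float32)   # even-length real signal
    spec = np.fft.rfft(x)                      # 8 // 2 + 1 == 5 coefficients
    assert spec.shape[0] == x.shape[0] // 2 + 1
    x_rec = np.fft.irfft(spec)                 # (5 - 1) * 2 == 8 samples restored
    assert np.allclose(x, x_rec, atol=1e-5)

An odd original length n also yields n // 2 + 1 coefficients, but (n // 2 + 1 - 1) * 2 == n - 1, so odd lengths cannot be recovered from this output_shape formula alone.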
Code example #3
File: fftconv.py Project: Ambier/Theano
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = list(input_shape)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
Code example #4
File: fftconv.py Project: corcra/uRNN
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
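The comments in several of these examples note that scikits.cuda accepts a float32 array with a trailing length-2 axis where a complex64 array is expected. The reinterpretation works because the memory layouts are identical; the same trick can be shown with NumPy's view (a CPU-only illustration, not part of the original code):

    import numpy as np

    a = np.zeros((4, 3, 2), dtype=np.float32)  # trailing axis holds (real, imag)
    a[..., 0] = 1.0                            # set the real parts
    c = a.view(np.complex64)[..., 0]           # reinterpret: shape (4, 3) complex64
    assert c.dtype == np.complex64 and c.shape == (4, 3)
    assert np.allclose(c, 1.0 + 0.0j)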
Code example #5
File: fftconv.py Project: Ambier/Theano
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
            output_shape[-1] = (output_shape[-1] - 1) * 2
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:], np.complex64, np.float32,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
Code example #6
File: fftconv.py Project: Thrandis/complex_RNN
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
Code example #7
File: fftconv.py Project: corcra/uRNN
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:-1], np.complex64, np.complex64,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
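Note that fft.ifft is called here without scaling; cuFFT inverse transforms are unnormalized, so a forward/inverse round trip through these thunks returns the input multiplied by the transform length (scikits.cuda's ifft exposes a scale argument for this, which these examples leave at its default). The convention, mimicked with NumPy:

    import numpy as np

    x = np.random.rand(8).astype(np.complex64)
    n = x.shape[0]
    # NumPy's ifft applies the 1/n factor; multiplying it back out mimics
    # cuFFT's unnormalized inverse.
    y = np.fft.ifft(np.fft.fft(x)) * n
    assert np.allclose(y / n, x, atol=1e-4)   # caller must divide by n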
Code example #8
File: fftconv.py Project: Thrandis/complex_RNN
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:-1], np.complex64, np.complex64,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
Code example #9
File: cuda_fft.py Project: soroushmehr/BP-FFT
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = tuple(input_shape)

            # print 'FFT shapes:', input_shape, '->', output_shape
            # print 'Batch size:', input_shape[0]
            # print 'Core shape:', input_shape[1:-1]

            z = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(shape=input_shape[1:-1],  # Exclude batch dim and complex dim
                                   in_dtype=np.complex64,
                                   out_dtype=np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
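All of the FFT examples above share the same plan-and-execute pattern. Stripped of the Theano machinery, the underlying scikits.cuda (now skcuda) API looks roughly like this; a sketch assuming a CUDA device, PyCUDA, and skcuda are available:

    import numpy as np
    import pycuda.autoinit                      # initialize a CUDA context
    import pycuda.gpuarray as gpuarray
    from skcuda import fft

    x = np.random.rand(4, 16).astype(np.float32)          # batch of 4 signals
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.empty((4, 16 // 2 + 1), np.complex64)

    # One plan per (shape, dtype, batch) combination, reused across calls.
    plan = fft.Plan(16, np.float32, np.complex64, batch=4)
    fft.fft(x_gpu, y_gpu, plan)

    assert np.allclose(y_gpu.get(), np.fft.rfft(x), atol=1e-3)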
Code example #10
def test_cross_map_norm_noncontiguous_grad():
    # Check the case reported at https://groups.google.com/d/topic/pylearn-users/KxIYc3hczf4/discussion
    x = cuda_ftensor4('x')
    x_shuffled = x.dimshuffle(1, 2, 3, 0)
    x_shuffled = gpu_contiguous(x_shuffled)
    response_norm = CrossMapNorm(size_f=16,
                                 add_scale=(15. / 16.),
                                 pow_scale=1,
                                 blocked=True)
    output_shuffled = response_norm(x_shuffled)[0]
    output = output_shuffled.dimshuffle(3, 0, 1, 2)
    cost = output.sum()
    cost.name = 'cost'
    grad_x = theano.grad(cost, x)
    f = theano.function([x], grad_x, mode=mode_with_gpu)
    x_val = CudaNdarray(numpy.ones((2, 16, 2, 2), dtype='float32'))
    f(x_val)
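The test exercises gradients through a non-contiguous view: dimshuffle only permutes strides, so the result is no longer C-contiguous until gpu_contiguous copies it. A CPU analogue of the situation being tested (illustrative only):

    import numpy as np

    x = np.ones((2, 16, 2, 2), dtype=np.float32)
    x_shuffled = x.transpose(1, 2, 3, 0)           # like dimshuffle(1, 2, 3, 0)
    assert not x_shuffled.flags['C_CONTIGUOUS']    # a strided view, not a copy
    x_fixed = np.ascontiguousarray(x_shuffled)     # like gpu_contiguous
    assert x_fixed.flags['C_CONTIGUOUS']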
Code example #11
File: fftconv.py Project: cfsmile/Theano
        def thunk():
            bx = inputs[0]
            by = inputs[1]

            input_shape_x = bx[0].shape  # (batch, a, b, 2)
            input_shape_y = by[0].shape  # (batch, b, c, 2)

            output_shape = (input_shape_x[0], input_shape_x[1], input_shape_y[2], 2)  # (batch, a, c, 2)

            bz = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if bz[0] is None or bz[0].shape != output_shape:
                bz[0] = CudaNdarray.zeros(output_shape)

            input_bx_pycuda = to_complex_gpuarray(bx[0])
            input_by_pycuda = to_complex_gpuarray(by[0])
            output_b_pycuda = to_complex_gpuarray(bz[0])

            # fancy native batched version
            sc_complex_dot_batched(input_bx_pycuda, input_by_pycuda, output_b_pycuda)
Code example #12
        def thunk():
            bx = inputs[0]
            by = inputs[1]

            input_shape_x = bx[0].shape  # (batch, a, b, 2)
            input_shape_y = by[0].shape  # (batch, b, c, 2)

            output_shape = (input_shape_x[0], input_shape_x[1],
                            input_shape_y[2], 2)  # (batch, a, c, 2)

            bz = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if bz[0] is None or bz[0].shape != output_shape:
                bz[0] = CudaNdarray.zeros(output_shape)

            input_bx_pycuda = to_complex_gpuarray(bx[0])
            input_by_pycuda = to_complex_gpuarray(by[0])
            output_b_pycuda = to_complex_gpuarray(bz[0])

            # fancy native batched version
            sc_complex_dot_batched(input_bx_pycuda, input_by_pycuda,
                                   output_b_pycuda)
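sc_complex_dot_batched is not shown in these examples, but the shape comments imply a per-batch complex matrix product over the packed (real, imag) layout. Assuming that semantics, a CPU reference of what the call is expected to compute:

    import numpy as np

    batch, a, b, c = 4, 3, 5, 2
    bx = np.random.rand(batch, a, b, 2).astype(np.float32)  # (batch, a, b, 2)
    by = np.random.rand(batch, b, c, 2).astype(np.float32)  # (batch, b, c, 2)

    # View the trailing (real, imag) pairs as complex64, then contract over b.
    cx = bx.view(np.complex64)[..., 0]           # (batch, a, b)
    cy = by.view(np.complex64)[..., 0]           # (batch, b, c)
    bz = np.einsum('nab,nbc->nac', cx, cy)       # (batch, a, c)
    assert bz.shape == (batch, a, c)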
Code example #13
        def thunk():
            global cusolver_handle

            # Output storage for the computed solution.
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]

            # Solution vectors.
            b = inputs[1][0]

            # A is not explicitly converted between C and F order; instead,
            # we switch the "transpose" flag.
            if self.trans in ('T', 'C'):
                trans = 'N'
            else:
                trans = 'T'

            # Convert b to F-order from C-order.
            b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))

            # This copy forces allocation of a new C-contiguous buffer
            # and returns it.
            A_cpy = A.copy()
            b_cpy = b_cpy.copy()

            assert (len(A.shape) == 2)
            assert (len(b.shape) == 2)

            if trans in ['T', 'C']:
                trans = 1
                l, n = A.shape
                k, m = b.shape
                if n != k:
                    raise ValueError('A and b must be aligned.')
            elif trans in ['N']:
                trans = 0
                n, l = A.shape
                k, m = b.shape
                if l != m:
                    raise ValueError('A and b must be aligned.')
            else:
                raise ValueError('Invalid value for trans')

            lda = max(1, n)
            ldb = max(1, n, l)

            A_ptr = A_cpy.gpudata
            b_ptr = b_cpy.gpudata

            if cusolver_handle is None:
                cusolver_handle = cusolver.cusolverDnCreate()
                print('cusolver handle', cusolver_handle)

            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, m, n, A_ptr, lda)

            if (thunk.workspace is None
                    or thunk.workspace.size != workspace_size):
                thunk.workspace = CudaNdarray.zeros((workspace_size, ))

            if thunk.pivots is None or thunk.pivots.size != min(m, n):
                thunk.pivots = CudaNdarray.zeros((min(m, n), ))

            if thunk.dev_info is None:
                thunk.dev_info = CudaNdarray.zeros((1, ))

            workspace_ptr = thunk.workspace.gpudata
            pivots_ptr = thunk.pivots.gpudata
            dev_info_ptr = thunk.dev_info.gpudata

            cusolver.cusolverDnSgetrf(cusolver_handle, n, l, A_ptr, lda,
                                      workspace_ptr, pivots_ptr, dev_info_ptr)

            cusolver.cusolverDnSgetrs(cusolver_handle, trans, n, m, A_ptr, lda,
                                      pivots_ptr, b_ptr, ldb, dev_info_ptr)

            # Convert b to F-order from C-order and assign it to output.
            b_cpy = b_cpy.reshape(b.shape[::-1])
            b_cpy = dimshuffle(b_cpy, (1, 0))
            z[0] = b_cpy
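The getrf/getrs pair above is the standard LU factor-then-solve sequence. A CPU reference with SciPy, useful for checking results from the GPU path (SciPy assumed available; not part of the original code):

    import numpy as np
    from scipy.linalg import lu_factor, lu_solve

    A = np.random.rand(4, 4).astype(np.float32)
    b = np.random.rand(4, 3).astype(np.float32)

    lu, piv = lu_factor(A)         # analogue of cusolverDnSgetrf
    x = lu_solve((lu, piv), b)     # analogue of cusolverDnSgetrs
    assert np.allclose(A @ x, b, atol=1e-3)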
Code example #14
def test_cross_map_norm_simple():
    op = CrossMapNorm(16, 15. / 16., 1., True)
    x = CudaNdarray(numpy.ones((16, 2, 2, 2), dtype='float32'))
    x_ = theano.tensor.TensorVariable(CudaNdarrayType([False] * 4))
    f = theano.function([x_], op(x_)[0])
    numpy.testing.assert_allclose(f(x), 0.0625)
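The expected constant 0.0625 follows from the cuda-convnet-style cross-map normalization formula, out = x / (1 + add_scale * sum_window x**2) ** pow_scale, assuming that is what CrossMapNorm implements: with all-ones input and all 16 maps in the window, the denominator is 1 + (15/16) * 16 = 16.

    size_f, add_scale, pow_scale = 16, 15. / 16., 1.
    x = 1.0                                        # every element of the input
    window_sum = size_f * x ** 2                   # all 16 maps fall in the window
    out = x / (1. + add_scale * window_sum) ** pow_scale
    assert out == 0.0625                           # matches the asserted value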