        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert that s matches the
            # two transform axes of the input (the last two axes before
            # the trailing real/imag axis).
            assert (input_shape[-3:-1] == s).all()
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s, np.complex64, np.complex64,
                                       batch=np.prod(input_shape[:-3]))

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
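For context: these thunks are closures, and the names inputs, outputs, plan and plan_input_shape are captured from an enclosing make_thunk on a Theano Op. Below is a minimal sketch of that scaffolding, assuming Theano's gpuarray backend with pygpu, pycuda and scikit-cuda installed; the class name CuFFTOp is illustrative and a real Op would also define make_node and friends.

import numpy as np
import pygpu
import pycuda.driver
import skcuda.fft as fft
import skcuda.misc
import theano


class CuFFTOp(theano.Op):
    # Illustrative skeleton; only the thunk machinery is shown.

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # storage_map maps each variable to a one-element list (a
        # storage cell), hence the inputs[0][0] / z[0] indexing in the
        # thunk bodies.
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # Initialize scikit-cuda within the context of the input.
        with node.inputs[0].type.context:
            skcuda.misc.init()

        # One-element lists let the thunk cache the cuFFT plan (and
        # the input shape it was built for) across calls.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            pass  # body as in one of the examples shown here

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk

The one-element lists act as mutable cells: the thunk cannot rebind names from the enclosing scope (Python 2 has no nonlocal), but it can mutate these lists to cache the cuFFT plan between calls, rebuilding it only when the input shape changes.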
Example #2
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert that s matches the
            # transform dimensions of the input.
            assert (input_shape[1:-1] == s[:-1]).all()

            # For a real-to-complex transform the output shape would
            # instead be [input_shape[0]] + list(s) with the last
            # transform axis shortened to n // 2 + 1 (the DFT of a real
            # signal is conjugate-symmetric, so the redundant
            # coefficients need not be stored), plus a trailing axis of
            # length 2 for the real/imag parts.

            # Here the output has the same shape as the input:
            # (m, ..., n, 2).
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape,
                                   context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out skcuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s,
                                       np.complex64,
                                       np.complex64,
                                       batch=input_shape[0])

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
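A note on the comment above about skcuda.fft treating the array as complex64 anyway: a float32 array whose trailing axis has length 2 (real, imag) is bit-compatible with complex64, which is why the plans are created with np.complex64 even though the pygpu buffers are float32. A CPU-side sketch of the same layout convention, using only NumPy as a stand-in for the GPU transform (the shapes are made up for illustration):

import numpy as np

# (batch, n0, n1, 2) float32, trailing axis holding (real, imag)
x = np.random.randn(4, 8, 8, 2).astype(np.float32)

# Reinterpreting the buffer as complex64 collapses the trailing axis.
xc = x.view(np.complex64)[..., 0]               # shape (4, 8, 8)

# FFT over the non-batch axes, as the plans above compute on the GPU.
y = np.fft.fftn(xc, axes=(-2, -1)).astype(np.complex64)

# Back to the float32 (..., 2) layout used by the thunks.
y_pairs = y.view(np.float32).reshape(y.shape + (2,))
print(y_pairs.shape)                            # (4, 8, 8, 2)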