def perform(self, node, inputs, outputs):
    """Cholesky-factorize the square matrix ``inputs[0]`` with cuSOLVER.

    The single-precision ``potrf`` routine is used.  The factor is stored
    in ``outputs[0][0]``: the input buffer itself when ``self.inplace`` is
    true, a copy of it otherwise.  ``self.lower`` selects which triangle is
    kept; ``tril``/``triu`` is applied afterwards to clear the other one.

    Raises:
        ValueError: if the input matrix is not square.
        LinAlgError: if cuSOLVER reports a positive ``dev_info`` value.
    """
    context = inputs[0][0].context

    matrix = inputs[0]
    n_rows, n = matrix.shape
    if n_rows != n:
        raise ValueError('A must be a square matrix')
    lda = max(1, n)

    # cuSOLVER works on F-ordered data; the input is expected to be
    # symmetric, so its memory order does not matter.  Only copy when the
    # op is not allowed to overwrite its input.
    factor = matrix if self.inplace else pygpu.array(matrix, copy=True)

    # Only one triangle of the result is meaningful.  For a C-ordered
    # buffer the requested triangle is obtained by flipping the fill mode
    # handed to cuSOLVER.
    fill_mode = 0 if self.lower else 1
    if factor.flags['C_CONTIGUOUS']:
        fill_mode = 1 - fill_mode

    factor_ptr = factor.gpudata

    with context:
        lwork = cusolver.cusolverDnSpotrf_bufferSize(
            context.cusolver_handle, fill_mode, n, factor_ptr, lda)

        workspace = pygpu.zeros(lwork, dtype='float32', context=context)
        dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

        work_ptr = workspace.gpudata
        info_ptr = dev_info.gpudata

        cusolver.cusolverDnSpotrf(
            context.cusolver_handle, fill_mode, n, factor_ptr, lda,
            work_ptr, lwork, info_ptr)

        if np.asarray(dev_info)[0] > 0:
            raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

    # cuSOLVER leaves the entries outside the selected triangle untouched,
    # so they have to be zeroed explicitly.
    keep_triangle = tril if self.lower else triu
    keep_triangle(factor)

    outputs[0][0] = factor
def perform(self, node, inputs, outputs):
    """Run a single-precision cuSOLVER Cholesky factorization.

    ``inputs[0]`` must be a square matrix.  Depending on ``self.inplace``
    either that array or a copy of it is factorized and then placed in
    ``outputs[0][0]``.  ``self.lower`` decides whether the lower or the
    upper triangle is retained (the other one is cleared via
    ``tril``/``triu``).

    Raises:
        ValueError: if the input matrix is not square.
        LinAlgError: if ``dev_info`` comes back positive from cuSOLVER.
    """
    context = inputs[0][0].context

    A = inputs[0]
    rows, n = A.shape
    if rows != n:
        raise ValueError('A must be a square matrix')
    lda = max(1, n)

    # cuSOLVER assumes F order, but A is expected to be symmetric so the
    # layout is irrelevant.  Copy unless overwriting the input is allowed.
    if self.inplace:
        chol = A
    else:
        chol = pygpu.array(A, copy=True)

    # The result holds only one triangular factor.  When the buffer is
    # C-ordered the cuSOLVER fill-mode parameter is simply switched.
    uplo = 0 if self.lower else 1
    if chol.flags['C_CONTIGUOUS']:
        uplo = 1 - uplo

    chol_ptr = chol.gpudata

    with context:
        handle = context.cusolver_handle
        workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
            handle, uplo, n, chol_ptr, lda)

        workspace = pygpu.zeros(workspace_size, dtype='float32',
                                context=context)
        dev_info = pygpu.zeros((1,), dtype='int32', context=context)

        cusolver.cusolverDnSpotrf(
            context.cusolver_handle, uplo, n, chol_ptr, lda,
            workspace.gpudata, workspace_size, dev_info.gpudata)

        status = np.asarray(dev_info)[0]
        if status > 0:
            raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

    # Elements outside the factorized triangle are left unchanged by
    # cuSOLVER, hence the explicit zeroing.
    if self.lower:
        tril(chol)
    else:
        triu(chol)

    outputs[0][0] = chol
def perform(self, node, inputs, outputs):
    """LU-factorize the square matrix ``inputs[0]`` with cuSOLVER ``getrf``.

    The packed factors are stored in ``outputs[0][0]`` and the int32 pivot
    array in ``outputs[1][0]``.  The single-precision routine is used.

    Raises:
        ValueError: if the input matrix is not square.
        LinAlgError: if ``self.check_output`` is set and cuSOLVER reports
            a positive ``dev_info`` value.
    """
    context = inputs[0][0].context

    A = inputs[0]
    n_rows, n = A.shape
    if n_rows != n:
        raise ValueError('A must be a square matrix')
    lda = max(1, n)

    # cuSOLVER operates on F-ordered matrices.
    if not self.inplace:
        factors = pygpu.array(A, copy=True, order='F')
    elif A.flags['C_CONTIGUOUS']:
        # NOTE(review): the transposed view makes the C-ordered buffer look
        # F-ordered without copying; this appears to factorize A.T rather
        # than A for non-symmetric input -- confirm this is intended.
        factors = A.T
    else:
        factors = A

    factors_ptr = factors.gpudata

    with context:
        lwork = cusolver.cusolverDnSgetrf_bufferSize(
            context.cusolver_handle, n, n, factors_ptr, lda)

        workspace = pygpu.zeros(lwork, dtype='float32', context=context)
        pivots = pygpu.zeros(n, dtype='int32', context=context)
        dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

        work_ptr = workspace.gpudata
        piv_ptr = pivots.gpudata
        info_ptr = dev_info.gpudata

        cusolver.cusolverDnSgetrf(
            context.cusolver_handle, n, n, factors_ptr, lda,
            work_ptr, piv_ptr, info_ptr)

        # The status is only copied back from the device when requested.
        if self.check_output and np.asarray(dev_info)[0] > 0:
            raise LinAlgError('LU decomposition failed')

    outputs[1][0] = pivots
    outputs[0][0] = factors
def thunk():
    """Run a batched forward complex-to-complex FFT on ``inputs[0][0]``.

    ``inputs[1][0]`` is the transform size ``s``; padding is not supported,
    so it must equal ``input_shape[-3:-1]``.  All leading dimensions are
    folded into the batch count.  The result has the same shape as the
    input and is written to ``outputs[0][0]``, which is only reallocated
    when missing or of the wrong shape.  The cached plan in ``plan[0]`` is
    rebuilt whenever the input shape changes.

    Raises:
        AssertionError: if ``s`` does not match the input shape.
    """
    input_pycuda = inputs[0][0]
    input_shape = input_pycuda.shape
    s = inputs[1][0]

    # Since padding is not supported, s has to match the input shape.
    assert (input_shape[-3:-1] == s).all()

    output_shape = input_shape

    z = outputs[0]

    # Only allocate if there is no previous allocation of the right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = pygpu.zeros(output_shape, context=input_pycuda.context,
                           dtype='float32')

    output_pycuda = z[0]

    with input_pycuda.context:
        # Only initialise the plan if necessary.
        if plan[0] is None or plan_input_shape[0] != input_shape:
            plan_input_shape[0] = input_shape
            # np.prod returns the float 1.0 for an empty slice (input with
            # no leading batch dimensions) and a NumPy scalar otherwise;
            # the batch count must be a plain Python int.
            batch = int(np.prod(input_shape[:-3]))
            plan[0] = fft.Plan(s, np.complex64, np.complex64, batch=batch)

        # Sync GPU variables before computation.
        input_pycuda.sync()
        output_pycuda.sync()

        fft.fft(input_pycuda, output_pycuda, plan[0])

        # Sync results to ensure the output contains the completed
        # computation.
        pycuda.driver.Context.synchronize()
def thunk():
    """Run a forward complex-to-complex FFT on ``inputs[0][0]``.

    ``inputs[1][0]`` is the transform size ``s``; the first input dimension
    is used as the batch count.  The output has the same shape as the
    input, ``(m, ..., n, 2)``, and is stored in ``outputs[0][0]``, which is
    reallocated only when missing or of the wrong shape.  The cached plan
    in ``plan[0]`` is rebuilt whenever the input shape changes.

    Raises:
        AssertionError: if ``s`` is inconsistent with the input shape.
    """
    data = inputs[0][0]
    input_shape = data.shape
    s = inputs[1][0]

    # Padding is not supported, so the requested size must agree with the
    # input shape.
    assert (input_shape[1:-1] == s[:-1]).all()

    # Unlike a real-input transform, nothing is truncated here: the result
    # keeps the full input shape.
    output_shape = input_shape

    z = outputs[0]

    # Reuse a previous allocation when it already has the right size.
    needs_alloc = z[0] is None or z[0].shape != output_shape
    if needs_alloc:
        z[0] = pygpu.zeros(output_shape, context=data.context,
                           dtype='float32')

    input_pycuda = data
    # Original author's note: the float32 output does not need to be
    # retyped as complex64 -- skcuda.fft treats the buffer as complex64
    # regardless.
    output_pycuda = z[0]

    with input_pycuda.context:
        # Build the plan lazily and only when the shape has changed.
        if plan[0] is None or plan_input_shape[0] != input_shape:
            plan_input_shape[0] = input_shape
            plan[0] = fft.Plan(s, np.complex64, np.complex64,
                               batch=input_shape[0])

        # Sync GPU variables before computation.
        input_pycuda.sync()
        output_pycuda.sync()

        fft.fft(input_pycuda, output_pycuda, plan[0])

        # Make sure the output holds the finished computation.
        pycuda.driver.Context.synchronize()
def print_times_axpy():
    """Print average per-run timings of the AXPY update for each shape.

    For every entry of ``shapes`` four variants are timed over ``n_runs``
    repetitions: the GPU kernel, plain NumPy, the BLAS ``axpy`` routine,
    and NumPy including the device-to-host and host-to-device copies.
    """
    for header_line in ('', 'AXPY', '====', ''):
        print(header_line)

    for shape in shapes:
        print('shape = {}'.format(shape))

        # --- GPU kernel --- #
        x_gpu = pygpu.zeros(shape, dtype=dtype)
        y_gpu = pygpu.zeros(shape, dtype=dtype)

        # Warm-up call so that kernel compilation is not timed.
        odl.space.gpuary_tensors.axpy(a, x_gpu, y_gpu)

        t0 = time()
        for _ in range(n_runs):
            odl.space.gpuary_tensors.axpy(a, x_gpu, y_gpu)
        t1 = time()
        print('GPU time: {:.5}'.format((t1 - t0) / n_runs))

        # --- NumPy, data already on the host --- #
        x_cpu = np.zeros(shape, dtype=dtype)
        y_cpu = np.zeros_like(x_cpu)

        t0 = time()
        for _ in range(n_runs):
            y_cpu += a * x_cpu
        t1 = time()
        print('CPU time, no copy: {:.5}'.format((t1 - t0) / n_runs))

        # --- BLAS routine on host arrays --- #
        t0 = time()
        for _ in range(n_runs):
            axpy(x_cpu, y_cpu, a=a)
        t1 = time()
        print('BLAS time: {:.5}'.format((t1 - t0) / n_runs))

        # --- NumPy, including transfers from and to the device --- #
        t0 = time()
        for _ in range(n_runs):
            scaled_x_host = a * np.asarray(x_gpu)
            y_host = np.asarray(y_gpu)
            y_host += scaled_x_host
            y_gpu[:] = y_host
        t1 = time()
        print('CPU time, with copy: {:.5}'.format((t1 - t0) / n_runs))

        print('')
def print_times_lico():
    """Print average per-run timings of ``a * x + b * y`` for each shape.

    For every entry of ``shapes`` three variants are timed over ``n_runs``
    repetitions: the GPU kernel, plain NumPy, and NumPy including the
    device-to-host and host-to-device copies.
    """
    for header_line in ('', 'LICO', '====', ''):
        print(header_line)

    for shape in shapes:
        print('shape = {}'.format(shape))

        # --- GPU kernel --- #
        x_gpu = pygpu.zeros(shape, dtype=dtype)
        y_gpu = pygpu.zeros(shape, dtype=dtype)
        out_gpu = x_gpu._empty_like_me()

        # Warm-up call so that kernel compilation is not timed.
        odl.space.gpuary_tensors.lico(a, x_gpu, b, y_gpu, out_gpu)

        t0 = time()
        for _ in range(n_runs):
            odl.space.gpuary_tensors.lico(a, x_gpu, b, y_gpu, out_gpu)
        t1 = time()
        print('GPU time: {:.5}'.format((t1 - t0) / n_runs))

        # --- NumPy, data already on the host --- #
        x_cpu = np.zeros(shape, dtype=dtype)
        y_cpu = np.zeros_like(x_cpu)
        out_cpu = np.empty_like(x_cpu)

        t0 = time()
        for _ in range(n_runs):
            np.multiply(a, x_cpu, out=out_cpu)
            out_cpu += b * y_cpu
        t1 = time()
        print('CPU time, no copy: {:.5}'.format((t1 - t0) / n_runs))

        # --- NumPy, including transfers from and to the device --- #
        out_gpu = x_gpu._empty_like_me()

        t0 = time()
        for _ in range(n_runs):
            x_host = np.asarray(x_gpu)
            out_cpu = b * np.asarray(y_gpu)
            out_cpu += a * x_host
            out_gpu[:] = out_cpu
        t1 = time()
        print('CPU time, with copy: {:.5}'.format((t1 - t0) / n_runs))

        print('')
def thunk():
    """Run an unscaled inverse complex-to-complex FFT on ``inputs[0][0]``.

    ``inputs[1][0]`` is the transform size ``s``; the first dimension is
    used as the batch count.  The output has the same shape as the input
    and is stored in ``outputs[0][0]``, which is reallocated only when
    missing or of the wrong shape.  The cached plan in ``plan[0]`` is
    rebuilt whenever the input shape changes.  No shape validation is
    performed here.
    """
    data = inputs[0][0]
    input_shape = data.shape
    s = inputs[1][0]

    # The result keeps the full input shape.
    output_shape = input_shape

    z = outputs[0]

    # Reuse a previous allocation when it already has the right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = pygpu.zeros(output_shape, context=data.context,
                           dtype='float32')

    # Original author's note: the input is a float32 array with an extra
    # trailing dimension, which skcuda interprets as complex64.
    input_pycuda = data
    output_pycuda = z[0]

    with input_pycuda.context:
        # Build the plan lazily and only when the shape has changed.
        plan_is_stale = (plan[0] is None or
                         plan_input_shape[0] != input_shape)
        if plan_is_stale:
            plan_input_shape[0] = input_shape
            plan[0] = fft.Plan(s, np.complex64, np.complex64,
                               batch=output_shape[0])

        # Sync GPU variables before computation.
        input_pycuda.sync()
        output_pycuda.sync()

        # Original author's note: enabling rescaling in this call makes
        # it run very slowly, so the rescaling is done manually afterwards.
        fft.ifft(input_pycuda, output_pycuda, plan[0])

        # Make sure the output holds the finished computation.
        pycuda.driver.Context.synchronize()
def test_GpuArray(self):
    """Pickling a GpuArray must raise RuntimeError for every protocol."""
    # Default protocol (no explicit ``protocol`` argument).
    with self.assertRaises(RuntimeError):
        pickle.dumps(pygpu.zeros((32, ), context=ctx))

    # Explicit protocols; protocol 3 only exists on Python 3.
    protocols = [0, 1, 2]
    if PY3:
        protocols.append(3)
    protocols.append(-1)

    for protocol in protocols:
        with self.assertRaises(RuntimeError):
            pickle.dumps(pygpu.zeros((32, ), context=ctx),
                         protocol=protocol)
def test_GpuArray(self):
    """Check that ``pickle.dumps`` refuses GpuArray objects.

    The default protocol, protocols 0-2, protocol 3 (Python 3 only) and
    protocol -1 must all raise RuntimeError.
    """
    def assert_unpicklable(**dump_kwargs):
        # A fresh array is created for every attempt.
        with self.assertRaises(RuntimeError):
            pickle.dumps(pygpu.zeros((32,), context=ctx), **dump_kwargs)

    assert_unpicklable()
    for protocol in (0, 1, 2):
        assert_unpicklable(protocol=protocol)
    if PY3:
        assert_unpicklable(protocol=3)
    assert_unpicklable(protocol=-1)
def print_times_scal():
    """Print average per-run timings of the SCAL operation for each shape.

    For every entry of ``shapes`` four variants are timed over ``n_runs``
    repetitions: the GPU kernel, plain NumPy, the BLAS ``scal`` routine,
    and NumPy including the device-to-host and host-to-device copies.
    All averages are scaled by ``1e3`` before printing.

    The GPU timing used to be measured and then discarded because its
    print statement was commented out; it is reported again, in the same
    units as the other timings of this function.
    """
    for header_line in ('', 'SCAL', '====', ''):
        print(header_line)

    for shape in shapes:
        print('shape = {}'.format(shape))

        # --- GPU kernel --- #
        x_gpu = pygpu.zeros(shape, dtype=dtype)
        out_gpu = x_gpu._empty_like_me()

        # Warm-up call so that kernel compilation is not timed.
        odl.space.gpuary_tensors.scal(a, x_gpu, out_gpu)

        tstart = time()
        for _ in range(n_runs):
            odl.space.gpuary_tensors.scal(a, x_gpu, out_gpu)
        tstop = time()
        print('GPU time: {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        # --- NumPy, data already on the host --- #
        x_cpu = np.zeros(shape, dtype=dtype)

        tstart = time()
        for _ in range(n_runs):
            np.multiply(a, x_cpu, out=x_cpu)
        tstop = time()
        print('CPU time, no copy: {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        # --- BLAS routine on the host array --- #
        tstart = time()
        for _ in range(n_runs):
            scal(a, x_cpu)
        tstop = time()
        print('BLAS time: {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        # --- NumPy, including transfers from and to the device --- #
        tstart = time()
        for _ in range(n_runs):
            x_gpu_to_cpu = np.asarray(x_gpu)
            np.multiply(a, x_gpu_to_cpu, out=x_gpu_to_cpu)
            x_gpu[:] = x_gpu_to_cpu
        tstop = time()
        print('CPU time, with copy: {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        print('')
def thunk():
    """Run a batched, unscaled inverse complex-to-complex FFT.

    ``inputs[0][0]`` is the data and ``inputs[1][0]`` the transform size
    ``s``.  All dimensions before the last three are folded into the batch
    count.  The result has the same shape as the input and is written to
    ``outputs[0][0]``, which is only reallocated when missing or of the
    wrong shape.  The cached plan in ``plan[0]`` is rebuilt whenever the
    input shape changes.
    """
    input_pycuda = inputs[0][0]
    input_shape = input_pycuda.shape
    s = inputs[1][0]

    output_shape = input_shape

    z = outputs[0]

    # Only allocate if there is no previous allocation of the right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = pygpu.zeros(output_shape, context=input_pycuda.context,
                           dtype='float32')

    # Original author's note: the input is a float32 array with an extra
    # trailing dimension, which skcuda interprets as complex64.
    output_pycuda = z[0]

    with input_pycuda.context:
        # Only initialise the plan if necessary.
        if plan[0] is None or plan_input_shape[0] != input_shape:
            plan_input_shape[0] = input_shape
            # np.prod returns the float 1.0 for an empty slice (input with
            # no leading batch dimensions) and a NumPy scalar otherwise;
            # the batch count must be a plain Python int.
            batch = int(np.prod(input_shape[:-3]))
            plan[0] = fft.Plan(s, np.complex64, np.complex64, batch=batch)

        # Sync GPU variables before computation.
        input_pycuda.sync()
        output_pycuda.sync()

        # Original author's note: enabling rescaling in this call makes
        # it run very slowly, so the rescaling is done manually afterwards.
        fft.ifft(input_pycuda, output_pycuda, plan[0])

        # Sync results to ensure the output contains the completed
        # computation.
        pycuda.driver.Context.synchronize()
def test_zero_noparam():
    """``pygpu.zeros`` called without arguments must raise ``TypeError``."""
    try:
        pygpu.zeros()
    except TypeError:
        # Expected outcome.
        return
    # Reaching this point means no exception was raised.
    assert False
def test_zeros_no_dtype():
    """Compare metadata of 0-d zeros arrays built without dtype or order."""
    scalar_shape = ()
    gpu_result = pygpu.zeros(scalar_shape, context=ctx)
    reference = numpy.zeros(scalar_shape)
    check_meta(gpu_result, reference)
def zeros(shp, order, dtype):
    """Check ``pygpu.zeros`` against ``numpy.zeros`` for one configuration.

    Both arrays are created with the given shape, dtype and memory order
    and then compared with ``check_all``.
    """
    gpu_result = pygpu.zeros(shp, dtype, order, context=ctx)
    reference = numpy.zeros(shp, dtype, order)
    check_all(gpu_result, reference)
def perform(self, node, inputs, outputs):
    """Solve a linear system on the GPU with cuSOLVER.

    ``inputs[0]`` is the square coefficient matrix ``A`` and ``inputs[1]``
    the 2-d right-hand side ``b``.  The right-hand side is always copied
    (F-ordered); ``A`` is copied unless ``self.inplace`` is set, because
    cuSOLVER overwrites both.  With ``self.A_structure == "symmetric"`` the
    ``potrf``/``potrs`` pair is used, otherwise ``getrf``/``getrs``.  The
    solution is stored in ``outputs[0][0]``.

    Raises:
        AssertionError: if ``A`` or ``b`` is not 2-dimensional.
        ValueError: for an invalid ``self.trans``, a non-square ``A``,
            misaligned ``A`` and ``b``, or a dtype other than
            float32/float64.
    """
    context = inputs[0][0].context

    z = outputs[0]
    A, b = inputs[0], inputs[1]

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    if self.trans in ("T", "C"):
        trans = 1
        l, n = A.shape
    elif self.trans == "N":
        trans = 0
        n, l = A.shape
    else:
        raise ValueError("Invalid value for trans")
    k, m = b.shape

    if l != n:
        raise ValueError("A must be a square matrix")
    if n != k:
        raise ValueError("A and b must be aligned.")

    lda = max(1, n)
    ldb = max(1, k)

    # cuSOLVER works in place, so operate on copies.
    b = pygpu.array(b, copy=True, order="F")
    if not self.inplace:
        A = pygpu.array(A, copy=True)
    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # cuSOLVER expects an F-ordered matrix.  Rather than converting A, the
    # "transpose" flag is flipped for C-ordered data.
    if A.flags["C_CONTIGUOUS"]:
        trans = 1 - trans

    # Select the single- or double-precision routines.
    if A.dtype == "float32":
        potrf_bufferSize, potrf, potrs = (
            cusolver.cusolverDnSpotrf_bufferSize,
            cusolver.cusolverDnSpotrf,
            cusolverDnSpotrs,
        )
        getrf_bufferSize, getrf, getrs = (
            cusolver.cusolverDnSgetrf_bufferSize,
            cusolver.cusolverDnSgetrf,
            cusolver.cusolverDnSgetrs,
        )
    elif A.dtype == "float64":
        potrf_bufferSize, potrf, potrs = (
            cusolver.cusolverDnDpotrf_bufferSize,
            cusolver.cusolverDnDpotrf,
            cusolverDnDpotrs,
        )
        getrf_bufferSize, getrf, getrs = (
            cusolver.cusolverDnDgetrf_bufferSize,
            cusolver.cusolverDnDgetrf,
            cusolver.cusolverDnDgetrs,
        )
    else:
        raise ValueError("Unsupported dtype")

    if self.A_structure == "symmetric":
        # Cholesky factorization followed by the triangular solves.
        with context:
            lwork = potrf_bufferSize(context.cusolver_handle, 0, n,
                                     A_ptr, lda)

        workspace = pygpu.zeros(lwork, dtype=A.dtype, context=context)
        dev_info = pygpu.zeros((1, ), dtype="int32", context=context)
        work_ptr = workspace.gpudata
        info_ptr = dev_info.gpudata

        with context:
            potrf(context.cusolver_handle, 0, n, A_ptr, lda,
                  work_ptr, lwork, info_ptr)
            self.check_dev_info(dev_info)
            potrs(context.cusolver_handle, 0, n, m, A_ptr, lda,
                  b_ptr, ldb, info_ptr)
    else:
        # General case: pivoted LU factorization followed by the solves.
        with context:
            lwork = getrf_bufferSize(context.cusolver_handle, n, n,
                                     A_ptr, lda)

        workspace = pygpu.zeros(lwork, dtype=A.dtype, context=context)
        pivots = pygpu.zeros(n, dtype="int32", context=context)
        dev_info = pygpu.zeros((1, ), dtype="int32", context=context)
        work_ptr = workspace.gpudata
        piv_ptr = pivots.gpudata
        info_ptr = dev_info.gpudata

        with context:
            getrf(context.cusolver_handle, n, n, A_ptr, lda,
                  work_ptr, piv_ptr, info_ptr)
            self.check_dev_info(dev_info)
            getrs(context.cusolver_handle, trans, n, m, A_ptr, lda,
                  piv_ptr, b_ptr, ldb, info_ptr)

    z[0] = b
def perform(self, node, inputs, outputs):
    """Solve a linear system on the GPU with single-precision cuSOLVER.

    ``inputs[0]`` is the square coefficient matrix ``A`` and ``inputs[1]``
    the 2-d right-hand side ``b``.  The right-hand side is always copied
    (F-ordered); ``A`` is copied unless ``self.inplace`` is set, because
    cuSOLVER overwrites both.  With ``self.A_structure == 'symmetric'`` the
    ``Spotrf``/``Spotrs`` pair is used, otherwise ``Sgetrf``/``Sgetrs``.
    The solution is stored in ``outputs[0][0]``.

    Raises:
        AssertionError: if ``A`` or ``b`` is not 2-dimensional.
        ValueError: for an invalid ``self.trans``, a non-square ``A`` or
            misaligned ``A`` and ``b``.
    """
    context = inputs[0][0].context

    z = outputs[0]
    A, b = inputs[0], inputs[1]

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    if self.trans in ('T', 'C'):
        trans = 1
        l, n = A.shape
    elif self.trans == 'N':
        trans = 0
        n, l = A.shape
    else:
        raise ValueError('Invalid value for trans')
    k, m = b.shape

    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    lda = max(1, n)
    ldb = max(1, k)

    # cuSOLVER works in place, so operate on copies.
    b = pygpu.array(b, copy=True, order='F')
    if not self.inplace:
        A = pygpu.array(A, copy=True)
    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # cuSOLVER expects an F-ordered matrix.  Rather than converting A, the
    # "transpose" flag is flipped for C-ordered data.
    if A.flags['C_CONTIGUOUS']:
        trans = 1 - trans

    if self.A_structure == 'symmetric':
        # Cholesky factorization followed by the triangular solves.
        with context:
            lwork = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, 0, n, A_ptr, lda)

        workspace = pygpu.zeros(lwork, dtype='float32', context=context)
        dev_info = pygpu.zeros((1, ), dtype='int32', context=context)
        work_ptr = workspace.gpudata
        info_ptr = dev_info.gpudata

        with context:
            cusolver.cusolverDnSpotrf(
                context.cusolver_handle, 0, n, A_ptr, lda,
                work_ptr, lwork, info_ptr)
            self.check_dev_info(dev_info)
            cusolverDnSpotrs(
                context.cusolver_handle, 0, n, m, A_ptr, lda,
                b_ptr, ldb, info_ptr)
    else:
        # General case: pivoted LU factorization followed by the solves.
        with context:
            lwork = cusolver.cusolverDnSgetrf_bufferSize(
                context.cusolver_handle, n, n, A_ptr, lda)

        workspace = pygpu.zeros(lwork, dtype='float32', context=context)
        pivots = pygpu.zeros(n, dtype='int32', context=context)
        dev_info = pygpu.zeros((1, ), dtype='int32', context=context)
        work_ptr = workspace.gpudata
        piv_ptr = pivots.gpudata
        info_ptr = dev_info.gpudata

        with context:
            cusolver.cusolverDnSgetrf(
                context.cusolver_handle, n, n, A_ptr, lda,
                work_ptr, piv_ptr, info_ptr)
            self.check_dev_info(dev_info)
            cusolver.cusolverDnSgetrs(
                context.cusolver_handle, trans, n, m, A_ptr, lda,
                piv_ptr, b_ptr, ldb, info_ptr)

    z[0] = b
def perform(self, node, inputs, outputs):
    """Solve ``A x = b`` (or its transposed form) on the GPU via cuSOLVER.

    ``inputs[0]`` is the square matrix ``A``, ``inputs[1]`` the 2-d
    right-hand side ``b``.  ``b`` is always copied in F order and ``A`` is
    copied unless ``self.inplace`` is set, since cuSOLVER overwrites its
    operands.  A symmetric ``self.A_structure`` selects the
    ``potrf``/``potrs`` routines, anything else ``getrf``/``getrs``; the
    precision follows ``A.dtype``.  The solution ends up in
    ``outputs[0][0]``.

    Raises:
        AssertionError: if ``A`` or ``b`` is not 2-dimensional.
        ValueError: for an invalid ``self.trans``, a non-square ``A``,
            misaligned ``A`` and ``b``, or a dtype other than
            float32/float64.
    """
    context = inputs[0][0].context

    z = outputs[0]
    A = inputs[0]
    b = inputs[1]

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    if self.trans in ('T', 'C'):
        trans = 1
        l, n = A.shape
    elif self.trans == 'N':
        trans = 0
        n, l = A.shape
    else:
        raise ValueError('Invalid value for trans')
    k, m = b.shape

    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    lda = max(1, n)
    ldb = max(1, k)

    # Work on copies because cuSOLVER operates in place.
    b = pygpu.array(b, copy=True, order='F')
    if not self.inplace:
        A = pygpu.array(A, copy=True)
    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # A is never converted between C and F order; for C-ordered data the
    # "transpose" flag is switched instead.
    if A.flags['C_CONTIGUOUS']:
        trans = 1 - trans

    # Pick the routines matching the precision of A.
    if A.dtype == 'float32':
        potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
        potrf, potrs = cusolver.cusolverDnSpotrf, cusolverDnSpotrs
        getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
        getrf, getrs = cusolver.cusolverDnSgetrf, cusolver.cusolverDnSgetrs
    elif A.dtype == 'float64':
        potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
        potrf, potrs = cusolver.cusolverDnDpotrf, cusolverDnDpotrs
        getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
        getrf, getrs = cusolver.cusolverDnDgetrf, cusolver.cusolverDnDgetrs
    else:
        raise ValueError("Unsupported dtype")

    symmetric = self.A_structure == 'symmetric'

    # Query the scratch-space size of the chosen factorization.
    with context:
        if symmetric:
            workspace_size = potrf_bufferSize(
                context.cusolver_handle, 0, n, A_ptr, lda)
        else:
            workspace_size = getrf_bufferSize(
                context.cusolver_handle, n, n, A_ptr, lda)

    workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                            context=context)
    if not symmetric:
        pivots = pygpu.zeros(n, dtype='int32', context=context)
    dev_info = pygpu.zeros((1,), dtype='int32', context=context)

    workspace_ptr = workspace.gpudata
    if not symmetric:
        pivots_ptr = pivots.gpudata
    dev_info_ptr = dev_info.gpudata

    # Factorize, check the status, then solve.
    with context:
        if symmetric:
            potrf(context.cusolver_handle, 0, n, A_ptr, lda,
                  workspace_ptr, workspace_size, dev_info_ptr)
            self.check_dev_info(dev_info)
            potrs(context.cusolver_handle, 0, n, m, A_ptr, lda,
                  b_ptr, ldb, dev_info_ptr)
        else:
            getrf(context.cusolver_handle, n, n, A_ptr, lda,
                  workspace_ptr, pivots_ptr, dev_info_ptr)
            self.check_dev_info(dev_info)
            getrs(context.cusolver_handle, trans, n, m, A_ptr, lda,
                  pivots_ptr, b_ptr, ldb, dev_info_ptr)

    z[0] = b