def _call_kernel(kernel, input, weights, output, weight_dtype=cupy.float64):
    """
    Calls a constructed ElementwiseKernel. The kernel must take an input image,
    an array of weights, and an output array.

    The weights are the only optional part and can be passed as None and then
    one less argument is passed to the kernel. If the output is given as None
    then it will be allocated in this function.

    This function deals with making sure that the weights are contiguous and
    float64 or bool*, that the output is allocated and appropriately shaped.
    This also deals with the situation that the input and output arrays
    overlap in memory.

    * weights is always cast to float64 or bool in order to get an output
    compatible with SciPy, though float32 might be sufficient when input dtype
    is low precision.

    Returns the array holding the result: the caller-provided ``output`` when
    one was given as an array, otherwise the array obtained from
    ``_util._get_output``.
    """
    if weights is not None:
        weights = cupy.ascontiguousarray(weights, weight_dtype)

    # Resolve ``output`` into an actual array *before* inspecting it. Without
    # this, passing None (or a dtype) fails in ``cupy.shares_memory`` /
    # ``output.dtype`` below, contradicting the documented contract.
    output = _util._get_output(output, input, None, weight_dtype)

    # The kernel reads ``input`` while writing ``output``; if the two may
    # alias, compute into a scratch array and copy back afterwards.
    needs_temp = cupy.shares_memory(output, input, "MAY_SHARE_BOUNDS")
    if needs_temp:
        output, temp = (
            _util._get_output(output.dtype, input, None, weight_dtype),
            output,
        )

    if weights is None:
        kernel(input, output)
    else:
        kernel(input, weights, output)

    if needs_temp:
        # ``temp`` is the caller-visible output; ``output`` is the scratch.
        temp[...] = output[...]
        output = temp
    return output
def test_array_gpu(self):
    """``cp.asarray`` of a volume moved to ``Device(0)`` must alias its data.

    The assertion checks that the CuPy array produced from the volume shares
    memory with the volume's own ``volume`` attribute (i.e. no copy is made).
    """
    import cupy as cp

    host_volume = MedicalVolume(np.ones((10, 20, 30)), self._AFFINE)
    device_volume = host_volume.to(Device(0))

    as_cupy = cp.asarray(device_volume)

    assert cp.shares_memory(as_cupy, device_volume.volume)
def _call_kernel(
    kernel,
    input,
    weights,
    output,
    structure=None,
    weights_dtype=numpy.float64,
    structure_dtype=numpy.float64,
):
    """
    Invoke a constructed ElementwiseKernel on ``input``.

    The kernel is called positionally as
    ``kernel(input[, weights][, structure], output)``: ``weights`` and
    ``structure`` are each passed only when they are not None.

    Before the call:

    * ``weights`` / ``structure`` (when given) are made contiguous and
      converted to ``weights_dtype`` / ``structure_dtype``. The defaults are
      float64 so results are compatible with SciPy (float32 might be
      sufficient for low precision inputs); pass ``weights.dtype`` as
      ``weights_dtype`` to avoid any conversion, e.g. for bool footprints.
    * ``output`` is resolved through ``_util._get_output``, so None is
      accepted and an array is allocated in that case.
    * If the resolved output may overlap ``input`` in memory, the kernel
      writes into a scratch array instead and the result is copied into the
      resolved output afterwards.

    ``input`` and ``output`` themselves are never dtype-converted here.

    Returns the resolved output array containing the kernel result.
    """
    kernel_args = [input]
    optional_operands = (
        (weights, weights_dtype),
        (structure, structure_dtype),
    )
    for operand, operand_dtype in optional_operands:
        if operand is not None:
            kernel_args.append(cupy.ascontiguousarray(operand, operand_dtype))

    result = _util._get_output(output, input, None, cupy.dtype(weights_dtype))

    if not cupy.shares_memory(result, input, "MAY_SHARE_BOUNDS"):
        # No aliasing: the kernel can write straight into the result array.
        kernel(*kernel_args, result)
        return result

    # Possible aliasing: compute into a scratch array, then copy back.
    scratch = _util._get_output(result.dtype, input)
    kernel(*kernel_args, scratch)
    result[...] = scratch[...]
    return result
def _get_spline_output(input, output, allow_float32=False):
    """Build the spline workspace array and work out the relevant dtypes.

    The minimum working precision is double (float64 / complex128) unless
    ``allow_float32`` is true, in which case it is single (float32 /
    complex64). The working dtype is that minimum promoted with the requested
    output dtype.

    ``output`` may be a ``cupy.ndarray``, a dtype specifier, or None (meaning
    "same dtype as ``input``").

    Returns a tuple ``(temp, float_dtype, output_dtype)`` where ``temp`` is a
    C-contiguous workspace holding the values of ``input`` in ``float_dtype``.
    When ``output`` is a C-contiguous array that already has the working
    dtype, it is filled from ``input`` and used as the workspace; otherwise
    the workspace is a converted array that does not share memory with
    ``input``.

    Raises ValueError if ``input`` is complex and ``output`` is a non-complex
    array.
    """
    is_complex = input.dtype.kind == "c"
    if allow_float32:
        min_float_dtype = cupy.complex64 if is_complex else cupy.float32
    else:
        min_float_dtype = cupy.complex128 if is_complex else cupy.float64

    if isinstance(output, cupy.ndarray):
        if is_complex and output.dtype.kind != "c":
            raise ValueError(
                "output must have complex dtype for complex inputs")
        output_dtype = output.dtype
        float_dtype = cupy.promote_types(output.dtype, min_float_dtype)
    elif output is None:
        output = output_dtype = input.dtype
        float_dtype = cupy.promote_types(output, min_float_dtype)
    else:
        output_dtype = cupy.dtype(output)
        float_dtype = cupy.promote_types(output, min_float_dtype)

    output_usable_as_workspace = (
        isinstance(output, cupy.ndarray)
        and output.dtype == float_dtype == output_dtype
        and output.flags.c_contiguous
    )
    if output_usable_as_workspace:
        if output is not input:
            output[...] = input[...]
        return output, float_dtype, output_dtype

    workspace = input.astype(float_dtype, copy=False)
    workspace = cupy.ascontiguousarray(workspace)
    if cupy.shares_memory(workspace, input, 'MAY_SHARE_BOUNDS'):
        # Neither conversion copied; copy so ``input`` is left untouched.
        workspace = workspace.copy()
    return workspace, float_dtype, output_dtype
def _get_spline_output(input, output):
    """Build the spline workspace array and work out the relevant dtypes.

    Unlike SciPy, the internal floating point dtype is not forced to double
    precision: the minimum working precision is float32 (complex64 for
    complex input), promoted with the requested output dtype.

    ``output`` may be a ``cupy.ndarray``, a dtype specifier, or None (meaning
    "same dtype as ``input``").

    Returns a tuple ``(temp, float_dtype, output_dtype)``. ``temp`` is a
    C-contiguous array holding the values of ``input`` in ``float_dtype``:
    either ``output`` itself (when it is a C-contiguous array already of the
    working dtype) or a converted array that does not share memory with
    ``input``.

    Raises ValueError if ``input`` is complex and ``output`` is a non-complex
    array.
    """
    complex_data = input.dtype.kind == 'c'
    min_float_dtype = cupy.complex64 if complex_data else cupy.float32

    if not isinstance(output, cupy.ndarray):
        # ``output`` is None or a dtype specifier: nothing to reuse.
        if output is None:
            output = output_dtype = input.dtype
        else:
            output_dtype = cupy.dtype(output)
        float_dtype = cupy.promote_types(output, min_float_dtype)
        reuse_output = False
    else:
        if complex_data and output.dtype.kind != 'c':
            raise ValueError(
                'output must have complex dtype for complex inputs'
            )
        output_dtype = output.dtype
        float_dtype = cupy.promote_types(output.dtype, min_float_dtype)
        reuse_output = (
            output.dtype == float_dtype == output_dtype
            and output.flags.c_contiguous
        )

    if reuse_output:
        # The output array can double as the workspace; seed it from input.
        if output is not input:
            output[...] = input[...]
        temp = output
    else:
        temp = cupy.ascontiguousarray(input.astype(float_dtype, copy=False))
        if cupy.shares_memory(temp, input, 'MAY_SHARE_BOUNDS'):
            # No copy happened above; copy so ``input`` is left untouched.
            temp = temp.copy()
    return temp, float_dtype, output_dtype
def _binary_erosion(input, structure, iterations, mask, output, border_value,
                    origin, invert, brute_force=True):
    """Shared implementation of (iterated) binary erosion.

    Args:
        input: array to process. Complex dtypes are rejected.
        structure: structuring element, or None to use
            ``generate_binary_structure(input.ndim, 1)``. It is cast to bool
            and must have the same number of dimensions as ``input``.
        iterations: integer number of passes. A value below 1 repeats the
            operation until a pass no longer changes the result.
        mask: optional array with the same shape as ``input``; when given it
            is passed to the kernel as an extra argument.
        output: an existing array to receive the result. Anything that is not
            a ``cupy.ndarray`` causes a new bool array to be allocated.
        border_value: forwarded to ``_get_binary_erosion_kernel``.
        origin: origin of the structure; normalised to one int per axis by
            ``_util._fix_sequence_arg``.
        invert: forwarded to ``_get_binary_erosion_kernel``.
        brute_force: only the brute-force iteration strategy exists. With
            ``iterations != 1``, a structure whose center is true and
            ``brute_force`` false, NotImplementedError is raised.

    Returns:
        The array holding the result. If ``output`` was an array that may
        overlap ``input`` in memory, the result is computed in a scratch
        array and copied into ``output``, which is then returned.

    Raises:
        TypeError: non-integer ``iterations``, complex ``input`` or complex
            ``output`` array.
        RuntimeError: structure dimensionality mismatch, empty structure, or
            mask shape mismatch.
        NotImplementedError: see ``brute_force``.
        ValueError: the working output overlaps ``input`` in the
            multi-iteration path.
    """
    try:
        iterations = operator.index(iterations)
    except TypeError:
        raise TypeError('iterations parameter should be an integer')

    if input.dtype.kind == 'c':
        raise TypeError('Complex type not supported')

    if structure is None:
        structure = generate_binary_structure(input.ndim, 1)
        all_weights_nonzero = input.ndim == 1
        center_is_true = True
        default_structure = True
    else:
        structure = structure.astype(dtype=bool, copy=False)
        # transfer to CPU for use in determining if it is fully dense
        # structure_cpu = cupy.asnumpy(structure)
        default_structure = False
    if structure.ndim != input.ndim:
        raise RuntimeError('structure and input must have same dimensionality')
    if not structure.flags.c_contiguous:
        structure = cupy.ascontiguousarray(structure)
    if structure.size < 1:
        raise RuntimeError('structure must not be empty')

    if mask is not None:
        if mask.shape != input.shape:
            raise RuntimeError('mask and input must have equal sizes')
        if not mask.flags.c_contiguous:
            mask = cupy.ascontiguousarray(mask)
        masked = True
    else:
        masked = False
    origin = _util._fix_sequence_arg(origin, input.ndim, 'origin', int)

    if isinstance(output, cupy.ndarray):
        if output.dtype.kind == 'c':
            raise TypeError('Complex output type not supported')
    else:
        output = bool
    output = _util._get_output(output, input)
    temp_needed = cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS')
    if temp_needed:
        # input and output arrays cannot share memory: work in a scratch
        # array and keep the caller-visible one in ``temp`` for the copy-back
        temp = output
        output = _util._get_output(output.dtype, input)

    if structure.ndim == 0:
        # kernel doesn't handle ndim=0, so special case it here
        if float(structure):
            output[...] = cupy.asarray(input, dtype=bool)
        else:
            output[...] = ~cupy.asarray(input, dtype=bool)
        if temp_needed:
            # Copy back here too; otherwise this early return would hand out
            # the scratch array and leave the caller's output unwritten.
            temp[...] = output
            output = temp
        return output

    origin = tuple(origin)
    int_type = _util._get_inttype(input)
    offsets = _filters_core._origins_to_offsets(origin, structure.shape)
    if not default_structure:
        # synchronize required to determine if all weights are non-zero
        nnz = int(cupy.count_nonzero(structure))
        all_weights_nonzero = nnz == structure.size
        if all_weights_nonzero:
            center_is_true = True
        else:
            center_is_true = _center_is_true(structure, origin)

    erode_kernel = _get_binary_erosion_kernel(
        structure.shape, int_type, offsets, center_is_true, border_value,
        invert, masked, all_weights_nonzero,
    )

    if iterations == 1:
        if masked:
            output = erode_kernel(input, structure, mask, output)
        else:
            output = erode_kernel(input, structure, output)
    elif center_is_true and not brute_force:
        raise NotImplementedError(
            'only brute_force iteration has been implemented')
    else:
        if cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS'):
            raise ValueError('output and input may not overlap in memory')
        # Ping-pong between two buffers: each pass reads one, writes the other
        tmp_in = cupy.empty_like(input, dtype=output.dtype)
        tmp_out = output
        if iterations >= 1 and not iterations & 1:
            # even iteration count: start in the scratch buffer so the last
            # pass lands in ``output``
            tmp_in, tmp_out = tmp_out, tmp_in
        if masked:
            tmp_out = erode_kernel(input, structure, mask, tmp_out)
        else:
            tmp_out = erode_kernel(input, structure, tmp_out)
        # TODO: kernel doesn't return the changed status, so determine it here
        changed = not (input == tmp_out).all()  # synchronize!
        ii = 1
        while ii < iterations or ((iterations < 1) and changed):
            tmp_in, tmp_out = tmp_out, tmp_in
            if masked:
                tmp_out = erode_kernel(tmp_in, structure, mask, tmp_out)
            else:
                tmp_out = erode_kernel(tmp_in, structure, tmp_out)
            changed = not (tmp_in == tmp_out).all()
            ii += 1
            if not changed and (not ii & 1):  # synchronize!
                # can exit early if nothing changed
                # (only do this after even number of tmp_in/out swaps)
                break
        output = tmp_out

    if temp_needed:
        temp[...] = output
        output = temp
    return output