def _correlate_or_convolve(input, weights, output, mode, cval, origin,
                           convolution):
    """Correlate or convolve ``input`` with ``weights``.

    Args:
        input: Array to filter; complex dtypes are rejected.
        weights: Filter weights; after dropping zero-length axes it must
            have as many axes as ``input``.
        output: Output specification, resolved through ``_get_output``.
        mode: Boundary mode, one of ``'reflect'``, ``'constant'``,
            ``'nearest'``, ``'mirror'`` or ``'wrap'``.
        cval: Value forwarded to the kernel factory.
        origin: Scalar, or indexable sequence, of per-axis origin shifts.
        convolution: When true, the weights are reversed along every axis
            and the origins are mirrored before the kernel is built.

    Returns:
        The output array. When ``weights`` is empty it is returned without
        launching the kernel.

    Raises:
        TypeError: If ``input`` has a complex dtype.
        RuntimeError: If the weights rank or the boundary mode is invalid.
        ValueError: If an origin lies outside the filter extent.
    """
    if input.dtype.kind == 'c':
        raise TypeError('Complex type not supported.')

    # Work on a private list so the caller's origin is never mutated.
    if hasattr(origin, '__getitem__'):
        origin = list(origin)
    else:
        origin = [origin] * input.ndim

    wshape = [extent for extent in weights.shape if extent > 0]
    if len(wshape) != input.ndim:
        raise RuntimeError('filter weights array has incorrect shape.')

    if convolution:
        # Reverse the weights along every axis and mirror the origins;
        # even-sized axes need one extra step.
        weights = weights[(slice(None, None, -1),) * weights.ndim]
        for axis, shift in enumerate(origin):
            mirrored = -shift
            if weights.shape[axis] % 2 == 0:
                mirrored -= 1
            origin[axis] = mirrored

    for shift, extent in zip(origin, wshape):
        centre = extent // 2 + shift
        if centre < 0 or centre >= extent:
            raise ValueError('invalid origin')

    supported_modes = ('reflect', 'constant', 'nearest', 'mirror', 'wrap')
    if mode not in supported_modes:
        msg = 'boundary mode not supported (actual: {}).'.format(mode)
        raise RuntimeError(msg)

    output = _get_output(output, input)
    if weights.size == 0:
        return output

    input = cupy.ascontiguousarray(input)
    weights = cupy.ascontiguousarray(weights, cupy.float64)
    kernel = _get_correlete_kernel(
        input.ndim, mode, cval, input.shape, tuple(wshape), tuple(origin))
    return kernel(input, weights, output)
def slice(self, slice):
    """Return a new ``Sparse4DData`` restricted to ``slice``.

    ``slice`` is applied as an index expression to both ``self.indices``
    and ``self.counts``; the selections are stored as C-contiguous arrays.
    ``scan_dimensions`` is taken from the first two axes of the sliced
    counts and ``frame_dimensions`` is copied from this object.
    """
    sliced = Sparse4DData()
    picked_indices = self.indices[slice]
    picked_counts = self.counts[slice]
    sliced.indices = cp.ascontiguousarray(picked_indices)
    sliced.counts = cp.ascontiguousarray(picked_counts)
    sliced.scan_dimensions = np.array(sliced.counts.shape[:2])
    sliced.frame_dimensions = self.frame_dimensions.copy()
    return sliced
def reduced_binary_einsum(arr0, sub0, arr1, sub1, sub_others):
    """Contract two already-reduced einsum operands.

    Args:
        arr0, arr1: The operand arrays.
        sub0, sub1: Subscript sequences of the operands; each must be free
            of repeated labels.
        sub_others: Labels still needed by the remaining operands or the
            output; it decides which shared labels are batch labels and
            which are contracted.

    Returns:
        A tuple ``(array, subscripts)`` describing the result.
    """
    labels0 = set(sub0)
    labels1 = set(sub1)
    assert len(labels0) == len(sub0), 'operand 0 should be reduced: diagonal'
    assert len(labels1) == len(sub1), 'operand 1 should be reduced: diagonal'

    # A scalar operand only scales the other one.
    if len(sub0) == 0 or len(sub1) == 0:
        return arr0 * arr1, sub0 + sub1

    set_others = set(sub_others)
    shared = labels0 & labels1
    batch_dims = shared & set_others
    contract_dims = shared - batch_dims

    bs0, cs0, ts0 = _make_transpose_axes(sub0, batch_dims, contract_dims)
    bs1, cs1, ts1 = _make_transpose_axes(sub1, batch_dims, contract_dims)

    sub_b = [sub0[axis] for axis in bs0]
    assert sub_b == [sub1[axis] for axis in bs1]
    sub_l = [sub0[axis] for axis in ts0]
    sub_r = [sub1[axis] for axis in ts1]

    sub_out = sub_b + sub_l + sub_r
    assert set(sub_out) <= set_others, 'operands should be reduced: unary sum'

    if len(contract_dims) == 0:
        # Nothing to contract: a broadcasted element-wise product suffices.
        if len(sub_out) == len(sub_others):
            # Use the final label order so einsum's output is C-contiguous.
            sub_out = sub_others
        lhs = _expand_dims_transpose(arr0, sub0, sub_out)
        rhs = _expand_dims_transpose(arr1, sub1, sub_out)
        return lhs * rhs, sub_out

    for accelerator in _accelerator.get_routine_accelerators():
        if accelerator != _accelerator.ACCELERATOR_CUTENSOR:
            continue
        if not _use_cutensor(arr0.dtype, sub0, arr1.dtype, sub1,
                             batch_dims, contract_dims):
            continue
        if len(sub_out) == len(sub_others):
            # Use the final label order so einsum's output is C-contiguous.
            sub_out = sub_others
        out_shape = _get_out_shape(
            arr0.shape, sub0, arr1.shape, sub1, sub_out)
        arr_out = cupy.empty(out_shape, arr0.dtype)
        arr0 = cupy.ascontiguousarray(arr0)
        arr1 = cupy.ascontiguousarray(arr1)
        desc_0 = cutensor.create_tensor_descriptor(arr0)
        desc_1 = cutensor.create_tensor_descriptor(arr1)
        desc_out = cutensor.create_tensor_descriptor(arr_out)
        arr_out = cutensor.contraction(
            1.0,
            arr0, desc_0, sub0,
            arr1, desc_1, sub1,
            0.0,
            arr_out, desc_out, sub_out)
        return arr_out, sub_out

    # Fallback: flatten both operands to (batch, rows, cols) and matmul.
    tmp0, shapes0 = _flatten_transpose(arr0, [bs0, ts0, cs0])
    tmp1, shapes1 = _flatten_transpose(arr1, [bs1, cs1, ts1])
    shapes_out = shapes0[0] + shapes0[1] + shapes1[2]
    assert shapes0[0] == shapes1[0]
    product = cupy.matmul(tmp0, tmp1)
    return product.reshape(shapes_out), sub_out
def nn_gpu(ref, query):
    """Return, for each query row, the index of its best reference row.

    The CUDA source in ``cu_file`` is compiled and its
    ``cuComputeDistanceGlobal`` kernel fills a ``(ref_nb, query_nb)``
    float32 matrix; the result is the ``argmin`` of that matrix over the
    reference axis.

    Args:
        ref: 2-D array with one reference point per row.
        query: 2-D array with one query point per row; it must have the
            same number of columns as ``ref``.
    """
    import cupy

    with open(cu_file) as src:
        kernel = cupy.RawKernel(src.read(), "cuComputeDistanceGlobal")

    ref_nb, ref_dim = ref.shape
    query_nb, query_dim = query.shape
    assert ref_dim == query_dim
    dim = ref_dim

    # The kernel receives both point sets transposed and C-contiguous.
    ref = cupy.ascontiguousarray(ref.transpose(1, 0))
    query = cupy.ascontiguousarray(query.transpose(1, 0))
    dist = cupy.empty((ref_nb, query_nb), dtype=cupy.float32)

    BLOCK_DIM = 16
    grid_x = int(math.ceil(query_nb / BLOCK_DIM))
    grid_y = int(math.ceil(ref_nb / BLOCK_DIM))
    grid = (grid_x, grid_y, 1)
    block = (16, 16, 1)
    shared_mem = 2 * BLOCK_DIM * BLOCK_DIM + 5

    kernel(grid, block,
           args=(ref, ref_nb, query, query_nb, dim, dist),
           shared_mem=shared_mem)

    return cupy.argmin(dist, axis=0)
def set_hooke_vertical(self, hooke, div_hooke=None):
    """Sets a hooke tensor, using a vertical approximation where suitable.

    The tensor field is projected with ``_to_vertical`` and rebuilt with
    ``_from_vertical``. Points whose relative reconstruction error exceeds
    ``self.vertical_thres`` keep their full tensor, which is decomposed
    with ``VoronoiDecomposition``.

    Args:
        hooke: Hooke tensor field. The first two axes are summed over when
            computing norms, and the remaining axes are indexed by the
            boolean selection mask.
        div_hooke: Currently unused; its handling is still a TODO (see the
            final assertion).

    Note:
        Intermediate arrays are rebound to ``None`` as soon as they are no
        longer needed so that their memory can be released early.
    """
    # Project, reconstruct, and compute difference
    vertical = self._to_vertical(hooke)

    # Find where the vertical approximation is suitable
    reconstr = self._from_vertical(vertical)
    error2 = ((reconstr - hooke)**2).sum(axis=(0, 1))
    reconstr = None
    norm2 = (hooke**2).sum(axis=(0, 1))
    rel_error = np.sqrt(error2 / norm2)
    norm2, error2 = None, None

    self._vertical = self.block_expand(vertical)
    vertical = None

    full = rel_error > self.vertical_thres  # where to use the full hooke tensors
    rel_error = None
    full_ratio = full.sum() / np.prod(self.shape)
    if full_ratio >= 0.3:
        # The message is an f-string so that the measured ratio is shown.
        print(
            f"Performance warning : proportion {full_ratio} of "
            "Hooke tensors cannot be handled by the vertical approximation"
        )

    # NOTE(review): ``np.nonzeros`` is not a NumPy function and will raise
    # AttributeError. The intended call (``np.flatnonzero``? ``np.nonzero``?)
    # depends on the index layout ``block_expand`` expects -- confirm and fix.
    self._full_index = self.block_expand(
        np.nonzeros(full).astype(np.int32))
    full_hooke = hooke[:, :, full]
    full = None

    weights, offsets = VoronoiDecomposition(full_hooke, offset_t=np.int8)
    self._full_weights = cp.ascontiguousarray(np.moveaxis(weights, 0, 1))
    weights = None
    self._full_offsets = cp.ascontiguousarray(
        np.moveaxis(self._compress_offsets(offsets), 0, 1))
    offsets = None

    assert False  # __TODO__ : compute div_hooke
def fast_iterative_method(self,data):
    """Applies (a variant of) the fast iterative method.

    Blocks containing a triggered point form the initial update set. Each
    outer iteration launches ``data.kernel`` on the flagged blocks, and the
    kernel fills the flags and scores for the next iteration.

    Returns:
        The iteration index at which ``stop`` became true, or
        ``data.policy.nitermax_o`` if the iteration budget was exhausted.
    """
    trigger_blocks = fd.block_expand(
        data.trigger, self.shape_i, mode='constant', constant_values=False)
    block_flags = trigger_blocks.reshape(self.shape_o+(-1,)).any(axis=-1)
    updateNext_o = cp.ascontiguousarray(block_flags.astype(np.uint8))

    scorePrev_o = cp.zeros(self.shape_o, dtype='uint8')
    scoreNext_o = scorePrev_o.copy()

    nitermax_o = data.policy.nitermax_o
    stop = self.InitStop(data)

    # strict_iter_o needed
    for niter_o in range(nitermax_o):
        if stop(updateNext_o):
            return niter_o

        updateList_o = cp.ascontiguousarray(
            cp.flatnonzero(updateNext_o), dtype=self.int_t)

        # Last iteration's "next" scores become "previous"; the buffers
        # the kernel writes to are cleared before the launch.
        scorePrev_o, scoreNext_o = scoreNext_o, scorePrev_o
        updateNext_o.fill(0)
        scoreNext_o.fill(0)

        kernel_args = KernelArgs(data) + (
            updateList_o, scorePrev_o, scoreNext_o, updateNext_o)
        data.kernel((updateList_o.size,), (self.size_i,), kernel_args)

    return nitermax_o
def activation_backward(x, y, gy, mode):
    """Run the cuDNN activation backward pass and return ``gx``.

    Args:
        x: Forward input; made C-contiguous before use.
        y: Forward output; its 4-D view defines the tensor descriptor that
            is shared by all four tensors.
        gy: Gradient with respect to ``y``; made C-contiguous before use.
        mode: cuDNN activation mode.

    Returns:
        A new array shaped like ``x`` holding the gradient w.r.t. ``x``.
    """
    x = cupy.ascontiguousarray(x)
    gy = cupy.ascontiguousarray(gy)
    gx = cupy.empty_like(x)

    # Scaling factors are passed to cuDNN as host pointers of matching width.
    scalar_type = 'd' if x.dtype == 'd' else 'f'
    one = numpy.array(1, dtype=scalar_type).ctypes
    zero = numpy.array(0, dtype=scalar_type).ctypes

    handle = get_handle()
    desc = create_tensor_descriptor(_as4darray(y))

    if _cudnn_version >= 4000:
        # cuDNN v4 takes the mode through an activation descriptor.
        act_desc = Descriptor(cudnn.createActivationDescriptor(),
                              cudnn.destroyActivationDescriptor)
        cudnn.setActivationDescriptor(
            act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0)
        cudnn.activationBackward_v4(
            handle, act_desc.value, one.data,
            desc.value, y.data.ptr,
            desc.value, gy.data.ptr,
            desc.value, x.data.ptr,
            zero.data,
            desc.value, gx.data.ptr)
    else:
        cudnn.activationBackward_v3(
            handle, mode, one.data,
            desc.value, y.data.ptr,
            desc.value, gy.data.ptr,
            desc.value, x.data.ptr,
            zero.data,
            desc.value, gx.data.ptr)
    return gx
def backward_gpu(self, inputs, gradients):
    """Scatter-add the output gradient back onto the gathered features.

    Layout noted by the original author:
        data_in:  [bs, nf, ...]
        indices:  [bs, is, is]
        data_out: [bs, is, is, ..]

    Each non-negative index adds its slice of ``gradients[0]`` into the
    matching slice of the input gradient with ``atomicAdd``.

    Returns:
        ``(grad_in, None)``; no gradient is produced for the indices.
    """
    data_in_shape = inputs[0].shape
    indices = cp.ascontiguousarray(inputs[1])
    grad_out = cp.ascontiguousarray(gradients[0])
    grad_in = cp.ascontiguousarray(cp.zeros(data_in_shape, 'float32'))

    source = string.Template('''
        if (index < 0) return;

        int bn = i / (${image_size} * ${image_size});
        int pos_from = bn * ${num_features} * ${dim} + index * ${dim};
        int pos_to = i * ${dim};
        float* p1 = (float*)&grad_in[pos_from];
        float* p2 = (float*)&grad_out[pos_to];
        for (int j = 0; j < ${dim}; j++) atomicAdd(p1++, *p2++);
    ''').substitute(
        image_size=indices.shape[1],
        num_features=data_in_shape[1],
        # Number of scalars per feature: product of the trailing axes.
        dim=functools.reduce(lambda x, y: x * y, data_in_shape[2:]),
    )
    kernel = chainer.cuda.elementwise(
        'raw float32 grad_in, int32 index, raw float32 grad_out',
        '',
        source,
        'function',
    )
    kernel(grad_in, indices, grad_out)
    return grad_in, None
def forward(ctx, data, masks, default_value):
    """Overwrite the entries of masked-out elements with ``default_value``.

    Args:
        ctx: Autograd context (unused here).
        data: Tensor to filter; it is copied, never modified in place.
        masks: One flag per element group. A value of ``0`` selects the
            group for overwriting.
        default_value: Value written into every masked-out entry.

    Returns:
        A tensor on ``data``'s device holding the filtered copy.
    """
    # PyTorch to CuPy
    device = data.device
    data_in = cp.asarray(data)
    masks = cp.asarray(masks)
    data_out = data_in.copy()

    # Scalars governed by each mask entry. This must be an integer: it is
    # spliced into the CUDA source as an index stride and a loop bound, and
    # true division would render it as e.g. "3.0".
    dim = data_in.size // masks.size

    # distribute
    masks = cp.ascontiguousarray(masks)
    data_out = cp.ascontiguousarray(data_out)
    kernel = cp.ElementwiseKernel(
        'raw S data_out, int64 mask',
        '',
        string.Template('''
            if (mask == 0) {
                ${dtype}* p = (${dtype}*)&data_out[i * ${dim}];
                for (int j = 0; j < ${dim}; j++) *p++ = ${default_value};
            }
        ''').substitute(
            dim=dim,
            dtype=utils.get_dtype_in_cuda(data_out.dtype),
            default_value=default_value,
        ),
        'function',
    )
    kernel(data_out, masks)

    # CuPy to PyTorch
    data_out = torch.as_tensor(data_out, device=device)
    return data_out
def _fftn(a, s, axes, norm, direction, value_type='C2C', order='A',
          plan=None, overwrite_x=False, out=None):
    """Shared driver for the N-dimensional FFT entry points.

    Args:
        a: Input array.
        s: Requested shape along the transformed axes, or ``None``.
        axes: Axes to transform, or ``None``.
        norm: ``None`` or ``'ortho'``.
        direction: Transform direction, forwarded to ``_exec_fftn``.
        value_type: Transform kind, e.g. ``'C2C'``.
        order: ``'C'``, ``'F'``, or ``'A'`` to follow the layout of ``a``.
        plan, overwrite_x, out: Forwarded to ``_exec_fftn``.

    Returns:
        The transformed array, or ``a`` itself when there is no axis to
        transform.

    Raises:
        ValueError: For an invalid ``norm`` or ``order``.
    """
    if norm not in (None, 'ortho'):
        raise ValueError('Invalid norm value %s, should be None or "ortho".'
                         % norm)

    axes, axes_sorted = _prep_fftn_axes(a.ndim, s, axes)
    if not axes_sorted:
        return a

    a = _convert_dtype(a, value_type)

    if order == 'A':
        # Follow the input layout; anything else is copied to C order.
        flags = a.flags
        if flags.f_contiguous:
            order = 'F'
        elif flags.c_contiguous:
            order = 'C'
        else:
            a = cupy.ascontiguousarray(a)
            order = 'C'
    elif order not in ['C', 'F']:
        raise ValueError('Unsupported order: {}'.format(order))

    # Note: _cook_shape must be called before the axes are sorted
    a = _cook_shape(a, s, axes, value_type, order=order)

    # Cooking the shape may have broken the requested layout; restore it.
    if order == 'C':
        if not a.flags.c_contiguous:
            a = cupy.ascontiguousarray(a)
    elif order == 'F':
        if not a.flags.f_contiguous:
            a = cupy.asfortranarray(a)

    return _exec_fftn(a, direction, value_type, norm=norm, axes=axes_sorted,
                      overwrite_x=overwrite_x, plan=plan, out=out)
def activation_backward(x, y, gy, mode):
    """Run cuDNN's v3 activation backward pass and return ``gx``.

    Args:
        x: Forward input; made C-contiguous before use.
        y: Forward output; its 4-D view defines the tensor descriptor that
            is shared by all four tensors.
        gy: Gradient with respect to ``y``; made C-contiguous before use.
        mode: cuDNN activation mode.

    Returns:
        A new array shaped like ``x`` holding the gradient w.r.t. ``x``.
    """
    x = cupy.ascontiguousarray(x)
    gy = cupy.ascontiguousarray(gy)
    gx = cupy.empty_like(x)

    # Scaling factors are passed to cuDNN as host pointers of matching width.
    scalar_type = "d" if x.dtype == "d" else "f"
    one = numpy.array(1, dtype=scalar_type).ctypes
    zero = numpy.array(0, dtype=scalar_type).ctypes

    handle = get_handle()
    desc = create_tensor_descriptor(_as4darray(y))
    cudnn.activationBackward_v3(
        handle, mode, one.data,
        desc.value, y.data.ptr,
        desc.value, gy.data.ptr,
        desc.value, x.data.ptr,
        zero.data,
        desc.value, gx.data.ptr,
    )
    return gx
def activation_backward(x, y, gy, mode):
    """Compute the activation gradient with cuDNN (v3 or v4 API).

    ``x`` and ``gy`` are made C-contiguous. One tensor descriptor, built
    from the 4-D view of ``y``, describes ``y``, ``gy``, ``x`` and the
    result. The v4 entry point is used when ``_cudnn_version >= 4000``,
    otherwise the v3 one.

    Returns:
        A new array shaped like ``x`` holding the gradient w.r.t. ``x``.
    """
    x = cupy.ascontiguousarray(x)
    gy = cupy.ascontiguousarray(gy)
    gx = cupy.empty_like(x)

    host_dtype = 'd' if x.dtype == 'd' else 'f'
    one = numpy.array(1, dtype=host_dtype).ctypes
    zero = numpy.array(0, dtype=host_dtype).ctypes

    handle = get_handle()
    y_mat = _as4darray(y)
    desc = create_tensor_descriptor(y_mat)

    use_v4 = _cudnn_version >= 4000
    if not use_v4:
        cudnn.activationBackward_v3(
            handle, mode, one.data,
            desc.value, y.data.ptr,
            desc.value, gy.data.ptr,
            desc.value, x.data.ptr,
            zero.data,
            desc.value, gx.data.ptr)
        return gx

    # v4 wraps the mode in an activation descriptor; the Descriptor
    # wrapper is given the matching destroy function.
    act_desc = Descriptor(cudnn.createActivationDescriptor(),
                          cudnn.destroyActivationDescriptor)
    cudnn.setActivationDescriptor(
        act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0)
    cudnn.activationBackward_v4(
        handle, act_desc.value, one.data,
        desc.value, y.data.ptr,
        desc.value, gy.data.ptr,
        desc.value, x.data.ptr,
        zero.data,
        desc.value, gx.data.ptr)
    return gx
def backward(self, indexes, gx):
    """Back-propagate Chainer gradients through the wrapped torch graph.

    Args:
        indexes: Positions in ``self.torch_x`` whose gradients are wanted.
        gx: Upstream gradients, one per output; entries may be ``None``.

    Returns:
        A tuple of ``chainer.Variable`` objects (``requires_grad=False``),
        one per entry of ``indexes``.
    """
    device = torch.device("cuda:" + str(self.gpu))

    # Allocate a float32 torch buffer for each gradient that is present.
    # Missing gradients stay ``None``, matching the guards below; before,
    # a ``None`` entry crashed on ``.shape`` during allocation.
    torch_y_grad = [
        None if g is None else torch.zeros(
            tuple(g.shape), device=device, requires_grad=False,
            dtype=torch.float32).contiguous()
        for g in gx
    ]

    # First copy every upstream gradient into its torch buffer ...
    for g, buf in zip(gx, torch_y_grad):
        if g is not None:
            datacopy(buf.data.data_ptr(),
                     cupy.ascontiguousarray(g.data),
                     buf.numel())

    # ... then run torch's backward pass for each of those outputs.
    for i, buf in enumerate(torch_y_grad):
        if buf is not None:
            self.torch_y[i].backward(buf)

    x_grad = []
    for idx in indexes:
        # Keep the contiguous tensor referenced while its raw pointer is
        # in use, so a temporary copy cannot be released before the copy.
        torch_grad = self.torch_x[idx].grad.contiguous()
        out = cupy.ascontiguousarray(
            cupy.zeros(tuple(torch_grad.shape), dtype=cupy.float32))
        datacopy(out, torch_grad.data_ptr(), out.size)
        x_grad.append(out)

    # Not support Double Backprop for now
    return tuple(chainer.Variable(x, requires_grad=False) for x in x_grad)
def _fftn(a, s, axes, norm, direction, value_type='C2C', order='A',
          plan=None, overwrite_x=False, out=None):
    """Shared driver for the N-dimensional FFT entry points.

    Args:
        a: Input array.
        s: Requested shape along the transformed axes, or ``None``.
        axes: Axes to transform, or ``None``.
        norm: ``None`` or ``'ortho'``.
        direction: Transform direction, forwarded to ``_exec_fftn``.
        value_type: Transform kind, e.g. ``'C2C'``.
        order: ``'C'``, ``'F'``, or ``'A'`` to follow the layout of ``a``.
        plan, overwrite_x, out: Forwarded to ``_exec_fftn``.

    Returns:
        The transformed array. With no axis to transform, ``a`` itself is
        returned for ``'C2C'``.

    Raises:
        ValueError: For an invalid ``norm`` or ``order``, or when a
            dimension of the prepared input is smaller than 1.
        IndexError: When there is no axis to transform and the transform
            is not ``'C2C'``.
    """
    if norm not in (None, 'ortho'):
        raise ValueError('Invalid norm value %s, should be None or "ortho".'
                         % norm)

    axes, axes_sorted = _prep_fftn_axes(a.ndim, s, axes, value_type)
    if not axes_sorted:
        if value_type != 'C2C':
            raise IndexError('list index out of range')
        return a

    a = _convert_dtype(a, value_type)

    if order == 'A':
        # Follow the input layout; anything else is copied to C order.
        flags = a.flags
        if flags.f_contiguous:
            order = 'F'
        elif flags.c_contiguous:
            order = 'C'
        else:
            a = cupy.ascontiguousarray(a)
            order = 'C'
    elif order not in ['C', 'F']:
        raise ValueError('Unsupported order: {}'.format(order))

    # Note: need to call _cook_shape prior to sorting the axes
    a = _cook_shape(a, s, axes, value_type, order=order)

    for n in a.shape:
        if n < 1:
            raise ValueError(
                'Invalid number of FFT data points (%d) specified.' % n)

    # Cooking the shape may have broken the requested layout; restore it.
    if order == 'C':
        if not a.flags.c_contiguous:
            a = cupy.ascontiguousarray(a)
    elif order == 'F':
        if not a.flags.f_contiguous:
            a = cupy.asfortranarray(a)

    # _cook_shape tells us input shape only, and not output shape
    out_size = _get_fftn_out_size(a.shape, s, axes_sorted[-1], value_type)

    return _exec_fftn(a, direction, value_type, norm=norm, axes=axes_sorted,
                      overwrite_x=overwrite_x, plan=plan, out=out,
                      out_size=out_size)
def ensure_cugraph_obj(obj, nx_weight_attr=None, matrix_graph_type=None):
    """
    Convert the input obj - if possible - to a cuGraph Graph-type obj (Graph,
    DiGraph, etc.) and return a tuple of (cugraph Graph-type obj, original
    input obj type).

    cuGraph graph objects are returned unchanged. NetworkX ``Graph`` and
    ``DiGraph`` objects are converted with ``convert_from_nx``, using
    ``nx_weight_attr`` as the weight attribute. CuPy/SciPy sparse matrices
    are turned into an edge list ("source", "destination", "weight") and
    loaded into a new graph of type ``matrix_graph_type``, which defaults
    to ``Graph`` and must be ``Graph`` or ``DiGraph``.

    Raises TypeError for an unsupported obj type or matrix_graph_type.
    """
    # FIXME: importing here to avoid circular import
    from cugraph.structure import Graph, DiGraph, MultiGraph, MultiDiGraph
    from cugraph.utilities.nx_factory import convert_from_nx

    input_type = type(obj)

    if input_type in [Graph, DiGraph, MultiGraph, MultiDiGraph]:
        return (obj, input_type)

    if (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
        return (convert_from_nx(obj, weight=nx_weight_attr), input_type)

    is_cupy_matrix = input_type in CP_MATRIX_TYPES
    if not (is_cupy_matrix or (input_type in SP_MATRIX_TYPES)):
        raise TypeError(f"obj of type {input_type} is not supported.")

    if matrix_graph_type is None:
        matrix_graph_type = Graph
    elif matrix_graph_type not in [Graph, DiGraph]:
        raise TypeError(
            f"matrix_graph_type must be either a cugraph "
            f"Graph or DiGraph, got: {matrix_graph_type}"
        )

    # Compressed formats are converted to COO to expose row/col/data.
    compressed_types = (
        CP_COMPRESSED_MATRIX_TYPES + SP_COMPRESSED_MATRIX_TYPES
    )
    coo = obj.tocoo(copy=False) if input_type in compressed_types else obj

    if is_cupy_matrix:
        columns = {
            "source": cp.ascontiguousarray(coo.row),
            "destination": cp.ascontiguousarray(coo.col),
            "weight": cp.ascontiguousarray(coo.data),
        }
    else:
        columns = {
            "source": coo.row,
            "destination": coo.col,
            "weight": coo.data,
        }
    df = cudf.DataFrame(columns)

    # FIXME:
    # * do a quick check that symmetry is stored explicitly in the cupy
    #   data for sym matrices (ie. for each uv, check vu is there)
    # * populate the cugraph graph with directed data and set renumbering
    #   to false in from edge list call.
    G = matrix_graph_type()
    G.from_cudf_edgelist(df, edge_attr="weight", renumber=True)
    return (G, input_type)
def _fftn(a, s, axes, norm, direction, value_type='C2C', order='A',
          plan=None, overwrite_x=False, out=None):
    """Shared driver for the N-dimensional FFT entry points.

    Args:
        a: Input array.
        s: Requested shape along the transformed axes, or ``None``.
        axes: Axes to transform. ``None`` selects the last ``len(s)`` axes,
            or every axis when ``s`` is also ``None``.
        norm: ``None`` or ``'ortho'``.
        direction: Transform direction, forwarded to ``_exec_fftn``.
        value_type: Transform kind, e.g. ``'C2C'``.
        order: ``'C'``, ``'F'``, or ``'A'`` to follow the layout of ``a``.
        plan, overwrite_x, out: Forwarded to ``_exec_fftn``.

    Raises:
        ValueError: For an invalid ``norm`` or ``order``, or when ``s`` and
            ``axes`` have different lengths.
    """
    if norm not in (None, 'ortho'):
        raise ValueError('Invalid norm value %s, should be None or "ortho".'
                         % norm)

    a = _convert_dtype(a, value_type)

    if (s is not None) and (axes is not None) and len(s) != len(axes):
        raise ValueError('Shape and axes have different lengths.')

    if axes is None:
        dim = a.ndim if s is None else len(s)
        axes = [i for i in six.moves.range(-dim, 0)]
    axes = tuple(axes)

    if order == 'A':
        # Follow the input layout; anything else is copied to C order.
        flags = a.flags
        if flags.f_contiguous:
            order = 'F'
        elif flags.c_contiguous:
            order = 'C'
        else:
            a = cupy.ascontiguousarray(a)
            order = 'C'
    elif order not in ['C', 'F']:
        raise ValueError('Unsupported order: {}'.format(order))

    a = _cook_shape(a, s, axes, value_type, order=order)

    # Cooking the shape may have broken the requested layout; restore it.
    if order == 'C':
        if not a.flags.c_contiguous:
            a = cupy.ascontiguousarray(a)
    elif order == 'F':
        if not a.flags.f_contiguous:
            a = cupy.asfortranarray(a)

    # sort the provided axes in ascending order
    axes = tuple(sorted(np.mod(axes, a.ndim)))

    return _exec_fftn(a, direction, value_type, norm=norm, axes=axes,
                      overwrite_x=overwrite_x, plan=plan, out=out)
def _apply_conv_mode(full, s1, s2, mode, axes):
    """Crop a full convolution result to the requested output mode.

    See scipy's documentation in scipy.signal.signaltools.

    Args:
        full: Result of the full convolution.
        s1, s2: Shapes of the two convolved inputs.
        mode: ``'full'`` returns ``full`` as is. ``'valid'`` shrinks the
            convolved axes to ``s1[a] - s2[a] + 1``. Any other value crops
            to ``s1``.
        axes: Axes over which the convolution was computed.

    Returns:
        A C-contiguous array holding the centred crop.
    """
    if mode == 'full':
        return cupy.ascontiguousarray(full)

    if mode == 'valid':
        # Only the convolved axes shrink; the others keep their extent.
        s1 = [s1[a] - s2[a] + 1 if a in axes else full.shape[a]
              for a in range(full.ndim)]

    # Take the centred window of the target shape.
    window = []
    for current, target in zip(full.shape, s1):
        begin = (current - target) // 2
        window.append(slice(begin, begin + target))
    return cupy.ascontiguousarray(full[tuple(window)])
def fwd_ptycho(self, psi, prb, scan):
    """Ptychography transform (FQ)

    Allocates a zeroed complex64 result of shape
    ``[nscan, ndet, ndet]`` and lets the wrapped routine ``self.fwd``
    fill it from the raw device pointers of the inputs.
    """
    out_shape = [self.nscan, self.ndet, self.ndet]
    res = cp.zeros(out_shape, dtype='complex64')

    # The wrapper works on raw pointers, so the inputs are made
    # C-contiguous first
    psi, prb, scan = (cp.ascontiguousarray(arr) for arr in (psi, prb, scan))

    # run C wrapper
    igpu = 0
    self.fwd(res.data.ptr, psi.data.ptr, prb.data.ptr, scan.data.ptr, igpu)
    return res
def adj_ptycho(self, data, prb, scan):
    """Adjoint ptychography transform (Q*F*)

    Allocates a zeroed complex64 result of shape ``[nz, n]`` and lets the
    wrapped routine ``self.adj`` fill it from the raw device pointers of
    the inputs. ``data`` is copied first, so the wrapper never receives
    the caller's buffer.
    """
    res = cp.zeros([self.nz, self.n], dtype='complex64')

    # The wrapper works on raw pointers, so the inputs are made
    # C-contiguous first
    data = cp.ascontiguousarray(data.copy())
    prb = cp.ascontiguousarray(prb)
    scan = cp.ascontiguousarray(scan)

    # run C wrapper
    igpu = 0
    self.adj(res.data.ptr, data.data.ptr, prb.data.ptr, scan.data.ptr, igpu)
    return res
def load_mc(self, t_ij):
    """Load Monte-Carlo samples and prepare the density-estimate state.

    Steps:
      1. Keep only samples strictly inside every observable's
         ``(low, high)`` range.
      2. Store the samples (``self.t_ij``) with unit weights
         (``self.w_i``).
      3. If ``self.bootstrap_binning`` is set, build a normalised
         histogram in ``self.counts``.
      4. Compute the per-axis spread ``self.sigma_j`` and the bandwidths
         ``self.h_ij`` (via ``self._adapt_bandwidth``).
      5. For every axis flagged in ``self.reflect_axes``, append samples
         mirrored about the lower and upper bounds. A tuple flag
         ``(low, high)`` restricts mirroring to samples below ``low`` or
         above ``high``; any other truthy flag mirrors all samples.

    Args:
        t_ij: 2-D array of samples, one row per sample and one column per
            observable.
    """
    bounds = list(zip(self.observables.lows, self.observables.highs))

    for j, (l, h) in enumerate(bounds):
        in_bounds = np.logical_and(t_ij[:, j] > l, t_ij[:, j] < h)
        t_ij = t_ij[in_bounds]

    self.t_ij = cp.asarray(t_ij)
    self.w_i = cp.ones(t_ij.shape[0])

    if self.bootstrap_binning is not None:
        counts, _ = cp.histogramdd(cp.asarray(self.t_ij),
                                   bins=self.bin_edges,
                                   weights=cp.asarray(self.w_i))
        self.counts = (cp.asarray(counts).flatten() / self.bin_vol /
                       cp.sum(cp.asarray(counts))).reshape(counts.shape)

    self.sigma_j = cp.std(self.t_ij, axis=0)
    self.h_ij = self._adapt_bandwidth()

    for j, ((l, h), refl) in enumerate(zip(bounds, self.reflect_axes)):
        if not refl:
            continue

        if isinstance(refl, tuple):
            low, high = refl

            mask = self.t_ij[:, j] < low
            t_ij_reflected_low = cp.copy(self.t_ij[mask, :])
            h_ij_reflected_low = self.h_ij[mask, :]
            # w_i is 1-D (one weight per sample), so it is indexed with
            # the mask alone; ``[mask, :]`` raised IndexError.
            w_i_reflected_low = self.w_i[mask]
            t_ij_reflected_low[:, j] = 2 * l - t_ij_reflected_low[:, j]

            mask = self.t_ij[:, j] > high
            t_ij_reflected_high = cp.copy(self.t_ij[mask, :])
            h_ij_reflected_high = self.h_ij[mask, :]
            w_i_reflected_high = self.w_i[mask]
            t_ij_reflected_high[:, j] = 2 * h - t_ij_reflected_high[:, j]
        else:
            t_ij_reflected_low = cp.copy(self.t_ij)
            h_ij_reflected_low = self.h_ij
            w_i_reflected_low = self.w_i
            t_ij_reflected_low[:, j] = 2 * l - self.t_ij[:, j]

            t_ij_reflected_high = cp.copy(self.t_ij)
            h_ij_reflected_high = self.h_ij
            w_i_reflected_high = self.w_i
            t_ij_reflected_high[:, j] = 2 * h - self.t_ij[:, j]

        self.t_ij = cp.concatenate(
            [self.t_ij, t_ij_reflected_low, t_ij_reflected_high])
        self.h_ij = cp.concatenate(
            [self.h_ij, h_ij_reflected_low, h_ij_reflected_high])
        self.w_i = cp.concatenate(
            [self.w_i, w_i_reflected_low, w_i_reflected_high])

    self.t_ij = cp.ascontiguousarray(self.t_ij)
    self.h_ij = cp.ascontiguousarray(self.h_ij)
    self.w_i = cp.ascontiguousarray(self.w_i)
def __init__(self, data):
    """Allocate the hash-table buffers and insert the keys from ``data``.

    The table has ``2**24`` slots. ``indices`` starts at ``-1`` in every
    slot, while ``values`` and ``value_list`` hold ``dim`` int32 entries
    per slot, where ``dim`` is the size of the last axis of ``data``.
    The table is populated by ``self.init_keys(data)``.
    """
    self.table_size = 2**24
    self.hash_factor = 2531011
    self.dim = data.shape[-1]

    slot_shape = (self.table_size, )
    row_shape = (self.table_size, self.dim)

    self.indices = cp.ascontiguousarray(cp.zeros(slot_shape, 'int32')) - 1
    self.values = cp.ascontiguousarray(cp.zeros(row_shape, 'int32'))
    self.value_list = cp.ascontiguousarray(cp.zeros(row_shape, 'int32'))
    self.size = None

    self.init_keys(data)
def reduced_binary_einsum(arr0, sub0, arr1, sub1, sub_others):
    """Contract two already-reduced einsum operands.

    Args:
        arr0, arr1: The operand arrays.
        sub0, sub1: Subscript sequences of the operands; each must be free
            of repeated labels.
        sub_others: Labels still needed by the remaining operands or the
            output; it decides which shared labels are batch labels and
            which are contracted.

    Returns:
        A tuple ``(array, subscripts)`` describing the result.
    """
    labels0 = set(sub0)
    labels1 = set(sub1)
    assert len(labels0) == len(sub0), 'operand 0 should be reduced: diagonal'
    assert len(labels1) == len(sub1), 'operand 1 should be reduced: diagonal'

    # A scalar operand only scales the other one.
    if len(sub0) == 0 or len(sub1) == 0:
        return arr0 * arr1, sub0 + sub1

    set_others = set(sub_others)
    shared = labels0 & labels1
    batch_dims = shared & set_others
    contract_dims = shared - batch_dims

    bs0, cs0, ts0 = _make_transpose_axes(sub0, batch_dims, contract_dims)
    bs1, cs1, ts1 = _make_transpose_axes(sub1, batch_dims, contract_dims)

    sub_b = [sub0[axis] for axis in bs0]
    assert sub_b == [sub1[axis] for axis in bs1]
    sub_l = [sub0[axis] for axis in ts0]
    sub_r = [sub1[axis] for axis in ts1]

    sub_out = sub_b + sub_l + sub_r
    assert set(sub_out) <= set_others, 'operands should be reduced: unary sum'

    cutensor_ok = _use_cutensor(arr0.dtype, sub0, arr1.dtype, sub1,
                                batch_dims, contract_dims)
    if cutensor_ok:
        if len(sub_out) == len(sub_others):
            # Every remaining label is produced here, so use their order.
            sub_out = sub_others
        out_shape = _get_out_shape(
            arr0.shape, sub0, arr1.shape, sub1, sub_out)
        arr_out = cupy.empty(out_shape, arr0.dtype)
        arr0 = cupy.ascontiguousarray(arr0)
        arr1 = cupy.ascontiguousarray(arr1)
        desc_0 = cutensor.create_tensor_descriptor(arr0)
        desc_1 = cutensor.create_tensor_descriptor(arr1)
        desc_out = cutensor.create_tensor_descriptor(arr_out)
        arr_out = cutensor.contraction(
            1.0,
            arr0, desc_0, sub0,
            arr1, desc_1, sub1,
            0.0,
            arr_out, desc_out, sub_out)
        return arr_out, sub_out

    # Fallback: flatten both operands to (batch, rows, cols) and matmul.
    tmp0, shapes0 = _flatten_transpose(arr0, [bs0, ts0, cs0])
    tmp1, shapes1 = _flatten_transpose(arr1, [bs1, cs1, ts1])
    shapes_out = shapes0[0] + shapes0[1] + shapes1[2]
    assert shapes0[0] == shapes1[0]
    product = cupy.matmul(tmp0, tmp1)
    return product.reshape(shapes_out), sub_out
def activation_backward(x, y, gy, mode):
    """Compute the activation gradient with cuDNN's v3 backward call.

    ``x`` and ``gy`` are made C-contiguous. One tensor descriptor, built
    from the 4-D view of ``y``, describes ``y``, ``gy``, ``x`` and the
    result.

    Returns:
        A new array shaped like ``x`` holding the gradient w.r.t. ``x``.
    """
    x = cupy.ascontiguousarray(x)
    gy = cupy.ascontiguousarray(gy)
    gx = cupy.empty_like(x)

    # alpha = 1 and beta = 0, as host scalars of the matching width.
    host_dtype = 'd' if x.dtype == 'd' else 'f'
    one = numpy.array(1, dtype=host_dtype).ctypes
    zero = numpy.array(0, dtype=host_dtype).ctypes

    handle = get_handle()
    y_mat = _as4darray(y)
    desc = create_tensor_descriptor(y_mat)
    cudnn.activationBackward_v3(
        handle, mode, one.data,
        desc.value, y.data.ptr,
        desc.value, gy.data.ptr,
        desc.value, x.data.ptr,
        zero.data,
        desc.value, gx.data.ptr)
    return gx
def find(self, data):
    """Look up the stored index of every key in ``data``.

    Each key is the last axis of ``data`` (``self.dim`` int32 values).
    The table is probed linearly for at most 100 slots.

    Returns:
        An int32 array shaped like ``data`` without its last axis, holding
        the stored index of each key, or ``-1`` where the probe reached an
        empty slot.

    Raises:
        Exception: If a lookup exhausted its 100 probes without finding
            the key or an empty slot.
    """
    ret = cp.ascontiguousarray(cp.zeros(data.shape[:-1], 'int32')) - 1
    data = cp.ascontiguousarray(data)
    loop_indices = cp.arange(data.size / self.dim).astype('int32')
    # Set to -1 by the kernel when a probe sequence is exhausted.
    ok = cp.zeros((1, ), 'int32')

    source = string.Template('''
        /* */
        int* value = &data[j * ${dim}];
        int bn = i / ${num_points};

        /* compute initial key */
        unsigned int key = 0;
        for (int k = 0; k < ${dim}; k++) key = (key + value[k]) * ${hash_factor};
        key = key % ${table_size};

        for (int l = 0; l < 100; l++) {
            if (indices[bn * ${table_size} + key] < 0) {
                ret[j] = -1;
                break;
            }
            bool match = true;
            for (int k = 0; k < ${dim}; k++) if (values[(bn * ${table_size} + key) * ${dim} + k] != value[k]) match = false;
            if (match) {
                ret[j] = indices[bn * ${table_size} + key];
                break;
            } else {
                key = (key + 1) % ${table_size};
            }
            if (l == 99) {
                ok[0] = -1;
            }
        }
    ''').substitute(
        table_size=self.table_size,
        hash_factor=self.hash_factor,
        num_points=data.shape[1],
        dim=self.dim,
    )
    kernel = chainer.cuda.elementwise(
        'int32 j, raw int32 data, raw int32 indices, raw int32 values, raw int32 ret, raw int32 ok',
        '',
        source,
        'function',
    )
    kernel(loop_indices, data, self.indices, self.values, ret, ok)

    if int(ok[0]) < 0:
        raise Exception
    return ret
def forward_gpu(self, inputs):
    """Gather feature slices from ``data_in`` according to ``indices``.

    Layout noted by the original author:
        data_in:  [bs, nf, ...]
        indices:  [bs, is, is]
        data_out: [bs, is, is, ..]

    Each non-negative index copies the matching feature slice into the
    output. Negative indices leave their zero-initialised output slice
    untouched.

    Returns:
        A one-element tuple containing ``data_out`` (float32).
    """
    data_in, indices = [cp.ascontiguousarray(arr) for arr in inputs]

    out_shape = tuple(indices.shape[:3]) + tuple(data_in.shape[2:])
    data_out = cp.ascontiguousarray(cp.zeros(out_shape, 'float32'))

    source = string.Template('''
        if (index < 0) return;

        int bn = i / (${image_size} * ${image_size});
        int pos_from = bn * ${num_features} * ${dim} + index * ${dim};
        int pos_to = i * ${dim};
        float* p1 = (float*)&data_in[pos_from];
        float* p2 = (float*)&data_out[pos_to];
        for (int j = 0; j < ${dim}; j++) *p2++ = *p1++;
    ''').substitute(
        image_size=indices.shape[1],
        num_features=data_in.shape[1],
        # Number of scalars per feature: product of the trailing axes.
        dim=functools.reduce(lambda x, y: x * y, data_in.shape[2:]),
    )
    kernel = chainer.cuda.elementwise(
        'raw float32 data_in, int32 index, raw float32 data_out',
        '',
        source,
        'function',
    )
    kernel(data_in, indices, data_out)
    return data_out,
def _filter_input(image, prefilter, mode, cval, order):
    """Perform spline prefiltering when needed.

    Spline orders > 1 need a prefiltering stage to preserve resolution.
    In that case the image is pre-padded by ``_prepad_for_spline_filter``
    and then run through ``spline_filter``.

    Returns:
        A tuple ``(array, npad)``: a C-contiguous array and the padding
        reported by ``_prepad_for_spline_filter``. ``npad`` is ``0`` when
        no prefiltering is done. The filtered array's dtype is the
        promotion of the input dtype with float32.
    """
    needs_prefilter = prefilter and order >= 2
    if not needs_prefilter:
        return (cupy.ascontiguousarray(image), 0)

    padded, npad = _prepad_for_spline_filter(image, mode, cval)
    float_dtype = cupy.promote_types(image.dtype, cupy.float32)
    filtered = spline_filter(padded, order, output=float_dtype, mode=mode)
    return cupy.ascontiguousarray(filtered), npad
def _estimate_pdf_multi(self, x_kj, w_i=None, get=True):
    """Evaluate the estimated PDF at several points.

    Args:
        x_kj: Evaluation points, one row per point.
        w_i: Optional per-sample weights; defaults to ``self.w_i``.
        get: On the GPU kernel path, return a host array (``.get()``)
            when true and the device array otherwise. Ignored on the
            other paths.

    Returns:
        An array with one PDF value per evaluation point.
    """
    # ``is None`` rather than ``== None``: comparing an array against None
    # is element-wise, and the resulting array cannot be used as a
    # condition.
    if w_i is None:
        w_i = self.w_i

    if self.bootstrap_binning is None:
        n = self.t_ij.shape[0]
        h_j = (4 / 3 / n)**(1 / 5) * self.sigma_j

        if cp == np:
            # NOTE(review): this path uses self.w_i and ignores a
            # caller-supplied w_i -- confirm whether that is intended.
            return np.asarray([
                KernelDensityPDF._kdpdf0(x_j, self.t_ij, h_j, self.w_i)
                for x_j in x_kj
            ])

        x_kj = cp.asarray(x_kj)
        h_j = cp.ascontiguousarray(cp.asarray(h_j))
        pdf_k = cp.empty(x_kj.shape[0])
        block_size = 64
        grid_size = x_kj.shape[0] // block_size + 1
        KernelDensityPDF._kdpdf0_multi(
            (grid_size, ), (block_size, ),
            (x_kj, self.t_ij, h_j, w_i, self.t_ij.shape[0],
             self.t_ij.shape[1], x_kj.shape[0], pdf_k))
        # NOTE(review): normalised by the stored weights even when a
        # custom w_i was passed to the kernel -- confirm.
        pdf_k = pdf_k / cp.sum(self.w_i)
        return pdf_k.get() if get else pdf_k

    # should do this on GPU...
    x_kj = cp.asnumpy(x_kj)
    from scipy.interpolate import RegularGridInterpolator
    interp = RegularGridInterpolator(cp.asnumpy(self.bin_centers),
                                     cp.asnumpy(self.counts),
                                     bounds_error=False,
                                     fill_value=None)
    pdf_k = cp.asarray(interp(x_kj))
    # Interpolated values are clipped from below at the smallest bin value.
    min_val = np.min(self.counts)
    pdf_k[pdf_k < min_val] = min_val
    return pdf_k
def _as_contiguous(args):
    """Return ``args`` with every array made C-contiguous.

    Args:
        args: A single array, or a list/tuple of arrays in which ``None``
            entries are allowed and preserved.

    Returns:
        A list when a list or tuple was given, otherwise a single array.
        Arrays that are already C-contiguous are passed through unchanged.
    """
    def _ensure(arr):
        # Copy only when the array reports that it is not C-contiguous.
        if arr.flags.c_contiguous is False:
            return cupy.ascontiguousarray(arr)
        return arr

    if isinstance(args, (list, tuple)):
        return [None if arg is None else _ensure(arg) for arg in args]
    return _ensure(args)
def _call_kernel(kernel, input, weights, output, weight_dtype=cupy.float64):
    """
    Calls a constructed ElementwiseKernel. The kernel must take an input
    image, an array of weights, and an output array.

    The weights are the only optional part and can be passed as None, in
    which case one less argument is passed to the kernel.

    This function makes sure that the weights are contiguous and cast to
    ``weight_dtype``. It also handles the case where the input and output
    arrays may overlap in memory: the kernel then writes to a temporary
    array that is copied back into ``output`` afterwards.

    As written here, ``output`` must already be an array; it is not
    allocated by this function except for that temporary.

    Note: the weights are cast (float64 by default) in order to get an
    output compatible with SciPy, though float32 might be sufficient when
    the input dtype is low precision.
    """
    if weights is not None:
        weights = cupy.ascontiguousarray(weights, weight_dtype)

    overlaps = cupy.shares_memory(output, input, "MAY_SHARE_BOUNDS")
    final_output = output
    if overlaps:
        # Let the kernel write into scratch space instead of the input.
        output = _util._get_output(output.dtype, input, None, weight_dtype)

    kernel_args = (
        (input, output) if weights is None else (input, weights, output)
    )
    kernel(*kernel_args)

    if overlaps:
        final_output[...] = output[...]
        output = final_output
    return output
def _correlate(in1, in2, mode='full', method='auto', convolution=False):
    """Correlate (or convolve) two N-dimensional arrays.

    Args:
        in1, in2: The input arrays.
        mode: Output-size mode forwarded to the underlying routines.
        method: ``'direct'``, ``'fft'``, or ``'auto'`` to let
            ``choose_conv_method`` decide.
        convolution: Compute a convolution instead of a correlation.

    Returns:
        The correlation/convolution result. On the FFT path it is cast to
        the result type of the inputs, with rounding for integer types.

    Raises:
        ValueError: If ``method`` is not one of the accepted names.
    """
    shortcut = _st_core._check_conv_inputs(in1, in2, mode, convolution)
    if shortcut is not None:
        return shortcut

    if method not in ('auto', 'direct', 'fft'):
        raise ValueError('acceptable methods are "auto", "direct", or "fft"')

    if method == 'auto':
        method = choose_conv_method(in1, in2, mode=mode)

    if method == 'direct':
        return _st_core._direct_correlate(in1, in2, mode, in1.dtype,
                                          convolution)

    # From here on the method is 'fft'.
    inputs_swapped = _st_core._inputs_swap_needed(mode, in1.shape, in2.shape)
    if inputs_swapped:
        in1, in2 = in2, in1
    if not convolution:
        # A correlation is a convolution with the reversed conjugate.
        in2 = _st_core._reverse_and_conj(in2)

    out = fftconvolve(in1, in2, mode)

    result_type = cupy.result_type(in1, in2)
    if result_type.kind in 'ui':
        out = out.round()
    out = out.astype(result_type, copy=False)

    if inputs_swapped and not convolution:
        # Undo the operand swap for the correlation result.
        out = cupy.ascontiguousarray(_st_core._reverse_and_conj(out))
    return out
def _check_size_footprint_structure(ndim, size, footprint, structure,
                                    stacklevel=3, force_footprint=False):
    """Normalise the ``size`` / ``footprint`` / ``structure`` arguments.

    Returns:
        A tuple ``(sizes, footprint, structure)`` in which unused entries
        are ``None``:

        * only ``size`` given: ``(sizes, None, None)``, or an all-ones
          footprint of that size when ``force_footprint`` is set;
        * ``footprint`` without ``structure``: ``(shape, None, None)``
          when the footprint is all true and ``force_footprint`` is not
          set, else ``(None, footprint, None)``;
        * ``structure`` given: ``(None, footprint, structure)``, with an
          all-ones footprint when none was supplied.

    Raises:
        RuntimeError: If nothing was provided, or an all-true footprint
            has a rank different from ``ndim``.
        ValueError: If the footprint is all zero.

    A ``UserWarning`` is issued when ``size`` is given together with a
    footprint or structure, since it is then ignored.
    """
    no_footprint = footprint is None

    if structure is None and no_footprint:
        if size is None:
            raise RuntimeError("no footprint or filter size provided")
        sizes = _util._fix_sequence_arg(size, ndim, 'size', int)
        if not force_footprint:
            return sizes, None, None
        return None, cupy.ones(sizes, bool), None

    if size is not None:
        winner = 'structure' if no_footprint else 'footprint'
        warnings.warn("ignoring size because {} is set".format(winner),
                      UserWarning, stacklevel=stacklevel + 1)

    if not no_footprint:
        footprint = cupy.array(footprint, bool, True, 'C')
        if not footprint.any():
            raise ValueError("all-zero footprint is not supported")

    if structure is not None:
        structure = cupy.ascontiguousarray(structure)
        if footprint is None:
            footprint = cupy.ones(structure.shape, bool)
        return None, footprint, structure

    # Footprint only: an all-true footprint is the same as a plain size.
    if not force_footprint and footprint.all():
        if footprint.ndim != ndim:
            raise RuntimeError("size must have length equal to input rank")
        return footprint.shape, None, None
    return None, footprint, None
def from_4Dcamera_file(filename):
    """Build a ``Sparse4DData`` object from an electron-event HDF5 file.

    The file must provide ``electron_events/frames`` (one ragged event
    list per scan position) and ``electron_events/scan_positions`` with
    ``Ny``/``Nx`` attributes. Event lists are padded to a common length
    with the int32 maximum, reshaped to the scan grid, and the last scan
    column is dropped. ``counts`` marks every non-padding entry.

    The frame size is fixed at 576 x 576.
    """
    with h5py.File(filename, 'r') as f0:
        frames = f0['electron_events/frames'][:]
        scan_attrs = f0['electron_events/scan_positions'].attrs
        scan_dimensions = (scan_attrs['Ny'], scan_attrs['Nx'])

    frame_dimensions = np.array((576, 576))

    def unragged_frames_size(frames):
        # Length of the longest event list (0 when there are no frames).
        return max((ev.shape[0] for ev in frames), default=0)

    def make_unragged_frames(frames, scan_dimensions):
        flat = frames.ravel()
        width = unragged_frames_size(flat)
        fr_full = cp.zeros((flat.shape[0], width), dtype=cp.int32)
        # Padding value: the largest representable index.
        fr_full[:] = cp.iinfo(fr_full.dtype).max
        for row, ev in enumerate(flat):
            fr_full[row, :ev.shape[0]] = cp.array(ev)
        fr_full_4d = fr_full.reshape((*scan_dimensions, fr_full.shape[1]))
        # Drop the last scan column.
        return fr_full_4d[:, :-1, :]

    d = Sparse4DData()
    d.indices = cp.ascontiguousarray(
        make_unragged_frames(frames.ravel(), scan_dimensions))
    d.scan_dimensions = np.array(d.indices.shape[:2])
    d.frame_dimensions = frame_dimensions

    padding = cp.iinfo(d.indices.dtype).max
    d.counts = cp.zeros(d.indices.shape, dtype=cp.bool)
    d.counts[d.indices != padding] = 1
    return d
def activation_forward(x, mode):
    """Apply a cuDNN activation (v3 API) to ``x`` and return the result.

    ``x`` is made C-contiguous. Its 4-D view supplies both the tensor
    descriptor and the input pointer, and the output is a new array
    shaped like ``x``.
    """
    x = cupy.ascontiguousarray(x)
    y = cupy.empty_like(x)

    # alpha = 1 and beta = 0, as host scalars of the matching width.
    host_dtype = "d" if x.dtype == "d" else "f"
    one = numpy.array(1, dtype=host_dtype).ctypes
    zero = numpy.array(0, dtype=host_dtype).ctypes

    handle = get_handle()
    x_mat = _as4darray(x)
    desc = create_tensor_descriptor(x_mat)
    cudnn.activationForward_v3(
        handle, mode, one.data,
        desc.value, x_mat.data.ptr,
        zero.data,
        desc.value, y.data.ptr)
    return y
def _call_nms_kernel(bbox, thresh):
    """Launch the NMS CUDA kernel and post-process its mask on the host.

    Args:
        bbox: Array with one bounding box per row; converted to a
            C-contiguous float32 array before the launch.
        thresh: Threshold passed to the kernel as a float32.

    Returns:
        ``(selection, n_selec)`` as produced by ``_nms_gpu_post``.
    """
    n_bbox = bbox.shape[0]
    threads_per_block = 64
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)

    # One uint64 mask word per (box, column block) pair.
    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)

    kern = _load_kernel('nms_kernel', _nms_gpu_code)
    grid = (col_blocks, col_blocks, 1)
    block = (threads_per_block, 1, 1)
    kern(grid, block,
         args=(cp.int32(n_bbox), cp.float32(thresh), bbox, mask_dev))

    mask_host = mask_dev.get()
    selection, n_selec = _nms_gpu_post(
        mask_host, n_bbox, threads_per_block, col_blocks)
    return selection, n_selec
def test_ascontiguousarray_on_contiguous_array(self):
    """An already-contiguous array must be returned as the same object."""
    original = testing.shaped_arange((2, 3, 4))
    result = cupy.ascontiguousarray(original)
    self.assertIs(original, result)
def test_ascontiguousarray_on_noncontiguous_array(self):
    """A transposed view must yield a C-contiguous array with equal data."""
    base = testing.shaped_arange((2, 3, 4))
    transposed = base.transpose(2, 0, 1)
    result = cupy.ascontiguousarray(transposed)
    self.assertTrue(result.flags.c_contiguous)
    testing.assert_array_equal(transposed, result)
def __call__(self, loc, score, anchor, img_size, scale=1.):
    """Propose RoIs from anchor offsets and scores (inputs are ndarrays).

    Inputs :obj:`loc, score, anchor` refer to the same anchor when indexed
    by the same index.

    On notations, :math:`R` is the total number of anchors. This is equal
    to product of the height and the width of an image and the number of
    anchor bases per pixel.

    Args:
        loc (array): Predicted offsets and scaling to anchors.
            Its shape is :math:`(R, 4)`.
        score (array): Predicted foreground probability for anchors.
            Its shape is :math:`(R,)`.
        anchor (array): Coordinates of anchors.
            Its shape is :math:`(R, 4)`.
        img_size (tuple of ints): A tuple :obj:`height, width`,
            which contains image size after scaling.
        scale (float): The scaling factor used to scale an image after
            reading it from a file.

    Returns:
        array:
        An array of coordinates of proposal boxes.
        Its shape is :math:`(S, 4)`. :math:`S` is less than
        :obj:`self.n_test_post_nms` in test time and less than
        :obj:`self.n_train_post_nms` in train time. :math:`S` depends on
        the size of the predicted bounding boxes and the number of
        bounding boxes discarded by NMS.

    """
    # NOTE: for testing, call faster_rcnn.eval() first so that
    # parent_model.training is False and the test-time limits are used.
    training = self.parent_model.training
    n_pre_nms = self.n_train_pre_nms if training else self.n_test_pre_nms
    n_post_nms = self.n_train_post_nms if training else self.n_test_post_nms

    # Convert anchors into proposal via bbox transformations.
    roi = loc2bbox(anchor, loc)

    # Clip predicted boxes to image: columns 0 and 2 against img_size[0],
    # columns 1 and 3 against img_size[1].
    roi[:, 0:4:2] = np.clip(roi[:, 0:4:2], 0, img_size[0])
    roi[:, 1:4:2] = np.clip(roi[:, 1:4:2], 0, img_size[1])

    # Remove predicted boxes with either height or width < threshold.
    min_size = self.min_size * scale
    hs = roi[:, 2] - roi[:, 0]
    ws = roi[:, 3] - roi[:, 1]
    large_enough = (hs >= min_size) & (ws >= min_size)
    keep = np.where(large_enough)[0]
    roi = roi[keep, :]
    score = score[keep]

    # Sort all (proposal, score) pairs by score from highest to lowest.
    # Take top pre_nms_topN (e.g. 6000).
    order = score.ravel().argsort()[::-1]
    if n_pre_nms > 0:
        order = order[:n_pre_nms]
    roi = roi[order, :]

    # Apply nms (e.g. threshold = 0.7).
    # Take after_nms_topN (e.g. 300).

    # unNOTE: something is wrong here!
    # TODO: remove cuda.to_gpu
    roi_gpu = cp.ascontiguousarray(cp.asarray(roi))
    keep = non_maximum_suppression(roi_gpu, thresh=self.nms_thresh)
    if n_post_nms > 0:
        keep = keep[:n_post_nms]
    roi = roi[keep]
    return roi