def _create_contraction_plan(desc, algo, ws_pref):
    """Return a contraction plan together with its workspace buffer.

    The ``ContractionFind`` object is cached in ``_contraction_finds`` under
    ``(handle.ptr, algo)`` and the ``ContractionPlan`` object is cached in
    ``_contraction_plans`` under
    ``(handle.ptr, desc.ptr, find.ptr, ws_size)``; a cached object is reused
    when its key is already present.

    Workspace allocation is attempted first with ``ws_pref`` and then with
    ``cutensor.WORKSPACE_MIN``. Every failed attempt emits a warning.

    Args:
        desc: Contraction descriptor; passed to the cuTENSOR calls and its
            ``ptr`` attribute is used in the plan cache key.
        algo: Algorithm selector passed to ``initContractionFind``.
        ws_pref: Workspace preference that is tried first.

    Returns:
        tuple: ``(plan, ws, ws_size)`` where ``ws`` is an ``int8``
        ``cupy.ndarray`` holding ``ws_size`` elements.

    Raises:
        RuntimeError: If the workspace could not be allocated with either
            preference.
    """
    handle = get_handle()

    # Reuse the find object created earlier for this handle/algorithm pair.
    find_key = (handle.ptr, algo)
    if find_key not in _contraction_finds:
        new_find = cutensor.ContractionFind()
        cutensor.initContractionFind(handle, new_find, algo)
        _contraction_finds[find_key] = new_find
    find = _contraction_finds[find_key]

    warn_template = ('cuTENSOR: failed to allocate memory of workspace '
                     'with preference ({}) and size ({}).')
    for pref in (ws_pref, cutensor.WORKSPACE_MIN):
        ws_size = cutensor.contractionGetWorkspace(handle, desc, find, pref)
        try:
            ws = cupy.ndarray((ws_size, ), dtype=numpy.int8)
        except Exception:
            warnings.warn(warn_template.format(pref, ws_size))
        else:
            # Allocation succeeded; no need to try the fallback preference.
            break
    else:
        # Neither preference produced a usable workspace.
        raise RuntimeError('cuTENSOR: failed to allocate memory of workspace.')

    # Reuse the plan created earlier for the same configuration.
    plan_key = (handle.ptr, desc.ptr, find.ptr, ws_size)
    if plan_key not in _contraction_plans:
        new_plan = cutensor.ContractionPlan()
        cutensor.initContractionPlan(handle, new_plan, desc, find, ws_size)
        _contraction_plans[plan_key] = new_plan
    return _contraction_plans[plan_key], ws, ws_size
def contraction(alpha, A, desc_A, mode_A, B, desc_B, mode_B,
                beta, C, desc_C, mode_C,
                uop=cutensor.OP_IDENTITY, compute_dtype=None,
                algo=cutensor.ALGO_DEFAULT,
                ws_pref=cutensor.WORKSPACE_RECOMMENDED):
    """General tensor contraction

    This routine computes the tensor contraction:

        C = uop(alpha * uop_A(A) * uop_B(B) + beta * uop_C(C))

    See cupy/cuda/cutensor.contraction for details.

    The workspace is allocated with ``ws_pref`` first and, if that fails,
    with ``cutensor.WORKSPACE_MIN``; each failed attempt emits a warning.

    Args:
        alpha: Scaling factor for A * B.
        A (cupy.ndarray): Input tensor.
        desc_A (class Descriptor): A descriptor that holds the information
            about the data type, modes, and strides of tensor A.
        mode_A (tuple of int/str): A tuple that holds the labels of the
            modes of tensor A (e.g., if A_{x,y,z} => mode_A = {'x','y','z'}).
            String labels are converted with ``ord``.
        B (cupy.ndarray): Input tensor.
        desc_B (class Descriptor): A descriptor that holds the information
            about the data type, modes, and strides of tensor B.
        mode_B (tuple of int/str): A tuple that holds the labels of the
            modes of tensor B.
        beta: Scaling factor for C.
        C (cupy.ndarray): Input tensor; its buffer is also passed as the
            output of the contraction.
        desc_C (class Descriptor): A descriptor that holds the information
            about the data type, modes, and strides of tensor C.
        mode_C (tuple of int/str): A tuple that holds the labels of the
            modes of tensor C.
        uop (cutensorOperator_t): The element-wise unary operator.
        compute_dtype (numpy.dtype): Compute type for the intermediate
            computation. When None, float32 is used for float16 inputs and
            the dtype of A otherwise.
        algo (cutensorAlgo_t): Allows users to select a specific algorithm.
            ALGO_DEFAULT lets the heuristic choose the algorithm.
            Any value >= 0 selects a specific GEMM-like algorithm and
            deactivates the heuristic. If a specified algorithm is not
            supported, STATUS_NOT_SUPPORTED is returned.
        ws_pref (cutensorWorksizePreference_t): User preference for the
            workspace of cuTensor.

    Returns:
        out (cupy.ndarray): Output tensor (the same object as ``C``).

    Raises:
        RuntimeError: If the workspace could not be allocated with either
            preference.
    """
    def _encode_modes(labels):
        # Single-character string labels become their code points; integer
        # labels are passed through unchanged.
        return numpy.array(
            [ord(label) if isinstance(label, str) else label
             for label in labels],
            dtype=numpy.int32)

    assert A.dtype == B.dtype == C.dtype
    assert A.ndim == len(mode_A)
    assert B.ndim == len(mode_B)
    assert C.ndim == len(mode_C)

    mode_A = _encode_modes(mode_A)
    mode_B = _encode_modes(mode_B)
    mode_C = _encode_modes(mode_C)

    # The contraction writes its result into C's buffer.
    out = C

    if compute_dtype is None:
        compute_dtype = (
            numpy.float32 if A.dtype == numpy.float16 else A.dtype)
    # Host-side scalars; their addresses are handed to cuTENSOR below.
    alpha = numpy.array(alpha, compute_dtype)
    beta = numpy.array(beta, compute_dtype)

    handle = get_handle()
    compute_dtype = get_cuda_dtype(compute_dtype)

    warn_template = ('cuTENSOR: failed to allocate memory of workspace '
                     'with preference ({}) and size ({}).')
    for pref in (ws_pref, cutensor.WORKSPACE_MIN):
        ws_size = cutensor.contractionGetWorkspace(
            handle,
            A.data.ptr, desc_A.value, mode_A.ctypes.data,
            B.data.ptr, desc_B.value, mode_B.ctypes.data,
            C.data.ptr, desc_C.value, mode_C.ctypes.data,
            out.data.ptr, desc_C.value, mode_C.ctypes.data,
            uop, compute_dtype, algo, pref)
        try:
            ws = cupy.ndarray((ws_size, ), dtype=numpy.int8)
        except Exception:
            warnings.warn(warn_template.format(pref, ws_size))
        else:
            # Allocation succeeded; skip the fallback preference.
            break
    else:
        # Neither preference produced a usable workspace.
        raise RuntimeError('cuTENSOR: failed to allocate memory of workspace.')

    cutensor.contraction(
        handle,
        alpha.ctypes.data,
        A.data.ptr, desc_A.value, mode_A.ctypes.data,
        B.data.ptr, desc_B.value, mode_B.ctypes.data,
        beta.ctypes.data,
        C.data.ptr, desc_C.value, mode_C.ctypes.data,
        out.data.ptr, desc_C.value, mode_C.ctypes.data,
        uop, compute_dtype, algo, ws.data.ptr, ws_size)
    return out