def test_generic_syntax_simple(self):
    """Check a generic KeOps ``Sum`` reduction against a dense NumPy reference.

    The formula ``Pow((X|Y),2) * ((Elem(P,0) * X) + (Elem(P,1) * Y))`` is
    reduced along axis 1 once per backend under test, and every result is
    compared to the NumPy computation with ``np.allclose(..., atol=1e-6)``.
    """
    ############################################################
    from pykeops.torch import Genred

    aliases = [
        'P = Pm(2)',  # 1st argument, a parameter, dim 2.
        'X = Vi({}) '.format(str(self.xc.shape[1])),  # 2nd argument, indexed by i, dim D.
        'Y = Vj({}) '.format(str(self.yc.shape[1])),  # 3rd argument, indexed by j, dim D.
    ]
    formula = 'Pow((X|Y),2) * ((Elem(P,0) * X) + (Elem(P,1) * Y))'

    # The explicit GPU backends are only exercised when pykeops reports a GPU.
    backend_to_test = (
        ['auto', 'GPU_1D', 'GPU_2D', 'GPU'] if pykeops.gpu_available else ['auto']
    )

    for backend in backend_to_test:
        with self.subTest(b=backend):
            reduction = Genred(formula, aliases, reduction_op='Sum', axis=1)
            gamma_keops = reduction(self.pc, self.xc, self.yc, backend=backend)

            # Numpy version
            # NOTE(review): self.p / self.x / self.y are presumably the NumPy
            # counterparts of self.pc / self.xc / self.yc -- confirm in setUp.
            scals = (self.x @ self.y.T)**2  # Memory-intensive computation!
            x_term = self.p[0] * scals.sum(1).reshape(-1, 1) * self.x
            y_term = self.p[1] * (scals @ self.y)
            gamma_py = x_term + y_term

            # compare output
            is_close = np.allclose(gamma_keops.cpu().data.numpy(), gamma_py, atol=1e-6)
            self.assertTrue(is_close)
def FeaturesKP(kernel, gs, xs, ys, bs, mode='sum', backend='auto', dtype='float32'):
    """Run a kernel product through a PyTorch routine or a KeOps ``Genred`` reduction.

    Args:
        kernel: object exposing ``routine_sum`` / ``routine_log`` (PyTorch path)
            and ``formula_sum`` / ``formula_log`` (KeOps path).
        gs: iterable of metric parameters; ``None`` entries are skipped, the
            others are decoded with ``extract_metric_parameters``.
        xs: sequence of tensors declared as ``Vi`` variables; ``x.size(1)`` is
            used as the variable dimension.
        ys: sequence of tensors declared as ``Vj`` variables; must have the
            same length as ``xs`` on the KeOps path.
        bs: sequence of tensors, paired with the categories stored in
            ``keops_routines[mode]`` (0 -> ``Vi``, 1 -> ``Vj``, 2 -> ``Pm``).
        mode: key into ``pytorch_routines`` / ``keops_routines``.
        backend: ``'pytorch'`` or ``'matrix'`` select the PyTorch path; any
            other value is forwarded to the KeOps routine.
        dtype: forwarded to ``Genred``.

    Returns:
        Whatever the selected routine returns.

    Raises:
        ValueError: if the PyTorch routine table yields a domain other than
            ``'sum'`` or ``'log'``, or if ``xs`` and ``ys`` differ in length
            on the KeOps path.
    """
    if backend in ('pytorch', 'matrix'):
        domain, torch_map = pytorch_routines[mode]
        if domain == 'sum':
            routine = kernel.routine_sum
        elif domain == 'log':
            routine = kernel.routine_log
        else:
            # Previously `routine` was left unbound here and the call below
            # failed with an opaque UnboundLocalError.
            raise ValueError(
                "Unsupported domain {!r} for mode {!r}: expected 'sum' or 'log'.".format(
                    domain, mode))
        return torch_map(routine, gs, xs, ys, bs, matrix=(backend == 'matrix'))

    red, formula, bs_cat = keops_routines[mode]
    formula = formula.format(f_sum=kernel.formula_sum, f_log=kernel.formula_log)

    # Validate before building anything: the formula pairs X_k with Y_k.
    if len(xs) != len(ys):
        raise ValueError(
            "Kernel_product works with pairs of variables. The 'x'-list of features should thus have the same length as the 'y' one."
        )

    # Given the output sizes, we must generate the appropriate list of aliases
    # We will store the arguments as follow :
    # [ G_0, G_1, ..., X_0, X_1, Y_0, Y_1, ...]
    full_args, aliases, index = [], [], 0  # tensor list, string list, current input arg

    # First, the G_i's
    for (i, g) in enumerate(gs):
        if g is not None:
            # example : Tensor(...), 3, 0, 'Vi'
            g_var, g_dim, _, g_str = extract_metric_parameters(g)
            aliases.append('G_{g_ind} = {g_str}({index}, {g_dim})'.format(
                g_ind=i, g_str=g_str, index=index, g_dim=g_dim))
            full_args.append(g_var)
            index += 1

    # Then, the X_i's
    for (i, x) in enumerate(xs):
        x_dim = x.size(1)
        aliases.append('X_{x_ind} = Vi({index}, {x_dim})'.format(
            x_ind=i, index=index, x_dim=x_dim))
        full_args.append(x)
        index += 1

    # Then, the Y_j's
    for (j, y) in enumerate(ys):
        y_dim = y.size(1)
        aliases.append('Y_{y_ind} = Vj({index}, {y_dim})'.format(
            y_ind=j, index=index, y_dim=y_dim))
        full_args.append(y)
        index += 1

    # Then, the B_i/j's
    for (i, (b, b_cat)) in enumerate(zip(bs, bs_cat)):
        b_dim = b.size(1)
        b_str = ['Vi', 'Vj', 'Pm'][b_cat]
        aliases.append('B_{b_ind} = {b_str}({index}, {b_dim})'.format(
            b_ind=i, b_str=b_str, index=index, b_dim=b_dim))
        full_args.append(b)
        index += 1

    axis = 1  # the output vector is indexed by 'i' (CAT=0)
    genconv = Genred(formula, aliases, reduction_op=red, axis=axis, dtype=dtype)
    return genconv(*full_args, backend=backend)
def generic_argkmin(formula, output, *aliases, **kwargs):
    r"""Build a :class:`torch.Genred <pykeops.torch.Genred>` routine that performs an "ArgKMin" reduction.

    Args:
        formula (string): Scalar-valued symbolic KeOps expression, as in
            :class:`torch.Genred <pykeops.torch.Genred>`.
        output (string): An identifier of the form ``"AL = TYPE(K)"`` that
            gives the category and dimension of the output variable, where:

            - ``AL`` is a dummy alphanumerical name.
            - ``TYPE`` is a *category*. One of:

              - ``Vi``: indexation by :math:`i` along axis 0; reduction is performed along axis 1.
              - ``Vj``: indexation by :math:`j` along axis 1; reduction is performed along axis 0.

            - ``K`` is an integer, the number of values to extract.

        *aliases (strings): List of identifiers, as in
            :class:`torch.Genred <pykeops.torch.Genred>`.

    Keyword Args:
        dtype (string, default = ``"float32"``): Specifies the numerical
            **dtype** of the input and output arrays. The supported values are:

              - **dtype** = ``"float16"`` or ``"half"``.
              - **dtype** = ``"float32"`` or ``"float"``.
              - **dtype** = ``"float64"`` or ``"double"``.

        All keyword arguments are forwarded unchanged to
        :class:`torch.Genred <pykeops.torch.Genred>`.

    Returns:
        A generic reduction that can be called on arbitrary Torch tensors, as
        documented in :class:`torch.Genred <pykeops.torch.Genred>`.

    Example:
        Bruteforce K-nearest neighbors search in dimension 100:

        >>> knn = generic_argkmin(
        ...     'SqDist(x, y)',   # Formula
        ...     'a = Vi(3)',      # Output: 3 scalars per line
        ...     'x = Vi(100)',    # 1st input: dim-100 vector per line
        ...     'y = Vj(100)')    # 2nd input: dim-100 vector per line
        >>> x = torch.randn(5,     100)
        >>> y = torch.randn(20000, 100)
        >>> a = knn(x, y)
        >>> print(a)
        tensor([[ 9054., 11653., 11614.],
                [13466., 11903., 14180.],
                [14164.,  8809.,  3799.],
                [ 2092.,  3323., 18479.],
                [14433., 11315., 11841.]])
        >>> print( (x - y[ a[:,0].long() ]).norm(dim=1) )  # Distance to the nearest neighbor
        tensor([10.7933, 10.3235, 10.1218, 11.4919, 10.5100])
        >>> print( (x - y[ a[:,1].long() ]).norm(dim=1) )  # Distance to the second neighbor
        tensor([11.3702, 10.6550, 10.7646, 11.5676, 11.1356])
        >>> print( (x - y[ a[:,2].long() ]).norm(dim=1) )  # Distance to the third neighbor
        tensor([11.3820, 10.6725, 10.8510, 11.6071, 11.1968])
    """
    # Only the category (-> reduction axis) and K are needed from `output`.
    _, category, num_values, _ = get_type(output)
    return Genred(
        formula,
        aliases,
        reduction_op='ArgKMin',
        axis=cat2axis(category),
        opt_arg=num_values,
        **kwargs
    )
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    """Evaluate a KeOps ``Genred`` reduction on ``(X1, X2, v, *other_vars)``.

    Args:
        X1: 2D tensor; its shape is unpacked as ``(N, D)``.
        X2: second data tensor, passed to the reduction after ``X1``.
        v: tensor with at least two dimensions; ``T = v.shape[1]``.
        other_vars: extra tensors appended to the reduction arguments.
        out: optional output buffer. When ``None`` an ``(N, T)`` tensor is
            allocated on the device of ``X1``.
        formula: KeOps formula, forwarded to ``Genred``.
        aliases: KeOps aliases, forwarded to ``Genred``.
        axis: reduction axis, forwarded to ``Genred``.
        reduction: reduction name, forwarded as ``reduction_op``.
        opt: options; a default ``FalkonOptions()`` is used when ``None``.

    Returns:
        The output tensor. With a GPU backend and CPU inputs the rows of
        ``X1`` / ``out`` are split into one block per GPU and processed by
        ``_single_gpu_method``; otherwise the reduction is called directly.

    Raises:
        RuntimeError: if ``check_same_device`` rejects the input tensors.
    """
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases,
                reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset.
    # The warm-up output must have as many rows as the X1 slice actually has:
    # a fixed 100-row buffer does not match the inputs when N < 100.
    # NOTE(review): other_vars are passed unsliced, as before -- confirm they
    # never need to match the 10-row X2 / v slices.
    small_X1 = X1[:100]
    small_data_variables = [small_X1, X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((small_X1.shape[0], T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        # Distinct loop names: the original comprehension reused `v`, which
        # reads as if it clobbered the `v` argument.
        gpu_info = [info for dev_id, info in devices.get_device_info(opt).items()
                    if dev_id >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2 ** 20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, (gpu, ram) in enumerate(zip(gpu_info, gpu_ram)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue  # this GPU was assigned no rows

            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                other_vars=other_vars,
                function=fn,
                backend=backend,
                gpu_ram=ram
            ), gpu.Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
# Symbolic KeOps formula; the names x, y, a and p are bound, in call order,
# by the aliases listed in `variables`.
formula = 'Square(p-a)*Exp(x+y)'
variables = [
    'x = Vi(3)',  # First arg  : i-variable, of size 3
    'y = Vj(3)',  # Second arg : j-variable, of size 3
    'a = Vj(1)',  # Third arg  : j-variable, of size 1 (scalar)
    'p = Pm(1)'
]  # Fourth arg : Parameter,  of size 1 (scalar)

####################################################################
# Our sum reduction is performed over the index :math:`j`,
# i.e. on the axis ``1`` of the kernel matrix.
# The output c is an :math:`x`-variable indexed by :math:`i`.

# NOTE(review): dtype, x, y, a and p are defined outside this excerpt.
my_routine = Genred(formula, variables, reduction_op='Sum', axis=1, dtype=dtype)
c = my_routine(x, y, a, p)

####################################################################
# Compute the gradient
# --------------------
# Now, let's compute the gradient of :math:`c` with
# respect to :math:`y`. Since :math:`c` is not scalar valued,
# its "gradient" :math:`\partial c` should be understood as the adjoint of the
# differential operator, i.e. as the linear operator that:
#
# - takes as input a new tensor :math:`e` with the shape of :math:`c`
# - outputs a tensor :math:`g` with the shape of :math:`y`
#
# ---------------
#
# Create a new generic routine using the :class:`pykeops.numpy.Genred`
# constructor:

formula = 'SqDist(x,y)'
formula_weights = 'b'
aliases = [
    'x = Vi(' + str(D) + ')',  # First arg:  i-variable of size D
    'y = Vj(' + str(D) + ')',  # Second arg: j-variable of size D
    'b = Vj(' + str(Dv) + ')'
]  # Third arg:  j-variable of size Dv

softmax_op = Genred(formula, aliases, reduction_op='SumSoftMaxWeight', axis=1, formula2=formula_weights)

# Dummy first call to warmup the GPU and get accurate timings:
_ = softmax_op(x, y, b)

###############################################################################
# Use our new function on arbitrary Numpy arrays:
#

# perf_counter() is the clock intended for measuring short durations;
# time.time() follows the system clock, which may be adjusted mid-run.
start = time.perf_counter()
c = softmax_op(x, y, b)
print("Timing (KeOps implementation): ", round(time.perf_counter() - start, 5), "s")

# compare with direct implementation
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    """Run a KeOps ``Genred`` reduction over ``(X1, X2, v, *other_vars)``.

    ``X1`` is unpacked as ``(N, D)`` and ``v.shape[1]`` gives ``T``; when
    ``out`` is ``None`` an ``(N, T)`` buffer is allocated on ``X1``'s device.
    CUDA inputs paired with a non-GPU backend trigger a warning and a switch
    to ``'GPU_1D'``. CPU inputs paired with a GPU backend are split row-wise
    across the GPUs returned by ``_get_gpu_info``; every other combination
    calls the reduction directly.

    Raises:
        RuntimeError: if ``check_same_device`` rejects the input tensors.
    """
    opt = FalkonOptions() if opt is None else opt

    # Choose backend
    num_rows, feat_dim = X1.shape
    out_cols = v.shape[1]
    backend = _decide_backend(opt, feat_dim)
    keops_dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")

    inputs_on_cuda = device.type == 'cuda'
    if inputs_on_cuda and not backend.startswith("GPU"):
        warnings.warn(
            "KeOps backend was chosen to be CPU, but GPU input tensors found. "
            "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
            "please pass CPU tensors; to avoid this warning if the GPU backend is "
            "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula,
                aliases,
                reduction_op=reduction,
                axis=axis,
                dtype=keops_dtype,
                dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(num_rows, out_cols, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    split_across_gpus = backend.startswith("GPU") and device.type == 'cpu'
    if not split_across_gpus:
        # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        if device.type != 'cuda':
            return fn(*variables, out=out, backend=backend)
        with torch.cuda.device(device):
            sync_current_stream(device)
            return fn(*variables, out=out, backend=backend)

    # slack is high due to imprecise memory usage estimates for keops
    gpu_info = _get_gpu_info(opt, slack=opt.keops_memory_slack)
    block_sizes = calc_gpu_block_sizes(gpu_info, num_rows)

    # Create queues
    args = []  # Arguments passed to each subprocess
    for idx, gpu in enumerate(gpu_info):
        # First round of subdivision
        width = block_sizes[idx + 1] - block_sizes[idx]
        if width > 0:
            block = ArgsFmmv(X1=X1.narrow(0, block_sizes[idx], width),
                             X2=X2,
                             v=v,
                             out=out.narrow(0, block_sizes[idx], width),
                             other_vars=other_vars,
                             function=fn,
                             backend=backend,
                             gpu_ram=gpu.usable_ram)
            args.append((block, gpu.Id))
    _start_wait_processes(_single_gpu_method, args)
    return out
# Symbolic formulas; the names are bound, in call order, by `variables`.
formula = 'Square(p-a)*Exp(x+y)'
formula2 = 'b'
variables = ['x = Vi(1)',  # First arg  : i-variable, of size 1 (scalar)
             'y = Vj(1)',  # Second arg : j-variable, of size 1 (scalar)
             'a = Vj(1)',  # Third arg  : j-variable, of size 1 (scalar)
             'p = Pm(1)',  # Fourth arg : Parameter,  of size 1 (scalar)
             'b = Vj(3)']  # Fifth arg  : j-variable, of size 3 (vector)

# perf_counter() is the clock intended for measuring short durations;
# time.time() follows the system clock, which may be adjusted mid-run.
# The timed span covers both building the routine and calling it.
start = time.perf_counter()

####################################################################
# Our log-sum-exp reduction is performed over the index :math:`j`,
# i.e. on the axis ``1`` of the kernel matrix.
# The output c is an :math:`x`-variable indexed by :math:`i`.

my_routine = Genred(formula, variables, reduction_op='LogSumExp', axis=1, dtype=dtype, formula2=formula2)
c = my_routine(x, y, a, p, b, backend='CPU')

# N.B.: By specifying backend='CPU', we can make sure that the result is computed using a simple C++ for loop.
print('Time to compute the convolution operation on the cpu: ', round(time.perf_counter()-start,5), 's', end=' ')

#######################################################################
# We compare with the unstable, naive computation "Log of Sum of Exp":

my_routine2 = Genred('Exp('+formula+')*'+formula2, variables, reduction_op='Sum', axis=1, dtype=dtype)
c2 = torch.log(my_routine2(x, y, a, p, b, backend='CPU'))
print('(relative error: ',((c2-c).norm()/c.norm()).item(), ')')

# Plot the results next to each other:
for i in range(3):
    plt.subplot(3, 1, i+1)
"y = Vj(1)", # Second arg : j-variable, of size 1 (scalar) "a = Vj(1)", # Third arg : j-variable, of size 1 (scalar) "p = Pm(1)", # Fourth arg : Parameter, of size 1 (scalar) "b = Vj(3)", ] # Fifth arg : j-variable, of size 3 (vector) start = time.time() #################################################################### # Our log-sum-exp reduction is performed over the index :math:`j`, # i.e. on the axis ``1`` of the kernel matrix. # The output c is an :math:`x`-variable indexed by :math:`i`. my_routine = Genred(formula, variables, reduction_op="LogSumExp", axis=1, dtype=dtype, formula2=formula2) c = my_routine(x, y, a, p, b, backend="CPU") # N.B.: By specifying backend='CPU', we can make sure that the result is computed using a simple C++ for loop. print( "Time to compute the convolution operation on the cpu: ", round(time.time() - start, 5), "s", end=" ", ) ####################################################################### # We compare with the unstable, naive computation "Log of Sum of Exp":
# .. math::
#
#   a_i = \sum_{j=1}^M (\langle x_i,y_j \rangle^2) (p_0 x_i + p_1 y_j)
#
# where the two real parameters are stored in a 2-vector :math:`p=(p_0,p_1)`.

# Keops implementation.
# Note that Square(...) is more efficient than Pow(...,2)
formula = "Square((X|Y)) * ((Elem(P, 0) * X) + (Elem(P, 1) * Y))"
# The vector dimensions are taken from D rather than hard-coded to 3, so the
# aliases agree with their "dim D" comments and with the plotting loop below.
variables = [
    "P = Pm(2)",  # 1st argument,  a parameter, dim 2.
    "X = Vi(" + str(D) + ")",  # 2nd argument, indexed by i, dim D.
    "Y = Vj(" + str(D) + ")",  # 3rd argument, indexed by j, dim D.
]

my_routine = Genred(formula, variables, reduction_op="Sum", axis=1)
a_keops = my_routine(p, x, y)

# Vanilla PyTorch implementation
scals = (torch.mm(x, y.t()))**2  # Memory-intensive computation!
a_pytorch = p[0] * scals.sum(1).view(-1, 1) * x + p[1] * (torch.mm(scals, y))

# Plot the results next to each other:
# The tensor -> NumPy conversions do not depend on i, so do them once.
a_keops_np = a_keops.detach().cpu().numpy()
a_pytorch_np = a_pytorch.detach().cpu().numpy()
for i in range(D):
    plt.subplot(D, 1, i + 1)
    plt.plot(a_keops_np[:40, i], "-", label="KeOps")
    plt.plot(a_pytorch_np[:40, i], "--", label="PyTorch")
    plt.legend(loc="lower right")
plt.tight_layout()
plt.show()
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    """Evaluate a KeOps ``Genred`` reduction on ``(X1, X2, v, *other_vars)``.

    Args:
        X1: 2D tensor; its shape is unpacked as ``(N, D)``.
        X2: second data tensor, passed to the reduction after ``X1``.
        v: tensor with at least two dimensions; ``T = v.shape[1]``.
        other_vars: extra tensors appended to the reduction arguments.
        out: optional output buffer. When ``None`` an ``(N, T)`` CPU tensor
            is allocated, pinned unless the backend is ``'CPU'``.
        formula: KeOps formula, forwarded to ``Genred``.
        aliases: KeOps aliases, forwarded to ``Genred``.
        axis: reduction axis, forwarded to ``Genred``.
        reduction: reduction name, forwarded as ``reduction_op``.
        opt: options; a default ``FalkonOptions()`` is used when ``None``.

    Returns:
        The output tensor. With a GPU backend the rows of ``X1`` / ``out``
        are split into one block per GPU and processed by
        ``_single_gpu_method``; otherwise the reduction is called directly.
    """
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)

    # Define formula wrapper
    fn = Genred(formula,
                aliases,
                reduction_op=reduction,
                axis=axis,
                dtype=dtype,
                dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset.
    # The warm-up output must have as many rows as the X1 slice actually has:
    # a fixed 100-row buffer does not match the inputs when N < 100.
    # NOTE(review): other_vars are passed unsliced, as before -- confirm they
    # never need to match the 10-row X2 / v slices.
    small_X1 = X1[:100]
    small_data_variables = [small_X1, X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((small_X1.shape[0], T), dtype=X1.dtype, device=X1.device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device='cpu', pin_memory=backend != 'CPU')

    if backend.startswith("GPU"):
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        # Distinct loop names: the original comprehension reused `v`, which
        # reads as if it clobbered the `v` argument.
        gpu_info = [
            info for dev_id, info in devices.get_device_info(opt).items() if dev_id >= 0
        ]
        gpu_ram = [
            min((g.free_memory - 300 * 2**20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, (gpu, ram) in enumerate(zip(gpu_info, gpu_ram)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue  # this GPU was assigned no rows

            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=ram), gpu.Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)
    return out
keops_backend = 'GPU'


def timeit(func, it):
    """Return the mean duration, in seconds, of ``it`` successive ``func()`` calls.

    Each call is timed individually with ``time.perf_counter``.
    """
    def _timed_call():
        tic = time.perf_counter()
        func()
        return time.perf_counter() - tic

    durations = [_timed_call() for _ in range(it)]
    return sum(durations) / it


# Each row of `a` and `b` is a flattened 2x2 matrix (dimension 4).
formula = "TensorDot(a, b, Ind(2,2), Ind(2,2), Ind(1), Ind(0))"
alias = ["a=Vi(4)", "b=Vi(4)"]
keops_bmm = Genred(formula, alias, reduction_op='Sum', axis=1)

N = 1000000
A = torch.rand(N, 2, 2, device=device)
B = torch.rand(N, 2, 2, device=device)
it = 1000

# Sanity checks: the three batched-product implementations should agree.
print(
    "torch.bmm() = torch.einsum() :",
    torch.allclose(torch.bmm(A, B), torch.einsum('nik, nkj->nij', A, B)),
)
print(
    "torch.einsum() = keops_bmm() :",
    torch.allclose(
        torch.einsum('nik, nkj->nij', A, B),
        keops_bmm(A.view(-1, 4), B.view(-1, 4), backend=keops_backend).view(-1, 2, 2),
    ),
)