Example #1
    def test_generic_syntax_simple(self):
        ############################################################
        from pykeops.torch import Genred

        aliases = [
            'P = Pm(2)',                              # 1st argument: a parameter, dim 2
            'X = Vi(' + str(self.xc.shape[1]) + ')',  # 2nd argument: indexed by i, dim D
            'Y = Vj(' + str(self.yc.shape[1]) + ')',  # 3rd argument: indexed by j, dim D
        ]

        formula = 'Pow((X|Y),2) * ((Elem(P,0) * X) + (Elem(P,1) * Y))'
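        # With the 'Sum' reduction over j (axis=1) used below, this formula
        # computes, for every index i,
        #     gamma_i = sum_j <x_i, y_j>^2 * (p_0 * x_i + p_1 * y_j),
        # which is exactly what the NumPy check at the end reproduces.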

        if pykeops.gpu_available:
            backend_to_test = ['auto', 'GPU_1D', 'GPU_2D', 'GPU']
        else:
            backend_to_test = ['auto']

        for b in backend_to_test:
            with self.subTest(b=b):
                my_routine = Genred(formula,
                                    aliases,
                                    reduction_op='Sum',
                                    axis=1)
                gamma_keops = my_routine(self.pc, self.xc, self.yc, backend=b)

                # Numpy version
                scals = (self.x @ self.y.T)**2  # Memory-intensive computation!
                gamma_py = self.p[0] * scals.sum(1).reshape(
                    -1, 1) * self.x + self.p[1] * (scals @ self.y)

                # compare output
                self.assertTrue(
                    np.allclose(gamma_keops.cpu().data.numpy(),
                                gamma_py,
                                atol=1e-6))
Example #2
def FeaturesKP(kernel,
               gs,
               xs,
               ys,
               bs,
               mode='sum',
               backend='auto',
               dtype='float32'):
    if backend in ['pytorch', 'matrix']:
        domain, torch_map = pytorch_routines[mode]
        if domain == 'sum':
            routine = kernel.routine_sum
        elif domain == 'log':
            routine = kernel.routine_log

        return torch_map(routine, gs, xs, ys, bs, matrix=(backend == 'matrix'))

    else:
        red, formula, bs_cat = keops_routines[mode]

        formula = formula.format(f_sum=kernel.formula_sum,
                                 f_log=kernel.formula_log)

        # Given the output sizes, we must generate the appropriate list of aliases

        # We will store the arguments as follows:
        # [ G_0, G_1, ..., X_0, X_1, Y_0, Y_1, ...]
        full_args, aliases, index = [], [], 0  # tensor list, string list, current input arg

        # First, the G_i's
        for (i, g) in enumerate(gs):
            if g is not None:
                g_var, g_dim, g_cat, g_str = extract_metric_parameters(
                    g)  # example : Tensor(...), 3, 0, 'Vi'
                aliases.append('G_{g_ind} = {g_str}({index}, {g_dim})'.format(
                    g_ind=i, g_str=g_str, index=index, g_dim=g_dim))
                full_args.append(g_var)
                index += 1

        # Then, the X_i's
        for (i, x) in enumerate(xs):
            x_dim = x.size(1)
            aliases.append('X_{x_ind} = Vi({index}, {x_dim})'.format(
                x_ind=i, index=index, x_dim=x_dim))
            full_args.append(x)
            index += 1

        # Then, the Y_j's
        for (j, y) in enumerate(ys):
            y_dim = y.size(1)
            aliases.append('Y_{y_ind} = Vj({index}, {y_dim})'.format(
                y_ind=j, index=index, y_dim=y_dim))
            full_args.append(y)
            index += 1

        if len(xs) != len(ys):
            raise ValueError(
                "Kernel_product works with pairs of variables: the 'x'-list of "
                "features should have the same length as the 'y' one.")

        # Then, the B_i/j's
        for (i, (b, b_cat)) in enumerate(zip(bs, bs_cat)):
            b_dim = b.size(1)
            b_str = ['Vi', 'Vj', 'Pm'][b_cat]
            aliases.append('B_{b_ind} = {b_str}({index}, {b_dim})'.format(
                b_ind=i, b_str=b_str, index=index, b_dim=b_dim))
            full_args.append(b)
            index += 1

        axis = 1  # the output vector is indexed by 'i' (CAT=0)
        genconv = Genred(formula,
                         aliases,
                         reduction_op=red,
                         axis=axis,
                         dtype=dtype)

        return genconv(*full_args, backend=backend)
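

# A minimal, hypothetical illustration (not part of FeaturesKP itself) of the
# alias-packing convention built above: giving each Vi/Vj/Pm alias an explicit
# input index lets us pass the tensors in the exact order of `full_args`.
if __name__ == '__main__':
    import torch
    from pykeops.torch import Genred

    gauss = Genred('Exp(-G_0 * SqDist(X_0, Y_0)) * B_0',
                   ['G_0 = Pm(0, 1)',    # arg 0: a scalar parameter
                    'X_0 = Vi(1, 3)',    # arg 1: one dim-3 vector per row i
                    'Y_0 = Vj(2, 3)',    # arg 2: one dim-3 vector per row j
                    'B_0 = Vj(3, 1)'],   # arg 3: one scalar weight per row j
                   reduction_op='Sum', axis=1)
    g = torch.tensor([0.5])
    x, y, b = torch.randn(100, 3), torch.randn(200, 3), torch.randn(200, 1)
    out = gauss(g, x, y, b)  # shape (100, 1): sum over j for every row i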
Example #3
def generic_argkmin(formula, output, *aliases, **kwargs):
    r"""Alias for :class:`torch.Genred <pykeops.torch.Genred>` with an "ArgKMin" reduction.

    Args:
        formula (string): Scalar-valued symbolic KeOps expression, as in :class:`torch.Genred <pykeops.torch.Genred>`.
        output (string): An identifier of the form ``"AL = TYPE(K)"`` 
            that specifies the category and dimension of the output variable. Here:

              - ``AL`` is a dummy alphanumerical name.
              - ``TYPE`` is a *category*. One of:

                - ``Vi``: indexation by :math:`i` along axis 0; reduction is performed along axis 1.
                - ``Vj``: indexation by :math:`j` along axis 1; reduction is performed along axis 0.

              - ``K`` is an integer, the number of values to extract.

        *aliases (strings): List of identifiers, as in :class:`torch.Genred <pykeops.torch.Genred>`.

    Keyword Args:
        dtype (string, default = ``"float32"``): Specifies the numerical **dtype** of the input and output arrays. 
            The supported values are:

              - **dtype** = ``"float16"`` or ``"half"``.
              - **dtype** = ``"float32"`` or ``"float"``.
              - **dtype** = ``"float64"`` or ``"double"``.

    Returns:
        A generic reduction that can be called on arbitrary
        Torch tensors, as documented in :class:`torch.Genred <pykeops.torch.Genred>`.

    Example:
        Bruteforce K-nearest neighbors search in dimension 100:

        >>> knn = generic_argkmin(
        ...     'SqDist(x, y)',   # Formula
        ...     'a = Vi(3)',      # Output: 3 scalars per line
        ...     'x = Vi(100)',    # 1st input: dim-100 vector per line
        ...     'y = Vj(100)')    # 2nd input: dim-100 vector per line
        >>> x = torch.randn(5,     100)
        >>> y = torch.randn(20000, 100)
        >>> a = knn(x, y)
        >>> print(a)
        tensor([[ 9054., 11653., 11614.],
                [13466., 11903., 14180.],
                [14164.,  8809.,  3799.],
                [ 2092.,  3323., 18479.],
                [14433., 11315., 11841.]])
        >>> print( (x - y[ a[:,0].long() ]).norm(dim=1) )  # Distance to the nearest neighbor
        tensor([10.7933, 10.3235, 10.1218, 11.4919, 10.5100])
        >>> print( (x - y[ a[:,1].long() ]).norm(dim=1) )  # Distance to the second neighbor
        tensor([11.3702, 10.6550, 10.7646, 11.5676, 11.1356])
        >>> print( (x - y[ a[:,2].long() ]).norm(dim=1) )  # Distance to the third neighbor
        tensor([11.3820, 10.6725, 10.8510, 11.6071, 11.1968])
    """
    _, cat, k, _ = get_type(output)
    axis = cat2axis(cat)
    return Genred(formula,
                  aliases,
                  reduction_op='ArgKMin',
                  axis=axis,
                  opt_arg=k,
                  **kwargs)
Example #4
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases,
                reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2 ** 20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                other_vars=other_vars,
                function=fn,
                backend=backend,
                gpu_ram=gpu_ram[i]
            ), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
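

# A hypothetical call sketch (names and shapes are illustrative, not taken
# from Falkon): a Gaussian kernel-vector product k(X1, X2) @ v, with the
# bandwidth g passed through `other_vars` as input number 3.
if __name__ == '__main__':
    X1, X2 = torch.randn(1000, 5), torch.randn(500, 5)
    v = torch.randn(500, 2)
    g = torch.tensor([0.5])
    out = run_keops_mmv(X1, X2, v, [g], out=None,
                        formula='Exp(-g * SqDist(x1, x2)) * v',
                        aliases=['x1 = Vi(0, 5)',   # rows of X1, indexed by i
                                 'x2 = Vj(1, 5)',   # rows of X2, indexed by j
                                 'v = Vj(2, 2)',    # rows of v, indexed by j
                                 'g = Pm(3, 1)'],   # a scalar parameter
                        axis=1)  # out has shape (1000, 2)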
Example #5
formula = 'Square(p-a)*Exp(x+y)'
variables = [
    'x = Vi(3)',  # First arg   : i-variable, of size 3
    'y = Vj(3)',  # Second arg  : j-variable, of size 3
    'a = Vj(1)',  # Third arg   : j-variable, of size 1 (scalar)
    'p = Pm(1)',  # Fourth arg  : Parameter,  of size 1 (scalar)
]

####################################################################
# Our sum reduction is performed over the index :math:`j`,
# i.e. on the axis ``1`` of the kernel matrix.
# The output c is an :math:`x`-variable indexed by :math:`i`.

my_routine = Genred(formula,
                    variables,
                    reduction_op='Sum',
                    axis=1,
                    dtype=dtype)
c = my_routine(x, y, a, p)

####################################################################
# Compute the gradient
# --------------------
# Now, let's compute the gradient of :math:`c` with
# respect to :math:`y`. Since :math:`c` is not scalar valued,
# its "gradient" :math:`\partial c` should be understood as the adjoint of the
# differential operator, i.e. as the linear operator that:
#
# - takes as input a new tensor :math:`e` with the shape of :math:`c`
# - outputs a tensor :math:`g` with the shape of :math:`y`
#
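
# A minimal sketch of this operation, assuming that y was created with
# requires_grad=True: torch.autograd.grad applies the adjoint
# :math:`\partial c` to a test tensor e.

e = torch.rand_like(c)               # a new tensor with the shape of c
(g,) = torch.autograd.grad(c, y, e)  # g has the shape of y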
Example #6
# Create a new generic routine using the :class:`pykeops.numpy.Genred`
# constructor:

formula = 'SqDist(x,y)'
formula_weights = 'b'
aliases = [
    'x = Vi(' + str(D) + ')',   # First arg:  i-variable of size D
    'y = Vj(' + str(D) + ')',   # Second arg: j-variable of size D
    'b = Vj(' + str(Dv) + ')',  # Third arg:  j-variable of size Dv
]

softmax_op = Genred(formula,
                    aliases,
                    reduction_op='SumSoftMaxWeight',
                    axis=1,
                    formula2=formula_weights)

# Dummy first call to warm up the GPU and get accurate timings:
_ = softmax_op(x, y, b)

###############################################################################
# Use our new function on arbitrary NumPy arrays:
#

start = time.time()
c = softmax_op(x, y, b)
print("Timing (KeOps implementation): ", round(time.time() - start, 5), "s")

# Compare with a direct NumPy implementation:
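
# Below, a sketch of that direct computation (memory-intensive: it builds
# the full (N, M) matrix of squared distances s_ij = |x_i - y_j|^2), since
# 'SumSoftMaxWeight' returns c_i = sum_j exp(s_ij) * b_j / sum_j exp(s_ij):
import numpy as np

s = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)  # (N, M) squared distances
w = np.exp(s - s.max(axis=1, keepdims=True))        # stabilized exponentials
c_np = (w @ b) / w.sum(axis=1, keepdims=True)       # softmax-weighted average
print("Maximum absolute error:", np.abs(c - c_np).max())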
Example #7
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn(
            "KeOps backend was chosen to be CPU, but GPU input tensors found. "
            "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
            "please pass CPU tensors; to avoid this warning if the GPU backend is "
            "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula,
                aliases,
                reduction_op=reduction,
                axis=axis,
                dtype=dtype,
                dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N,
                          T,
                          dtype=X1.dtype,
                          device=device,
                          pin_memory=(backend != 'CPU')
                          and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # slack is high due to imprecise memory usage estimates for keops
        gpu_info = _get_gpu_info(opt, slack=opt.keops_memory_slack)
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=g.usable_ram), g.Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        if device.type == 'cuda':
            with torch.cuda.device(device):
                sync_current_stream(device)
                out = fn(*variables, out=out, backend=backend)
        else:
            out = fn(*variables, out=out, backend=backend)

    return out
Example #8
formula = 'Square(p-a)*Exp(x+y)'
formula2 = 'b'
variables = ['x = Vi(1)',  # First arg   : i-variable, of size 1 (scalar)
             'y = Vj(1)',  # Second arg  : j-variable, of size 1 (scalar)
             'a = Vj(1)',  # Third arg   : j-variable, of size 1 (scalar)
             'p = Pm(1)',  # Fourth arg  : Parameter,  of size 1 (scalar)
             'b = Vj(3)']  # Fifth arg   : j-variable, of size 3 (vector)
                      
start = time.time()

####################################################################
# Our log-sum-exp reduction is performed over the index :math:`j`,
# i.e. on the axis ``1`` of the kernel matrix.
# The output c is an :math:`x`-variable indexed by :math:`i`.

my_routine = Genred(formula, variables,
                    reduction_op='LogSumExp', axis=1,
                    dtype=dtype, formula2=formula2)
c = my_routine(x, y, a, p, b, backend='CPU')

# N.B.: By specifying backend='CPU', we can make sure that the result is computed using a simple C++ for loop.
print('Time to compute the convolution operation on the cpu: ', round(time.time()-start,5), 's', end=' ')

#######################################################################
# We compare with the unstable, naive computation "Log of Sum of Exp":

my_routine2 = Genred('Exp(' + formula + ')*' + formula2,
                     variables, reduction_op='Sum', axis=1, dtype=dtype)
c2 = torch.log(my_routine2(x, y, a, p, b, backend='CPU'))
print('(relative error: ', ((c2 - c).norm() / c.norm()).item(), ')')

# Plot the two results next to each other:
for i in range(3):
    plt.subplot(3, 1, i + 1)
    plt.plot(c.detach().cpu().numpy()[:40, i], '-', label='KeOps LogSumExp')
    plt.plot(c2.detach().cpu().numpy()[:40, i], '--', label='naive version')
    plt.legend(loc='lower right')
plt.show()
Example #9
    "y = Vj(1)",  # Second arg  : j-variable, of size 1 (scalar)
    "a = Vj(1)",  # Third arg   : j-variable, of size 1 (scalar)
    "p = Pm(1)",  # Fourth arg  : Parameter,  of size 1 (scalar)
    "b = Vj(3)",
]  # Fifth arg   : j-variable, of size 3 (vector)

start = time.time()

####################################################################
# Our log-sum-exp reduction is performed over the index :math:`j`,
# i.e. on the axis ``1`` of the kernel matrix.
# The output c is an :math:`x`-variable indexed by :math:`i`.

my_routine = Genred(formula,
                    variables,
                    reduction_op="LogSumExp",
                    axis=1,
                    dtype=dtype,
                    formula2=formula2)
c = my_routine(x, y, a, p, b, backend="CPU")

# N.B.: By specifying backend='CPU', we can make sure that the result is computed using a simple C++ for loop.
print(
    "Time to compute the convolution operation on the cpu: ",
    round(time.time() - start, 5),
    "s",
    end=" ",
)

#######################################################################
# We compare with the unstable, naive computation "Log of Sum of Exp":
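
# As in Example #8 above, we build the naive version and compare:
my_routine2 = Genred("Exp(" + formula + ")*" + formula2,
                     variables,
                     reduction_op="Sum",
                     axis=1,
                     dtype=dtype)
c2 = torch.log(my_routine2(x, y, a, p, b, backend="CPU"))
print("(relative error: ", ((c2 - c).norm() / c.norm()).item(), ")")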
Example #10
# .. math::
#
#   a_i = \sum_{j=1}^M (\langle x_i,y_j \rangle^2) (p_0 x_i + p_1 y_j)
#
# where the two real parameters are stored in a 2-vector :math:`p=(p_0,p_1)`.

# KeOps implementation.
# Note that Square(...) is more efficient than Pow(..., 2).
formula = "Square((X|Y)) * ((Elem(P, 0) * X) + (Elem(P, 1) * Y))"
variables = [
    "P = Pm(2)",  # 1st argument: a parameter, dim 2
    "X = Vi(3)",  # 2nd argument, indexed by i, dim D = 3
    "Y = Vj(3)",  # 3rd argument, indexed by j, dim D = 3
]

my_routine = Genred(formula, variables, reduction_op="Sum", axis=1)
a_keops = my_routine(p, x, y)

# Vanilla PyTorch implementation
scals = (torch.mm(x, y.t()))**2  # Memory-intensive computation!
a_pytorch = p[0] * scals.sum(1).view(-1, 1) * x + p[1] * (torch.mm(scals, y))

# Plot the results next to each other:
for i in range(D):
    plt.subplot(D, 1, i + 1)
    plt.plot(a_keops.detach().cpu().numpy()[:40, i], "-", label="KeOps")
    plt.plot(a_pytorch.detach().cpu().numpy()[:40, i], "--", label="PyTorch")
    plt.legend(loc="lower right")
plt.tight_layout()
plt.show()
Example #11
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)

    # Define formula wrapper
    fn = Genred(formula,
                aliases,
                reduction_op=reduction,
                axis=axis,
                dtype=dtype,
                dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=X1.device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N,
                          T,
                          dtype=X1.dtype,
                          device='cpu',
                          pin_memory=backend != 'CPU')

    if backend.startswith("GPU"):
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [
            v for k, v in devices.get_device_info(opt).items() if k >= 0
        ]
        gpu_ram = [
            min((g.free_memory - 300 * 2**20) * ram_slack,
                opt.max_gpu_mem * ram_slack) for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=gpu_ram[i]), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
Example #12
import time

import torch
from pykeops.torch import Genred

device = 'cuda' if torch.cuda.is_available() else 'cpu'
keops_backend = 'GPU' if device == 'cuda' else 'CPU'


def timeit(func, it):
    times = []
    for i in range(it):
        start = time.perf_counter()
        func()
        times.append(time.perf_counter() - start)

    return sum(times) / it


formula = "TensorDot(a, b, Ind(2,2), Ind(2,2), Ind(1), Ind(0))"
alias = ["a=Vi(4)", "b=Vi(4)"]
keops_bmm = Genred(formula, alias, reduction_op='Sum', axis=1)

N = 1000000
A = torch.rand(N, 2, 2, device=device)
B = torch.rand(N, 2, 2, device=device)
it = 1000

print("torch.bmm() = torch.einsum() :",
      torch.allclose(torch.bmm(A, B), torch.einsum('nik, nkj->nij', A, B)))
print(
    "torch.einsum() = keops_bmm() :",
    torch.allclose(
        torch.einsum('nik, nkj->nij', A, B),
        keops_bmm(A.view(-1, 4), B.view(-1, 4),
                  backend=keops_backend).view(-1, 2, 2)))
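
# The timeit() helper defined above can now compare the three variants; a
# minimal benchmark sketch (on the GPU, timings are indicative only, since
# CUDA kernel launches are asynchronous):
print("torch.bmm   :", timeit(lambda: torch.bmm(A, B), it), "s")
print("torch.einsum:", timeit(lambda: torch.einsum('nik, nkj->nij', A, B), it), "s")
print("keops_bmm   :",
      timeit(lambda: keops_bmm(A.view(-1, 4), B.view(-1, 4),
                               backend=keops_backend), it), "s")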