Example #1
def gpu_lauum(A,
              upper,
              overwrite=True,
              write_opposite=False,
              opt: Optional[FalkonOptions] = None):
    """
    Parameters
    -----------
    A : ndarray [N, N]
        2D positive-definite matrix that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T if `upper`
        is False.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.

    Notes
    ------
    The factorization will always be the 'lower' version of the factorization
    which could however end up on the upper-triangular part of the matrix
    in case A is not Fortran contiguous to begin with.
    """
    if opt is None:
        opt = FalkonOptions()
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    # Start matrix preparations
    if isinstance(A, np.ndarray):
        Anp = A
    elif isinstance(A, torch.Tensor):
        Anp = A.numpy()
    else:
        raise TypeError("Unexpected type encountered for A: %s" % (type(A)))

    if not overwrite:
        Anp = np.copy(Anp, order='A')

    # Will give a fortran-contiguous numpy array. No copies are performed.
    Anp, transposed = prepare_matrix(Anp)
    if transposed:
        upper = not upper

    # The parallel runner only handles the lower triangle of C- or F-contiguous
    # arrays, but by transposing as necessary it can run with every combination of inputs.
    At = torch.from_numpy(Anp)
    if upper:
        At = At.T
    # The parallel runner chooses based on the contiguity pattern of the inputs.
    _parallel_lauum_runner(At, write_opposite, opt, gpu_info)

    if transposed:
        Anp = Anp.T

    if isinstance(A, np.ndarray):
        return Anp
    else:
        return torch.from_numpy(Anp)
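A minimal usage sketch for gpu_lauum as defined above. The import path (falkon.ooc_ops) and the FalkonOptions fields used here are assumptions and may differ between Falkon versions:

import numpy as np
from falkon import FalkonOptions
from falkon.ooc_ops import gpu_lauum  # assumed import path

# Build a small positive-definite matrix in Fortran order.
rng = np.random.default_rng(0)
B = rng.standard_normal((500, 500))
A = np.asfortranarray(B @ B.T + 500 * np.eye(500))

opt = FalkonOptions(max_gpu_mem=2 * 2**30)  # cap usable GPU memory at ~2 GiB
out = gpu_lauum(A, upper=False, overwrite=False, opt=opt)
print(out.shape)  # (500, 500); A itself is untouched since overwrite=False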
Example #2
def _get_gpu_info(opt: BaseOptions, slack: float = 0.9) -> List[DeviceInfo]:
    # List available devices, get their relative speed and split
    # computations based on device relative speed.
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if v.isGPU]
    for g in gpu_info:
        g.usable_ram = min(g.free_memory * slack, opt.max_gpu_mem * slack)
    return gpu_info
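The usable_ram figures computed above can drive a proportional split of work across devices. The helper below is purely hypothetical (it is not part of Falkon) and only illustrates the block-boundary arithmetic:

from typing import List

def split_rows_by_ram(usable_ram: List[float], N: int) -> List[int]:
    # Split N rows across devices proportionally to each device's usable RAM,
    # returning cumulative block boundaries.
    total = sum(usable_ram)
    bounds = [0]
    for ram in usable_ram:
        bounds.append(bounds[-1] + round(N * ram / total))
    bounds[-1] = N  # absorb rounding error in the last block
    return bounds

# Two GPUs with 8 GiB and 4 GiB of usable RAM, 90_000 rows:
print(split_rows_by_ram([8 * 2**30, 4 * 2**30], 90_000))  # [0, 60000, 90000]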
Example #3
def gpu_lauum(A,
              upper,
              overwrite=True,
              write_opposite=False,
              opt: Optional[FalkonOptions] = None):
    """
    Parameters
    -----------
    A : torch.Tensor
        (N x N) positive-definite matrix that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T if `upper`
        is False.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.

    Returns
    -------
    out : torch.Tensor
        A (N x N) tensor. This will share the same memory as the input tensor `A` if `overwrite`
        is set to True, otherwise it will be a newly allocated tensor.
    """
    if opt is None:
        opt = FalkonOptions()
    if not overwrite:
        A = copy_same_stride(A, pin_memory=True)
    # TODO: There is a helper function in mmv_ops for this.
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    # The parallel runner only handles the lower triangle of C- or F-contiguous
    # arrays, but by transposing as necessary it can run with every combination of inputs.
    transposed = False
    # noinspection PyUnresolvedReferences
    if upper:
        A = A.T
        transposed = True

    # The parallel runner chooses based on the contiguity pattern of the inputs.
    _parallel_lauum_runner(A, write_opposite, gpu_info)

    if transposed:
        A = A.T
    return A
Example #4
def init(opt: BaseOptions):
    if opt.use_cpu:
        return

    device_ids = [k for k in get_device_info(opt).keys() if k >= 0]

    global _cublas_handles
    global _cusolver_handles
    for i in device_ids:
        with torch.cuda.device(i):
            # CuBLAS handle
            if _cublas_handles.get(i, None) is None:
                handle = cublasCreate()
                _cublas_handles[i] = handle
            # CuSOLVER (Dense) handle
            if _cusolver_handles.get(i, None) is None:
                handle = cusolver.cusolverDnCreate()
                _cusolver_handles[i] = handle
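A matching cleanup sketch for the handles cached by init. It assumes cublasDestroy and cusolver.cusolverDnDestroy wrappers are available alongside the Create functions used above:

def shutdown():
    # Free the cached CuBLAS / CuSOLVER handles created by init().
    global _cublas_handles
    global _cusolver_handles
    for i, handle in list(_cublas_handles.items()):
        cublasDestroy(handle)               # assumed wrapper, mirrors cublasCreate
        del _cublas_handles[i]
    for i, handle in list(_cusolver_handles.items()):
        cusolver.cusolverDnDestroy(handle)  # assumed wrapper, mirrors cusolverDnCreate
        del _cusolver_handles[i]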
Example #5
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        """Fits the Falkon KRR model.

        Parameters
        -----------
        X : torch.Tensor
            The tensor of training data, of shape [num_samples, num_dimensions].
            If X is in Fortran order (i.e. column-contiguous) then we can avoid
            an extra copy of the data. Must be a CUDA tensor.
        Y : torch.Tensor
            The tensor of training targets, of shape [num_samples, num_outputs].
            If X and Y represent a classification problem, Y can be encoded as a one-hot
            vector.
            If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data. Must be a CUDA tensor.
        Xts : torch.Tensor or None
            Tensor of validation data, of shape [num_test_samples, num_dimensions].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data. Must be a CUDA tensor.
        Yts : torch.Tensor or None
            Tensor of validation targets, of shape [num_test_samples, num_outputs].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data. Must be a CUDA tensor.

        Returns
        --------
        model: InCoreFalkon
            The fitted model
        """
        # Fix a synchronization bug which occurs when re-using center selector.
        torch.cuda.synchronize()
        X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)

        self.fit_times_ = []
        self.ny_points_ = None
        self.alpha_ = None

        # Start training timer
        t_s = time.time()

        # Pick Nystrom centers
        if self.weight_fn is not None:
            # noinspection PyTupleAssignmentBalance
            ny_points, ny_indices = self.center_selection.select_indices(
                X, None)
        else:
            # noinspection PyTypeChecker
            ny_points: Union[
                torch.Tensor,
                falkon.sparse.SparseTensor] = self.center_selection.select(
                    X, None)
            ny_indices = None
        num_centers = ny_points.shape[0]

        pc_stream = torch.cuda.Stream(X.device)
        with TicToc("Calcuating Preconditioner of size %d" % (num_centers),
                    debug=self.options.debug), torch.cuda.stream(pc_stream):
            precond = falkon.preconditioner.FalkonPreconditioner(
                self.penalty, self.kernel, self.options)
            ny_weight_vec = None
            if self.weight_fn is not None:
                ny_weight_vec = self.weight_fn(Y[ny_indices])
            precond.init(ny_points, weight_vec=ny_weight_vec)
        pc_stream.synchronize()

        # Cache must be emptied to ensure enough memory is visible to the optimizer
        torch.cuda.empty_cache()

        # K_NM storage decision
        gpu_info = get_device_info(self.options)[X.device.index]
        available_ram = min(self.options.max_gpu_mem,
                            gpu_info.free_memory) * 0.9
        if self._can_store_knm(X, ny_points, available_ram):
            Knm = self.kernel(X, ny_points, opt=self.options)
        else:
            Knm = None
        self.fit_times_.append(time.time() - t_s)  # Preparation time

        # Here we define the callback function which will run at the end
        # of conjugate gradient iterations. This function computes and
        # displays the validation error.
        validation_cback = None
        if self.error_fn is not None and self.error_every is not None:
            validation_cback = self._get_callback_fn(X, Y, Xts, Yts, ny_points,
                                                     precond)

        # Start with the falkon algorithm
        with TicToc('Computing Falkon iterations', debug=self.options.debug):
            optim = falkon.optim.FalkonConjugateGradient(
                self.kernel, precond, self.options, weight_fn=self.weight_fn)
            if Knm is not None:
                beta = optim.solve(Knm,
                                   None,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)
            else:
                beta = optim.solve(X,
                                   ny_points,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)

            self.alpha_ = precond.apply(beta)
            self.ny_points_ = ny_points
        return self
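A minimal fitting sketch for the in-core estimator whose fit method is shown above. The constructor arguments (kernel, penalty, M, options) follow Falkon's public API, but treat the exact signature as an assumption:

import torch
import falkon

# Random regression data, kept entirely on the GPU as InCoreFalkon requires.
X = torch.randn(5000, 10, device='cuda')
Y = X.sum(dim=1, keepdim=True) + 0.1 * torch.randn(5000, 1, device='cuda')

kernel = falkon.kernels.GaussianKernel(sigma=3.0)
model = falkon.InCoreFalkon(kernel=kernel, penalty=1e-6, M=500,
                            options=falkon.FalkonOptions(debug=False))
model.fit(X, Y)
preds = model.predict(X)
print(torch.mean((preds - Y) ** 2).item())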
Example #6
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases,
                reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2 ** 20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                other_vars=other_vars,
                function=fn,
                backend=backend,
                gpu_ram=gpu_ram[i]
            ), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
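For reference, run_keops_mmv expects a KeOps Genred formula plus matching aliases. The Gaussian-kernel formula below is one plausible example of that syntax, not necessarily the formula Falkon builds internally:

import torch

D, T = 10, 1
# out[i] = sum_j exp(-g * ||x1_i - x2_j||^2) * v[j]
formula = 'Exp(-g * SqDist(x1, x2)) * v'
aliases = [
    'x1 = Vi(%d)' % D,   # rows of X1, indexed by i
    'x2 = Vj(%d)' % D,   # rows of X2, indexed by j
    'v = Vj(%d)' % T,    # rows of v, indexed by j
    'g = Pm(1)',         # scalar kernel parameter
]
X1, X2, v = torch.randn(1000, D), torch.randn(100, D), torch.randn(100, T)
gamma = torch.tensor([0.5])
# out = run_keops_mmv(X1, X2, v, other_vars=[gamma], out=None,
#                     formula=formula, aliases=aliases, axis=1)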
Example #7
                t_e = time.time()
                timings.append(t_e - t_s)
                print("Exp %s - N %d - Rep %d - %.2fs" %
                      (exp, N, j, timings[-1]),
                      flush=True)
                if exp['torch']:
                    torch.cuda.empty_cache()
            exp['timings'].append(min(timings))
    return experiments


if __name__ == "__main__":
    init_opt = falkon.FalkonOptions(compute_arch_speed=False)
    initialization.init(init_opt)
    gpu_info = [
        v for k, v in devices.get_device_info(init_opt).items() if k >= 0
    ]
    num_gpu = len(gpu_info)

    defaultN32 = [
        10_000, 20_000, 30_000, 40_000, 50_000, 65_000, 80_000, 100_000,
        120_000, 140_000
    ]
    #defaultN64 = [10_000, 20_000, 30_000, 40_000, 50_000, 65_000, 80_000]
    falkon.FalkonOptions(chol_force_ooc=True,
                         chol_par_blk_multiplier=2,
                         compute_arch_speed=False)
    experiments = [
        {
            'name':
            'Parallel 32',
Example #8
def _get_cpu_ram(opt: BaseOptions, slack: float = 0.9) -> float:
    cpu_info = devices.get_device_info(opt)[-1]
    avail_mem = min(cpu_info.free_memory,
                    opt.max_cpu_mem - cpu_info.used_memory)
    return avail_mem * slack
Example #9
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        """Fits the Falkon KRR model.

        Parameters
        -----------
        X : torch.Tensor (2D)
            The tensor of training data, of shape [num_samples, num_dimensions].
            If X is in Fortran order (i.e. column-contiguous) then we can avoid
            an extra copy of the data.
        Y : torch.Tensor (1D or 2D)
            The tensor of training targets, of shape [num_samples, num_outputs].
            If X and Y represent a classification problem, Y can be encoded as a one-hot
            vector.
            If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Xts : torch.Tensor (2D) or None
            Tensor of validation data, of shape [num_test_samples, num_dimensions].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Yts : torch.Tensor (1D or 2D) or None
            Tensor of validation targets, of shape [num_test_samples, num_outputs].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.

        Returns
        --------
        model: Falkon
            The fitted model
        """
        if X.size(0) != Y.size(0):
            raise ValueError("X and Y must have the same number of "
                             "samples (found %d and %d)" %
                             (X.size(0), Y.size(0)))
        if Y.dim() == 1:
            Y = torch.unsqueeze(Y, 1)
        if Y.dim() != 2:
            raise ValueError("Y is expected 1D or 2D. Found %dD." % (Y.dim()))
        if not check_same_dtype(X, Y):
            raise TypeError("X and Y must have the same data-type.")

        dtype = X.dtype
        # Decide whether to use CUDA for preconditioning based on M
        _use_cuda_preconditioner = (
                self.use_cuda_ and
                (not self.options.cpu_preconditioner) and
                self.M >= get_min_cuda_preconditioner_size(dtype)
        )
        _use_cuda_mmv = (
                self.use_cuda_ and
                X.shape[0] * X.shape[1] * self.M / self.num_gpus >= get_min_cuda_mmv_size(dtype)
        )

        self.fit_times_ = []
        self.ny_points_ = None
        self.alpha_ = None

        t_s = time.time()
        ny_points = self.center_selection.select(X, None, self.M)
        if self.use_cuda_:
            ny_points = ny_points.pin_memory()

        with TicToc("Calcuating Preconditioner of size %d" % (self.M), debug=self.options.debug):
            pc_opt: FalkonOptions = dataclasses.replace(self.options,
                                                        use_cpu=not _use_cuda_preconditioner)
            if pc_opt.debug:
                print("Preconditioner will run on %s" %
                      ("CPU" if pc_opt.use_cpu else ("%d GPUs" % self.num_gpus)))
            precond = falkon.preconditioner.FalkonPreconditioner(self.penalty, self.kernel, pc_opt)
            precond.init(ny_points)

        if _use_cuda_mmv:
            # Cache must be emptied to ensure enough memory is visible to the optimizer
            torch.cuda.empty_cache()
            X = X.pin_memory()

        # Decide whether it's worthwhile to pre-compute the K_NM kernel.
        # Since a single kernel evaluation between two D-dimensional vectors
        # costs D, each CG iteration must perform N*M kernel evaluations, at a
        # cost of N*M*D. On top of the kernel evaluations we must also perform
        # two matrix-vector products, costing 2*(N*M*T), plus a few triangular
        # solves.
        #
        # So if we precompute, each iteration costs 2*(N*M*T); otherwise it
        # also costs N*M*D, but precomputing requires N*M extra memory.
        # The heuristic is the following:
        #  - If D is large (the check below uses D > 1200) check if RAM is sufficient
        #  - If RAM is sufficient, precompute
        #  - Otherwise do not precompute
        Knm = None
        if X.size(1) > 1200:
            necessary_ram = X.size(0) * ny_points.size(0) * sizeof_dtype(dtype)
            k_opt = dataclasses.replace(self.options, use_cpu=True)
            cpu_info = get_device_info(k_opt)
            available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
            del k_opt

            if available_ram > necessary_ram:
                if self.options.debug:
                    print("%d*%d Kernel matrix will be stored" %
                          (X.size(0), ny_points.size(0)))
                Knm = self.kernel(X, ny_points, opt=self.options)
                # TODO: Maybe we should do the same for Kts, but this complicates
                #       checks for fitting in memory
            elif self.options.debug:
                print(
                    "Cannot store full kernel matrix: not enough memory (have %.2fGB, need %.2fGB)" %
                    (available_ram / 2 ** 30, necessary_ram / 2 ** 30))
        self.fit_times_.append(time.time() - t_s)  # Preparation time

        # Here we define the callback function which will run at the end
        # of conjugate gradient iterations. This function computes and
        # displays the validation error.
        val_cback = None
        if self.error_fn is not None and self.error_every is not None:
            def val_cback(it, beta, train_time):
                self.fit_times_.append(self.fit_times_[0] + train_time)
                if it % self.error_every != 0:
                    print("Iteration %3d - Elapsed %.1fs" % (it, self.fit_times_[-1]), flush=True)
                    return
                err_str = "training" if Xts is None or Yts is None else "validation"
                alpha = precond.apply(beta)
                # Compute error: can be train or test;
                if Xts is not None and Yts is not None:
                    pred = self._predict(Xts, ny_points, alpha)
                    err = self.error_fn(Yts, pred)
                else:
                    pred = self._predict(X, ny_points, alpha)
                    err = self.error_fn(Y, pred)
                err_name = "error"
                if isinstance(err, tuple) and len(err) == 2:
                    err, err_name = err
                print("Iteration %3d - Elapsed %.1fs - %s %s: %.4f" %
                      (it, self.fit_times_[-1], err_str, err_name, err), flush=True)

        # Start with the falkon algorithm
        with TicToc('Computing Falkon iterations', debug=self.options.debug):
            o_opt: FalkonOptions = dataclasses.replace(self.options, use_cpu=not _use_cuda_mmv)
            if o_opt.debug:
                print("Optimizer will run on %s" %
                      ("CPU" if o_opt.use_cpu else ("%d GPUs" % self.num_gpus)), flush=True)
            optim = falkon.optim.FalkonConjugateGradient(self.kernel, precond, o_opt)
            if Knm is not None:
                beta = optim.solve(
                    Knm, None, Y, self.penalty, initial_solution=None,
                    max_iter=self.maxiter, callback=val_cback)
            else:
                beta = optim.solve(
                    X, ny_points, Y, self.penalty, initial_solution=None,
                    max_iter=self.maxiter, callback=val_cback)

            self.alpha_ = precond.apply(beta)
            self.ny_points_ = ny_points
        return self
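A fitting sketch that exercises the validation callback defined inside fit above. The error_fn contract (true targets and predictions in, a float or an (error, name) tuple out) is inferred from the callback code; the constructor signature should be treated as an assumption:

import torch
import falkon

def mse(y_true: torch.Tensor, y_pred: torch.Tensor):
    # Returning a (value, name) tuple; a bare float also works.
    return torch.mean((y_true - y_pred) ** 2).item(), "MSE"

w = torch.randn(5, 1)
X = torch.randn(20_000, 5)
Y = X @ w + 0.05 * torch.randn(20_000, 1)
Xts = torch.randn(2_000, 5)
Yts = Xts @ w

model = falkon.Falkon(kernel=falkon.kernels.GaussianKernel(sigma=2.0),
                      penalty=1e-7, M=1000, maxiter=10,
                      error_fn=mse, error_every=1,
                      options=falkon.FalkonOptions(debug=False))
model.fit(X, Y, Xts=Xts, Yts=Yts)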
Example #10
def gpu_cholesky(A: torch.Tensor, upper: bool, clean: bool, overwrite: bool,
                 opt: FalkonOptions) -> torch.Tensor:
    """
    Parameters
    -----------
    A : torch.Tensor
        2D positive-definite matrix of size (n x n) that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T if `upper`
        is False.
    upper : bool
        Whether the triangle which should be factorized is the upper or lower of `A`.
    clean : bool
        Whether the "other" triangle of the output matrix (the one that
        does not contain the factorization) will be filled with zeros or
        not.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.
    opt : FalkonOptions
        Options forwarded for block calculation, and other knobs in the out-of-core
        parallel POTRF implementation. Useful options are the ones defined in
        :class:`~falkon.options.CholeskyOptions` .

    Notes
    ------
    The factorization will always be the 'lower' version of the factorization
    which could however end up on the upper-triangular part of the matrix
    in case A is not Fortran contiguous to begin with.
    """
    # Handle 'overwrite' option immediately so that its usage is reflected in memory
    # availability (in case A is on GPU).
    if not overwrite:
        # We could change the stride to be more favorable to the POTRF requirements
        # but it gets complicated. We leave such decisions to the user!
        A = copy_same_stride(A, pin_memory=True)

    # Decide which version of the algorithm to run: in-core or parallel out-of-core.
    # (Note that the original OOC version is never run.)

    # Determine GPU free RAM
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    if A.is_cuda:
        try:
            device = [d for d in gpu_info if d.Id == A.device.index][0]
        except IndexError:
            # This should never happen!
            raise RuntimeError("Device of matrix A (%s) is not recognized" %
                               (A.device))
    else:
        device = max(gpu_info, key=lambda g: g.actual_free_mem)
    ic = can_do_ic(A, device) and not opt.chol_force_ooc
    if opt.chol_force_in_core and not ic:
        raise RuntimeError(
            "Cannot run in-core POTRF but `chol_force_in_core` was specified.")

    f_order = is_f_contig(A)
    transposed = False
    if not f_order:
        A = A.T
        upper = not upper
        transposed = True
    # Now A is always in f_order. So we can only allow upper=False (ooc)
    if upper:
        # Can do only in-core!
        if not ic:
            raise ValueError(
                "GPU POTRF is only implemented on the "
                "lower triangle for Fortran-ordered matrices (or on the upper "
                "triangle for C-ordered matrices)")
    if not ic and A.is_cuda:
        _msg = "Cannot run out-of-core POTRF on CUDA matrix 'A'."
        if opt.chol_force_ooc:
            _msg += " Set the `chol_force_ooc` option to `False` in to allow in-core POTRF."
        raise ValueError(_msg)

    # Handle different implementations for POTRF: in-core and out-of-core
    if ic:
        if opt.debug:
            print("Using in-core POTRF")
        _ic_cholesky(A,
                     upper,
                     device=device.Id,
                     cusolver_handle=initialization.cusolver_handle(device.Id))
    else:
        if opt.debug:
            print("Using parallel POTRF")
        _parallel_potrf_runner(A, opt, gpu_info)

    # Perform cleaning of the 'other side' of the matrix
    if clean:
        la_helpers.zero_triang(A, upper=not upper)
    # Undo previous matrix transformations
    if transposed:
        A = A.T

    return A
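A small usage sketch of the out-of-core Cholesky above. The import path is an assumption and may differ across Falkon versions:

import torch
from falkon import FalkonOptions
from falkon.ooc_ops import gpu_cholesky  # assumed import path

n = 2000
B = torch.randn(n, n, dtype=torch.float64)
A = B @ B.T + n * torch.eye(n, dtype=torch.float64)   # positive definite
A_f = A.T.contiguous().T                              # Fortran-contiguous copy of A

opt = FalkonOptions(chol_force_ooc=True)  # force the parallel OOC code path
L = gpu_cholesky(A_f, upper=False, clean=True, overwrite=False, opt=opt)
print(torch.allclose(L @ L.T, A, atol=1e-6))          # lower factor reconstructs A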
Example #11
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        """Fits the Falkon KRR model.

        Parameters
        -----------
        X : torch.Tensor
            The tensor of training data, of shape [num_samples, num_dimensions].
            If X is in Fortran order (i.e. column-contiguous) then we can avoid
            an extra copy of the data.
        Y : torch.Tensor
            The tensor of training targets, of shape [num_samples, num_outputs].
            If X and Y represent a classification problem, Y can be encoded as a one-hot
            vector.
            If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Xts : torch.Tensor or None
            Tensor of validation data, of shape [num_test_samples, num_dimensions].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Yts : torch.Tensor or None
            Tensor of validation targets, of shape [num_test_samples, num_outputs].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.

        Returns
        --------
        model: Falkon
            The fitted model
        """
        X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)
        dtype = X.dtype
        self.fit_times_ = []
        self.ny_points_ = None
        self.alpha_ = None

        # Start training timer
        t_s = time.time()

        # Pick Nystrom centers
        if self.weight_fn is not None:
            # noinspection PyTupleAssignmentBalance
            ny_points, ny_indices = self.center_selection.select_indices(
                X, None)
        else:
            # noinspection PyTypeChecker
            ny_points: Union[
                torch.Tensor,
                falkon.sparse.SparseTensor] = self.center_selection.select(
                    X, None)
            ny_indices = None
        num_centers = ny_points.shape[0]

        # Decide whether to use CUDA for preconditioning and iterations, based on number of centers
        _use_cuda_preconditioner = (
            self.use_cuda_ and (not self.options.cpu_preconditioner)
            and num_centers >= get_min_cuda_preconditioner_size(
                dtype, self.options))
        _use_cuda_mmv = (self.use_cuda_ and
                         X.shape[0] * X.shape[1] * num_centers / self.num_gpus
                         >= get_min_cuda_mmv_size(dtype, self.options))

        if self.use_cuda_:
            ny_points = ny_points.pin_memory()

        with TicToc("Calcuating Preconditioner of size %d" % (num_centers),
                    debug=self.options.debug):
            pc_opt: FalkonOptions = dataclasses.replace(
                self.options, use_cpu=not _use_cuda_preconditioner)
            if pc_opt.debug:
                print("Preconditioner will run on %s" %
                      ("CPU" if pc_opt.use_cpu else
                       ("%d GPUs" % self.num_gpus)))
            precond = falkon.preconditioner.FalkonPreconditioner(
                self.penalty, self.kernel, pc_opt)
            ny_weight_vec = None
            if self.weight_fn is not None:
                ny_weight_vec = self.weight_fn(Y[ny_indices])
            precond.init(ny_points, weight_vec=ny_weight_vec)

        if _use_cuda_mmv:
            # Cache must be emptied to ensure enough memory is visible to the optimizer
            torch.cuda.empty_cache()
            X = X.pin_memory()

        # K_NM storage decision
        k_opt = dataclasses.replace(self.options, use_cpu=True)
        cpu_info = get_device_info(k_opt)
        available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
        if self._can_store_knm(X, ny_points, available_ram):
            Knm = self.kernel(X, ny_points, opt=self.options)
        else:
            Knm = None
        self.fit_times_.append(time.time() - t_s)  # Preparation time

        # Here we define the callback function which will run at the end
        # of conjugate gradient iterations. This function computes and
        # displays the validation error.
        validation_cback = None
        if self.error_fn is not None and self.error_every is not None:
            validation_cback = self._get_callback_fn(X, Y, Xts, Yts, ny_points,
                                                     precond)

        # Start with the falkon algorithm
        with TicToc('Computing Falkon iterations', debug=self.options.debug):
            o_opt: FalkonOptions = dataclasses.replace(
                self.options, use_cpu=not _use_cuda_mmv)
            if o_opt.debug:
                print("Optimizer will run on %s" %
                      ("CPU" if o_opt.use_cpu else
                       ("%d GPUs" % self.num_gpus)),
                      flush=True)
            optim = falkon.optim.FalkonConjugateGradient(
                self.kernel, precond, o_opt, weight_fn=self.weight_fn)
            if Knm is not None:
                beta = optim.solve(Knm,
                                   None,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)
            else:
                beta = optim.solve(X,
                                   ny_points,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)

            self.alpha_ = precond.apply(beta)
            self.ny_points_ = ny_points
        return self
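The pc_opt / o_opt variables above use dataclasses.replace to derive per-phase option sets from a single FalkonOptions instance. A stripped-down sketch of the same pattern, using a stand-in dataclass instead of the real FalkonOptions:

import dataclasses

@dataclasses.dataclass(frozen=True)
class Options:
    use_cpu: bool = False
    debug: bool = False
    max_gpu_mem: float = float('inf')

base = Options(debug=True)
pc_opt = dataclasses.replace(base, use_cpu=True)   # copy with one field changed
print(base.use_cpu, pc_opt.use_cpu, pc_opt.debug)  # False True True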
Example #12
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)

    # Define formula wrapper
    fn = Genred(formula,
                aliases,
                reduction_op=reduction,
                axis=axis,
                dtype=dtype,
                dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=X1.device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N,
                          T,
                          dtype=X1.dtype,
                          device='cpu',
                          pin_memory=backend != 'CPU')

    if backend.startswith("GPU"):
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [
            v for k, v in devices.get_device_info(opt).items() if k >= 0
        ]
        gpu_ram = [
            min((g.free_memory - 300 * 2**20) * ram_slack,
                opt.max_gpu_mem * ram_slack) for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=gpu_ram[i]), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
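The per-GPU arguments above hand each worker out.narrow(0, start, width); since narrow returns a view sharing storage with out, every block is written directly into the final output buffer. A tiny demonstration of that behaviour:

import torch

out = torch.zeros(6, 2)
block = out.narrow(0, 2, 3)   # rows 2..4, a view over `out`
block.fill_(1.0)              # writing through the view ...
print(out[:, 0])              # ... modifies out: tensor([0., 0., 1., 1., 1., 0.])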