Example #1
    def transform(self, X, y=None):
        """Calculate the permutation entropy of each two-dimensional array in
        `X`.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_points, n_dimensions)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of float, shape (n_samples, 1)
            One permutation entropy per entry in `X` along axis 0.

        """
        check_is_fitted(self, '_is_fitted')
        Xt = check_array(X, allow_nd=True)

        Xt = Parallel(n_jobs=self.n_jobs)(
            delayed(self._permutation_entropy)(Xt[s])
            for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)
        return Xt
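
A minimal, self-contained sketch of the pattern used above (and in most of the examples below): `gen_even_slices` splits the first axis into one chunk per effective worker, each chunk is handled by a `delayed` call, and the partial results are concatenated. The per-chunk function here is a toy stand-in, not the transformer's `_permutation_entropy`.

import numpy as np
from joblib import Parallel, delayed, effective_n_jobs
from sklearn.utils import gen_even_slices

def chunk_stat(chunk):
    # Toy per-sample statistic standing in for _permutation_entropy:
    # one value per 2D array in the chunk.
    return chunk.mean(axis=(1, 2)).reshape(-1, 1)

X = np.random.rand(100, 50, 3)
n_jobs = 2
Xt = Parallel(n_jobs=n_jobs)(
    delayed(chunk_stat)(X[s])
    for s in gen_even_slices(len(X), effective_n_jobs(n_jobs)))
Xt = np.concatenate(Xt)
print(Xt.shape)   # (100, 1)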
Example #2
def load_data_file_proxy(values, reduce_func, file_mapping, n_jobs=1):

    # Replace n_jobs w/ effective n_jobs
    n_jobs = effective_n_jobs(n_jobs)

    # Can at most be number of files
    n_jobs = min([n_jobs, len(values)])

    # Create proxy to fill in
    proxy = values.copy()

    # Generate splits based on n_jobs
    splits = np.array_split(np.array(values), n_jobs)

    # Nested func for multi-proc, to vectorize
    def change_to_map(x):
        return file_mapping[x]

    v_func = np.vectorize(change_to_map)

    # Apply v_func to each split
    file_splits = [v_func(split) for split in splits]

    # Load w/ joblib Parallel
    output = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(mp_single_load)(files=files, reduce_func=reduce_func)
        for files in file_splits)

    # Fill proxy with the concatenated output
    proxy[:] = np.concatenate(output)

    return proxy
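
A self-contained sketch of the same split/vectorize/threading pattern, with a stand-in loader in place of `mp_single_load` (whose definition is not shown above); the file names and reduce function are hypothetical.

import numpy as np
from joblib import Parallel, delayed, effective_n_jobs

values = np.array([0, 1, 2, 3, 4])
file_mapping = {v: "file_{}.npy".format(v) for v in values}   # hypothetical

def fake_load(files, reduce_func):
    # Stand-in for mp_single_load: "load" each file and reduce it to a scalar.
    return np.array([reduce_func(f) for f in files])

n_jobs = min(effective_n_jobs(2), len(values))
splits = np.array_split(np.array(values), n_jobs)
file_splits = [np.vectorize(file_mapping.get)(split) for split in splits]
output = Parallel(n_jobs=n_jobs, backend="threading")(
    delayed(fake_load)(files=files, reduce_func=len) for files in file_splits)
print(np.concatenate(output))   # length of each mapped file name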
Example #3
    def __init__(
        self,
        obj,
        on_missing=_opts["on_missing"],
        on_error=_opts["on_error"],
        on_leaf=_opts["on_leaf"],
        leaf_types=_opts["leaf_types"],
        default=_opts["default"],
        n_jobs=_opts["n_jobs"],
    ) -> None:

        # Anything that gets shared with children goes in here.
        self._opts = {
            "on_missing": on_missing,
            "on_error": on_error,
            "on_leaf": on_leaf,
            "default": default,
            "leaf_types": leaf_types,
            "n_jobs": n_jobs,
        }

        # Properties that are unique to this instance.
        self._repr = None

        self._leaf_types, self._leaf_funcs = self._parse_leaf_types(leaf_types)

        self._effective_n_jobs = effective_n_jobs(n_jobs)

        self._parent = None
        self._obj = obj
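
For reference, `effective_n_jobs` resolves the user-facing `n_jobs` into a concrete worker count, which is why the examples store or recompute it before splitting work. A quick check (exact numbers depend on the machine's CPU count):

from joblib import effective_n_jobs

print(effective_n_jobs(1))    # 1
print(effective_n_jobs(-1))   # number of CPUs on this machine
print(effective_n_jobs(-2))   # number of CPUs minus one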
Example #4
def partition_cells_by_kmeans(data: AnnData, rep: str, n_jobs: int, n_clusters: int, n_clusters2: int, n_init: int, random_state: int) -> List[int]:
    start = time.time()

    n_jobs = effective_n_jobs(n_jobs)

    rep_key = "X_" + rep
    X = data.obsm[rep_key].astype("float64")

    km = KMeans(n_clusters=n_clusters, n_jobs=n_jobs, n_init=n_init, random_state=random_state)
    km.fit(X)
    coarse = km.labels_.copy()

    km.set_params(n_init=1)
    labels = coarse.copy()
    base_sum = 0
    for i in range(n_clusters):
        idx = coarse == i
        nc = min(n_clusters2, idx.sum())
        km.set_params(n_clusters=nc)
        km.fit(X[idx,:])
        labels[idx] = base_sum + km.labels_
        base_sum += nc

    end = time.time()
    logger.info("partition_cells_by_kmeans finished in {:.2f}s.".format(end - start))

    return labels
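
A minimal sketch of the two-level relabelling above on toy data: sub-cluster labels inside each coarse cluster are shifted by a running offset (`base_sum`) so the final labels are globally unique. The `n_jobs` argument is omitted here because it is not available in recent scikit-learn `KMeans` versions.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(200, 5)
coarse = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

labels, base_sum = coarse.copy(), 0
for i in range(3):
    idx = coarse == i
    nc = min(2, int(idx.sum()))   # at most idx.sum() sub-clusters
    sub = KMeans(n_clusters=nc, n_init=1, random_state=0).fit_predict(X[idx, :])
    labels[idx] = base_sum + sub
    base_sum += nc

print(len(np.unique(labels)))   # up to 3 * 2 = 6 distinct labels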
Example #5
def _parallel_pairwise(X1, X2, metric, metric_params, homology_dimensions,
                       n_jobs):
    metric_func = implemented_metric_recipes[metric]
    effective_metric_params = metric_params.copy()
    none_dict = {dim: None for dim in homology_dimensions}
    samplings = effective_metric_params.pop("samplings", none_dict)
    step_sizes = effective_metric_params.pop("step_sizes", none_dict)
    if metric in ["heat", "persistence_image"]:
        parallel_kwargs = {"mmap_mode": "c"}
    else:
        parallel_kwargs = {}

    n_columns = len(X2)
    distance_matrices = Parallel(n_jobs=n_jobs, **parallel_kwargs)(
        delayed(metric_func)(_subdiagrams(X1, [dim], remove_dim=True),
                             _subdiagrams(X2[s], [dim], remove_dim=True),
                             sampling=samplings[dim],
                             step_size=step_sizes[dim],
                             **effective_metric_params)
        for dim in homology_dimensions
        for s in gen_even_slices(n_columns, effective_n_jobs(n_jobs)))

    distance_matrices = np.concatenate(distance_matrices, axis=1)
    distance_matrices = np.stack([
        distance_matrices[:, i * n_columns:(i + 1) * n_columns]
        for i in range(len(homology_dimensions))
    ],
                                 axis=2)
    return distance_matrices
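
The shape bookkeeping behind the final reshaping step above, on dummy data: chunks are produced in (dimension, slice) order and concatenated along axis 1 into a (n_rows, n_dims * n_columns) array, which is then cut back per dimension and stacked on a new last axis.

import numpy as np

n_rows, n_columns, n_dims = 4, 6, 2
flat = np.arange(n_rows * n_dims * n_columns).reshape(n_rows, n_dims * n_columns)
stacked = np.stack(
    [flat[:, i * n_columns:(i + 1) * n_columns] for i in range(n_dims)],
    axis=2)
print(stacked.shape)   # (4, 6, 2): (n_rows, n_columns, n_dims)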
Example #6
    def _mean_fn(self, X, fn, acc, slice=None):
        # Helper method that accumulates an arbitrary function in parallel on the accumulator acc
        # and calls the function fn on each tree e and returns the mean output. The function fn
        # should take as input a tree e and associated numerator n and denominator d structures and
        # return another function g_e, which takes as input X, check_input
        # If slice is not None, but rather a tuple (start, end), then a subset of the trees from
        # index start to index end will be used. The returned result is essentially:
        # (mean over e in slice)(g_e(X)).
        check_is_fitted(self, 'estimators_')
        # Check data
        X = self._validate_X_predict(X)

        if slice is None:
            estimator_slice = zip(self.estimators_, self.numerators_,
                                  self.denominators_)
            n_estimators = len(self.estimators_)
        else:
            estimator_slice = zip(self.estimators_[slice[0]:slice[1]],
                                  self.numerators_[slice[0]:slice[1]],
                                  self.denominators_[slice[0]:slice[1]])
            n_estimators = slice[1] - slice[0]

        # Assign chunk of trees to jobs
        n_jobs = min(effective_n_jobs(self.n_jobs), n_estimators)
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
            delayed(_accumulate_prediction)(fn(e, n, d), X, [acc], lock)
            for e, n, d in estimator_slice)
        acc /= n_estimators
        return acc
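
A self-contained sketch of the shared-memory accumulation used above: every worker adds its prediction to the same array under a lock, and the accumulator is averaged at the end. The `accumulate` helper below is a stand-in for `_accumulate_prediction`, which is not shown.

import threading
import numpy as np
from joblib import Parallel, delayed

def accumulate(predict, X, out, lock):
    pred = predict(X)
    with lock:
        out[0] += pred

X = np.random.rand(10, 3)
acc = np.zeros(10)
lock = threading.Lock()
predictors = [lambda X, w=w: X.sum(axis=1) * w for w in (0.5, 1.0, 1.5, 2.0)]

Parallel(n_jobs=2, require="sharedmem")(
    delayed(accumulate)(p, X, [acc], lock) for p in predictors)
acc /= len(predictors)
print(acc.shape)   # (10,)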
Example #7
    def transform(self, X, y=None):
        """For each binary image in the collection `X`, calculate its negation.
        Return the collection of negated binary images.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z])
            Input data. Each entry along axis 0 is interpreted as a 2D or 3D
            binary image.

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \
            [, n_pixels_z])
            Transformed collection of images. Each entry along axis 0 is a
            2D or 3D binary image.

        """
        check_is_fitted(self)
        Xt = check_array(X, allow_nd=True)

        Xt = Parallel(n_jobs=self.n_jobs)(delayed(
            self._invert)(Xt[s])
            for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)

        return Xt
Example #8
    def transform(self, X, y=None):
        """Compute the persistence entropies of diagrams in `X`.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features, 3)
            Input data. Array of persistence diagrams, each a collection of
            triples [b, d, q] representing persistent topological features
            through their birth (b), death (d) and homology dimension (q).

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_homology_dimensions)
            Persistence entropies: one value per sample and per homology
            dimension seen in :meth:`fit`. Index i along axis 1 corresponds
            to the i-th homology dimension in :attr:`homology_dimensions_`.

        """
        check_is_fitted(self)
        X = check_diagram(X)

        with np.errstate(divide='ignore', invalid='ignore'):
            Xt = Parallel(n_jobs=self.n_jobs)(
                delayed(self._persistence_entropy)(_subdiagrams(X, [dim])[s])
                for dim in self.homology_dimensions_ for s in gen_even_slices(
                    X.shape[0], effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt).reshape(self._n_dimensions, X.shape[0]).T
        return Xt
Example #9
    def transform(self, X, y=None):
        """Compute derivatives of multi-channel curves.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_channels, n_bins)
            Input collection of multi-channel curves.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_channels, n_bins - order)
            Output collection of multi-channel curves given by taking discrete
            differences of order `order` in each channel in the curves in `X`.

        """
        check_is_fitted(self)
        Xt = check_array(X, ensure_2d=False, allow_nd=True)
        if Xt.ndim != 3:
            raise ValueError("Input must be 3-dimensional.")

        Xt = Parallel(n_jobs=self.n_jobs)(
            delayed(np.diff)(Xt[s], n=self.order, axis=-1)
            for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)

        return Xt
Example #10
    def _partition_bmus(self, X):
        """Private function used to partition bmus between jobs.

        Parameters
        ----------
        X : np.array
            List of datapoints

        Returns
        -------
        n_jobs : int
            Number of jobs
        list of int
            List of number of datapoints per job
        list of int
            List of start values for every job list

        """
        n_datapoints = len(X)
        n_jobs = min(effective_n_jobs(self.n_jobs), n_datapoints)

        n_datapoints_per_job = np.full(n_jobs,
                                       n_datapoints // n_jobs,
                                       dtype=int)

        n_datapoints_per_job[:n_datapoints % n_jobs] += 1
        starts = np.cumsum(n_datapoints_per_job)

        return n_jobs, n_datapoints_per_job.tolist(), [0] + starts.tolist()
Example #11
    def fit(self, X, y=None, disable_progress=False):
        """Fit the consensus clustering from features

        Parameters
        ----------
        X : np.ndarray, shape (n_samples, n_features)
            Training instances/objects to cluster

        y : Ignored
            Not used, present here for consistency with the sklearn API.

        disable_progress : bool, default=False
            Whether to show the progress bar or not, when fitting multiple iterations
            of K-Means on random subsets of the data. Set `True` to disable it.

        Returns
        -------
        self

        """
        self.num_samples_, _ = X.shape
        self.n_jobs = effective_n_jobs(self.n_jobs)
        self.consensus_matrix_ = self._fit(X, disable_progress)
        self.labels_ = self._fit_distance_matrix(self.consensus_matrix_)

        return self
Example #12
    def transform(self, X, y=None):
        """For each collection of binary images, calculate the corresponding
        collection of point clouds based on the coordinates of activated
        pixels.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z])
            Input data. Each entry along axis 0 is interpreted as a 2D or 3D
            binary image.

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_pixels_x * n_pixels_y [* \
            n_pixels_z], n_dimensions)
            Transformed collection of images. Each entry along axis 0 is a
            point cloud in ``n_dimensions``-dimensional space.

        """
        check_is_fitted(self)
        Xt = check_array(X, allow_nd=True)

        Xt = np.swapaxes(np.flip(Xt, axis=1), 1, 2)
        Xt = Parallel(n_jobs=self.n_jobs)(delayed(
            self._embed)(Xt[s])
            for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)))
        Xt = reduce(iconcat, Xt, [])
        return Xt
Example #13
def _parallel_pairwise(X1, X2, metric, metric_params, homology_dimensions,
                       n_jobs):
    metric_func = implemented_metric_recipes[metric]
    effective_metric_params = metric_params.copy()
    none_dict = {dim: None for dim in homology_dimensions}
    samplings = effective_metric_params.pop('samplings', none_dict)
    step_sizes = effective_metric_params.pop('step_sizes', none_dict)

    if X2 is None:
        X2 = X1

    distance_matrices = Parallel(n_jobs=n_jobs)(
        delayed(metric_func)(_subdiagrams(X1, [dim], remove_dim=True),
                             _subdiagrams(X2[s], [dim], remove_dim=True),
                             sampling=samplings[dim],
                             step_size=step_sizes[dim],
                             **effective_metric_params)
        for dim in homology_dimensions
        for s in gen_even_slices(X2.shape[0], effective_n_jobs(n_jobs)))

    distance_matrices = np.concatenate(distance_matrices, axis=1)
    distance_matrices = np.stack([
        distance_matrices[:, i * X2.shape[0]:(i + 1) * X2.shape[0]]
        for i in range(len(homology_dimensions))
    ],
                                 axis=2)
    return distance_matrices
Example #14
    def transform(self, X, y=None):
        """Calculate the entropy of each array in `X`.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_points, d)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of float, shape (n_samples, 1)
            Array of entropies (one per array in `X`).

        """

        # Check if fit had been called
        check_is_fitted(self, ['_is_fitted'])
        X = check_array(X, allow_nd=True)

        Xt = Parallel(n_jobs=self.n_jobs)(
            delayed(self._permutation_entropy)(X[s])
            for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)
        return Xt
Example #15
    def transform(self, X, y=None):
        """For each binary image in the collection `X`, adds a padding.
        Return the collection of padded binary images.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z])
            Input data. Each entry along axis 0 is interpreted as a 2D or 3D
            image.

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_pixels_x + 2 * padding_x, \
            n_pixels_y + 2 * padding_y [, n_pixels_z + 2 * padding_z])
            Transformed collection of images. Each entry along axis 0 is a
            2D or 3D binary image.

        """
        check_is_fitted(self)
        Xt = check_array(X, allow_nd=True)

        Xt = Parallel(n_jobs=self.n_jobs)(delayed(
            np.pad)(Xt[s], pad_width=self._pad_width,
                    constant_values=self.value)
            for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)

        return Xt
Example #16
    def _e_step(self, X, cal_sstats, random_init, parallel=None):
        """E-step in EM update.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        cal_sstats : boolean
            Parameter that indicates whether to calculate sufficient statistics
            or not. Set ``cal_sstats`` to True when we need to run M-step.

        random_init : boolean
            Parameter that indicates whether to initialize document topic
            distribution randomly in the E-step. Set it to True in training
            steps.

        parallel : joblib.Parallel (optional)
            Pre-initialized instance of joblib.Parallel.

        Returns
        -------
        (doc_topic_distr, suff_stats) :
            `doc_topic_distr` is unnormalized topic distribution for each
            document. In the literature, this is called `gamma`.
            `suff_stats` is expected sufficient statistics for the M-step.
            When `cal_sstats == False`, it will be None.

        """

        # Run e-step in parallel
        random_state = self.random_state_ if random_init else None

        # TODO: make Parallel._effective_n_jobs public instead?
        n_jobs = effective_n_jobs(self.n_jobs)
        if parallel is None:
            parallel = Parallel(n_jobs=n_jobs,
                                verbose=max(0, self.verbose - 1))
        results = parallel(
            delayed(_update_doc_distribution)
            (X[idx_slice, :], self.exp_dirichlet_component_,
             self.doc_topic_prior_, self.max_doc_update_iter,
             self.mean_change_tol, cal_sstats, random_state)
            for idx_slice in gen_even_slices(X.shape[0], n_jobs))

        # merge result
        doc_topics, sstats_list = zip(*results)
        doc_topic_distr = np.vstack(doc_topics)

        if cal_sstats:
            # This step finishes computing the sufficient statistics for the
            # M-step.
            suff_stats = np.zeros(self.components_.shape)
            for sstats in sstats_list:
                suff_stats += sstats
            suff_stats *= self.exp_dirichlet_component_
        else:
            suff_stats = None

        return (doc_topic_distr, suff_stats)
Example #17
    def transform(self, X, y=None):
        """For each greyscale image in the collection `X`, calculate a
        corresponding binary image by applying the `threshold`. Return the
        collection of binary images.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z])
            Input data. Each entry along axis 0 is interpreted as a 2D or 3D
            greyscale image.

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \
            [, n_pixels_z])
            Transformed collection of images. Each entry along axis 0 is a
            2D or 3D binary image.

        """
        check_is_fitted(self)
        Xt = check_array(X, allow_nd=True)

        Xt = Parallel(n_jobs=self.n_jobs)(delayed(
            self._binarize)(Xt[s])
            for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)

        if self.n_dimensions_ == 2:
            Xt = Xt.reshape(X.shape)

        return Xt
Example #18
    def transform(self, X, y=None):
        """For each collection of binary images, calculate the corresponding
        collection of point clouds based on the coordinates of activated
        pixels.
        Parameters
        ----------
        X : ndarray, shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z])
            Input data. Each entry along axis 0 is interpreted as a 2D or 3D
            binary image.
        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.
        Returns
        -------
        Xt : ndarray, shape (n_samples, n_pixels_x * n_pixels_y [* n_pixels_z],
            n_dimensions)
            Transformed collection of images. Each entry along axis 0 is a
            point cloud in a `n_dimensions` dimensional space.
        """
        check_is_fitted(self)
        Xt = check_array(X, ensure_2d=False, allow_nd=True, copy=True)

        Xt = Parallel(n_jobs=self.n_jobs)(
            delayed(self._embed)(Xt[s]) for s in gen_even_slices(
                Xt.shape[0], effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)
        return Xt
Example #19
    def transform(self, X, y=None):
        """For each binary image in the collection `X`, calculate a
        corresponding greyscale image based on the distance of its pixels to
        the hyperplane defined by the `direction` vector and the first seen
        edge of the images following that `direction`. Return the collection
        of greyscale images.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z])
            Input data. Each entry along axis 0 is interpreted as a 2D or 3D
            binary image.

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \
            [, n_pixels_z])
            Transformed collection of images. Each entry along axis 0 is a
            2D or 3D greyscale image.

        """
        check_is_fitted(self)
        Xt = check_array(X, allow_nd=True)

        Xt = Parallel(n_jobs=self.n_jobs)(
            delayed(self._calculate_height)(Xt[s])
            for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)

        return Xt
Example #20
    def transform(self, X, y=None):
        """Compute the Betti curves of diagrams in `X`.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features, 3)
            Input data. Array of persistence diagrams, each a collection of
            triples [b, d, q] representing persistent topological features
            through their birth (b), death (d) and homology dimension (q).

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_homology_dimensions, n_bins)
            Betti curves: one curve (represented as a one-dimensional array
            of integer values) per sample and per homology dimension seen
            in :meth:`fit`. Index i along axis 1 corresponds to the i-th
            homology dimension in :attr:`homology_dimensions_`.

        """
        check_is_fitted(self)
        X = check_diagram(X)

        Xt = Parallel(n_jobs=self.n_jobs)(
            delayed(betti_curves)(_subdiagrams(X, [dim], remove_dim=True)[s],
                                  self._samplings[dim])
            for dim in self.homology_dimensions_ for s in gen_even_slices(
                X.shape[0], effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt).\
            reshape(self._n_dimensions, X.shape[0], -1).\
            transpose((1, 0, 2))
        return Xt
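
Shape sketch for the post-processing step above, with dummy chunks: concatenating the per-(dimension, slice) results, reshaping to (n_dims, n_samples, n_bins) and transposing yields one curve per sample and per homology dimension.

import numpy as np

n_dims, n_samples, n_bins = 2, 5, 8
# Four chunks in (dimension, slice) order: dim 0 split as 3 + 2, then dim 1.
chunks = [np.zeros((3, n_bins)), np.zeros((2, n_bins)),
          np.ones((3, n_bins)), np.ones((2, n_bins))]
Xt = np.concatenate(chunks).reshape(n_dims, n_samples, n_bins).transpose((1, 0, 2))
print(Xt.shape)   # (5, 2, 8)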
Example #21
    def transform(self, X, y=None):
        """For each binary image in the collection `X`, calculate a
        corresponding grayscale image based on the distance of its pixels to
        the center. Return the collection of grayscale images.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z])
            Input data. Each entry along axis 0 is interpreted as a 2D or 3D
            binary image.

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \
            [, n_pixels_z])
            Transformed collection of images. Each entry along axis 0 is a
            2D or 3D grayscale image.

        """
        check_is_fitted(self)
        Xt = check_array(X, ensure_2d=False, allow_nd=True, copy=True)

        Xt = Parallel(n_jobs=self.n_jobs)(
            delayed(self._calculate_radial)(Xt[s]) for s in gen_even_slices(
                Xt.shape[0], effective_n_jobs(self.n_jobs)))
        Xt = np.concatenate(Xt)

        return Xt
Example #22
def run_multiple_kmeans(
    data: AnnData,
    rep: "str",
    n_jobs: int,
    n_clusters: int,
    n_init: int,
    random_state: int,
    temp_folder: None,
) -> List[str]:
    """ Spectral clustering in parallel
    """
    start = time.time()

    n_jobs = effective_n_jobs(n_jobs)

    rep_key = "X_" + rep
    X = data.obsm[rep_key].astype("float64")

    np.random.seed(random_state)
    seeds = np.random.randint(np.iinfo(np.int32).max, size=n_init)
    results = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)(
        delayed(run_one_instance_of_kmeans)(n_clusters, X, seed) for seed in
        seeds)  # Note that if n_jobs == 1, joblib will not fork a new process.

    labels = list(zip(*results))
    uniqs = np.unique(labels, axis=0)
    transfer_dict = {tuple(k): v for k, v in zip(uniqs, range(uniqs.shape[0]))}
    labels = [transfer_dict[x] for x in labels]

    end = time.time()
    logger.info("run_multiple_kmeans finished in {:.2f}s.".format(end - start))

    return labels
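
A small sketch of the label-consolidation step above: each cell's tuple of per-run KMeans labels is mapped to a single consensus id through `np.unique` over rows. The two toy "runs" here are hypothetical.

import numpy as np

results = [np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0])]   # two KMeans runs
labels = list(zip(*results))                                 # one tuple per cell
uniqs = np.unique(labels, axis=0)
transfer_dict = {tuple(k): v for k, v in zip(uniqs, range(uniqs.shape[0]))}
print([transfer_dict[x] for x in labels])                    # [0, 0, 1, 1]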
Example #23
    def fit(self, X, y):
        """Fit linear model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted `TheilSenRegressor` estimator.
        """
        random_state = check_random_state(self.random_state)
        X, y = self._validate_data(X, y, y_numeric=True)
        n_samples, n_features = X.shape
        n_subsamples, self.n_subpopulation_ = self._check_subparams(
            n_samples, n_features)
        self.breakdown_ = _breakdown_point(n_samples, n_subsamples)

        if self.verbose:
            print("Breakdown point: {0}".format(self.breakdown_))
            print("Number of samples: {0}".format(n_samples))
            tol_outliers = int(self.breakdown_ * n_samples)
            print("Tolerable outliers: {0}".format(tol_outliers))
            print("Number of subpopulations: {0}".format(
                self.n_subpopulation_))

        # Determine indices of subpopulation
        if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation:
            indices = list(combinations(range(n_samples), n_subsamples))
        else:
            indices = [
                random_state.choice(n_samples,
                                    size=n_subsamples,
                                    replace=False)
                for _ in range(self.n_subpopulation_)
            ]

        n_jobs = effective_n_jobs(self.n_jobs)
        index_list = np.array_split(indices, n_jobs)
        weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_lstsq)(X, y, index_list[job], self.fit_intercept)
            for job in range(n_jobs))
        weights = np.vstack(weights)
        self.n_iter_, coefs = _spatial_median(weights,
                                              max_iter=self.max_iter,
                                              tol=self.tol)

        if self.fit_intercept:
            self.intercept_ = coefs[0]
            self.coef_ = coefs[1:]
        else:
            self.intercept_ = 0.0
            self.coef_ = coefs

        return self
Example #24
def get_neighbors(
    data: AnnData,
    K: int = 100,
    rep: str = "pca",
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `AnnData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to all available threads
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(adata)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if knn_is_cached(data, indices_key, distances_key, K):
        indices = data.uns[indices_key]
        distances = data.uns[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep),
            K=K,
            n_jobs=effective_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
        )
        data.uns[indices_key] = indices
        data.uns[distances_key] = distances

    return indices, distances
Example #25
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    n_jobs = min(effective_n_jobs(n_jobs), n_estimators)

    # Partition estimators between jobs
    n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs, dtype=int)
    n_estimators_per_job[:n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)

    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
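
Worked example of the partition arithmetic above: 10 estimators over 3 jobs gives chunk sizes [4, 3, 3] and start offsets [0, 4, 7, 10] (the remainder is spread over the first jobs).

import numpy as np

n_estimators, n_jobs = 10, 3
per_job = np.full(n_jobs, n_estimators // n_jobs, dtype=int)
per_job[:n_estimators % n_jobs] += 1
starts = [0] + np.cumsum(per_job).tolist()
print(per_job.tolist(), starts)   # [4, 3, 3] [0, 4, 7, 10]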
Example #26
def parallel_pairwise_dist(func, X, Y=None, njobs=-1):
    njobs = effective_n_jobs(njobs)

    if Y is None:
        Y = X

    fd = delayed(_dist_wrapper)
    out = Parallel(n_jobs=njobs)(
        fd(func, X, Y[s])
        for s in gen_even_slices(len(Y), effective_n_jobs(njobs)))
    return np.hstack(out)
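
A self-contained sketch of the column-blocked pairwise pattern above, assuming `_dist_wrapper(func, X, Y_block)` (not shown) simply evaluates `func(X, Y_block)`; the distance kernel below is a hypothetical squared-Euclidean block function.

import numpy as np
from joblib import Parallel, delayed, effective_n_jobs
from sklearn.utils import gen_even_slices

def sq_euclidean_block(A, B):
    # (len(A), len(B)) block of squared Euclidean distances.
    return ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)

X = np.random.rand(20, 3)
Y, njobs = X, 2
out = Parallel(n_jobs=njobs)(
    delayed(sq_euclidean_block)(X, Y[s])
    for s in gen_even_slices(len(Y), effective_n_jobs(njobs)))
print(np.hstack(out).shape)   # (20, 20)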
Example #27
def parallelize(n_jobs,
                func,
                iterable,
                respective=False,
                tq=True,
                batch_size='auto',
                **kwargs):
    """
    parallelize the function for iterable.

    make sure in if __name__ == "__main__":

    Parameters
    ----------
    batch_size
    respective:bool
        Import the parameters respectively or as a whole
    tq:bool
         View Progress or not
    n_jobs:int
        cpu numbers. n_jobs is the number of workers requested by the callers. Passing n_jobs=-1
    means requesting all available workers for instance matching the number of CPU cores on the worker host(s).
    func:
        function to calculate
    iterable:
        interable object
    kwargs:
        kwargs for function

    Returns
    -------
    results
        function results

    """

    func = partial(func, **kwargs)
    if effective_n_jobs(n_jobs) == 1:
        parallel, func = list, func
    else:
        parallel = Parallel(n_jobs=n_jobs, batch_size=batch_size)
        func = delayed(func)
    if tq:
        if respective:
            return parallel(func(*iter_i) for iter_i in tqdm(iterable))
        else:
            return parallel(func(iter_i) for iter_i in tqdm(iterable))
    else:
        if respective:
            return parallel(func(*iter_i) for iter_i in iterable)
        else:
            return parallel(func(iter_i) for iter_i in iterable)
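
A usage sketch for the helper above (assuming it is importable as `parallelize`): with `respective=True` each item of `iterable` is unpacked into positional arguments, with `respective=False` it is passed as a single argument.

def add(a, b):
    return a + b

pairs = [(1, 2), (3, 4), (5, 6)]
print(parallelize(2, add, pairs, respective=True, tq=False))           # [3, 7, 11]
print(parallelize(1, len, ["ab", "abc"], respective=False, tq=False))  # [2, 3]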
Example #28
def compute_fitness(x, y, population, metric, n_jobs):
    candidates = population.astype('bool')
    n_jobs = min(effective_n_jobs(n_jobs), len(candidates))
    models = list(map(lambda i: deepcopy(metric), range(len(candidates))))
    models = Parallel(n_jobs=n_jobs)(
        delayed(compute_score)(models[i], x, y, candidates[i])
        for i in range(len(candidates)))
    scores = np.array(list(map(lambda model: model.value, models)))
    learners = list(map(lambda model: model.learner_, models))
    weights = compute_feature_weights(learners, candidates)
    return scores, weights
Example #29
def _partition_columns(columns, n_jobs):
    """Private function to partition columns splitting between jobs."""
    # Compute the number of jobs
    n_columns = len(columns)
    n_jobs = min(effective_n_jobs(n_jobs), n_columns)

    # Partition columns between jobs
    n_columns_per_job = np.full(n_jobs, n_columns // n_jobs, dtype=int)
    n_columns_per_job[:n_columns % n_jobs] += 1
    columns_per_job = np.cumsum(n_columns_per_job)
    columns_per_job = np.split(columns, columns_per_job)
    columns_per_job = columns_per_job[:-1]

    return n_jobs, columns_per_job
Example #30
    def partial_fit(self, X, y=None):
        """Online VB with Mini-Batch update.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        y : Ignored

        Returns
        -------
        self
        """
        self._check_params()
        first_time = not hasattr(self, 'components_')

        # In theory reset should be equal to `first_time`, but there are tests
        # checking the input number of feature and they expect a specific
        # string, which is not the same one raised by check_n_features. So we
        # don't check n_features_in_ here for now (it's done with adhoc code in
        # the estimator anyway).
        # TODO: set reset=first_time when addressing reset in
        # predict/transform/etc.
        reset_n_features = True
        X = self._check_non_neg_array(X, reset_n_features,
                                      "LatentDirichletAllocation.partial_fit")
        n_samples, n_features = X.shape
        batch_size = self.batch_size

        # initialize parameters or check
        if first_time:
            self._init_latent_vars(n_features)

        if n_features != self.components_.shape[1]:
            raise ValueError(
                "The provided data has %d dimensions while "
                "the model was trained with feature size %d." %
                (n_features, self.components_.shape[1]))

        n_jobs = effective_n_jobs(self.n_jobs)
        with Parallel(n_jobs=n_jobs,
                      verbose=max(0, self.verbose - 1)) as parallel:
            for idx_slice in gen_batches(n_samples, batch_size):
                self._em_step(X[idx_slice, :],
                              total_samples=self.total_samples,
                              batch_update=False,
                              parallel=parallel)

        return self
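
A sketch of the Parallel-reuse pattern above: one `Parallel` instance is created as a context manager and called once per mini-batch, so the worker pool is not re-spawned for every batch. The per-row function is a toy stand-in for the E-step.

import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import gen_batches

X = np.random.rand(10, 4)
batch_size = 3
with Parallel(n_jobs=2) as parallel:
    for idx_slice in gen_batches(len(X), batch_size):
        row_sums = parallel(delayed(np.sum)(row) for row in X[idx_slice])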