def transform(self, X, y=None): """Calculate the permutation entropy of each two-dimensional array in `X`. Parameters ---------- X : ndarray of shape (n_samples, n_points, n_dimensions) Input data. y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of int, shape (n_samples, 1) One permutation entropy per entry in `X` along axis 0. """ check_is_fitted(self, '_is_fitted') Xt = check_array(X, allow_nd=True) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._permutation_entropy)(Xt[s]) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def load_data_file_proxy(values, reduce_func, file_mapping, n_jobs=1):

    # Replace n_jobs w/ effective n_jobs
    n_jobs = effective_n_jobs(n_jobs)

    # Can at most be number of files
    n_jobs = min([n_jobs, len(values)])

    # Create proxy to fill in
    proxy = values.copy()

    # Generate splits based on n_jobs
    splits = np.array_split(np.array(values), n_jobs)

    # Nested func for multi-proc, to vectorize
    def change_to_map(x):
        return file_mapping[x]

    v_func = np.vectorize(change_to_map)

    # Apply v_func to each split
    file_splits = [v_func(split) for split in splits]

    # Load w/ joblib Parallel
    output = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(mp_single_load)(files=files, reduce_func=reduce_func)
        for files in file_splits)

    # Fill proxy with the concatenated output
    proxy[:] = np.concatenate(output)

    return proxy
def __init__(
    self,
    obj,
    on_missing=_opts["on_missing"],
    on_error=_opts["on_error"],
    on_leaf=_opts["on_leaf"],
    leaf_types=_opts["leaf_types"],
    default=_opts["default"],
    n_jobs=_opts["n_jobs"],
) -> None:
    # Anything that gets shared with children goes in here.
    self._opts = {
        "on_missing": on_missing,
        "on_error": on_error,
        "on_leaf": on_leaf,
        "default": default,
        "leaf_types": leaf_types,
        "n_jobs": n_jobs,
    }

    # Properties that are unique to this instance.
    self._repr = None
    self._leaf_types, self._leaf_funcs = self._parse_leaf_types(leaf_types)
    self._effective_n_jobs = effective_n_jobs(n_jobs)
    self._parent = None
    self._obj = obj
def partition_cells_by_kmeans(
    data: AnnData,
    rep: str,
    n_jobs: int,
    n_clusters: int,
    n_clusters2: int,
    n_init: int,
    random_state: int,
) -> List[int]:

    start = time.time()

    n_jobs = effective_n_jobs(n_jobs)

    rep_key = "X_" + rep
    X = data.obsm[rep_key].astype("float64")

    km = KMeans(n_clusters=n_clusters, n_jobs=n_jobs, n_init=n_init,
                random_state=random_state)
    km.fit(X)
    coarse = km.labels_.copy()

    km.set_params(n_init=1)
    labels = coarse.copy()
    base_sum = 0
    for i in range(n_clusters):
        idx = coarse == i
        nc = min(n_clusters2, idx.sum())
        km.set_params(n_clusters=nc)
        km.fit(X[idx, :])
        labels[idx] = base_sum + km.labels_
        base_sum += nc

    end = time.time()
    logger.info("partition_cells_by_kmeans finished in {:.2f}s.".format(end - start))
    return labels
def _parallel_pairwise(X1, X2, metric, metric_params, homology_dimensions,
                       n_jobs):
    metric_func = implemented_metric_recipes[metric]
    effective_metric_params = metric_params.copy()
    none_dict = {dim: None for dim in homology_dimensions}
    samplings = effective_metric_params.pop("samplings", none_dict)
    step_sizes = effective_metric_params.pop("step_sizes", none_dict)

    if metric in ["heat", "persistence_image"]:
        parallel_kwargs = {"mmap_mode": "c"}
    else:
        parallel_kwargs = {}

    n_columns = len(X2)
    distance_matrices = Parallel(n_jobs=n_jobs, **parallel_kwargs)(
        delayed(metric_func)(_subdiagrams(X1, [dim], remove_dim=True),
                             _subdiagrams(X2[s], [dim], remove_dim=True),
                             sampling=samplings[dim],
                             step_size=step_sizes[dim],
                             **effective_metric_params)
        for dim in homology_dimensions
        for s in gen_even_slices(n_columns, effective_n_jobs(n_jobs)))

    distance_matrices = np.concatenate(distance_matrices, axis=1)
    distance_matrices = np.stack(
        [distance_matrices[:, i * n_columns:(i + 1) * n_columns]
         for i in range(len(homology_dimensions))],
        axis=2)
    return distance_matrices
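# Shape bookkeeping for the block reassembly above, as a hedged illustration (not
# part of the original source): the Parallel call yields one (len(X1), chunk) block
# per (homology dimension, column slice) pair; concatenating along axis 1 lines the
# blocks up as [dim0 | dim1 | ...], and the final np.stack pulls each dimension's
# (len(X1), n_columns) slab into its own entry along a new last axis.
import numpy as np

n1, n_columns, dims = 4, 6, 2
blocks = [np.zeros((n1, 3)) for _ in range(dims * 2)]  # two slices of 3 columns per dim
flat = np.concatenate(blocks, axis=1)                  # (4, 12) == (n1, dims * n_columns)
stacked = np.stack(
    [flat[:, i * n_columns:(i + 1) * n_columns] for i in range(dims)],
    axis=2)                                            # (4, 6, 2) == (n1, n_columns, dims)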
def _mean_fn(self, X, fn, acc, slice=None):
    # Helper function that accumulates an arbitrary function in parallel on
    # the accumulator acc and calls the function fn on each tree e and
    # returns the mean output. The function fn should take as input a tree e
    # and associated numerator n and denominator d structures and return
    # another function g_e, which takes as input X, check_input.
    # If slice is not None, but rather a tuple (start, end), then a subset of
    # the trees from index start to index end will be used. The returned
    # result is essentially: (mean over e in slice)(g_e(X)).
    check_is_fitted(self, 'estimators_')
    # Check data
    X = self._validate_X_predict(X)

    if slice is None:
        estimator_slice = zip(self.estimators_, self.numerators_,
                              self.denominators_)
        n_estimators = len(self.estimators_)
    else:
        estimator_slice = zip(self.estimators_[slice[0]:slice[1]],
                              self.numerators_[slice[0]:slice[1]],
                              self.denominators_[slice[0]:slice[1]])
        n_estimators = slice[1] - slice[0]

    # Assign chunk of trees to jobs
    n_jobs = min(effective_n_jobs(self.n_jobs), n_estimators)
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
        delayed(_accumulate_prediction)(fn(e, n, d), X, [acc], lock)
        for e, n, d in estimator_slice)
    acc /= n_estimators
    return acc
def transform(self, X, y=None): """For each binary image in the collection `X`, calculate its negation. Return the collection of negated binary images. Parameters ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D binary image. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \ [, n_pixels_z]) Transformed collection of images. Each entry along axis 0 is a 2D or 3D binary image. """ check_is_fitted(self) Xt = check_array(X, allow_nd=True) Xt = Parallel(n_jobs=self.n_jobs)(delayed( self._invert)(Xt[s]) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def transform(self, X, y=None): """Compute the persistence entropies of diagrams in `X`. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_homology_dimensions) Persistence entropies: one value per sample and per homology dimension seen in :meth:`fit`. Index i along axis 1 corresponds to the i-th homology dimension in :attr:`homology_dimensions_`. """ check_is_fitted(self) X = check_diagram(X) with np.errstate(divide='ignore', invalid='ignore'): Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._persistence_entropy)(_subdiagrams(X, [dim])[s]) for dim in self.homology_dimensions_ for s in gen_even_slices( X.shape[0], effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt).reshape(self._n_dimensions, X.shape[0]).T return Xt
def transform(self, X, y=None): """Compute derivatives of multi-channel curves. Parameters ---------- X : ndarray of shape (n_samples, n_channels, n_bins) Input collection of multi-channel curves. y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_channels, n_bins - order) Output collection of multi-channel curves given by taking discrete differences of order `order` in each channel in the curves in `X`. """ check_is_fitted(self) Xt = check_array(X, ensure_2d=False, allow_nd=True) if Xt.ndim != 3: raise ValueError("Input must be 3-dimensional.") Xt = Parallel(n_jobs=self.n_jobs)( delayed(np.diff)(Xt[s], n=self.order, axis=-1) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def _partition_bmus(self, X):
    """Private function used to partition bmus between jobs.

    Parameters
    ----------
    X : np.array
        List of datapoints

    Returns
    -------
    n_jobs : int
        Number of jobs
    list of int
        List of number of datapoints per job
    list of int
        List of start values for every job

    """
    n_datapoints = len(X)
    n_jobs = min(effective_n_jobs(self.n_jobs), n_datapoints)

    n_datapoints_per_job = np.full(n_jobs, n_datapoints // n_jobs,
                                   dtype=int)
    n_datapoints_per_job[:n_datapoints % n_jobs] += 1
    starts = np.cumsum(n_datapoints_per_job)

    return n_jobs, n_datapoints_per_job.tolist(), [0] + starts.tolist()
def fit(self, X, y=None, disable_progress=False):
    """Fit the consensus clustering from features.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Training instances/objects to cluster.

    y : Ignored
        Not used, present here for consistency with the sklearn API.

    disable_progress : bool, default=False
        Whether to show the progress bar or not, when fitting multiple
        iterations of K-Means on random subsets of the data. Set `True` to
        disable it.

    Returns
    -------
    self
    """
    self.num_samples_, _ = X.shape
    self.n_jobs = effective_n_jobs(self.n_jobs)

    self.consensus_matrix_ = self._fit(X, disable_progress)
    self.labels_ = self._fit_distance_matrix(self.consensus_matrix_)
    return self
def transform(self, X, y=None): """For each collection of binary images, calculate the corresponding collection of point clouds based on the coordinates of activated pixels. Parameters ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D binary image. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_pixels_x * n_pixels_y [* \ n_pixels_z], n_dimensions) Transformed collection of images. Each entry along axis 0 is a point cloud in ``n_dimensions``-dimensional space. """ check_is_fitted(self) Xt = check_array(X, allow_nd=True) Xt = np.swapaxes(np.flip(Xt, axis=1), 1, 2) Xt = Parallel(n_jobs=self.n_jobs)(delayed( self._embed)(Xt[s]) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = reduce(iconcat, Xt, []) return Xt
def _parallel_pairwise(X1, X2, metric, metric_params, homology_dimensions,
                       n_jobs):
    metric_func = implemented_metric_recipes[metric]
    effective_metric_params = metric_params.copy()
    none_dict = {dim: None for dim in homology_dimensions}
    samplings = effective_metric_params.pop('samplings', none_dict)
    step_sizes = effective_metric_params.pop('step_sizes', none_dict)

    if X2 is None:
        X2 = X1

    distance_matrices = Parallel(n_jobs=n_jobs)(
        delayed(metric_func)(_subdiagrams(X1, [dim], remove_dim=True),
                             _subdiagrams(X2[s], [dim], remove_dim=True),
                             sampling=samplings[dim],
                             step_size=step_sizes[dim],
                             **effective_metric_params)
        for dim in homology_dimensions
        for s in gen_even_slices(X2.shape[0], effective_n_jobs(n_jobs)))

    distance_matrices = np.concatenate(distance_matrices, axis=1)
    distance_matrices = np.stack(
        [distance_matrices[:, i * X2.shape[0]:(i + 1) * X2.shape[0]]
         for i in range(len(homology_dimensions))],
        axis=2)
    return distance_matrices
def transform(self, X, y=None): """Calculate the entropy of each array in `X`. Parameters ---------- X : ndarray, shape (n_samples, n_points, d) Input data. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of int, shape (n_samples, n_points) Array of entropies (one per array in `X`). """ # Check if fit had been called check_is_fitted(self, ['_is_fitted']) X = check_array(X, allow_nd=True) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._permutation_entropy)(X[s]) for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def transform(self, X, y=None): """For each binary image in the collection `X`, adds a padding. Return the collection of padded binary images. Parameters ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D image. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_pixels_x + 2 * padding_x, \ n_pixels_y + 2 * padding_y [, n_pixels_z + 2 * padding_z]) Transformed collection of images. Each entry along axis 0 is a 2D or 3D binary image. """ check_is_fitted(self) Xt = check_array(X, allow_nd=True) Xt = Parallel(n_jobs=self.n_jobs)(delayed( np.pad)(Xt[s], pad_width=self._pad_width, constant_values=self.value) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def _e_step(self, X, cal_sstats, random_init, parallel=None):
    """E-step in EM update.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    cal_sstats : boolean
        Parameter that indicates whether to calculate sufficient statistics
        or not. Set ``cal_sstats`` to True when we need to run M-step.

    random_init : boolean
        Parameter that indicates whether to initialize document topic
        distribution randomly in the E-step. Set it to True in training
        steps.

    parallel : joblib.Parallel (optional)
        Pre-initialized instance of joblib.Parallel.

    Returns
    -------
    (doc_topic_distr, suff_stats) :
        `doc_topic_distr` is unnormalized topic distribution for each
        document. In the literature, this is called `gamma`.
        `suff_stats` is expected sufficient statistics for the M-step.
        When `cal_sstats == False`, it will be None.

    """
    # Run e-step in parallel
    random_state = self.random_state_ if random_init else None

    # TODO: make Parallel._effective_n_jobs public instead?
    n_jobs = effective_n_jobs(self.n_jobs)
    if parallel is None:
        parallel = Parallel(n_jobs=n_jobs,
                            verbose=max(0, self.verbose - 1))
    results = parallel(
        delayed(_update_doc_distribution)(X[idx_slice, :],
                                          self.exp_dirichlet_component_,
                                          self.doc_topic_prior_,
                                          self.max_doc_update_iter,
                                          self.mean_change_tol, cal_sstats,
                                          random_state)
        for idx_slice in gen_even_slices(X.shape[0], n_jobs))

    # merge result
    doc_topics, sstats_list = zip(*results)
    doc_topic_distr = np.vstack(doc_topics)

    if cal_sstats:
        # This step finishes computing the sufficient statistics for the
        # M-step.
        suff_stats = np.zeros(self.components_.shape)
        for sstats in sstats_list:
            suff_stats += sstats
        suff_stats *= self.exp_dirichlet_component_
    else:
        suff_stats = None

    return (doc_topic_distr, suff_stats)
def transform(self, X, y=None): """For each greyscale image in the collection `X`, calculate a corresponding binary image by applying the `threshold`. Return the collection of binary images. Parameters ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D greyscale image. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \ [, n_pixels_z]) Transformed collection of images. Each entry along axis 0 is a 2D or 3D binary image. """ check_is_fitted(self) Xt = check_array(X, allow_nd=True) Xt = Parallel(n_jobs=self.n_jobs)(delayed( self._binarize)(Xt[s]) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) if self.n_dimensions_ == 2: Xt = Xt.reshape(X.shape) return Xt
def transform(self, X, y=None): """For each collection of binary images, calculate the corresponding collection of point clouds based on the coordinates of activated pixels. Parameters ---------- X : ndarray, shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D binary image. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray, shape (n_samples, n_pixels_x * n_pixels_y [* n_pixels_z], n_dimensions) Transformed collection of images. Each entry along axis 0 is a point cloud in a `n_dimensions` dimensional space. """ check_is_fitted(self) Xt = check_array(X, ensure_2d=False, allow_nd=True, copy=True) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._embed)(X[s]) for s in gen_even_slices( X.shape[0], effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def transform(self, X, y=None): """For each binary image in the collection `X`, calculate a corresponding greyscale image based on the distance of its pixels to the hyperplane defined by the `direction` vector and the first seen edge of the images following that `direction`. Return the collection of greyscale images. Parameters ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D binary image. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Transformed collection of images. Each entry along axis 0 is a 2D or 3D greyscale image. """ check_is_fitted(self) Xt = check_array(X, allow_nd=True) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._calculate_height)(X[s]) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def transform(self, X, y=None): """Compute the Betti curves of diagrams in `X`. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_homology_dimensions, n_bins) Betti curves: one curve (represented as a one-dimensional array of integer values) per sample and per homology dimension seen in :meth:`fit`. Index i along axis 1 corresponds to the i-th homology dimension in :attr:`homology_dimensions_`. """ check_is_fitted(self) X = check_diagram(X) Xt = Parallel(n_jobs=self.n_jobs)( delayed(betti_curves)(_subdiagrams(X, [dim], remove_dim=True)[s], self._samplings[dim]) for dim in self.homology_dimensions_ for s in gen_even_slices( X.shape[0], effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt).\ reshape(self._n_dimensions, X.shape[0], -1).\ transpose((1, 0, 2)) return Xt
def transform(self, X, y=None): """For each binary image in the collection `X`, calculate a corresponding grayscale image based on the distance of its pixels to the center. Return the collection of grayscale images. Parameters ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D binary image. y : None There is no need of a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Transformed collection of images. Each entry along axis 0 is a 2D or 3D grayscale image. """ check_is_fitted(self) Xt = check_array(X, ensure_2d=False, allow_nd=True, copy=True) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._calculate_radial)(X[s]) for s in gen_even_slices( Xt.shape[0], effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt
def run_multiple_kmeans(
    data: AnnData,
    rep: str,
    n_jobs: int,
    n_clusters: int,
    n_init: int,
    random_state: int,
    temp_folder: str = None,
) -> List[str]:
    """Run KMeans n_init times in parallel and combine the runs into a
    consensus labeling (helper step for spectral clustering)."""
    start = time.time()

    n_jobs = effective_n_jobs(n_jobs)

    rep_key = "X_" + rep
    X = data.obsm[rep_key].astype("float64")

    np.random.seed(random_state)
    seeds = np.random.randint(np.iinfo(np.int32).max, size=n_init)
    results = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)(
        delayed(run_one_instance_of_kmeans)(n_clusters, X, seed)
        for seed in seeds)
    # Note that if n_jobs == 1, joblib will not fork a new process.

    labels = list(zip(*results))
    uniqs = np.unique(labels, axis=0)
    transfer_dict = {tuple(k): v for k, v in zip(uniqs, range(uniqs.shape[0]))}
    labels = [transfer_dict[x] for x in labels]

    end = time.time()
    logger.info("run_multiple_kmeans finished in {:.2f}s.".format(end - start))
    return labels
def fit(self, X, y): """Fit linear model. Parameters ---------- X : ndarray of shape (n_samples, n_features) Training data. y : ndarray of shape (n_samples,) Target values. Returns ------- self : returns an instance of self. Fitted `TheilSenRegressor` estimator. """ random_state = check_random_state(self.random_state) X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape n_subsamples, self.n_subpopulation_ = self._check_subparams( n_samples, n_features) self.breakdown_ = _breakdown_point(n_samples, n_subsamples) if self.verbose: print("Breakdown point: {0}".format(self.breakdown_)) print("Number of samples: {0}".format(n_samples)) tol_outliers = int(self.breakdown_ * n_samples) print("Tolerable outliers: {0}".format(tol_outliers)) print("Number of subpopulations: {0}".format( self.n_subpopulation_)) # Determine indices of subpopulation if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation: indices = list(combinations(range(n_samples), n_subsamples)) else: indices = [ random_state.choice(n_samples, size=n_subsamples, replace=False) for _ in range(self.n_subpopulation_) ] n_jobs = effective_n_jobs(self.n_jobs) index_list = np.array_split(indices, n_jobs) weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)( delayed(_lstsq)(X, y, index_list[job], self.fit_intercept) for job in range(n_jobs)) weights = np.vstack(weights) self.n_iter_, coefs = _spatial_median(weights, max_iter=self.max_iter, tol=self.tol) if self.fit_intercept: self.intercept_ = coefs[0] self.coef_ = coefs[1:] else: self.intercept_ = 0.0 self.coef_ = coefs return self
def get_neighbors(
    data: AnnData,
    K: int = 100,
    rep: str = "pca",
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices
    and distances arrays.

    Parameters
    ----------
    data : `AnnData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to all available threads.
    random_state : `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed : `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index.
        However, the kNN results are not reproducible. If not full_speed,
        use only one thread to make sure results are reproducible.

    Returns
    -------
    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(adata)
    """
    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if knn_is_cached(data, indices_key, distances_key, K):
        indices = data.uns[indices_key]
        distances = data.uns[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep),
            K=K,
            n_jobs=effective_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
        )
        data.uns[indices_key] = indices
        data.uns[distances_key] = distances

    return indices, distances
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    n_jobs = min(effective_n_jobs(n_jobs), n_estimators)

    # Partition estimators between jobs
    n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs,
                                   dtype=int)
    n_estimators_per_job[:n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)

    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
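# A hedged usage sketch (not from the original source) of how the partition returned
# above is typically consumed: each joblib worker gets one contiguous block of
# estimators. `fit_block` and `estimators` are hypothetical names for illustration;
# numpy is assumed to be imported as np for _partition_estimators itself.
from joblib import Parallel, delayed


def fit_block(estimators, start, end):
    # Placeholder worker: report which slice of estimators it received.
    return (start, end, len(estimators[start:end]))


estimators = list(range(10))  # stand-in for 10 unfit estimators
n_jobs, n_per_job, starts = _partition_estimators(len(estimators), n_jobs=3)
blocks = Parallel(n_jobs=n_jobs)(
    delayed(fit_block)(estimators, starts[i], starts[i + 1])
    for i in range(n_jobs))
# blocks == [(0, 4, 4), (4, 7, 3), (7, 10, 3)] when 3 workers are available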
def parallel_pairwise_dist(func, X, Y=None, njobs=-1):
    if njobs < 1:
        # Follow the joblib convention: -1 means all CPUs, -2 all but one, ...
        njobs = joblib.cpu_count() + 1 + njobs
    if Y is None:
        Y = X
    fd = delayed(_dist_wrapper)
    out = Parallel(n_jobs=njobs)(
        fd(func, X, Y[s])
        for s in gen_even_slices(len(Y), effective_n_jobs(njobs)))
    return np.hstack(out)
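# The helper `_dist_wrapper` is not shown above; the following is a plausible
# minimal sketch (an assumption, not the project's actual implementation) that is
# consistent with the np.hstack at the end: each call returns one column block of
# the pairwise distance matrix, so the blocks reassemble to shape (len(X), len(Y)).
import numpy as np


def _dist_wrapper(func, X, Y_block):
    # One column block: rows index X, columns index Y_block.
    return np.array([[func(x, y) for y in Y_block] for x in X])


# Example call with a toy metric:
D = parallel_pairwise_dist(lambda a, b: float(abs(a - b)), np.arange(5.0), njobs=2)
# D has shape (5, 5)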
def parallelize(n_jobs, func, iterable, respective=False, tq=True,
                batch_size='auto', **kwargs):
    """Parallelize the function over an iterable. Make sure to call it from
    within an ``if __name__ == "__main__":`` block.

    Parameters
    ----------
    n_jobs : int
        Number of CPUs to use. n_jobs is the number of workers requested by
        the caller. Passing n_jobs=-1 means requesting all available
        workers, for instance matching the number of CPU cores on the
        worker host(s).
    func : callable
        Function to calculate.
    iterable : iterable
        Iterable object.
    respective : bool
        Pass each item's parameters respectively (unpacked) or as a whole.
    tq : bool
        View progress or not.
    batch_size : int or 'auto'
        Batch size passed to joblib.Parallel.
    kwargs : dict
        Keyword arguments for ``func``.

    Returns
    -------
    results
        Function results.
    """
    func = partial(func, **kwargs)

    if effective_n_jobs(n_jobs) == 1:
        parallel, func = list, func
    else:
        parallel = Parallel(n_jobs=n_jobs, batch_size=batch_size)
        func = delayed(func)

    if tq:
        if respective:
            return parallel(func(*iter_i) for iter_i in tqdm(iterable))
        else:
            return parallel(func(iter_i) for iter_i in tqdm(iterable))
    else:
        if respective:
            return parallel(func(*iter_i) for iter_i in iterable)
        else:
            return parallel(func(iter_i) for iter_i in iterable)
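# A brief hedged usage sketch (not from the original source): with respective=False
# each item of the iterable is passed as a single argument, while respective=True
# unpacks each item into positional arguments. `add` is a hypothetical function used
# only for illustration; keep calls under the ``if __name__ == "__main__":`` guard
# when n_jobs > 1, as the docstring advises.
def add(a, b=0):
    return a + b


if __name__ == "__main__":
    sums = parallelize(2, add, [(1, 1), (2, 2), (3, 3)], respective=True, tq=False)
    # sums == [2, 4, 6]
    shifted = parallelize(1, add, [1, 2, 3], respective=False, tq=False, b=1)
    # shifted == [2, 3, 4]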
def compute_fitness(x, y, population, metric, n_jobs):
    candidates = population.astype('bool')

    # Cap the number of workers at the number of candidate solutions
    n_jobs = min(effective_n_jobs(n_jobs), len(candidates))

    models = list(map(lambda i: deepcopy(metric), range(len(candidates))))
    models = Parallel(n_jobs=n_jobs)(
        delayed(compute_score)(models[i], x, y, candidates[i])
        for i in range(len(candidates)))

    scores = np.array(list(map(lambda model: model.value, models)))
    learners = list(map(lambda model: model.learner_, models))
    weights = compute_feature_weights(learners, candidates)
    return scores, weights
def _partition_columns(columns, n_jobs):
    """Private function to partition columns splitting between jobs."""
    # Compute the number of jobs
    n_columns = len(columns)
    n_jobs = min(effective_n_jobs(n_jobs), n_columns)

    # Partition columns between jobs
    n_columns_per_job = np.full(n_jobs, n_columns // n_jobs, dtype=int)
    n_columns_per_job[:n_columns % n_jobs] += 1
    columns_per_job = np.cumsum(n_columns_per_job)
    columns_per_job = np.split(columns, columns_per_job)
    columns_per_job = columns_per_job[:-1]

    return n_jobs, columns_per_job
def partial_fit(self, X, y=None):
    """Online VB with Mini-Batch update.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    y : Ignored

    Returns
    -------
    self
    """
    self._check_params()
    first_time = not hasattr(self, 'components_')

    # In theory reset should be equal to `first_time`, but there are tests
    # checking the input number of feature and they expect a specific
    # string, which is not the same one raised by check_n_features. So we
    # don't check n_features_in_ here for now (it's done with adhoc code in
    # the estimator anyway).
    # TODO: set reset=first_time when addressing reset in
    # predict/transform/etc.
    reset_n_features = True
    X = self._check_non_neg_array(X, reset_n_features,
                                  "LatentDirichletAllocation.partial_fit")
    n_samples, n_features = X.shape
    batch_size = self.batch_size

    # initialize parameters or check
    if first_time:
        self._init_latent_vars(n_features)

    if n_features != self.components_.shape[1]:
        raise ValueError(
            "The provided data has %d dimensions while "
            "the model was trained with feature size %d." %
            (n_features, self.components_.shape[1]))

    n_jobs = effective_n_jobs(self.n_jobs)
    with Parallel(n_jobs=n_jobs,
                  verbose=max(0, self.verbose - 1)) as parallel:
        for idx_slice in gen_batches(n_samples, batch_size):
            self._em_step(X[idx_slice, :],
                          total_samples=self.total_samples,
                          batch_update=False,
                          parallel=parallel)

    return self