def compute_distances(self) -> sp.csr_matrix:
    """Compute the distances between clonotypes.

    `prepare` must have been run previously.

    One row per clonotype is obtained from `self._dist_for_clonotype` and the
    rows are stacked into a single matrix. Rows are computed serially when
    `self.n_jobs == 1` or when there are at most `2 * self.chunksize`
    clonotypes; otherwise they are computed in worker processes, in chunks
    of `self.chunksize` rows.

    Returns
    -------
    A clonotype x clonotype sparse distance matrix.
    """
    start = logging.info("Computing clonotype x clonotype distances.")  # type: ignore
    n_clonotypes = self.clonotypes.shape[0]

    # only use multiprocessing for sufficiently large datasets
    # for small datasets the overhead is too large for a benefit
    if self.n_jobs == 1 or n_clonotypes <= 2 * self.chunksize:
        dist_rows = tqdm(
            (self._dist_for_clonotype(i) for i in range(n_clonotypes)),
            total=n_clonotypes,
        )
    else:
        logging.info(
            "NB: Computation happens in chunks. The progressbar only advances "
            "when a chunk has finished. "
        )  # type: ignore

        dist_rows = process_map(
            self._dist_for_clonotype,
            range(n_clonotypes),
            max_workers=self.n_jobs if self.n_jobs is not None else cpu_count(),
            # Use the same chunk size the guard above was evaluated with;
            # a different hard-coded value could leave the whole job in a
            # single chunk (i.e. a single worker) despite taking this branch.
            chunksize=self.chunksize,
            tqdm_class=tqdm,
        )

    dist = sp.vstack(dist_rows)
    # drop explicitly stored zeros so they are not kept as matrix entries
    dist.eliminate_zeros()
    logging.hint("Done computing clonotype x clonotype distances. ", time=start)
    return dist  # type: ignore
def test_timing(monkeypatch, capsys, logging_state):
    """Check elapsed-time reporting of log calls that receive ``time=``.

    The logging module's ``datetime`` is replaced by a fake whose ``now``
    advances by one second (and one microsecond) per call, so the number of
    clock reads and the printed durations are deterministic.
    """
    s.logfile = sys.stderr
    counter = 0

    # Fake clock: every call to ``now`` bumps ``counter`` and returns a
    # timestamp derived from it.
    class TickingClock:
        @staticmethod
        def now(tz):
            nonlocal counter
            counter += 1
            return datetime(
                2000, 1, 1, second=counter, microsecond=counter, tzinfo=tz
            )

    monkeypatch.setattr(l, 'datetime', TickingClock)
    s.verbosity = Verbosity.debug

    def captured():
        return capsys.readouterr().err

    # each log call is expected to read the clock exactly once
    l.hint('1')
    assert counter == 1
    assert captured() == '--> 1\n'

    start = l.info('2')
    assert counter == 2
    assert captured() == '2\n'

    l.hint('3')
    assert counter == 3
    assert captured() == '--> 3\n'

    # elapsed time is appended when the message has no placeholder ...
    l.info('4', time=start)
    assert counter == 4
    assert captured() == '4 (0:00:02)\n'

    # ... and substituted in place when it has one
    l.info('5 {time_passed}', time=start)
    assert counter == 5
    assert captured() == '5 0:00:03\n'
def _prepare(self, adata: AnnData) -> None:
    """Initialize the DoubleLookupNeighborFinder and all required lookup tables.

    Runs the individual setup steps in a fixed order and stores the
    resulting finder in `self.neighbor_finder`. Start and end of the
    procedure are logged.

    Parameters
    ----------
    adata
        Annotated data object; handed to `_make_clonotype_table` and
        `_add_distance_matrices`.
    """
    # the value returned here is passed back as `time=` to the final hint
    start = logging.info("Initializing lookup tables. ")

    self._make_clonotype_table(adata)
    self._make_chain_count()
    # NOTE(review): `self.clonotypes` is presumably populated by
    # `_make_clonotype_table` above -- confirm before reordering these calls.
    self.neighbor_finder = DoubleLookupNeighborFinder(self.clonotypes)
    self._add_distance_matrices(adata)
    self._add_lookup_tables()

    logging.hint("Done initializing lookup tables.", time=start)
def test_formats(capsys, logging_state):
    """Check the exact text each log level writes for a plain message."""
    s.logfile = sys.stderr
    s.verbosity = Verbosity.debug

    # (logging function, message, expected text on stderr)
    cases = [
        (l.error, '0', 'ERROR: 0\n'),
        (l.warning, '1', 'WARNING: 1\n'),
        (l.info, '2', '2\n'),
        (l.hint, '3', '--> 3\n'),
        (l.debug, '4', ' 4\n'),
    ]
    for emit, message, expected in cases:
        emit(message)
        assert capsys.readouterr().err == expected
def test_logfile(tmp_path, logging_state):
    """Check that log output follows ``s.logfile`` / ``s.logpath``."""
    s.verbosity = Verbosity.hint

    # 1) an in-memory stream set through `logfile`
    stream = StringIO()
    s.logfile = stream
    assert s.logfile is stream
    assert s.logpath is None
    l.error('test!')
    assert stream.getvalue() == 'ERROR: test!\n'

    # 2) a file on disk set through `logpath`
    log_path = tmp_path / 'test.log'
    s.logpath = log_path
    assert s.logpath == log_path
    assert s.logfile.name == str(log_path)
    l.hint('test2')
    # debug is not shown at `hint` verbosity, so it must not reach the file
    l.debug('invisible')
    assert s.logpath.read_text() == '--> test2\n'
def test_deep(capsys, logging_state):
    """Check that the ``deep`` text of a hint only shows at debug verbosity."""
    s.logfile = sys.stderr

    def hint_output(*args, **kwargs):
        # emit one hint and return what it wrote to stderr
        l.hint(*args, **kwargs)
        return capsys.readouterr().err

    s.verbosity = Verbosity.hint
    assert hint_output('0') == '--> 0\n'
    assert hint_output('1', deep='1!') == '--> 1\n'

    s.verbosity = Verbosity.debug
    assert hint_output('2') == '--> 2\n'
    assert hint_output('3', deep='3!') == '--> 3: 3!\n'
def filter_markers(adata, thresh=0.5, use_raw=False):
    '''
    Filter the rank-biserial correlation coefficients computed with ``rbcde.RBC()``
    to a list of markers for each cluster, provided as a data frame and a Scanpy
    plotting compatible ``var_names`` cluster marker dictionary. Returns those two
    objects, in this order.

    Input
    -----
    adata : ``AnnData``
        Needs to have been processed with ``rbcde.RBC()``.
    thresh : ``float``, optional (default: 0.5)
        The threshold value used to call markers. Literature
        `critical values <https://en.wikipedia.org/wiki/Effect_size#Pearson_r_or_correlation_coefficient>`_
        can be used.
    use_raw : ``bool``, optional (default: ``False``)
        Set this to ``True`` if the raw data was used for the computation so that
        the results can be retrieved from the correct field of the object.
    '''
    #the RBC results live in the .var of either the raw or the main object
    var = adata.raw.var if use_raw else adata.var
    #keep only the RBC_ columns and strip that prefix from their names
    is_rbc_column = [col.startswith('RBC_') for col in var.columns]
    results = var.loc[:, is_rbc_column]
    results.columns = [col.replace('RBC_', '', 1) for col in results.columns]
    #call the matrix version to get a marker data frame
    degs = rbcde.matrix.filter_markers(results, thresh)
    #parse up a plotting cluster marker dictionary
    plot_dict = {}
    for clus in results.columns:
        markers = degs.loc[degs['cluster'] == clus, :].index
        plot_dict[clus] = markers
        logg.hint(str(len(markers)) + ' markers found for cluster ' + clus)
    #return both the data frame and the plot-ready form
    return degs, plot_dict
def sequence_dist(
    seqs: Sequence[str],
    seqs2: Optional[Sequence[str]] = None,
    *,
    metric: MetricType = "identity",
    cutoff: Union[None, int] = None,
    n_jobs: Union[None, int] = None,
    **kwargs,
) -> csr_matrix:
    """
    Calculate a sequence x sequence distance matrix.

    {dist_mat}

    When `seqs` or `seqs2` includes non-unique values, the function internally
    uses only unique sequences to calculate the distances. Note that, if the
    input arrays contain large numbers of duplicated values (i.e. hundreds each),
    this will lead to large "dense" blocks in the sparse matrix. This will
    result in slow processing and high memory usage.

    Sequences are converted to upper case before they are compared.

    Parameters
    ----------
    seqs
        Numpy array of nucleotide or amino acid sequences.
        Note that not all distance metrics support nucleotide sequences.
    seqs2
        Second array of sequences. When omitted, `sequence_dist` computes
        the square matrix of `seqs` against itself.
    {metric}
    {cutoff}
    n_jobs
        Number of CPU cores to use when running a DistanceCalculator that supports
        parallelization.

        A cutoff of 0 implies the `identity` metric.
    kwargs
        Additional parameters passed to the
        :class:`~scirpy.ir_dist.metrics.DistanceCalculator`.

    Returns
    -------
    Sparse pairwise distance matrix with one row per entry of `seqs` and one
    column per entry of `seqs2` (or of `seqs`, when `seqs2` is omitted).
    """
    # Work on the unique sequences only; the inverse indices map every
    # original entry back to its position among the unique ones.
    row_unique, row_inverse = np.unique(
        [s.upper() for s in seqs], return_inverse=True
    )  # type: ignore
    if seqs2 is None:
        col_unique, col_inverse = None, row_inverse
    else:
        col_unique, col_inverse = np.unique(
            [s.upper() for s in seqs2], return_inverse=True
        )  # type: ignore

    calculator = _get_distance_calculator(metric, cutoff, n_jobs=n_jobs, **kwargs)

    logging.info(f"Calculating distances with metric {metric}")

    # Slicing with CSR is faster than with DOK
    unique_dist = calculator.calc_dist_mat(row_unique, col_unique).tocsr()

    logging.hint("Expanding non-unique sequences to sequence x sequence matrix")
    row_idx, col_idx = np.meshgrid(
        row_inverse, col_inverse, sparse=True, indexing="ij"
    )
    return unique_dist[row_idx, col_idx]
def bbknn(adata, batch_key='batch', neighbors_within_batch=3, n_pcs=50, trim=None,
          scale_distance=False, approx=False, metric='euclidean', bandwidth=1,
          local_connectivity=1, n_jobs=None, save_knn=False, copy=False):
    '''
    Batch balanced KNN, altering the KNN procedure to identify each cell's top
    neighbours in each batch separately instead of the entire cell pool with no
    accounting for batch. Aligns batches in a quick and lightweight manner.
    For use in the scanpy workflow as an alternative to ``scanpy.api.pp.neighbors()``.

    Results are written to ``adata.uns['neighbors']`` (``'params'``,
    ``'distances'`` and ``'connectivities'``). If ``copy=True`` the modified
    copy is returned, otherwise the function returns ``None``.

    Input
    -----
    adata : ``AnnData``
        Needs the PCA computed and stored in ``adata.obsm["X_pca"]``.
    batch_key : ``str``, optional (default: "batch")
        ``adata.obs`` column name discriminating between your batches.
    neighbors_within_batch : ``int``, optional (default: 3)
        How many top neighbours to report for each batch; total number of neighbours
        will be this number times the number of batches.
    n_pcs : ``int``, optional (default: 50)
        How many principal components to use in the analysis.
    trim : ``int`` or ``None``, optional (default: ``None``)
        If not ``None``, trim the neighbours of each cell to these many top connectivities.
        May help with population independence and improve the tidiness of clustering.
    scale_distance : ``bool``, optional (default: ``False``)
        If ``True``, optionally lower the across-batch distances on a per-cell, per-batch basis
        to make the closest neighbour be closer to the furthest within-batch neighbour.
        May help smooth out very severe batch effects with a risk of overly
        connecting the cells. The exact algorithm is as follows:

        .. code-block:: python

            if min(corrected_batch) > max(original_batch):
                corrected_batch += max(original_batch) - min(corrected_batch) + np.std(corrected_batch)
    approx : ``bool``, optional (default: ``False``)
        If ``True``, use annoy's approximate neighbour finding. This results in a quicker run time
        for large datasets at a risk of loss of independence of some of the populations. It should
        be noted that annoy's default metric of choice is "angular", which BBKNN overrides to
        "euclidean" from its own default metric setting.
    metric : ``str`` or ``sklearn.neighbors.DistanceMetric``, optional (default: "euclidean")
        What distance metric to use. If using ``approx=True``, the options are "euclidean",
        "angular", "manhattan" and "hamming". Otherwise, the options are "euclidean",
        "manhattan", "chebyshev", or parameterised ``sklearn.neighbors.DistanceMetric``
        for "minkowski", "wminkowski", "seuclidean" or "mahalanobis".

        >>> from sklearn.neighbors import DistanceMetric
        >>> pass_this_as_metric = DistanceMetric.get_metric('minkowski',p=3)
    bandwidth : ``float``, optional (default: 1)
        ``scanpy.neighbors.compute_connectivities_umap`` parameter, higher values result in a
        gentler slope of the connectivities exponentials (i.e. larger connectivity values being returned)
    local_connectivity : ``int``, optional (default: 1)
        ``scanpy.neighbors.compute_connectivities_umap`` parameter, how many nearest neighbors of
        each cell are assumed to be fully connected (and given a connectivity value of 1)
    n_jobs : ``int`` or ``None``, optional (default: ``None``)
        Parallelise neighbour identification when using an Euclidean distance metric,
        if ``None`` use all cores. Does nothing with a different metric.
    save_knn : ``bool``, optional (default: ``False``)
        If ``True``, save the indices of the nearest neighbours for each cell in ``adata.uns['bbknn']``.
    copy : ``bool``, optional (default: ``False``)
        If ``True``, return a copy instead of writing to the supplied adata.
    '''
    if copy:
        adata = adata.copy()
    #sanity check: the batch key has to be a column of .obs
    if batch_key not in adata.obs:
        raise ValueError("Batch key '" + batch_key + "' not present in `adata.obs`.")
    #sanity check: the PCA has to be there (.dtype.fields is due to how adata.obsm is formatted)
    if 'X_pca' not in adata.obsm.dtype.fields:
        raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.")
    #collect the bbknn_pca_matrix input and run BBKNN proper
    pca = adata.obsm['X_pca']
    batch_list = adata.obs[batch_key].values
    bbknn_out = bbknn_pca_matrix(
        pca=pca,
        batch_list=batch_list,
        neighbors_within_batch=neighbors_within_batch,
        n_pcs=n_pcs,
        trim=trim,
        scale_distance=scale_distance,
        approx=approx,
        metric=metric,
        bandwidth=bandwidth,
        local_connectivity=local_connectivity,
        n_jobs=n_jobs,
        save_knn=save_knn,
    )
    #the third output is only stored when the caller asked for the knn indices
    if save_knn:
        adata.uns['bbknn'] = bbknn_out[2]
    #store the graph in the same slot and layout as the regular neighbors call
    total_neighbors = neighbors_within_batch * len(np.unique(batch_list))
    adata.uns['neighbors'] = {
        'params': {'n_neighbors': total_neighbors, 'method': 'umap'},
        'distances': bbknn_out[0],
        'connectivities': bbknn_out[1],
    }
    logg.hint('added to `.uns[\'neighbors\']`\n'
              ' \'distances\', weighted adjacency matrix\n'
              ' \'connectivities\', weighted adjacency matrix')
    if copy:
        return adata
    return None
def bbknn(adata, batch_key='batch', approx=True, metric='angular', copy=False, **kwargs):
    '''
    Batch balanced KNN, altering the KNN procedure to identify each cell's top
    neighbours in each batch separately instead of the entire cell pool with no
    accounting for batch. Aligns batches in a quick and lightweight manner.
    For use in the scanpy workflow as an alternative to ``scanpy.api.pp.neighbors()``.

    Results are written to ``adata.uns['neighbors']`` (``'params'``,
    ``'distances'`` and ``'connectivities'``). If ``copy=True`` the modified
    copy is returned, otherwise the function returns ``None``. Any parameter
    below that is not part of this function's signature is forwarded to
    ``bbknn_pca_matrix`` through ``**kwargs``.

    Input
    -----
    adata : ``AnnData``
        Needs the PCA computed and stored in ``adata.obsm["X_pca"]``.
    batch_key : ``str``, optional (default: "batch")
        ``adata.obs`` column name discriminating between your batches.
    neighbors_within_batch : ``int``, optional (default: 3)
        How many top neighbours to report for each batch; total number of neighbours
        will be this number times the number of batches.
    n_pcs : ``int``, optional (default: 50)
        How many principal components to use in the analysis.
    trim : ``int`` or ``None``, optional (default: ``None``)
        Trim the neighbours of each cell to these many top connectivities. May help with
        population independence and improve the tidiness of clustering. The lower the value the
        more independent the individual populations, at the cost of more conserved batch effect.
        If ``None``, sets the parameter value automatically to 10 times the total number of
        neighbours for each cell. Set to 0 to skip.
    approx : ``bool``, optional (default: ``True``)
        If ``True``, use annoy's approximate neighbour finding. This results in a quicker run time
        for large datasets while also potentially increasing the degree of batch correction.
    n_trees : ``int``, optional (default: 10)
        Only used when ``approx=True``. The number of trees to construct in the annoy forest.
        More trees give higher precision when querying, at the cost of increased run time and
        resource intensity.
    use_faiss : ``bool``, optional (default: ``True``)
        If ``approx=False`` and the metric is "euclidean", use the faiss package to compute
        nearest neighbours if installed. This improves performance at a minor cost to numerical
        precision as faiss operates on float32.
    metric : ``str`` or ``sklearn.neighbors.DistanceMetric``, optional (default: "angular")
        What distance metric to use. If using ``approx=True``, the options are "angular",
        "euclidean", "manhattan" and "hamming". Otherwise, the options are "euclidean",
        a member of the ``sklearn.neighbors.KDTree.valid_metrics`` list, or parameterised
        ``sklearn.neighbors.DistanceMetric`` `objects
        <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html>`_:

        >>> from sklearn import neighbors
        >>> neighbors.KDTree.valid_metrics
        ['p', 'chebyshev', 'cityblock', 'minkowski', 'infinity', 'l2', 'euclidean', 'manhattan', 'l1']
        >>> pass_this_as_metric = neighbors.DistanceMetric.get_metric('minkowski',p=3)
    set_op_mix_ratio : ``float``, optional (default: 1)
        UMAP connectivity computation parameter, float between 0 and 1, controlling the
        blend between a connectivity matrix formed exclusively from mutual nearest neighbour
        pairs (0) and a union of all observed neighbour relationships with the mutual pairs
        emphasised (1)
    local_connectivity : ``int``, optional (default: 1)
        UMAP connectivity computation parameter, how many nearest neighbors of
        each cell are assumed to be fully connected (and given a connectivity value of 1)
    copy : ``bool``, optional (default: ``False``)
        If ``True``, return a copy instead of writing to the supplied adata.
    '''
    logg.info('computing batch balanced neighbors', r=True)
    if copy:
        adata = adata.copy()
    #sanity check: the batch key has to be a column of .obs
    if batch_key not in adata.obs:
        raise ValueError("Batch key '"+batch_key+"' not present in `adata.obs`.")
    #sanity check: the PCA has to be there (.dtype.fields is due to how adata.obsm is formatted)
    if 'X_pca' not in adata.obsm.dtype.fields:
        raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.")
    #metric sanity checks: fall back to the default of the chosen neighbour search
    if approx:
        if metric not in ['angular', 'euclidean', 'manhattan', 'hamming']:
            logg.warn('unrecognised metric for type of neighbor calculation, switching to angular')
            metric = 'angular'
    elif not (metric=='euclidean' or isinstance(metric,DistanceMetric) or metric in KDTree.valid_metrics):
        logg.warn('unrecognised metric for type of neighbor calculation, switching to euclidean')
        metric = 'euclidean'
    #collect the bbknn_pca_matrix input and run BBKNN proper
    pca = adata.obsm['X_pca']
    batch_list = adata.obs[batch_key].values
    bbknn_out = bbknn_pca_matrix(pca=pca, batch_list=batch_list,
                                 approx=approx, metric=metric, **kwargs)
    line_end = ' ' if settings.verbosity > 2 else '\n'
    logg.info(' finished', time=True, end=line_end)
    #we'll have a zero distance for our cell of origin, and nonzero for every other neighbour computed
    n_neighbors = len(bbknn_out[0][0,:].data)+1
    adata.uns['neighbors'] = {
        'params': {'n_neighbors': n_neighbors, 'method': 'umap'},
        'distances': bbknn_out[0],
        'connectivities': bbknn_out[1],
    }
    logg.hint(
        'added to `.uns[\'neighbors\']`\n'
        ' \'distances\', weighted adjacency matrix\n'
        ' \'connectivities\', weighted adjacency matrix')
    if copy:
        return adata
    return None
def _highly_variable_pearson_residuals(
    adata: AnnData,
    theta: float = 100,
    clip: Optional[float] = None,
    n_top_genes: int = 1000,
    batch_key: Optional[str] = None,
    chunksize: int = 1000,
    check_values: bool = True,
    layer: Optional[str] = None,
    subset: bool = False,
    inplace: bool = True,
) -> Optional[pd.DataFrame]:
    """\
    See `scanpy.experimental.pp.highly_variable_genes`.

    Genes are ranked by the variance of their clipped Pearson residuals,
    computed separately for every batch. When `clip` is `None`, each batch
    is clipped at the square root of its own number of observations.

    Returns
    -------
    If `inplace=True`, `adata.var` is updated with the following fields. Otherwise,
    returns the same fields as :class:`~pandas.DataFrame`.

    highly_variable : bool
        boolean indicator of highly-variable genes
    means : float
        means per gene
    variances : float
        variance per gene
    residual_variances : float
        Residual variance per gene. Averaged in the case of multiple batches.
    highly_variable_rank : float
        Rank of the gene according to residual variance, median rank in the case of multiple batches
    highly_variable_nbatches : int
        If `batch_key` given, denotes in how many batches genes are detected as HVG
    highly_variable_intersection : bool
        If `batch_key` given, denotes the genes that are highly variable in all batches

    Raises
    ------
    ValueError
        If `theta <= 0` or if `clip` is negative.
    """

    view_to_actual(adata)
    X = _get_obs_rep(adata, layer=layer)
    computed_on = layer if layer else 'adata.X'

    # Check for raw counts
    if check_values and (check_nonnegative_integers(X) is False):
        warnings.warn(
            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
            UserWarning,
        )
    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError('Pearson residuals require theta > 0')
    # check clip once, up front; `None` is resolved per batch further down
    if clip is not None and clip < 0:
        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

    # without a batch key, all observations form one single batch
    if batch_key is None:
        batch_info = np.zeros(adata.shape[0], dtype=int)
    else:
        batch_info = adata.obs[batch_key].values
    batches = np.unique(batch_info)
    n_batches = len(batches)

    # Get pearson residuals for each batch separately
    residual_gene_vars = []
    for batch in batches:

        adata_subset_prefilter = adata[batch_info == batch]
        X_batch_prefilter = _get_obs_rep(adata_subset_prefilter, layer=layer)

        # Filter out zero genes
        with settings.verbosity.override(Verbosity.error):
            nonzero_genes = np.ravel(X_batch_prefilter.sum(axis=0)) != 0
        adata_subset = adata_subset_prefilter[:, nonzero_genes]
        X_batch = _get_obs_rep(adata_subset, layer=layer)

        # Resolve the clipping threshold for THIS batch. It must not be
        # written back to `clip`, otherwise every later batch would reuse
        # the sqrt(n) of the first batch instead of its own.
        batch_clip = np.sqrt(X_batch.shape[0]) if clip is None else clip

        batch_is_sparse = sp_sparse.issparse(X_batch)
        if batch_is_sparse:
            sums_genes = np.sum(X_batch, axis=0)
            sums_cells = np.sum(X_batch, axis=1)
            sum_total = np.sum(sums_genes).squeeze()
        else:
            sums_genes = np.sum(X_batch, axis=0, keepdims=True)
            sums_cells = np.sum(X_batch, axis=1, keepdims=True)
            sum_total = np.sum(sums_genes)

        # Compute pearson residuals in chunks
        residual_gene_var = np.empty((X_batch.shape[1]))
        for start in np.arange(0, X_batch.shape[1], chunksize):
            stop = start + chunksize
            mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
            X_chunk = X_batch[:, start:stop]
            # only sparse matrices need densifying; dense arrays have no
            # `.toarray()` and are used as they are
            X_dense = X_chunk.toarray() if batch_is_sparse else X_chunk
            residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta)
            residuals = np.clip(residuals, a_min=-batch_clip, a_max=batch_clip)
            residual_gene_var[start:stop] = np.var(residuals, axis=0)

        # Add 0 values for genes that were filtered out
        unmasked_residual_gene_var = np.zeros(len(nonzero_genes))
        unmasked_residual_gene_var[nonzero_genes] = residual_gene_var
        residual_gene_vars.append(unmasked_residual_gene_var.reshape(1, -1))

    residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)

    # Get rank per gene within each batch
    # argsort twice gives ranks, small rank means most variable
    ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
    ranks_residual_var = ranks_residual_var.astype(np.float32)
    # count in how many batches a genes was among the n_top_genes
    highly_variable_nbatches = np.sum(
        (ranks_residual_var < n_top_genes).astype(int), axis=0
    )
    # set non-top genes within each batch to nan
    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
    ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
    # Median rank across batches, ignoring batches in which gene was not selected
    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)

    means, variances = materialize_as_ndarray(_get_mean_var(X))
    df = pd.DataFrame.from_dict(
        dict(
            means=means,
            variances=variances,
            residual_variances=np.mean(residual_gene_vars, axis=0),
            highly_variable_rank=medianrank_residual_var,
            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
            highly_variable_intersection=highly_variable_nbatches == n_batches,
        )
    )
    df = df.set_index(adata.var_names)

    # Sort genes by how often they selected as hvg within each batch and
    # break ties with median rank of residual variance across batches
    df.sort_values(
        ['highly_variable_nbatches', 'highly_variable_rank'],
        ascending=[False, True],
        na_position='last',
        inplace=True,
    )

    high_var = np.zeros(df.shape[0], dtype=bool)
    high_var[:n_top_genes] = True
    df['highly_variable'] = high_var
    # restore the original gene order after the sort above
    df = df.loc[adata.var_names, :]

    if inplace:
        adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
        logg.hint(
            'added\n'
            ' \'highly_variable\', boolean vector (adata.var)\n'
            ' \'highly_variable_rank\', float vector (adata.var)\n'
            ' \'highly_variable_nbatches\', int vector (adata.var)\n'
            ' \'highly_variable_intersection\', boolean vector (adata.var)\n'
            ' \'means\', float vector (adata.var)\n'
            ' \'variances\', float vector (adata.var)\n'
            ' \'residual_variances\', float vector (adata.var)'
        )
        adata.var['means'] = df['means'].values
        adata.var['variances'] = df['variances'].values
        # assigned positionally like every other column
        adata.var['residual_variances'] = df['residual_variances'].values
        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'
            ].values
            adata.var['highly_variable_intersection'] = df[
                'highly_variable_intersection'
            ].values
        adata.var['highly_variable'] = df['highly_variable'].values

        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)

    else:
        # the per-batch columns carry no information for a single batch
        if batch_key is None:
            df = df.drop(
                ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
            )
        if subset:
            df = df.iloc[df.highly_variable.values, :]

        return df