Example #1
    def compute_distances(self) -> sp.csr_matrix:
        """Compute the distances between clonotypes. `prepare` must have
        been ran previously. Returns a clonotype x clonotype sparse
        distance matrix."""
        start = logging.info(
            "Computing clonotype x clonotype distances.")  # type: ignore
        n_clonotypes = self.clonotypes.shape[0]

        # only use multiprocessing for sufficiently large datasets
        # for small datasets the overhead is too large for a benefit
        if self.n_jobs == 1 or n_clonotypes <= 2 * self.chunksize:
            dist_rows = tqdm(
                (self._dist_for_clonotype(i) for i in range(n_clonotypes)),
                total=n_clonotypes,
            )
        else:
            logging.info(
                "NB: Computation happens in chunks. The progress bar only advances "
                "when a chunk has finished.")  # type: ignore

            dist_rows = process_map(
                self._dist_for_clonotype,
                range(n_clonotypes),
                max_workers=self.n_jobs
                if self.n_jobs is not None else cpu_count(),
                chunksize=2000,
                tqdm_class=tqdm,
            )

        dist = sp.vstack(dist_rows)
        dist.eliminate_zeros()
        logging.hint("Done computing clonotype x clonotype distances. ",
                     time=start)
        return dist  # type: ignore
Example #2
def test_timing(monkeypatch, capsys, logging_state):
    s.logfile = sys.stderr
    counter = 0

    class IncTime:
        @staticmethod
        def now(tz):
            nonlocal counter
            counter += 1
            return datetime(
                2000, 1, 1, second=counter, microsecond=counter, tzinfo=tz
            )

    monkeypatch.setattr(l, 'datetime', IncTime)
    s.verbosity = Verbosity.debug

    l.hint('1')
    assert counter == 1 and capsys.readouterr().err == '--> 1\n'
    start = l.info('2')
    assert counter == 2 and capsys.readouterr().err == '2\n'
    l.hint('3')
    assert counter == 3 and capsys.readouterr().err == '--> 3\n'
    l.info('4', time=start)
    assert counter == 4 and capsys.readouterr().err == '4 (0:00:02)\n'
    l.info('5 {time_passed}', time=start)
    assert counter == 5 and capsys.readouterr().err == '5 0:00:03\n'
Example #3
    def _prepare(self, adata: AnnData):
        """Initialize the DoubleLookupNeighborFinder and all required lookup tables"""
        start = logging.info("Initializing lookup tables. ")
        self._make_clonotype_table(adata)
        self._make_chain_count()
        self.neighbor_finder = DoubleLookupNeighborFinder(self.clonotypes)
        self._add_distance_matrices(adata)
        self._add_lookup_tables()
        logging.hint("Done initializing lookup tables.", time=start)
Example #4
def test_formats(capsys, logging_state):
    s.logfile = sys.stderr
    s.verbosity = Verbosity.debug
    l.error('0')
    assert capsys.readouterr().err == 'ERROR: 0\n'
    l.warning('1')
    assert capsys.readouterr().err == 'WARNING: 1\n'
    l.info('2')
    assert capsys.readouterr().err == '2\n'
    l.hint('3')
    assert capsys.readouterr().err == '--> 3\n'
    l.debug('4')
    assert capsys.readouterr().err == '    4\n'
Example #5
def test_logfile(tmp_path, logging_state):
    s.verbosity = Verbosity.hint

    io = StringIO()
    s.logfile = io
    assert s.logfile is io
    assert s.logpath is None
    l.error('test!')
    assert io.getvalue() == 'ERROR: test!\n'

    p = tmp_path / 'test.log'
    s.logpath = p
    assert s.logpath == p
    assert s.logfile.name == str(p)
    l.hint('test2')
    l.debug('invisible')
    assert s.logpath.read_text() == '--> test2\n'
Example #6
def test_deep(capsys, logging_state):
    s.logfile = sys.stderr
    s.verbosity = Verbosity.hint
    l.hint('0')
    assert capsys.readouterr().err == '--> 0\n'
    l.hint('1', deep='1!')
    assert capsys.readouterr().err == '--> 1\n'
    s.verbosity = Verbosity.debug
    l.hint('2')
    assert capsys.readouterr().err == '--> 2\n'
    l.hint('3', deep='3!')
    assert capsys.readouterr().err == '--> 3: 3!\n'
Example #7
def filter_markers(adata, thresh=0.5, use_raw=False):
    '''
	Filter the rank-biserial correlation coefficients computed with ``rbcde.RBC()`` into a
	list of markers for each cluster, provided as a data frame and a Scanpy-plotting-compatible
	``var_names`` cluster marker dictionary. Returns those two objects, in this order.
	
	Input
	-----
	adata : ``AnnData``
		Needs to have been processed with ``rbcde.RBC()``.
	thresh : ``float``, optional (default: 0.5)
		The threshold value used to call markers. Literature 
		`critical values <https://en.wikipedia.org/wiki/Effect_size#Pearson_r_or_correlation_coefficient>`_ 
		can be used.
	use_raw : ``bool``, optional (default: ``False``)
		Set this to ``True`` if the raw data was used for the computation so that the 
		results can be retrieved from the correct field of the object.
	'''

    #extract the RBC results embedded in .var and remove the prefix
    if use_raw:
        results = adata.raw.var.loc[:, [
            i.startswith('RBC_') for i in adata.raw.var.columns
        ]]
    else:
        results = adata.var.loc[:, [
            i.startswith('RBC_') for i in adata.var.columns
        ]]
    results.columns = [i.replace('RBC_', '', 1) for i in results.columns]
    #call the matrix version to get a marker data frame
    degs = rbcde.matrix.filter_markers(results, thresh)
    #parse up a plotting cluster marker dictionary
    plot_dict = {}
    for clus in results.columns:
        plot_dict[clus] = degs.loc[degs['cluster'] == clus, :].index
        logg.hint(
            str(len(plot_dict[clus])) + ' markers found for cluster ' + clus)
    #return both the data frame and the plot-ready form
    return degs, plot_dict
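
A minimal usage sketch for the function above. The ``rbcde.RBC()`` call signature and the ``'leiden'`` cluster key are hypothetical illustrations, not taken from the source; the plot-ready dictionary is passed to a standard Scanpy grouped plot.

import scanpy as sc
import rbcde

adata = sc.read_h5ad("clustered.h5ad")   # hypothetical pre-clustered AnnData
rbcde.RBC(adata, 'leiden')               # hypothetical call signature; see the rbcde docs
degs, plot_dict = filter_markers(adata, thresh=0.5)
# plot_dict maps each cluster to its marker genes, ready for grouped Scanpy plots
sc.pl.dotplot(adata, var_names=plot_dict, groupby='leiden')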
Example #8
def sequence_dist(
    seqs: Sequence[str],
    seqs2: Optional[Sequence[str]] = None,
    *,
    metric: MetricType = "identity",
    cutoff: Union[None, int] = None,
    n_jobs: Union[None, int] = None,
    **kwargs,
) -> csr_matrix:
    """
    Calculate a sequence x sequence distance matrix.

    {dist_mat}

    When `seqs` or `seqs2` includes non-unique values, the function internally
    uses only unique sequences to calculate the distances. Note that, if the
    input arrays contain large numbers of duplicated values (i.e. hundreds each),
    this will lead to large "dense" blocks in the sparse matrix, resulting in
    slow processing and high memory usage.

    Parameters
    ----------
    seqs
        Numpy array of nucleotide or amino acid sequences.
        Note that not all distance metrics support nucleotide sequences.
    seqs2
        Second array of sequences. When omitted, `sequence_dist` computes
        the square distance matrix of `seqs` against itself.
    {metric}
    {cutoff}

        A cutoff of 0 implies the `identity` metric.
    n_jobs
        Number of CPU cores to use when running a DistanceCalculator that supports
        parallelization.
    kwargs
        Additional parameters passed to the :class:`~scirpy.ir_dist.metrics.DistanceCalculator`.

    Returns
    -------
    Symmetrical, sparse pairwise distance matrix.
    """
    seqs = [x.upper() for x in seqs]
    seqs_unique, seqs_unique_inverse = np.unique(seqs, return_inverse=True)  # type: ignore
    if seqs2 is not None:
        seqs2 = [x.upper() for x in seqs2]
        seqs2_unique, seqs2_unique_inverse = np.unique(seqs2, return_inverse=True)  # type: ignore
    else:
        seqs2_unique, seqs2_unique_inverse = None, seqs_unique_inverse

    dist_calc = _get_distance_calculator(metric, cutoff, n_jobs=n_jobs, **kwargs)

    logging.info(f"Calculating distances with metric {metric}")

    dist_mat = dist_calc.calc_dist_mat(seqs_unique, seqs2_unique)

    # Slicing with CSR is faster than with DOK
    dist_mat = dist_mat.tocsr()

    logging.hint("Expanding non-unique sequences to sequence x sequence matrix")
    i, j = np.meshgrid(
        seqs_unique_inverse, seqs2_unique_inverse, sparse=True, indexing="ij"
    )
    dist_mat = dist_mat[i, j]

    return dist_mat
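
A brief usage sketch for ``sequence_dist``; the CDR3 strings are made up, and importing the function as ``scirpy.ir_dist.sequence_dist`` is an assumption about the module this snippet comes from.

import scirpy as ir

# toy CDR3 amino acid sequences; the duplicate is intentional to show the expansion step
cdr3 = ["CASSLGQAYEQYF", "CASSLGQAYEQYF", "CASSPGQGAYEQYF"]
# with the default identity metric (cutoff 0), only exact matches are connected
dist = ir.ir_dist.sequence_dist(cdr3, metric="identity", cutoff=0)
print(dist.toarray())  # duplicated inputs yield identical rows/columns in the expanded matrix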
Example #9
def bbknn(adata,
          batch_key='batch',
          neighbors_within_batch=3,
          n_pcs=50,
          trim=None,
          scale_distance=False,
          approx=False,
          metric='euclidean',
          bandwidth=1,
          local_connectivity=1,
          n_jobs=None,
          save_knn=False,
          copy=False):
    '''
	Batch balanced KNN, altering the KNN procedure to identify each cell's top neighbours in
	each batch separately instead of the entire cell pool with no accounting for batch.
	Aligns batches in a quick and lightweight manner.
	For use in the scanpy workflow as an alternative to ``scanpy.api.pp.neighbors()``.
	
	Input
	-----
	adata : ``AnnData``
		Needs the PCA computed and stored in ``adata.obsm["X_pca"]``.
	batch_key : ``str``, optional (default: "batch")
		``adata.obs`` column name discriminating between your batches.
	neighbors_within_batch : ``int``, optional (default: 3)
		How many top neighbours to report for each batch; total number of neighbours 
		will be this number times the number of batches.
	n_pcs : ``int``, optional (default: 50)
		How many principal components to use in the analysis.
	trim : ``int`` or ``None``, optional (default: ``None``)
		If not ``None``, trim the neighbours of each cell to these many top connectivities.
		May help with population independence and improve the tidiness of clustering.
	scale_distance : ``bool``, optional (default: ``False``)
		If ``True``, lower the across-batch distances on a per-cell, per-batch basis to bring
		the closest across-batch neighbour closer to the furthest within-batch neighbour.
		May help smooth out very severe batch effects with a risk of overly 
		connecting the cells. The exact algorithm is as follows:
		
		.. code-block:: python
		
			if min(corrected_batch) > max(original_batch):
				corrected_batch += max(original_batch) - min(corrected_batch) + np.std(corrected_batch)
	approx : ``bool``, optional (default: ``False``)
		If ``True``, use annoy's approximate neighbour finding. This results in a quicker run time
		for large datasets at the risk of some populations losing their independence. Note that
		annoy's default metric is "angular", which BBKNN overrides with its own default of
		"euclidean".
	metric : ``str`` or ``sklearn.neighbors.DistanceMetric``, optional (default: "euclidean")
		What distance metric to use. If using ``approx=True``, the options are "euclidean",
		"angular", "manhattan" and "hamming". Otherwise, the options are "euclidean", 
		"manhattan", "chebyshev", or parameterised ``sklearn.neighbors.DistanceMetric`` 
		for "minkowski", "wminkowski", "seuclidean" or "mahalanobis".
		
		>>> from sklearn.neighbors import DistanceMetric
		>>> pass_this_as_metric = DistanceMetric.get_metric('minkowski',p=3)
	bandwidth : ``float``, optional (default: 1)
		``scanpy.neighbors.compute_connectivities_umap`` parameter, higher values result in a
		gentler slope of the connectivities exponentials (i.e. larger connectivity values being returned)
	local_connectivity : ``int``, optional (default: 1)
		``scanpy.neighbors.compute_connectivities_umap`` parameter, how many nearest neighbors of
		each cell are assumed to be fully connected (and given a connectivity value of 1)
	n_jobs : ``int`` or ``None``, optional (default: ``None``)
		Parallelise neighbour identification when using a Euclidean distance metric;
		if ``None``, use all cores. Does nothing with a different metric.
	save_knn : ``bool``, optional (default: ``False``)
		If ``True``, save the indices of the nearest neighbours for each cell in ``adata.uns['bbknn']``.
	copy : ``bool``, optional (default: ``False``)
		If ``True``, return a copy instead of writing to the supplied adata.
	'''
    adata = adata.copy() if copy else adata
    #basic sanity checks to begin
    #is our batch key actually present in the object?
    if batch_key not in adata.obs:
        raise ValueError("Batch key '" + batch_key +
                         "' not present in `adata.obs`.")
    #do we have a computed PCA? (the .dtype.fields is because of how adata.obsm is formatted)
    if 'X_pca' not in adata.obsm.dtype.fields:
        raise ValueError(
            "`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.")
    #prepare bbknn_pca_matrix input
    pca = adata.obsm['X_pca']
    batch_list = adata.obs[batch_key].values
    #call BBKNN proper
    bbknn_out = bbknn_pca_matrix(pca=pca,
                                 batch_list=batch_list,
                                 neighbors_within_batch=neighbors_within_batch,
                                 n_pcs=n_pcs,
                                 trim=trim,
                                 scale_distance=scale_distance,
                                 approx=approx,
                                 metric=metric,
                                 bandwidth=bandwidth,
                                 local_connectivity=local_connectivity,
                                 n_jobs=n_jobs,
                                 save_knn=save_knn)
    #optionally save knn_indices
    if save_knn:
        adata.uns['bbknn'] = bbknn_out[2]
    adata.uns['neighbors'] = {}
    adata.uns['neighbors']['params'] = {
        'n_neighbors': neighbors_within_batch * len(np.unique(batch_list)),
        'method': 'umap'
    }
    adata.uns['neighbors']['distances'] = bbknn_out[0]
    adata.uns['neighbors']['connectivities'] = bbknn_out[1]
    logg.hint('added to `.uns[\'neighbors\']`\n'
              '    \'distances\', weighted adjacency matrix\n'
              '    \'connectivities\', weighted adjacency matrix')
    return adata if copy else None
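
A short sketch of how the function above slots into a Scanpy workflow; the file name is made up, and the downstream UMAP/clustering calls are standard Scanpy steps rather than part of the source.

import scanpy as sc

adata = sc.read_h5ad("combined_batches.h5ad")   # hypothetical multi-batch object
sc.pp.pca(adata, n_comps=50)                    # bbknn expects adata.obsm['X_pca']
bbknn(adata, batch_key='batch', neighbors_within_batch=3, n_pcs=50)
# the batch-balanced graph is now in adata.uns['neighbors'], so downstream
# graph-based tools can be run as usual
sc.tl.umap(adata)
sc.tl.leiden(adata)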
Example #10
def bbknn(adata, batch_key='batch', approx=True, metric='angular', copy=False, **kwargs):
	'''
	Batch balanced KNN, altering the KNN procedure to identify each cell's top neighbours in
	each batch separately instead of the entire cell pool with no accounting for batch.
	Aligns batches in a quick and lightweight manner.
	For use in the scanpy workflow as an alternative to ``scanpy.api.pp.neighbors()``.
	
	Input
	-----
	adata : ``AnnData``
		Needs the PCA computed and stored in ``adata.obsm["X_pca"]``.
	batch_key : ``str``, optional (default: "batch")
		``adata.obs`` column name discriminating between your batches.
	neighbors_within_batch : ``int``, optional (default: 3)
		How many top neighbours to report for each batch; total number of neighbours 
		will be this number times the number of batches.
	n_pcs : ``int``, optional (default: 50)
		How many principal components to use in the analysis.
	trim : ``int`` or ``None``, optional (default: ``None``)
		Trim the neighbours of each cell to these many top connectivities. May help with 
		population independence and improve the tidiness of clustering. The lower the value the
		more independent the individual populations, at the cost of more conserved batch effect.
		If ``None``, sets the parameter value automatically to 10 times the total number of
		neighbours for each cell. Set to 0 to skip.
	approx : ``bool``, optional (default: ``True``)
		If ``True``, use annoy's approximate neighbour finding. This results in a quicker run time 
		for large datasets while also potentially increasing the degree of batch correction.
	n_trees : ``int``, optional (default: 10)
		Only used when ``approx=True``. The number of trees to construct in the annoy forest.
		More trees give higher precision when querying, at the cost of increased run time and 
		resource intensity.
	use_faiss : ``bool``, optional (default: ``True``)
		If ``approx=False`` and the metric is "euclidean", use the faiss package to compute
		nearest neighbours if installed. This improves performance at a minor cost to numerical 
		precision as faiss operates on float32.
	metric : ``str`` or ``sklearn.neighbors.DistanceMetric``, optional (default: "angular")
		What distance metric to use. If using ``approx=True``, the options are "angular",
		"euclidean", "manhattan" and "hamming". Otherwise, the options are "euclidean", 
		a member of the ``sklearn.neighbors.KDTree.valid_metrics`` list, or parameterised 
		``sklearn.neighbors.DistanceMetric`` `objects 
		<https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html>`_:
		
		>>> from sklearn import neighbors
		>>> neighbors.KDTree.valid_metrics
		['p', 'chebyshev', 'cityblock', 'minkowski', 'infinity', 'l2', 'euclidean', 'manhattan', 'l1']
		>>> pass_this_as_metric = neighbors.DistanceMetric.get_metric('minkowski',p=3)
	set_op_mix_ratio : ``float``, optional (default: 1)
		UMAP connectivity computation parameter, float between 0 and 1, controlling the 
		blend between a connectivity matrix formed exclusively from mutual nearest neighbour
		pairs (0) and a union of all observed neighbour relationships with the mutual pairs
		emphasised (1)
	local_connectivity : ``int``, optional (default: 1)
		UMAP connectivity computation parameter, how many nearest neighbors of each cell
		are assumed to be fully connected (and given a connectivity value of 1)
	copy : ``bool``, optional (default: ``False``)
		If ``True``, return a copy instead of writing to the supplied adata.
	'''
	logg.info('computing batch balanced neighbors', r=True)
	adata = adata.copy() if copy else adata
	#basic sanity checks to begin
	#is our batch key actually present in the object?
	if batch_key not in adata.obs:
		raise ValueError("Batch key '"+batch_key+"' not present in `adata.obs`.")
	#do we have a computed PCA? (the .dtype.fields is because of how adata.obsm is formatted)
	if 'X_pca' not in adata.obsm.dtype.fields:
		raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.")
	#metric sanity checks
	if approx and metric not in ['angular', 'euclidean', 'manhattan', 'hamming']:
		logg.warn('unrecognised metric for type of neighbor calculation, switching to angular')
		metric = 'angular'
	elif not approx and not (metric=='euclidean' or isinstance(metric,DistanceMetric) or metric in KDTree.valid_metrics):
		logg.warn('unrecognised metric for type of neighbor calculation, switching to euclidean')
		metric = 'euclidean'
	#prepare bbknn_pca_matrix input
	pca = adata.obsm['X_pca']
	batch_list = adata.obs[batch_key].values
	#call BBKNN proper
	bbknn_out = bbknn_pca_matrix(pca=pca, batch_list=batch_list,
								 approx=approx, metric=metric, **kwargs)
	logg.info('	finished', time=True, end=' ' if settings.verbosity > 2 else '\n')
	adata.uns['neighbors'] = {}
	#we'll have a zero distance for our cell of origin, and nonzero for every other neighbour computed
	adata.uns['neighbors']['params'] = {'n_neighbors': len(bbknn_out[0][0,:].data)+1, 'method': 'umap'}
	adata.uns['neighbors']['distances'] = bbknn_out[0]
	adata.uns['neighbors']['connectivities'] = bbknn_out[1]
	logg.hint(
		'added to `.uns[\'neighbors\']`\n'
		'	\'distances\', weighted adjacency matrix\n'
		'	\'connectivities\', weighted adjacency matrix')
	return adata if copy else None
def _highly_variable_pearson_residuals(
    adata: AnnData,
    theta: float = 100,
    clip: Optional[float] = None,
    n_top_genes: int = 1000,
    batch_key: Optional[str] = None,
    chunksize: int = 1000,
    check_values: bool = True,
    layer: Optional[str] = None,
    subset: bool = False,
    inplace: bool = True,
) -> Optional[pd.DataFrame]:
    """\
    See `scanpy.experimental.pp.highly_variable_genes`.

    Returns
    -------
    If `inplace=True`, `adata.var` is updated with the following fields. Otherwise,
    returns the same fields as :class:`~pandas.DataFrame`.

    highly_variable : bool
        boolean indicator of highly-variable genes
    means : float
        means per gene
    variances : float
        variance per gene
    residual_variances : float
        Residual variance per gene. Averaged in the case of multiple batches.
    highly_variable_rank : float
        Rank of the gene according to residual variance, median rank in the case of multiple batches
    highly_variable_nbatches : int
        If `batch_key` given, denotes in how many batches genes are detected as HVG
    highly_variable_intersection : bool
        If `batch_key` given, denotes the genes that are highly variable in all batches
    """

    view_to_actual(adata)
    X = _get_obs_rep(adata, layer=layer)
    computed_on = layer if layer else 'adata.X'

    # Check for raw counts
    if check_values and (check_nonnegative_integers(X) is False):
        warnings.warn(
            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
            UserWarning,
        )
    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 would be undefined.
        raise ValueError('Pearson residuals require theta > 0')

    if batch_key is None:
        batch_info = np.zeros(adata.shape[0], dtype=int)
    else:
        batch_info = adata.obs[batch_key].values
    n_batches = len(np.unique(batch_info))

    # Get pearson residuals for each batch separately
    residual_gene_vars = []
    for batch in np.unique(batch_info):

        adata_subset_prefilter = adata[batch_info == batch]
        X_batch_prefilter = _get_obs_rep(adata_subset_prefilter, layer=layer)

        # Filter out zero genes
        with settings.verbosity.override(Verbosity.error):
            nonzero_genes = np.ravel(X_batch_prefilter.sum(axis=0)) != 0
        adata_subset = adata_subset_prefilter[:, nonzero_genes]
        X_batch = _get_obs_rep(adata_subset, layer=layer)

        # Prepare clipping
        if clip is None:
            n = X_batch.shape[0]
            clip = np.sqrt(n)
        if clip < 0:
            raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

        if sp_sparse.issparse(X_batch):
            sums_genes = np.sum(X_batch, axis=0)
            sums_cells = np.sum(X_batch, axis=1)
            sum_total = np.sum(sums_genes).squeeze()
        else:
            sums_genes = np.sum(X_batch, axis=0, keepdims=True)
            sums_cells = np.sum(X_batch, axis=1, keepdims=True)
            sum_total = np.sum(sums_genes)

        # Compute pearson residuals in chunks
        residual_gene_var = np.empty((X_batch.shape[1]))
        for start in np.arange(0, X_batch.shape[1], chunksize):
            stop = start + chunksize
            mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
            # densify only if the chunk is sparse; dense arrays can be sliced directly
            chunk = X_batch[:, start:stop]
            X_dense = chunk.toarray() if sp_sparse.issparse(chunk) else chunk
            residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta)
            residuals = np.clip(residuals, a_min=-clip, a_max=clip)
            residual_gene_var[start:stop] = np.var(residuals, axis=0)

        # Add 0 values for genes that were filtered out
        unmasked_residual_gene_var = np.zeros(len(nonzero_genes))
        unmasked_residual_gene_var[nonzero_genes] = residual_gene_var
        residual_gene_vars.append(unmasked_residual_gene_var.reshape(1, -1))

    residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)

    # Get rank per gene within each batch
    # argsort twice gives ranks, small rank means most variable
    ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
    ranks_residual_var = ranks_residual_var.astype(np.float32)
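    # Worked illustration (not from the original source): for one batch with residual
    # variances [0.2, 0.9, 0.5], np.argsort([-0.2, -0.9, -0.5]) gives [1, 2, 0] and a
    # second argsort gives [2, 0, 1], so the most variable gene (0.9) receives rank 0.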
    # count in how many batches a gene was among the n_top_genes
    highly_variable_nbatches = np.sum(
        (ranks_residual_var < n_top_genes).astype(int), axis=0
    )
    # set non-top genes within each batch to nan
    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
    ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
    # Median rank across batches, ignoring batches in which gene was not selected
    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)

    means, variances = materialize_as_ndarray(_get_mean_var(X))
    df = pd.DataFrame.from_dict(
        dict(
            means=means,
            variances=variances,
            residual_variances=np.mean(residual_gene_vars, axis=0),
            highly_variable_rank=medianrank_residual_var,
            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
            highly_variable_intersection=highly_variable_nbatches == n_batches,
        )
    )
    df = df.set_index(adata.var_names)

    # Sort genes by how often they were selected as HVG within each batch and
    # break ties with the median rank of residual variance across batches
    df.sort_values(
        ['highly_variable_nbatches', 'highly_variable_rank'],
        ascending=[False, True],
        na_position='last',
        inplace=True,
    )

    high_var = np.zeros(df.shape[0], dtype=bool)
    high_var[:n_top_genes] = True
    df['highly_variable'] = high_var
    df = df.loc[adata.var_names, :]

    if inplace:
        adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
        logg.hint(
            'added\n'
            '    \'highly_variable\', boolean vector (adata.var)\n'
            '    \'highly_variable_rank\', float vector (adata.var)\n'
            '    \'highly_variable_nbatches\', int vector (adata.var)\n'
            '    \'highly_variable_intersection\', boolean vector (adata.var)\n'
            '    \'means\', float vector (adata.var)\n'
            '    \'variances\', float vector (adata.var)\n'
            '    \'residual_variances\', float vector (adata.var)'
        )
        adata.var['means'] = df['means'].values
        adata.var['variances'] = df['variances'].values
        adata.var['residual_variances'] = df['residual_variances']
        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'
            ].values
            adata.var['highly_variable_intersection'] = df[
                'highly_variable_intersection'
            ].values
        adata.var['highly_variable'] = df['highly_variable'].values

        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)

    else:
        if batch_key is None:
            df = df.drop(
                ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
            )
        if subset:
            df = df.iloc[df.highly_variable.values, :]

        return df
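
A small usage sketch going through the public wrapper that the docstring above points to; the dataset call is an example choice, and it is assumed that ``scanpy.experimental.pp.highly_variable_genes`` dispatches to this flavour via ``flavor='pearson_residuals'``.

import scanpy as sc

adata = sc.datasets.pbmc3k()   # example dataset with raw counts, as Pearson residuals expect
sc.experimental.pp.highly_variable_genes(
    adata, flavor='pearson_residuals', n_top_genes=1000
)
# per-gene results are written to adata.var (e.g. 'highly_variable', 'residual_variances'),
# and adata.uns['hvg'] records the flavour and the matrix the values were computed on
print(int(adata.var['highly_variable'].sum()))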