Exemple #1
0
def test_efficient_corr():
    """
    Checks `utils.efficient_corr` on random, known, empty, and mismatched input

    Four cases are exercised: two random (100, 10) arrays must yield 10
    values; a (3, 3) array and its row-reversed copy must yield -1 for every
    column; empty inputs must yield NaN; and inputs with differing numbers of
    rows must raise a ValueError.
    """

    # two random (100, 10) arrays should give one correlation per column
    first, second = np.random.rand(2, 100, 10)
    result = utils.efficient_corr(first, second)
    assert len(result) == 10

    # reversing the row order makes every column perfectly anti-correlated
    a = np.arange(9).reshape(3, 3)
    b = np.arange(9).reshape(3, 3)[::-1]
    result = utils.efficient_corr(a, b)
    expected = [-1, -1, -1]
    assert np.all(result == expected)

    # nothing to correlate -> NaN
    empty_result = utils.efficient_corr([], [])
    assert np.isnan(empty_result)

    # two rows versus three rows is not a valid pairing
    truncated = a[:2]
    with pytest.raises(ValueError):
        utils.efficient_corr(truncated, b)
Exemple #2
0
def keep_stable_genes(expression, threshold=0.9, percentile=True, rank=True):
    """
    Removes genes in `expression` with differential stability <= `threshold`

    Calculates the similarity of gene expression across brain regions for every
    pair of donors in `expression`. Similarity is averaged across donor pairs
    and only genes whose mean similarity is strictly greater than `threshold`
    are retained.

    Parameters
    ----------
    expression : list of (R, G) pandas.DataFrame
        Where each entry is the microarray expression of `R` regions across `G`
        genes for a given donor
    threshold : [0, 1] float, optional
        Minimum required average similarity (e.g, correlation) across donors
        for a gene to be retained. Default: 0.9
    percentile : bool, optional
        Whether to treat `threshold` as a percentile instead of an absolute
        cutoff. For example, `threshold=0.9` and `percentile=True` would
        retain only those genes with a differential stability in the top 10% of
        all genes, whereas `percentile=False` would retain only those genes
        with differential stability > 0.9. Default: True
    rank : bool, optional
        Whether to calculate similarity as Spearman correlation instead of
        Pearson correlation. Default: True

    Returns
    -------
    expression : list of (R, Gr) pandas.DataFrame
        Microarray expression for `R` regions across `Gr` genes, where `Gr` is
        the number of retained genes

    Notes
    -----
    Genes whose mean similarity is NaN are never retained (a comparison with
    NaN is always False). When `percentile=True` the cutoff is computed with
    `np.nanpercentile`, so such genes do not turn the cutoff itself into NaN
    and cause every gene to be dropped.
    """

    # get number of donors and number of genes
    num_subj = len(expression)
    num_gene = expression[0].shape[-1]

    # rank data, if necessary; note that ranks are computed once over all of a
    # donor's regions, before subsetting to the regions shared by a donor pair
    for_corr = expression if not rank else [e.rank() for e in expression]

    # regions with data for at least one gene; this only depends on the donor,
    # so compute it once per donor instead of once per donor pair
    valid_regions = [e.dropna(axis=0, how='all').index for e in for_corr]

    # get correlation of gene expression across regions for all donor pairs
    donor_pairs = list(itertools.combinations(range(num_subj), 2))
    gene_corrs = np.zeros((num_gene, len(donor_pairs)))
    for n, (s1, s2) in enumerate(donor_pairs):
        regions = np.intersect1d(valid_regions[s1], valid_regions[s2])
        gene_corrs[:, n] = utils.efficient_corr(for_corr[s1].loc[regions],
                                                for_corr[s2].loc[regions])

    # average similarity across donors
    gene_corrs = gene_corrs.mean(axis=1)
    # calculate absolute threshold if percentile is desired; ignore NaNs so
    # that a single undefined gene does not make the cutoff NaN
    if percentile:
        threshold = np.nanpercentile(gene_corrs, threshold * 100)
    keep_genes = gene_corrs > threshold

    return [e.iloc[:, keep_genes] for e in expression]
Exemple #3
0
def get_stable_probes(microarray, annotation, probes):
    """
    Selects a single representative probe per gene via differential stability

    When several probes in `probes` index the same gene, the probe whose
    regional expression pattern is most consistent across donors is kept.
    Consistency ("differential stability" or DS) is the Spearman correlation
    of a probe's expression between two donors, averaged over all donor pairs.
    Regions are defined by the "structure_id" column in `annotation`.

    Parameters
    ----------
    microarray : list of str
        Microarray expression files from the Allen Brain Institute, one per
        donor. Optimally obtained by calling `abagen.fetch_microarray()` and
        accessing the `microarray` attribute on the resulting object.
    annotation : list of str
        Annotation files from the Allen Brain Institute, in the same donor
        order as `microarray`. Optimally obtained by calling
        `abagen.fetch_microarray()` and accessing the `annotation` attribute
        on the resulting object.
    probes : pandas.DataFrame
        Information on the probes to be considered; must have a `gene_symbol`
        column and be indexed by probe ID. Generally, intensity-based
        filtering (i.e., `probe_ibf()`) should already have been applied so
        that only probes with good expression signal remain

    Returns
    -------
    representative : pandas.DataFrame
        Rows of `probes` for the probes selected as most representative of
        their genes based on differential stability analysis

    References
    ----------
    Hawrylycz, M., Miller, J. A., Menon, V., Feng, D., Dolbeare, T., Guillozet-
    Bongaarts, A. L., ... & Lein, E. (2015). Canonical genetic signatures of
    the adult human brain. Nature Neuroscience, 18(12), 1832.
    """

    n_donors = len(microarray)

    # load the reduced expression data for every donor up front; this step is
    # relatively slow (a couple of seconds)
    # NOTE(review): assumes each reduced frame has one row per entry in
    # `probes`, in the same order -- confirm against `_reduce_micro`
    reduced = [_reduce_micro(microarray[i], annotation[i], probes)
               for i in range(n_donors)]

    # one column of correlations per unordered pair of donors
    donor_pairs = list(itertools.combinations(range(n_donors), 2))
    pair_corrs = np.zeros((len(probes), len(donor_pairs)))
    for col, (first, second) in enumerate(donor_pairs):
        # restrict to the columns both donors have in common
        shared = np.intersect1d(reduced[first].columns,
                                reduced[second].columns)

        # ranking must be redone for every pair because `shared` differs from
        # pair to pair; it can take a few seconds each time
        ranked_first = reduced[first][shared].T.rank()
        ranked_second = reduced[second][shared].T.rank()
        pair_corrs[:, col] = utils.efficient_corr(ranked_first, ranked_second)

    # collect each probe's gene, mean correlation, and ID in one table
    summary = pd.DataFrame({
        'gene_symbol': probes.gene_symbol.values,
        'corrs': pair_corrs.mean(axis=1),
        'probe_id': probes.index,
    })

    def _best_probe(group):
        # ID of the probe with the highest mean correlation in this gene
        return group.loc[group.corrs.idxmax(), 'probe_id']

    retained = summary.groupby('gene_symbol').apply(_best_probe)
    is_representative = probes.index.isin(retained.values)

    return probes[is_representative]