def test_efficient_corr():
    """Check `utils.efficient_corr` on random, known, empty and bad inputs."""
    # two random (100, 10) arrays should yield ten correlation values
    x, y = np.random.rand(2, 100, 10)
    result = utils.efficient_corr(x, y)
    assert len(result) == 10

    # an array correlated against its row-reversed copy has a known answer
    a = np.arange(9).reshape(3, 3)
    b = np.arange(9).reshape(3, 3)[::-1]
    expected = [-1, -1, -1]
    result = utils.efficient_corr(a, b)
    assert np.all(result == expected)

    # nothing to correlate -> NaN rather than an exception
    empty_result = utils.efficient_corr([], [])
    assert np.isnan(empty_result)

    # inputs with mismatched lengths must be rejected
    truncated = a[:2]
    with pytest.raises(ValueError):
        utils.efficient_corr(truncated, b)
def keep_stable_genes(expression, threshold=0.9, percentile=True, rank=True):
    """
    Removes genes in `expression` with differential stability <= `threshold`

    Calculates the similarity of gene expression across brain regions for every
    pair of donors in `expression`. Similarity is averaged across donor pairs
    and only genes whose mean similarity exceeds `threshold` are retained.

    Parameters
    ----------
    expression : list of (R, G) pandas.DataFrame
        Where each entry is the microarray expression of `R` regions across `G`
        genes for a given donor
    threshold : [0, 1] float, optional
        Minimum required average similarity (e.g., correlation) across donors
        for a gene to be retained. Default: 0.9
    percentile : bool, optional
        Whether to treat `threshold` as a percentile instead of an absolute
        cutoff. For example, `threshold=0.9` and `percentile=True` would
        retain only those genes with a differential stability in the top 10%
        of all genes, whereas `percentile=False` would retain only those genes
        with differential stability > 0.9. Default: True
    rank : bool, optional
        Whether to calculate similarity as Spearman correlation instead of
        Pearson correlation. Default: True

    Returns
    -------
    expression : list of (R, Gr) pandas.DataFrame
        Microarray expression for `R` regions across `Gr` genes, where `Gr` is
        the number of retained genes

    Notes
    -----
    Genes whose mean similarity is NaN are never retained. When `percentile`
    is True, such genes are ignored when converting `threshold` into an
    absolute cutoff so that they cannot invalidate the cutoff for every other
    gene.
    """

    # get number of donors and number of genes
    num_subj = len(expression)
    num_gene = expression[0].shape[-1]

    # rank data, if necessary
    for_corr = [e.rank() for e in expression] if rank else expression

    # regions with at least one non-missing value, computed once per donor
    # instead of once per donor pair
    valid_regions = [e.dropna(axis=0, how='all').index for e in for_corr]

    # get correlation of gene expression across regions for all donor pairs
    gene_corrs = np.zeros((num_gene, sum(range(num_subj))))
    for n, (s1, s2) in enumerate(itertools.combinations(range(num_subj), 2)):
        # only regions present for both donors can be compared
        regions = np.intersect1d(valid_regions[s1], valid_regions[s2])
        gene_corrs[:, n] = utils.efficient_corr(for_corr[s1].loc[regions],
                                                for_corr[s2].loc[regions])

    # average similarity across donors
    gene_corrs = gene_corrs.mean(axis=1)

    # calculate absolute threshold if percentile is desired
    if percentile:
        # `np.percentile` returns NaN as soon as one value is NaN, which would
        # make the comparison below False for every gene; skip NaNs instead
        threshold = np.nanpercentile(gene_corrs, threshold * 100)

    # comparisons against NaN are False, so NaN genes are dropped here
    keep_genes = gene_corrs > threshold

    return [e.iloc[:, keep_genes] for e in expression]
def get_stable_probes(microarray, annotation, probes):
    """
    Selects one representative probe per gene via differential stability

    When several probes in `probes` index the same gene, the expression of
    every probe is correlated between each pair of donors, the correlations
    are averaged over donor pairs, and the probe with the highest average
    (i.e., the most consistent pattern of regional variation, or
    "differential stability") is kept for that gene. Regions are defined by
    the "structure_id" column in `annotation`; similarity is calculated by
    the Spearman correlation coefficient.

    Parameters
    ----------
    microarray : list of str
        List of microarray expression files from Allen Brain Institute.
        Optimally obtained by calling `abagen.fetch_microarray()` and
        accessing the `microarray` attribute on the resulting object.
    annotation : list of str
        List of annotation files from Allen Brain Institute. Optimally
        obtained by calling `abagen.fetch_microarray()` and accessing the
        `annotation` attribute on the resulting object.
    probes : pandas.DataFrame
        Dataframe containing information on probes that should be considered
        in representative analysis. Generally, intensity-based-filtering
        (i.e., `probe_ibf()`) should have been used to reduce this list to
        only those probes with good expression signal

    Returns
    -------
    representative : pandas.DataFrame
        Rows of `probes` for the probes that are most representative of their
        genes based on differential stability analysis

    References
    ----------
    Hawrylycz, M., Miller, J. A., Menon, V., Feng, D., Dolbeare, T.,
    Guillozet-Bongaarts, A. L., ... & Lein, E. (2015). Canonical genetic
    signatures of the adult human brain. Nature Neuroscience, 18(12), 1832.
    """

    n_donors = len(microarray)

    # loading each donor is relatively slow (i.e., takes a couple of seconds)
    donor_data = [_reduce_micro(fname, annotation[idx], probes)
                  for idx, fname in enumerate(microarray)]

    # one column of correlations per unordered donor pair
    donor_pairs = list(itertools.combinations(range(n_donors), 2))
    stability = np.zeros((len(probes), len(donor_pairs)))
    for col, (first, second) in enumerate(donor_pairs):
        # restrict to the samples that both donors of this pair share
        shared = np.intersect1d(donor_data[first].columns,
                                donor_data[second].columns)
        # ranking takes a few seconds and must be redone for every pair
        # because the set of shared samples differs between pairs
        ranked_first = donor_data[first][shared].T.rank()
        ranked_second = donor_data[second][shared].T.rank()
        stability[:, col] = utils.efficient_corr(ranked_first, ranked_second)

    # pair every probe with its gene and its mean cross-donor correlation
    summary = pd.DataFrame({
        'gene_symbol': probes.gene_symbol.values,
        'corrs': stability.mean(axis=1),
        'probe_id': probes.index,
    })

    def _best_probe(group):
        # probe ID of the row with the highest mean correlation in this gene
        return group.loc[group.corrs.idxmax(), 'probe_id']

    retained = summary.groupby('gene_symbol').apply(_best_probe)

    return probes[probes.index.isin(retained.values)]