def annotate_and_aggregate_gse28192(data, aggr_field=None, aggr_method=None):
    """
    Annotate GSE28192 microarray data with probe set fields and optionally aggregate.

    :param data: pandas DataFrame indexed by probe ID. NB: when annotation is requested, the annotation column(s)
    are written into this object in place.
    :param aggr_field: One of None (return ``data`` untouched), 'all' (add every supported annotation column
    without aggregating) or a key of the supported annotations ('ENTREZID', 'SYMBOL'), in which case that
    column is added and used to group the probes.
    :param aggr_method: Passed as ``method`` to ``process.aggregate_by_probe_set``. Must be None when aggr_field
    is None or 'all'.
    :return: ``data`` itself when aggr_field is None or 'all', otherwise whatever
    ``process.aggregate_by_probe_set`` returns.
    :raises ValueError: If the combination of aggr_field and aggr_method is inconsistent, or aggr_field is not
    recognised.
    """
    # output column name -> column name in the probe set table
    ann_cols = {
        'ENTREZID': 'Entrez_Gene_ID',
        'SYMBOL': 'Symbol',
    }

    if aggr_field == 'all' and aggr_method is not None:
        raise ValueError("Cannot specify an aggregation method when aggr_field=='all'.")
    if aggr_field is None and aggr_method is not None:
        raise ValueError("Must specify an aggr_field if aggr_method is not None")
    if aggr_field is not None and aggr_field != 'all' and aggr_field not in ann_cols:
        raise ValueError("Unrecognised aggr_field %s." % aggr_field)

    # Nothing to annotate: return before paying for the probe set load.
    if aggr_field is None:
        return data

    probeset = load_gse28192_probeset()
    # only probes present in both tables can be annotated; other rows are left unset (NaN) in the new column(s)
    common_probes = probeset.index.intersection(data.index)

    if aggr_field == 'all':
        # add all relevant fields; no aggregation ('all' is not a column and no method is permitted)
        for out_col, probeset_col in ann_cols.items():
            data.loc[common_probes, out_col] = probeset.loc[common_probes, probeset_col]
    else:
        # include only the aggregation field
        data.loc[common_probes, aggr_field] = probeset.loc[common_probes, ann_cols[aggr_field]]
        # aggregate
        data = process.aggregate_by_probe_set(data, groupby=aggr_field, method=aggr_method)

    return data
def load_from_r_processed(infile, sample_names, aggr_field=None, aggr_method=None):
    """
    Load microarray data that has been created in R.
    This is (unfortunately) necessary when we want to apply certain pre-processing steps like RMA.

    :param infile: The input file name (tab-separated, header in the first row, probe IDs in the first column).
    :param sample_names: List (or any iterable) of the sample names to keep. If None, all will be kept.
    NB: if None and the input file contains columns other than the 'standard' annotations, they will NOT be dropped.
    NB: only applied when aggregating; with aggr_field=None the table is returned exactly as read.
    :param aggr_field: Annotation column to group by. Must be supplied together with aggr_method. A value outside
    AGGREGATION_FIELD_CHOICES only triggers a warning.
    :param aggr_method: Passed as ``method`` to ``process.aggregate_by_probe_set``.
    :return: The table as read when no aggregation is requested, otherwise the result of
    ``process.aggregate_by_probe_set``.
    :raises ValueError: If exactly one of aggr_field / aggr_method is supplied.
    """
    if (aggr_field is None) != (aggr_method is None):
        raise ValueError("Must either supply BOTH aggr_field and aggr_method or NEITHER.")
    if aggr_field is not None and aggr_field not in AGGREGATION_FIELD_CHOICES:
        logger.warning("Unrecognised aggregation field. Supported options are %s.", ', '.join(AGGREGATION_FIELD_CHOICES))

    arr_data = pd.read_csv(infile, sep='\t', header=0, index_col=0)

    if aggr_field is None:
        return arr_data

    # drop other meta fields
    if sample_names is None:
        # we'll have to guess what the other annotation fields are then drop them - this might not work
        other_annot = [fld for fld in AGGREGATION_FIELD_CHOICES if fld != aggr_field]
        arr_data = arr_data.drop(other_annot, axis=1, errors='ignore')
    else:
        # Use sample names to ensure we only keep data columns.
        # list() so that tuples and other iterables are accepted, as documented.
        arr_data = arr_data.loc[:, [aggr_field] + list(sample_names)]

    # aggr_method is guaranteed to be set here by the BOTH-or-NEITHER check above
    return process.aggregate_by_probe_set(arr_data, groupby=aggr_field, method=aggr_method)
def convert_microarray_to_gene_activity(marray_data, ilm_cat, genes=None, method='median'):
    """
    Collapse probe-level microarray intensities to one value per gene.

    A gene symbol column is attached using the Illumina catalogue and the result is handed to
    ``aggregate_by_probe_set``, which resolves genes covered by several probes according to ``method``.
    Zhao et al (PLoS One 2014) keep the most active probe in that situation (method='max'); sum, mean, etc.
    are alternatives.

    :param marray_data: pandas dataframe containing normalised intensity data
    :param ilm_cat: Illumina catalogue, used for conversion (pandas dataframe)
    :param genes: Optional list of genes of interest. NB: accepted but not currently used by this function, so
    all genes are always computed.
    :param method: Aggregation method forwarded to ``aggregate_by_probe_set``, e.g. 'max', 'mean', 'sum'.
    :return: The output of ``aggregate_by_probe_set``.
    """
    return aggregate_by_probe_set(
        add_gene_symbol_column(marray_data, ilm_cat),
        method=method,
    )
- Aggregate and take mean over repeats AFTER taking logs. """ # METHOD 1: the old loader LOAD_METHOD = 3 if LOAD_METHOD == 1: marray_data, pvals = load_illumina_data.load_normed_microarray_data( pval=None, return_pvals=True) # add constant to each array to force non-negative values marray_data = marray_data.subtract(marray_data.min(axis=0)) probe_set = load_illumina_data.load_illumina_array_library() marray_ann = load_illumina_data.add_gene_symbol_column( marray_data, probe_set) marray_all = aggregate_by_probe_set(marray_ann, method=AGGR_METHOD) # take mean over repeats for sn in load_illumina_data.SAMPLE_NAMES: marray_all.loc[:, sn] = marray_all.loc[:, [sn, sn + '-R']].mean(axis=1) # log2 marray_all_log = np.log2(marray_all + eps) elif LOAD_METHOD == 2: # METHOD 2: the new loader from load_data import microarray_data marray_all_log, marray_meta = microarray_data.load_annotated_gse28192( aggr_field='SYMBOL', aggr_method=AGGR_METHOD)
from scripts.comparison_rnaseq_microarray import load_illumina_data
from microarray.process import aggregate_by_probe_set
import numpy as np
from scipy.stats import nbinom
from statsmodels.base.model import GenericLikelihoodModel

# load the normalised intensities together with the per-probe detection p values
marray_data, pvals = load_illumina_data.load_normed_microarray_data(
    pval=None, return_pvals=True)

# reduce to significant genes: keep rows whose p value is below 0.05 in every column
marray_data = marray_data.loc[(pvals < 0.05).all(axis=1), :]

# annotate probes with gene symbols, then collapse to one (median) value per gene
probe_set = load_illumina_data.load_illumina_array_library()
marray_ann = load_illumina_data.add_gene_symbol_column(marray_data, probe_set)
marray_by_gene = aggregate_by_probe_set(marray_ann, method="median")


def ll_nbinom(y, X, beta, alph):
    """
    Pointwise negative binomial log likelihood with a log link.

    :param y: The responses
    :param X: The regressors
    :param beta: Vector of coefficients
    :param alph: Negative binomial heterogeneity parameter
    :return: Log likelihood of each response, as computed by ``scipy.stats.nbinom.logpmf``
    """
    mu = np.exp(np.dot(X, beta))  # expectation
    size = 1 / float(alph)  # r parameter (scipy's `n`: number of successes)
    prob = size / (size + mu)  # or 1 / (1 + alph * mu): probability of success
    ll = nbinom.logpmf(y, size, prob)
    # the docstring promises the log likelihood; previously it was computed but never returned
    return ll
def plot_microarray_mb_gene_expression(mb_samples=('ICb1299-III', 'ICb1299-IV')):
    """
    Draw two figures for the MB-implicated genes:
    1) bar chart subplots showing the absolute normed intensity values in both healthy and MB samples.
    2) bar chart subplots showing the log2 fold change (MB over healthy) in those same genes.

    Each sample is combined with its repeat (suffix '-R') and the mean over all selected columns is used.

    :param mb_samples: Iterable with the sample names to use in the MB data
    :return: None; the figures are created through ``plotting.bar``.
    """
    from plotting import bar
    plt = bar.plt

    aggregation = 'median'
    healthy_base = [
        'NT-1197',
        'NCb-1',
        'NCb-2',
        'A911105',
        'A508112',
        'A508285',
    ]
    healthy_names = healthy_base + [name + '-R' for name in healthy_base]

    # margins shared by both figures
    layout = dict(left=0.1, right=0.99, bottom=0.2, top=0.95, wspace=0.08, hspace=0.)

    # full microarray data, with missing values replaced by zero
    intensities = load_illumina_data.load_normed_microarray_data(pval=0.01)
    intensities.fillna(value=0., inplace=True)

    # attach gene symbols from the probe set definitions and collapse to one row per gene
    library = load_illumina_data.load_illumina_array_library()
    annotated = load_illumina_data.add_gene_symbol_column(intensities, library)
    by_gene = aggregate_by_probe_set(annotated, method=aggregation)

    mb_names = list(mb_samples) + [name + '-R' for name in mb_samples]

    # per-gene mean over the chosen samples; done BEFORE the group column is added
    mb = by_gene.loc[:, mb_names].mean(axis=1)
    he = by_gene.loc[:, healthy_names].mean(axis=1)

    # label each gene with its MB group and split both series by that label
    by_gene = annotate_by_MB_group(by_gene)
    mb_grouped = {grp: mb.loc[by_gene.mb_group == grp] for grp, _ in MB_GROUPS}
    he_grouped = {grp: he.loc[by_gene.mb_group == grp] for grp, _ in MB_GROUPS}

    # figure 1: absolute normed intensity, healthy next to MB
    chart_data = collections.OrderedDict(
        (grp, [he_grouped[grp], mb_grouped[grp]]) for grp, _ in REF_GROUPS
    )
    fig, axs = bar.multi_grouped_bar_chart(chart_data, xlabel_coords=(0.5, -.21))
    axs[-1].legend(['Healthy cerebellum', 'MB'])
    axs[0].set_ylabel('Normed intensity')
    # pin the lower y limit just below zero, keep the upper limit as drawn
    ylim = list(axs[-1].get_ylim())
    ylim[0] = -1e-6
    axs[-1].set_ylim(ylim)
    plt.subplots_adjust(**layout)

    # figure 2: log2 fold change
    log_min = -7
    log_max = 7
    log_fold_diff = {
        grp: np.log2(mb_grouped[grp] / he_grouped[grp]) for grp in mb_grouped
    }
    chart_data = collections.OrderedDict(
        (grp, [log_fold_diff[grp]]) for grp, _ in REF_GROUPS
    )
    fig, axs = bar.multi_grouped_bar_chart(
        chart_data,
        xlabel_coords=(0.5, -.21),
        ylim=[log_min, log_max],
        colours=['gray'],
    )
    axs[0].set_ylabel('Log2 fold change')
    plt.subplots_adjust(**layout)
def load_microarray_reference_data(parent_struct_id=None, mask_nonsig=False,
                                   ann_field=('entrez_id', 'gene_symbol'), agg_method=None):
    """
    Load and process the Allen microarray data from the raw source format residing on disk.

    :param parent_struct_id: If supplied, restrict to this structure and its children. e.g. cerebellum is 4696
    :param mask_nonsig: If True, replace any values considered to be below statistical significance with NA
    :param ann_field: If supplied, annotate probe sets with this attribute from the probes annotation file.
    Examples:
    'entrez_id' (Entrez gene ID)
    'gene_symbol' (approved gene symbol)
    If None, no annotation is added.
    If an iterable, multiple annotation columns are added. This must be a single string if agg_method is supplied,
    as it is the field used for aggregation.
    :param agg_method: This string specifies the method used to aggregate over probe sets, grouping by the
    ann_field column. Options are None, 'min', 'max', 'median', 'mean'. If None, no aggregation is carried out.
    :return: Tuple (expression, sample_meta): the per-donor expression tables joined column-wise (with the
    annotation column(s) prepended, and aggregated if requested) and the per-donor sample metadata stacked
    row-wise.
    :raises ValueError: If agg_method is given but ann_field is not a single string.
    """
    # sanity check inputs
    # NB: str itself defines __iter__ on Python 3, so test for a string explicitly rather than for iterability
    if agg_method is not None and not isinstance(ann_field, str):
        raise ValueError(
            "When agg_method is not None, ann_field must be a string.")

    DONOR_NUMBERS = [9861, 10021, 12876, 14380, 15496, 15697]

    # load probe library
    probe_fn = os.path.join(MICROARRAY_DIR, 'Probes.csv')
    probes = pd.read_csv(probe_fn, index_col=0)

    # keep only those probes with an Entrez ID
    probes = probes.dropna(axis=0, subset=['entrez_id'])

    struct_ids = get_structure_ids_by_parent(parent_struct_id)
    p_kwds = {
        'struct_ids': struct_ids if parent_struct_id else None,
        'mask_nonsig': mask_nonsig
    }

    expression_parts = []
    sample_meta_parts = []

    # one worker job per donor; results are collected in donor order
    pool = mp.Pool()
    try:
        jobs = [
            (dn, pool.apply_async(load_one_microarray_donor, args=(dn, probes), kwds=p_kwds))
            for dn in DONOR_NUMBERS
        ]
        pool.close()
        for dn, job in jobs:
            logger.info("Processing donor %d", dn)
            # very long timeout rather than a bare get(), kept from the original implementation
            expre, sampl = job.get(1e12)
            expression_parts.append(expre)
            sample_meta_parts.append(sampl)
            logger.info("Completed donor %d", dn)
        pool.join()
    finally:
        # harmless after a clean join; reaps the workers if anything above raised
        pool.terminate()

    # a single concat per table replaces the repeated concat / DataFrame.append (removed in pandas 2.0)
    # NOTE(review): assumes each donor's sample metadata is a DataFrame - confirm against
    # load_one_microarray_donor
    expression = pd.concat(expression_parts, axis=1)
    sample_meta = pd.concat(sample_meta_parts)

    if ann_field is not None:
        # prepend gene symbol and entrez ID to the total expression dataframe
        # a tuple passed to .loc is read as ONE key, so turn multi-field selections into a list
        ann_cols = ann_field if isinstance(ann_field, str) else list(ann_field)
        expression = pd.concat(
            [probes.loc[expression.index, ann_cols], expression], axis=1)

    if agg_method is not None:
        # aggregate by the annotation field
        expression = aggregate_by_probe_set(expression, method=agg_method, groupby=ann_field)

    return expression, sample_meta