Python aggregate_by_probe_setの例、microarray.process.aggregate_by_probe_set Pythonの例

コード例 #1

0

ファイルを表示

ファイル: microarray_data.py プロジェクト: gaberosser/qmul-bioinf

def annotate_and_aggregate_gse28192(data, aggr_field=None, aggr_method=None):
    """
    Annotate GSE28192 data using the probe set library and optionally aggregate by an annotation field.

    NB: when annotation is requested, the annotation column(s) are written into ``data`` IN PLACE for the probes
    that are present in both ``data`` and the probe set library.

    :param data: pandas DataFrame whose index is comparable with the index of the probe set library.
    :param aggr_field: One of
        None: no annotation and no aggregation; ``data`` is returned untouched.
        'all': add every supported annotation column ('ENTREZID' and 'SYMBOL'); no aggregation.
        'ENTREZID' or 'SYMBOL': add only that column and aggregate by it.
    :param aggr_method: Passed to process.aggregate_by_probe_set as ``method``. Only permitted when aggr_field is
    a single annotation field.
    :return: The (possibly annotated and aggregated) data.
    :raises ValueError: If the combination of aggr_field and aggr_method is not supported, or aggr_field is not
    recognised.
    """
    # output column name -> column name in the probe set library
    ann_cols = {
        'ENTREZID': 'Entrez_Gene_ID',
        'SYMBOL': 'Symbol',
    }

    if aggr_field == 'all' and aggr_method is not None:
        raise ValueError("Cannot specify an aggregation method when aggr_field=='all'.")
    if aggr_field is None and aggr_method is not None:
        raise ValueError("Must specify an aggr_field if aggr_method is not None")
    if aggr_field is not None and aggr_field != 'all' and aggr_field not in ann_cols:
        raise ValueError("Unrecognised aggr_field %s." % aggr_field)

    if aggr_field is None:
        # nothing to annotate, so return before paying for the probe set load
        return data

    probeset = load_gse28192_probeset()
    # only probes known to both tables can be annotated
    common_probes = probeset.index.intersection(data.index)

    if aggr_field == 'all':
        # add all relevant fields
        for a, b in ann_cols.items():
            data.loc[common_probes, a] = probeset.loc[common_probes, b]
    else:
        # include only the aggregation field
        data.loc[common_probes, aggr_field] = probeset.loc[common_probes, ann_cols[aggr_field]]
        # aggregate
        data = process.aggregate_by_probe_set(data, groupby=aggr_field, method=aggr_method)

    return data

コード例 #2

0

ファイルを表示

ファイル: microarray_data.py プロジェクト: gaberosser/qmul-bioinf

def load_from_r_processed(infile, sample_names, aggr_field=None, aggr_method=None):
    """
    Load microarray data that has been created in R.
    This is (unfortunately) necessary when we want to apply certain pre-processing steps like RMA.
    :param infile: The input file name (or anything else pandas.read_csv accepts). Read as tab-separated text with
    a header row; the first column is used as the index.
    :param sample_names: List (or any other iterable) of the sample names to keep. If None, all will be kept.
    NB: if None and the input file contains columns other than the 'standard' annotations, they will NOT be dropped.
    Only consulted when aggr_field is supplied.
    :param aggr_field: Name of the annotation column to group by. A value not in AGGREGATION_FIELD_CHOICES only
    triggers a warning. If None, the table is returned exactly as loaded.
    :param aggr_method: Passed to process.aggregate_by_probe_set as ``method``. Must be supplied if and only if
    aggr_field is supplied.
    :return: pandas DataFrame; aggregated by aggr_field when one is supplied.
    :raises ValueError: If only one of aggr_field and aggr_method is supplied.
    """
    if (aggr_method is None) != (aggr_field is None):
        raise ValueError("Must either supply BOTH aggr_field and aggr_method or NEITHER.")
    if aggr_field is not None and aggr_field not in AGGREGATION_FIELD_CHOICES:
        logger.warning("Unrecognised aggregation field. Supported options are %s.", ', '.join(AGGREGATION_FIELD_CHOICES))

    arr_data = pd.read_csv(infile, sep='\t', header=0, index_col=0)
    if aggr_field is None:
        return arr_data

    # drop other meta fields
    if sample_names is None:
        # we'll have to guess what the other annotation fields are then drop them - this might not work
        remaining_annot = list(set(AGGREGATION_FIELD_CHOICES) - {aggr_field})
        arr_data = arr_data.drop(remaining_annot, axis=1, errors='ignore')
    else:
        # Use sample names to ensure we only keep data columns.
        # list() so that tuples and other non-list iterables are accepted, as documented.
        arr_data = arr_data.loc[:, [aggr_field] + list(sample_names)]

    # aggr_method cannot be None here: the validation above ties it to aggr_field
    return process.aggregate_by_probe_set(arr_data, groupby=aggr_field, method=aggr_method)

コード例 #3

0

ファイルを表示

ファイル: load_illumina_data.py プロジェクト: gaberosser/qmul-bioinf

def convert_microarray_to_gene_activity(marray_data,
                                        ilm_cat,
                                        genes=None,
                                        method='median'):
    """
    Collapse probe-level microarray intensities into gene-level activity.
    Many genes are represented on multiple probes, so the probes for each gene are combined using ``method``.
    Zhao et al (PLoS One 2014) take the most active probe whenever there is ambiguity (method='max'); note that
    the default used here is 'median'.
    :param marray_data: pandas dataframe containing normalised intensity data
    :param ilm_cat: Illumina catalogue, used for conversion (pandas dataframe)
    :param genes: Optional list of genes of interest. NOTE: currently not used by this function - all genes are
    always computed.
    :param method: Aggregation method handed to aggregate_by_probe_set, e.g. 'max', 'mean', 'sum', 'median'.
    :return: The result of aggregate_by_probe_set applied to the gene symbol-annotated data.
    """
    return aggregate_by_probe_set(
        add_gene_symbol_column(marray_data, ilm_cat),
        method=method,
    )

コード例 #4

0

ファイルを表示

- Aggregate and take mean over repeats AFTER taking logs.
"""

# Selects which loading strategy below is used (1: the old loader, 2: the new loader).
# NOTE(review): the value set here is 3, for which no branch is visible in this excerpt - confirm against the
# full script.
LOAD_METHOD = 3

if LOAD_METHOD == 1:
    # METHOD 1: the old loader
    marray_data, pvals = load_illumina_data.load_normed_microarray_data(
        pval=None, return_pvals=True)
    # shift each array (column) by its own minimum so that the smallest value in every column becomes zero,
    # forcing non-negative values
    marray_data = marray_data.subtract(marray_data.min(axis=0))

    # annotate with gene symbols and aggregate; AGGR_METHOD is defined elsewhere in the script
    probe_set = load_illumina_data.load_illumina_array_library()
    marray_ann = load_illumina_data.add_gene_symbol_column(
        marray_data, probe_set)
    marray_all = aggregate_by_probe_set(marray_ann, method=AGGR_METHOD)

    # take mean over repeats: each sample column is overwritten with the mean of itself and its '-R' counterpart
    # (the '-R' columns themselves are left in place)
    for sn in load_illumina_data.SAMPLE_NAMES:
        marray_all.loc[:, sn] = marray_all.loc[:, [sn, sn + '-R']].mean(axis=1)

    # log2, after adding eps (defined elsewhere) - presumably to avoid taking the log of zero
    marray_all_log = np.log2(marray_all + eps)

elif LOAD_METHOD == 2:

    # METHOD 2: the new loader
    # Annotation and aggregation by 'SYMBOL' are delegated to the loader.
    # NOTE(review): no log2 is applied in this branch, so the loader presumably returns log-scale data - confirm.
    from load_data import microarray_data
    marray_all_log, marray_meta = microarray_data.load_annotated_gse28192(
        aggr_field='SYMBOL', aggr_method=AGGR_METHOD)

コード例 #5

0

ファイルを表示

from scripts.comparison_rnaseq_microarray import load_illumina_data
from microarray.process import aggregate_by_probe_set
import numpy as np

from scipy.stats import nbinom
from statsmodels.base.model import GenericLikelihoodModel

# Load the normalised microarray data together with the matching p values.
# pval=None presumably disables any p value filtering inside the loader - confirm against load_illumina_data.
marray_data, pvals = load_illumina_data.load_normed_microarray_data(
    pval=None, return_pvals=True)

# reduce to significant rows: keep only those whose p value is below 0.05 in EVERY column
marray_data = marray_data.loc[(pvals < 0.05).all(axis=1), :]

# annotate with gene symbols, then collapse to one row per gene using the median
probe_set = load_illumina_data.load_illumina_array_library()
marray_ann = load_illumina_data.add_gene_symbol_column(marray_data, probe_set)
marray_by_gene = aggregate_by_probe_set(marray_ann, method="median")


def ll_nbinom(y, X, beta, alph):
    """
    Pointwise log likelihood of a negative binomial regression model with a log link.

    :param y: The responses
    :param X: The regressors
    :param beta: Vector of coefficients
    :param alph: Negative binomial heterogeneity parameter (a scalar; must be non-zero)
    :return: Log likelihood of each response, as computed by scipy.stats.nbinom.logpmf
    """
    mu = np.exp(np.dot(X, beta))  # expectation
    size = 1 / float(alph)  # r parameter: number of trials
    prob = size / (size + mu)  # or 1 / (1 + alph * mu): probability of success
    ll = nbinom.logpmf(y, size, prob)
    return ll

コード例 #6

0

ファイルを表示

def plot_microarray_mb_gene_expression(mb_samples=('ICb1299-III',
                                                   'ICb1299-IV')):
    """
    Produce 2 figures from the microarray data, aggregated to gene level with the median:
    1) bar chart subplots showing the absolute normed intensity values for the MB-implicated genes in both
    healthy and MB samples.
    2) bar chart subplots showing the log2 fold change (MB relative to healthy) in those same genes
    Every sample is used together with its '-R' suffixed counterpart and the values are averaged over all of them.
    NB: the per-group values are built from MB_GROUPS but plotted in the order of REF_GROUPS, so every group named
    in REF_GROUPS must also appear in MB_GROUPS.
    :param mb_samples: Iterable with the sample names to use in the MB data
    :return:
    """
    from plotting import bar
    plt = bar.plt

    aggr_method = 'median'
    healthy_base = [
        'NT-1197',
        'NCb-1',
        'NCb-2',
        'A911105',
        'A508112',
        'A508285',
    ]
    healthy_names = healthy_base + [name + '-R' for name in healthy_base]
    # both figures share the same subplot geometry
    layout = dict(left=0.1,
                  right=0.99,
                  bottom=0.2,
                  top=0.95,
                  wspace=0.08,
                  hspace=0.)

    # full microarray data, with missing values replaced by zero
    marray_data = load_illumina_data.load_normed_microarray_data(pval=0.01)
    marray_data.fillna(value=0., inplace=True)

    # annotate using the probe set definitions, then collapse to one row per gene
    probe_set = load_illumina_data.load_illumina_array_library()
    marray_ann = load_illumina_data.add_gene_symbol_column(
        marray_data, probe_set)
    marray_by_gene = aggregate_by_probe_set(marray_ann, method=aggr_method)
    mb_names = list(mb_samples) + [name + '-R' for name in mb_samples]

    # mean intensity per gene within each cohort
    mb = marray_by_gene.loc[:, mb_names].mean(axis=1)
    he = marray_by_gene.loc[:, healthy_names].mean(axis=1)

    # the MB group column is what lets us split the genes by group
    marray_by_gene = annotate_by_MB_group(marray_by_gene)

    mb_grouped = {
        grp: mb.loc[marray_by_gene.mb_group == grp]
        for grp, _ in MB_GROUPS
    }
    he_grouped = {
        grp: he.loc[marray_by_gene.mb_group == grp]
        for grp, _ in MB_GROUPS
    }

    # figure 1: absolute normed intensity

    data = collections.OrderedDict(
        (grp, [he_grouped[grp], mb_grouped[grp]]) for grp, _ in REF_GROUPS)
    fig, axs = bar.multi_grouped_bar_chart(data, xlabel_coords=(0.5, -.21))

    axs[-1].legend(['Healthy cerebellum', 'MB'])
    axs[0].set_ylabel('Normed intensity')
    # nudge the lower limit just below zero, keeping the upper limit as it is
    _, upper = axs[-1].get_ylim()
    axs[-1].set_ylim([-1e-6, upper])
    plt.subplots_adjust(**layout)

    # figure 2: log2 fold change

    log_min = -7
    log_max = 7
    log_fold_diff = {
        grp: np.log2(mb_grouped[grp] / he_grouped[grp])
        for grp in mb_grouped
    }

    data = collections.OrderedDict(
        (grp, [log_fold_diff[grp]]) for grp, _ in REF_GROUPS)
    fig, axs = bar.multi_grouped_bar_chart(data,
                                           xlabel_coords=(0.5, -.21),
                                           ylim=[log_min, log_max],
                                           colours=['gray'])

    axs[0].set_ylabel('Log2 fold change')
    plt.subplots_adjust(**layout)

コード例 #7

0

ファイルを表示

ファイル: allen_human_brain_atlas.py プロジェクト: gaberosser/qmul-bioinf

def load_microarray_reference_data(parent_struct_id=None,
                                   mask_nonsig=False,
                                   ann_field=('entrez_id', 'gene_symbol'),
                                   agg_method=None):
    """
    Load and process the Allen microarray data from the raw source format residing on disk.
    The donors are loaded in parallel using a multiprocessing pool, then combined.
    :param parent_struct_id: If supplied, restrict to this structure and its children. e.g. cerebellum is 4696
    :param mask_nonsig: If True, replace any values considered to be below statistical significance with NA
    :param ann_field: If supplied, annotate probe sets with this attribute from the probes annotation file. Examples:
        'entrez_id' (Entrez gene ID)
        'gene_symbol' (approved gene symbol)
    If None, no annotation is added.
    If an iterable, multiple annotation columns are added.
    This must be a single string if agg_method is supplied, as it is the field used for aggregation.
    :param agg_method: This string specifies the method used to aggregate over probe sets, grouping by the ann_field
    column.
    Options are None, 'min', 'max', 'median', 'mean'. If None, no aggregation is carried out.
    :return: Tuple (expression, sample_meta). expression holds the donors' expression data side by side, preceded
    by the annotation column(s) if requested; sample_meta holds the donors' sample metadata stacked row-wise.
    :raises ValueError: If agg_method is supplied but ann_field is not a single string.
    """
    # Works on both Python 2 and 3. Testing for '__iter__' is not sufficient: on Python 3 str is iterable too, which
    # would cause every string to be rejected.
    try:
        string_types = (basestring,)
    except NameError:
        string_types = (str,)

    # sanity check inputs
    if agg_method is not None and not isinstance(ann_field, string_types):
        raise ValueError(
            "When agg_method is not None, ann_field must be a string.")

    DONOR_NUMBERS = [9861, 10021, 12876, 14380, 15496, 15697]

    # load probe library
    probe_fn = os.path.join(MICROARRAY_DIR, 'Probes.csv')
    probes = pd.read_csv(probe_fn, index_col=0)
    # keep only those probes with an Entrez ID
    probes = probes.dropna(axis=0, subset=['entrez_id'])

    struct_ids = get_structure_ids_by_parent(parent_struct_id)

    p_kwds = {
        'struct_ids': struct_ids if parent_struct_id else None,
        'mask_nonsig': mask_nonsig
    }

    expression_parts = []
    sample_meta_parts = []

    p = mp.Pool()
    try:
        # a list (rather than a dict) so that donors are always combined in the order of DONOR_NUMBERS
        jobs = [
            (dn, p.apply_async(load_one_microarray_donor,
                               args=(dn, probes),
                               kwds=p_kwds))
            for dn in DONOR_NUMBERS
        ]
        p.close()

        for dn, j in jobs:
            logger.info("Processing donor %d", dn)
            # a (huge) finite timeout instead of none at all - presumably so the wait remains interruptible
            expre, sampl = j.get(1e12)
            expression_parts.append(expre)
            sample_meta_parts.append(sampl)
            logger.info("Completed donor %d", dn)
    finally:
        # never leave worker processes behind, even if a donor fails to load
        p.terminate()
        p.join()

    # Combine once at the end rather than growing the dataframes inside the loop.
    # NB: DataFrame.append no longer exists in pandas >= 2.0; concat is the replacement. This assumes that the
    # per-donor sample metadata is a DataFrame.
    expression = pd.concat(expression_parts, axis=1)
    sample_meta = pd.concat(sample_meta_parts)

    if ann_field is not None:
        # prepend the annotation column(s) to the total expression dataframe
        ann_cols = list(ann_field) if isinstance(ann_field, tuple) else ann_field
        expression = pd.concat(
            [probes.loc[expression.index, ann_cols], expression], axis=1)

    if agg_method is not None:
        # aggregate by the annotation field
        expression = aggregate_by_probe_set(expression,
                                            method=agg_method,
                                            groupby=ann_field)

    return expression, sample_meta