Esempio n. 1
0
def load_xz_rnaseq(kind='cuff', yugene=True, gene_symbols=None):
    """
    Load RNA-Seq expression data for the XZ samples.

    :param kind: Which data source to use. 'cuff' calls the cufflinks gene count loader with unit='fpkm';
    'htseq' calls the gse83696 loader indexed by 'Approved Symbol'.
    :param yugene: If True, pass the loaded data through the YuGene transform.
    :param gene_symbols: Optional list of gene symbols. When supplied, the result is rebuilt with this list as its
    index and any entries that are missing are filled with zeros.
    :return: The loaded data, optionally transformed and re-indexed.
    :raises ValueError: If `kind` is neither 'cuff' nor 'htseq'.
    """
    # Reject an unknown source before calling any loader.
    if kind not in ('cuff', 'htseq'):
        raise ValueError("Unrecognised kind '%s'" % kind)

    if kind == 'cuff':
        expr = load_rnaseq_data.load_rnaseq_cufflinks_gene_count_data(unit='fpkm')
    else:
        expr = rnaseq_data.gse83696(index_by='Approved Symbol')

    # The post-processing below is the same for both sources.
    if yugene:
        expr = process.yugene_transform(expr)

    if gene_symbols is None:
        return expr

    expr = pd.DataFrame(data=expr, columns=expr.columns, index=gene_symbols)
    expr.fillna(0, inplace=True)
    return expr
def plot_clustermap(data, yugene=False, n_genes=N_GENES, yugene_resolve_ties=False, **kwargs):
    """
    Draw a clustermap restricted to the rows of `data` with the largest median absolute deviation (MAD).

    The columns are ordered by hierarchical clustering (average linkage, correlation metric) computed on the
    selected rows only.
    :param data: The data to plot. Rows are ranked by MAD; columns are clustered.
    :param yugene: If True, apply the YuGene transform to `data` before anything else.
    :param n_genes: The number of top-ranked rows to keep.
    :param yugene_resolve_ties: Forwarded to the YuGene transform as `resolve_ties`.
    :param kwargs: Forwarded to clustering.plot_clustermap. 'cmap' defaults to 'RdBu_r' if not given.
    :return: The object returned by clustering.plot_clustermap. Where it can be computed, a `row_index` attribute
    holding the selected row labels in dendrogram order is attached.
    """
    if yugene:
        data = process.yugene_transform(data, resolve_ties=yugene_resolve_ties)

    kwargs.setdefault('cmap', 'RdBu_r')

    # rank rows from most to least variable and keep the leading n_genes labels
    ranked = transformations.median_absolute_deviation(data, axis=1).sort_values(ascending=False)
    selected = ranked.iloc[:n_genes].index

    # cluster the columns using only the selected rows
    col_linkage = hierarchy.linkage(data.loc[selected].transpose(), method='average', metric='correlation')
    cg = clustering.plot_clustermap(data.loc[selected], col_linkage=col_linkage, **kwargs)

    # vertical column labels, with extra room at the bottom of the grid
    plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90)
    cg.gs.update(bottom=0.2)

    # Expose the row labels in plotted order for the caller's convenience.
    # This is deliberately best-effort: some kwargs may mean no row dendrogram is available, in which case the
    # attribute is simply not set.
    try:
        row_order = cg.dendrogram_row.reordered_ind
        cg.row_index = selected[row_order]
    except Exception:
        pass

    return cg
def plot_all_correlation_heatmaps(data, filestem, col_orders, **kwargs):
    """
    Plot and save correlation heatmaps for four versions of `data`.

    The versions are: unmodified, YuGene-transformed, log(x + 1), and YuGene applied to log(x + 1). Each figure is
    written as a PNG (dpi=200) and as a PDF.
    :param data: The input data; must support `.iloc` and elementwise arithmetic.
    :param filestem: Output path without extension. The suffixes '', '_yg', '_log' and '_log_yg' are added.
    :param col_orders: Mapping with keys 'counts', 'yg_counts', 'log_counts' and 'yg_log_counts'. Each value is a
    positional column selection passed to `.iloc` for the corresponding version.
    :param kwargs: Forwarded to plot_correlation_heatmap.
    """
    # (column-order key, transform, PNG filename template, PDF filename template)
    variants = (
        ('counts', lambda d: d, "%s.png", "%s.pdf"),
        ('yg_counts', lambda d: process.yugene_transform(d), "%s_yg.png", "%s_yg.pdf"),
        ('log_counts', lambda d: np.log(d + 1), "%s_log.png", "%s_log.pdf"),
        ('yg_log_counts', lambda d: process.yugene_transform(np.log(d + 1)), "%s_log_yg.png", "%s_log_yg.pdf"),
    )

    for order_key, transform, png_tmpl, pdf_tmpl in variants:
        transformed = transform(data)
        ax = plot_correlation_heatmap(transformed.iloc[:, col_orders[order_key]], **kwargs)
        ax.figure.savefig(png_tmpl % filestem, dpi=200)
        ax.figure.savefig(pdf_tmpl % filestem)
Esempio n. 4
0
def load_sb_rnaseq(yugene=True, gene_symbols=None):
    """
    Load RNA-Seq data for the SB samples (counted using featureCounts).

    The data are requested from the loader in FPKM units, annotated by 'Approved Symbol'.
    :param yugene: If True, pass the loaded data through the YuGene transform.
    :param gene_symbols: Optional list of gene symbols. When supplied, the result is rebuilt with this list as its
    index and any entries that are missing are filled with zeros.
    :return: The loaded data, optionally transformed and re-indexed.
    """
    expr = rnaseq_data.mb_zhao_cultures(units='fpkm', annotate_by='Approved Symbol')

    if yugene:
        expr = process.yugene_transform(expr)

    if gene_symbols is None:
        return expr

    expr = pd.DataFrame(data=expr, columns=expr.columns, index=gene_symbols)
    expr.fillna(0, inplace=True)
    return expr
Esempio n. 5
0
def load_xiaonan_microarray(yugene=True, gene_symbols=None, sample_names=None):
    """
    Load the Xiao-Nan microarray data (GSE28192 loader).

    The loader is asked to aggregate by 'SYMBOL' using the 'max' method, with log2=True.
    :param yugene: If True, pass the loaded data through the YuGene transform.
    :param gene_symbols: Optional list of gene symbols. When supplied, the result is rebuilt with this list as its
    index and any entries that are missing are filled with zeros.
    :param sample_names: Forwarded unchanged to the loader.
    :return: Tuple (data, meta). `meta` is returned exactly as the loader supplied it.
    """
    loader_opts = dict(
        aggr_field='SYMBOL',
        aggr_method='max',
        log2=True,
        sample_names=sample_names,
    )
    expr, meta = microarray_data.load_annotated_gse28192(**loader_opts)

    if yugene:
        expr = process.yugene_transform(expr)

    if gene_symbols is None:
        return expr, meta

    expr = pd.DataFrame(data=expr, columns=expr.columns, index=gene_symbols)
    expr.fillna(0, inplace=True)
    return expr, meta
def plot_correlation_heatmap(data, yugene=False, n_genes=None, **kwargs):
    """
    Plot the matrix returned by `data.corr()` as a heatmap on a new 8 x 8 figure.

    :param data: The input data; must support `.corr()` and `.loc`.
    :param yugene: If True, apply the YuGene transform to `data` first.
    :param n_genes: If given, only the `n_genes` rows with the largest median absolute deviation are used.
    :param kwargs: Forwarded to seaborn's heatmap. Defaults applied when absent: cmap='Reds', vmin=0., vmax=1.
    :return: The axes object returned by seaborn's heatmap.
    """
    if yugene:
        data = process.yugene_transform(data)

    # plotting defaults; anything the caller passed explicitly wins
    for option, default in (('cmap', 'Reds'), ('vmin', 0.), ('vmax', 1.)):
        kwargs.setdefault(option, default)

    if n_genes is not None:
        # keep only the most variable rows
        ranked = process.median_absolute_deviation(data, axis=1).sort_values(ascending=False)
        keep = ranked.iloc[:n_genes].index
        data = data.loc[keep]

    canvas = plt.figure(figsize=(8, 8))
    axes = canvas.add_subplot(111)
    axes = sns.heatmap(data.corr(), ax=axes, **kwargs)

    # vertical labels on the x axis, horizontal labels on the y axis
    for axis, angle in ((axes.xaxis, 90), (axes.yaxis, 0)):
        plt.setp(axis.get_ticklabels(), rotation=angle)
    plt.tight_layout()

    return axes
Esempio n. 7
0
    # Fragment (enclosing function header is outside this view).
    # Loads two microarray datasets, restricts both to their shared index, concatenates them column-wise and applies
    # the YuGene transform to the combined table.
    mo_he = microarray_data.load_annotated_microarray_gse54650()  # indexed by Entrez gene ID

    # load mouse MB data
    # NOTE(review): chd7 is not used anywhere in the visible portion of this fragment
    mo_mb, chd7 = microarray_data.load_annotated_microarray_sb_data() # indexed by Entrez gene ID

    # reduce to common genes (index labels present in both datasets)
    common_genes = mo_he.index.intersection(mo_mb.index)

    mo_mb = mo_mb.loc[common_genes]
    mo_he = mo_he.loc[common_genes]

    # combine: columns of mo_he first, then columns of mo_mb
    mo_all = pd.concat((mo_he, mo_mb), axis=1)

    # YuGene, applied to the combined table
    yg_mo_all = process.yugene_transform(mo_all)

    # Disabled alternative: apply the YuGene transform to each dataset separately
    # yg_mo_he = process.yugene_transform(mo_he)

    # apply YuGene
    # yg_mb = process.yugene_transform(mo_mb)

    # yg_mb = yg_mb.loc[common_genes]
    # yg_mo_he = yg_mo_he.loc[common_genes]

    # plot correlation using ALL matching genes
    # Currently switched off; swap the two `if` lines below to re-enable.
    # NOTE(review): fig_kwargs is not defined in the visible portion - presumably set earlier in the function.
    # if True:
    if False:
        corr.plot_correlation_coefficient_array(yg_mo_all, vmin=0.8, fig_kwargs=fig_kwargs)
        plt.tight_layout()
Esempio n. 8
0
    # Fragment (enclosing function header is outside this view).
    # NOTE(review): the meaning of 'XV' is not established by the visible code - confirm with the author.
    k = 10  # XV

    # it's useful to maintain a list of known upregulated genes
    # consts.NANOSTRING_GENES is iterated as (group, list of genes) pairs; all groups are included here
    nano_genes = []
    for grp, arr in consts.NANOSTRING_GENES:
        nano_genes.extend(arr)
    # swap 'EGFL11' for 'EYS' - presumably an alias for the same gene; TODO confirm
    # (list.remove raises ValueError if 'EGFL11' is absent)
    nano_genes.remove('EGFL11')
    nano_genes.append('EYS')

    # load Ncott data (285 non-WNT MB samples)

    ncott, ncott_meta = microarray_data.load_annotated_microarray_gse37382(
        aggr_field='SYMBOL', aggr_method='max')
    # order the samples by subgroup label, then apply the same order to the data columns before the YuGene transform
    sort_idx = ncott_meta.subgroup.sort_values().index
    ncott_meta = ncott_meta.loc[sort_idx]
    ncott = process.yugene_transform(ncott.loc[:, sort_idx])

    # X = ncott.copy()
    # m = ncott_meta.copy()

    # Disabled: load the Allen healthy cerebellum reference and merge it with the Ncott data over common genes

    # load Allen (healthy cerebellum)

    # he, he_meta = allen_human_brain_atlas.cerebellum_microarray_reference_data(agg_field='gene_symbol', agg_method='max')

    # combine

    # common_genes = ncott.index.intersection(he.index)
    # res = pd.DataFrame(index=common_genes, columns=ncott.columns.union(he.columns))
    # res.loc[common_genes, he.columns] = he.loc[common_genes].values
    # res.loc[common_genes, ncott.columns] = ncott.loc[common_genes].values
    # res = res.astype(float)
Esempio n. 9
0
    # Fragment (enclosing function header is outside this view).
    # load Robinson dataset
    robi, robi_meta = microarray_data.load_annotated_microarray_gse37418(
        aggr_field='SYMBOL', aggr_method='max')
    # drop samples whose subgroup is 'U' or 'SHH OUTLIER'
    robi_meta = robi_meta.loc[~robi_meta.subgroup.isin(['U', 'SHH OUTLIER'])]
    # order the remaining samples by subgroup label and apply the same order to the data columns
    sort_idx = robi_meta.subgroup.sort_values().index
    robi_meta = robi_meta.loc[sort_idx]
    robi = robi.loc[:, sort_idx]
    # relabel 'G3' -> 'Group 3' and 'G4' -> 'Group 4'
    # NOTE(review): the first call is Series.str.replace (substring replacement) but the chained call is
    # Series.replace, which only replaces values that are exactly equal to 'G4'. Confirm this asymmetry is intended.
    robi_meta.loc[:, 'subgroup'] = robi_meta.subgroup.str.replace(
        'G3', 'Group 3').replace('G4', 'Group 4')

    X = robi.copy()
    m = robi_meta.copy()

    # YuGene
    X = process.yugene_transform(X)

    # the samples must be represented in ROWS
    X = X.transpose()

    # hierarchical clustering of the samples (average linkage, correlation metric), using all columns of X
    Z = linkage(X, method='average', metric='correlation')
    # fig, ax, subax, gs = dendro_heatmap(Z, m.subgroup)
    # check the cophenetic distance: the correlation between ACTUAL pairwise distances between samples and the
    # distances according to the hierarchical clustering
    # (disabled; note the print statement below uses Python 2 syntax)
    # c, coph_dists = cophenet(Z, pdist(X))
    # print "Correlation coeff between actual pdist and hierarchical pdist is %.2f" % c

    # pick top high stdev genes: after the transpose, std over axis=0 is computed per column (gene)
    s = X.std(axis=0).sort_values(ascending=False)
    idx = s.index[:1500]
Esempio n. 10
0
            'Actb',
            # 'Gapdh', # missing
            # 'Rpl13a', # missing
            'B2m',
            'Hmbs',
            'Pgk1',
            'Hsp90ab1',
            'Hprt',
        ]))

    # standardise: for every row, subtract the row mean of mo_he_sym and divide by the row standard deviation of
    # mo_he_sym, i.e. mo_he_sym acts as the reference for centring and scaling all of mo_all_sym
    mo_all_sym_n = mo_all_sym.subtract(mo_he_sym.mean(axis=1),
                                       axis=0).divide(mo_he_sym.std(axis=1),
                                                      axis=0)
    # disabled alternative: standardise using the statistics of mo_all_sym itself
    # mo_all_sym_n = mo_all_sym.subtract(mo_all_sym.mean(axis=1), axis=0).divide(mo_all_sym.std(axis=1), axis=0)
    # YuGene transform of the unstandardised data
    mo_all_sym_yg = process.yugene_transform(mo_all_sym)

    # distribution of all intensities
    if SAVE_PLOTS:
        # hist of intensities: two stacked panels sharing the x axis, 100 bins each, over all values in each table
        fig, axs = plt.subplots(2, 1, sharex=True)
        axs[0].hist(mo_mb.values.flatten(), 100)
        axs[0].set_title("Microarray intensities, SB")
        axs[1].hist(mo_he.values.flatten(), 100)
        axs[1].set_title("Microarray intensities, cerebellum")
        plt.tight_layout()
        # save both a raster and a vector copy
        fig.savefig(os.path.join(OUTDIR, "marr_sb-circad_hist_intensity.png"),
                    dpi=200)
        fig.savefig(os.path.join(OUTDIR, "marr_sb-circad_hist_intensity.pdf"))

    if SAVE_PLOTS:
Esempio n. 11
0
    # Fragment (enclosing function header is outside this view).
    # Collect the genes from every group of consts.NANOSTRING_GENES except 'WNT'
    nano_genes = []
    for grp, arr in consts.NANOSTRING_GENES:
        if grp != 'WNT':
            nano_genes.extend(arr)
    # swap 'EGFL11' for 'EYS' - presumably an alias for the same gene; TODO confirm
    # (list.remove raises ValueError if 'EGFL11' is absent)
    nano_genes.remove('EGFL11')
    nano_genes.append('EYS')

    # load Ncott data (285 non-WNT MB samples)
    ncott, ncott_meta = microarray_data.load_annotated_microarray_gse37382(
        aggr_field='SYMBOL',
        aggr_method='max'
    )
    # order samples by subgroup label; apply the same order to the data columns, then YuGene transform
    sort_idx = ncott_meta.subgroup.sort_values().index
    ncott_meta = ncott_meta.loc[sort_idx]
    ncott = ncott.loc[:, sort_idx]
    ncott = process.yugene_transform(ncott)

    # load Allen (healthy cerebellum)

    he, he_meta = allen_human_brain_atlas.cerebellum_microarray_reference_data(agg_field='gene_symbol', agg_method='max')
    # label every reference sample as a control
    he_meta.loc[:, 'subgroup'] = 'control'

    # load Kool dataset
    kool, kool_meta = microarray_data.load_annotated_microarray_gse10327(
        aggr_field='SYMBOL',
        aggr_method='max',
    )
    # order samples by subgroup label and apply the same order to the data columns
    sort_idx = kool_meta.subgroup.sort_values().index
    kool_meta = kool_meta.loc[sort_idx]
    kool = kool.loc[:, sort_idx]
    kool_meta.loc[:, 'subgroup'] = (
        # Fragment: loop from inside a function whose header is outside this view (data, ax, renorm and
        # sample_groups come from the enclosing scope).
        # For each sample group, select the columns whose name matches the group's regex and draw one curve per
        # column: the column's values sorted ascending and divided by the column maximum, against an evenly spaced
        # x axis running from 0 to 1.
        for lbl, d in sample_groups.items():
            this_data = data.loc[:, data.columns.str.contains(d['regex'])]
            # only the first curve of each group gets a legend label, so each group appears once in the legend
            first = True
            for t in this_data.columns:
                col = this_data.loc[:, t].sort_values()
                col /= col.max()
                if renorm:
                    # keep strictly positive values only (applied after the division by the maximum)
                    col = col.loc[col > 0]
                x = np.linspace(0, 1, col.size)
                plt_lbl = None
                if first:
                    plt_lbl = lbl
                    first = False
                ax.plot(x, col.values, color=d['colour'], label=plt_lbl)

    # Build three further versions of data_rr: YuGene, natural log(x + 1), and YuGene applied to the log version
    data_rr_yg = process.yugene_transform(data_rr)
    data_rr_log = np.log(data_rr + 1)
    data_rr_log_yg = process.yugene_transform(data_rr_log)

    # 2 x 2 grid with shared axes: top row raw / log, bottom row the YuGene version of each
    fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)

    dynamic_range_plot(data_rr, axs[0, 0])
    dynamic_range_plot(data_rr_log, axs[0, 1])
    dynamic_range_plot(data_rr_yg, axs[1, 0])
    dynamic_range_plot(data_rr_log_yg, axs[1, 1])
    # x axis is shared, so setting the limits on one panel is sufficient
    axs[0, 0].set_xlim(0., 1.)
    # label only the outer edges of the grid
    axs[0, 0].set_ylabel('Expression level (normalised)')
    axs[1, 0].set_ylabel('Expression level (normalised)')
    axs[1, 0].set_xlabel('Non-zero percentile')
    axs[1, 1].set_xlabel('Non-zero percentile')
Esempio n. 13
0
    # Fragment (enclosing function header is outside this view).
    # Load the ribozero samples, annotated by Ensembl gene ID
    obj = rnaseq_data.gbm_ribozero_samples_loader(annotate_by='Ensembl Gene ID')

    # keep rows whose index label contains 'ENSG', then drop rows where every value is zero/falsy
    data = obj.data.loc[obj.data.index.str.contains('ENSG')]
    data = data.loc[data.any(axis=1), :]
    data_n = norm(data)

    # load relevant poly(A) data, too

    obj_polya = rnaseq_data.gbm_paired_samples_loader(annotate_by='Ensembl Gene ID', source='star')
    # keep only the columns whose name contains 'GBM'
    data_polya = obj_polya.data.loc[:, obj_polya.data.columns.str.contains('GBM')]

    # combine the unfiltered ribozero table (obj.data, not `data`) with the poly(A) columns
    data_all = pd.concat((obj.data, data_polya), axis=1)
    data_all = extract_present_genes(data_all)
    data_all_n = norm(data_all)
    data_all_yg = process.yugene_transform(data_all, resolve_ties=False)
    # NOTE(review): np.log2 yields -inf wherever data_all_n is zero - confirm that norm() / extract_present_genes()
    # rule this out, or that the YuGene transform tolerates it
    data_all_log_yg = process.yugene_transform(np.log2(data_all_n), resolve_ties=False)


    # number assigned bar chart

    assgn = get_assigned_proportions(obj.data)
    # add the single poly(A) entry labelled 'GBM031' to the ribozero table
    assgn_polya = get_assigned_proportions(obj_polya.data).loc['GBM031']
    # NOTE(review): the .append method of DataFrame/Series was removed in pandas 2.0; this line requires an older
    # pandas (pd.concat is the replacement)
    assgn = assgn.append(assgn_polya)

    ax = assgn.plot.bar()
    # shrink the axes to leave room on the right for the legend placed outside the plot area
    ax.set_position([0.05, 0.17, 0.7, 0.8])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.setp(ax.xaxis.get_ticklabels(), rotation=45)
    ax.set_ylabel('# reads')