def load_xz_rnaseq(kind='cuff', yugene=True, gene_symbols=None):
    """
    Fetch the XZ RNA-Seq expression table from one of two upstream loaders.

    :param kind: Which source to read: 'cuff' (the Cufflinks gene count loader, called with unit='fpkm') or
    'htseq' (the GSE83696 loader, indexed by 'Approved Symbol').
    :param yugene: When True, the loaded table is passed through the YuGene transform.
    :param gene_symbols: Optional list of gene symbols used as the row index of the result. Symbols that are
    absent from the loaded table become rows of zeros.
    :return: The loaded (and optionally transformed / re-indexed) table.
    :raises ValueError: If ``kind`` is neither 'cuff' nor 'htseq'.
    """
    if kind == 'cuff':
        expr = load_rnaseq_data.load_rnaseq_cufflinks_gene_count_data(unit='fpkm')
    elif kind == 'htseq':
        expr = rnaseq_data.gse83696(index_by='Approved Symbol')
    else:
        raise ValueError("Unrecognised kind '%s'" % kind)

    # From here on the handling is the same whichever source was used.
    if yugene:
        expr = process.yugene_transform(expr)

    if gene_symbols is None:
        return expr

    # Rebuilding the frame against the requested index leaves NaN for unknown symbols; those are zeroed.
    expr = pd.DataFrame(data=expr, columns=expr.columns, index=gene_symbols)
    expr.fillna(0, inplace=True)
    return expr
def plot_clustermap(data, yugene=False, n_genes=N_GENES, yugene_resolve_ties=False, **kwargs):
    """
    Draw a clustermap of the rows of ``data`` with the largest median absolute deviation (MAD).

    Columns are ordered by average-linkage hierarchical clustering with a correlation metric, computed on the
    selected rows only.

    :param data: Table whose rows are ranked by MAD (computed along axis 1) and whose columns are clustered.
    :param yugene: When True, apply the YuGene transform to ``data`` first.
    :param n_genes: Number of top-MAD rows to keep.
    :param yugene_resolve_ties: Forwarded as ``resolve_ties`` to the YuGene transform.
    :param kwargs: Forwarded to ``clustering.plot_clustermap``. 'cmap' defaults to 'RdBu_r' if not supplied.
    :return: The object returned by ``clustering.plot_clustermap``. When the row ordering can be read from it,
    a ``row_index`` attribute holding the reordered row labels is attached.
    """
    if yugene:
        data = process.yugene_transform(data, resolve_ties=yugene_resolve_ties)

    kwargs.setdefault('cmap', 'RdBu_r')

    # Rank rows by variability and keep the labels of the most variable ones.
    mad_ranked = transformations.median_absolute_deviation(data, axis=1).sort_values(ascending=False)
    top_mad = mad_ranked.iloc[:n_genes].index
    selected = data.loc[top_mad]

    # linkage() clusters rows, so transpose to cluster the columns.
    col_link = hierarchy.linkage(selected.transpose(), method='average', metric='correlation')
    cg = clustering.plot_clustermap(selected, col_linkage=col_link, **kwargs)

    plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90)
    cg.gs.update(bottom=0.2)

    # Expose the plotted row order for callers. Some kwargs may mean that no row dendrogram is available
    # (unconfirmed), so this is deliberately best-effort and any failure is ignored.
    try:
        cg.row_index = top_mad[cg.dendrogram_row.reordered_ind]
    except Exception:
        pass

    return cg
def plot_all_correlation_heatmaps(data, filestem, col_orders, **kwargs):
    """
    Save correlation heatmaps of ``data`` under four scalings, each written as a PNG (200 dpi) and a PDF.

    The variants are produced in this order: the data as supplied, YuGene-transformed, log(x + 1)-transformed,
    and YuGene applied to the log(x + 1)-transformed data.

    :param data: Input table; it is transformed as a whole and then indexed positionally by column.
    :param filestem: Output path without extension. The four variants are written to '<filestem>',
    '<filestem>_yg', '<filestem>_log' and '<filestem>_log_yg', each with '.png' and '.pdf' appended.
    :param col_orders: Mapping with the keys 'counts', 'yg_counts', 'log_counts' and 'yg_log_counts'. Each value
    is a positional column indexer giving the column order used for that variant.
    :param kwargs: Forwarded to ``plot_correlation_heatmap``.
    :return: None
    """
    # (png name format, pdf name format, key into col_orders, scaling applied to the full table)
    # The scalings are lambdas so that each one is only evaluated when its variant is reached.
    variants = (
        ("%s.png", "%s.pdf", 'counts', lambda d: d),
        ("%s_yg.png", "%s_yg.pdf", 'yg_counts', lambda d: process.yugene_transform(d)),
        ("%s_log.png", "%s_log.pdf", 'log_counts', lambda d: np.log(d + 1)),
        ("%s_log_yg.png", "%s_log_yg.pdf", 'yg_log_counts', lambda d: process.yugene_transform(np.log(d + 1))),
    )

    for png_fmt, pdf_fmt, order_key, rescale in variants:
        scaled = rescale(data)
        heatmap_ax = plot_correlation_heatmap(scaled.iloc[:, col_orders[order_key]], **kwargs)
        heatmap_ax.figure.savefig(png_fmt % filestem, dpi=200)
        heatmap_ax.figure.savefig(pdf_fmt % filestem)
def load_sb_rnaseq(yugene=True, gene_symbols=None):
    """
    Fetch the SB RNA-Seq table (counted using featureCounts), loaded with units='fpkm' and annotated by
    'Approved Symbol'.

    :param yugene: When True, the loaded table is passed through the YuGene transform.
    :param gene_symbols: Optional list of gene symbols used as the row index of the result. Symbols that are
    absent from the loaded table become rows of zeros.
    :return: The loaded (and optionally transformed / re-indexed) table.
    """
    expr = rnaseq_data.mb_zhao_cultures(units='fpkm', annotate_by='Approved Symbol')

    if yugene:
        expr = process.yugene_transform(expr)

    if gene_symbols is None:
        return expr

    # Rebuilding the frame against the requested index leaves NaN for unknown symbols; those are zeroed.
    expr = pd.DataFrame(data=expr, columns=expr.columns, index=gene_symbols)
    expr.fillna(0, inplace=True)
    return expr
def load_xiaonan_microarray(yugene=True, gene_symbols=None, sample_names=None):
    """
    Fetch the Xiao-Nan microarray data (GSE28192 loader), aggregated by 'SYMBOL' using the 'max' method with
    log2=True.

    :param yugene: When True, the expression table is passed through the YuGene transform.
    :param gene_symbols: Optional list of gene symbols used as the row index of the expression table. Symbols
    that are absent from the loaded table become rows of zeros.
    :param sample_names: Forwarded unchanged to the loader.
    :return: Tuple ``(X, meta)``: the expression table and the metadata returned by the loader. The metadata is
    passed through untouched.
    """
    expr, meta = microarray_data.load_annotated_gse28192(
        aggr_field='SYMBOL',
        aggr_method='max',
        log2=True,
        sample_names=sample_names
    )

    if yugene:
        expr = process.yugene_transform(expr)

    if gene_symbols is None:
        return expr, meta

    # Rebuilding the frame against the requested index leaves NaN for unknown symbols; those are zeroed.
    expr = pd.DataFrame(data=expr, columns=expr.columns, index=gene_symbols)
    expr.fillna(0, inplace=True)
    return expr, meta
def plot_correlation_heatmap(data, yugene=False, n_genes=None, **kwargs):
    """
    Plot a heatmap of the pairwise correlation between the columns of ``data`` on a new 8 x 8 figure.

    :param data: Table whose columns are correlated with one another (via ``data.corr()``).
    :param yugene: When True, apply the YuGene transform to ``data`` first.
    :param n_genes: If given, only the ``n_genes`` rows with the largest median absolute deviation (computed
    along axis 1) are used for the correlation. If None, all rows are used.
    :param kwargs: Forwarded to ``sns.heatmap``. Defaults applied when absent: cmap='Reds', vmin=0., vmax=1.
    :return: The axes returned by ``sns.heatmap``.
    """
    if yugene:
        data = process.yugene_transform(data)

    for option, fallback in (('cmap', 'Reds'), ('vmin', 0.), ('vmax', 1.)):
        kwargs.setdefault(option, fallback)

    if n_genes is not None:
        # Keep only the most variable rows.
        mad_ranked = process.median_absolute_deviation(data, axis=1).sort_values(ascending=False)
        data = data.loc[mad_ranked.iloc[:n_genes].index]

    canvas = plt.figure(figsize=(8, 8))
    heatmap_ax = sns.heatmap(data.corr(), ax=canvas.add_subplot(111), **kwargs)

    # Vertical x labels and horizontal y labels.
    plt.setp(heatmap_ax.xaxis.get_ticklabels(), rotation=90)
    plt.setp(heatmap_ax.yaxis.get_ticklabels(), rotation=0)
    plt.tight_layout()

    return heatmap_ax
mo_he = microarray_data.load_annotated_microarray_gse54650() # indexed by Entrez gene ID # load mouse MB data mo_mb, chd7 = microarray_data.load_annotated_microarray_sb_data() # indexed by Entrez gene ID # reduce to common genes common_genes = mo_he.index.intersection(mo_mb.index) mo_mb = mo_mb.loc[common_genes] mo_he = mo_he.loc[common_genes] # combine mo_all = pd.concat((mo_he, mo_mb), axis=1) # YuGene yg_mo_all = process.yugene_transform(mo_all) # apply YuGene transform # yg_mo_he = process.yugene_transform(mo_he) # apply YuGene # yg_mb = process.yugene_transform(mo_mb) # yg_mb = yg_mb.loc[common_genes] # yg_mo_he = yg_mo_he.loc[common_genes] # plot correlation using ALL matching genes # if True: if False: corr.plot_correlation_coefficient_array(yg_mo_all, vmin=0.8, fig_kwargs=fig_kwargs) plt.tight_layout()
k = 10 # XV # it's useful to maintain a list of known upregulated genes nano_genes = [] for grp, arr in consts.NANOSTRING_GENES: nano_genes.extend(arr) nano_genes.remove('EGFL11') nano_genes.append('EYS') # load Ncott data (285 non-WNT MB samples) ncott, ncott_meta = microarray_data.load_annotated_microarray_gse37382( aggr_field='SYMBOL', aggr_method='max') sort_idx = ncott_meta.subgroup.sort_values().index ncott_meta = ncott_meta.loc[sort_idx] ncott = process.yugene_transform(ncott.loc[:, sort_idx]) # X = ncott.copy() # m = ncott_meta.copy() # load Allen (healthy cerebellum) # he, he_meta = allen_human_brain_atlas.cerebellum_microarray_reference_data(agg_field='gene_symbol', agg_method='max') # combine # common_genes = ncott.index.intersection(he.index) # res = pd.DataFrame(index=common_genes, columns=ncott.columns.union(he.columns)) # res.loc[common_genes, he.columns] = he.loc[common_genes].values # res.loc[common_genes, ncott.columns] = ncott.loc[common_genes].values # res = res.astype(float)
# load Robinson dataset robi, robi_meta = microarray_data.load_annotated_microarray_gse37418( aggr_field='SYMBOL', aggr_method='max') robi_meta = robi_meta.loc[~robi_meta.subgroup.isin(['U', 'SHH OUTLIER'])] sort_idx = robi_meta.subgroup.sort_values().index robi_meta = robi_meta.loc[sort_idx] robi = robi.loc[:, sort_idx] robi_meta.loc[:, 'subgroup'] = robi_meta.subgroup.str.replace( 'G3', 'Group 3').replace('G4', 'Group 4') X = robi.copy() m = robi_meta.copy() # YuGene X = process.yugene_transform(X) # the samples must be represented in ROWS X = X.transpose() Z = linkage(X, method='average', metric='correlation') # fig, ax, subax, gs = dendro_heatmap(Z, m.subgroup) # check the cophenetic distance: the sorrelation between ACTUAL pairwise distances between samples and the # distances according to the hierarchical clustering # c, coph_dists = cophenet(Z, pdist(X)) # print "Correlation coeff between actual pdist and hierarchical pdist is %.2f" % c # pick top high stdev genes s = X.std(axis=0).sort_values(ascending=False) idx = s.index[:1500]
'Actb', # 'Gapdh', # missing # 'Rpl13a', # missing 'B2m', 'Hmbs', 'Pgk1', 'Hsp90ab1', 'Hprt', ])) # standardise mo_all_sym_n = mo_all_sym.subtract(mo_he_sym.mean(axis=1), axis=0).divide(mo_he_sym.std(axis=1), axis=0) # mo_all_sym_n = mo_all_sym.subtract(mo_all_sym.mean(axis=1), axis=0).divide(mo_all_sym.std(axis=1), axis=0) mo_all_sym_yg = process.yugene_transform(mo_all_sym) # distribution of all intensities if SAVE_PLOTS: # hist of intensities fig, axs = plt.subplots(2, 1, sharex=True) axs[0].hist(mo_mb.values.flatten(), 100) axs[0].set_title("Microarray intensities, SB") axs[1].hist(mo_he.values.flatten(), 100) axs[1].set_title("Microarray intensities, cerebellum") plt.tight_layout() fig.savefig(os.path.join(OUTDIR, "marr_sb-circad_hist_intensity.png"), dpi=200) fig.savefig(os.path.join(OUTDIR, "marr_sb-circad_hist_intensity.pdf")) if SAVE_PLOTS:
nano_genes = [] for grp, arr in consts.NANOSTRING_GENES: if grp != 'WNT': nano_genes.extend(arr) nano_genes.remove('EGFL11') nano_genes.append('EYS') # load Ncott data (285 non-WNT MB samples) ncott, ncott_meta = microarray_data.load_annotated_microarray_gse37382( aggr_field='SYMBOL', aggr_method='max' ) sort_idx = ncott_meta.subgroup.sort_values().index ncott_meta = ncott_meta.loc[sort_idx] ncott = ncott.loc[:, sort_idx] ncott = process.yugene_transform(ncott) # load Allen (healthy cerebellum) he, he_meta = allen_human_brain_atlas.cerebellum_microarray_reference_data(agg_field='gene_symbol', agg_method='max') he_meta.loc[:, 'subgroup'] = 'control' # load Kool dataset kool, kool_meta = microarray_data.load_annotated_microarray_gse10327( aggr_field='SYMBOL', aggr_method='max', ) sort_idx = kool_meta.subgroup.sort_values().index kool_meta = kool_meta.loc[sort_idx] kool = kool.loc[:, sort_idx] kool_meta.loc[:, 'subgroup'] = (
for lbl, d in sample_groups.items(): this_data = data.loc[:, data.columns.str.contains(d['regex'])] first = True for t in this_data.columns: col = this_data.loc[:, t].sort_values() col /= col.max() if renorm: col = col.loc[col > 0] x = np.linspace(0, 1, col.size) plt_lbl = None if first: plt_lbl = lbl first = False ax.plot(x, col.values, color=d['colour'], label=plt_lbl) data_rr_yg = process.yugene_transform(data_rr) data_rr_log = np.log(data_rr + 1) data_rr_log_yg = process.yugene_transform(data_rr_log) fig, axs = plt.subplots(2, 2, sharex=True, sharey=True) dynamic_range_plot(data_rr, axs[0, 0]) dynamic_range_plot(data_rr_log, axs[0, 1]) dynamic_range_plot(data_rr_yg, axs[1, 0]) dynamic_range_plot(data_rr_log_yg, axs[1, 1]) axs[0, 0].set_xlim(0., 1.) axs[0, 0].set_ylabel('Expression level (normalised)') axs[1, 0].set_ylabel('Expression level (normalised)') axs[1, 0].set_xlabel('Non-zero percentile') axs[1, 1].set_xlabel('Non-zero percentile')
# Load the ribozero samples, annotated by Ensembl gene ID.
obj = rnaseq_data.gbm_ribozero_samples_loader(annotate_by='Ensembl Gene ID')
# Keep only rows whose index label contains 'ENSG' ...
data = obj.data.loc[obj.data.index.str.contains('ENSG')]
# ... and, of those, only rows with at least one truthy (non-zero) entry.
data = data.loc[data.any(axis=1), :]
data_n = norm(data)

# load relevant poly(A) data, too
obj_polya = rnaseq_data.gbm_paired_samples_loader(annotate_by='Ensembl Gene ID', source='star')
# Only the columns whose name contains 'GBM' are kept.
data_polya = obj_polya.data.loc[:, obj_polya.data.columns.str.contains('GBM')]

# Combine column-wise. Note this starts from the unfiltered obj.data rather than `data`; row selection is
# then delegated to extract_present_genes.
data_all = pd.concat((obj.data, data_polya), axis=1)
data_all = extract_present_genes(data_all)
data_all_n = norm(data_all)
data_all_yg = process.yugene_transform(data_all, resolve_ties=False)
# NOTE(review): np.log2 gives -inf wherever data_all_n is zero - confirm norm() output is strictly positive.
data_all_log_yg = process.yugene_transform(np.log2(data_all_n), resolve_ties=False)

# number assigned bar chart
assgn = get_assigned_proportions(obj.data)
# From the poly(A) results only the 'GBM031' entry is added.
assgn_polya = get_assigned_proportions(obj_polya.data).loc['GBM031']
# NOTE(review): if `assgn` is a pandas object, .append() was removed in pandas 2.0 (pd.concat is the
# replacement) - confirm which pandas version this script targets.
assgn = assgn.append(assgn_polya)
ax = assgn.plot.bar()
# Shrink the axes so the legend fits to the right of the plot.
ax.set_position([0.05, 0.17, 0.7, 0.8])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.setp(ax.xaxis.get_ticklabels(), rotation=45)
ax.set_ylabel('# reads')