def cluster_data_with_threshold(data, min_val=None, n=None, mad=None, min_over=2, transform=None, **kwargs):
    """
    Filter, optionally transform and optionally subset the rows of `data`, then draw a clustermap.

    :param data: Table with rows to be filtered/ranked (indexed via `.loc`) and samples in columns.
    :param min_val: If supplied (together with `min_over`), only rows where strictly more than `min_over`
    entries exceed `min_val` are retained.
    :param n: If supplied, only the `n` rows with the highest MAD are plotted.
    :param mad: Optional pre-computed MAD values, indexed like the rows of `data`. Only used when `n` is set.
    It must contain every row that survives filtering; it may contain additional entries, which are ignored
    when picking the top `n` rows.
    :param min_over: Number of entries that must be exceeded (strict comparison) for a row to be kept.
    :param transform: Optional callable applied to the filtered data before ranking and plotting.
    :param kwargs: Passed to clustering.plot_clustermap. `cmap` and `metric` default to 'RdBu_r' and
    'correlation' but may be overridden by the caller.
    :return: (clustermap object, mad). `mad` is the MAD sorted in descending order when `n` was given; otherwise
    it is whatever was passed in (None by default).
    :raises AttributeError: If a pre-computed MAD does not cover all rows remaining after filtering.
    """
    if min_val is not None and min_over is not None:
        idx = (data > min_val).sum(axis=1) > min_over
        data = data.loc[idx]

    if transform is not None:
        data = transform(data)

    if n is not None:
        if mad is None:
            mad = transformations.median_absolute_deviation(data).sort_values(ascending=False)
        else:
            mad = mad.sort_values(ascending=False)
            if len(mad.index.intersection(data.index)) != data.shape[0]:
                raise AttributeError(
                    "If a pre-computed MAD is supplied, it must contain all required entries"
                )
        # A pre-computed MAD may include rows that were removed by the threshold filter above. Rank only the
        # rows that are still present so that the lookup below never requests missing labels.
        ranked = mad.index[mad.index.isin(data.index)]
        data = data.loc[ranked[:n]]

    # defaults only: an explicit cmap / metric from the caller takes precedence instead of raising a
    # duplicate keyword error
    kwargs.setdefault('cmap', 'RdBu_r')
    kwargs.setdefault('metric', 'correlation')
    cm = clustering.plot_clustermap(data, **kwargs)
    cm.gs.update(bottom=0.2)
    return cm, mad
def plot_clustermap(data, yugene=False, n_genes=N_GENES, yugene_resolve_ties=False, **kwargs):
    """
    Draw a clustermap of the rows of `data` with the highest median absolute deviation (MAD).

    :param data: Table with genes in rows and samples in columns.
    :param yugene: If True, `data` is first passed through process.yugene_transform.
    :param n_genes: Number of rows to retain, taken from the top of the MAD ranking.
    :param yugene_resolve_ties: Forwarded to process.yugene_transform as `resolve_ties`.
    :param kwargs: Forwarded to clustering.plot_clustermap; `cmap` defaults to 'RdBu_r'.
    :return: The object returned by clustering.plot_clustermap. Where a row dendrogram is available, it also
    carries a `row_index` attribute holding the plotted row labels in dendrogram order.
    """
    kwargs.setdefault('cmap', 'RdBu_r')

    if yugene:
        data = process.yugene_transform(data, resolve_ties=yugene_resolve_ties)

    # rank rows by MAD (largest first) and keep the labels of the leading n_genes
    mad_ranked = transformations.median_absolute_deviation(data, axis=1).sort_values(ascending=False)
    top_mad = mad_ranked.iloc[:n_genes].index
    subset = data.loc[top_mad]

    # column linkage is computed here, on the reduced data, and handed to the plotting routine
    col_lkg = hierarchy.linkage(subset.transpose(), method='average', metric='correlation')
    cg = clustering.plot_clustermap(subset, col_linkage=col_lkg, **kwargs)

    plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90)
    cg.gs.update(bottom=0.2)

    # Expose the row labels in plotted order. This is deliberately best-effort: some kwargs may mean that no
    # row dendrogram exists, in which case the attribute is simply not set.
    try:
        cg.row_index = top_mad[cg.dendrogram_row.reordered_ind]
    except Exception:
        pass

    return cg
# NOTE(review): `n_t` is used here before the loop below defines it, so this statement presumably closes an
# earlier loop whose header lies outside this excerpt - confirm against the full script.
ax.figure.savefig(
    os.path.join(outdir, "pca_top%d_by_mad_with_names.png" % n_t),
    dpi=200
)

# One colour per sample, defaulting to gray. The patterns are applied in the order listed, so a sample
# matching more than one pattern ends up with the colour of the last match.
row_colours = pd.DataFrame('gray', index=our_dat.columns, columns=[''])
for _pattern, _colour in (
    (r'eNSC[0-9]med', '#66c2a5'),
    (r'eNSC[0-9]mouse', '#fc8d62'),
    (r'mDura.[AN0-9]*mouse', '#8da0cb'),
    (r'mDura.[AN0-9]*human', '#e78ac3'),
):
    row_colours.loc[row_colours.index.str.contains(_pattern)] = _colour

# For each candidate gene count: dendrogram and clustermap on the top n_t entries of `mad`
for n_t in n_gene_try:
    fname = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t

    d = clustering.dendrogram_with_colours(
        our_dat.loc[mad.index[:n_t]],
        row_colours,
        fig_kws={'figsize': (10, 5.5)}
    )
    d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)

    cm = clustering.plot_clustermap(
        our_dat.loc[mad.index[:n_t]],
        cmap='RdBu_r',
        metric='correlation',
        col_colors=row_colours
    )
    cm.gs.update(bottom=0.2)
    cm.savefig(
        os.path.join(outdir, "clustermap_by_gene_corr_log_top%d_by_mad.png" % n_t),
        dpi=200
    )
# reduce to significant and relevant keep_genes = cor_gene.index[(cor_gene.abs() > cross_corr_threshold) & (pval_gene < alpha)] # remove MYC itself when reporting print "Having aggregated these by gene, %d are correlated with %s" % (len( keep_genes.drop(myc_gene)), myc_gene) # cluster using this representation of the data # force the order of the columns to match the correlation with MYC keep_genes = cor_gene.loc[keep_genes].sort_values(ascending=False).index cg = clustering.plot_clustermap(dat_corr_with_myc_aggr.loc[keep_genes], cmap='RdBu_r', metric='euclidean', method='ward', row_cluster=False, vmin=-4.5, vmax=4.5) cg.gs.update(bottom=0.1) jennie_list = [ 'MYC', 'CXCL2', 'CXCL1', 'TNFAIP3', 'IL8', 'C17orf47', 'RAET1L', 'TEX14', 'SERPINE1',
header=None).squeeze().str.decode('utf-8') ssgsea.index = ssgsea_pathway_names.reindex( ssgsea.index.str.replace('_', ' ')) # heatmap: proportions for each patient # standardise across columns, because each cell type has different mean proportion rl = hc.linkage(xcell_prop.astype(float).transpose(), method='average', metric='euclidean') cg = clustering.plot_clustermap( xcell_prop.astype(float).transpose(), metric='euclidean', show_gene_labels=False, show_gene_clustering=True, cmap='YlOrRd', row_linkage=rl, z_score=1, vmin=-1.5, vmax=6., ) cg.gs.update(left=0.03, bottom=0.22, right=0.9) c_labels = [''] * len(cg.cax.get_yticks()) c_labels[0] = 'Low' c_labels[-1] = 'High' cg.cax.set_yticklabels(c_labels) cg.cax.set_ylabel( 'Normalised proportion', labelpad=-70) # bit hacky, but this places the label correctly cg.savefig(os.path.join(outdir, "cell_proportion_cluster_by_patient.png"), dpi=200)
# relabel the FFPE samples idx = obj.meta.index.tolist() for k, v in hgic_consts.NH_ID_TO_PATIENT_ID_MAP.items(): for i, t in enumerate(idx): if k.replace('-', '_') in t: idx[i] = "FFPE GBM%s" % v obj.meta.index = idx obj.data.columns = idx cpm = obj.data.divide(obj.data.sum(), axis=1) lcpm = np.log2((obj.data + 1).divide((obj.data + 1).sum(), axis=1)) mad = transformations.median_absolute_deviation(lcpm).sort_values( ascending=False) cg = clustering.plot_clustermap(lcpm.loc[mad.index[:3000]], cmap='RdBu_r') cg.gs.update(bottom=0.15) cg.savefig(os.path.join(outdir, "cluster_by_top_3000_genes.png"), dpi=200) # load Verhaak signatures # manual amendments: # Classical. C14orf159 -> DGLUCY, KIAA0494 -> EFCAB14, LHFP -> LHFPL6 # Proneural. HN1 -> JPT1, PAK7 -> PAK5, ZNF643 -> ZFP69B cl = [ 'PTPRA', 'ELOVL2', 'MLC1', 'SOX9', 'ARNTL', 'DENND2A', 'BBS1', 'ABLIM1', 'PAX6', 'ZHX3', 'USP8', 'PLCG1', 'CDH4', 'RASGRP1', 'ACSBG1', 'CST3', 'BCKDHB', 'LHFPL6', 'VAV3', 'ACSL3', 'EYA2', 'SEPT11', 'SLC4A4', 'SLC20A2', 'DGLUCY', 'CTNND1', 'ZFHX4', 'SPRY2', 'ZNF45', 'NCOA1', 'PLCE1', 'DTNA', 'POLRMT', 'SALL1', 'TYK2', 'TJP1', 'MEOX2', 'FGFR3', 'STXBP3', 'GRIK1', 'GATM', 'UPF1', 'NPEPL1', 'EFCAB14',
# NOTE(review): the next three statements depend on `pid`, presumably the variable of an enclosing loop over
# patient IDs whose header lies outside this excerpt - confirm against the full script.
# Aggregate all samples whose column name contains "GBM<pid>" into a single mean column.
the_data = rnaseq_obj.data.loc[:, rnaseq_obj.data.columns.str.contains("GBM%s" % pid)]
the_aggr = the_data.mean(axis=1)
dat_gbm_aggr.loc[:, pid] = the_aggr

# select genes
g = set(venn_set['111111'])
# explicit lists: dict views cannot be concatenated with `+` on Python 3
for x in list(sets_full.values()) + list(sets_partial.values()):
    for k in x:
        g.update(venn_set[k])

# .loc requires an ordered, list-like indexer; a set is rejected by current pandas
the_data = dat_gbm_aggr.loc[list(g)]

# remove any rows that have no variation
the_data = the_data.loc[~(the_data.diff(axis=1).iloc[:, 1:] == 0).all(axis=1)]
the_data = np.log2(the_data + 1)

cg = clustering.plot_clustermap(the_data, cmap='RdBu_r', vmax=12., figsize=(3.6, 8))
cg.gs.update(bottom=0.1)
cg.savefig(os.path.join(outdir, "clustermap_gbm_by_subgroup_gene_sets.png"), dpi=200)
cg.savefig(os.path.join(outdir, "clustermap_gbm_by_subgroup_gene_sets.tiff"), dpi=200)
def plot_clustermap(obj, quantile_norm, method='average', metric='correlation', n_gene_by_mad=5000, n_gene_for_heatmap=500, fmin=0.05, fmax=0.95, eps=0.01, cell_line_colours=None):
    """
    Draw a clustermap of log2-transformed data with column colour bars for cell type and study.

    :param obj: Data container exposing `.data` (genes x samples) and `.meta` (with `type` and `batch` columns).
    :param quantile_norm: If not None, passed as `method` to transformations.quantile_normalisation.
    :param method: Linkage method for the column clustering.
    :param metric: Distance metric for the column clustering.
    :param n_gene_by_mad: Number of top-MAD genes used to compute the column linkage.
    :param n_gene_for_heatmap: Number of top-MAD genes shown in the heatmap.
    :param fmin: Fraction of the sorted heatmap values that sets the lower colour limit (0.5 is then subtracted).
    :param fmax: Fraction of the sorted heatmap values that sets the upper colour limit (0.5 is then added).
    :param eps: Offset added before taking log2.
    :param cell_line_colours: Optional dict mapping cell type label to colour. It must contain the key
    'iPSC (this study)'. A default palette is used if omitted.
    :return: The clustermap object, after the legend has been added and format_clustermap applied.
    """
    if cell_line_colours is None:
        cell_line_colours = {
            'FB': '#fff89e',  # yellow
            'GBM (this study)': '#e6e6e6',  # light gray
            'GBM': '#4d4d4d',  # dark grey
            'ESC': '#ff7777',  # light red
            'iPSC': '#990000',  # dark red
            'iPSC (this study)': '#fdc086',  # orange
            'NSC': '#006600',  # dark green
            'iNSC (this study)': '#7fc97f',  # green
        }

    the_dat = np.log2(obj.data + eps)
    if quantile_norm is not None:
        the_dat = transformations.quantile_normalisation(the_dat, method=quantile_norm)
    the_mad = transformations.median_absolute_deviation(the_dat).sort_values(ascending=False)

    # only the colour array and the legend dictionary are needed; the middle return value is unused
    cc, _, leg_dict = construct_colour_array_legend_studies(obj.meta)

    # linkage
    lkg = hc.linkage(
        the_dat.loc[the_mad.index[:n_gene_by_mad]].transpose(),
        method=method,
        metric=metric,
    )

    # ref line colours
    for k, v in cell_line_colours.items():
        cc.loc[obj.meta.type == k, 'Cell type'] = v

    # our line colours
    in_house_ipsc = obj.meta.batch.str.contains('wtchg') & (obj.meta.type == 'iPSC')
    cc.loc[in_house_ipsc, 'Cell type'] = cell_line_colours['iPSC (this study)']

    # get appropriate clims
    # The subset size is governed by the `n_gene_for_heatmap` parameter.
    the_dat = the_dat.loc[the_mad.index[:n_gene_for_heatmap]]
    the_dat_flat = np.sort(the_dat.values.flatten())
    n_vals = len(the_dat_flat)
    # clamp so that a fraction of 1.0 selects the largest value rather than indexing past the end
    vmin = the_dat_flat[min(int(n_vals * fmin), n_vals - 1)] - 0.5
    vmax = the_dat_flat[min(int(n_vals * fmax), n_vals - 1)] + 0.5

    gc = clustering.plot_clustermap(
        the_dat,
        cmap='RdBu_r',
        col_linkage=lkg,
        col_colors=cc,
        vmin=vmin,
        vmax=vmax,
    )

    leg_entry = {
        'class': 'patch',
        'edgecolor': 'k',
        'linewidth': 1.,
    }

    # Legend: one entry per cell type present in the metadata ('(this study)' variants are listed when their
    # base type is present), followed by one entry per study.
    present_types = obj.meta.type.unique()
    leg_dict2 = collections.OrderedDict()
    leg_dict2['Cell type'] = collections.OrderedDict()
    for k in sorted(cell_line_colours):
        if k.replace(' (this study)', '') in present_types:
            leg_dict2['Cell type'][k] = dict(leg_entry, facecolor=cell_line_colours[k])

    leg_dict2['Study'] = {}
    for k, v in leg_dict['Study'].items():
        leg_dict2['Study'][k] = dict(leg_entry, facecolor=v)

    common.add_custom_legend(gc.ax_heatmap, leg_dict2, loc_outside=True, fontsize=14)
    format_clustermap(gc)

    return gc
ax.figure.set_size_inches(5.9, 4.8) ax.figure.subplots_adjust(right=0.8, left=0.12, bottom=0.1, top=0.98) ax.figure.savefig(os.path.join(outdir, "pca_our_samples.png"), dpi=200) # clustermap: just our samples colour_bar = pd.DataFrame(treatment_colour['Rheb KO'], index=dat.columns, columns=['']) colour_bar.loc[colour_subgroups == 'WT'] = treatment_colour['WT'] this_mad = transformations.median_absolute_deviation(log_dat).sort_values( ascending=False) this_log_dat = log_dat.loc[this_mad.index[:n_by_mad]] cm = clustering.plot_clustermap(this_log_dat, cmap='RdYlBu_r', metric='correlation', col_colors=colour_bar, vmin=-2, vmax=2) cm.fig.set_size_inches(5, 8.4) cm.gs.update(bottom=0.15, right=0.98) cm.savefig(os.path.join(outdir, "clustermap_our_samples.png"), dpi=200) # DE 3 vs 3 dat = filter.filter_by_cpm(obj_star.data, min_cpm=min_cpm, min_n_samples=2) the_groups = obj_star.meta.treatment.str.replace( 'Rheb KO', 'Rheb_KO') # group names must be valid in R the_comparison = ('Rheb_KO', 'WT') de_res = differential_expression.run_one_de(dat, the_groups, the_comparison,
# Earlier variant, kept for reference (fixed range applied to `base_n`):
# norm = colors.Normalize(vmin=-1, vmax=0.)
# sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
# vals = [colors.rgb2hex(sm.to_rgba(t)) for t in base_n]

# Map each value in `base` onto a reversed grayscale spanning its observed range, one hex colour per column
norm = colors.Normalize(vmin=base.min(), vmax=base.max())
sm = plt.cm.ScalarMappable(cmap=plt.cm.gray_r, norm=norm)
vals = [colors.rgb2hex(sm.to_rgba(t)) for t in base]
col_colours = pd.DataFrame(vals, index=the_data.columns, columns=['MYC'])

# Rows keep the order given by keep_genes_sorted (row clustering disabled); columns are clustered
cg = clustering.plot_clustermap(
    dat_corr_with_myc_aggr.loc[keep_genes_sorted],
    cmap='RdBu_r',
    metric='euclidean',
    method='ward',
    row_cluster=False,
    col_colors=col_colours,
    vmin=-8,
    vmax=8
)
cg.fig.set_size_inches((7., 7.))
cg.cax.set_ylabel("Gene expression")
cg.cax.yaxis.set_label_coords(-.7, 0.5)
cg.gs.update(bottom=0.12, left=0.04, top=0.97, right=0.93)
for _fn in ('myc_genes_clustermap.png', 'myc_genes_clustermap.tiff'):
    cg.savefig(os.path.join(outdir, _fn), dpi=200)

# does the clustering partition by MYC expression level?
dend = cg.dendrogram_col.calculate_dendrogram()
lkg = cg.dendrogram_col.linkage
# clustermap colour_bar = pd.DataFrame(treatment_colour['Rheb KO'], index=dat.columns, columns=['']) colour_bar.loc[meta.treatment == 'WT'] = treatment_colour['WT'] this_mad = transformations.median_absolute_deviation(log_dat).sort_values( ascending=False) this_log_dat = log_dat.loc[this_mad.index[:n_by_mad]] # version 1: cluster rows cm = clustering.plot_clustermap( this_log_dat, cmap='RdYlBu_r', metric='correlation', col_colors=colour_bar, vmin=-2, vmax=2, ) leg_dict = { 'fl/fl': { 'class': 'patch', 'edgecolor': 'none', 'facecolor': treatment_colour['WT'], }, r'$\Delta$/$\Delta$': { 'class': 'patch', 'edgecolor': 'none', 'facecolor': treatment_colour['Rheb KO'], },
'Cell type'] = cell_line_colours['iNSC (this study)'] cc.loc[the_obj.meta.batch.str.contains('wtchg') & (the_obj.meta.type == 'iPSC'), 'Cell type'] = cell_line_colours['iPSC (this study)'] # get appropriate clims the_dat = the_dat.loc[the_mad.index[:n_for_heatmap]] the_dat_flat = np.sort(the_dat.values.flatten()) fmin = 0.05 fmax = 0.95 vmin = the_dat_flat[int(len(the_dat_flat) * fmin)] - 0.5 vmax = the_dat_flat[int(len(the_dat_flat) * fmax)] + 0.5 gc = clustering.plot_clustermap(the_dat.loc[the_mad.index[:n_for_heatmap]], cmap='RdBu_r', col_linkage=dend['linkage'], col_colors=cc, vmin=vmin, vmax=vmax) leg_entry = { 'class': 'patch', 'edgecolor': 'k', 'linewidth': 1., } leg_dict2 = collections.OrderedDict() leg_dict2['Cell type'] = collections.OrderedDict() for k in sorted(cell_line_colours): if k.replace(' (this study)', '') in the_obj.meta.type.unique(): leg_dict2['Cell type'][k] = dict(leg_entry) leg_dict2['Cell type'][k].update(
# restrict obj2 to ESC samples, then cluster together with obj1
ix = obj2.meta.type == 'ESC'
obj2.filter_samples(ix)

dend = plot_dendrogram(
    [obj1, obj2],
    qn_method=quantile_norm,
    n_by_mad=n_gene_by_mad
)
dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc.png"), dpi=200)

# 3. iPSC, ESC, Ruiz signature (only)
the_obj = loader.MultipleBatchLoader([obj1, obj2])
# log2 transform, then keep only the signature genes that are present (absent ones are dropped)
dat_r_z = pd.DataFrame(np.log2(the_obj.data + eps)).reindex(gene_sign_ens.values).dropna()
# z-score each gene (row) separately
for r in dat_r_z.index:
    dat_r_z.loc[r] = zscore(dat_r_z.loc[r])
# relabel rows using the index of gene_sign_ens for the entries that were retained
dat_r_z.index = gene_sign_ens.index[gene_sign_ens.isin(dat_r_z.index)]

cg = clustering.plot_clustermap(dat_r_z, show_gene_labels=True, cmap='RdBu_r')
cg.gs.update(bottom=0.2)
cg.savefig(os.path.join(outdir, "clustermap_ruiz_ipsc_esc_ztrans.png"), dpi=200)

# 4. HipSci, iPSC, ESC, FB
obj1 = copy(obj)
ix = obj1.meta.type.isin(['iPSC', 'FB'])
obj1.filter_samples(ix)

dend = plot_dendrogram(
    [obj1, ref_obj, hip_obj],
    qn_method=quantile_norm,
    n_by_mad=n_gene_by_mad
)
dend['fig'].savefig(
    os.path.join(outdir, "cluster_ipsc_esc_fb_with_hipsci%d.png" % n_hipsci),
    dpi=200
)

# 5. HipSci, iPSC, ESC
obj1 = copy(obj)
ix = obj1.meta.type.isin(['iPSC'])
obj1.filter_samples(ix)
"nsc_correlation_dendrogram_vsttransform_top%d.pdf" % NGENE)) cg.gs.update(bottom=0.3, right=0.7) cg.fig.savefig(os.path.join( outdir, "nsc_correlation_clustermap_vsttransform_top%d.png" % NGENE), dpi=200) cg.fig.savefig( os.path.join( outdir, "nsc_correlation_clustermap_vsttransform_top%d.pdf" % NGENE)) cg = clustering.plot_clustermap( log_nsc_data.loc[mad_log_nsc_srt.index[:NGENE]], show_gene_labels=False, rotate_xticklabels=True, cmap='RdBu_r', metric='correlation', ) cg.gs.update(bottom=0.2) cg.fig.savefig(os.path.join( outdir, "nsc_expression_clustermap_logtransform_top%d.png" % NGENE), dpi=200) cg.fig.savefig( os.path.join( outdir, "nsc_expression_clustermap_logtransform_top%d.pdf" % NGENE)) cg = clustering.plot_clustermap( vst_nsc_data.loc[mad_vst_nsc_srt.index[:NGENE]], show_gene_labels=False,
"Unknown": 'gray' } wang_cmap = { 'PN': consts.SUBGROUP_SET_COLOURS['RTK I partial'], 'CL': consts.SUBGROUP_SET_COLOURS['RTK II partial'], 'MS': consts.SUBGROUP_SET_COLOURS['MES partial'], 'Unknown': 'grey' } row_colours.insert(0, 'Sturm', sturm_class.map(sturm_cmap)) row_colours.insert(0, 'Verhaak', wang_class.map(wang_cmap)) cg = clustering.plot_clustermap(xcell_tcga.astype(float).transpose(), metric='euclidean', show_gene_labels=False, show_gene_clustering=True, cmap='YlOrRd', row_linkage=rl, z_score=1, vmin=-1.5, vmax=6., row_colors=row_colours) cg.gs.update(left=0.03, bottom=0.22, right=0.9) c_labels = [''] * len(cg.cax.get_yticks()) c_labels[0] = 'Low' c_labels[-1] = 'High' cg.cax.set_yticklabels(c_labels) cg.cax.set_ylabel( 'Normalised proportion', labelpad=-70) # bit hacky, but this places the label correctly cg.savefig(os.path.join(outdir, "cell_proportion_cluster_by_patient.png"), dpi=200) cg.savefig(os.path.join(outdir, "cell_proportion_cluster_by_patient.tiff"),
# horizontal bar chart of `cv`
cv.plot.barh(color='k', ax=ax)
fig.subplots_adjust(bottom=0.07, left=0.4, right=0.98, top=0.98)
ax.set_xlabel('CV across samples')
fig.savefig(os.path.join(outdir, "cv_across_samples.png"), dpi=200)
fig.savefig(os.path.join(outdir, "cv_across_samples.tiff"), dpi=200)

# heatmap: proportions for each patient
# standardise across columns, because each cell type has different mean proportion
rl = hc.linkage(df.astype(float).transpose(), method='average', metric='euclidean')
cg = clustering.plot_clustermap(
    df.astype(float).transpose(),
    metric='euclidean',
    show_gene_labels=True,
    show_gene_clustering=True,
    cmap='YlOrRd',
    row_linkage=rl,
    z_score=1,
    figsize=(5.5, 5.5)
)
# cg.gs.update(left=0.03, bottom=0.22, right=0.9)
cg.gs.update(left=0.1, bottom=0.4, right=0.9, top=0.93)

# Label only the two ends of the colour bar. The label list is sized from the actual ticks, because the
# number of colour bar ticks depends on the data range and is not always five.
c_labels = [''] * len(cg.cax.get_yticks())
if c_labels:
    c_labels[0] = 'Low'
    c_labels[-1] = 'High'
cg.cax.set_yticklabels(c_labels)
cg.cax.set_ylabel(
    'Normalised\nproportion',
    labelpad=-70
)  # bit hacky, but this places the label correctly

cg.savefig(os.path.join(outdir, "cell_proportion_cluster_by_patient.png"), dpi=200)
cg.savefig(os.path.join(outdir, "cell_proportion_cluster_by_patient.tiff"), dpi=200)
cg.savefig(os.path.join(outdir, "cell_proportion_cluster_by_patient.pdf"), dpi=200)
'class': 'patch', 'edgecolor': 'k', 'linewidth': 1., } lkg = plt_dict[clust_n_ftr]['linkage'] leg_dict = collections.OrderedDict() for k in sorted(cell_line_colours): if cell_line_colours[k] in row_colours_all.values: leg_dict[k] = dict(leg_entry) leg_dict[k].update({'facecolor': cell_line_colours[k]}) cm = clustering.plot_clustermap(this_dat, cmap='RdYlBu_r', metric='correlation', col_colors=row_colours_all, col_linkage=lkg, vmin=-10, vmax=10) cm.fig.set_size_inches((10.9, 8.)) common.add_custom_legend(cm.ax_heatmap, leg_dict, loc_outside=True, fontsize=14) cm.gs.update(bottom=0.3, right=0.79, left=0.01) cm.savefig(os.path.join(outdir, "clustermap_ipsc_esc_nsc_fb.png"), dpi=200) cm.savefig(os.path.join(outdir, "clustermap_ipsc_esc_nsc_fb.tiff"), dpi=200)
# Disabled alternative: dendrogram from a pre-computed linkage
# lnk = hc.linkage(dist)
# dend = clustering.dendrogram_with_colours(
#     dat,
#     cc,
#     linkage=lnk,
#     vertical=True,
#     legend_labels=leg_dict,
#     fig_kws={'figsize': [14, 6]}
# )

# Pearson correlation distance
dend = clustering.dendrogram_with_colours(
    dat,
    cc,
    vertical=True,
    legend_labels=leg_dict,
    fig_kws={'figsize': [14, 6]}
)

# Pearson with a limited number of probes
# dend = clustering.dendrogram_with_colours(dat.loc[mad.index[:5000]], cc, vertical=True, legend_labels=leg_dict, fig_kws={'figsize': [14, 6]})

dend['fig'].savefig(
    os.path.join(outdir, "cluster_ipsc_esc_fb_all_probes.png"),
    dpi=200
)

# similar, but clustermap (dendrogram + heatmap)
# the heatmap shows the first 5000 entries of `mad`, while the column ordering reuses the dendrogram linkage
gc = clustering.plot_clustermap(
    dat.loc[mad.index[:5000]],
    cmap='RdBu_r',
    col_linkage=dend['linkage'],
    col_colors=cc
)
clustering.add_legend(leg_dict, gc.ax_heatmap, loc='right')
gc.gs.update(bottom=0.2, right=0.82)
gc.savefig(
    os.path.join(outdir, "clustermap_ipsc_esc_fb_all_probes.png"),
    dpi=200
)