def construct_colour_array_legend_studies(meta):
    """
    Build per-sample colour annotations (by cell type and by study) plus the matching legend entries.

    :param meta: pd.DataFrame indexed by sample name. The string-valued columns 'type' and 'batch' are read.
    :return: Tuple (cc, studies, leg_dict) where
        cc is a pd.DataFrame sharing the index of `meta`, with columns 'Cell type' and 'Study' holding one colour
        per sample ('gray' wherever no rule below applies);
        studies is a dict mapping each study (batch) label to its colour;
        leg_dict is {'Cell type': <label -> colour, restricted to colours actually used>, 'Study': studies}.
    """
    ct_leg = {
        'FB': '#fff89e',
        'iPSC (this study)': 'blue',
        'iPSC': '#96daff',
        'ESC': 'green',
        'iNSC (this study)': '#9e3900',
        'iNSC': '#db7b00',
        'NSC': '#f4b342',
        'Fetal NSC': '#ffaf47',
    }

    cc = pd.DataFrame('gray', index=meta.index, columns=['Cell type', 'Study'])

    sample_type = meta['type']
    # samples whose batch name contains 'WTCHG' get the '(this study)' variant of the colour
    in_house = meta['batch'].str.contains('WTCHG')
    is_ipsc = sample_type == 'iPSC'
    is_insc = sample_type == 'iNSC'

    for t in ('FB', 'ESC', 'NSC'):
        cc.loc[sample_type == t, 'Cell type'] = ct_leg[t]
    # NPC samples share the NSC colour
    cc.loc[sample_type == 'NPC', 'Cell type'] = ct_leg['NSC']

    cc.loc[is_ipsc & in_house, 'Cell type'] = ct_leg['iPSC (this study)']
    cc.loc[is_ipsc & ~in_house, 'Cell type'] = ct_leg['iPSC']
    cc.loc[is_insc & in_house, 'Cell type'] = ct_leg['iNSC (this study)']  # chestnut
    cc.loc[is_insc & ~in_house, 'Cell type'] = ct_leg['iNSC']
    # applied last so that it overrides the generic NSC colour assigned above
    cc.loc[(sample_type == 'NSC') & (meta.index.str.contains('fetal')), 'Cell type'] = ct_leg['Fetal NSC']  # light orange

    all_batches = meta['batch'].copy()
    # to keep individual batches, comment out this next line
    # NOTE(review): this matches lower-case 'wtchg' whereas the cell type rules above match 'WTCHG';
    # str.contains is case-sensitive, so confirm which spelling the batch names really use.
    all_batches[all_batches.str.contains('wtchg')] = 'This study'

    batches = all_batches.unique()
    study_colours = common.get_best_cmap(len(batches))

    studies = {}
    for batch, colour in zip(batches, study_colours):
        cc.loc[all_batches == batch, 'Study'] = colour
        studies[batch] = colour

    # Only keep legend entries whose colour is actually in use. A new dict is built rather than popping
    # while iterating over the keys, which raises RuntimeError on Python 3.
    used_colours = set(cc.loc[:, 'Cell type'].unique())
    ct_leg = {k: v for k, v in ct_leg.items() if v in used_colours}

    # legend dictionary
    leg_dict = {'Cell type': ct_leg, 'Study': studies}

    return cc, studies, leg_dict
def plot_one_scatter_pie(xy, weights, colours=None, ax=None, marker_kwargs=None, **scatter_kwargs):
    """
    Draw a single pie chart, built from path markers, on the supplied (or current) axes.

    :param xy: Length-2 array giving the centre of the pie chart.
    :param weights: Array of unnormalised weights, one per pie segment.
    :param colours: Optional array of colours, one per segment. When omitted, the colours are taken from
    common.get_best_cmap. When supplied, its length must equal that of weights.
    :param ax: Axis object to plot into. If missing, use gca().
    :param marker_kwargs: Optional dictionary of keyword arguments forwarded to pie_path_marker.
    :param scatter_kwargs: Forwarded to every ax.scatter() call.
    :return: List with one scatter() handle per segment.
    """
    if len(xy) != 2:
        raise ValueError("xy must have length 2.")
    x_centre, y_centre = xy

    n_segments = len(weights)
    if colours is None:
        colours = common.get_best_cmap(n_segments)
    if colours is not None and n_segments != len(colours):
        raise ValueError("Length of weights and colours must be equal.")

    marker_kwargs = {} if marker_kwargs is None else marker_kwargs
    ax = plt.gca() if ax is None else ax

    # one marker path per segment; each is drawn as its own single-point scatter
    segment_markers = pie_path_marker(weights, **marker_kwargs)
    return [
        ax.scatter([x_centre], [y_centre], marker=seg_marker, facecolor=seg_colour, **scatter_kwargs)
        for seg_marker, seg_colour in zip(segment_markers, colours)
    ]
obj.meta.batch = obj.meta.batch.str.replace('2016-12-19_ucl_genomics', '2016-12-19') # the only batch names without letters are ours obj.meta.loc[~obj.meta.batch.str.contains(r'[A-Z]'), 'batch'] = 'This study' # PCA plot (by batch and cell type) colour_subgroups = obj.meta.batch c_sub_sorted = sorted(colour_subgroups.unique(), key=lambda x: 'A' if x == 'This study' else x) cmap = collections.OrderedDict( zip( c_sub_sorted, common.get_best_cmap(len(c_sub_sorted)), )) m_subgroups = obj.meta.type subgroups_sorted = sorted(m_subgroups.unique(), key=lambda x: x[0] if x[0] != 'i' else x[1]) mmap = pd.Series(common.FILLED_MARKERS[len(m_subgroups.unique())], index=subgroups_sorted) fig = plt.figure(figsize=(6.4, 4.8)) ax = fig.add_subplot(111) p, ax = plot_pca(mdat, colour_subgroups, colour_map=cmap, marker_subgroups=m_subgroups, marker_map=mmap,
'HSC', 'Mast cells', 'Tregs', ]) ix = [] for ct in cell_types: ix.extend(so_both.index[so_both.index.str.contains(ct + '_')]) pct_shared = (so_both.loc[ix] / (so_both.loc[ix] + so_ipa_not_cts.loc[ix]) * 100.).sort_index().transpose() col_colours = pd.Series(index=pct_shared.columns, name='Cell type') ix_lookup = pct_shared.columns.str.replace(r'(?P<ct>[^_]*)_.*', r'\g<ct>') a, b = ix_lookup.factorize() cc_cmap = common.get_best_cmap(a.max() + 1, cmap='jet') for i in range(a.max() + 1): col_colours[a == i] = cc_cmap[i] cg = sns.clustermap( pct_shared, cmap='Reds', row_cluster=False, col_cluster=False, # col_colors=col_colours, vmax=20.) # cg.gs.set_height_ratios([0.1, 0.005, 0.05, 0.9]) cg.gs.set_height_ratios([0.1, 0.005, 0.9]) cg.gs.set_width_ratios([0.04, 0.02, 0.8]) cg.gs.update(top=0.99, left=0.02, right=0.67, bottom=0.15)
# Script excerpt: register two DMR comparisons per patient, run and save the DMR analysis, then plot the
# distribution of M values of the GBM-type samples.
# NOTE(review): statement nesting was reconstructed from a flattened excerpt - confirm against the original file.
for pid in pids:
    # GIC
    # NOTE: the GIC samples are the ones with type 'GBM'; the `gbm` variable holds the 'ffpe' samples
    gic = meta.index[(meta.type == 'GBM') & (meta.patient_id == pid)]
    insc = meta.index[(meta.type == 'iNSC') & (meta.patient_id == pid)]
    gbm = meta.index[(meta.type == 'ffpe') & (meta.patient_id == pid)]
    # each comparison is a pair of sample lists, with the patient's iNSC samples second in both
    comparisons["%s_ffpe-iNSC" % pid] = [gbm, insc]
    comparisons["%s_GIC-iNSC" % pid] = [gic, insc]

dmr_res = addd.run_dmr_analyses(dat, comparisons, anno, dmr_params)
# Save DMR results to disk
dmr_res.to_pickle(fn, include_annotation=False)
logger.info("Saved DMR results to %s", fn)
dmr_res_all = dmr_res.results_significant

# look at distn of M values (before any differences)
# two figures, one for each replicate
cols = common.get_best_cmap(len(pids))
fig, axs = plt.subplots(ncols=2, sharex=True, sharey=True)
for i, pid in enumerate(pids):
    # one colour per patient, shared by that patient's replicates
    c = cols[i]
    this_ix = meta.index[(meta.patient_id == pid) & (meta.type == 'GBM')]
    for j, ix in enumerate(this_ix):
        this_dat = dat.loc[:, ix]
        # the j-th replicate goes into the j-th panel; only two panels exist, so a third replicate
        # would raise an IndexError here
        this_ax = axs[j]
        sns.kdeplot(this_dat, color=c, label=pid, ax=this_ax)
# list comprehensions used purely for their side effects on each axis
[ax.set_xlim([-10, 10]) for ax in axs]
[ax.set_xlabel("M value") for ax in axs]
axs[0].set_ylabel("Density (a.u.)")
fig.tight_layout()
fig.savefig(os.path.join(outdir, "m_value_density_gic.png"), dpi=200)
ax.text(0.5, 0.5 * np.sqrt(3) + 0.02, 'RTK II', horizontalalignment='center', verticalalignment='bottom') ax.set_aspect('equal') ax.axis('off') # bundle them up tax_dict = { 'RTK I': taxs[0], 'RTK II': taxs[1], 'MES': taxs[2] } # each patient is a 'trajectory': FFPE -> early pass -> later passage bases = ['GBM_RTK_I', 'GBM_RTK_II', 'GBM_MES'] # cmap_func = common.continuous_cmap(cc.mean_passage.max(), cmap='Blues') cmap = common.get_best_cmap(len(consts.PIDS)) ff_colour = '#ffffff' for p in consts.PIDS: this_ff_score = ff.loc[p, bases] this_cc = cc.loc[p].sort_values(by='mean_passage') this_cc_score = this_cc.loc[:, bases] this_cc_pass = this_cc.loc[:, 'mean_passage'] # this_colours = [ff_colour] + [cmap_func(x - 1) for x in this_cc_pass] this_colours = cmap[consts.PIDS.index(p)] this_sizes = 20 + this_cc_pass.values ** 2. points = np.concatenate([[this_ff_score.values], this_cc_score.values], axis=0) ## FIXME: we could just use the hardcoded classification here, eliminating the need to look it up? tax = tax_dict[ff.loc[p, 'Result']]
def scatter_with_pies(xy, weights_arr, colours_arr=None, ax=None, marker_kwargs=None, **scatter_kwargs):
    """
    Generate a scatterplot with pie charts as markers.

    :param xy: Array of length N, where N is the number of markers required. Each entry is a length 2 array giving
    the central coordinate of the pie chart.
    :param weights_arr: Array of length N. Each entry is an array of weights.
    :param colours_arr: Optional. Either an array of length N with each entry being an array of the same length as
    the corresponding entry in `weights_arr`
    Or a single array of length M, where M is the length of ALL entries of weights_arr (this is checked). An array
    whose first entry is a string (e.g. ['r', 'g', 'b']) or a non-iterable is interpreted as this second form.
    If missing, colours are chosen per pie using common.get_best_cmap.
    :param ax: Axis object to plot into. If missing, use gca().
    :param marker_kwargs: If supplied, this dictionary is passed to pie_path_markers.
    :param scatter_kwargs: Passed to the scatter() method of ax.
    :return: Handles generated by the scatter() calls (array of length N, each entry is an array of PathCollection
    objects).
    :raises ValueError: If the lengths of xy, weights_arr and colours_arr are inconsistent.
    """
    if len(xy) != len(weights_arr):
        raise ValueError("Length of xy and weights_arr must be equal.")

    if colours_arr is not None:
        # Decide whether this is one flat list of colours shared by every pie. Strings must be tested for
        # explicitly: on Python 3 they expose __iter__, so a plain hasattr() check mistakes colour names for
        # nested arrays. An empty colours_arr is left for the length check below rather than indexed.
        is_flat = False
        if len(colours_arr) > 0:
            first = colours_arr[0]
            is_flat = isinstance(first, (str, bytes)) or not hasattr(first, '__iter__')

        if is_flat:
            # option 1: all weights have the same length and this is the colours array to use for all
            if all(len(t) == len(colours_arr) for t in weights_arr):
                colours_arr = [colours_arr] * len(weights_arr)
            else:
                raise ValueError(
                    "If colours_arr is a single array, all weight_arr entries must have the same length."
                )

        if len(weights_arr) != len(colours_arr):
            raise ValueError("Length of weights_arr and colours_arr must be equal.")

    if ax is None:
        ax = plt.gca()

    handles = []
    # positional indexing is kept deliberately: xy / weights_arr may be array types whose iteration
    # semantics differ from integer indexing
    for i in range(len(xy)):
        w = weights_arr[i]
        if colours_arr is None:
            colours = common.get_best_cmap(len(w))
        else:
            colours = colours_arr[i]
            if len(colours) != len(w):
                raise ValueError(
                    "Pie number %d: number of weights does not match number of colours." % i
                )
        handles.append(
            plot_one_scatter_pie(
                xy[i],
                w,
                colours=colours,
                ax=ax,
                marker_kwargs=marker_kwargs,
                **scatter_kwargs
            )
        )

    return handles
def scatter_with_colour_and_markers(
        dat,
        colour_subgroups=None,
        colour_map=None,
        marker_subgroups=None,
        marker_map=None,
        ax=None,
        legend='outside',
        default_colour='gray',
        default_marker='o',
        ec='k',
        lw=1.0,
        ms=40,
):
    """
    Scatter plot in which point colour and marker shape encode two independent groupings of the rows.

    :param dat: Data to be plotted, in any format accepted by pd.DataFrame. The first two columns are used as
    x and y.
    :param colour_subgroups: Object with a factorize() method (e.g. pd.Series) giving the colour group of each
    row. If None, all rows form a single group drawn in `default_colour`.
    :param colour_map: Mapping (anything with get() and keys()) from colour group label to colour. If None, the
    colours come from common.get_best_cmap.
    :param marker_subgroups: As `colour_subgroups`, but for the marker shape.
    :param marker_map: Mapping from marker group label to marker. If None, the markers come from
    common.get_best_marker_map.
    :param ax: Axes to plot into. If None, a new figure with equal aspect axes is created.
    :param legend: Include legend in plot? If True, use the 'best' location (according to matplotlib), if
    'outside' (default), place outside. Any other string is used as the legend location. If False or None, do
    not plot legend.
    :param default_colour: Colour used for groups missing from `colour_map`.
    :param default_marker: Marker used for groups missing from `marker_map`.
    :param ec: Edgecolour
    :param lw: Linewidth
    :param ms: Marker size
    :return: The axes that were plotted into.
    """
    # cast dat to pd DataFrame, should have two columns
    dat = pd.DataFrame(dat)

    if ax is None:
        ax = plt.figure().add_subplot(111, aspect='equal')

    def encode(groups):
        # returns (were labels supplied?, integer code per row, unique labels)
        labelled = groups is not None
        if not labelled:
            # everything is in the same (anonymous) group
            groups = pd.Series('foo', index=dat.index)
        codes, labels = groups.factorize()
        return labelled, codes, labels

    c_has_labels, cidx, clabels = encode(colour_subgroups)
    m_has_labels, midx, mlabels = encode(marker_subgroups)

    if colour_map is None:
        if c_has_labels:
            palette = common.get_best_cmap(len(clabels))
        else:
            palette = [default_colour] * len(clabels)
        colour_map = {k: palette[i] for i, k in enumerate(clabels)}

    if marker_map is None:
        if m_has_labels:
            shapes = common.get_best_marker_map(len(mlabels))
        else:
            shapes = [default_marker] * len(mlabels)
        marker_map = {k: shapes[i] for i, k in enumerate(mlabels)}

    for ic, lc in enumerate(clabels):
        for im, lm in enumerate(mlabels):
            c = colour_map.get(lc, default_colour)
            m = marker_map.get(lm, default_marker)
            # apply a 10% increase to these markers (only)
            this_ms = 1.1 * ms if m in FILLED_MARKERS_TO_EXPAND else ms

            j = (cidx == ic) & (midx == im)
            if j.sum() == 0:
                # no points carry this colour / marker combination
                continue

            # automatic labels are only meaningful when exactly one of the two groupings is labelled
            if c_has_labels == m_has_labels:
                lbl = None
            elif c_has_labels:
                lbl = lc
            else:
                lbl = lm

            ax.scatter(
                dat.values[j, 0],
                dat.values[j, 1],
                c=c,
                s=this_ms,
                label=lbl,
                marker=m,
                edgecolor=ec,
                linewidths=lw
            )

    if c_has_labels and m_has_labels:
        # two groupings: the legend has to be assembled manually
        # colours: show in patches with no edgecolor
        for_legend = [
            patches.Patch(
                edgecolor='none',
                facecolor=colour_map.get(lc, default_colour),
                linewidth=lw,
                label=lc
            )
            for lc in colour_map.keys() if lc in clabels
        ]

        # spacer that doesn't show up
        for_legend.append(patches.Patch(edgecolor='none', facecolor='none', label=''))

        # markers: show with no fill
        # the markersize units are different here, so markersize is not specified
        for_legend.extend(
            plt.Line2D(
                [0],
                [0],
                marker=marker_map.get(lm, default_marker),
                markerfacecolor='none',
                markeredgewidth=lw,
                markeredgecolor=ec,
                linestyle='none',
                linewidth=0.,
                label=lm
            )
            for lm in marker_map.keys() if lm in mlabels
        )

        if legend == 'outside':
            common.legend_outside_axes(ax, handles=for_legend)
        elif isinstance(legend, str):
            ax.legend(handles=for_legend, loc=legend)
        elif legend:
            ax.legend(handles=for_legend)

    elif (legend is not None) and (legend != False):
        # at most one grouping: rely on the labels attached to the scatter calls
        if legend == 'outside':
            common.legend_outside_axes(ax)
        elif isinstance(legend, str):
            ax.legend(loc=legend)
        elif legend:
            ax.legend()

    return ax
def set_plot_parameters(self,
                        figsize=(8, 6),
                        colours=None,
                        markers=None,
                        zorder=None,
                        alpha=None,
                        size=None,
                        de_direction_colours=DIRECTION_COLOURS,
                        dm_direction_colours=DIRECTION_COLOURS,
                        de_vmin=None,
                        de_vmax=None,
                        dm_vmin=None,
                        dm_vmax=None):
    """
    Store the plotting parameters on this object.

    Each of `colours`, `markers`, `zorder`, `alpha` and `size` is converted into a dictionary-like lookup:
      - None: a defaultdict that hands out the default value(s) in turn, cycling when they run out;
      - an object with a get() method (e.g. dict): used as supplied;
      - any other iterable (strings excluded): a defaultdict cycling through the supplied values;
      - a single value (including a string): a defaultdict that always returns that value.

    :param figsize: Stored in self.fig_kws under 'figsize'.
    :param colours, markers, zorder, alpha, size: See above. Stored as attributes of the same name and
    bundled in self.m_plot_kws.
    :param de_direction_colours, dm_direction_colours: Passed to direction_colour_getter together with the
    matching vmin / vmax. The results are stored in self.de_direction_colour and self.dm_direction_colour.
    :param de_vmin, de_vmax, dm_vmin, dm_vmax: If not None, also stored as attributes of the same name.
    """
    # Automatic plot parameters are only possible when we know which groups will be plotted.
    # If this is not the case, we just guess how many there will be and hope we guessed enough!
    if self.dmr_comparison_groups is not None:
        n_groups = self.n_comparison_groups
    else:
        n_groups = 4

    default_colours = common.get_best_cmap(n_groups)
    default_markers = common.get_best_marker_map(n_groups)
    default_zorder = range(20, 20 + n_groups)
    default_alpha = 0.6
    default_size = 20

    def is_sequence(v):
        # Strings expose __iter__ on Python 3 but must count as single values here; otherwise e.g.
        # colours='red' would be cycled one character at a time.
        return hasattr(v, '__iter__') and not isinstance(v, (str, bytes))

    def cycling_lookup(values):
        it = itertools.cycle(values)
        # next(it) rather than it.next(): the latter only exists on Python 2
        return collections.defaultdict(lambda: next(it))

    def set_property(x, default):
        if x is None:
            # cycle through default values
            return cycling_lookup(default if is_sequence(default) else [default])
        if hasattr(x, 'get'):
            # dictionary of values: no modification needed
            return x
        if is_sequence(x):
            # iterable of values: cycle through these
            return cycling_lookup(x)
        # single value supplied
        # result: dictionary that always returns this value
        return collections.defaultdict(lambda: x)

    self.colours = set_property(colours, default_colours)
    self.markers = set_property(markers, default_markers)
    self.zorder = set_property(zorder, default_zorder)
    self.alpha = set_property(alpha, default_alpha)
    self.size = set_property(size, default_size)

    self.m_plot_kws = {
        'colours': self.colours,
        'markers': self.markers,
        'zorder': self.zorder,
        'alpha': self.alpha,
        'size': self.size
    }

    self.fig_kws = {'figsize': figsize}

    self.de_direction_colour = direction_colour_getter(de_direction_colours, vmin=de_vmin, vmax=de_vmax)
    self.dm_direction_colour = direction_colour_getter(dm_direction_colours, vmin=dm_vmin, vmax=dm_vmax)

    # limits are only overwritten when explicitly supplied
    if dm_vmin is not None:
        self.dm_vmin = dm_vmin
    if dm_vmax is not None:
        self.dm_vmax = dm_vmax
    if de_vmin is not None:
        self.de_vmin = de_vmin
    if de_vmax is not None:
        self.de_vmax = de_vmax
def plot_m_values(
        self,
        mdat,
        probe_locations,
        comparisons,
        colours='default',
        markers='default',
        zorder='default',
        alpha='default',
        size='default',
):
    """
    Scatter plot the M values of individual probes, one axis (self.m_axs[name]) per entry of self.row_names.

    Also records the plotted data range in self.mdat_min / self.mdat_max and widens
    self.coord_min / self.coord_max to cover `probe_locations`.

    :param mdat: pd.DataFrame containing the data to plot. Columns are samples, rows are probes
    :param probe_locations: pd.Series containing the probe IDs to include and their genomic coordinates
    :param comparisons: Dictionary keyed by comparison (equivalent to row_names). Each entry is a dictionary
    keyed by group name (e.g. 'Disease' / 'Healthy') and with values giving the samples in that group. The sample
    names must be in the columns of `mdat`.
    :param colours: Dictionary keyed by group name (e.g. 'Disease') giving the colour to use for that group.
    Defaults are used if not supplied. To disable colours, set to None.
    :param markers: Dictionary keyed by group name giving the marker to use for that group.
    Defaults are used if not supplied. To use circle markers for everything, set to None.
    :param zorder: Dictionary keyed by group name giving the zorder to use for that group.
    Defaults are used if not supplied. If None, every group uses zorder 20.
    :param alpha: Dictionary keyed by group name giving the alpha to use for that group.
    Defaults (0.4 - 0.6, increasing with zorder) are used if not supplied. If None, every group uses alpha 0.6.
    :param size: Dictionary keyed by group name giving the marker size to use for that group.
    Defaults (from 20 upwards, increasing with zorder) are used if not supplied. If None, every group uses
    size 20.
    For all five properties, a single non-dictionary value is applied to every group.
    :return:
    """
    all_groups = sorted(setops.reduce_union(*(t.keys() for t in comparisons.values())))
    n_groups = len(all_groups)

    def set_property(x, default, default_static):
        if x == 'default':
            out = dict(zip(all_groups, default))
        elif x is None:
            out = {k: default_static for k in all_groups}
        elif not hasattr(x, 'get'):
            # single value supplied
            out = {k: x for k in all_groups}
        else:
            out = x
        return out

    colours = set_property(colours, common.get_best_cmap(n_groups), '0.5')
    markers = set_property(markers, common.get_best_marker_map(n_groups), 'o')
    zorder = set_property(zorder, range(20, 20 + n_groups), 20)

    # default alpha will be based on zorder
    a = sorted([(k, zorder[k]) for k in all_groups], key=lambda x: x[1])
    a_ix = {t[0]: i for i, t in enumerate(a)}  # group name -> rank in zorder
    alpha_values = np.linspace(0.4, 0.6, n_groups)
    alpha_default = [alpha_values[a_ix[k]] for k in all_groups]
    # the static fallback must be numeric: matplotlib does not accept a string alpha
    alpha = set_property(alpha, alpha_default, 0.6)

    # default size will be based on zorder
    s_values = range(20, 20 + n_groups)
    s_default = [s_values[a_ix[k]] for k in all_groups]
    size = set_property(size, s_default, 20)

    # scatter plot individual probes
    # both limits start at zero, so the recorded data range always includes zero
    ymin = 0
    ymax = 0
    for nm in self.row_names:
        grp_dict = comparisons[nm]
        this_ax = self.m_axs[nm]
        for grp_nm, grp_samples in grp_dict.items():
            the_colour = colours.get(grp_nm)
            the_marker = markers.get(grp_nm)
            the_z = zorder.get(grp_nm)
            the_alpha = alpha.get(grp_nm)
            the_s = size.get(grp_nm)
            # one scatter call per sample; items() replaces iteritems(), which was removed in pandas 2.0
            for _, x in mdat.loc[probe_locations.index, grp_samples].items():
                this_ax.scatter(
                    probe_locations,
                    x.values,
                    c=the_colour,
                    marker=the_marker,
                    zorder=the_z,
                    alpha=the_alpha,
                    s=the_s,
                    edgecolor='k',
                    linewidth=0.5
                )
                ymin = min(x.values.min(), ymin)
                ymax = max(x.values.max(), ymax)
        this_ax.set_ylabel(nm)

    self.mdat_min = ymin
    self.mdat_max = ymax

    # widen (never shrink) the stored coordinate range
    if self.coord_max is None:
        self.coord_min = probe_locations.min()
        self.coord_max = probe_locations.max()
    else:
        self.coord_min = min(probe_locations.min(), self.coord_min)
        self.coord_max = max(probe_locations.max(), self.coord_max)
dpi=200) fig.savefig(os.path.join( outdir, "cell_proportion_pathway_pval_%s_clustering_sign_annot_syngeneic_tregs.tiff" % corr_metric), dpi=200) fig.savefig(os.path.join( outdir, "cell_proportion_pathway_pval_%s_clustering_sign_annot_syngeneic_tregs.pdf" % corr_metric), dpi=200) # generate scatterplots for each pathway all_cts = co_p.index[(co_p.loc[:, ix] < alpha).any(axis=1)] colour_by_cell_type = dict(zip(all_cts, common.get_best_cmap(len(all_cts)))) for pw in ix: cts = co_p.index[(co_p[pw] < alpha)] fig = plt.figure(figsize=(6, 5)) ax = fig.add_subplot(111) xmax = 0. ymax = 0. for ct in cts: this_p = p.loc[pw].sort_index() this_df = df.loc[ct, this_p.index].sort_index() this_comb = pd.concat((this_df, this_p), axis=1).dropna(axis=0).astype(float) this_comb = this_comb.sort_values(by=[ct, pw], axis=0) x = this_comb.iloc[:, 0] y = this_comb.iloc[:, 1]
min_n_samples=2) log_dat = np.log10(obj_salmon.data + eps) # ECDF ax = rnaseq.log_cpm_ecdf_plot(dat, units='tpm', min_cpm=min_cpm) ax.figure.set_size_inches(6, 4) ax.figure.tight_layout() ax.figure.savefig(os.path.join(outdir, "cdf_our_samples.png"), dpi=200) # PCA colour_subgroups = obj_salmon.meta.treatment cmap = collections.OrderedDict( zip( colour_subgroups, common.get_best_cmap(len(colour_subgroups)), )) p = PCA() pc_dat = p.fit_transform(log_dat.transpose()) p, ax = plot_pca(log_dat, colour_subgroups, colour_map=cmap, p=p) for i, col in enumerate(log_dat.columns): ax.text(pc_dat[i, 0], pc_dat[i, 1], col) ax.figure.set_size_inches(5.9, 4.8) ax.figure.subplots_adjust(right=0.8, left=0.12, bottom=0.1, top=0.98) ax.figure.savefig(os.path.join(outdir, "pca_our_samples.png"), dpi=200) # clustermap: just our samples
fig = plt.figure(figsize=(10, 7)) ax = fig.add_subplot(111) p, ax = plot_pca(mdat, colour_subgroups, marker_subgroups=m_subgroups, marker_map=mmap, ax=ax) ax.figure.subplots_adjust(left=0.1, right=0.8) ax.figure.savefig(os.path.join(outdir, "pca_plot_batch_cell_type_all.png"), dpi=200) # ECDF plot (separate for cell types) to show batch effects cell_types = ['GBM', 'iNSC'] cmap = common.get_best_cmap(len(colour_subgroups.unique())) colour_map = dict(zip(colour_subgroups.unique(), cmap)) xi = np.linspace(-8, 8, 200) for ct in cell_types: fig = plt.figure() ax = fig.add_subplot(111) this_idx = obj.meta.type == ct this_dat = mdat.loc[:, this_idx] this_batch = colour_subgroups[this_idx] labels_included = dict([(k, False) for k in colour_subgroups.unique()]) for i in range(this_dat.shape[1]): func = basic.ecdf_func(this_dat.iloc[:, i]) yi = func(xi)
bdat = obj.data mdat = process.m_from_beta(bdat) if qn_method is not None: mdat = transformations.quantile_normalisation(mdat, method=qn_method) # PCA plot (by batch and cell type) colour_subgroups = obj.meta.batch c_sub_sorted = sorted(colour_subgroups.unique(), key=lambda x: 'A' if x == 'This study' else x) cmap = collections.OrderedDict( zip( c_sub_sorted, common.get_best_cmap(len(c_sub_sorted)), )) m_subgroups = obj.meta.type subgroups_sorted = sorted(m_subgroups.unique(), key=lambda x: x[0] if x[0] != 'i' else x[1]) mmap = pd.Series(common.FILLED_MARKERS[len(m_subgroups.unique())], index=subgroups_sorted) fig = plt.figure(figsize=(6.4, 4.8)) ax = fig.add_subplot(111) p, ax = plot_pca(mdat, colour_subgroups, colour_map=cmap, marker_subgroups=m_subgroups, marker_map=mmap,
ntot = sum(n_by_patient.values()) # 1) Null: DMRs are picked uniformly randomly from the pool with variable marginal totals for each patient. # Marginal totals are given by the (real) number of DMRs in each patient. rvs = dict([(pid, [ np.random.choice(range(ntot), replace=False, size=n_by_patient[pid]) for i in range(n_iter) ]) for pid in pids]) inters_1 = [[ len(x) for x in setops.specific_features(*[rvs[pid][i] for pid in pids]) ] for i in range(n_iter)] inters_1 = dict(zip(pids, zip(*inters_1))) clist = common.get_best_cmap(len(pids)) patient_colours = dict(zip(pids, clist)) fig, axs = plt.subplots(nrows=len(pids), sharex=True, sharey=True) big_ax = fig.add_subplot(111, frameon=False) big_ax.tick_params(top='off', bottom='off', left='off', right='off', labelcolor='none') big_ax.grid(False) big_ax.set_ylabel('Density (a.u.)') for pid, ax in zip(pids, axs): sns.kdeplot(np.array(inters_1[pid]), color=patient_colours[pid],
the_dat.index[the_dat.index.str.contains(r'^NDST')].tolist(), 'Sulfotransferase': the_dat.index[the_dat.index.str.contains(r'^SULT')].tolist() + ['UST', 'CHSY3'], 'Glucuronosyltransferase': ['B3GAT1', 'B3GAT2', 'CHPF', 'CHPF2', 'CSGALNACT1'], 'Carbohydrate transferase': the_dat.index[the_dat.index.str.contains(r'^CHST')].tolist(), 'Dermatan sulfate epimerase': the_dat.index[the_dat.index.str.contains(r'^DSE')].tolist(), } gene_to_function = dictionary.complement_dictionary_of_iterables( function_to_gene, squeeze=True) function_colours = dict( zip(function_to_gene.keys(), common.get_best_cmap(len(function_to_gene)))) # reorder data the_dat = the_dat.loc[reduce(lambda x, y: x + y, function_to_gene.values())] row_colours = pd.DataFrame( [function_colours[gene_to_function[t]] for t in the_dat.index], index=the_dat.index, columns=['Function']) # standardise (Z) z = the_dat.subtract(the_dat.mean(axis=1), axis=0).divide(the_dat.std(axis=1), axis=0) cg = sns.clustermap(z, col_cluster=False,