Example #1
0
def compare_mad_genes(data, samples1, samples2=None, max_ng=10000, step=50):
    """
    Test the overlap of two sample sets in terms of the top genes drawn (by descending MAD).

    For every gene count ``n`` in ``step, 2 * step, ...`` up to ``max_ng``, the top ``n`` entries of each
    MAD ranking are compared and the size of their intersection is expressed as a percentage of the size
    of their union.

    :param data: Table supporting ``.loc[:, samples]``; presumably rows are genes and columns are samples.
    :param samples1: Column labels of the first sample set.
    :param samples2: Second sample set. If None, use all samples.
    :param max_ng: Largest number of top genes to test.
    :param step: Increment between successive numbers of top genes (also the first value tested).
    :return: Tuple ``(ng, iu_pct)``: the array of gene counts tested and the array of
        intersection-over-union percentages, one per gene count.
    """
    d1 = data.loc[:, samples1]
    d2 = data if samples2 is None else data.loc[:, samples2]

    mad1 = transformations.median_absolute_deviation(d1).sort_values(
        ascending=False)
    mad2 = transformations.median_absolute_deviation(d2).sort_values(
        ascending=False)

    ng = np.arange(step, max_ng + 1, step)

    iu_pct = []
    for i in ng:
        top1 = mad1.index[:i]
        top2 = mad2.index[:i]
        # the union must combine BOTH rankings; taking the union of one ranking with itself
        # would reduce the statistic to intersection / size of the second set
        n_union = top1.union(top2).size
        iu_pct.append(top1.intersection(top2).size / float(n_union) * 100)

    return ng, np.array(iu_pct)
def plot_clustermap(data, yugene=False, n_genes=N_GENES, yugene_resolve_ties=False, **kwargs):
    """
    Draw a clustermap restricted to the ``n_genes`` rows with the highest MAD.

    The column linkage is computed here (average linkage, correlation metric) on the same reduced rows
    that are plotted.

    :param data: Table whose rows are ranked by MAD (computed with ``axis=1``) and selected by label.
    :param yugene: If True, run ``process.yugene_transform`` on the data before anything else.
    :param n_genes: Number of top rows, by descending MAD, to keep.
    :param yugene_resolve_ties: Forwarded to the YuGene transform as ``resolve_ties``.
    :param kwargs: Passed to ``clustering.plot_clustermap``. ``cmap`` defaults to 'RdBu_r' if absent.
    :return: The object returned by ``clustering.plot_clustermap``; a ``row_index`` attribute is attached
        to it when the row dendrogram ordering is available.
    """
    if yugene:
        data = process.yugene_transform(data, resolve_ties=yugene_resolve_ties)

    if 'cmap' not in kwargs:
        kwargs['cmap'] = 'RdBu_r'

    ranked = transformations.median_absolute_deviation(data, axis=1).sort_values(ascending=False)
    top_mad = ranked.iloc[:n_genes].index
    subset = data.loc[top_mad]

    col_lkg = hierarchy.linkage(subset.transpose(), method='average', metric='correlation')
    cg = clustering.plot_clustermap(subset, col_linkage=col_lkg, **kwargs)

    # vertical column labels need extra room at the bottom of the figure
    plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90)
    cg.gs.update(bottom=0.2)

    # Expose the plotted row order for callers. This is best-effort: some kwargs may mean that no row
    # dendrogram has been computed, in which case the attribute is simply not set.
    try:
        cg.row_index = top_mad[cg.dendrogram_row.reordered_ind]
    except Exception:
        pass

    return cg
Example #3
0
def plot_correlation_clustermap(data,
                                row_colors=None,
                                n_gene=None,
                                method='average'):
    """
    Plot a clustermap of ``1 - data.corr()``, using one shared linkage for rows and columns.

    :param data: DataFrame whose columns are correlated pairwise.
    :param row_colors: Passed to ``sns.clustermap`` as both ``row_colors`` and ``col_colors``.
    :param n_gene: If supplied, this is the number of genes to use, ordered by descending MAD
    :param method: Linkage method handed to ``hc.linkage``.
    :return: The object returned by ``sns.clustermap``.
    """
    if n_gene is not None:
        # keep only the requested number of rows, ranked by MAD
        ranked = transformations.median_absolute_deviation(data).sort_values(
            ascending=False)
        data = data.loc[ranked.index[:n_gene]]

    one_minus_corr = 1. - data.corr()
    # NOTE(review): the square matrix is handed to linkage as-is; scipy treats 2-D input as
    # observation vectors rather than as distances - confirm this is intended.
    lkg = hc.linkage(one_minus_corr, method=method)

    cg = sns.clustermap(one_minus_corr,
                        cmap='RdBu_r',
                        row_colors=row_colors,
                        col_colors=row_colors,
                        row_linkage=lkg,
                        col_linkage=lkg)

    heatmap_ax = cg.ax_heatmap
    plt.setp(heatmap_ax.get_xticklabels(), rotation=90, fontsize=14)
    plt.setp(heatmap_ax.get_yticklabels(), rotation=0, fontsize=14)
    # move the margins so that the axis tick labels fit
    cg.gs.update(bottom=0.2, right=0.8, top=0.99, left=0.01)
    return cg
def cluster_data_with_threshold(data,
                                min_val=None,
                                n=None,
                                mad=None,
                                min_over=2,
                                transform=None,
                                **kwargs):
    """
    Filter, optionally transform and reduce the data, then draw a clustermap of what is left.

    :param data: Table with one row per feature; rows are filtered and selected by label.
    :param min_val: If supplied (together with ``min_over``), keep only rows in which more than
        ``min_over`` values exceed ``min_val``.
    :param n: If supplied, keep only the top ``n`` rows by descending MAD.
    :param mad: Optional pre-computed MAD values indexed like the rows of ``data``. It must contain
        every row that survives filtering; it may contain additional rows, which are ignored when
        ranking. Only used when ``n`` is supplied.
    :param min_over: See ``min_val``.
    :param transform: Optional callable applied to the filtered data before the MAD ranking.
    :param kwargs: Passed to ``clustering.plot_clustermap``.
    :return: Tuple ``(cm, mad)``: the clustermap object and the MAD values sorted in descending order
        (or ``mad`` exactly as supplied if ``n`` is None).
    :raises AttributeError: If a pre-computed MAD does not cover all rows remaining after filtering.
    """
    if min_val is not None and min_over is not None:
        idx = (data > min_val).sum(axis=1) > min_over
        data = data.loc[idx]

    if transform is not None:
        data = transform(data)

    if n is not None:
        if mad is None:
            mad = transformations.median_absolute_deviation(data).sort_values(
                ascending=False)
        else:
            mad = mad.sort_values(ascending=False)
            if len(mad.index.intersection(data.index)) != data.shape[0]:
                raise AttributeError(
                    "If a pre-computed MAD is supplied, it must contain all required entries"
                )

        # A pre-computed MAD may still list rows that the threshold filter removed. Rank only the
        # rows that are actually present, otherwise the selection asks for labels missing from data.
        ranked = mad.index[mad.index.isin(data.index)]
        data = data.loc[ranked[:n]]

    cm = clustering.plot_clustermap(data,
                                    cmap='RdBu_r',
                                    metric='correlation',
                                    **kwargs)
    cm.gs.update(bottom=0.2)
    return cm, mad
Example #5
0
def hc_plot_dendrogram(data,
                       row_colours,
                       mad=None,
                       n_ftr=3000,
                       metric='correlation',
                       **kwargs):
    """
    Plot a single dendrogram from hierarchical clustering of the top ``n_ftr`` features by MAD.

    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional pre-computed MAD values. They are used in the order given (no re-sorting), so
        they should already be in descending order. Computed and sorted here if None.
    :param n_ftr: Number of leading entries of the MAD ranking to keep.
    :param metric: Passed to ``clustering.dendrogram_with_colours``.
    :param kwargs: Passed to ``clustering.dendrogram_with_colours``. ``fig_kws`` defaults to a
        figure size of (5.5, 10) if absent.
    :return: Whatever ``clustering.dendrogram_with_colours`` returns.
    """
    kwargs.setdefault('fig_kws', {'figsize': (5.5, 10)})

    if mad is None:
        mad = transformations.median_absolute_deviation(data).sort_values(
            ascending=False)

    top_ftr = mad.index[:n_ftr]
    return clustering.dendrogram_with_colours(data.loc[top_ftr],
                                              row_colours,
                                              vertical=False,
                                              metric=metric,
                                              **kwargs)
def plot_dendrogram(obj_arr,
                    n_by_mad=None,
                    qn_method=None,
                    eps=0.01,
                    min_val=1,
                    n_above_min=3,
                    vertical=False,
                    figsize=(7, 8),
                    **kwargs):
    """
    Combine one or more loaders, filter and log2-transform their data, and plot a coloured dendrogram.

    :param obj_arr: Sequence of loader objects. More than one are merged with
        ``loader.MultipleBatchLoader``; a single one is used directly.
    :param n_by_mad: If supplied, keep only this many rows, ordered by descending MAD.
    :param qn_method: If supplied, quantile normalise the log data using this method.
    :param eps: Offset added to the data before taking log2.
    :param min_val: Forwarded to ``filter_loader``.
    :param n_above_min: Forwarded to ``filter_loader``.
    :param vertical: Forwarded to ``clustering.dendrogram_with_colours``.
    :param figsize: Figure size, supplied to the dendrogram routine inside ``fig_kws``.
    :param kwargs: Passed to ``clustering.dendrogram_with_colours``.
    :return: Whatever ``clustering.dendrogram_with_colours`` returns.
    """
    if len(obj_arr) > 1:
        combined = loader.MultipleBatchLoader(obj_arr)
    else:
        combined = obj_arr[0]
    combined = filter_loader(combined, min_val=min_val, n_above_min=n_above_min)

    log_dat = np.log2(combined.data + eps)
    if qn_method is not None:
        log_dat = transformations.quantile_normalisation(log_dat, method=qn_method)

    if n_by_mad is not None:
        ranked = transformations.median_absolute_deviation(log_dat).sort_values(
            ascending=False)
        log_dat = log_dat.loc[ranked.index[:n_by_mad]]

    # only the colour array and the legend are needed; the middle return value is unused
    colours, _, legend = construct_colour_array_legend_studies(combined.meta)

    return clustering.dendrogram_with_colours(log_dat,
                                              colours,
                                              vertical=vertical,
                                              legend_labels=legend,
                                              fig_kws={'figsize': figsize},
                                              **kwargs)
def hc_plot_dendrogram_vary_n_gene(data,
                                   row_colours,
                                   mad=None,
                                   n_ftr=(1000, 2000, 3000, 5000, 10000),
                                   metric='correlation'):
    """
    For each value in n_ftr, plot a dendrogram showing the result of hierarchical clustering of the data
    using that many features (selected in descending MAD order)
    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional pre-computed MAD values, used in the order given (no re-sorting). Computed and
        sorted in descending order here if None.
    :param n_ftr: The feature counts to test
    :param metric: Passed to the dendrogram routine
    :return: Dictionary keyed by feature count, holding the dendrogram routine's output for each
    """
    if mad is None:
        mad = transformations.median_absolute_deviation(data).sort_values(
            ascending=False)

    return {
        ng: clustering.dendrogram_with_colours(data.loc[mad.index[:ng]],
                                               row_colours,
                                               fig_kws={'figsize': (5.5, 10)},
                                               vertical=False,
                                               metric=metric)
        for ng in n_ftr
    }
Example #8
0
def plot_correlation_clustermap(data,
                                row_colors=None,
                                n_gene=None,
                                method='average',
                                metric='correlation',
                                distance=None,
                                **kwargs):
    """
    Plot a clustermap of the pairwise relationship between the columns of the data.

    :param data: DataFrame; its columns label the rows and columns of the plotted matrix.
    :param row_colors: Passed to seaborn's `clustermap` as both ``row_colors`` and ``col_colors``.
    :param n_gene: If supplied, this is the number of genes to use, ordered by descending MAD
    :param method: Linkage method. Used whether the distances are computed here or supplied directly.
    :param metric: Distance metric used when ``distance`` is not supplied. If it equals 'correlation',
        the plotted values are ``1 - distance`` (this also applies to a supplied ``distance``).
    :param distance: Optional pre-computed distances in condensed (vector) form, ordered like the
        columns of ``data``. If None, distances are computed between the columns of ``data``.
    :param kwargs: Passed to seaborn's `clustermap`
    :return: The object returned by seaborn's `clustermap`.
    """
    if n_gene is not None:
        # reduce data to the specified number using MAD
        mad = transformations.median_absolute_deviation(data).sort_values(
            ascending=False)
        data = data.loc[mad.index[:n_gene]]

    if distance is not None:
        # honour the requested linkage method here too; without it scipy falls back to its own
        # default ('single') regardless of what the caller asked for
        rl = hc.linkage(distance, method=method)
        sq = hc.distance.squareform(distance)
    else:
        samples = data.transpose()
        rl = hc.linkage(samples, method=method, metric=metric)
        sq = hc.distance.squareform(hc.distance.pdist(samples, metric=metric))

    # invert distance so that closer samples have a larger number
    # do this even if distances have been provided directly
    if metric == 'correlation':
        sq = 1 - sq
    # else:
    # TODO: add specific versions for other metrics if required
    # sq = max(sq.flat) - sq

    # make a dataframe for clustering so that the plot has correct labels
    sq = pd.DataFrame(data=sq, index=data.columns, columns=data.columns)

    cg = sns.clustermap(sq,
                        cmap='RdBu_r',
                        row_colors=row_colors,
                        col_colors=row_colors,
                        row_linkage=rl,
                        col_linkage=rl,
                        **kwargs)
    plt.setp(cg.ax_heatmap.get_xticklabels(), rotation=90, fontsize=14)
    plt.setp(cg.ax_heatmap.get_yticklabels(), rotation=0, fontsize=14)
    # shift the margins a bit to fit axis tick labels
    cg.gs.update(bottom=0.2, right=0.8, top=0.99, left=0.01)
    return cg
Example #9
0
def compute_pairwise_corr(cell_line_dat, tissue_dat, n_genes=None):
    """
    Correlate every cell line column with every tissue column.

    :param cell_line_dat: pd DataFrame containing the cell line data
    :param tissue_dat: pd DataFrame containing the tissue data
    :param n_genes: If supplied, this is an integer. Rows are ranked by MAD, computed over both tables
        joined column-wise, and only the top ``n_genes`` rows are used.
    :return: DataFrame with one row per tissue column and one column per cell line column
    """
    if n_genes is not None:
        combined = pd.concat((cell_line_dat, tissue_dat), axis=1)
        ranked = transformations.median_absolute_deviation(combined).sort_values(ascending=False)
        top = ranked.index[:n_genes]
        cell_line_dat = cell_line_dat.loc[top]
        tissue_dat = tissue_dat.loc[top]

    def corr_with_tissue(cell_line_col):
        # one cell line column against all tissue columns
        return tissue_dat.corrwith(cell_line_col)

    return cell_line_dat.apply(corr_with_tissue, axis=0)
Example #10
0
    def pwise_corr_boxplot(dat, n_gene=None):
        """
        Boxplot of the pairwise correlations between the 'eNSC.med' samples and three other groups.

        :param dat: DataFrame whose column labels are matched against the group patterns.
        :param n_gene: If supplied, only the top ``n_gene`` rows by descending MAD are used.
        :return: The axes holding the boxplot.
        """
        if n_gene is not None:
            ranked = transformations.median_absolute_deviation(dat).sort_values(
                ascending=False)
            dat = dat.loc[ranked.index[:n_gene]]

        the_corr = dat.corr()

        # rows are always the reference group; only the column selection varies per box
        ref_rows = the_corr.index.str.contains(r'eNSC.med')
        pwise_corr = [
            the_corr.loc[ref_rows, the_corr.columns.str.contains(patt)].values.flatten()
            for patt in (r'eNSC.mouse', r'mDura.*mouse', r'mDura.*human')
        ]

        ax = plt.figure().add_subplot(111)
        ax.boxplot(pwise_corr)
        ax.set_xticklabels([
            'eNSCmed - eNSCmouse', 'eNSCmed - iNSCmouse', 'eNSCmed - iNSChuman'
        ])
        ax.set_ylabel('pairwise correlation')

        return ax
Example #11
0
    label_symbols = hkg + ['BMI1']
    label_ens = gene_symbol_to_ensembl(label_symbols)

    hkg_dat = dat_n.loc[hkg_ens, sorted(dat_n.columns)]
    hkg_dat.index = pd.Index(hkg, name='')
    hkg_dat_rel = hkg_dat.divide(hkg_dat.loc[:, ref], axis=0)
    ax = hkg_dat_rel.transpose().plot.bar()
    ax.set_ylim([0, 3.4])
    plt.tight_layout()
    ax.figure.savefig(os.path.join(outdir, 'housekeeping_levels.png'), dpi=200)

    # identifying stable HKG
    ranked_count = pd.Series(rankdata(median_count, method='ordinal'),
                             index=median_count.index)
    ranked_perc = ranked_count / float(ranked_count.shape[0])
    mad = transformations.median_absolute_deviation(dat_n)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(ranked_perc, np.log10(median_count))
    ax.scatter(ranked_perc.loc[label_ens],
               np.log10(median_count.loc[label_ens]),
               c='r')
    for g, e in zip(label_symbols, label_ens):
        ax.text(ranked_perc.loc[e], np.log10(median_count.loc[e]), g)
    ax.set_xlabel("Abundance percentile")
    ax.set_ylabel("Log10 normalised abundance")
    ax.figure.savefig(os.path.join(outdir, 'hkg_abundance.png'), dpi=200)

    # show the total variation using fill_between
    min_count = dat_n.min(axis=1)
        min_n_samples=1)
    matched_log_cpm = log_cpm(matched_data)
    row_colours = pd.DataFrame(common.COLOUR_BREWERS[2][0],
                               index=matched_data.columns,
                               columns=['Library'])
    row_colours.loc[row_colours.index.str.contains(
        'smartseq')] = common.COLOUR_BREWERS[2][1]

    # clustering plot
    cg = clustering.plot_correlation_clustermap(matched_log_cpm,
                                                row_colors=row_colours)
    cg.gs.update(bottom=0.35, right=0.65)
    cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_all_genes.png"),
               dpi=200)

    mad = transformations.median_absolute_deviation(
        matched_log_cpm).sort_values(ascending=False)
    cg = clustering.plot_correlation_clustermap(
        matched_log_cpm.loc[mad.index[:3000]], row_colors=row_colours)
    cg.gs.update(bottom=0.35, right=0.65)
    cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_3000_genes.png"),
               dpi=200)

    # repeat with TMM norming
    matched_log_cpm_n = transformations.edger_tmm_normalisation_cpm(
        matched_data)

    cg = clustering.plot_correlation_clustermap(matched_log_cpm_n,
                                                row_colors=row_colours)
    cg.gs.update(bottom=0.35, right=0.65)
    cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_all_genes_tmm.png"),
               dpi=200)
def plot_clustermap(obj,
                    quantile_norm,
                    method='average',
                    metric='correlation',
                    n_gene_by_mad=5000,
                    n_gene_for_heatmap=500,
                    fmin=0.05,
                    fmax=0.95,
                    eps=0.01,
                    cell_line_colours=None):
    """
    Plot a clustermap of log2 data with columns coloured by cell type and study, plus a custom legend.

    The column linkage is computed on the top ``n_gene_by_mad`` rows by MAD, while the heatmap itself
    only shows the top ``n_gene_for_heatmap`` rows.

    :param obj: Loader-like object exposing ``data`` and ``meta``; ``meta`` is read for its ``type`` and
        ``batch`` columns.
    :param quantile_norm: If not None, the quantile normalisation method applied to the log2 data.
    :param method: Linkage method for the column clustering.
    :param metric: Distance metric for the column clustering.
    :param n_gene_by_mad: Number of rows (by descending MAD) used to compute the column linkage.
    :param n_gene_for_heatmap: Number of rows (by descending MAD) shown in the heatmap.
    :param fmin: Fraction of the sorted heatmap values locating the lower colour limit (0.5 is then
        subtracted).
    :param fmax: Fraction of the sorted heatmap values locating the upper colour limit (0.5 is then
        added).
    :param eps: Offset added to the data before taking log2.
    :param cell_line_colours: Dictionary mapping cell type label to colour. Must contain the key
        'iPSC (this study)'. A built-in palette is used if None.
    :return: The clustermap object returned by ``clustering.plot_clustermap``.
    """
    if cell_line_colours is None:
        cell_line_colours = {
            'FB': '#fff89e',  # yellow
            'GBM (this study)': '#e6e6e6',  # light gray
            'GBM': '#4d4d4d',  # dark grey
            'ESC': '#ff7777',  # light red
            'iPSC': '#990000',  # dark red
            'iPSC (this study)': '#fdc086',  # orange
            'NSC': '#006600',  # dark green
            'iNSC (this study)': '#7fc97f',  # green
        }

    the_dat = np.log2(obj.data + eps)

    if quantile_norm is not None:
        the_dat = transformations.quantile_normalisation(the_dat,
                                                         method=quantile_norm)
    the_mad = transformations.median_absolute_deviation(the_dat).sort_values(
        ascending=False)
    cc, st, leg_dict = construct_colour_array_legend_studies(obj.meta)

    # linkage: computed on the larger gene set, before the data are cut down for display
    lkg = hc.linkage(
        the_dat.loc[the_mad.index[:n_gene_by_mad]].transpose(),
        method=method,
        metric=metric,
    )

    # ref line colours
    for k, v in cell_line_colours.items():
        cc.loc[obj.meta.type == k, 'Cell type'] = v
    # our line colours
    cc.loc[obj.meta.batch.str.contains('wtchg') & (obj.meta.type == 'iPSC'), 'Cell type'] = \
        cell_line_colours['iPSC (this study)']

    # reduce to the rows shown in the heatmap
    the_dat = the_dat.loc[the_mad.index[:n_gene_for_heatmap]]

    # get appropriate clims from the displayed values; indices are clamped so that a fraction of 1.0
    # picks the largest value instead of running off the end of the array
    the_dat_flat = np.sort(the_dat.values.flatten())
    n_val = len(the_dat_flat)
    vmin = the_dat_flat[min(int(n_val * fmin), n_val - 1)] - 0.5
    vmax = the_dat_flat[min(int(n_val * fmax), n_val - 1)] + 0.5

    gc = clustering.plot_clustermap(
        the_dat,
        cmap='RdBu_r',
        col_linkage=lkg,
        col_colors=cc,
        vmin=vmin,
        vmax=vmax,
    )

    leg_entry = {
        'class': 'patch',
        'edgecolor': 'k',
        'linewidth': 1.,
    }
    leg_dict2 = collections.OrderedDict()
    leg_dict2['Cell type'] = collections.OrderedDict()

    # only list cell types that occur in the metadata; '(this study)' variants count as their base type
    types_present = obj.meta.type.unique()
    for k in sorted(cell_line_colours):
        if k.replace(' (this study)', '') in types_present:
            leg_dict2['Cell type'][k] = dict(leg_entry)
            leg_dict2['Cell type'][k].update(
                {'facecolor': cell_line_colours[k]})

    leg_dict2['Study'] = {}
    for k, v in leg_dict['Study'].items():
        leg_dict2['Study'][k] = dict(leg_entry)
        leg_dict2['Study'][k].update({'facecolor': v})

    common.add_custom_legend(gc.ax_heatmap,
                             leg_dict2,
                             loc_outside=True,
                             fontsize=14)
    format_clustermap(gc)

    return gc
        5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200,
        250, 300, 400, 500, 750, 1000
    ]
    nrem = [(gene_idx & ((obj.data > t).sum(axis=1) > 10)).sum() for t in co]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(co, nrem)
    ax.set_xlabel("10 / 21 samples must have count higher than x")
    ax.set_ylabel("Number of genes remaining")
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "filtering_effect.png"), dpi=200)
    fig.savefig(os.path.join(outdir, "filtering_effect.pdf"))

    idx = gene_idx & ((obj.data > 10).sum(axis=1) > 10)
    data = obj.data.loc[idx]
    mad = transformations.median_absolute_deviation(data).sort_values(
        ascending=False)
    logdata = np.log(data + 1)

    # start with a dendrogram
    col_colours = clustering.generate_colour_map_dict(obj.meta,
                                                      'sample',
                                                      matches,
                                                      label='Patient',
                                                      non_matching='gray')
    out = clustering.dendrogram_with_colours(logdata,
                                             col_colours=col_colours,
                                             vertical=False)
    dist = clustering.dendrogram_threshold_by_nclust(out['linkage'], 3)
    out['dendrogram_ax'].axvline(dist, ls='--', c='gray')
    out['fig'].savefig(os.path.join(outdir, "dendrogram_all_genes.png"),
                       dpi=200)
    abg = pd.concat((patient_data_by_gene, ref_by_gene), axis=1)

    # discard mitochondrial genes
    if remove_mt:
        idx = ~abg.index.isin(mt_ensg)
        abg = abg.loc[idx]
        # renorm
        if units == 'tpm':
            abg = abg.divide(abg.sum(), axis=1) * 1e6

    # discard genes expressed at low values
    idx = (abg > min_val).sum(axis=1) > min_n
    abg = abg.loc[idx]

    abg_log = np.log2(abg + eps)
    amad_log = transformations.median_absolute_deviation(abg_log).sort_values(
        ascending=False)

    if units == 'estimated_counts':
        # optionally could normalise here?
        pass

    row_colours_all = pd.DataFrame('gray', index=abg.columns, columns=[''])
    row_colours_all.loc[row_colours_all.index.str.contains(r'NSC')] = 'blue'
    row_colours_all.loc[row_colours_all.index.str.contains(r'NPC')] = 'blue'
    row_colours_all.loc[row_colours_all.index.str.contains(
        r'GIBCO')] = '#96daff'
    row_colours_all.loc[row_colours_all.index.str.contains(
        r'Fibroblast')] = '#fff89e'
    row_colours_all.loc[row_colours_all.index.str.contains(
        r'Fetal')] = 'yellow'
    row_colours_all.loc[row_colours_all.index.str.contains('ES1')] = '#ff7777'
Example #16
0
    meta.loc[:, 'study'] = STUDY
    data = data.loc[:, meta.index]
    if 'EYS' in data.index:
        idx = data.index.str.replace('EYS', 'EGFL11')
        data.index = idx

    # Kool
    # STUDY = 'Kool'
    # data, meta = microarray_data.load_annotated_microarray_gse10327(aggr_field='SYMBOL', aggr_method='max_std')

    # Northcott
    # STUDY = 'Northcott'
    # data, meta = microarray_data.load_annotated_microarray_gse37382(aggr_field='SYMBOL', aggr_method='max_std')

    # find top genes by MAD - all genes included
    mad = transformations.median_absolute_deviation(data, axis=1)
    top_genes = mad.sort_values(ascending=False).index[:n_genes]
    print "Selecting top %d genes by MAD from %s study..." % (n_genes, STUDY)
    print "%d / %d genes (nanostring)" % (len(
        top_genes.intersection(all_nstring)), len(all_nstring))
    print "%d / %d genes (northcott)" % (len(
        top_genes.intersection(all_ncott)), len(all_ncott))

    # Zhao data
    zhao_sample_names = (
        'Pt1299',
        'Pt1487',
        'Pt1595',
        'ICb1299-III',
        'ICb1299-IV',
        'ICb1487-I',
    nz_idx = (
        ((data_nsc < MIN_COUNT).sum(axis=1) < MAX_BELOW) | ((data_nsc > HIGH_COUNT).any(axis=1))
    )
    data_nsc_nz = data_nsc.loc[nz_idx, :]

    # yugene
    data_nsc_nz_yg = process.yugene_transform(data_nsc_nz)

    # add one, norm, take log
    data_nsc_nz += 1
    data_nsc_nz = data_nsc_nz.divide(data_nsc_nz.sum(axis=0), axis=1)
    data_nsc_nz = np.log(data_nsc_nz + 1)

    # MAD - compute on normalised values
    mad_nsc_nz = transformations.median_absolute_deviation(data_nsc_nz).sort_values(ascending=False)
    top_idx = mad_nsc_nz.index[:N_GENES]
    rem_idx = mad_nsc_nz.index[N_GENES:]

    # reduce number of remainder for plotting purposes
    to_discard = rem_idx[np.random.permutation(rem_idx.size)[N_GENES:]]
    data_nsc_nz = data_nsc_nz.drop(to_discard)

    # add 'hue' column
    data_nsc_nz.loc[:, 'hue'] = 'Remainder'
    data_nsc_nz.loc[top_idx, 'hue'] = 'Top %d by MAD' % N_GENES

    # generate the plot
    # pg = sns.pairplot(data_nsc_nz, hue='hue')

    # repeat on YuGene
    the_dat = mdat

    plt_dict = hc_plot_dendrogram_vary_n_gene(the_dat,
                                              row_colours_all,
                                              n_ftr=[clust_n_ftr])
    for ng, x in plt_dict.items():
        fname = "dendrogram_M_corr_top%d_by_mad.{ext}" % ng
        x['fig'].savefig(os.path.join(outdir, fname.format(ext='png')),
                         dpi=200)
        x['fig'].savefig(os.path.join(outdir, fname.format(ext='tiff')),
                         dpi=200)

    # heatmap: use clustering from n=20000 probes (M vals), but show fewer probes values

    # pick these using MAD
    this_mad = transformations.median_absolute_deviation(mdat).sort_values(
        ascending=False)
    this_dat = mdat.loc[this_mad.index[:n_probe_to_show]]

    leg_entry = {
        'class': 'patch',
        'edgecolor': 'k',
        'linewidth': 1.,
    }

    lkg = plt_dict[clust_n_ftr]['linkage']
    leg_dict = collections.OrderedDict()
    for k in sorted(cell_line_colours):
        if cell_line_colours[k] in row_colours_all.values:
            leg_dict[k] = dict(leg_entry)
            leg_dict[k].update({'facecolor': cell_line_colours[k]})
Example #19
0
                                                      non_matching='gray')

    # filter
    data = filter.filter_by_cpm(data,
                                min_cpm=1,
                                min_n_samples=3,
                                unless_cpm_gt=10)
    # data = filter.filter_by_cpm(data, min_cpm=1, min_n_samples=3, unless_cpm_gt=None)

    # normalise by read count
    cpm = (data + 1).divide((data + 1).sum(axis=0), axis=1) * 1e6

    # transform
    log_data = np.log2(cpm)
    vst_data = variance_stabilizing_transform(cpm)
    mad_log_srt = median_absolute_deviation(log_data).sort_values(
        ascending=False)
    mad_vst_srt = median_absolute_deviation(vst_data).sort_values(
        ascending=False)

    for NGENE in [500, 1000, 1500, 2000, 2500]:
        out = clustering.dendrogram_with_colours(
            log_data.loc[mad_log_srt.index[:NGENE]],
            col_colours=col_colours,
            vertical=False,
            metric='correlation',
            method='average',
        )
        out['fig'].savefig(os.path.join(
            outdir,
            "gbm_nsc_correlation_dendrogram_logtransform_top%d.png" % NGENE),
                           dpi=200)
    for ng, x in plt_dict.items():
        if ng > the_dat.shape[0]:
            fname = "dendrogram_M_corr_all.{ext}"
        else:
            fname = "dendrogram_M_corr_top%d_by_mad.{ext}" % ng
        x['fig'].savefig(os.path.join(outdir, fname.format(ext='png')),
                         dpi=200)
        x['fig'].savefig(os.path.join(outdir, fname.format(ext='tiff')),
                         dpi=200)
    plt.close('all')

    # heatmap: use clustering from n=20000 probes (M vals), but only show top 500 most variable between clusters
    clust_n_ftr = 20000
    n_probe_to_show = 500
    lkg = plt_dict[clust_n_ftr]['linkage']
    this_mad = transformations.median_absolute_deviation(mdat).sort_values(
        ascending=False)
    this_dat = mdat.loc[this_mad.index[:n_probe_to_show]]

    # heatmap for 3000 probes
    cm = clustering.plot_clustermap(this_dat,
                                    cmap='RdYlBu_r',
                                    metric='correlation',
                                    col_colors=row_colours_all,
                                    col_linkage=lkg,
                                    vmin=-10,
                                    vmax=10,
                                    figsize=(11.8, 10.))
    cm.gs.update(bottom=0.25, right=0.99)

    cm.savefig(os.path.join(
        outdir, "clustermap_M_corr_linkage%d_heatmap%d.png" %
Example #21
0
    idx = (pdbg > min_val).sum(axis=1) > min_n
    pdbg = pdbg.loc[idx]

    if units == 'estimated_counts':
        # here we can normalise by library size if desired
        pass

    ax = hist_logvalues(patient_data, thresholds=[min_val])
    ax.figure.savefig(os.path.join(
        outdir, "log2_intensities_by_gene_with_min_tpm_threshold.png"),
                      dpi=200)
    ax.figure.savefig(
        os.path.join(outdir,
                     "log2_intensities_by_gene_with_min_tpm_threshold.pdf"))

    mad = transformations.median_absolute_deviation(pdbg).sort_values(
        ascending=False)
    pdbg_log = np.log2(pdbg + eps)
    mad_log = transformations.median_absolute_deviation(pdbg_log).sort_values(
        ascending=False)
    row_colours = pd.DataFrame('gray', index=pdbg_log.columns, columns=[''])
    row_colours.loc[row_colours.index.str.contains('IPSC')] = '#fdc086'
    row_colours.loc[row_colours.index.str.contains(
        r'DURA[0-9]*_NSC')] = '#7fc97f'
    row_colours.loc[row_colours.index.str.contains('GIBCO')] = '#96daff'

    for n_t in n_gene_try:
        fname = "clustering_by_gene_corr_top%d_by_mad.{ext}" % n_t
        fname_log = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t

        d = clustering.dendrogram_with_colours(
            pdbg_log.loc[mad_log.index[:n_t]],