Python quantile_normalisationの例、stats.transformations.quantile_normalisation Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cluster_lines_rnaseq.py プロジェクト: gaberosser/qmul-bioinf

def plot_dendrogram(obj_arr,
                    n_by_mad=None,
                    qn_method=None,
                    eps=0.01,
                    min_val=1,
                    n_above_min=3,
                    vertical=False,
                    figsize=(7, 8),
                    **kwargs):
    """
    Draw a colour-annotated dendrogram from one or more data loaders.

    :param obj_arr: Sequence of loader objects. One element is used as-is;
        several are combined through ``loader.MultipleBatchLoader``.
    :param n_by_mad: If not None, keep only the first ``n_by_mad`` rows after
        ranking by median absolute deviation (largest first).
    :param qn_method: If not None, forwarded as ``method`` to
        ``transformations.quantile_normalisation``; None skips normalisation.
    :param eps: Offset added to the data before taking log2.
    :param min_val: Forwarded to ``filter_loader``.
    :param n_above_min: Forwarded to ``filter_loader``.
    :param vertical: Forwarded to ``clustering.dendrogram_with_colours``.
    :param figsize: Figure size, passed on as ``fig_kws={'figsize': figsize}``.
    :param kwargs: Any further keyword arguments for
        ``clustering.dendrogram_with_colours``.
    :return: Whatever ``clustering.dendrogram_with_colours`` returns.
    """
    # A lone loader is taken directly; multiple loaders are merged first.
    if len(obj_arr) <= 1:
        source = obj_arr[0]
    else:
        source = loader.MultipleBatchLoader(obj_arr)
    source = filter_loader(source, min_val=min_val, n_above_min=n_above_min)

    # eps is added before the log transform (it keeps zero entries finite).
    log_expr = np.log2(source.data + eps)
    if qn_method is not None:
        log_expr = transformations.quantile_normalisation(
            log_expr, method=qn_method)

    if n_by_mad is not None:
        # Rank rows by MAD, most variable first, and keep the leading ones.
        ranked_mad = transformations.median_absolute_deviation(log_expr)
        ranked_mad = ranked_mad.sort_values(ascending=False)
        top_rows = ranked_mad.index[:n_by_mad]
        log_expr = log_expr.loc[top_rows]

    # The middle element of the returned triple is not needed here.
    colours, _, legend = construct_colour_array_legend_studies(source.meta)

    return clustering.dendrogram_with_colours(
        log_expr,
        colours,
        vertical=vertical,
        legend_labels=legend,
        fig_kws={'figsize': figsize},
        **kwargs
    )

コード例 #2

0

ファイルを表示

ファイル: cluster_lines_methylation.py プロジェクト: gaberosser/qmul-bioinf

    # obj.filter_samples(ix)
    #
    # ix = obj.meta.type != 'iAPC'
    # obj.filter_samples(ix)
    #
    # ix = ~obj.meta.index.str.contains('GBM')
    # obj.filter_samples(ix)

    # ix = obj.meta.index != 'H9 NPC (Encode EPIC)'
    # obj.filter_samples(ix)

    bdat = obj.data
    mdat = process.m_from_beta(bdat)

    if qn_method is not None:
        mdat = transformations.quantile_normalisation(mdat, method=qn_method)

    # tidy up batch IDs
    obj.meta.loc[obj.meta.batch.isnull(),
                 'batch'] = obj.meta.loc[obj.meta.batch.isnull(), 'batch_1']
    obj.meta.batch = obj.meta.batch.str.replace('2016-12-19_ucl_genomics',
                                                '2016-12-19')

    # the only batch names without letters are ours
    obj.meta.loc[~obj.meta.batch.str.contains(r'[A-Z]'),
                 'batch'] = 'This study'

    # PCA plot (by batch and cell type)
    colour_subgroups = obj.meta.batch
    c_sub_sorted = sorted(colour_subgroups.unique(),
                          key=lambda x: 'A' if x == 'This study' else x)

コード例 #3

0

ファイルを表示

ファイル: ffpe_cell_culture_correlation.py プロジェクト: gaberosser/qmul-bioinf

    rna_cc_dat = filter.filter_by_cpm(rna_cc_dat,
                                      min_cpm=min_tpm,
                                      min_n_samples=2)

    # reduce to matching probes
    probes = rna_cc_dat.index.intersection(rna_ff_dat.index)

    if remove_mt:
        probes = probes[~probes.isin(mt_ens)]

    rna_ff_dat = np.log2(rna_ff_dat.loc[probes] + eps)
    rna_cc_dat = np.log2(rna_cc_dat.loc[probes] + eps)

    # QN
    if apply_qn:
        rna_ff_dat = transformations.quantile_normalisation(rna_ff_dat)
        rna_cc_dat = transformations.quantile_normalisation(rna_cc_dat)

    # correlation plot
    pdist = pd.DataFrame(index=rna_ff_dat.columns.sort_values(),
                         columns=rna_cc_dat.columns.sort_values(),
                         dtype=float)
    for ff in pdist.index:
        for cc in pdist.columns:
            if dist_metric == 'pearson':
                pdist.loc[ff, cc] = stats.pearsonr(rna_ff_dat[ff],
                                                   rna_cc_dat[cc])[0]
            elif dist_metric == 'spearman':
                pdist.loc[ff, cc] = stats.spearmanr(rna_ff_dat[ff],
                                                    rna_cc_dat[cc]).correlation
            else:

コード例 #4

0

ファイルを表示

ファイル: hgic_methylation_with_references.py プロジェクト: gaberosser/qmul-bioinf

            if labels_included[csg]:
                lbl = None
            else:
                lbl = csg
                labels_included[csg] = True
            ax.plot(xi, yi, c=colour_map[csg], label=lbl)

        ax.set_xlabel('M value')
        ax.set_ylabel('ECDF')
        ax.set_title(ct)
        common.legend_outside_axes(ax)
        fig.subplots_adjust(left=0.1, bottom=0.1, right=0.8, top=0.99)
        fig.savefig(os.path.join(outdir, "ecdf_batches_%s.png" % ct), dpi=200)

    # normalise and repeat PCA
    mdat_qn = transformations.quantile_normalisation(mdat)
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111)
    p_qn, ax = plot_pca(mdat_qn,
                        colour_subgroups,
                        marker_subgroups=m_subgroups,
                        marker_map=mmap,
                        ax=ax)
    ax.figure.subplots_adjust(left=0.1, right=0.8)
    ax.figure.savefig(os.path.join(outdir,
                                   "pca_plot_batch_cell_type_all_qn.png"),
                      dpi=200)

    # now try without GBM

    # (a) no QN

コード例 #5

0

ファイルを表示

ファイル: cluster_lines_rnaseq.py プロジェクト: gaberosser/qmul-bioinf

    obj1.filter_samples(ix)

    dend = plot_dendrogram([obj1, ref_obj, nsc_ref_obj],
                           vertical=False,
                           figsize=(7, 14),
                           qn_method=quantile_norm,
                           n_by_mad=n_gene_by_mad)
    dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc_fb_nsc.png"),
                        dpi=200)

    # 1b. Heatmap from clustering result of (1a)
    n_for_heatmap = 500
    the_obj = loader.MultipleBatchLoader([obj1, ref_obj, nsc_ref_obj])
    the_dat = np.log2(the_obj.data + eps)
    if quantile_norm is not None:
        the_dat = transformations.quantile_normalisation(the_dat,
                                                         method=quantile_norm)
    the_mad = transformations.median_absolute_deviation(the_dat).sort_values(
        ascending=False)
    cc, st, leg_dict = construct_colour_array_legend_studies(the_obj.meta)

    # ref line colours
    for k, v in cell_line_colours.items():
        cc.loc[the_obj.meta.type == k, 'Cell type'] = v
    # our line colours
    cc.loc[the_obj.meta.batch.str.contains('wtchg') &
           (the_obj.meta.type == 'iNSC'),
           'Cell type'] = cell_line_colours['iNSC (this study)']
    cc.loc[the_obj.meta.batch.str.contains('wtchg') &
           (the_obj.meta.type == 'iPSC'),
           'Cell type'] = cell_line_colours['iPSC (this study)']

コード例 #6

0

ファイルを表示

ファイル: cluster_lines_rnaseq.py プロジェクト: gaberosser/qmul-bioinf

def plot_clustermap(obj,
                    quantile_norm,
                    method='average',
                    metric='correlation',
                    n_gene_by_mad=5000,
                    n_gene_for_heatmap=500,
                    fmin=0.05,
                    fmax=0.95,
                    eps=0.01,
                    cell_line_colours=None):
    """
    Plot a clustered heatmap of log2 data with per-sample colour annotations.

    Rows are ranked by median absolute deviation (MAD). The column linkage is
    computed from the top ``n_gene_by_mad`` rows, while the heatmap itself
    shows the top ``n_gene_for_heatmap`` rows.

    :param obj: Loader-like object exposing ``data`` and ``meta``; ``meta``
        must provide ``type`` and ``batch`` columns.
    :param quantile_norm: If not None, forwarded as ``method`` to
        ``transformations.quantile_normalisation``; None skips normalisation.
    :param method: Linkage method passed to ``hc.linkage``.
    :param metric: Distance metric passed to ``hc.linkage``.
    :param n_gene_by_mad: Number of top-MAD rows used for the linkage.
    :param n_gene_for_heatmap: Number of top-MAD rows drawn in the heatmap
        and used to derive the colour limits.
    :param fmin: Fraction (0-1) of the sorted heatmap values at which the
        lower colour limit is read; 0.5 is then subtracted.
    :param fmax: Fraction (0-1) of the sorted heatmap values at which the
        upper colour limit is read; 0.5 is then added.
    :param eps: Offset added to the data before taking log2.
    :param cell_line_colours: Mapping from cell type label to colour. A
        built-in palette is used when None. Must contain the key
        ``'iPSC (this study)'``.
    :return: The object returned by ``clustering.plot_clustermap``.
    """
    if cell_line_colours is None:
        cell_line_colours = {
            'FB': '#fff89e',  # yellow
            'GBM (this study)': '#e6e6e6',  # light gray
            'GBM': '#4d4d4d',  # dark grey
            'ESC': '#ff7777',  # light red
            'iPSC': '#990000',  # dark red
            'iPSC (this study)': '#fdc086',  # orange
            'NSC': '#006600',  # dark green
            'iNSC (this study)': '#7fc97f',  # green
        }

    the_dat = np.log2(obj.data + eps)

    if quantile_norm is not None:
        the_dat = transformations.quantile_normalisation(the_dat,
                                                         method=quantile_norm)
    the_mad = transformations.median_absolute_deviation(the_dat).sort_values(
        ascending=False)
    # The second returned element is not used in this function.
    cc, _, leg_dict = construct_colour_array_legend_studies(obj.meta)

    # linkage: samples are clustered, hence the transpose
    lkg = hc.linkage(
        the_dat.loc[the_mad.index[:n_gene_by_mad]].transpose(),
        method=method,
        metric=metric,
    )

    # ref line colours
    for k, v in cell_line_colours.items():
        cc.loc[obj.meta.type == k, 'Cell type'] = v
    # our line colours
    # NOTE(review): only iPSC receives a '(this study)' override here, although
    # the default palette also defines 'iNSC (this study)' and
    # 'GBM (this study)' - confirm this is intended.
    ours_ipsc = obj.meta.batch.str.contains('wtchg') & (obj.meta.type == 'iPSC')
    cc.loc[ours_ipsc, 'Cell type'] = cell_line_colours['iPSC (this study)']

    # Restrict to the rows shown in the heatmap. This previously read an
    # undefined name (`n_for_heatmap`) instead of the `n_gene_for_heatmap`
    # parameter.
    heatmap_dat = the_dat.loc[the_mad.index[:n_gene_for_heatmap]]

    # get appropriate clims from the requested fractions of the sorted values;
    # the index is clamped so that a fraction of 1.0 maps to the last element
    flat_sorted = np.sort(heatmap_dat.values.flatten())
    last_ix = len(flat_sorted) - 1
    vmin = flat_sorted[min(int(len(flat_sorted) * fmin), last_ix)] - 0.5
    vmax = flat_sorted[min(int(len(flat_sorted) * fmax), last_ix)] + 0.5

    gc = clustering.plot_clustermap(
        heatmap_dat,
        cmap='RdBu_r',
        col_linkage=lkg,
        col_colors=cc,
        vmin=vmin,
        vmax=vmax,
    )

    # Template shared by every legend entry; copied before being customised.
    leg_entry = {
        'class': 'patch',
        'edgecolor': 'k',
        'linewidth': 1.,
    }
    leg_dict2 = collections.OrderedDict()
    leg_dict2['Cell type'] = collections.OrderedDict()

    # Only list cell types that actually occur in the metadata; the
    # ' (this study)' variants are matched on their base type.
    present_types = obj.meta.type.unique()
    for k in sorted(cell_line_colours):
        if k.replace(' (this study)', '') in present_types:
            leg_dict2['Cell type'][k] = dict(leg_entry)
            leg_dict2['Cell type'][k].update(
                {'facecolor': cell_line_colours[k]})

    leg_dict2['Study'] = collections.OrderedDict()
    for k, v in leg_dict['Study'].items():
        leg_dict2['Study'][k] = dict(leg_entry)
        leg_dict2['Study'][k].update({'facecolor': v})

    common.add_custom_legend(gc.ax_heatmap,
                             leg_dict2,
                             loc_outside=True,
                             fontsize=14)
    format_clustermap(gc)

    return gc