def construct_colour_array_legend_studies(meta):
    """
    Build per-sample colour annotations (by cell type and by study) and the matching legend dictionaries.
    :param meta: pd.DataFrame indexed by sample name (string index). The columns `type` and `batch` are read.
    :return: Tuple (cc, studies, leg_dict).
    cc is a pd.DataFrame with the same index as `meta` and columns 'Cell type' and 'Study'; each entry is a colour,
    with 'gray' used where no rule matched.
    studies is a dictionary mapping each batch label to the colour assigned to it.
    leg_dict is {'Cell type': <label -> colour, restricted to colours actually used in cc>, 'Study': studies}.
    """
    ct_leg = {
        'FB': '#fff89e',
        'iPSC (this study)': 'blue',
        'iPSC': '#96daff',
        'ESC': 'green',
        'iNSC (this study)': '#9e3900',
        'iNSC': '#db7b00',
        'NSC': '#f4b342',
        'Fetal NSC': '#ffaf47',
    }

    cc = pd.DataFrame('gray', index=meta.index, columns=['Cell type', 'Study'])

    # cell types whose legend key is identical to the value in meta['type']
    cols_incl = ['FB', 'ESC', 'NSC']
    for t in cols_incl:
        cc.loc[meta['type'] == t, 'Cell type'] = ct_leg[t]

    # NPC samples share the NSC colour
    cc.loc[meta['type'] == 'NPC', 'Cell type'] = ct_leg['NSC']

    # induced cell types are split by whether the batch name contains 'WTCHG'
    in_house = meta['batch'].str.contains('WTCHG')
    for ct in ('iPSC', 'iNSC'):
        is_ct = meta['type'] == ct
        cc.loc[is_ct & in_house, 'Cell type'] = ct_leg['%s (this study)' % ct]
        cc.loc[is_ct & ~in_house, 'Cell type'] = ct_leg[ct]

    # NSC samples with 'fetal' in the sample name override the generic NSC colour set above
    cc.loc[(meta['type'] == 'NSC') & (meta.index.str.contains('fetal')),
           'Cell type'] = ct_leg['Fetal NSC']

    all_batches = meta.batch.copy()
    # to keep individual batches, comment out this next line
    # NOTE(review): this pattern is lower case ('wtchg') whereas the cell type rules above match 'WTCHG'.
    # str.contains is case sensitive, so confirm which spelling the batch names actually use.
    all_batches[all_batches.str.contains('wtchg')] = 'This study'
    batches = all_batches.unique()
    n_study = len(batches)
    study_colours = common.get_best_cmap(n_study)
    studies = {}
    # pair batches with colours directly rather than indexing batches by the position in the colour list
    for batch, c in zip(batches, study_colours):
        cc.loc[all_batches == batch, 'Study'] = c
        studies[batch] = c

    # Only keep legend entries whose colour is actually in use.
    # A new dictionary is built because deleting from a dict while iterating over it fails on Python 3.
    used_colours = set(cc.loc[:, 'Cell type'].unique())
    ct_leg = dict((k, v) for k, v in ct_leg.items() if v in used_colours)

    # legend dictionary
    leg_dict = {'Cell type': ct_leg, 'Study': studies}

    return cc, studies, leg_dict
Esempio n. 2
0
def plot_one_scatter_pie(xy,
                         weights,
                         colours=None,
                         ax=None,
                         marker_kwargs=None,
                         **scatter_kwargs):
    """
    Draw a single pie chart marker, one scatter() call per pie segment, on the supplied or current axes.
    :param xy: Length two array giving the coordinates of the centre of the pie chart.
    :param weights: Array of unnormalised weights, one per pie segment.
    :param colours: Optional array of segment colours, which must have the same length as weights. If missing,
    colours are requested from common.get_best_cmap.
    :param ax: Axis object to plot into. If missing, use gca().
    :param marker_kwargs: Optional dictionary of keyword arguments forwarded to pie_path_marker.
    :param scatter_kwargs: Forwarded to every call to the scatter() method of ax.
    :return: List of the objects returned by ax.scatter(), one per segment.
    """
    if len(xy) != 2:
        raise ValueError("xy must have length 2.")
    centre_x, centre_y = xy

    if colours is None:
        colours = common.get_best_cmap(len(weights))

    if colours is not None:
        if len(weights) != len(colours):
            raise ValueError("Length of weights and colours must be equal.")

    extra_marker_kwargs = {} if marker_kwargs is None else marker_kwargs
    target_ax = plt.gca() if ax is None else ax

    segment_markers = pie_path_marker(weights, **extra_marker_kwargs)
    return [
        target_ax.scatter([centre_x], [centre_y],
                          marker=segment,
                          facecolor=colour,
                          **scatter_kwargs)
        for segment, colour in zip(segment_markers, colours)
    ]
    obj.meta.batch = obj.meta.batch.str.replace('2016-12-19_ucl_genomics',
                                                '2016-12-19')

    # the only batch names without letters are ours
    obj.meta.loc[~obj.meta.batch.str.contains(r'[A-Z]'),
                 'batch'] = 'This study'

    # PCA plot (by batch and cell type)
    colour_subgroups = obj.meta.batch
    c_sub_sorted = sorted(colour_subgroups.unique(),
                          key=lambda x: 'A' if x == 'This study' else x)

    cmap = collections.OrderedDict(
        zip(
            c_sub_sorted,
            common.get_best_cmap(len(c_sub_sorted)),
        ))

    m_subgroups = obj.meta.type
    subgroups_sorted = sorted(m_subgroups.unique(),
                              key=lambda x: x[0] if x[0] != 'i' else x[1])
    mmap = pd.Series(common.FILLED_MARKERS[len(m_subgroups.unique())],
                     index=subgroups_sorted)

    fig = plt.figure(figsize=(6.4, 4.8))
    ax = fig.add_subplot(111)
    p, ax = plot_pca(mdat,
                     colour_subgroups,
                     colour_map=cmap,
                     marker_subgroups=m_subgroups,
                     marker_map=mmap,
        'HSC',
        'Mast cells',
        'Tregs',
    ])
    ix = []
    for ct in cell_types:
        ix.extend(so_both.index[so_both.index.str.contains(ct + '_')])

    pct_shared = (so_both.loc[ix] /
                  (so_both.loc[ix] + so_ipa_not_cts.loc[ix]) *
                  100.).sort_index().transpose()

    col_colours = pd.Series(index=pct_shared.columns, name='Cell type')
    ix_lookup = pct_shared.columns.str.replace(r'(?P<ct>[^_]*)_.*', r'\g<ct>')
    a, b = ix_lookup.factorize()
    cc_cmap = common.get_best_cmap(a.max() + 1, cmap='jet')
    for i in range(a.max() + 1):
        col_colours[a == i] = cc_cmap[i]

    cg = sns.clustermap(
        pct_shared,
        cmap='Reds',
        row_cluster=False,
        col_cluster=False,
        # col_colors=col_colours,
        vmax=20.)
    # cg.gs.set_height_ratios([0.1, 0.005, 0.05, 0.9])
    cg.gs.set_height_ratios([0.1, 0.005, 0.9])
    cg.gs.set_width_ratios([0.04, 0.02, 0.8])
    cg.gs.update(top=0.99, left=0.02, right=0.67, bottom=0.15)
        for pid in pids:  # GIC
            gic = meta.index[(meta.type == 'GBM') & (meta.patient_id == pid)]
            insc = meta.index[(meta.type == 'iNSC') & (meta.patient_id == pid)]
            gbm = meta.index[(meta.type == 'ffpe') & (meta.patient_id == pid)]
            comparisons["%s_ffpe-iNSC" % pid] = [gbm, insc]
            comparisons["%s_GIC-iNSC" % pid] = [gic, insc]
        dmr_res = addd.run_dmr_analyses(dat, comparisons, anno, dmr_params)
        # Save DMR results to disk
        dmr_res.to_pickle(fn, include_annotation=False)
        logger.info("Saved DMR results to %s", fn)

    dmr_res_all = dmr_res.results_significant

    # look at distn of M values (before any differences)
    # two figures, one for each replicate
    cols = common.get_best_cmap(len(pids))
    fig, axs = plt.subplots(ncols=2, sharex=True, sharey=True)

    for i, pid in enumerate(pids):
        c = cols[i]
        this_ix = meta.index[(meta.patient_id == pid) & (meta.type == 'GBM')]
        for j, ix in enumerate(this_ix):
            this_dat = dat.loc[:, ix]
            this_ax = axs[j]
            sns.kdeplot(this_dat, color=c, label=pid, ax=this_ax)

    [ax.set_xlim([-10, 10]) for ax in axs]
    [ax.set_xlabel("M value") for ax in axs]
    axs[0].set_ylabel("Density (a.u.)")
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "m_value_density_gic.png"), dpi=200)
Esempio n. 6
0
        ax.text(0.5, 0.5 * np.sqrt(3) + 0.02, 'RTK II', horizontalalignment='center', verticalalignment='bottom')

        ax.set_aspect('equal')
        ax.axis('off')

    # bundle them up
    tax_dict = {
        'RTK I': taxs[0],
        'RTK II': taxs[1],
        'MES': taxs[2]
    }

    # each patient is a 'trajectory': FFPE -> early pass -> later passage
    bases = ['GBM_RTK_I', 'GBM_RTK_II', 'GBM_MES']
    # cmap_func = common.continuous_cmap(cc.mean_passage.max(), cmap='Blues')
    cmap = common.get_best_cmap(len(consts.PIDS))
    ff_colour = '#ffffff'

    for p in consts.PIDS:
        this_ff_score = ff.loc[p, bases]
        this_cc = cc.loc[p].sort_values(by='mean_passage')
        this_cc_score = this_cc.loc[:, bases]
        this_cc_pass = this_cc.loc[:, 'mean_passage']
        # this_colours = [ff_colour] + [cmap_func(x - 1) for x in this_cc_pass]
        this_colours = cmap[consts.PIDS.index(p)]
        this_sizes = 20 + this_cc_pass.values ** 2.

        points = np.concatenate([[this_ff_score.values], this_cc_score.values], axis=0)

        ## FIXME: we could just use the hardcoded classification here, eliminating the need to look it up?
        tax = tax_dict[ff.loc[p, 'Result']]
Esempio n. 7
0
def scatter_with_pies(xy,
                      weights_arr,
                      colours_arr=None,
                      ax=None,
                      marker_kwargs=None,
                      **scatter_kwargs):
    """
    Generate a scatterplot with pie charts as markers.
    :param xy: Array of length N, where N is the number of markers required. Each entry is a length 2 array giving the
    central coordinate of the pie chart.
    :param weights_arr: Array of length N. Each entry is an array of weights.
    :param colours_arr: Optional.
    Either an array of length N with each entry being an array of the same length as the corresponding entry in
    `weights_arr`
    Or a single array of length M, where M is the length of ALL entries of weights_arr (this is checked). A single
    array is recognised by its first entry being a string or a non-iterable.
    If missing, colours for each pie are requested from common.get_best_cmap.
    :param ax: Axis object to plot into. If missing, use gca().
    :param marker_kwargs: If supplied, this dictionary is passed on to plot_one_scatter_pie.
    :param scatter_kwargs: Passed on to plot_one_scatter_pie.
    :return: Handles generated by plot_one_scatter_pie (list of length N, one entry per pie).
    :raises ValueError: If the lengths of xy, weights_arr and colours_arr are inconsistent.
    """

    if len(xy) != len(weights_arr):
        raise ValueError("Length of xy and weights_arr must be equal.")

    if colours_arr is not None:
        if len(colours_arr) > 0:
            first = colours_arr[0]
            # Strings expose __iter__ on Python 3, so a flat list of colour names must be detected explicitly;
            # otherwise it would be mistaken for one array of colours per pie.
            is_flat = isinstance(first, str) or not hasattr(first, '__iter__')
        else:
            # nothing to inspect: fall through to the length check below
            is_flat = False

        if is_flat:
            # option 1: all weights have the same length and this is the colours array to use for all
            n_colours = len(colours_arr)
            if all(len(t) == n_colours for t in weights_arr):
                colours_arr = [colours_arr] * len(weights_arr)
            else:
                raise ValueError(
                    "If colours_arr is a single array, all weight_arr entries must have the same length."
                )

        if len(weights_arr) != len(colours_arr):
            raise ValueError(
                "Length of weights_arr and colours_arr must be equal.")

    if ax is None:
        ax = plt.gca()

    handles = []

    for i, (centre, w) in enumerate(zip(xy, weights_arr)):
        if colours_arr is None:
            colours = common.get_best_cmap(len(w))
        else:
            colours = colours_arr[i]
        if len(colours) != len(w):
            raise ValueError(
                "Pie number %d: number of weights does not match number of colours."
                % i)

        handles.append(
            plot_one_scatter_pie(centre,
                                 w,
                                 colours=colours,
                                 ax=ax,
                                 marker_kwargs=marker_kwargs,
                                 **scatter_kwargs))

    return handles
Esempio n. 8
0
def scatter_with_colour_and_markers(
    dat,
    colour_subgroups=None,
    colour_map=None,
    marker_subgroups=None,
    marker_map=None,
    ax=None,
    legend='outside',
    default_colour='gray',
    default_marker='o',
    ec='k',
    lw=1.0,
    ms=40,
):
    """
    Scatter plot in which the colour and the marker shape of each point can each encode a grouping.
    :param dat: Data to be plotted in any array format accepted by pd.DataFrame. The first two columns are used as
    x and y.
    :param colour_subgroups: Optional object with a factorize() method (e.g. pd.Series) giving the colour group of
    each row of dat. If missing, all points belong to one colour group.
    :param colour_map: Optional mapping from colour group label to colour. If missing, one is generated.
    :param marker_subgroups: Optional object with a factorize() method giving the marker group of each row of dat.
    If missing, all points belong to one marker group.
    :param marker_map: Optional mapping from marker group label to marker. If missing, one is generated.
    :param ax: Axis object to plot into. If missing, a new figure with one equal-aspect axis is created.
    :param legend: Include legend in plot? If 'outside' (default), place outside the axes. Any other string is used
    as the legend location. If otherwise truthy, use the matplotlib default location. If False or None, do not plot
    a legend.
    :param default_colour: Colour used when a label is absent from colour_map or when no colour groups are given.
    :param default_marker: Marker used when a label is absent from marker_map or when no marker groups are given.
    :param ec: Edgecolour
    :param lw: Linewidth
    :param ms: Marker size
    :return: The axis object that was plotted into.
    """

    # cast dat to pd DataFrame, should have two columns
    dat = pd.DataFrame(dat)

    if ax is None:
        ax = plt.figure().add_subplot(111, aspect='equal')

    # when no grouping is supplied, put every point into a single placeholder group
    c_has_labels = colour_subgroups is not None
    if not c_has_labels:
        colour_subgroups = pd.Series('foo', index=dat.index)
    cidx, clabels = colour_subgroups.factorize()

    m_has_labels = marker_subgroups is not None
    if not m_has_labels:
        marker_subgroups = pd.Series('foo', index=dat.index)
    midx, mlabels = marker_subgroups.factorize()

    if colour_map is None:
        if c_has_labels:
            palette = common.get_best_cmap(len(clabels))
        else:
            palette = [default_colour] * len(clabels)
        colour_map = {k: palette[i] for i, k in enumerate(clabels)}

    if marker_map is None:
        if m_has_labels:
            shapes = common.get_best_marker_map(len(mlabels))
        else:
            shapes = [default_marker] * len(mlabels)
        marker_map = {k: shapes[i] for i, k in enumerate(mlabels)}

    def _show_legend(**handle_kwargs):
        # dispatch on the value of `legend`; handle_kwargs is either empty or carries explicit handles
        if legend == 'outside':
            common.legend_outside_axes(ax, **handle_kwargs)
        elif isinstance(legend, str):
            ax.legend(loc=legend, **handle_kwargs)
        elif legend:
            ax.legend(**handle_kwargs)

    # one scatter() call for every (colour group, marker group) pair that contains points
    for ic, lc in enumerate(clabels):
        for im, lm in enumerate(mlabels):
            this_colour = colour_map.get(lc, default_colour)
            this_marker = marker_map.get(lm, default_marker)

            # apply a 10% increase to the markers listed in FILLED_MARKERS_TO_EXPAND (only)
            this_ms = 1.1 * ms if this_marker in FILLED_MARKERS_TO_EXPAND else ms

            in_both = (cidx == ic) & (midx == im)
            if in_both.sum() == 0:
                continue

            # automatic legend labels are only usable when exactly one of the groupings is labelled
            if c_has_labels and not m_has_labels:
                lbl = lc
            elif m_has_labels and not c_has_labels:
                lbl = lm
            else:
                lbl = None

            ax.scatter(dat.values[in_both, 0],
                       dat.values[in_both, 1],
                       c=this_colour,
                       s=this_ms,
                       label=lbl,
                       marker=this_marker,
                       edgecolor=ec,
                       linewidths=lw)

    if c_has_labels and m_has_labels:
        # two groupings: the legend has to be assembled manually

        # colours: show in patches with no edgecolor, in the order of colour_map
        for_legend = [
            patches.Patch(edgecolor='none',
                          facecolor=colour_map.get(lc, default_colour),
                          linewidth=lw,
                          label=lc)
            for lc in colour_map.keys() if lc in clabels
        ]

        # spacer that doesn't show up
        for_legend.append(
            patches.Patch(edgecolor='none', facecolor='none', label=''))

        # markers: show with no fill, in the order of marker_map
        # (markersize is not specified because its units differ from those of scatter)
        for_legend.extend(
            plt.Line2D([0], [0],
                       marker=marker_map.get(lm, default_marker),
                       markerfacecolor='none',
                       markeredgewidth=lw,
                       markeredgecolor=ec,
                       linestyle='none',
                       linewidth=0.,
                       label=lm)
            for lm in marker_map.keys() if lm in mlabels)

        _show_legend(handles=for_legend)

    elif (legend is not None) and (legend != False):
        # at most one grouping: rely on the labels passed to scatter()
        _show_legend()

    return ax
Esempio n. 9
0
    def set_plot_parameters(self,
                            figsize=(8, 6),
                            colours=None,
                            markers=None,
                            zorder=None,
                            alpha=None,
                            size=None,
                            de_direction_colours=DIRECTION_COLOURS,
                            dm_direction_colours=DIRECTION_COLOURS,
                            de_vmin=None,
                            de_vmax=None,
                            dm_vmin=None,
                            dm_vmax=None):
        """
        Store the plotting parameters on this object.
        Each of colours, markers, zorder, alpha and size may be given as:
        None (cycle through automatically generated defaults);
        an object with a get() method, such as a dictionary (used as supplied);
        a non-string iterable (values are handed out in turn, cycling, as new keys are looked up);
        or a single value, including a string (used for every key).
        :param figsize: Stored in self.fig_kws under 'figsize'.
        :param colours: See above. Stored in self.colours.
        :param markers: See above. Stored in self.markers.
        :param zorder: See above. Stored in self.zorder.
        :param alpha: See above. Stored in self.alpha.
        :param size: See above. Stored in self.size.
        :param de_direction_colours: Passed to direction_colour_getter to build self.de_direction_colour.
        :param dm_direction_colours: Passed to direction_colour_getter to build self.dm_direction_colour.
        :param de_vmin, de_vmax: Passed to direction_colour_getter and, if not None, stored on self.
        :param dm_vmin, dm_vmax: Passed to direction_colour_getter and, if not None, stored on self.
        """

        # Automatic plot parameters are only possible when we know which groups will be plotted.
        # If this is not the case, we just guess how many there will be and hope we guessed enough!

        if self.dmr_comparison_groups is not None:
            n_groups = self.n_comparison_groups
        else:
            n_groups = 4

        default_colours = common.get_best_cmap(n_groups)
        default_markers = common.get_best_marker_map(n_groups)
        default_zorder = range(20, 20 + n_groups)
        default_alpha = 0.6
        default_size = 20

        def is_collection(v):
            # Strings are iterable on Python 3 but always represent a single value here (e.g. a colour name)
            return hasattr(v, '__iter__') and not isinstance(v, str)

        def cycling_dict(values):
            # dictionary that assigns the next value in the (endless) cycle to every new key
            it = itertools.cycle(values)
            # next(it) rather than it.next(): the latter only exists on Python 2
            return collections.defaultdict(lambda: next(it))

        def set_property(x, default):
            if x is None:
                # cycle through default values
                return cycling_dict(default if is_collection(default) else [default])
            if hasattr(x, 'get'):
                # dictionary of values: no modification needed
                return x
            if is_collection(x):
                # iterable of values: cycle through these
                return cycling_dict(x)
            # single value supplied
            # result: dictionary that always returns this value
            return collections.defaultdict(lambda: x)

        self.colours = set_property(
            colours,
            default_colours,
        )
        self.markers = set_property(
            markers,
            default_markers,
        )
        self.zorder = set_property(
            zorder,
            default_zorder,
        )
        self.alpha = set_property(alpha, default_alpha)
        self.size = set_property(size, default_size)

        self.m_plot_kws = {
            'colours': self.colours,
            'markers': self.markers,
            'zorder': self.zorder,
            'alpha': self.alpha,
            'size': self.size
        }
        self.fig_kws = {'figsize': figsize}
        self.de_direction_colour = direction_colour_getter(
            de_direction_colours, vmin=de_vmin, vmax=de_vmax)
        self.dm_direction_colour = direction_colour_getter(
            dm_direction_colours, vmin=dm_vmin, vmax=dm_vmax)
        # limits are only overwritten when explicitly supplied
        if dm_vmin is not None:
            self.dm_vmin = dm_vmin
        if dm_vmax is not None:
            self.dm_vmax = dm_vmax
        if de_vmin is not None:
            self.de_vmin = de_vmin
        if de_vmax is not None:
            self.de_vmax = de_vmax
Esempio n. 10
0
    def plot_m_values(
        self,
        mdat,
        probe_locations,
        comparisons,
        colours='default',
        markers='default',
        zorder='default',
        alpha='default',
        size='default',
    ):
        """
        Scatter plot the values of individual probes, one axis per entry of self.row_names.
        Also records the range of the plotted values (self.mdat_min, self.mdat_max) and extends the coordinate
        range (self.coord_min, self.coord_max) to include the probe locations.

        :param mdat: pd.DataFrame containing the data to plot. Columns are samples, rows are probes
        :param probe_locations: pd.Series containing the probe IDs to include and their genomic coordinates
        :param comparisons: Dictionary keyed by comparison (equivalent to row_names). Each entry is a dictionary keyed
        by group name (e.g. 'Disease' / 'Healthy') and with values giving the samples in that group. The sample names
        must be in the columns of `mdat`.
        :param colours: Dictionary keyed by group name (e.g. 'Disease') giving the colour to use for that group.
        Defaults are used if not supplied. If None, the gray level '0.5' is used for every group.
        :param markers: Dictionary keyed by group name giving the marker to use for that group.
        Defaults are used if not supplied. To use circle markers for everything, set to None.
        :param zorder: Dictionary keyed by group name giving the zorder to use for that group.
        Defaults are used if not supplied. If None, a zorder of 20 is used for every group.
        :param alpha: Dictionary keyed by group name giving the alpha to use for that group.
        Defaults (based on zorder) are used if not supplied. If None, an alpha of 0.6 is used for every group.
        :param size: Dictionary keyed by group name giving the marker size to use for that group.
        Defaults (based on zorder) are used if not supplied. If None, a size of 20 is used for every group.
        For all five options, a single value that is not a dictionary is applied to every group.
        :return:
        """
        all_groups = sorted(
            setops.reduce_union(*(t.keys() for t in comparisons.values())))
        n_groups = len(all_groups)

        def set_property(x, default, default_static):
            # Only compare strings with the sentinel: `x == 'default'` is elementwise (and therefore has no
            # single truth value) when x is an array or a pd.Series.
            if isinstance(x, str) and x == 'default':
                return dict(zip(all_groups, default))
            if x is None:
                return {k: default_static for k in all_groups}
            if not hasattr(x, 'get'):
                # single value supplied
                return {k: x for k in all_groups}
            return x

        colours = set_property(colours, common.get_best_cmap(n_groups), '0.5')
        markers = set_property(markers, common.get_best_marker_map(n_groups),
                               'o')
        zorder = set_property(zorder, range(20, 20 + n_groups), 20)
        # default alpha will be based on zorder
        by_zorder = sorted([(k, zorder[k]) for k in all_groups],
                           key=lambda x: x[1])
        # rank of each group when ordered by zorder
        a_ix = {t[0]: i for i, t in enumerate(by_zorder)}
        alpha_values = np.linspace(0.4, 0.6, n_groups)
        alpha_default = [alpha_values[a_ix[k]] for k in all_groups]

        # the static fallback must be numeric: it is passed as alpha to scatter()
        alpha = set_property(alpha, alpha_default, 0.6)

        # default size will be based on zorder
        s_values = range(20, 20 + n_groups)
        s_default = [s_values[a_ix[k]] for k in all_groups]
        size = set_property(size, s_default, 20)

        # scatter plot individual probes
        ymin = 0
        ymax = 0
        for nm in self.row_names:
            grp_dict = comparisons[nm]
            this_ax = self.m_axs[nm]
            this_ax.set_ylabel(nm)
            for grp_nm, grp_samples in grp_dict.items():
                the_colour = colours.get(grp_nm)
                the_marker = markers.get(grp_nm)
                the_z = zorder.get(grp_nm)
                the_alpha = alpha.get(grp_nm)
                the_s = size.get(grp_nm)
                # one scatter call per sample (column); items() replaces iteritems(), removed in pandas 2
                for _, x in mdat.loc[probe_locations.index,
                                     grp_samples].items():
                    this_ax.scatter(probe_locations,
                                    x.values,
                                    c=the_colour,
                                    marker=the_marker,
                                    zorder=the_z,
                                    alpha=the_alpha,
                                    s=the_s,
                                    edgecolor='k',
                                    linewidth=0.5)
                    ymin = min(x.values.min(), ymin)
                    ymax = max(x.values.max(), ymax)
        self.mdat_min = ymin
        self.mdat_max = ymax

        if self.coord_max is None:
            self.coord_min = probe_locations.min()
            self.coord_max = probe_locations.max()
        else:
            self.coord_min = min(probe_locations.min(), self.coord_min)
            self.coord_max = max(probe_locations.max(), self.coord_max)
                dpi=200)
    fig.savefig(os.path.join(
        outdir,
        "cell_proportion_pathway_pval_%s_clustering_sign_annot_syngeneic_tregs.tiff"
        % corr_metric),
                dpi=200)
    fig.savefig(os.path.join(
        outdir,
        "cell_proportion_pathway_pval_%s_clustering_sign_annot_syngeneic_tregs.pdf"
        % corr_metric),
                dpi=200)

    # generate scatterplots for each pathway
    all_cts = co_p.index[(co_p.loc[:, ix] < alpha).any(axis=1)]
    colour_by_cell_type = dict(zip(all_cts,
                                   common.get_best_cmap(len(all_cts))))

    for pw in ix:
        cts = co_p.index[(co_p[pw] < alpha)]
        fig = plt.figure(figsize=(6, 5))
        ax = fig.add_subplot(111)
        xmax = 0.
        ymax = 0.
        for ct in cts:
            this_p = p.loc[pw].sort_index()
            this_df = df.loc[ct, this_p.index].sort_index()
            this_comb = pd.concat((this_df, this_p),
                                  axis=1).dropna(axis=0).astype(float)
            this_comb = this_comb.sort_values(by=[ct, pw], axis=0)
            x = this_comb.iloc[:, 0]
            y = this_comb.iloc[:, 1]
                               min_n_samples=2)
    log_dat = np.log10(obj_salmon.data + eps)

    # ECDF
    ax = rnaseq.log_cpm_ecdf_plot(dat, units='tpm', min_cpm=min_cpm)
    ax.figure.set_size_inches(6, 4)
    ax.figure.tight_layout()
    ax.figure.savefig(os.path.join(outdir, "cdf_our_samples.png"), dpi=200)

    # PCA
    colour_subgroups = obj_salmon.meta.treatment

    cmap = collections.OrderedDict(
        zip(
            colour_subgroups,
            common.get_best_cmap(len(colour_subgroups)),
        ))

    p = PCA()
    pc_dat = p.fit_transform(log_dat.transpose())

    p, ax = plot_pca(log_dat, colour_subgroups, colour_map=cmap, p=p)

    for i, col in enumerate(log_dat.columns):
        ax.text(pc_dat[i, 0], pc_dat[i, 1], col)

    ax.figure.set_size_inches(5.9, 4.8)
    ax.figure.subplots_adjust(right=0.8, left=0.12, bottom=0.1, top=0.98)
    ax.figure.savefig(os.path.join(outdir, "pca_our_samples.png"), dpi=200)

    # clustermap: just our samples
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111)
    p, ax = plot_pca(mdat,
                     colour_subgroups,
                     marker_subgroups=m_subgroups,
                     marker_map=mmap,
                     ax=ax)
    ax.figure.subplots_adjust(left=0.1, right=0.8)
    ax.figure.savefig(os.path.join(outdir, "pca_plot_batch_cell_type_all.png"),
                      dpi=200)

    # ECDF plot (separate for cell types) to show batch effects

    cell_types = ['GBM', 'iNSC']
    cmap = common.get_best_cmap(len(colour_subgroups.unique()))
    colour_map = dict(zip(colour_subgroups.unique(), cmap))

    xi = np.linspace(-8, 8, 200)
    for ct in cell_types:
        fig = plt.figure()
        ax = fig.add_subplot(111)

        this_idx = obj.meta.type == ct
        this_dat = mdat.loc[:, this_idx]
        this_batch = colour_subgroups[this_idx]
        labels_included = dict([(k, False) for k in colour_subgroups.unique()])

        for i in range(this_dat.shape[1]):
            func = basic.ecdf_func(this_dat.iloc[:, i])
            yi = func(xi)
    bdat = obj.data
    mdat = process.m_from_beta(bdat)

    if qn_method is not None:
        mdat = transformations.quantile_normalisation(mdat, method=qn_method)

    # PCA plot (by batch and cell type)
    colour_subgroups = obj.meta.batch
    c_sub_sorted = sorted(colour_subgroups.unique(),
                          key=lambda x: 'A' if x == 'This study' else x)

    cmap = collections.OrderedDict(
        zip(
            c_sub_sorted,
            common.get_best_cmap(len(c_sub_sorted)),
        ))

    m_subgroups = obj.meta.type
    subgroups_sorted = sorted(m_subgroups.unique(),
                              key=lambda x: x[0] if x[0] != 'i' else x[1])
    mmap = pd.Series(common.FILLED_MARKERS[len(m_subgroups.unique())],
                     index=subgroups_sorted)

    fig = plt.figure(figsize=(6.4, 4.8))
    ax = fig.add_subplot(111)
    p, ax = plot_pca(mdat,
                     colour_subgroups,
                     colour_map=cmap,
                     marker_subgroups=m_subgroups,
                     marker_map=mmap,
Esempio n. 15
0
    ntot = sum(n_by_patient.values())

    # 1) Null: DMRs are picked uniformly randomly from the pool with variable marginal totals for each patient.
    # Marginal totals are given by the (real) number of DMRs in each patient.
    rvs = dict([(pid, [
        np.random.choice(range(ntot), replace=False, size=n_by_patient[pid])
        for i in range(n_iter)
    ]) for pid in pids])

    inters_1 = [[
        len(x)
        for x in setops.specific_features(*[rvs[pid][i] for pid in pids])
    ] for i in range(n_iter)]
    inters_1 = dict(zip(pids, zip(*inters_1)))

    clist = common.get_best_cmap(len(pids))
    patient_colours = dict(zip(pids, clist))

    fig, axs = plt.subplots(nrows=len(pids), sharex=True, sharey=True)
    big_ax = fig.add_subplot(111, frameon=False)
    big_ax.tick_params(top='off',
                       bottom='off',
                       left='off',
                       right='off',
                       labelcolor='none')
    big_ax.grid(False)
    big_ax.set_ylabel('Density (a.u.)')

    for pid, ax in zip(pids, axs):
        sns.kdeplot(np.array(inters_1[pid]),
                    color=patient_colours[pid],
Esempio n. 16
0
        the_dat.index[the_dat.index.str.contains(r'^NDST')].tolist(),
        'Sulfotransferase':
        the_dat.index[the_dat.index.str.contains(r'^SULT')].tolist() +
        ['UST', 'CHSY3'],
        'Glucuronosyltransferase':
        ['B3GAT1', 'B3GAT2', 'CHPF', 'CHPF2', 'CSGALNACT1'],
        'Carbohydrate transferase':
        the_dat.index[the_dat.index.str.contains(r'^CHST')].tolist(),
        'Dermatan sulfate epimerase':
        the_dat.index[the_dat.index.str.contains(r'^DSE')].tolist(),
    }
    gene_to_function = dictionary.complement_dictionary_of_iterables(
        function_to_gene, squeeze=True)
    function_colours = dict(
        zip(function_to_gene.keys(),
            common.get_best_cmap(len(function_to_gene))))

    # reorder data
    the_dat = the_dat.loc[reduce(lambda x, y: x + y,
                                 function_to_gene.values())]
    row_colours = pd.DataFrame(
        [function_colours[gene_to_function[t]] for t in the_dat.index],
        index=the_dat.index,
        columns=['Function'])

    # standardise (Z)
    z = the_dat.subtract(the_dat.mean(axis=1),
                         axis=0).divide(the_dat.std(axis=1), axis=0)

    cg = sns.clustermap(z,
                        col_cluster=False,