# Common imports assumed by the snippets on this page (not exhaustive); the
# project-specific helpers (setops, excel, general, ...) come from the source
# codebase itself.
import collections
import multiprocessing as mp
import numpy as np
import pandas as pd
from scipy import stats

def ipa_results_to_wideform(res, plogalpha):
    """
    Convert the IPA results dictionary into a wideform pd.DataFrame.
    Owing to the potentially large number of comparisons, we can't use the Venn approach here, but there's no need.
    :param res:
    :param plogalpha:
    :return:
    """
    de_all_pathways = sorted(setops.reduce_union(*[t.index for t in res.values()]))
    export_wideform = pd.DataFrame(index=de_all_pathways)
    member_cols = []
    for k, v in res.items():
        sign_ix = v.index[v['-logp'] >= plogalpha]
        this_yn = pd.Series('N', index=de_all_pathways)
        this_yn.loc[sign_ix] = 'Y'
        member_cols.append(k)
        export_wideform.insert(
            export_wideform.shape[1],
            k,
            this_yn
        )
        for col in ['-logp', 'z', 'ratio', 'n_gene']:
            export_wideform.insert(
                export_wideform.shape[1],
                "%s_%s" % (k, col),
                v.reindex(de_all_pathways)[col]
            )

    # add the number of genes in each pathway as a single constant column
    # (ratio is n_gene / pathway size, so n_gene / ratio recovers the pathway size)
    rr = export_wideform.loc[:, export_wideform.columns.str.contains('ratio')]
    ng = export_wideform.loc[:, export_wideform.columns.str.contains('n_gene')]
    n_gene_tot = (ng.astype(float).values / rr.astype(float).values).mean(axis=1).round().astype(int)
    export_wideform.insert(0, 'n_gene_in_pathway', n_gene_tot)

    return export_wideform
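
# Usage sketch for the function above (illustrative only: the comparison name,
# pathway names and threshold below are invented for demonstration).
if False:
    example_res = {
        'GBM018_syngeneic': pd.DataFrame(
            {
                '-logp': [3.2, 0.4],
                'z': [1.5, -0.2],
                'ratio': [0.10, 0.05],
                'n_gene': [12, 4],
            },
            index=['Pathway A', 'Pathway B']
        ),
    }
    wide = ipa_results_to_wideform(example_res, plogalpha=-np.log10(0.005))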
Example 2
def get_dm_associated_de(dmr_ids, de_res_full, dmr_res_full, dmr_id_to_ens,
                         ens_to_dmr_id, ref_labels):
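    """
    For the given DMR IDs, collect the associated DE results and DMR changes,
    restricted to genes common to all reference comparisons.
    :return: Dict with 'de' (logFC per reference, indexed by gene symbol) and
    'dmr' (mean of the median M-value change over each gene's DMRs).
    """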
    all_genes = setops.reduce_union(
        *[dmr_id_to_ens[i].values for i in dmr_ids])
    this_de_full = {}
    for r in ref_labels:
        tt = de_res_full[r]
        tt = tt.loc[tt['Gene Symbol'].isin(all_genes)]
        this_de_full[r] = tt

    common_ix = sorted(
        setops.reduce_intersection(*[t.index for t in this_de_full.values()]))
    # all frames share common_ix, so take the gene symbols from the first reference
    common_gs = this_de_full[ref_labels[0]].loc[common_ix, 'Gene Symbol']

    dmr_median_delta = {}
    for e in common_ix:
        dmr_median_delta[e] = {}
        for r in ref_labels:
            dmr_median_delta[e][r] = np.mean([
                dmr_res_full[r][i]['median_change'] for i in ens_to_dmr_id[e]
            ])
    dmr_median_delta = pd.DataFrame(dmr_median_delta).transpose().sort_index()

    this_logfc = pd.concat(
        (this_de_full[r].loc[common_ix, 'logFC'] for r in ref_labels), axis=1)
    this_logfc.columns = ref_labels
    this_logfc.index = common_gs

    de_logfc = this_logfc.sort_index()

    return {'de': de_logfc, 'dmr': dmr_median_delta}
Example 3
def compute_cross_comparison_correction(res, samples, external_refs, set_type='pair_only'):
    """
    Compute the _correction_ list of features for the supplied results. These are the features that are
    EITHER present in every reference comparison but no cross-comparisons (set_type='ref_only')
    OR present in no reference comparison but all cross-comparisons (set_type='pair_only')
    :param res: Dictionary containing comparison results. Each comparison is keyed by the tuple (i, j), where i and j
    are the IDs of the two groups being compared. Values are iterables of unique feature identifiers (e.g. gene IDs,
    DMR cluster IDs).
    :param samples: The core sample list, without including external references.
    :param external_refs: A list of external reference sample names.
    :param set_type: See description.
    :return: Dict with keys 'specific_to_each_ref', 'specific_to_all_refs', 'venn_set' and 'ref_diff_set'.
    """

    members_rows = samples
    members_cols = members_rows + external_refs

    the_venn_set = pd.DataFrame(index=members_rows, columns=members_cols)
    for i in members_rows:
        p = res[(i, i)]
        for j in members_cols:
            r = res[(i, j)]
            x, _ = setops.venn_from_arrays(p, r)
            if set_type == 'pair_only':
                kset = '10'
            elif set_type == 'ref_only':
                kset = '01'
            else:
                raise ValueError("set_type must be 'pair_only' or 'ref_only'.")
            the_venn_set.loc[i, j] = x[kset]

    # For each reference, get the features that are pair only in that reference and not in any of the iNSC
    vs_diff = pd.DataFrame(index=members_rows, columns=external_refs)
    for i in members_rows:
        for j in external_refs:
            the_ref = the_venn_set.loc[i, j]
            all_else = the_venn_set.loc[i, members_rows]
            union_all_else = setops.reduce_union(*all_else)
            vs_diff.loc[i, j] = sorted(set(the_ref).difference(union_all_else))

    # Intersection down the columns gives us a correction list for each reference
    vs_specific_to_ref = vs_diff.apply(lambda x: setops.reduce_intersection(*x))

    # Intersection across the references gives us a final list that need correcting
    vs_specific_to_all_refs = setops.reduce_intersection(*vs_specific_to_ref)

    return {
        'specific_to_each_ref': vs_specific_to_ref,
        'specific_to_all_refs': vs_specific_to_all_refs,
        'venn_set': the_venn_set,
        'ref_diff_set': vs_diff
    }
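
# Illustrative call (sample, reference and feature names are invented for
# demonstration): `res` must hold an entry for every (sample, sample-or-reference) pair.
if False:
    example_samples = ['s1', 's2']
    example_refs = ['ref1']
    example_res = {
        ('s1', 's1'): ['g1', 'g2'], ('s1', 's2'): ['g2'], ('s1', 'ref1'): ['g2', 'g3'],
        ('s2', 's1'): ['g4'], ('s2', 's2'): ['g2', 'g4'], ('s2', 'ref1'): ['g4'],
    }
    corr = compute_cross_comparison_correction(
        example_res, example_samples, example_refs, set_type='pair_only')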
Example 4

def set_permutation_test(data, n_iter=1000, parallel=True):
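    """
    Permutation test for set intersection sizes: hold the observed set sizes
    fixed, draw each set uniformly at random from the N-item universe `n_iter`
    times, and compare the observed Venn set sizes against this null.
    """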
    K = len(data)
    N = len(setops.reduce_union(*data.values()))

    set_sizes = collections.OrderedDict([(k, len(v)) for k, v in data.items()])
    simulated_sizes = collections.defaultdict(list)

    if parallel:
        pool = mp.Pool()
        jobs = {}
        for i in range(n_iter):
            jobs[i] = pool.apply_async(one_random_perm, args=(set_sizes, N))

        pool.close()
        pool.join()
        for i, j in jobs.items():
            vc = j.get()
            for k, v in vc.items():
                simulated_sizes[k].append(v)
    else:
        for i in range(n_iter):
            vc = one_random_perm(set_sizes, N)
            for k, v in vc.items():
                simulated_sizes[k].append(v)

    _, vc_true = setops.venn_from_arrays(*data.values())

    # to calculate the P value, we EITHER need to specify a single sided test OR decide how to compute a two-sided P
    # Some interesting discussions on this topic:
    # https://stats.stackexchange.com/questions/140107/p-value-in-a-two-tail-test-with-asymmetric-null-distribution
    # https://stats.stackexchange.com/questions/360864/2-tailed-permutation-tests-for-obviously-non-symmetric-data
    # https://stats.stackexchange.com/questions/34052/two-sided-permutation-test-vs-two-one-sided
    # However, a 'Z' value is easier to compute
    z = {}
    p = {}
    for k in simulated_sizes.keys():
        obs = vc_true[k]
        t = stats.percentileofscore(simulated_sizes[k], obs)
        if t <= 50:
            p[k] = 2 * t / 100.
        else:
            p[k] = 2 * (1 - t / 100.)

        z[k] = t - 50.

    return {
        'simulated_set_sizes': simulated_sizes,
        'observed_set_sizes': vc_true,
        'p': p,
        'z': z
    }
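
# `one_random_perm` is not shown on this page. A minimal sketch consistent
# with how it is called above (set sizes fixed, members drawn uniformly at
# random from an N-item universe, returning the Venn set counts):
def one_random_perm(set_sizes, n_tot):
    membership = [
        np.random.choice(n_tot, size=sz, replace=False)
        for sz in set_sizes.values()
    ]
    _, vc = setops.venn_from_arrays(*membership)
    return vc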
Example 5
    def check_data_compat(self):
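        """
        Check that any loaded DMR results, DE results and methylation data
        (mdat) are consistent with the DMR comparison groups that have been
        set; raise ValueError on the first inconsistency found.
        """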

        if self.dmr_comparison_groups is not None:

            if self.dmr_res is not None:
                for grp_name, grp_dict in self.dmr_comparison_groups.items():
                    if grp_name not in self.dmr_res:
                        raise ValueError("Group %s is not in the DMR results" %
                                         grp_name)

            if self.de_res is not None:
                for grp_name, grp_dict in self.dmr_comparison_groups.items():
                    if grp_name not in self.de_res:
                        raise ValueError("Group %s is not in the DE results" %
                                         grp_name)

            if self.mdat is not None:
                for grp_name, grp_dict in self.dmr_comparison_groups.items():
                    all_samples = list(setops.reduce_union(*grp_dict.values()))
                    if len(self.mdat.columns.intersection(all_samples)) != len(
                            all_samples):
                        raise ValueError(
                            "Group %s contains samples that are missing from mdat"
                            % grp_name)
Example 6

def export_de_dmr_groups_for_ipa(de_fdr, de_logfc, groups, fn_out=None, pids=consts.PIDS):
    """
    Export group-specific DE results (FDR and logFC) in a wideform table suitable for IPA analysis.
    :param de_fdr: Output of `get_de_dmr_groups` ('de_FDR' entry): dict keyed by group, then patient ID.
    :param de_logfc: Output of `get_de_dmr_groups` ('de_logFC' entry): dict keyed by group, then patient ID.
    :param groups: Dictionary, keyed by group name. Values are iterables giving patient IDs in each group.
    :param fn_out: If supplied, the IPA results (in Excel format) will be written to this path
    :param pids: Patient IDs; one logFC and one FDR column is created per patient.
    :return: The wideform pd.DataFrame.
    """
    # export these for IPA analysis
    df_for_ipa = pd.DataFrame(
        index=sorted(setops.reduce_union(*[t.index for t in de_logfc.values()])),
        columns=reduce(operator.add, [["%s_logFC" % pid, "%s_FDR" % pid] for pid in pids])
    )
    for grp in groups:
        for pid in groups[grp]:
            this_logfc = de_logfc[grp][pid].dropna()
            this_fdr = de_fdr[grp][pid].dropna()
            df_for_ipa.loc[this_logfc.index, "%s_logFC" % pid] = this_logfc
            df_for_ipa.loc[this_fdr.index, "%s_FDR" % pid] = this_fdr
    if fn_out is not None:
        df_for_ipa.to_excel(fn_out)
    return df_for_ipa
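
# Typical use, following on from `get_de_dmr_groups` (defined further down this
# page); the variable names here mirror the call seen in that example:
if False:
    tmp = get_de_dmr_groups(joint_de_dmr_s1, dmr_res_s1.clusters, groups)
    df_for_ipa = export_de_dmr_groups_for_ipa(
        tmp['de_FDR'], tmp['de_logFC'], groups, fn_out='de_dmr_for_ipa.xlsx')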
Example 7
               ls='--')
    ax.set_xlabel('Number of variants')
    ax.set_ylabel('Density')
    fig.tight_layout()
    fig.savefig(os.path.join(outdir,
                             "permute_partial_counts_meth_assoc_hyper.png"),
                dpi=200)

    # track these partial matches down
    aa_hypo = [(setops.key_to_members(t, pids), vs[t])
               for t in venn_sets_by_group['partial']['Hypo']
               if len(setops.key_to_members(t, pids)) > 2]
    aa_hyper = [(setops.key_to_members(t, pids), vs[t])
                for t in venn_sets_by_group['partial']['Hyper']
                if len(setops.key_to_members(t, pids)) > 4]
    all_hypo = setops.reduce_union(*[t[1] for t in aa_hypo])
    all_hyper = setops.reduce_union(*[t[1] for t in aa_hyper])

    partial_hypo_recs = []
    partial_hyper_recs = []

    for pid_arr, arr in aa_hypo:
        for x in arr:
            # take a copy: extending the stored list in place would corrupt dat_classified
            the_search_list = list(dat_classified[pid_arr[0]]['GIC only'])
            the_search_list.extend([
                t['GIC']
                for t in dat_classified[pid_arr[0]]['GIC hom iNSC het']
            ])
            the_search_list.extend(
                [t['GIC'] for t in dat_classified[pid_arr[0]]['other']])
            the_recs = [t for t in the_search_list if str(t) == x]
Example 8

        'Hypo': '#c70039',
        'Hyper': '#3d3d6b',
        'Discordant': 'b'
    }

    # Don't think we need this, but may be useful for a comparison?
    if False:
        dmr_by_member = [dmr_res_all[pid].keys() for pid in pids]
        venn_set, venn_ct = setops.venn_from_arrays(*dmr_by_member)

        venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(pids, groups)
        dmr_groups = {}
        for grp in groups:
            # generate bar chart showing number / pct in each direction (DM)
            this_sets = venn_sets_by_group['full'][grp] + venn_sets_by_group['partial'][grp]
            this_dmrs = sorted(setops.reduce_union(*[venn_set[k] for k in this_sets]))
            dmr_groups[grp] = this_dmrs

    # Rather than just looking at genes corresponding to group-specific DMRs, we make the requirements more
    # stringent. For each Venn set (e.g. 018, 054, 052 - hyper group), we require DE genes in the same patients.
    # Simplest approach is to use the joint_de_dmr dataframes, which have already been combined.
    # all relations

    tmp = get_de_dmr_groups(joint_de_dmr_s1, dmr_res_s1.clusters, groups)
    de_dmrs_all = tmp['de_dmr_groups']
    de_dmr_de_fdr_all = tmp['de_FDR']
    de_dmr_de_logfc_all = tmp['de_logFC']
    de_dmr_dmr_median_delta_all = tmp['dmr_median_delta_m']
    de_dmr_ipa_res_all = export_de_dmr_groups_for_ipa(
        de_dmr_de_fdr_all,
        de_dmr_de_logfc_all,
Example 9

def get_de_dmr_groups(
        joint_de_dmr,
        clusters,
        groups,
        pids=consts.PIDS,
        relation_filter=None
):
    """
    Get group-specific DE/DMRs. These are defined as DEs that are consistent with the DMRs in a given selection of
    patients (from one to many) that are NOT shared across groups.
    :param joint_de_dmr:
    :param clusters:
    :param groups: Dictionary, keyed by group name. Values are iterables giving patient IDs in each group.
    :param pids:
    :param relation_filter:
    :return:
    """
    venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(pids, groups)

    if relation_filter is not None:
        if not hasattr(relation_filter, '__iter__'):
            relation_filter = [relation_filter]

    de_dmr_groups = {}
    de_dmr_de_logfc = {}
    de_dmr_de_fdr = {}
    de_dmr_dmr_delta = {}

    if relation_filter is None:
        de_dmr_by_member = [joint_de_dmr[pid].index for pid in pids]
    else:
        de_dmr_by_member = []
        for pid in pids:
            this_members = []
            for t in joint_de_dmr[pid].index:
                gene_rel_options = [(t[1], rel) for rel in relation_filter]
                if len(set(clusters[t[0]].genes).intersection(gene_rel_options)) > 0:
                    this_members.append(t)
            de_dmr_by_member.append(this_members)
    venn_set, venn_count = setops.venn_from_arrays(*de_dmr_by_member)

    for grp in groups:
        this_sets = venn_sets_by_group['full'][grp] + venn_sets_by_group['partial'][grp]
        this_de_dmrs = sorted(setops.reduce_union(*[venn_set[k] for k in this_sets]))

        if relation_filter is not None:
            new_de_dmrs = []
            for t in this_de_dmrs:
                # look for any intersection here
                gene_rel_options = [(t[1], rel) for rel in relation_filter]
                if len(set(clusters[t[0]].genes).intersection(gene_rel_options)) > 0:
                    new_de_dmrs.append(t)
            this_de_dmrs = new_de_dmrs

        de_dmr_groups[grp] = this_de_dmrs

        # get separate lists of DE genes and DMR IDs
        # DMRs is straightforward
        de_dmr_dmr_delta[grp] = pd.DataFrame(
            index=sorted(set([t[0] for t in this_de_dmrs])),
            columns=pids + ['consistent'],
        )
        # DEs is trickier: some genes have mapped twice because I was so diligent in curating the original lists!
        this_de_genes = sorted(set([t[1] for t in this_de_dmrs]))
        this_de_ens = annotation_gene_to_ensembl.gene_to_ens(this_de_genes)
        this_de_ens = this_de_ens[~this_de_ens.duplicated()]
        this_de_genes = this_de_ens.index

        de_dmr_de_logfc[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=pids + ['consistent'],
        )
        de_dmr_de_fdr[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=pids + ['consistent'],
        )

        # fill them in
        for k in this_sets:
            this_vs = [t for t in venn_set[k] if t[1] in this_de_genes]
            this_pids = [pids[i] for i, t in enumerate(k) if t == '1']
            for pid in this_pids:
                de_dmr_dmr_delta[grp].loc[[t[0] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'dmr_median_delta'].values
                de_dmr_de_logfc[grp].loc[[t[1] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'de_logFC'].values
                de_dmr_de_fdr[grp].loc[[t[1] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'de_FDR'].values

        # note: rows yielded by iterrows() are copies, so write back via .loc
        for k, row in de_dmr_dmr_delta[grp].iterrows():
            tmp_dm = np.sign(row.drop('consistent').dropna().astype(float))
            de_dmr_dmr_delta[grp].loc[k, 'consistent'] = bool((tmp_dm == tmp_dm.iloc[0]).all())

        for k, row in de_dmr_de_logfc[grp].iterrows():
            tmp_de = np.sign(row.drop('consistent').dropna().astype(float))
            this_consistent = bool((tmp_de == tmp_de.iloc[0]).all())
            de_dmr_de_logfc[grp].loc[k, 'consistent'] = this_consistent
            de_dmr_de_fdr[grp].loc[k, 'consistent'] = this_consistent

    return {
        'dmr_median_delta_m': de_dmr_dmr_delta,
        'de_logFC': de_dmr_de_logfc,
        'de_FDR': de_dmr_de_fdr,
        'de_dmr_groups': de_dmr_groups
    }
Example 10
def venn_set_to_dataframe(
        data,
        venn_set,
        set_labels,
        include_sets=None,
        full_data=None,
        logfc_col='logFC',
        fdr_col='FDR',
        run_sanity_check=False,
        add_null_set=False,
):
    """
    Given the input DE data and Venn sets, generate a wide format dataframe containing all the data, one column
    per patient and one row per gene.
    Optionally filter the sets to include only a subset.
    Optionally include non-significant results too.
    :param data: Dict containing DE results, keyed by the entries of set_labels
    :param venn_set:
    :param set_labels:
    :param include_sets:
    :param full_data: If supplied, this has the same format as `data`, but the lists are complete so that even non-
    significant results can be accessed.
    :param logfc_col: The name of the log fold change column in the input data. Also used to name columns in the df.
    :param fdr_col: The name of the FDR column in the input data. Also used to name columns in the df.
    :param run_sanity_check: (default: False) If True, run an additional sanity check at the end. This *should* be
    unnecessary. It's slow for larger numbers of members.
    :return:
    """
    if add_null_set and full_data is None:
        raise ValueError("Can only add_null_set if full_data is supplied.")
    if include_sets is not None:
        venn_set = dict([
            (k, v) for k, v in venn_set.items() if k in include_sets
        ])


    # precompute columns
    cols = reduce(
        lambda x, y: x + y,
        [[t, "%s_%s" % (t, logfc_col), "%s_%s" % (t, fdr_col)] for t in set_labels]
    ) + ['consistency']

    res = []
    genes_seen = set()
    for k in venn_set:
        the_genes = venn_set[k]
        genes_seen.update(the_genes)

        # populate with individual patient results
        this_block = pd.DataFrame(index=the_genes, columns=cols)
        # blocks = []
        consistency_check = []
        for i, t in enumerate(k):
            pid = set_labels[i]

            if t == '1':
                this_block.loc[:, pid] = 'Y'
                this_block.loc[the_genes, "%s_%s" % (pid, logfc_col)] = data[pid].loc[the_genes, logfc_col]
                this_block.loc[the_genes, "%s_%s" % (pid, fdr_col)] = data[pid].loc[the_genes, fdr_col]

                cc = data[pid].loc[the_genes, 'Direction']
                cc.name = pid
                consistency_check.append(cc)
            else:
                this_block.loc[:, pid] = 'N'

                # this_datum.loc[the_genes, pid] = 'N'
                if full_data is not None:
                    # we can't guarantee there will be entries for all genes, as filtering removes some
                    # therefore find matches in advance and only fill in those rows
                    the_genes_present = pd.Index(the_genes).intersection(full_data[pid].index)
                    this_block.loc[the_genes_present, "%s_%s" % (pid, logfc_col)] = full_data[pid].loc[the_genes_present, logfc_col]
                    this_block.loc[the_genes_present, "%s_%s" % (pid, fdr_col)] = full_data[pid].loc[the_genes_present, fdr_col]

        # assess consistency of DE direction
        consist = pd.Series(index=the_genes)

        if len(consistency_check) > 0:
            consistency_check = pd.concat(consistency_check, axis=1)
            idx = consistency_check.apply(lambda col: col == consistency_check.iloc[:, 0]).all(axis=1)
            consist.loc[idx] = 'Y'
            consist.loc[~idx] = 'N'

        this_block.loc[:, 'consistency'] = consist

        res.append(this_block)

    # check: no genes should be in more than one data entry
    if run_sanity_check:
        for i, k in enumerate(venn_set):
            for j, k2 in enumerate(venn_set):
                if k == k2: continue
                bb = len(res[i].index.intersection(res[j].index))
                if bb > 0:
                    raise AttributeError("Identified %d genes that are in BOTH %s and %s" % (bb, k, k2))

    if add_null_set:
        all_genes = setops.reduce_union(*[t.index for t in full_data.values()])
        add_genes = all_genes.difference(genes_seen)
        this_block = pd.DataFrame(index=add_genes, columns=cols)
        for pid in set_labels:
            # by definition, no samples are DE positive in the null set
            this_block.loc[:, pid] = 'N'
            the_genes_present = add_genes.intersection(full_data[pid].index)
            this_block.loc[the_genes_present, "%s_%s" % (pid, logfc_col)] = full_data[pid].loc[the_genes_present, logfc_col]
            this_block.loc[the_genes_present, "%s_%s" % (pid, fdr_col)] = full_data[pid].loc[the_genes_present, fdr_col]
        res.append(this_block)

    res = pd.concat(res, axis=0)

    # add gene symbols
    general.add_gene_symbols_to_ensembl_data(res)

    return res
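
# Minimal sketch of the expected inputs (assumed shapes): `data` maps each
# label to a DE table with logFC / FDR / Direction columns, and the Venn sets
# come from setops.venn_from_arrays on the per-label indices.
if False:
    venn_set, _ = setops.venn_from_arrays(*[data[lbl].index for lbl in set_labels])
    wide = venn_set_to_dataframe(
        data, venn_set, set_labels, full_data=full_data, add_null_set=True)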
Example 11
            sample_text = dat.columns
            rad = (np.array(zip(*[svd_res['feat_dat'][i + 1] for i in dims_pair])) ** 2).sum(axis=1) ** .5
            to_annotate = rad > selection_radius
            p1 = generate_plotly_plot(
                svd_res,
                filename="pca_biplot_dims_%d-%d" % tuple(t + 1 for t in dims_pair),
                feature_size_scaling=size_scaling,
                feature_text=feature_text,
                sample_text=sample_text,
                sample_colours=sample_colours,
                sample_markers=sample_markers,
                feature_text_mask=~to_annotate
            )

    # export lists for IPA
    ix_all = sorted(setops.reduce_union(*[t[0.99].index for t in selected_by_quantile_mean_logfc.values()]))
    ipa_mean_logfc = pd.DataFrame(index=ix_all)
    for k, v in selected_by_quantile_mean_logfc.items():
        ipa_mean_logfc.insert(0, "pc_%s_q99_logFC" % '-'.join([str(t+1) for t in k]), v[0.99])
        ipa_mean_logfc.insert(0, "pc_%s_q995_logFC" % '-'.join([str(t+1) for t in k]), v[0.995])
    ipa_mean_logfc.to_excel(os.path.join(outdir, "for_ipa_mean_logfc.xlsx"))

    for k, v in selected_by_quantile_separate_logfc.items():
        ix_all = setops.reduce_union(*[v[q].index for q in quantiles])
        this = []
        for q in quantiles:
            tt = v[q]
            tt.columns = ["%s_%d_logFC" % (p, int(q * 1000)) for p in pids]
            this.append(tt)
        pd.concat(this, axis=1, sort=True).to_excel(os.path.join(outdir, "for_ipa_separate_logfc_pc%s.xlsx" % '-'.join([str(t+1) for t in k])))
Example 12
    def plot_m_values(
        self,
        mdat,
        probe_locations,
        comparisons,
        colours='default',
        markers='default',
        zorder='default',
        alpha='default',
        size='default',
    ):
        """

        :param mdat: pd.DataFrame containing the data to plot. Columns are samples, rows are probes
        :param probe_locations: pd.Series containing the probe IDs to include and their genomic coordinates
        :param comparisons: Dictionary keyed by comparison (equivalent to row_names). Each entry is a dictionary keyed
        by group name (e.g. 'Disease' / 'Healthy') and with values giving the samples in that group. The sample names
        must be in the columns of `mdat`.
        :param colours: Dictionary keyed by group name (e.g. 'Disease') giving the colour to use for that group.
        Defaults are used if not supplied. To disable colours, set to None.
        :param markers: Dictionary keyed by group name giving the marker to use for that group.
        Defaults are used if not supplied. To use circle markers for everything, set to None.
        :param zorder: Dictionary keyed by group name giving the zorder to use for that group.
        Defaults are used if not supplied. To use matplotlib defaults for everything, set to None.
        :param alpha: Dictionary keyed by group name giving the alpha to use for that group.
        Defaults are used if not supplied. To use matplotlib defaults for everything, set to None.
        :param size: Dictionary keyed by group name giving the marker size to use for that group.
        Defaults are used if not supplied.
        :return:
        """
        all_groups = sorted(
            setops.reduce_union(*(t.keys() for t in comparisons.values())))
        n_groups = len(all_groups)

        def set_property(x, default, default_static):
            if x == 'default':
                out = dict(zip(all_groups, default))
            elif x is None:
                out = dict([(k, default_static) for k in all_groups])
            elif not hasattr(x, 'get'):
                # single value supplied
                out = dict([(k, x) for k in all_groups])
            else:
                out = x
            return out

        colours = set_property(colours, common.get_best_cmap(n_groups), '0.5')
        markers = set_property(markers, common.get_best_marker_map(n_groups),
                               'o')
        zorder = set_property(zorder, range(20, 20 + n_groups), 20)
        # default alpha will be based on zorder
        a = sorted([(k, zorder[k]) for k in all_groups], key=lambda x: x[1])
        a_ix = dict([(t[0], i) for i, t in enumerate(a)])
        alpha_values = np.linspace(0.4, 0.6, n_groups)
        alpha_default = [alpha_values[a_ix[k]] for k in all_groups]

        alpha = set_property(alpha, alpha_default, 0.6)

        # default size will be based on zorder
        s_values = range(20, 20 + n_groups)
        s_default = [s_values[a_ix[k]] for k in all_groups]
        size = set_property(size, s_default, 20)

        # scatter plot individual probes
        ymin = 0
        ymax = 0
        for nm in self.row_names:
            grp_dict = comparisons[nm]
            this_ax = self.m_axs[nm]
            for grp_nm, grp_samples in grp_dict.items():
                the_colour = colours.get(grp_nm)
                the_marker = markers.get(grp_nm)
                the_z = zorder.get(grp_nm)
                the_alpha = alpha.get(grp_nm)
                the_s = size.get(grp_nm)
                for col, x in mdat.loc[probe_locations.index,
                                       grp_samples].iteritems():
                    this_ax.scatter(probe_locations,
                                    x.values,
                                    c=the_colour,
                                    marker=the_marker,
                                    zorder=the_z,
                                    alpha=the_alpha,
                                    s=the_s,
                                    edgecolor='k',
                                    linewidth=0.5)
                    ymin = min(x.values.min(), ymin)
                    ymax = max(x.values.max(), ymax)
                    this_ax.set_ylabel(nm)
        self.mdat_min = ymin
        self.mdat_max = ymax

        if self.coord_max is None:
            self.coord_min = probe_locations.min()
            self.coord_max = probe_locations.max()
        else:
            self.coord_min = min(probe_locations.min(), self.coord_min)
            self.coord_max = max(probe_locations.max(), self.coord_max)
Example 13

            de_res_full_s1 = pickle.load(f)
    else:
        raise AttributeError(
            "Unable to load pre-computed DE results, expected at %s" % fn)

    de_res_s1 = dict([(k, v.loc[v.FDR < de_params['fdr']])
                      for k, v in de_res_full_s1.items()])

    # get the joint table
    joint_de_dmr_s1 = rnaseq_methylationarray.compute_joint_de_dmr(
        dmr_res_s1, de_res_s1)

    # run the dgidb lookup against all genes
    # have to chunk this operation to avoid error
    all_genes = sorted(
        setops.reduce_union(*[t.gene.values
                              for t in joint_de_dmr_s1.values()]))
    dgi_all = druggable_genome.dgidb_lookup_drug_gene_interactions(all_genes)

    # manually resolve a few known ambiguities
    ambig = {'ELTD1': 'ADGRL4', 'ODZ3': 'TENM3'}
    for k, v in ambig.items():
        x = [t for t in dgi_all['ambiguous'][k] if t['geneName'] == v][0]
        dgi_all['interactions'][k] = x['interactions']

    de_dmr_by_member = [joint_de_dmr_s1[pid].index for pid in pids]
    venn_set, venn_ct = setops.venn_from_arrays(*de_dmr_by_member)

    # define short and long list

    # long list
    ss = setops.specific_sets(pids)
Example 14
    # functional API - the python bindings are incomplete here?
    cy = CyRestClient()
    # reset the session (in case something is already loaded)
    cy.session.delete()

    # command API - the python bindings are much better
    cy_cmd = cyrest.cyclient()

    for pid in pids:

        # three networks to work with
        res_syn = res['%s_syngeneic' % pid]
        res_r1 = res['%s_h9' % pid]
        res_r2 = res['%s_gibco' % pid]

        all_pathways = setops.reduce_union(
            *[t.index for t in (res_syn, res_r1, res_r2)])

        p_to_g = dict([(p, gmt[p]) for p in all_pathways])

        # to get connectivity, we need to create the complementary dictionary (indexed by genes)
        g_to_p = {}
        for p in all_pathways:
            for g in p_to_g[p]:
                g_to_p.setdefault(g, []).append(p)

        # we're going to use passthrough mapping to customise the node colour
        # we'll define 3 colourmaps, with -log10(p) assigning the shade:
        # greyscale for syn. and ref.
        # reds for ref. only
        # blues for syn. only
        # colours are defined by HEX values? Add these to the nodes
Example 15

        for j, (k2, v2) in enumerate(v1.iteritems()):
            k = 0
            ax = fig.add_subplot(gs_sub[j, k])
            if j == 0:
                ax.set_title('Hypo')
            this_members = [v2.index[v2["median_delta_%s" % r] < 0] for r in esc_ref_names]
            set_labels = None
            if j == (len(v1) - 1):
                set_labels = esc_ref_names

            vd = venn.venn_diagram(
                *this_members,
                set_labels=set_labels,
                set_colors=set_colours_hypo,
                ax=ax,
                normalize_to=(len(setops.reduce_union(*this_members)) / set_size_base) ** 2
            )[0]
            plt.setp(vd.patches, edgecolor='k')
            if vd.set_labels is not None:
                for lbl in vd.set_labels:
                    xx, yy = lbl.get_position()
                    lbl.set_position([xx * 3, yy])

            k = 1
            ax = fig.add_subplot(gs_sub[j, k])
            if j == 0:
                ax.set_title('Hyper')
            this_members = [v2.index[v2["median_delta_%s" % r] > 0] for r in esc_ref_names]
            vd = venn.venn_diagram(
                *this_members,
                set_labels=set_labels,
Example 16
        # these_probes = cor.index[(cor.abs() > cross_corr_threshold) & (pval < alpha)]
        # myc_corr_probes.append(these_probes)

    pool.close()
    pool.join()
    for p in myc_probes:
        cor, pval = jobs[p].get(1e4)
        these_probes = cor.index[(cor.abs() > cross_corr_threshold)
                                 & (pval < alpha)]
        myc_corr_probes.append(these_probes)

    #  out of interest, what is the overlap between these? (presumably quite high?)
    vs, vc = setops.venn_from_arrays(*myc_corr_probes)

    # union of probes
    keep_probes = setops.reduce_union(*myc_corr_probes)

    print "After comparing all data against each MYC probe, we are left with %d correlated probes" % len(
        keep_probes)

    genes_corr_with_myc = the_symbols.loc[keep_probes].dropna()
    print "These correspond to %d unique genes." % len(
        genes_corr_with_myc.unique())

    # check the overlap with validated genes
    overlap = pd.Index(validated_genes).intersection(
        genes_corr_with_myc.unique())
    if len(overlap) == len(validated_genes):
        print "Good news: all %d validated genes are in the genes shortlist." % len(
            validated_genes)
    else:
Example 17

    if len(diff_kegg):
        print "%d genes in the geneset mTOR (KEGG) are not in the data and will be removed: %s" % (
            len(diff_kegg),
            ', '.join(diff_kegg.tolist())
        )
        for t in diff_kegg:
            mtor_geneset.remove(t)

    rna_list_hu['mTOR'] = mtor_geneset

    # export supplementary tables
    to_export = the_list_mo.copy()
    to_export.columns = ['Mouse BMDM', 'Mouse MG']


    all_genes_in_set = setops.reduce_union(*the_list_hu.values())

    # DEBUG: disable filtering genes - why would we need to?
    if False:
        # remove genes that have no appreciable expression level
        # >=10 samples must have FPKM >= 1
        to_keep = ((rnaseq_dat > fpkm_cutoff).sum(axis=1) > fpkm_min_samples) | (rnaseq_dat.index.isin(all_genes_in_set))
        print "Keeping %d / %d genes that are sufficiently abundant" % (to_keep.sum(), to_keep.size)
        rnaseq_dat = rnaseq_dat.loc[to_keep]

    # run ssGSEA
    rna_es = gsva.ssgsea(rnaseq_dat, rna_list_hu)
    ffpe_es = gsva.ssgsea(ffpe_dat, rna_list_hu)

    # scale using the Z transform
    # TODO: previous operation had axis=None
                                                  "iNSC%s" % insc_pid)])

    # for each GIC line: get DE genes in syngeneic comparison but NOT in any cross-comparison
    n_syn_only = pd.DataFrame(index=pd.Index(pids, name='GIC'),
                              columns=pd.Index(pids, name='iNSC'))
    syn_only = {}
    for gic_pid, insc_pid in itertools.product(pids, pids):
        # this_syn isn't necessarily syngeneic, but we're acting as if it were here
        this_syn = de_res_sign[("GBM%s" % gic_pid, "iNSC%s" % insc_pid)]
        others = [
            de_res_sign[("GBM%s" % gic_pid, "iNSC%s" % p)]
            for p in pd.Index(pids).drop(insc_pid)
        ]
        # 'syn only' index
        this_so_ix = this_syn.index.difference(
            setops.reduce_union(*[t.index for t in others]))
        syn_only[("GBM%s" % gic_pid,
                  "iNSC%s" % insc_pid)] = this_syn.loc[this_so_ix]
        n_syn_only.loc[gic_pid, insc_pid] = this_so_ix.size

    true_syn_only = dict([(p, syn_only[("GBM%s" % p, "iNSC%s" % p)])
                          for p in pids])

    # export to list
    excel.pandas_to_excel(true_syn_only,
                          os.path.join(outdir, "de_only_in_syngeneic.xlsx"))

    # export for IPA
    # we're going to run the true syngeneic (10) against non-syngeneic chosen to give the greatest number of DE genes
    # in practice, this means fixing the identity of the iNSC
    selected_insc = ['018', '030', '054', '052']
Example 19
    def __init__(self, loaders, intersection_only=True):
        """
        Class to combine multiple loader objects.
        Each loader represents a separate batch. Inputs can include multiple lane loaders.
        :param loaders: Iterable of loader objects.
        :param intersection_only: If True (default), reduce counts to the indices (e.g. genes) that are present in all
        loaders.
        """
        self.logger = log.get_console_logger(self.__class__.__name__)

        if len(loaders) < 2:
            raise ValueError("Must supply 2 or more loaders to use a MultipleBatchLoader.")

        # we can only claim the meta data is linked here if all loaders have this property
        self.meta_is_linked = True
        for l in loaders:
            if not l.meta_is_linked:
                self.meta_is_linked = False

        # set the batch column name, avoiding clashes
        batch_col = 'batch'
        meta_cols = sorted(setops.reduce_union(*[t.meta.columns for t in loaders if t.meta is not None]))

        if batch_col in meta_cols:
            i = 1
            while batch_col in meta_cols:
                batch_col = "batch_%d" % i
                i += 1
        meta_cols += [batch_col]

        # check attributes that must match in all loaders
        if len(set([t.tax_id for t in loaders])) > 1:
            raise AttributeError(
                "The tax_id of the samples differ between loaders: %s" % ', '.join([str(t.tax_id) for t in loaders])
            )
        else:
            self.tax_id = loaders[0].tax_id

        if len(set([t.row_indexed for t in loaders])) > 1:
            raise AttributeError("row_indexed bool must be the same in all loaders")
        else:
            self.row_indexed = loaders[0].row_indexed

        extra_df_attributes = {}

        if self.row_indexed:
            row_indexed_dat_arr = {}
        else:
            dat = {}

        meta_values = []
        meta_index = []
        blank_meta_row = dict([(k, None) for k in meta_cols])

        # we may need to append a number to sample names
        sample_appendix = 0
        auto_batch = 1
        meta_auto_idx = 0
        samples_seen = set()

        for l in loaders:
            this_batch = l.batch_id
            if not hasattr(this_batch, '__iter__'):
                if l.batch_id is None:
                    this_batch = auto_batch
                    auto_batch += 1
                this_batch = pd.Series(this_batch, index=l.meta.index)

            try:
                this_samples = l.input_files.index.tolist()
            except AttributeError:
                # occurs when we are loading a single file
                # FIXME: find a better catch - this is too general
                if hasattr(l, 'input_files'):
                    # this occurs if l is a single file loader
                    ## FIXME: single file loaders may contain multiple samples
                    ## in that case, this doesn't spot name clashes!!

                    # FIXME: here's a workaround for now: may not be bulletproof
                    this_samples = [l.input_files]
                    if len(this_samples) != len(l.meta.index):
                        this_samples = l.meta.index.tolist()
                else:
                    # this occurs if l is a batch loader
                    # FIXME: may not give us valid sample names?
                    this_samples = l.meta.index.tolist()

            # get a copy of the data
            if self.row_indexed:
                this_dat = l.data.copy()
            else:
                this_dat = copy.copy(l.data)

            # get a copy of meta
            if l.meta is not None:
                this_meta = l.meta.copy()

            # resolve any sample clashes in the data (NOT the meta data)
            clash_resolved = False
            new_names = []

            while len(samples_seen.intersection(this_samples)) > 0:
                sample_appendix += 1
                # find the clash
                clashes = samples_seen.intersection(this_samples)
                self.logger.warning(
                    "Found sample name clash(es): %s. Modifying names to avoid errors.",
                    ', '.join(clashes)
                )
                for c in clashes:
                    new_names.append([
                        this_samples[this_samples.index(c)],
                        this_samples[this_samples.index(c)] + "_%d" % sample_appendix
                    ])
                    this_samples[this_samples.index(c)] += "_%d" % sample_appendix
                clash_resolved = True
            samples_seen.update(this_samples)

            if clash_resolved:
                # relabel metadata if linked
                if l.meta_is_linked:
                    # reorder first to be sure it's the same as data
                    this_meta = this_meta.loc[this_dat.columns]
                    this_meta.index = this_samples

                # relabel the data
                if self.row_indexed:
                    this_dat.columns = this_samples
                else:
                    for prev, new in new_names:
                        this_dat[new] = this_dat.pop(prev)

                # relabel the batch IDs
                this_batch.index = this_samples
                # relabel any other DF data if present
                for fld in l.extra_df_attributes:
                    x = getattr(l, fld)
                    x.columns = this_samples

            # data
            if self.row_indexed:
                if isinstance(this_dat.columns, pd.MultiIndex):
                    col_list = this_dat.columns.levels[0].tolist()
                else:
                    col_list = this_dat.columns.tolist()
                for c in col_list:
                    row_indexed_dat_arr[c] = this_dat[[c]]

            else:
                dat.update(this_dat)

            # other df attributes
            for fld in l.extra_df_attributes:
                if fld not in extra_df_attributes:
                    extra_df_attributes[fld] = getattr(l, fld).copy()
                else:
                    extra_df_attributes[fld] = pd.concat((extra_df_attributes[fld], getattr(l, fld)), axis=1)

            # rebuild meta
            if l.meta is not None:
                for i in this_meta.index:
                    this_row = dict(blank_meta_row)
                    this_row.update(this_meta.loc[i].to_dict())
                    this_row[batch_col] = this_batch[i]
                    meta_values.append(this_row)
                    if l.meta_is_linked:
                        meta_index.append(i)
                    else:
                        meta_index.append(meta_auto_idx)
                        meta_auto_idx += 1
            else:
                for c in this_dat.columns:
                    this_row = dict(blank_meta_row)
                    this_row[batch_col] = this_batch[c]
                    meta_values.append(this_row)
                    meta_index.append(meta_auto_idx)
                    meta_auto_idx += 1

        self.meta = pd.DataFrame(meta_values, index=meta_index, columns=meta_cols)
        if intersection_only:
            join = 'inner'
        else:
            join = 'outer'

        if self.row_indexed:
            dat = pd.concat(
                [row_indexed_dat_arr[k] for k in self.meta.index],
                axis=1, sort=True, join=join
            )

        self.data = dat
        self.batch_id = self.meta.loc[:, batch_col]

        self.extra_df_attributes = tuple()
        for fld in extra_df_attributes:
            setattr(self, fld, extra_df_attributes[fld])
            self.extra_df_attributes += (fld,)
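
# Illustrative use (loader construction elided; `loader_a` / `loader_b` are
# hypothetical, and each loader must expose .meta, .data, .batch_id, .tax_id,
# .row_indexed, .meta_is_linked and .extra_df_attributes, as accessed above):
if False:
    obj = MultipleBatchLoader([loader_a, loader_b], intersection_only=True)
    combined_data = obj.data
    combined_meta = obj.meta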
Example 20
    indir = os.path.join(GIT_LFS_DATA_DIR, 'ipa_from_biplots')

    # single components
    for q in [99, 995]:
        ipa_pathways_single = {}
        for dim in range(1, 4):
            fn = os.path.join(indir, "%d_%d.txt" % (dim, q))
            this = pd.read_csv(fn, sep='\t', skiprows=2, header=0, index_col=0)
            this.columns = ['-logp', 'ratio', 'z', 'genes']
            # add ngenes column
            this.insert(3, 'n_gene', this.genes.str.split(',').apply(len))
            this.index = [x.decode('utf-8') for x in this.index]
            ipa_pathways_single[dim] = this

        ipa_single = pd.DataFrame(index=sorted(setops.reduce_union(*[ipa_pathways_single[i].index for i in range(1, 4)])))
        for i in range(1, 4)[::-1]:
            ipa_single.insert(0, i, ipa_pathways_single[i]['-logp'])
        ipa_single.fillna(0., inplace=True)
        # drop rows with no significant results
        ipa_single = ipa_single.loc[(ipa_single > -np.log10(0.05)).sum(axis=1) > 0]
        p_order = ipa_single.sum(axis=1).sort_values(ascending=False).index
        fig = plt.figure(figsize=(7., 9.8))
        ax = fig.add_subplot(111)
        sns.heatmap(
            ipa_single.loc[p_order],
            mask=ipa_single.loc[p_order] == 0,
            cmap='YlOrRd',
            linewidths=.2,
            linecolor='w',
            cbar_kws={"orientation": 'vertical', "shrink": 0.6},
            ax=ax
Example 21

                svd_res,
                filename="pca_biplot_dims_%d-%d" % tuple(t + 1
                                                         for t in dims_pair),
                feature_size_scaling=size_scaling,
                feature_text=feature_text,
                sample_text=sample_text,
                sample_colours=sample_colours,
                sample_markers=sample_markers,
                feature_text_mask=~to_annotate,
                components=tuple(i + 1 for i in dims_pair),
            )

    # export lists for IPA

    ix_all = sorted(
        setops.reduce_union(
            *[t.index for t in selected_by_quantile_mean_logfc.values()]))

    ipa_mean_logfc = pd.DataFrame(index=ix_all)
    for k, v in selected_by_quantile_mean_logfc.items():
        ipa_mean_logfc.insert(
            0, "pc_%s_logFC" % '-'.join([str(t + 1) for t in k]), v)
    ipa_mean_logfc.to_excel(os.path.join(outdir, "for_ipa_mean_logfc.xlsx"))

    # for separated data, combine single and paired PC for maximum efficiency
    for first_dim in dims:
        dims_pair = (first_dim, first_dim + 1)
        ix_all = setops.reduce_union(*[
            selected_by_quantile_separate_logfc[k].index
            for k in [(first_dim, ), dims_pair]
        ])
        this_df = pd.DataFrame(index=ix_all)
if __name__ == "__main__":
    """
    Here I'm trying to assemble a function that automates statistical testing of upset plot intersection sizes against
    a fixed-set-size uniform random null.
    """
    n_iter = 1000
    data = {
        'A': range(5) + range(10, 16),
        'B': range(0, 21, 2),
        'C': range(1, 22, 2),
        'D': range(0, 25, 4)
    }

    K = len(data)
    full = setops.reduce_union(*data.values())
    N = len(full)
    set_sizes = collections.OrderedDict([(k, len(v)) for k, v in data.items()])
    # n_intersections = int(sum([special.comb(K, i, exact=True) for i in range(1, K + 1)]))
    intersections = list(setops.binary_combinations(K))
    simulated_sizes = collections.defaultdict(list)

    pool = mp.Pool()
    jobs = {}
    for i in range(n_iter):
        jobs[i] = pool.apply_async(one_random_perm, args=(set_sizes, N))

    pool.close()
    pool.join()
    for i, j in jobs.items():
        vc = j.get()
Example 23
            ax = fig.add_subplot(gs_sub[j, k])
            if j == 0:
                ax.set_title('Hypo')
            this_members = [
                v2.index[v2["median_delta_%s" % r] < 0] for r in esc_ref_names
            ]
            set_labels = None
            if j == (len(v1) - 1):
                set_labels = esc_ref_names

            vd = venn.venn_diagram(
                *this_members,
                set_labels=set_labels,
                set_colors=set_colours_hypo,
                ax=ax,
                normalize_to=(len(setops.reduce_union(*this_members)) /
                              set_size_base)**2)[0]
            plt.setp(vd.patches, edgecolor='k')
            if vd.set_labels is not None:
                for lbl in vd.set_labels:
                    xx, yy = lbl.get_position()
                    lbl.set_position([xx * 3, yy])

            k = 1
            ax = fig.add_subplot(gs_sub[j, k])
            if j == 0:
                ax.set_title('Hyper')
            this_members = [
                v2.index[v2["median_delta_%s" % r] > 0] for r in esc_ref_names
            ]
            vd = venn.venn_diagram(
Example 24
    c5_gmt = gsea.read_gmt_file(msigdb_c5_fn)

    keep_pathways = c5_gmt.keys()

    res = collections.OrderedDict()
    res_full = collections.OrderedDict()
    for pid in pids:
        for c in comparison_names:
            fn = os.path.join(indir, "%s%s.csv" % (pid, c))
            this = pd.read_csv(fn, sep='\t', header=0, index_col=0, usecols=[0, 3, 5, 7])
            this.columns = ['n_gene', 'nes', 'fdr']
            this = this.reindex(keep_pathways).dropna(how='all')
            res_full["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha_relevant]
            res["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha]

    pathways_sign = sorted(setops.reduce_union(*[t.index for t in res.values()]))
    pathways_rele = sorted(setops.reduce_union(*[t.index for t in res_full.values()]))

    excel.pandas_to_excel(res, os.path.join(outdir, "gsea_results_significant_by_patient.xlsx"))

    # use this list to export a second wideform Excel file with the top list of pathways
    for_export = pd.DataFrame(index=pathways_sign, columns=['n_gene'])
    nes_columns = []
    fdr_columns = []
    for k, v in res.items():
        for_export.loc[v.index, 'n_gene'] = v.n_gene
        this_yn = pd.Series('N', index=pathways_sign)
        this_yn.loc[v.index] = 'Y'
        for_export.insert(
            for_export.shape[1],
            k,
Example 25
    with open(fn, 'wb') as f:
        pickle.dump(de_res_full_s1, f)

    logger.info("Saved S1 DE results to %s", fn)

# extract only significant DE genes
de_res_s1 = dict([(k, v.loc[v.FDR < de_params['fdr']])
                  for k, v in de_res_full_s1.items()])

# generate wide-form lists and save to Excel file
de_by_member = [de_res_s1[pid].index for pid in pids]
venn_set, venn_ct = setops.venn_from_arrays(*de_by_member)

# add null set manually from full DE results
de_genes_all = setops.reduce_union(*venn_set.values())
k_null = ''.join(['0'] * len(pids))
venn_set[k_null] = list(de_res_full_s1[pids[0]].index.difference(de_genes_all))
venn_ct[k_null] = len(venn_set[k_null])

de_data = setops.venn_set_to_wide_dataframe(de_res_s1,
                                            venn_set,
                                            pids,
                                            full_data=de_res_full_s1,
                                            cols_to_include=['logFC', 'FDR'],
                                            consistency_check_col='logFC',
                                            consistency_check_method='sign')
# add gene symbols back in
general.add_gene_symbols_to_ensembl_data(de_data)
de_data.to_excel(os.path.join(outdir, 'full_de.xlsx'))
Example 26
    dmrs_classified = {}
    dedmr_results = {}
    both_genes = {}

    for pid in pids:
        fn = os.path.join(dmr_indir, "iPSC%s_classified_dmrs.csv" % pid)
        if os.path.exists(fn):
            this_dmr = pd.read_csv(fn, header=0, index_col=0)
            this_dmr.loc[:, 'genes'] = this_dmr.genes.apply(make_tuple)
            dmrs_classified[pid] = this_dmr
            if "iPSC_%s_ours" % pid in ipsc_esc_fb:
                this_de_res = ipsc_esc_fb["iPSC_%s_ours" % pid]

                de_genes = this_de_res.loc[:, 'Gene Symbol'].dropna()
                dmr_genes = this_dmr.loc[:, 'genes'].values
                dmr_genes = setops.reduce_union(
                    *dmr_genes) if len(dmr_genes) else []
                both_genes[pid] = set(de_genes).intersection(dmr_genes)

                if len(both_genes[pid]):
                    # DE
                    the_de_res = this_de_res.loc[
                        this_de_res['Gene Symbol'].isin(both_genes[pid])]
                    the_de_res = the_de_res.loc[:, ~the_de_res.columns.str.
                                                contains('Direction')]
                    the_de_res.set_index(
                        'Gene Symbol', inplace=True
                    )  # TODO: may break if there are duplicates?

                    # DMR
                    the_dmr_res = this_dmr.loc[this_dmr.genes.astype(
                        str).str.contains('|'.join(both_genes[pid]))]
Example 27

                ','.join(t) if hasattr(t, '__iter__') else ''
                for t in to_add.UCSC_RefGene_Group
            ])
            new_dat[k] = df
        excel.pandas_to_excel(
            new_dat,
            os.path.join(outdir, fn.replace('.xlsx', '.annotated.xlsx')))

    dmp_fn = os.path.join(indir, 'dmps_3021_swan.xlsx')
    dmps = pd.read_excel(dmp_fn, header=0, index_col=0, sheet_name=None)

    # combine all DMPs into a single wideform
    cols = reduce(
        lambda x, y: x + y,
        [['%s' % t, '%s_logFC' % t, '%s_FDR' % t] for t in dmps])
    all_probes = setops.reduce_union(
        *[v.loc[v['adj.P.Val'] < 0.05].index for v in dmps.values()])
    all_probes = all_probes.intersection(anno.index)

    dmps_all = pd.DataFrame(index=all_probes,
                            columns=['CHR', 'coord', 'genes'] + cols)
    dmps_all.loc[:, 'CHR'] = anno.loc[dmps_all.index, 'CHR']
    dmps_all.loc[:, 'coord'] = anno.loc[dmps_all.index, 'MAPINFO']
    dmps_all.loc[:, 'genes'] = anno.loc[dmps_all.index, 'UCSC_RefGene_Name']
    dmps_all.loc[:, dmps.keys()] = False

    for k, v in dmps.items():
        this = v.loc[v['adj.P.Val'] < 0.05]
        this = this.loc[this.index.intersection(all_probes)]
        dmps_all.loc[this.index, k] = True
        dmps_all.loc[this.index, "%s_logFC" % k] = this['logFC']
        dmps_all.loc[this.index, "%s_FDR" % k] = this['adj.P.Val']
Example 28
    def all_comparison_groups(self):
        if self.dmr_comparison_groups is None:
            raise ValueError("Must first call set_dmr_res.")
        return sorted(
            setops.reduce_union(
                *(t.keys() for t in self.dmr_comparison_groups.values())))
Example 29
    def plot_legend(self, figsize=None):
        """
        Generate a figure showing the interpretation of the various colours / markers
        :return:
        """
        the_fig_kws = dict(self.fig_kws)
        if self.dmr_comparison_groups is None:
            if figsize is None:
                height = min(2., self.n_comparison_groups / 3.)
                figsize = (4., height)
            the_fig_kws['figsize'] = figsize
            fig = plt.figure(**the_fig_kws)
            # no legend
            gs = plt.GridSpec(nrows=2, ncols=1)
            dm_ax = fig.add_subplot(gs[0])
            de_ax = fig.add_subplot(gs[1])
            leg_ax = None
        else:
            figsize = (5.5, 2.)
            the_fig_kws['figsize'] = figsize
            fig = plt.figure(**the_fig_kws)
            gs = plt.GridSpec(nrows=2, ncols=2, width_ratios=[5, 1])
            dm_ax = fig.add_subplot(gs[0, 0])
            de_ax = fig.add_subplot(gs[1, 0])
            leg_ax = fig.add_subplot(gs[:, 1], frameon=False)
            leg_ax.tick_params(labelcolor='none',
                               top='off',
                               bottom='off',
                               left='off',
                               right='off')
            leg_ax.grid(False)

        de_vmin = self.de_vmin or -5
        de_vmax = self.de_vmax or 5
        dm_vmin = self.dm_vmin or -8
        dm_vmax = self.dm_vmax or 8

        for_heatmap = {
            'de': {
                'vmin': de_vmin,
                'vmax': de_vmax,
                'cmap': self.de_direction_colour,
                'ax': de_ax,
                'label': "DE log2(fold change)",
            },
            'dm': {
                'vmin': dm_vmin,
                'vmax': dm_vmax,
                'cmap': self.dm_direction_colour,
                'ax': dm_ax,
                'label': r"DM median $\Delta$M",
            },
        }

        heatmaps = {}
        for k, d in for_heatmap.items():
            if isinstance(d['cmap'], colors.LinearSegmentedColormap):
                the_cmap = d['cmap']
            else:
                the_cmap = colors.LinearSegmentedColormap.from_list(
                    k,
                    [d['cmap'](t) for t in np.linspace(d['vmin'], d['vmax'], 256)],
                    N=256
                )
            heatmaps[k] = d['ax'].pcolor(
                [np.linspace(d['vmin'], d['vmax'], 257)] * 2,
                [np.zeros(257), np.ones(257)],
                [np.linspace(d['vmin'], d['vmax'], 257)] * 2,
                cmap=the_cmap)
            d['ax'].yaxis.set_ticks([])
            d['ax'].set_xlabel(d['label'], fontsize=14)

        # custom legend (if we have the groups needed to plot it)
        leg = None
        hleg = None
        if self.dmr_comparison_groups is not None:
            all_groups = sorted(
                setops.reduce_union(
                    *(t.keys() for t in self.dmr_comparison_groups.values())))
            type_attrs = {
                'class': 'line',
                'linestyle': 'none',
                'markeredgecolor': 'k',
                'markeredgewidth': 1.,
                'markerfacecolor': 'none',
                'markersize': 20
            }

            leg_dict = {}
            for nm in all_groups:
                leg_dict[nm] = dict(type_attrs)
                leg_dict[nm]['markerfacecolor'] = self.colours.get(nm)
                leg_dict[nm]['marker'] = self.markers.get(nm)
                leg_dict[nm]['alpha'] = self.alpha.get(nm)
                leg_dict[nm]['markersize'] = self.size.get(nm)
            leg = common.add_custom_legend(leg_ax,
                                           leg_dict,
                                           loc='center',
                                           fontsize=14)
            hleg = leg_ax.get_legend()
            hleg.set_frame_on(False)

        gs.update(bottom=0.3,
                  top=0.98,
                  left=0.04,
                  right=0.95,
                  wspace=0.05,
                  hspace=2.)

        return {
            'fig': fig,
            'gs': gs,
            'legend_objects': leg,
            'legend': hleg,
            'heatmaps': heatmaps,
            'dm_ax': dm_ax,
            'de_ax': de_ax,
            'leg_ax': leg_ax
        }
Example 30
    ipa_signatures = ipa.load_supported_signatures_from_raw(
        IPA_PATHWAY_DIR,
        "de_s2_{0}_{1}.txt", [pids, comparisons],
        pathways=ipa_res.index)

    cy_obj = cyto.CytoscapeSession()
    nx_graphs = {}

    # one network per patient:
    for pid in pids:
        this_ipa = [
            all_ipa[(pid,
                     c)].loc[all_ipa[(pid, c)]['-logp'] >= log_alpha_strict]
            for c in comparisons
        ]
        all_pathways = setops.reduce_union(*[t.index for t in this_ipa])

        p_to_g = {}
        for p in all_pathways:
            p_to_g[p] = setops.reduce_union(*[
                t.loc[p, 'genes'].split(',') if p in t.index else []
                for t in this_ipa
            ])

        # to get connectivity, we need to create the complementary dictionary (indexed by genes)
        g_to_p = {}
        for p in all_pathways:
            for g in p_to_g[p]:
                g_to_p.setdefault(g, []).append(p)

        # we're going to use passthrough mapping to customise the node colour