def load_and_prepare_data(indir, file_patt, comparisons, pids=consts.PIDS, alpha=0.005, alpha_relevant=0.05, outdir=None):
    """
    Load IPA pathway data from the raw exported files, preparing several representations.
    If requested, save some of these to the specified output directory.
    :param indir:
    :param file_patt:
    :param comparisons:
    :param pids:
    :param alpha:
    :param alpha_relevant:
    :param outdir:
    :return:
    """
    plogalpha = -np.log10(alpha)
    plogalpha_relevant = -np.log10(alpha_relevant)
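    # work on the -log10(p) scale throughout, so larger values mean more significant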

    res = ipa.load_raw_reports(
        indir,
        file_patt,
        pids,
        comparisons
    )
    # flatten the (pid, comparison) keys and drop pathways below the 'relevant' threshold
    for k in list(res.keys()):
        v = res.pop(k)
        rele_ix = v.index[v['-logp'] >= plogalpha_relevant]
        res['_'.join(k)] = v.loc[rele_ix]

    # wideform version of this (i.e. 30 blocks)
    res_wide = ipa_results_to_wideform(res, plogalpha)

    # get a list of significant pathways (in at least one comparison)
    pathways_significant = set()
    for k, v in res.items():
        pathways_significant.update(v.index[v['-logp'] > plogalpha])

    if outdir is not None:
        # export full wideform results
        res_wide.to_excel(os.path.join(outdir, "ipa_results_full.xlsx"))

        # export significant results to an Excel file with separate tabs
        res_sign = dict([
            (k, v.loc[v['-logp'] > plogalpha]) for k, v in res.items()
        ])
        excel.pandas_to_excel(res_sign, os.path.join(outdir, "ipa_results_significant_separated.xlsx"))

        # export wideform, reduced to include only significant pathways
        res_wide.loc[sorted(pathways_significant)].to_excel(
            os.path.join(outdir, "ipa_results_significant.xlsx")
        )

    return res, res_wide, pathways_significant
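
# Hypothetical sketch of ipa_results_to_wideform() as called above. The real helper is
# imported from elsewhere and may differ; this only illustrates the intended layout
# (one block of columns per comparison, aligned on the union of all pathways) and
# assumes the pandas / setops imports used elsewhere in this script.
def _ipa_results_to_wideform_sketch(res, plogalpha):
    all_pathways = sorted(setops.reduce_union(*[v.index for v in res.values()]))
    blocks = []
    for k, v in res.items():
        this = v.reindex(all_pathways)
        this.columns = ["%s_%s" % (k, c) for c in this.columns]
        # flag pathways passing the significance threshold in this comparison
        this.insert(0, "%s_significant" % k, (this["%s_-logp" % k] >= plogalpha).astype(int))
        blocks.append(this)
    return pd.concat(blocks, axis=1)
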
    print "Found %d relevant input (DMP) files: %s" % (len(dmp_fns),
                                                       ', '.join(dmp_fns))
    outdir = output.unique_output_dir("mb_dmps")
    res = {}

    for fn in dmp_fns:
        base = os.path.splitext(os.path.basename(fn))[0]
        res[base] = {}
        dat = pd.read_excel(fn, sheet_name=None)
        for cmp, df in dat.items():
            res[base][cmp] = annot_one(df, anno)

        # save to Excel
        out_fn = os.path.join(outdir, os.path.basename(fn))
        excel.pandas_to_excel(res[base], out_fn, write_index=False)

    # 2.1 Look for common DMPs

    # 'Refold' the previous results dictionary
    res_flat = dictionary.nested_dict_to_flat(res)
    res_by_cmp = dict([(k[::-1], v) for k, v in res_flat.items()])
    res_by_cmp = dictionary.flat_dict_to_nested(res_by_cmp)
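    # illustration of the 'refold' (toy example, not part of the pipeline):
    #   nested_dict_to_flat({'fileA': {'cmp1': df}})  ->  {('fileA', 'cmp1'): df}
    #   reversing each key tuple and re-nesting       ->  {'cmp1': {'fileA': df}}
    # i.e. the same DataFrames regrouped by comparison rather than by input file, so the
    # intersection below runs across input files within each comparison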
    common_dmps = {}

    for cmp, d in res_by_cmp.items():
        common_dmps[cmp] = setops.reduce_intersection(
            *[t.probe_id for t in d.values()])

    # 2.2 Look for common genes
            the_counts = pd.Series(the_counts)
            the_counts_full = pd.Series(the_counts_full)

            probe_counts[p][typ] = the_counts
            probe_counts_full[p][typ] = the_counts_full

            probe_dist[p][typ] = the_counts.divide(the_counts.sum())
            probe_dist_full[p][typ] = the_counts_full.divide(
                the_counts_full.sum())

            probe_dist_rel_bg[p][typ] = probe_dist[p][typ] / bg_dist
            probe_dist_full_rel_bg[p][typ] = probe_dist_full[p][typ] / bg_dist
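            # dividing by bg_dist gives an enrichment ratio: values > 1 mean that CpG status
            # is over-represented among these DM probes relative to the background, < 1 depleted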

    excel.pandas_to_excel(
        to_xls, os.path.join(outdir,
                             "dmr_motif_analysis_patient_specific.xlsx"))

    # motif_count_breaks = [0, 2, 5, 10, 20, 30, 100]
    # motif_count_breaks_colnames = interval_names_from_bin_edges(motif_count_breaks, add_infty=True)
    # motif_count_breaks_nif = [0, 2, 4, 6, 8, 10]
    # motif_count_breaks_nif_colnames = interval_names_from_bin_edges(motif_count_breaks_nif, add_infty=True)
    #
    # motif_counts_binned = {}
    # motif_counts_nif_binned = {}
    # for p in pids:
    #     motif_counts_binned[p] = {}
    #     motif_counts_nif_binned[p] = {}
    #     for typ in ['hypo', 'hyper']:
    #         motif_counts_binned[p][typ] = pd.Series(index=cpg_statuses)
    #         motif_counts_nif_binned[p][typ] = pd.Series(index=cpg_statuses)
    p_res = {}
    ss_res = {}

    for typ in ('cell_culture', 'ffpe'):
        for src in ('star', 'salmon', 'star/cufflinks'):
            fn = os.path.join(outdir, "%s_%s.gct" % (SRC_MAP[src], typ))
            the_dir, the_stem = os.path.split(fn)
            outfn = os.path.join(the_dir, "p_result_%s.txt" % the_stem)
            if not os.path.exists(outfn):
                continue
            this_pres = load_pvalue_results(outfn)
            p_res.setdefault(typ, {})[SRC_MAP[src]] = this_pres
            ss_res.setdefault(typ, {})[SRC_MAP[src]] = simplicity_score(this_pres)
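            # builds nested dicts keyed [type][source label], e.g. p_res['ffpe'][SRC_MAP['star']]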

    # export
    # easiest way is to flatten the dictionary, then combine
    export_p = dictionary.nested_dict_to_flat(p_res)
    export_ss = dictionary.nested_dict_to_flat(ss_res)
    to_export = {}

    for k in export_p:
        the_key = '_'.join(k)
        this = export_p[k].copy()
        this.insert(this.shape[1], 'Simplicity score', export_ss[k])
        if k[0] == 'ffpe':
            this.insert(this.shape[1], 'Patient ID', nh_id_to_patient_id(this.index))
        to_export[the_key] = this
    excel.pandas_to_excel(to_export, os.path.join(outdir, "wang_results.xlsx"))

        jobs[lbl] = pool.apply_async(run_one_de,
                                     args=(dat, groups, cmp),
                                     kwds=de_params)
        # res[lbl] = run_one_de(dat, groups, cmp, **de_params)
        # print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum()

    for lbl in jobs:
        res[lbl] = jobs[lbl].get(1e6)
        print lbl
        print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum()

    for k, v in res.items():
        general.add_gene_symbols_to_ensembl_data(v, tax_id=10090)
        res_sign[k] = v.loc[v.FDR <= de_params['fdr']]

    excel.pandas_to_excel(res, os.path.join(outdir,
                                            "mouse_GBM_NSC_DE_all.xlsx"))
    excel.pandas_to_excel(
        res_sign, os.path.join(outdir, "mouse_GBM_NSC_DE_significant.xlsx"))

    # finally, re-run with a lfc of zero
    # disabled for now to speed things up
    if False:
        de_params['lfc'] = 0

        jobs2 = {}
        print "No logFC requirement"

        for cmp in comparisons:
            lbl = "%s_vs_%s" % cmp
            jobs2[lbl] = pool.apply_async(run_one_de,
                                          args=(dat, groups, cmp),
                                          kwds=de_params)
    # for separated data, combine single and paired PC for maximum efficiency
    for first_dim in dims:
        dims_pair = (first_dim, first_dim + 1)
        ix_all = setops.reduce_union(*[
            selected_by_quantile_separate_logfc[k].index
            for k in [(first_dim, ), dims_pair]
        ])
        this_df = pd.DataFrame(index=ix_all)
        for k in [(first_dim, ), dims_pair]:
            tt = selected_by_quantile_separate_logfc[k].copy()
            tt = tt.loc[:, tt.columns.str.contains('logFC')]
            tt.columns = tt.columns.str.replace(
                '_logFC', '_%s_logFC' % '-'.join([str(t + 1) for t in k]))
            this_df = pd.concat((this_df, tt), axis=1, sort=True)
        this_df.to_excel(
            os.path.join(outdir,
                         "for_ipa_separate_logfc_pc%d.xlsx" % (first_dim + 1)))

    # combine with DE results and export to table
    for_export = {}
    for first_dim in dims:
        dims_pair = (first_dim, first_dim + 1)
        for dim in [(first_dim, ), dims_pair]:
            the_key = "PC_%s" % '-'.join([str(t + 1) for t in dim])
            this_feat = svd_res['feat_dat'][[i + 1 for i in dim]]
            this_ens = get_topmost_quantile_by_loading(
                this_feat, quantile).intersection(de_res.index)
            for_export[the_key] = de_res.loc[this_ens]
    excel.pandas_to_excel(
        for_export,
        os.path.join(outdir, "full_de_syngeneic_only_filtered_by_biplot.xlsx"))
        for_plot = {}
        for pid in groups[grp]:
            for_plot[pid] = de_dmr_de_logfc_tss[grp][[pid]].dropna()
            for_plot[pid].columns = ['logFC']
        plt_dict = same_de.bar_plot(for_plot, keys=groups[grp], figsize=(len(groups[grp]) - .5, 4.5))
        plt_dict['fig'].savefig(os.path.join(outdir, "de_direction_by_group_%s_tss.png" % grp.lower()), dpi=200)

    # export for publication
    de_dmr_dmr_all_for_export = {}
    for grp, x in de_dmr_dmr_median_delta_all.items():
        this_ = x.dropna(axis=1, how="all").reset_index().rename({"index": "dmr_id"}, axis=1).copy()
        this_["consistent"] = this_["consistent"].astype(int)
        de_dmr_dmr_all_for_export[grp] = this_
    fn = os.path.join(outdir, "dmr_from_group_spec_de_dmrs_all.xlsx")
    excel.pandas_to_excel(de_dmr_dmr_all_for_export, fn, write_index=False)

    # Venn diagrams of DE
    fig = plt.figure(figsize=(5., 3.3))
    ax = fig.add_subplot(111)
    plot_venn_de_directions(de_dmr_de_logfc_all, set_colours_dict, ax=ax)
    fig.savefig(os.path.join(outdir, "de_from_group_spec_de_dmrs_all.png"), dpi=200)

    fig = plt.figure(figsize=(5., 3.3))
    ax = fig.add_subplot(111)
    plot_venn_de_directions(de_dmr_de_logfc_tss, set_colours_dict, ax=ax)
    fig.savefig(os.path.join(outdir, "de_from_group_spec_de_dmrs_tss.png"), dpi=200)

    # export for publication
    de_dmr_de_all_for_export = {}
    for grp, x in de_dmr_de_logfc_all.items():
# ---- Example #8 ----
    for_export.to_excel(
        os.path.join(outdir, 'consistently_in_pair_only_across_all_refs.xlsx'))

    # correct the reference PO lists, take the intersection, then export to a file
    po_de_export = {}
    for pid in pids:
        this_row = pair_only.loc[pid, external_ref_labels]
        this_genes_pre = reduce(intersecter, this_row)
        this_genes = sorted(this_genes_pre.difference(po_specific_to_all_refs))
        print "PID %s. Subtracted %d correction genes from the %d PO intersection genes to leave %d PO genes" % (
            pid, len(po_specific_to_all_refs), len(this_genes_pre),
            len(this_genes))
        po_de_export[pid] = de_res[(pid, pid)].loc[this_genes]
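    # (intersecter is not defined in this excerpt; presumably a pairwise set intersection such
    # as lambda x, y: set(x) & set(y), so reduce(intersecter, this_row) keeps only the genes
    # that are pair-only in every external reference comparison)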

    excel.pandas_to_excel(
        po_de_export, os.path.join(outdir,
                                   'pair_only_de_lists_corrected.xlsx'))

    # export with a different layout, analogous to trial 2
    venn_set, venn_ct = setops.venn_from_arrays(
        *[po_de_export[pid].index for pid in pids])
    po_combination_export = differential_expression.venn_set_to_dataframe(
        po_de_export, venn_set, pids)
    po_combination_export.to_excel(
        os.path.join(outdir, 'pair_only_de_lists_combined_corrected.xlsx'))
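    # setops.venn_from_arrays is assumed to enumerate every exclusive Venn region across the
    # patient gene lists, keyed by a binary membership string (one digit per patient);
    # venn_set_to_dataframe then presumably flattens this into a single table with membership
    # indicators per gene, analogous to the trial 2 layout mentioned above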

    # plot: how many DE genes are present in each reference comparison?

    fig, axs = plt.subplots(nrows=2, ncols=3)
    for pid in pids:
        if pid in subgroups['RTK I']:
        this_ix = (df.p_bonferroni <= alpha) & (df.NS.isin(namespaces)) & (df.enrichment == 'e')
        df_filt = df.loc[this_ix]
        # include bottom-most nodes only
        ix = []
        for go_id in df_filt.index:
            ix.append(len(go_obj.obo[go_id].get_all_children()) == 0)
        goea_res_filt[k] = df_filt.loc[ix]

    # minor manipulation of results, then save to a single Excel file
    # do this for full results and filtered

    all_res = reannotate(goea_res)
    all_res_filt = reannotate(goea_res_filt)

    excel.pandas_to_excel(all_res,
                          os.path.join(outdir, "goea_de_all_results.xlsx"))
    excel.pandas_to_excel(
        all_res_filt, os.path.join(outdir,
                                   "goea_de_all_results_filtered.xlsx"))

    # create (mega)heatmap of all results
    tmp = pd.concat([v.name for v in all_res_filt.values()])
    tmp = tmp.loc[~tmp.duplicated()]

    for_plot = pd.DataFrame(index=tmp.values)

    for pid in pids[::-1]:
        this = all_res[pid].reindex(tmp.index)
        this.index = tmp.values
        for_plot.insert(0, pid, -np.log10(this['p_bonferroni']))
# ---- Example #10 ----
    to_export = collections.OrderedDict()
    to_export['Explanation'] = pd.Series(
        collections.OrderedDict([
            ('DE ESC line 1', 'ENCODE H1 ESC (%d replicates)' %
             the_groups.value_counts()['ESC_encode']),
            ('DE ESC line 2', 'Cacchiarelli et al. (%d lines)' %
             the_groups.value_counts()['ESC_cacchiarelli']),
            ('DMR ESC line 1', 'ENCODE H7 ESC (no replicates)'),
            ('DMR ESC line 2', 'Weltner et al. H9 (3 replicates)'),
        ], ),
        name='All comparisons are stated as iPSC - ESC.')
    to_export.update(
        collections.OrderedDict([(pid, dedmr_results[pid]) for pid in pids
                                 if pid in dedmr_results]))

    excel.pandas_to_excel(to_export,
                          os.path.join(outdir, "DE_DMR_results_combined.xlsx"))

    def aggregate_dm_results_by_gene(dmr_res, genes):
        delta_m = {}
        fdr = {}

        for g in genes:
            this_ix = dmr_res.index[dmr_res.genes.apply(lambda x: g in x)]
            delta_m[g] = dmr_res.loc[
                this_ix,
                dmr_res.columns.str.contains('median_delta')].mean(axis=0)
            fdr[g] = dmr_res.loc[this_ix,
                                 dmr_res.columns.str.contains('padj')].mean(
                                     axis=0)

        delta_m = pd.DataFrame(delta_m).transpose().sort_index()
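        # (assumed completion of this truncated helper: mirror the delta_m handling for the FDR values)
        fdr = pd.DataFrame(fdr).transpose().sort_index()
        return delta_m, fdr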
        others = [
            de_res_sign[("GBM%s" % gic_pid, "iNSC%s" % p)]
            for p in pd.Index(pids).drop(insc_pid)
        ]
        # 'syn only' index
        this_so_ix = this_syn.index.difference(
            setops.reduce_union(*[t.index for t in others]))
        syn_only[("GBM%s" % gic_pid,
                  "iNSC%s" % insc_pid)] = this_syn.loc[this_so_ix]
        n_syn_only.loc[gic_pid, insc_pid] = this_so_ix.size

    true_syn_only = dict([(p, syn_only[("GBM%s" % p, "iNSC%s" % p)])
                          for p in pids])

    # export to list
    excel.pandas_to_excel(true_syn_only,
                          os.path.join(outdir, "de_only_in_syngeneic.xlsx"))

    # export for IPA
    # we're going to run the true syngeneic (10) against non-syngeneic chosen to give the greatest number of DE genes
    # in practice, this means fixing the identity of the iNSC
    selected_insc = ['018', '030', '054', '052']

    cols = []
    common_probes = set()
    for pid in pids:
        cols.append("GBM%siNSC%s_logFC" % (pid, pid))
        common_probes.update(syn_only[("GBM%s" % pid, "iNSC%s" % pid)].index)
    for p1 in selected_insc:
        for p2 in pids:
            k = "GBM%siNSC%s_logFC" % (p1, p2)
            if k not in cols:
# ---- Example #12 ----
    ## genes that are pair-only in every possible ref comparison
    po_each = [
        sorted(
            reduce(intersecter,
                   pair_only.loc[pid, ~pair_only.columns.str.contains(pid)]))
        for pid in pids
    ]
    po_each = pd.Series(po_each, index=pids)

    # export gene lists here
    po_export = {}
    for pid in pids:
        po_export["GBM%s_pair_only" %
                  pid] = de_res[(pid, pid)].loc[po_each.loc[pid]]
    excel.pandas_to_excel(
        po_export, os.path.join(outdir, "pair_only_all_consistent.xlsx"))
    subdir = os.path.join(outdir, "ipa_all_consistent")
    if not os.path.isdir(subdir):
        os.makedirs(subdir)
    ipa.results_to_ipa_format(po_export, outdir=subdir)

    # now relax this requirement: which genes would be included if we require their inclusion in N of the cells
    # (rather than all)?
    possible_counts = range(1, pair_only.shape[1])
    po_each_threshold = pd.DataFrame(index=pids, columns=possible_counts)
    for pid in pids:
        this_counter = collections.Counter()
        # iterate over each column
        # we can include the empty diagonal cell, since it will not affect the counting
        for col in pair_only.columns:
            for e in pair_only.loc[pid, col]:
    res = collections.OrderedDict()
    res_full = collections.OrderedDict()
    for pid in pids:
        for c in comparison_names:
            fn = os.path.join(indir, "%s%s.csv" % (pid, c))
            this = pd.read_csv(fn, sep='\t', header=0, index_col=0, usecols=[0, 3, 5, 7])
            this.columns = ['n_gene', 'nes', 'fdr']
            this = this.reindex(keep_pathways).dropna(how='all')
            res_full["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha_relevant]
            res["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha]

    pathways_sign = sorted(setops.reduce_union(*[t.index for t in res.values()]))
    pathways_rele = sorted(setops.reduce_union(*[t.index for t in res_full.values()]))

    excel.pandas_to_excel(res, os.path.join(outdir, "gsea_results_significant_by_patient.xlsx"))

    # use this list to export a second wideform Excel file with the top list of pathways
    for_export = pd.DataFrame(index=pathways_sign, columns=['n_gene'])
    nes_columns = []
    fdr_columns = []
    for k, v in res.items():
        for_export.loc[v.index, 'n_gene'] = v.n_gene
        this_yn = pd.Series('N', index=pathways_sign)
        this_yn.loc[v.index] = 'Y'
        for_export.insert(
            for_export.shape[1],
            k,
            this_yn
        )
        for_export.insert(
            # to_add.columns = ['chrom', 'coord', 'genes', 'gene_relation']
            df.insert(df.shape[1], 'chrom', to_add.CHR)
            df.insert(df.shape[1], 'coord',
                      to_add.MAPINFO.fillna(-1).astype(int))
            df.insert(df.shape[1], 'gene', [
                ','.join(t) if hasattr(t, '__iter__') else ''
                for t in to_add.UCSC_RefGene_Name
            ])
            df.insert(df.shape[1], 'gene_relation', [
                ','.join(t) if hasattr(t, '__iter__') else ''
                for t in to_add.UCSC_RefGene_Group
            ])
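            # UCSC_RefGene_Name / UCSC_RefGene_Group presumably hold a list of gene names /
            # relations per probe; the hasattr('__iter__') check joins those lists with commas
            # and maps scalar (missing) entries to an empty string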
            new_dat[k] = df
        excel.pandas_to_excel(
            new_dat,
            os.path.join(outdir, fn.replace('.xlsx', '.annotated.xlsx')))

    dmp_fn = os.path.join(indir, 'dmps_3021_swan.xlsx')
    dmps = pd.read_excel(dmp_fn, header=0, index_col=0, sheet_name=None)

    # combine all DMPs into a single wideform
    cols = reduce(
        lambda x, y: x + y,
        [['%s' % t, '%s_logFC' % t, '%s_FDR' % t] for t in dmps])
    all_probes = setops.reduce_union(
        *[v.loc[v['adj.P.Val'] < 0.05].index for v in dmps.values()])
    all_probes = all_probes.intersection(anno.index)

    dmps_all = pd.DataFrame(index=all_probes,
                            columns=['CHR', 'coord', 'genes'] + cols)

    # run clustering to order the rows/cols nicely
    rl = hc.linkage(co.fillna(0.).transpose(),
                    method='average',
                    metric='euclidean')
    row_ix = hc.leaves_list(rl)
    cl = hc.linkage(co.fillna(0.), method='average', metric='euclidean')
    col_ix = hc.leaves_list(cl)
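    # note: rl is computed on the transposed matrix, so its leaf order arranges the columns of
    # co, while cl arranges the rows; the names row_ix / col_ix are swapped relative to the axes
    # they index below, but the reordering itself is applied consistently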

    # reorder the data based on the clustering
    co = co.iloc[col_ix, row_ix]
    co_p = co_p.iloc[col_ix, row_ix]

    excel.pandas_to_excel({
        corr_metric: co,
        'pval': co_p
    }, os.path.join(outdir, "correlation_%s_syngeneic.xlsx" % corr_metric))

    # quantify the number of patients involved in each of the pathways for follow up
    follow_up_pathways = quantify_follow_up_pathways(ipa_res,
                                                     co_p,
                                                     comparisons,
                                                     pids,
                                                     alpha=alpha,
                                                     alpha_strict=alpha_strict)

    # for plotting, we only need an indicator of which values are significant
    plot_dict = plot_heatmap_with_quantification(
        co,
        co_p,
        follow_up_pathways,
        po_counts = pair_only.applymap(len)
        ro_counts = ref_only.applymap(len)


        # What is present in X vs Y_i that isn't in X vs any other Y?
        po_diff = pd.DataFrame(index=pair_only.index, columns=pair_only.columns)
        for pid in pids:
            for pid2 in pair_only.columns:
                the_ref = pair_only.loc[pid, pid2]
                all_else = pair_only.loc[pid, pair_only.columns != pid2]
                union_all_else = reduce(set.union, all_else, set())
                po_diff.loc[pid, pid2] = sorted(set(the_ref).difference(union_all_else))

        # find DE genes that are always PO when a (non-matching) iNSC reference is used, but NOT when an external reference
# ---- Example #17 ----
        subgroup_ind,
        subgroup_set_colours,
        venn_set=venn_set,
        min_size=1,
        n_plot=30,
    )
    ups['axes']['set_size'].set_xlabel("Number of pathways in single patient")
    ups['axes']['main'].set_ylabel("Number of pathways in set")
    ups['figure'].savefig(os.path.join(outdir, "upset_pathways.png"), dpi=200)

    # export
    s1_specific = {}
    specific_sets = setops.specific_sets(pids)
    for p, s in specific_sets.items():
        s1_specific[p] = s1_reports_all[p].loc[venn_set[s]]
    excel.pandas_to_excel(s1_specific,
                          os.path.join(outdir, "s1_patient_specific.xlsx"))

    # S2 syngeneic-only

    s2_syngeneic = {}
    for p in pids:
        in_ours = s1_reports_all[p].index
        in_refs = setops.reduce_union(
            *[s2_reports_all["%s_%s" % (p, r)].index for r in refs])
        in_so = in_ours.difference(in_refs)
        tmp = s1_reports_all[p].loc[in_so]
        s2_syngeneic[p] = tmp.loc[tmp.nes.abs().sort_values(
            ascending=False).index]

    fig = plt.figure()
    ax = fig.add_subplot(111)