Beispiel #1
0
def partial_subgroup_specific(pids, subgroup_ind):
    """
    Find, for every subgroup, the set identifiers that involve at least two PIDs, all of which belong to that
    subgroup (i.e. no PID outside the subgroup is flagged).
    :param pids: List of PIDs. Only its length is used here, to enumerate the candidate identifiers.
    :param subgroup_ind: Dict, keyed by subgroup name. Values are np.array of booleans (i.e. boolean indexers). Order
    of indexers must match pids.
    :return: Dict, keyed by subgroup name. Each value is a list of the identifiers yielded by
    setops.binary_combinations_sum_gte that passed the filter, in the order they were yielded.
    """
    # Every candidate identifier flagging two or more of the PIDs. Materialised once because it is
    # re-scanned for each subgroup.
    all_patterns = list(setops.binary_combinations_sum_gte(len(pids), 2))

    def n_flagged_outside(pattern, members):
        # Split the identifier into one integer flag per PID, then count the flags set at positions
        # that the boolean indexer marks as NOT belonging to the subgroup.
        flags = np.array(list(pattern)).astype(int)
        return flags[~members].sum()

    result = {}
    for name in subgroup_ind:
        members = subgroup_ind[name]
        kept = []
        for pattern in all_patterns:
            # keep only identifiers with no flagged PID outside this subgroup
            if n_flagged_outside(pattern, members) == 0:
                kept.append(pattern)
        result[name] = kept
    return result
    # add the combined DE results for the refs combined
    for pid in pids:
        # complete intersection
        the_idx = sorted(reduce(intersecter, [de_res[(pid, t)].index for t in external_ref_labels]))
        one_cols = de_res[(pid, pid)].columns
        tups = reduce(lambda x, y: x + y, [zip([t] * one_cols.size, one_cols.tolist()) for t in external_ref_labels])
        the_cols = pd.MultiIndex.from_tuples(tups, names=['ref', 'field'])
        the_block = pd.DataFrame(index=the_idx, columns=the_cols)
        for t in external_ref_labels:
            the_block.loc[the_idx, t] = de_res[(pid, t)].loc[the_idx].values
        de_res[(pid, 'ref_intersect')] = the_block

        # intersect 2
        this_venn, _ = setops.venn_from_arrays(*[de_res[(pid, t)].index for t in external_ref_labels])
        the_idx = reduce(unioner, [this_venn[k] for k in setops.binary_combinations_sum_gte(len(external_refs), 2)])
        the_block = pd.DataFrame(index=the_idx, columns=the_cols)
        for t in external_ref_labels:
            try:
                the_block.loc[the_idx, t] = de_res[(pid, t)].loc[the_idx].values
            except KeyError:
                # no matches for this ref - no problem
                pass
        de_res[(pid, 'ref_intersect2')] = the_block

        # union
        the_idx = sorted(reduce(unioner, [de_res[(pid, t)].index for t in external_ref_labels]))
        the_block = pd.DataFrame(index=the_idx, columns=the_cols)
        for t in external_ref_labels:
            try:
                the_block.loc[the_idx, t] = de_res[(pid, t)].loc[the_idx].values
Beispiel #3
0
    for k, v in genesets.items():
        for i, t in enumerate(v):
            if t in manual_gene_name_correction:
                v[i] = manual_gene_name_correction[t]
        g_in = rnaseq_dat.index.intersection(v)
        if set(g_in) != set(v):
            missing = set(v).difference(rnaseq_dat.index)
            logger.warn(
                "%d genes in the %s signature do not match with the data index and will be dropped: %s.",
                len(missing), k, ', '.join(missing))
            genesets[k] = g_in

    # check here whether there is any overlap
    vs, vc = setops.venn_from_arrays(*genesets.values())
    n_overlap = sum(
        [vc[t] for t in setops.binary_combinations_sum_gte(len(genesets), 2)])
    if n_overlap > 0:
        logger.warn(
            "The %d gene signatures used here have %d overlapping genes - please check this is OK.",
            len(genesets), n_overlap)

    # run ssGSEA then Z transform the results
    es = gsva.ssgsea(rnaseq_dat, genesets)
    es_z = z_transform(es, axis=1)

    # export
    for_export = es_z.transpose()
    for_export.insert(for_export.shape[1], 'Verhaak classification',
                      rnaseq_meta.loc[for_export.index, 'expression_subclass'])
    for_export.insert(for_export.shape[1], 'Wang classification',
                      rnaseq_meta.loc[for_export.index, 'wang_classification'])
    for pid in pids:
        po_core_genes[pid] = {}
        the_row = pair_only.loc[pid]
        the_refs = the_row.index.difference([pid])
        # all possible combinations of references
        for ref_selection in itertools.combinations(the_refs, len(pids)):
            this_portion = the_row.loc[list(ref_selection)]
            po_core_genes[pid][ref_selection] = pd.Series(
                index=range(1,
                            len(pids) + 1))
            gl, cts = setops.venn_from_arrays(*this_portion.values)
            for N in range(1, len(pids) + 1):
                po_core_genes[pid][ref_selection].loc[N] = reduce(
                    unioner,
                    (gl[k]
                     for k in setops.binary_combinations_sum_gte(len(pids), N)
                     ))

    # this strangeness is to remind me that there is currently only 1 ref (but that could change)
    n_perm = ncr(len(pids) + len(additional_pids) + 1 - 1, len(pids))

    # run over results and compute
    # 1) similarity score: the number of genes in each list divided by the number in the union over all permutations
    # 2) union of genes over all permutations
    # 3) intersection of genes over all permutations

    similarities = {}
    isct = {}
    unn = {}
    for N in range(1, len(pids) + 1):
        similarities[N] = pd.DataFrame(index=pids, columns=range(n_perm))
Beispiel #5
0
        # progressively filter the gene list based on counts
        the_genes = this_counter.keys()
        for i in possible_counts:
            the_genes = [k for k in this_counter if this_counter[k] >= i]
            po_each_threshold.loc[pid, i] = the_genes

    # ...how many of these are shared between patients?
    # consider all, K -1 and K-2
    K = len(pids)
    for i in possible_counts:
        _, cts = setops.venn_from_arrays(*po_each_threshold.loc[:, i].values)
        this_tally = []

        print "N = %d" % i
        for j in [K, K - 1, K - 2, K - 3]:
            this_ct = sum([cts[k] for k in setops.binary_combinations_sum_gte(K, j)])
            print "%d DMRs shared by >=%d patients" % (this_ct, j)
        # also look at the overlap within the subgroups
        for grp_name, grp_members in subgroups.items():
            # get the group member results
            this_po_each_threshold = po_each_threshold.loc[grp_members]
            _, cts = setops.venn_from_arrays(*this_po_each_threshold.loc[:, i].values)
            the_idx = ''.join(['1'] * len(grp_members))
            print "%d DMRs shared by all patients in subgroup %s" % (cts[the_idx], grp_name)

    # for reference: what do these numbers look like in the Gibco comparison (only)?
    po_gibco_common_counts = pd.Series(index=possible_counts)
    _, cts = setops.venn_from_arrays(*pair_only.loc[:, 'GIBCO'].values)
    for j in possible_counts:
        po_gibco_common_counts.loc[j] = sum([cts[k] for k in setops.binary_combinations_sum_gte(K, j)])
        print "%d DMRs shared by >=%d patients in the pair-only Gibco comparison" % (
Beispiel #6
0
                                    "pair_only_%d_consistent.xlsx" % j))
        subdir = os.path.join(outdir, "ipa_%d_consistent" % j)
        if not os.path.isdir(subdir):
            os.makedirs(subdir)
        ipa.results_to_ipa_format(po_export, outdir=subdir)

    # ...how many of these are shared between patients?
    # consider K to K-3 (inclusive)
    for i in possible_counts:
        _, cts = setops.venn_from_arrays(*po_each_threshold.loc[:, i].values)
        this_tally = []
        K = len(pids)
        print "N = %d" % i
        for j in [K, K - 1, K - 2, K - 3]:
            this_ct = sum(
                [cts[k] for k in setops.binary_combinations_sum_gte(K, j)])
            print "%d genes shared by >=%d patients" % (this_ct, j)

        # also look at the overlap within the subgroups
        for grp_name, grp_members in subgroups.items():
            # get the group member results
            this_po_each_threshold = po_each_threshold.loc[grp_members]
            _, cts = setops.venn_from_arrays(
                *this_po_each_threshold.loc[:, i].values)
            the_idx = ''.join(['1'] * len(grp_members))
            print "%d DE genes shared by all patients in subgroup %s" % (
                cts[the_idx], grp_name)

    # for reference: what do these numbers look like in the Gibco comparison (only)?
    po_gibco_common_counts = pd.Series(index=possible_counts, dtype=int)
    _, cts = setops.venn_from_arrays(*pair_only.loc[:, 'GIBCO'].values)
        j = sg.index(pid)
        the_lists_ref = [de_res[(pid, r)] for r in all_refs]
        the_pair = de_res[(pid, 'Paired')]
        for ref in all_refs:
            the_ref = de_res[(pid, ref)]
            the_sets, _ = setops.venn_from_arrays(the_pair.index,
                                                  the_ref.index)
            pair_only.loc[pid, ref] = the_sets['10']
        venn_sets, cts = setops.venn_from_arrays(*pair_only.loc[pid].values)
        for k in range(2, len(all_refs) + 1):
            for ref in all_refs:
                the_idx = all_refs.index(ref)
                pair_only_core[k].loc[pid, ref] = reduce(
                    lambda x, y: x + y, [
                        venn_sets[t]
                        for t in setops.binary_combinations_sum_gte(
                            len(all_refs), k) if t[the_idx] == '1'
                    ])
        venn.venn3(cts, set_labels=all_refs, ax=axs[i, j])
        axs[i, j].set_title("GBM%s pair only" % pid)
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, 'number_po_de_multiple_references.png'),
                dpi=200)
    fig.savefig(os.path.join(outdir, 'number_po_de_multiple_references.tiff'),
                dpi=200)

    # proportion of each pair only DE count that is shared by all
    for k in range(2, len(all_refs) + 1):
        ax = (pair_only_core[k].applymap(len) / pair_only.applymap(len) *
              100).plot.bar()
        ax.set_xlabel('Patient')
        # ax.set_title('Percentage of pair only DE genes that are in %d / %d reference comparisons' % (k, len(all_refs)))