コード例 #1
0
def pathway_enrichment(genesets, genes, reference, prob=None, callback=None):
    """
    Test every gene set for over-representation in ``genes``.

    For each gene set the overlap with ``genes`` and with ``reference`` is
    computed. Gene sets with a non-empty overlap with ``genes`` receive a
    p-value from ``prob.p_value``; the collected p-values are then passed
    through ``statistics.FDR``.

    :param genesets: Sized iterable of objects exposing a ``genes`` set and
                     a ``gs_id`` attribute.
    :param genes: Sized collection of query genes.
    :param reference: Sized collection of background genes.
    :param prob: Object providing ``p_value(k, N, m, n)``; when ``None`` a
                 ``statistics.Hypergeometric()`` instance is used.
    :param callback: Optional callable invoked after each gene set with the
                     progress as a percentage (float).
    :return: Dictionary mapping ``gs_id`` to a tuple
             ``(overlap_with_genes, corrected_p_value, reference_overlap_size)``.
    """
    if prob is None:
        prob = statistics.Hypergeometric()

    # Parallel lists: one entry per gene set that overlaps the query genes.
    hits = []
    raw_p_values = []

    for index, geneset in enumerate(genesets):
        query_overlap = geneset.genes.intersection(genes)
        reference_overlap = geneset.genes.intersection(reference)
        hit_count = len(query_overlap)
        population_size = len(reference)
        marked_count = len(reference_overlap)
        sample_size = len(genes)

        if hit_count:
            p_value = prob.p_value(
                hit_count, population_size, marked_count, sample_size)
            hits.append((geneset.gs_id, query_overlap, reference_overlap))
            raw_p_values.append(p_value)

        if callback is not None:
            callback(100.0 * index / len(genesets))

    # FDR correction
    corrected = statistics.FDR(raw_p_values)

    enriched = {}
    for (gs_id, query_overlap, reference_overlap), p_adj in zip(hits, corrected):
        enriched[gs_id] = (query_overlap, p_adj, len(reference_overlap))
    return enriched
コード例 #2
0
    def __on_enrichment_done(self, results):
        # type: (Future[Dict[str, tuple]]) -> None
        """
        Consume the finished enrichment future and refresh the widget state.

        Resets the progress bar / blocking / status message, then either
        discards the result (when the ``State.Stale`` flag is set) or stores
        the enriched terms, rebuilds the term tree structure and updates the
        graph, the report output and the committed output.

        :param results: Future whose ``result()`` is a dictionary mapping a
                        term id to a tuple; the element at index 1 of each
                        tuple is the value passed to ``statistics.FDR``.
        """
        self.progressBarFinished(processEvents=False)
        self.setBlocking(False)
        self.setStatusMessage("")
        if self.__state & State.Stale:
            # The Stale flag is set: drop this result and invalidate instead
            # (presumably this schedules a fresh computation -- see
            # __invalidate).
            self.__state = State.Ready
            self.__invalidate()
            return

        self.__state = State.Ready
        try:
            results = results.result()  # type: Dict[str, tuple]
        except Exception as ex:
            # Report the failure on the widget and continue with no terms.
            results = {}
            self.error(1, str(ex))

        if results:
            items = list(results.items())
            fdr_vals = statistics.FDR([data[1] for _, data in items])
            # Append the FDR value as the last element of each term tuple.
            terms = {
                key: data + (fdr, )
                for (key, data), fdr in zip(items, fdr_vals)
            }
        else:
            terms = {}

        self.terms = terms

        if not self.terms:
            self.warning(0, "No enriched terms found.")
        else:
            self.warning(0)

        self.treeStructDict = {}
        self.treeStructRootKey = None

        # Direct parents of every enriched term, taken from the second
        # element of each entry in the ontology term's ``related`` list.
        parents = {
            term_id: {parent for _, parent in self.ontology[term_id].related}
            for term_id in self.terms
        }

        # Invert the parent map in a single pass instead of scanning every
        # term for every other term (which was quadratic in the term count).
        children = {term_id: set() for term_id in self.terms}
        for term_id, term_parents in parents.items():
            for parent in term_parents:
                if parent in children:
                    children[parent].add(term_id)

        for term_id in self.terms:
            self.treeStructDict[term_id] = TreeNode(self.terms[term_id],
                                                    children[term_id])
            ontology_term = self.ontology[term_id]
            # A non-obsolete term without related terms is used as the root;
            # if several qualify, the last one visited wins.
            if not ontology_term.related and not getattr(
                    ontology_term, "is_obsolete", False):
                self.treeStructRootKey = term_id

        self.set_graph(terms)
        self._update_enrichment_report_output()
        self.commit()
コード例 #3
0
        def hg_cell(item_attributes):
            """
            Score one item's attribute set against every annotation group.

            For each ``(label, attributes)`` pair in
            ``grouped_annotations_items`` a p-value is obtained from
            ``p_fun`` (or fixed to 1 when the overlap has at most two
            elements). The p-values are then passed through
            ``statistics.FDR``.

            ``grouped_annotations_items``, ``p_fun``, ``N`` and ``scoring``
            are taken from the enclosing scope.

            :param item_attributes: Set-like collection of the item's
                                    attributes (must support ``&`` and
                                    ``len``).
            :return: Tuple ``(scores, fdrs)``. ``scores`` depends on
                     ``scoring``: overlap ratios for ``SCORING_EXP_RATIO``,
                     ``AnnotateSamples._scores_fdr`` of the FDR values or
                     raw p-values for ``SCORING_LOG_FDR`` /
                     ``SCORING_LOG_PVALUE``, and an empty list otherwise.
            """
            raw_p_values = []
            ratio_scores = []
            for _, group_attributes in grouped_annotations_items:
                overlap_size = len(item_attributes & group_attributes)
                # drawn balls - expressed for item
                drawn = len(item_attributes)
                # marked balls - items for a process
                marked = len(group_attributes)

                # avoid the heavy computation when the intersection is small
                raw_p_values.append(
                    p_fun(overlap_size, N, marked, drawn)
                    if overlap_size > 2 else 1
                )

                if scoring == SCORING_EXP_RATIO:
                    # the tiny constant guards against division by zero
                    ratio_scores.append(overlap_size / (marked + 1e-16))

            fdrs = statistics.FDR(raw_p_values)

            if scoring == SCORING_LOG_FDR:
                return AnnotateSamples._scores_fdr(fdrs), fdrs
            if scoring == SCORING_LOG_PVALUE:
                return AnnotateSamples._scores_fdr(raw_p_values), fdrs
            return ratio_scores, fdrs
コード例 #4
0
    def get_enriched_terms(
            self,
            genes,
            reference=None,
            evidence_codes=None,
            slims_only=False,
            aspect=None,
            prob=statistics.Binomial(),
            use_fdr=True,
            progress_callback=None,
    ):
        """
        Return a dictionary of enriched terms, with tuples of
        (list_of_genes, p_value, reference_count) for items and term
        ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

        Duplicate entries in ``genes`` and ``reference`` are counted once.

        :param genes: Iterable of genes.
        :param reference: Iterable of genes (if None all genes included in
                          the annotations will be used).
        :param evidence_codes: List of evidence codes to consider; all keys
                               of ``evidence_dict`` are used when empty or
                               None.
        :param slims_only: If `True` return only slim terms.
        :param aspect: Which aspects to use. Use all by default;
                       one of Process (biological process),
                       Function (molecular function) or Component
                       (cellular component), or a collection of those.
        :param prob: Object providing ``p_value(k, N, m, n)``, called with
                     the number of mapped query genes, the reference size,
                     the number of mapped reference genes and the query
                     size. The default instance is created once, when the
                     method is defined, and shared between calls.
        :param use_fdr: If `True` the p-values are replaced by the values
                        returned from ``statistics.FDR`` and the resulting
                        dictionary is ordered by ascending raw p-value.
        :param progress_callback: Optional callable invoked with the
                                  progress as a percentage (float).
        """
        # Materialise once: the query is iterated and measured several
        # times, and hits are counted on distinct genes.
        all_genes = set(genes)

        if aspect is None:
            aspects_set = {'Process', 'Component', 'Function'}
        elif isinstance(aspect, str):
            aspects_set = {aspect}
        else:
            aspects_set = aspect

        if reference is None:
            reference = self.genes()
        if not isinstance(reference, (set, frozenset)):
            # A list (as the docstring allows) has no ``intersection`` and
            # may contain duplicates that would inflate the population size.
            reference = set(reference)

        evidence_codes = set(evidence_codes or evidence_dict.keys())

        # Ids of the terms directly annotated to the query genes.
        annotated_terms = {
            ann.go_id
            for gene in all_genes for ann in self.gene_annotations[gene]
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        }

        ref_annotations = {
            ann
            for gene in reference for ann in self.gene_annotations[gene]
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        }

        self._ensure_ontology()

        if slims_only and not self.ontology.slims_subset:
            warnings.warn(
                "Unspecified slims subset in the ontology! "
                "Using 'goslim_generic' subset", UserWarning)
            self.ontology.set_slims_subset('goslim_generic')

        filtered_terms = [
            term for term in annotated_terms if term in self.ontology
        ]

        if len(annotated_terms) != len(filtered_terms):
            term_diff = annotated_terms - set(filtered_terms)
            warnings.warn(
                "%s terms in the annotations were not found in the "
                "ontology." % ",".join(map(repr, term_diff)),
                UserWarning,
            )

        terms = self.ontology.extract_super_graph(filtered_terms)
        res = {}

        term_count = len(terms)
        reference_size = len(reference)
        query_size = len(all_genes)
        milestones = progress_bar_milestones(term_count, 100)
        slims_subset = self.ontology.slims_subset if slims_only else None

        for i, term in enumerate(terms):
            # Note: skipped terms also skip the progress notification below.
            if slims_only and term not in slims_subset:
                continue
            all_annotations = self.get_annotations_by_go_id(term).intersection(
                ref_annotations)
            all_annotated_genes = {ann.gene_id for ann in all_annotations}
            mapped_genes = all_genes & all_annotated_genes
            mapped_reference_genes = reference & all_annotated_genes

            res[term] = (
                list(mapped_genes),
                prob.p_value(len(mapped_genes), reference_size,
                             len(mapped_reference_genes), query_size),
                len(mapped_reference_genes),
            )

            if progress_callback and i in milestones:
                progress_callback(100.0 * i / term_count)

        if use_fdr:
            # Rank by raw p-value, then swap in the corrected values.
            ranked = sorted(res.items(), key=lambda item: item[1][1])
            corrected = statistics.FDR([p for _, (_, p, _) in ranked])
            res = {
                term: (term_genes, p_adj, ref_count)
                for (term, (term_genes, _, ref_count)), p_adj in zip(
                    ranked, corrected)
            }
        return res