def Update(self):
    """
    Update (recompute enriched pathways) the widget state.

    Gene names are read from the input data (and from the reference data
    when ``useReference`` is set and reference data is present), the query
    genes are mapped to NCBI ids with ``GeneMatcher`` and the KEGG pathway
    enrichment is submitted to ``self._executor`` as a ``concurrent.Task``.
    ``self._onEnrichTaskFinished`` is connected to the task's ``finished``
    signal. Returns immediately when there is no input data.
    """
    if not self.data:
        return

    self.error(0)
    self.information(0)

    def expand_multi_gene_names(names):
        """Split comma separated entries; return ``(names, was_split)``."""
        if self.useAttrNames or not any("," in name for name in names):
            return names, False
        # Flatten in a single pass. The previous ``reduce(add, ..., [])``
        # re-copied the accumulated list for every entry (quadratic time).
        flat = [part for name in names for part in split_and_strip(name, ",")]
        return flat, True

    # XXX: Check data in setData, do not even allow this to be executed if
    # data has no genes
    try:
        genes = self.GeneNamesFromData(self.data)
    except ValueError:
        self.error(0, "Cannot extract gene names from input.")
        genes = []

    genes, was_split = expand_multi_gene_names(genes)
    if was_split:
        self.information(0, "Separators detected in input gene names. "
                            "Assuming multiple genes per instance.")

    self.queryGenes = genes

    self.information(1)
    reference = None
    if self.useReference and self.refData:
        # NOTE(review): unlike the input data above, a ValueError raised
        # here is not handled and propagates to the caller - confirm that
        # this is intended.
        reference = self.GeneNamesFromData(self.refData)
        reference, was_split = expand_multi_gene_names(reference)
        if was_split:
            self.information(1, "Separators detected in reference gene "
                                "names. Assuming multiple genes per "
                                "instance.")

    org_code = self.SelectedOrganismCode()

    from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
    gm = GeneMatcher(kegg.to_taxid(org_code))
    gm.genes = genes
    gm.run_matcher()
    # input gene name -> NCBI id (as a string)
    mapped_genes = {
        gene: str(ncbi_id)
        for gene, ncbi_id in gm.map_input_to_ncbi().items()
    }

    def run_enrichment(org_code, genes, reference=None, progress=None):
        """
        Compute the pathway enrichment (executed by the submitted task).

        ``genes`` is the ``{input name: NCBI id}`` mapping built above.
        Returns ``(pathways, org, unique_genes, unique_ref_genes)``.
        """
        org = kegg.KEGGOrganism(org_code)
        if reference is None:
            reference = org.get_ncbi_ids()

        # This is here just to keep widget working without any major changes.
        # map not needed, geneMatcher will not work on widget level.
        unique_genes = genes
        unique_ref_genes = {gene: gene for gene in set(reference)}

        # The KEGG pathway gene sets are built from the organism -> pathway
        # links, with KEGG gene ids translated to NCBI gene ids.
        kegg_api = kegg.api.CachedKeggApi()
        linkmap = kegg_api.link(org.org_code, "pathway")
        converted_ids = kegg_api.conv(org.org_code, 'ncbi-geneid')
        kegg_to_ncbi = {
            gene.upper(): ncbi.split(':')[-1]
            for ncbi, gene in converted_ids
        }
        kegg_sets = relation_list_to_multimap(linkmap, kegg_to_ncbi)
        kegg_sets = geneset.GeneSets(input=kegg_sets)

        pathways = pathway_enrichment(
            kegg_sets, unique_genes.values(),
            unique_ref_genes.keys(),
            callback=progress
        )
        # Ensure that pathway entries are pre-cached for later use in the
        # list/tree view
        kegg_pathways = kegg.KEGGPathways()
        kegg_pathways.pre_cache(
            pathways.keys(), progress_callback=progress
        )

        return pathways, org, unique_genes, unique_ref_genes

    self.progressBarInit()
    self.setEnabled(False)
    self.infoLabel.setText("Retrieving...\n")

    progress = concurrent.methodinvoke(self, "setProgress", (float,))

    self._enrichTask = concurrent.Task(
        function=lambda:
            run_enrichment(org_code, mapped_genes, reference, progress)
    )
    self._enrichTask.finished.connect(self._onEnrichTaskFinished)
    self._executor.submit(self._enrichTask)
class Annotations:
    """
    :class:`Annotations` object holds the annotations.

    :param str organism:
        an organism specifier (e.g. ``'9606'``). Annotations for that
        organism will be loaded.
    :param ontology: :class:`Ontology` object for annotations
    :type ontology: :class:`Ontology`
    :param progress_callback:
        passed to ``serverfiles.localpath_download`` while the annotation
        file is fetched.
    :raises taxonomy.UnknownSpeciesIdentifier:
        if no annotation file is found for `organism`.
    """

    def __init__(self, organism, ontology=None, progress_callback=None):
        self._init_state(organism, ontology)

        try:
            path = serverfiles.localpath_download(
                DOMAIN,
                FILENAME_ANNOTATION.format(organism),
                progress_callback=progress_callback,
            )
        except FileNotFoundError as err:
            raise taxonomy.UnknownSpeciesIdentifier(organism) from err

        self._parse_file(path)

    def _init_state(self, organism, ontology):
        """ Create the empty containers (shared by ``__init__`` and ``__add__``). """
        # Assigned through the property setter, which also creates
        # ``all_annotations``. ``_ontology`` must not be reset afterwards:
        # doing so discarded the ontology supplied by the caller.
        self.ontology = ontology

        #: A dictionary mapping a gene (int gene_id) to a list of all annotations of that gene.
        self.gene_annotations = defaultdict(list)
        #: A dictionary mapping a GO term id to a list of annotations that are directly
        #: annotated to that term
        self.term_anotations = defaultdict(list)
        #: Cache: GO term id -> annotations collected for the term and its related terms.
        #: Holds a list of iterables until `get_annotations_by_go_id` reduces it to a set.
        self.all_annotations = defaultdict(list)

        self._gene_names = None
        self._gene_names_dict = None
        self.gene_matcher = GeneMatcher(organism)

        #: A list of all :class:`AnnotationRecords` instances.
        self.annotations = []
        self.header = ''
        self.taxid = organism

    @property
    def ontology(self):
        return self._ontology

    @ontology.setter
    def ontology(self, ontology):
        """ Set the ontology to use in the annotations mapping (resets the cache). """
        self.all_annotations = defaultdict(list)
        self._ontology = ontology

    def _ensure_ontology(self):
        """ Create a default :class:`Ontology` if none has been set. """
        if self.ontology is None:
            self.ontology = Ontology()

    def _parse_file(self, file_path):
        """ Read the header line, then add one annotation per remaining line. """
        with open(file_path, 'r') as anno_file:
            self.header = anno_file.readline()
            # Iterate the file lazily instead of materialising all lines.
            for line in anno_file:
                self.add_annotation(AnnotationRecord.from_string(line))

    def add_annotation(self, a):
        """
        Add a single :class:`AnotationRecord` instance to this object.

        Values that are not :class:`AnnotationRecord` instances are passed
        to the :class:`AnnotationRecord` constructor first. Records without
        a gene id or GO id, and records with the ``'NOT'`` qualifier, are
        ignored.
        """
        if not isinstance(a, AnnotationRecord):
            a = AnnotationRecord(a)
        if not a.gene_id or not a.go_id or a.qualifier == 'NOT':
            return

        self.gene_annotations[int(a.gene_id)].append(a)
        self.term_anotations[a.go_id].append(a)
        self.annotations.append(a)
        # Any cached per-term result is stale now.
        self.all_annotations = defaultdict(list)

    def map_to_ncbi_id(self, genes):
        """
        Run gene name matching and return only known genes.

        Returns a dict ``{input gene: ncbi id}`` restricted to ids that have
        at least one annotation, or ``None`` if the gene matcher is falsy.
        """
        self.gene_matcher.genes = genes
        self.gene_matcher.run_matcher()

        if not self.gene_matcher:
            return None
        # ``.get`` rather than indexing: indexing the defaultdict would
        # insert an empty list for every unknown id that is probed.
        return {
            input_gene: ncbi_id
            for input_gene, ncbi_id in self.gene_matcher.map_input_to_ncbi().items()
            if self.gene_annotations.get(ncbi_id)
        }

    def map_from_ncbi_id(self):
        """
        Return the reverse of :meth:`map_to_ncbi_id`: ``{ncbi id: input gene}``
        for the matcher's current result, or ``None`` if the matcher is falsy.
        """
        if not self.gene_matcher:
            return None
        return {
            ncbi_id: input_gene
            for input_gene, ncbi_id in self.gene_matcher.map_input_to_ncbi().items()
            if self.gene_annotations.get(ncbi_id)
        }

    def _collect_annotations(self, go_id, visited):
        """
        Recursive function collects and caches all annotations for id.

        `visited` is the set of term ids already entered during the current
        traversal; a term that is in `visited` but not cached yet is not
        descended into again.
        """
        if go_id not in self.all_annotations and go_id not in visited:
            # annotations for this term alone
            annotations = [self.term_anotations.get(go_id, [])]
            if go_id in self.ontology.reverse_alias_mapper:
                # ... preceded by the annotations of its alternative ids
                annotations = [
                    self.term_anotations.get(alt_id, [])
                    for alt_id in self.ontology.reverse_alias_mapper[go_id]
                ] + annotations

            visited.add(go_id)
            for _type_id, child in self.ontology[go_id].related_to:
                child_annotations = self._collect_annotations(child, visited)
                if isinstance(child_annotations, set):
                    # already reduced to a set by get_annotations_by_go_id
                    annotations.append(child_annotations)
                else:
                    annotations.extend(child_annotations)
            self.all_annotations[go_id] = annotations
        return self.all_annotations[go_id]

    def get_annotations_by_go_id(self, go_id):
        """
        Return a set of all annotations (instances of :obj:`AnnotationRecord`)
        for GO term `id` and all its subterms.

        Args:
            go_id (:obj:`str`): GO term id (aliases are resolved through the
                ontology's ``alias_mapper``)
        """
        self._ensure_ontology()
        term_id = self.ontology.alias_mapper.get(go_id, go_id)
        if term_id not in self.all_annotations or isinstance(self.all_annotations[term_id], list):
            annot_set = set()
            for annots in self._collect_annotations(term_id, set()):
                annot_set.update(annots)
            # Cache the reduced form.
            self.all_annotations[term_id] = annot_set
        return self.all_annotations[term_id]

    def get_genes_by_go_term(self, go_id, evidence_codes=None):
        """
        Return a list of genes annotated by specified `evidence_codes`
        to GO term 'id' and all its subterms.

        :param str go_id: GO term id

        :param list-of-strings evidence_codes:
            List of evidence codes to consider when matching annotations
            to terms (all codes in ``evidenceDict`` by default).
        """
        evidence_codes = set(evidence_codes or evidenceDict.keys())
        annotations = self.get_annotations_by_go_id(go_id)
        return list({int(ann.gene_id) for ann in annotations if ann.evidence in evidence_codes})

    def genes(self):
        """ Return the set of (int) gene ids of all stored annotations. """
        return {int(ann.gene_id) for ann in self.annotations}

    def get_enriched_terms(self, genes, reference=None, evidence_codes=None, slims_only=False,
                           aspect=None, prob=None, use_fdr=True, progress_callback=None):
        """
        Return a dictionary of enriched terms, with tuples of
        (list_of_genes, p_value, reference_count) for items and term
        ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

        Args:
            genes: List of genes
            reference: List of genes (if None or empty, all genes included
                in the annotations will be used).
            evidence_codes: List of evidence codes to consider.
            slims_only: If `True` return only slim terms.
            aspect: Which aspects to use. Use all by default;
                one of Process (biological process), Function (molecular function)
                or Component (cellular component), or a collection of these.
            prob: object providing ``p_value(...)``; ``statistics.Binomial()``
                is used when omitted.
            use_fdr: adjust the p-values with ``statistics.FDR``.
            progress_callback: called with a percentage (float) at the
                milestones given by ``progress_bar_milestones``.

        Genes in the result are reported under the names known to the gene
        matcher (see :meth:`map_from_ncbi_id`); a gene the matcher has no
        name for is reported by the id it was passed in as.
        """
        if prob is None:
            # Created per call rather than once in the signature, where it
            # would be evaluated when the class body is executed.
            prob = statistics.Binomial()

        all_genes = set(genes)
        if not reference:
            # Documented default: every annotated gene. Using the query genes
            # as their own reference made the enrichment test degenerate.
            reference = self.genes()
        else:
            # Set operations are used below; accept any iterable.
            reference = set(reference)

        if aspect is None:
            aspects_set = {'Process', 'Component', 'Function'}
        elif isinstance(aspect, str):
            aspects_set = {aspect}
        else:
            aspects_set = aspect

        evidence_codes = set(evidence_codes or evidenceDict.keys())

        def selected(ann):
            return ann.evidence in evidence_codes and ann.aspect in aspects_set

        # ``.get`` keeps lookups of unknown genes from growing the defaultdict.
        annotations = [
            ann for gene in genes for ann in self.gene_annotations.get(gene, ()) if selected(ann)
        ]
        ref_annotations = {
            ann for gene in reference for ann in self.gene_annotations.get(gene, ()) if selected(ann)
        }

        annotations_dict = defaultdict(set)
        for ann in annotations:
            annotations_dict[ann.go_id].add(ann)

        self._ensure_ontology()

        if slims_only and not self.ontology.slims_subset:
            warnings.warn(
                "Unspecified slims subset in the ontology! "
                "Using 'goslim_generic' subset", UserWarning)
            self.ontology.set_slims_subset('goslim_generic')

        terms = annotations_dict.keys()
        filtered_terms = [term for term in terms if term in self.ontology]

        if len(terms) != len(filtered_terms):
            term_diff = set(terms) - set(filtered_terms)
            warnings.warn(
                "%s terms in the annotations were not found in the "
                "ontology." % ",".join(map(repr, term_diff)), UserWarning)

        terms = self.ontology.extract_super_graph(filtered_terms)
        res = {}

        milestones = progress_bar_milestones(len(terms), 100)
        unmatch = self.map_from_ncbi_id() or {}

        for i, term in enumerate(terms):
            if slims_only and term not in self.ontology.slims_subset:
                continue
            term_annotations = self.get_annotations_by_go_id(term).intersection(ref_annotations)
            annotated_genes = {int(ann.gene_id) for ann in term_annotations}
            mapped_genes = all_genes.intersection(annotated_genes)
            mapped_reference_genes = reference.intersection(annotated_genes)

            res[term] = (
                [unmatch.get(gene, gene) for gene in mapped_genes],
                prob.p_value(len(mapped_genes), len(reference),
                             len(mapped_reference_genes), len(genes)),
                len(mapped_reference_genes),
            )

            if progress_callback and i in milestones:
                progress_callback(100.0 * i / len(terms))

        if use_fdr:
            # FDR is applied to the p-values in ascending order.
            ranked = sorted(res.items(), key=lambda item: item[1][1])
            adjusted = statistics.FDR([p for _, (_, p, _) in ranked])
            res = {
                term: (term_genes, p_adjusted, ref_count)
                for (term, (term_genes, _, ref_count)), p_adjusted in zip(ranked, adjusted)
            }
        return res

    def get_annotated_terms(self, genes, direct_annotation_only=False, evidence_codes=None,
                            progress_callback=None):
        """
        Return all terms that are annotated by genes with evidence_codes.

        `genes` (a single name or an iterable of names) is run through the
        gene matcher first; names that cannot be matched to an annotated
        gene are skipped. The result maps each GO term id to the set of
        input gene names annotated to it. Unless `direct_annotation_only`
        is set, the terms of the ontology super graph are included as well.
        """
        genes = [genes] if isinstance(genes, str) else genes

        match = self.map_to_ncbi_id(genes) or {}
        unmatch = self.map_from_ncbi_id() or {}

        # Only genes the matcher resolved are present in ``match``; indexing
        # it unconditionally raised KeyError for every unknown input gene.
        genes = {match[gene] for gene in genes if gene in match}

        evidence_codes = set(evidence_codes or evidenceDict.keys())
        annotations = [
            ann
            for gene in genes
            for ann in self.gene_annotations.get(gene, ())
            if ann.evidence in evidence_codes
        ]

        def input_name(ann):
            gene_id = int(ann.gene_id)
            return unmatch.get(gene_id, gene_id)

        dd = defaultdict(set)
        for ann in annotations:
            dd[ann.go_id].add(input_name(ann))

        if not direct_annotation_only:
            self._ensure_ontology()
            terms = dd.keys()
            filtered_terms = [term for term in terms if term in self.ontology]
            if len(terms) != len(filtered_terms):
                term_diff = set(terms) - set(filtered_terms)
                warnings.warn(
                    "%s terms in the annotations were not found in the "
                    "ontology." % ",".join(map(repr, term_diff)), UserWarning)

            terms = self.ontology.extract_super_graph(filtered_terms)
            for term in terms:
                term_annotations = self.get_annotations_by_go_id(term).intersection(annotations)
                dd[term].update(input_name(ann) for ann in term_annotations)
        return dict(dd)

    def __add__(self, iterable):
        """
        Return a new Annotations object with combined annotations.

        The result has the same organism, ontology and header as this
        object and holds this object's records followed by those of
        `iterable`. It is built in memory; the annotation file is not read
        again.
        """
        # ``__init__`` expects an organism id and loads the annotation file,
        # so it cannot be used to wrap a list of records.
        combined = type(self).__new__(type(self))
        combined._init_state(self.taxid, self.ontology)
        combined.header = self.header
        combined.extend(self)
        combined.extend(iterable)
        return combined

    def __iadd__(self, iterable):
        """ Add annotations to this instance """
        self.extend(iterable)
        return self

    def __contains__(self, item):
        return item in self.annotations

    def __iter__(self):
        """ Iterate over all AnnotationRecord objects in annotations """
        return iter(self.annotations)

    def __len__(self):
        """ Return the number of annotations """
        return len(self.annotations)

    def __getitem__(self, index):
        """ Return the i-th annotation record (or a list for a slice) """
        return self.annotations[index]

    def __getslice__(self, *args):
        """ Legacy slice hook; lists no longer provide ``__getslice__``. """
        return self.annotations[slice(*args)]

    def add(self, line):
        """ Add one annotation """
        self.add_annotation(line)

    def append(self, line):
        """ Add one annotation """
        self.add_annotation(line)

    def extend(self, lines):
        """ Add multiple annotations """
        for line in lines:
            self.add_annotation(line)