def omim_gene_sets(org):
    """
    Build gene sets from OMIM (Online Mendelian Inheritance in Man) diseases.

    Only human (taxonomy id ``'9606'``) is handled. For any other ``org``
    nothing is built and ``None`` is returned.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')
    collected = []

    for disease in omim.diseases():
        matcher.genes = omim.disease_genes(disease)
        matcher.run_matcher()

        # keep only the symbols that were resolved to an NCBI id
        ncbi_ids = [
            int(matched.ncbi_id)
            for matched in matcher.genes
            if matched.ncbi_id is not None
        ]

        if disease.id:
            link = OMIM_LINK.format(disease.id)
        else:
            link = None

        collected.append(
            GeneSet(gs_id=disease.id,
                    name=disease.name,
                    genes=ncbi_ids,
                    hierarchy=('OMIM', ),
                    organism='9606',
                    link=link))

    return GeneSets(collected)
def dicty_mutant_gene_sets(org):
    """
    Build Dictybase mutant-phenotype gene sets.

    Only taxonomy id ``'352472'`` is handled. For any other ``org`` nothing
    is built and ``None`` is returned.
    """
    if org != '352472':
        return None

    collected = []
    matcher = GeneMatcher('352472')

    for phenotype, mutants in dicty.phenotypes.phenotype_mutants().items():
        symbols = []
        for mutant in mutants:
            # only the first gene reported for each mutant is used
            symbols.append(dicty.phenotypes.mutant_genes(mutant)[0])

        matcher.genes = symbols
        matcher.run_matcher()

        # keep only the symbols that were resolved to an NCBI id
        ncbi_ids = [
            int(matched.ncbi_id)
            for matched in matcher.genes
            if matched.ncbi_id is not None
        ]

        # diagnostic output: some symbols could not be matched
        if len(symbols) != len(ncbi_ids):
            print(len(symbols), len(ncbi_ids))

        collected.append(
            GeneSet(gs_id=phenotype,
                    name=phenotype,
                    genes=ncbi_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism='352472',
                    link=''))

    return GeneSets(collected)
def cytoband_gene_sets(org):
    """
    Build cytoband gene sets from the Stanford Microarray Database download.

    Each downloaded line is tab separated: set id, set name, then gene
    symbols. Only human (``'9606'``) is handled; any other ``org`` yields
    ``None``.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as response:
        raw_lines = response.read().splitlines()

    collected = []
    for raw in raw_lines:
        fields = raw.decode().split('\t')
        symbols = fields[2:]

        matcher.genes = symbols
        matcher.run_matcher()

        # keep only the symbols that were resolved to an NCBI id
        ncbi_ids = [
            int(matched.ncbi_id)
            for matched in matcher.genes
            if matched.ncbi_id is not None
        ]

        collected.append(
            GeneSet(gs_id=fields[0],
                    name=fields[1],
                    # a band without any listed symbol always gets no genes
                    genes=ncbi_ids if symbols else [],
                    hierarchy=('Cytobands', ),
                    organism='9606',
                    link=''))

    return GeneSets(collected)
def reactome_gene_sets(org):
    """
    Prepare human pathway gene sets from Reactome pathways.

    The Reactome archive is downloaded into memory, the member
    ``REACTOME_FILE_NAME`` is read and every tab-separated line becomes one
    gene set: the first field is used as both id and name, fields from the
    third onwards are gene symbols that get matched to NCBI ids.

    :param str org: taxonomy id; only ``'9606'`` (human) is supported.
    :return: :class:`GeneSets` for human, ``None`` for any other organism.
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    with ZipFile(memfile, 'r') as myzip:
        # close the archive member explicitly instead of leaking the handle
        with myzip.open(REACTOME_FILE_NAME) as member:
            content = member.read().decode().splitlines()

    genesets = []
    for path in content:
        # split once per line; a slice past the end is already an empty list
        fields = path.split('\t')

        gene_matcher.genes = fields[2:]
        gene_matcher.run_matcher()

        # keep only the symbols that were resolved to an NCBI id
        genes = [
            int(gene.ncbi_id)
            for gene in gene_matcher.genes
            if gene.ncbi_id is not None
        ]

        genesets.append(
            GeneSet(gs_id=fields[0],
                    name=fields[0],
                    genes=genes,
                    hierarchy=('Reactome', 'Pathways'),
                    organism='9606',
                    link=''))

    return GeneSets(genesets)
def send_to_output(self, result):
    """
    Turn a finished download into a table, annotate it and send it out.

    ``result`` is a pair ``(etc_json, table_name)``. The table gets NCBI
    gene ids (either on the attributes or as an extra meta column), the
    table-level annotation attributes, and is then sent on the
    ``etc_data`` output.
    """
    self.progress_bar.finish()
    self.setStatusMessage('')

    etc_json, table_name = result
    genes_in_columns = bool(self.gene_as_attr_name)

    # convert to table and name it
    table = etc_to_table(etc_json, genes_in_columns)
    table.name = table_name

    # match genes
    tax_id = str(self.orgnism)
    matcher = GeneMatcher(tax_id)

    if genes_in_columns:
        matcher.match_table_attributes(table)
        table.attributes[GENE_ID_ATTRIBUTE] = NCBI_ID
    else:
        if 'Gene' in table.domain:
            gene_column = table.domain['Gene']
            matcher.genes = table.get_column_view(gene_column)[0]
            matcher.run_matcher()

            # one meta column with the matched id, '?' where unmatched
            id_rows = [[str(matched.ncbi_id) if matched.ncbi_id else '?']
                       for matched in matcher.genes]
            id_domain = Domain([], metas=[StringVariable(NCBI_ID)])
            id_table = Table(id_domain, id_rows)
            table = Table.concatenate([table, id_table])

        table.attributes[GENE_ID_COLUMN] = NCBI_ID

    # add table attributes
    table.attributes[TAX_ID] = tax_id
    table.attributes[GENE_AS_ATTRIBUTE_NAME] = genes_in_columns

    # reset cache indicators
    self.set_cached_indicator()

    # send data to the output signal
    self.Outputs.etc_data.send(table)
from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher, GENE_INFO_TAGS

# input: organism taxonomy id and the symbols we want resolved
organism = 9606
genes_symbols_to_match = ['HB1', 'BCKDHB', 'TWIST1']

# set up the matcher with the symbols and run it
gene_matcher = GeneMatcher(organism)
gene_matcher.genes = genes_symbols_to_match
gene_matcher.run_matcher()

# report what happened to every input symbol
for gene in gene_matcher.genes:
    print("\ninput name: " + gene.input_name,
          "\nid from ncbi: ", gene.ncbi_id,
          "\nmatch type: ", gene.type_of_match)

    # nothing more to show for matched genes or genes without candidates
    if gene.ncbi_id is not None or not gene.possible_hits:
        continue

    candidate_ids = [hit.ncbi_id for hit in gene.possible_hits]
    print('possible_hits: ', candidate_ids)
def Update(self):
    """
    Update (recompute enriched pathways) the widget state.

    Extracts the query (and optionally reference) gene names from the input
    data, maps the query names to NCBI ids with ``GeneMatcher`` and submits
    a background task that computes KEGG pathway enrichment. The task's
    ``finished`` signal is connected to ``_onEnrichTaskFinished``.
    Does nothing when there is no input data.
    """
    if not self.data:
        return

    # clear the error/information messages with id 0 from a previous run
    self.error(0)
    self.information(0)

    # XXX: Check data in setData, do not even allow this to be executed if
    # data has no genes
    try:
        genes = self.GeneNamesFromData(self.data)
    except ValueError:
        # report the problem but carry on with an empty query
        self.error(0, "Cannot extract gene names from input.")
        genes = []

    # a comma inside a name is taken to mean several genes in one cell
    if not self.useAttrNames and any("," in gene for gene in genes):
        genes = reduce(add, (split_and_strip(gene, ",")
                             for gene in genes),
                       [])
        self.information(0,
                         "Separators detected in input gene names. "
                         "Assuming multiple genes per instance.")

    self.queryGenes = genes

    self.information(1)
    reference = None
    if self.useReference and self.refData:
        # NOTE(review): unlike the query genes above, a ValueError raised
        # here is not caught -- confirm this is intended
        reference = self.GeneNamesFromData(self.refData)
        if not self.useAttrNames \
                and any("," in gene for gene in reference):
            reference = reduce(add, (split_and_strip(gene, ",")
                                     for gene in reference),
                               [])
            self.information(1,
                             "Separators detected in reference gene "
                             "names. Assuming multiple genes per "
                             "instance.")

    org_code = self.SelectedOrganismCode()

    from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
    gm = GeneMatcher(kegg.to_taxid(org_code))
    gm.genes = genes
    gm.run_matcher()
    # input gene name -> NCBI id (as a string); only the query genes are
    # mapped, the reference names are passed on as extracted
    mapped_genes = {gene: str(ncbi_id) for gene, ncbi_id in gm.map_input_to_ncbi().items()}

    def run_enrichment(org_code, genes, reference=None, progress=None):
        # `genes` is the mapping built above (its values are used below);
        # `reference` is an iterable of gene ids or None
        org = kegg.KEGGOrganism(org_code)
        if reference is None:
            reference = org.get_ncbi_ids()

        # This is here just to keep widget working without any major changes.
        # map not needed, geneMatcher will not work on widget level.
        unique_genes = genes
        unique_ref_genes = dict([(gene, gene) for gene in set(reference)])

        taxid = kegg.to_taxid(org.org_code)
        # Map the taxid back to standard 'common' taxids
        # (as used by 'geneset') if applicable
        r_tax_map = dict((v, k) for k, v in
                         kegg.KEGGGenome.TAXID_MAP.items())
        if taxid in r_tax_map:
            taxid = r_tax_map[taxid]
        # NOTE(review): `taxid` is not used after this point in this function

        # We use the kegg pathway gene sets provided by 'geneset' for
        # the enrichment calculation.
        # NOTE(review): the sets below are actually built from the KEGG API
        # link/conv results and only wrapped in geneset.GeneSets

        kegg_api = kegg.api.CachedKeggApi()
        linkmap = kegg_api.link(org.org_code, "pathway")
        converted_ids = kegg_api.conv(org.org_code, 'ncbi-geneid')
        # upper-cased gene entry -> the part after the last ':' of the
        # paired 'ncbi' entry
        kegg_sets = relation_list_to_multimap(linkmap, dict((gene.upper(), ncbi.split(':')[-1])
                                                            for ncbi, gene in converted_ids))

        kegg_sets = geneset.GeneSets(input=kegg_sets)

        pathways = pathway_enrichment(
            kegg_sets, unique_genes.values(),
            unique_ref_genes.keys(),
            callback=progress
        )
        # Ensure that pathway entries are pre-cached for later use in the
        # list/tree view
        kegg_pathways = kegg.KEGGPathways()
        kegg_pathways.pre_cache(
            pathways.keys(), progress_callback=progress
        )

        return pathways, org, unique_genes, unique_ref_genes

    # disable the widget while the task is running
    self.progressBarInit()
    self.setEnabled(False)
    self.infoLabel.setText("Retrieving...\n")

    progress = concurrent.methodinvoke(self, "setProgress", (float,))

    self._enrichTask = concurrent.Task(
        function=lambda:
            run_enrichment(org_code, mapped_genes, reference, progress)
    )
    self._enrichTask.finished.connect(self._onEnrichTaskFinished)
    self._executor.submit(self._enrichTask)
class Annotations:
    """
    :class:`Annotations` object holds the annotations.

    :param str organism: an organism specifier (e.g. ``'9606'``). Annotations
        for that organism will be loaded.
    :param ontology: :class:`Ontology` object for annotations. When omitted,
        a default :class:`Ontology` is created the first time it is needed.
    :type ontology: :class:`Ontology`
    :param progress_callback: passed on to the annotation file download.
    :raises taxonomy.UnknownSpeciesIdentifier: when no annotation file can be
        found for ``organism``.
    """

    def __init__(self, organism, ontology=None, progress_callback=None):
        self._init_state(organism, ontology)

        try:
            path = serverfiles.localpath_download(
                DOMAIN,
                FILENAME_ANNOTATION.format(organism),
                progress_callback=progress_callback)
        except FileNotFoundError:
            raise taxonomy.UnknownSpeciesIdentifier(organism)

        self._parse_file(path)

    def _init_state(self, organism, ontology):
        """ Create the empty containers (no file is downloaded or parsed). """
        # Set the ontology once and never reset it afterwards, otherwise the
        # ontology handed to the constructor would be thrown away.
        self._ontology = ontology

        #: A dictionary mapping a gene (gene_id) to a list of all annotations of that gene.
        self.gene_annotations = defaultdict(list)

        #: A dictionary mapping a GO term id to a list of annotations that are directly annotated to that term
        self.term_anotations = defaultdict(list)

        #: Cache of collected annotations per GO term; reset on every change.
        self.all_annotations = defaultdict(list)

        self._gene_names = None
        self._gene_names_dict = None
        self.gene_matcher = GeneMatcher(organism)

        #: A list of all :class:`AnnotationRecords` instances.
        self.annotations = []
        self.header = ''
        self.taxid = organism

    @property
    def ontology(self):
        return self._ontology

    @ontology.setter
    def ontology(self, ontology):
        """ Set the ontology to use in the annotations mapping. """
        # cached per-term results depend on the ontology graph
        self.all_annotations = defaultdict(list)
        self._ontology = ontology

    def _ensure_ontology(self):
        """ Create a default :class:`Ontology` if none has been set. """
        if self.ontology is None:
            self.ontology = Ontology()

    def _parse_file(self, file_path):
        """ Read the header line, then add one annotation per remaining line. """
        with open(file_path, 'r') as anno_file:
            self.header = anno_file.readline()
            # iterate lazily instead of loading every line with readlines()
            for line in anno_file:
                self.add_annotation(AnnotationRecord.from_string(line))

    def add_annotation(self, a):
        """
        Add a single :class:`AnotationRecord` instance to this object.

        Records without a gene id or GO id, and records with the ``'NOT'``
        qualifier, are ignored.
        """
        if not isinstance(a, AnnotationRecord):
            a = AnnotationRecord(a)

        if not a.gene_id or not a.go_id or a.qualifier == 'NOT':
            return

        self.gene_annotations[int(a.gene_id)].append(a)
        self.term_anotations[a.go_id].append(a)
        self.annotations.append(a)

        # invalidate the per-term cache
        self.all_annotations = defaultdict(list)

    def _annotated_matches(self):
        """ Yield ``(input_gene, ncbi_id)`` for matched genes that have annotations. """
        for input_gene, ncbi_id in self.gene_matcher.map_input_to_ncbi().items():
            # .get() rather than indexing: a plain membership test must not
            # create empty entries in the defaultdict
            if self.gene_annotations.get(ncbi_id):
                yield input_gene, ncbi_id

    def map_to_ncbi_id(self, genes):
        """
        Run gene name matching and return only known genes.

        :return: dict ``input gene -> ncbi id`` restricted to genes that have
            annotations, or ``None`` when the gene matcher is falsy.
        """
        self.gene_matcher.genes = genes
        self.gene_matcher.run_matcher()

        if self.gene_matcher:
            return dict(self._annotated_matches())
        return None

    def map_from_ncbi_id(self):
        """
        Inverse of :meth:`map_to_ncbi_id` for the most recent matcher run.

        :return: dict ``ncbi id -> input gene`` or ``None`` when the gene
            matcher is falsy.
        """
        if self.gene_matcher:
            return {ncbi_id: input_gene
                    for input_gene, ncbi_id in self._annotated_matches()}
        return None

    def _collect_annotations(self, go_id, visited):
        """ Recursive function collects and caches all annotations for id """
        if go_id not in self.all_annotations and go_id not in visited:
            if go_id in self.ontology.reverse_alias_mapper:
                annotations = [
                    self.term_anotations.get(alt_id, [])
                    for alt_id in self.ontology.reverse_alias_mapper[go_id]
                ] + [self.term_anotations[go_id]]
            else:
                # annotations for this term alone
                annotations = [self.term_anotations[go_id]]

            visited.add(go_id)

            for _type_id, child in self.ontology[go_id].related_to:
                collected = self._collect_annotations(child, visited)
                if isinstance(collected, set):
                    # already reduced to a set in get_annotations_by_go_id
                    annotations.append(collected)
                else:
                    annotations.extend(collected)

            self.all_annotations[go_id] = annotations

        return self.all_annotations[go_id]

    def get_annotations_by_go_id(self, go_id):
        """
        Return a set of all annotations (instances of :obj:`AnnotationRecord`)
        for GO term `id` and all it's subterms.

        Args:
            go_id (:obj:`str`): GO term id
        """
        self._ensure_ontology()
        term_id = self.ontology.alias_mapper.get(go_id, go_id)

        # a list in the cache is an intermediate (unreduced) result
        if term_id not in self.all_annotations \
                or isinstance(self.all_annotations[term_id], list):
            annot_set = set()
            for annots in self._collect_annotations(term_id, set()):
                annot_set.update(annots)
            self.all_annotations[term_id] = annot_set

        return self.all_annotations[term_id]

    def get_genes_by_go_term(self, go_id, evidence_codes=None):
        """
        Return a list of genes annotated by specified `evidence_codes`
        to GO term 'id' and all it's subterms."

        :param str go_id: GO term id
        :param list-of-strings evidence_codes:
            List of evidence codes to consider when matching annotations
            to terms.
        """
        evidence_codes = set(evidence_codes or evidenceDict.keys())
        annotations = self.get_annotations_by_go_id(go_id)
        return list({int(ann.gene_id) for ann in annotations
                     if ann.evidence in evidence_codes})

    def genes(self):
        """ Return the set of gene ids (as ints) of all stored annotations. """
        return {int(ann.gene_id) for ann in self.annotations}

    def get_enriched_terms(self, genes, reference=None, evidence_codes=None,
                           slims_only=False, aspect=None, prob=None,
                           use_fdr=True, progress_callback=None):
        """
        Return a dictionary of enriched terms, with tuples of
        (list_of_genes, p_value, reference_count) for items and term
        ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

        The returned gene lists hold the input names from the most recent
        :meth:`map_to_ncbi_id` run, so that must have been called with these
        genes beforehand.

        Args:
            genes: List of genes (NCBI ids as used in ``gene_annotations``)
            reference: List or set of genes. When empty or None, the query
                genes themselves are used as the reference.
            evidence_codes: List of evidence codes to consider.
            slims_only: If `True` return only slim terms.
            aspect: Which aspects to use. Use all by default;
                one of Process (biological process),
                Function (molecular function) or Component (cellular component)
            prob: object providing ``p_value(k, N, m, n)``; defaults to a new
                ``statistics.Binomial()``.
            use_fdr: adjust the p-values with ``statistics.FDR``.
            progress_callback: called with a percentage at milestones.
        """
        if prob is None:
            # created per call instead of once at definition time
            prob = statistics.Binomial()

        all_genes = set(genes)
        # the set algebra below needs a set even when a list was passed
        reference = set(reference) if reference else all_genes

        if aspect is None:
            aspects_set = {'Process', 'Component', 'Function'}
        elif isinstance(aspect, str):
            aspects_set = {aspect}
        else:
            aspects_set = aspect

        evidence_codes = set(evidence_codes or evidenceDict.keys())

        # .get() keeps lookups from creating empty defaultdict entries
        annotations = [
            ann
            for gene in genes for ann in self.gene_annotations.get(gene, ())
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        ]
        ref_annotations = {
            ann
            for gene in reference for ann in self.gene_annotations.get(gene, ())
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        }

        annotations_dict = defaultdict(set)
        for ann in annotations:
            annotations_dict[ann.go_id].add(ann)

        self._ensure_ontology()

        if slims_only and not self.ontology.slims_subset:
            warnings.warn(
                "Unspecified slims subset in the ontology! "
                "Using 'goslim_generic' subset", UserWarning)
            self.ontology.set_slims_subset('goslim_generic')

        terms = annotations_dict.keys()
        filtered_terms = [term for term in terms if term in self.ontology]

        if len(terms) != len(filtered_terms):
            term_diff = set(terms) - set(filtered_terms)
            warnings.warn(
                "%s terms in the annotations were not found in the "
                "ontology." % ",".join(map(repr, term_diff)), UserWarning)

        terms = self.ontology.extract_super_graph(filtered_terms)
        res = {}

        milestones = progress_bar_milestones(len(terms), 100)
        unmatch = self.map_from_ncbi_id()

        for i, term in enumerate(terms):
            if slims_only and term not in self.ontology.slims_subset:
                continue

            all_annotations = self.get_annotations_by_go_id(
                term).intersection(ref_annotations)
            all_annotated_genes = {int(ann.gene_id)
                                   for ann in all_annotations}
            mapped_genes = all_genes.intersection(all_annotated_genes)
            mapped_reference_genes = reference.intersection(
                all_annotated_genes)

            res[term] = ([unmatch[gene] for gene in mapped_genes],
                         prob.p_value(len(mapped_genes), len(reference),
                                      len(mapped_reference_genes),
                                      len(genes)),
                         len(mapped_reference_genes))

            if progress_callback and i in milestones:
                progress_callback(100.0 * i / len(terms))

        if use_fdr:
            # FDR expects the p-values in ascending order
            ranked = sorted(res.items(), key=lambda item: item[1][1])
            adjusted = statistics.FDR([p for _, (_, p, _) in ranked])
            res = {
                term_id: (term_genes, p, ref_count)
                for (term_id, (term_genes, _, ref_count)), p
                in zip(ranked, adjusted)
            }

        return res

    def get_annotated_terms(self, genes, direct_annotation_only=False,
                            evidence_codes=None, progress_callback=None):
        """
        Return all terms that are annotated by genes with evidence_codes.

        :param genes: a gene name or an iterable of gene names; names that
            cannot be mapped to an annotated NCBI id are skipped.
        :param bool direct_annotation_only: when False (default) the terms
            are extended with the ontology super graph.
        :param evidence_codes: evidence codes to consider (all by default).
        :param progress_callback: currently unused.
        :return: dict mapping GO term id to the set of input gene names.
        """
        genes = [genes] if isinstance(genes, str) else genes

        match = self.map_to_ncbi_id(genes) or {}
        unmatch = self.map_from_ncbi_id() or {}

        # unknown genes have no entry in `match`; skip them instead of
        # failing with KeyError
        genes = {match[gene] for gene in genes if gene in match}

        evidence_codes = set(evidence_codes or evidenceDict.keys())
        annotations = [
            ann
            for gene in genes for ann in self.gene_annotations.get(gene, ())
            if ann.evidence in evidence_codes
        ]

        dd = defaultdict(set)
        for ann in annotations:
            dd[ann.go_id].add(unmatch[int(ann.gene_id)])

        if not direct_annotation_only:
            self._ensure_ontology()
            terms = dd.keys()
            filtered_terms = [term for term in terms if term in self.ontology]

            if len(terms) != len(filtered_terms):
                term_diff = set(terms) - set(filtered_terms)
                warnings.warn(
                    "%s terms in the annotations were not found in the "
                    "ontology." % ",".join(map(repr, term_diff)), UserWarning)

            terms = self.ontology.extract_super_graph(filtered_terms)
            for term in terms:
                term_annotations = self.get_annotations_by_go_id(
                    term).intersection(annotations)
                dd[term].update(
                    unmatch[int(ann.gene_id)] for ann in term_annotations)

        return dict(dd)

    def __add__(self, iterable):
        """
        Return a new Annotations object with combined annotations

        The new object is created for the same organism and ontology without
        loading the annotation file again; it receives the records of this
        object followed by the records from ``iterable``.
        """
        combined = type(self).__new__(type(self))
        combined._init_state(self.taxid, self._ontology)
        combined.header = self.header
        combined.extend(self)
        combined.extend(iterable)
        return combined

    def __iadd__(self, iterable):
        """ Add annotations to this instance """
        self.extend(iterable)
        return self

    def __contains__(self, item):
        return item in self.annotations

    def __iter__(self):
        """ Iterate over all AnnotationRecord objects in annotations """
        return iter(self.annotations)

    def __len__(self):
        """ Return the number of annotations """
        return len(self.annotations)

    def __getitem__(self, index):
        """ Return the i-th annotation record """
        return self.annotations[index]

    def __getslice__(self, *args):
        """ Legacy slicing hook; Python 3 lists have no ``__getslice__``. """
        return self.annotations[slice(*args)]

    def add(self, line):
        """ Add one annotation """
        self.add_annotation(line)

    def append(self, line):
        """ Add one annotation """
        self.add_annotation(line)

    def extend(self, lines):
        """ Add multiple annotations """
        for line in lines:
            self.add_annotation(line)
def _on_dataready(self):
    """
    Handle the finished data download task.

    Re-enables the widget, fetches the task result, filters it by the
    selected sample annotations, adds NCBI gene ids, sets the table-level
    attributes and sends the table on the "Expression Data" output.
    On ``URLError`` an error is shown and nothing is sent.
    """
    self.setEnabled(True)
    self.setBlocking(False)
    self.progressBarFinished(processEvents=False)

    try:
        data = self._datatask.result()
    except urlrequest.URLError as error:
        self.error(0, ("Error while connecting to the NCBI ftp server! "
                       "'%s'" % error))
        sys.excepthook(type(error), error, getattr(error, "__traceback__"))
        return
    finally:
        # the task reference is dropped whether or not it succeeded
        self._datatask = None

    # remember the name; `data` is rebound to new tables below
    data_name = data.name
    samples, _ = self.selectedSamples()

    self.warning(0)
    message = None
    from orangecontrib.bioinformatics.ncbi.gene import GeneMatcher
    gene_matcher = GeneMatcher(self.currentGds.get('taxid', ''))

    if self.outputRows:
        # filter the table rows by the selected sample annotations
        def samplesinst(ex):
            # (name, value) pairs describing one row: all metas, plus the
            # class variable unless it is named 'class'
            out = []
            for meta in data.domain.metas:
                out.append((meta.name, ex[meta].value))

            if data.domain.class_var.name != 'class':
                out.append((data.domain.class_var.name,
                            ex[data.domain.class_var].value))

            return out
        samples = set(samples)
        mask = [samples.issuperset(samplesinst(ex)) for ex in data]
        data = data[numpy.array(mask, dtype=bool)]
        gene_matcher.match_table_attributes(data)
        if len(data) == 0:
            message = "No samples with selected sample annotations."
    else:
        # filter the attributes (columns) by the selected sample annotations
        samples = set(samples)
        domain = Domain(
            [attr for attr in data.domain.attributes
             if samples.issuperset(attr.attributes.items())],
            data.domain.class_var,
            data.domain.metas
        )
        # domain.addmetas(data.domain.getmetas())

        if len(domain.attributes) == 0:
            message = "No samples with selected sample annotations."
        # keep only the annotation keys that occur in the selection
        stypes = set(s[0] for s in samples)
        for attr in domain.attributes:
            attr.attributes = dict(
                (key, value) for key, value in attr.attributes.items()
                if key in stypes
            )
        data = Table(domain, data)

        if 'gene' in data.domain:
            gene_column = data.domain['gene']
            gene_names = data.get_column_view(gene_column)[0]
            gene_matcher.genes = gene_names
            gene_matcher.run_matcher()

            # one meta column with the matched id, '?' where unmatched
            domain_ids = Domain([], metas=[StringVariable(NCBI_ID)])
            data_ids = [[str(gene.ncbi_id) if gene.ncbi_id else '?'] for gene in gene_matcher.genes]
            table_ids = Table(domain_ids, data_ids)
            data = Table.concatenate([data, table_ids])

    if message is not None:
        self.warning(0, message)

    # table-level annotations
    data.attributes[TAX_ID] = self.currentGds.get('taxid', '')
    data.attributes[GENE_AS_ATTRIBUTE_NAME] = bool(self.outputRows)

    if not bool(self.outputRows):
        data.attributes[GENE_ID_COLUMN] = NCBI_ID
    else:
        data.attributes[GENE_ID_ATTRIBUTE] = NCBI_ID

    data.name = data_name
    self.send("Expression Data", data)

    # update the first column of the current dataset's row in the tree view
    model = self.treeWidget.model().sourceModel()
    row = self.gds.index(self.currentGds)

    model.setData(model.index(row, 0), " ", Qt.DisplayRole)

    self.updateInfo()
    self.selectionChanged = False