def run_phenolog(ont, aset, args):
    """
    Like run_enrichment_test, but uses classes from a 2nd ontology/assocset
    to build the gene set.

    Every node of ``aset.ontology`` that is annotated to more than two
    subjects is used as a gene set; that gene set is tested for enrichment
    against the second association set and each result row is printed.

    :param ont: ontology used to look up a label for each node
    :param aset: primary association set; its ontology nodes define the gene sets
    :param args: parsed CLI arguments; ``resource2`` (second ontology handle)
                 and ``file2`` (second association file) are read
    :return: None; results are written to stdout
    """
    ofactory = OntologyFactory()
    ont2 = ofactory.create(args.resource2)

    afactory = AssociationSetFactory()
    aset2 = afactory.create(ontology=ont2, file=args.file2)

    # only test for genes (or other subjects of statements) in common
    # NOTE(review): `common` is only used for the sanity check below; the
    # enrichment itself still runs on all subjects of each set.
    common = set(aset.subjects).intersection(aset2.subjects)
    num_common = len(common)
    # The backslash is escaped explicitly: "\{" is an invalid escape sequence.
    # The emitted text ("A/\B = N", "/\" being ASCII for intersection) is unchanged.
    logging.info("Genes in common between two KBs: {}/\\{} = {}".format(
        len(aset.subjects), len(aset2.subjects), num_common))
    if num_common < 2:
        logging.error("TOO FEW")
        return None

    for n in aset.ontology.nodes():
        nl = ont.label(n, id_if_null=True)
        genes = aset.query([n])
        num_genes = len(genes)
        if num_genes > 2:
            logging.info("BASE: {} {} num={}".format(n, nl, num_genes))
            enr = aset2.enrichment_test(subjects=genes,
                                        background=aset2.subjects,
                                        labels=True)
            for r in enr:
                print("{:8.3g} {} {:20s} <-> {} {:20s}".format(
                    r['p'], n, nl, r['c'], str(r['n'])))
def get(self):
    """
    Summary statistics for objects associated

    Runs an enrichment test for the requested subjects over gene
    associations of the requested object category and taxon.

    Request arguments read: ``object_category``, ``ontology``, ``taxon``,
    ``max_p_value``, ``subject`` and ``background``.

    :return: dict with a single key ``results`` holding the enrichment rows
    """
    args = parser.parse_args()

    ocat = args.get('object_category')
    ontid = args.get('ontology')
    if ontid is None:
        # No explicit ontology: derive one from the object category.
        if ocat == 'function':
            ontid = 'go'
        if ocat == 'phenotype':
            # TODO: other phenotype ontologies
            ontid = 'hp'

    print("Loading: {}".format(ontid))
    ont = get_ontology(ontid)
    taxid = args.get('taxon')
    max_p_value = float(args.max_p_value)

    subjects = args.get('subject')
    background = args.get('background')
    afactory = AssociationSetFactory()
    aset = afactory.create(ontology=ont,
                           subject_category='gene',
                           object_category=ocat,
                           taxon=taxid)
    # `background` used to be parsed and then silently dropped; forward it so
    # a caller-supplied background set takes effect (None keeps the default).
    enr = aset.enrichment_test(subjects=subjects,
                               background=background,
                               threshold=max_p_value,
                               labels=True)
    return {'results': enr}
def test_semsearch():
    """
    Build a SemSearchEngine over the POMBASE associations whose subject
    label is in GENES, then compute cosine and Resnik best-match scores
    for every ordered pair of subjects.
    """
    factory = AssociationSetFactory()
    full_ontology = OntologyFactory().create(ONT)
    gaf_parser = GafParser()
    selected = [
        assoc
        for assoc in gaf_parser.parse(POMBASE, skipheader=True)
        if assoc['subject']['label'] in GENES
    ]
    aset = factory.create_from_assocs(selected, ontology=full_ontology)

    # Replace the full ontology with the sub-ontology the association set returns
    sub_ontology = aset.subontology()
    aset.ontology = sub_ontology
    logging.info('Genes={} Terms={}'.format(len(aset.subjects),
                                            len(sub_ontology.nodes())))

    stats = aset.as_dataframe().describe()
    print('STATS={}'.format(stats))

    engine = SemSearchEngine(assocmodel=aset)

    logging.info('Calculating all MICAs')
    engine.calculate_all_micas()

    logging.info('Doing pairwise')
    for left in aset.subjects:
        for right in aset.subjects:
            cosine = engine.pw_score_cosine(left, right)
            if left == right:
                # a subject compared with itself must score (almost) exactly 1
                assert cosine > 0.9999
            best_matches = engine.pw_score_resnik_bestmatches(left, right)
            print('{} x {} = {} // {}'.format(left, right, cosine, best_matches))
def test_factory():
    """
    Load POMBASE with full (non-skim) associations, print every subject,
    annotation and evidence record, and check that the two expected
    evidence records for PomBase:SPBC2D10.10c / GO:0005730 are present.
    """
    wanted_subject = 'PomBase:SPBC2D10.10c'
    wanted_term = 'GO:0005730'

    factory = AssociationSetFactory()
    ontology = OntologyFactory().create(ONT)
    aset = factory.create_from_file(POMBASE, ontology=ontology, skim=False)

    hits = 0
    for subj in aset.subjects:
        print('{} {}'.format(subj, aset.label(subj)))
        for term in aset.annotations(subj):
            print(' {} {}'.format(term, ontology.label(term)))
            for assoc in aset.associations(subj, term):
                ev = assoc['evidence']
                print(' {} {} {}'.format(ev['type'],
                                         ev['with_support_from'],
                                         ev['has_supporting_reference']))
                if subj != wanted_subject or term != wanted_term:
                    continue

                iso_match = (
                    ev['type'] == 'ISO'
                    and ev['with_support_from'] == ['SGD:S000002172']
                    and ev['has_supporting_reference'] == ['GO_REF:0000024']
                )
                if iso_match:
                    hits += 1
                    logging.info('** FOUND: {}'.format(assoc))

                ida_match = (
                    ev['type'] == 'IDA'
                    and ev['has_supporting_reference'] == ['PMID:16823372']
                )
                if ida_match:
                    hits += 1
                    logging.info('** FOUND: {}'.format(assoc))

    assert len(aset.associations_by_subj) > 0
    assert hits == 2
def __init__(self, ont: str, taxon: str) -> None:
    """
    Store the ontology name and taxon, reset the cached state and
    immediately load the associations.

    :param ont: ontology name, stored as ``self.ont``
    :param taxon: taxon name, stored as ``self.taxon``
    """
    # caller-supplied configuration
    self.ont, self.taxon = ont, taxon

    # state that load_associations() is expected to fill in
    self.associations = None
    self.ontology = ''
    self.assocs = ''

    self.afactory = AssociationSetFactory()
    self.load_associations()
def test_create_from_file_no_fmt():
    """
    Test loading from gaf while setting fmt to None
    """
    go_ontology = OntologyFactory().create('go')
    factory = AssociationSetFactory()
    aset = factory.create(ontology=go_ontology, fmt=None, file=POMBASE)

    print("SUBJS: {}".format(aset.subjects))
    subject_count = len(aset.subjects)
    assert subject_count > 100
def load_associations(self, group):
    """
    Download and parse the GAF file for ``group`` and store the resulting
    association set on ``self.associations``.

    :param group: annotation group name; ``'human'`` uses the goa_human
                  file, any other value is substituted into the generic
                  gene_association URL
    """
    gaf_parser = GafParser()
    factory = AssociationSetFactory()

    if group == 'human':
        source = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    else:
        source = "http://geneontology.org/gene-associations/gene_association.{}.gz".format(
            group)

    # drop records that carry a 'header' key
    records = [rec for rec in gaf_parser.parse(source) if 'header' not in rec.keys()]
    self.associations = factory.create_from_assocs(records,
                                                   ontology=self.ontology)
def test_gaf():
    """
    Test loading from gaf

    Builds an association set from the POMBASE file, queries the genes
    annotated to INTRACELLULAR and checks that G1 is among them.
    """
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont = ofactory.create('go')

    # Use a context manager so the file handle is closed even if parsing
    # fails (the handle was previously opened inline and never closed).
    with open(POMBASE, "r") as gaf_file:
        aset = afactory.create_from_gaf(gaf_file, ontology=ont)

    print(str(aset))
    genes = aset.query([INTRACELLULAR])
    for g in genes:
        print("G={} '{}'".format(g, aset.label(g)))
    assert G1 in genes
def test_remote_disease():
    """
    factory test
    """
    ontology_factory = OntologyFactory()
    assoc_factory = AssociationSetFactory()
    doid = ontology_factory.create('doid')

    create_kwargs = dict(
        ontology=doid,
        subject_category='disease',
        object_category='phenotype',
        taxon=HUMAN,
    )
    aset = assoc_factory.create(**create_kwargs)

    found = aset.query_associations([PD])
    print("Gene Assocs to PD: {} {}".format(found, len(found)))
def test_learn():
    """
    Fit an OntologyLearner on the GAF associations after splitting them
    on CC over the subClassOf sub-ontology; the report is written to
    target/index.md.
    """
    factory = AssociationSetFactory()
    ontology = OntologyFactory().create(ONT)
    aset = factory.create_from_file(file=GAF, ontology=ontology)

    learner = ol.OntologyLearner(assocs=aset)
    print('L={}'.format(learner))

    is_a_only = ontology.subontology(relations=['subClassOf'])
    learner.split_assocs(CC, ontology=is_a_only)
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))

    with open('target/index.md', 'w') as report:
        learner.fit_all(reportfile=report)
    print('L.targets={}'.format(learner.targets))
def test_learn():
    """
    Fit an OntologyLearner that predicts the TGAF associations from the
    GAF associations (score threshold 0.6); the report is written to
    target/pheno_index.md.
    """
    source_ontology = OntologyFactory().create(ONT)
    target_ontology = OntologyFactory().create(TONT)

    factory = AssociationSetFactory()
    source_assocs = factory.create_from_file(file=GAF, ontology=source_ontology)
    target_assocs = factory.create_from_file(file=TGAF, ontology=target_ontology)

    learner = ol.OntologyLearner(assocs=source_assocs,
                                 target_assocs=target_assocs,
                                 score_threshold=0.6)
    print('L={}'.format(learner))
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))

    with open('target/pheno_index.md', 'w') as report:
        learner.fit_all(reportfile=report)
    print('L.targets={}'.format(learner.targets))
def load_associations(self,
                      ontology_name:str=None,
                      subject_category:str=None,
                      object_category:str=None,
                      evidence=None,
                      taxon:str=None,
                      relation=None,
                      file:Union[str, TextIO]=None,
                      fmt:str=None,
                      skim:bool=False) -> None:
    """
    Create the ontology named ``ontology_name`` and load an association
    set for it into ``self.associations``.

    Every parameter other than ``ontology_name`` is forwarded unchanged to
    ``AssociationSetFactory.create``; ``subject_category`` is additionally
    passed as the second positional argument of ``OntologyFactory.create``.
    """
    ontology_factory = OntologyFactory()
    assoc_factory = AssociationSetFactory()
    ontology = ontology_factory.create(ontology_name, subject_category)

    # collect the pass-through options once, then hand them over in one call
    create_options = dict(
        subject_category=subject_category,
        object_category=object_category,
        evidence=evidence,
        taxon=taxon,
        relation=relation,
        file=file,
        fmt=fmt,
        skim=skim,
    )
    self.associations = assoc_factory.create(ontology=ontology, **create_options)
def test_remote_go():
    """
    factory test
    """
    ontology_factory = OntologyFactory()
    assoc_factory = AssociationSetFactory()
    go_subset = ontology_factory.create('go').subontology(
        relations=['subClassOf', PART_OF])
    aset = assoc_factory.create(ontology=go_subset,
                                subject_category='gene',
                                object_category='function',
                                taxon=MOUSE)

    # genes annotated to the transcription-factor class
    tf_genes = aset.query([TRANSCRIPTION_FACTOR], [])
    print("Mouse genes annotated to TF: {} {}".format(tf_genes, len(tf_genes)))
    for gene in tf_genes:
        print(" Gene: {} {}".format(gene, aset.label(gene)))

    # genes annotated to nucleus
    nucleus_genes = aset.query([NUCLEUS], [])
    print("Mouse genes annotated to nucleus: {} {}".format(
        nucleus_genes, len(nucleus_genes)))
    assert len(nucleus_genes) > 100

    # positive query on both classes
    nucleus_tf_genes = aset.query([TRANSCRIPTION_FACTOR, NUCLEUS], [])
    print("Mouse TF genes annotated to nucleus: {} {}".format(
        nucleus_tf_genes, len(nucleus_tf_genes)))
    assert len(nucleus_tf_genes) > 100
    assert len(nucleus_tf_genes) < len(nucleus_genes)

    # nucleus as positive class, TF as negative class
    nucleus_non_tf_genes = aset.query([NUCLEUS], [TRANSCRIPTION_FACTOR])
    print("Mouse non-TF genes annotated to nucleus: {} {}".format(
        nucleus_non_tf_genes, len(nucleus_non_tf_genes)))
    assert len(nucleus_non_tf_genes) > 100
    assert len(nucleus_non_tf_genes) < len(nucleus_genes)

    # the two partitions must add up to the whole nucleus set
    partition_total = len(nucleus_tf_genes) + len(nucleus_non_tf_genes)
    assert partition_total == len(nucleus_genes)

    enrichment = aset.enrichment_test(subjects=tf_genes, labels=True)
    print("ENRICHMENT (tf): {}".format(enrichment))
    # exactly one enrichment row is expected for NUCLEUS
    [nucleus_row] = [row for row in enrichment if row['c'] == NUCLEUS]
    print("ENRICHMENT (tf) for NUCLEUS: {}".format(nucleus_row))
    assert nucleus_row['p'] < 0.00001
def test_learn_from_phenotype():
    """
    Learn GO from Phenotypes

    (note: some phenotypes in FYPO have graph paths to GO classes, so GO
    will be used to predict GO, which may seem circular, but in fact the
    phenotype is different information)
    """
    source_ontology = OntologyFactory().create(ONT)
    target_ontology = OntologyFactory().create(TONT)

    factory = AssociationSetFactory()
    source_assocs = factory.create_from_file(file=GAF, ontology=source_ontology)
    target_assocs = factory.create_from_file(file=TGAF, ontology=target_ontology)

    learner = ol.OntologyLearner(assocs=source_assocs,
                                 target_assocs=target_assocs,
                                 score_threshold=0.9)
    print('L={}'.format(learner))
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))

    # output directory; the report index is written inside it
    out_dir = 'target/from_phenotype'
    with open(out_dir + '/index.md', 'w') as report:
        learner.fit_all(dir=out_dir, reportfile=report)
    print('L.targets={}'.format(learner.targets))
log = logging.getLogger(__name__)

from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory

if __name__ == '__main__':
    # Sample script: fetch HPO, load human gene-phenotype associations and
    # print the 20 first enrichment results for `gene_ids`.
    parser = argparse.ArgumentParser(
        description='Sample script to open phenotypes')
    # Arguments must be registered BEFORE parse_args(); the previous order
    # (parse first, add afterwards) meant 'input' was never recognised.
    # nargs='?' keeps invocations without the positional working as before.
    parser.add_argument('input', nargs='?', help='Input')
    args = parser.parse_args()

    ## Create an ontology factory in order to fetch HPO
    ofactory = OntologyFactory()
    ont = ofactory.create("hp")

    ## Create an association factory to get gene-phenotype associations
    afactory = AssociationSetFactory()

    ## Load Associations from Monarch. Note the first time this runs Jupyter will show '*' - be patient
    aset = afactory.create(ontology=ont,
                           subject_category='gene',
                           object_category='phenotype',
                           taxon='NCBITaxon:9606')

    ## Run enrichment tests using all classes in ontology
    # NOTE(review): `gene_ids` is not defined in the visible code and
    # `args.input` is never read - confirm where the gene list comes from.
    enr = aset.enrichment_test(subjects=gene_ids, threshold=0.00005, labels=True)
    for r in enr[:20]:
        print("{:8.3g} {} {:40s}".format(r['p'], r['c'], str(r['n'])))
class GenericSimilarity(object):
    """
    Jaccard similarity search over the inferred annotations of an
    association set.

    NOTE(review): ``load_associations`` reads ``self.ont``, which
    ``__init__`` does not set - it has to be assigned before loading.
    """

    def __init__(self) -> None:
        self.associations = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon):
        """
        Create the ontology named by ``self.ont`` and load the matching
        association set into ``self.associations``.

        :param taxon: ``'human'`` or ``'mouse'``
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            # Restrict to descendants of GO:0008150 and GO:0003674
            go_roots = set(self.ontology.descendants('GO:0008150') +
                           self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            self.assocs = assocs  # unfiltered parse result
            # drop header records, then anything outside the selected terms
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[taxon])

    @staticmethod
    def jaccard_similarity(aset: 'AssociationSet', s1: str,
                           s2: str) -> 'Tuple[float, set]':
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :return: ``(score, shared_terms)``; ``(0.0, set())`` when neither
                 subject has any inferred type
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, set()

        shared_terms = a1.intersection(a2)
        return len(shared_terms) / num_union, shared_terms

    def compute_jaccard(self,
                        input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """
        Score every input gene against every subject of the loaded
        association set and keep the pairs scoring above ``lower_bound``.

        :param input_genes: dicts with keys ``sim_input_curie`` and ``input_symbol``
        :param lower_bound: exclusive minimum Jaccard score
        :return: one dict per hit (ids, symbols, score, shared terms)
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map.keys():
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie)
                # Compare by value: `is not` tests object identity and is
                # true for two equal but distinct strings, so a gene was
                # still being compared with itself.
                if input_gene != subject_curie:
                    score, shared_terms = GenericSimilarity.jaccard_similarity(
                        self.associations, input_gene, subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """
        Strip the first four characters of a doubled ``MGI:MGI:`` input id
        when the subject id does not use the doubled form; otherwise
        return ``input_gene`` unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene

    @staticmethod
    def sort_results(input_gene_set, results):
        """
        Turn ``results`` into a DataFrame, drop hits whose ``hit_id``
        appears in ``input_gene_set['hit_id']`` and sort by descending score.

        An empty result list yields an empty DataFrame (it has no
        ``hit_id`` column to filter on).
        """
        results = pd.DataFrame(results)
        if results.empty:
            return results
        annotated_gene_set = input_gene_set['hit_id'].tolist()
        results = \
            results[~results['hit_id'].isin(annotated_gene_set)]. \
            sort_values('score', ascending=False)
        return results
def main():
    """
    Wrapper for OGR Assocs

    Parses the command line, builds the ontology and the association set
    (from a file, or remotely from taxon + category) and dispatches to the
    function registered for the selected sub-command.
    """

    def _str2bool(value):
        # argparse hands over the raw string and bool("False") is True, so
        # `type=bool` could never switch a flag off. Anything that is not an
        # explicit "false" spelling stays True, as it was before.
        return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')

    parser = argparse.ArgumentParser(
        description='Wrapper for obographs assocmodel library'
        """

        By default, ontologies and assocs are cached locally and synced from a remote sparql endpoint
        """,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-r', '--resource', type=str, required=False,
                        help='Name of ontology')
    parser.add_argument('-f', '--assocfile', type=str, required=False,
                        help='Name of input file for associations')
    parser.add_argument('--assocformat', type=str, default='gaf', required=False,
                        help='Format of association file, if passed (default: gaf)')
    parser.add_argument('-o', '--outfile', type=str, required=False,
                        help='Path to output file')
    parser.add_argument('-t', '--to', type=str, required=False,
                        help='Output to (tree, dot, ...)')
    parser.add_argument('-d', '--direction', type=str, default='u', required=False,
                        help='u = up, d = down, ud = up and down')
    parser.add_argument('-e', '--evidence', type=str, required=False,
                        help='ECO class')
    parser.add_argument('-p', '--properties', nargs='*', type=str, required=False,
                        help='Properties')
    parser.add_argument('-P', '--plot', type=_str2bool, default=False,
                        help='if set, plot output (requires plotly)')
    parser.add_argument('-y', '--yamlconfig', type=str, required=False,
                        help='Path to setup/configuration yaml file')
    parser.add_argument('-S', '--slim', type=str, default='', required=False,
                        help='Slim type. m=minimal')
    parser.add_argument('-c', '--container_properties', nargs='*', type=str, required=False,
                        help='Properties to nest in graph')
    parser.add_argument('-C', '--category', nargs=2, type=str, required=False,
                        help='category tuple (SUBJECT OBJECT)')
    parser.add_argument('-T', '--taxon', type=str, required=False,
                        help='Taxon of associations')
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')

    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')

    # EXTRACT ONTOLOGY
    parser_n = subparsers.add_parser(
        'subontology',
        help='Extract sub-ontology, include only annotated nodes or their descendants')
    parser_n.add_argument('-M', '--minimal', dest='minimal', action='store_true',
                          default=False, help='If set, remove non-MRCA nodes')
    parser_n.set_defaults(function=extract_ontology)

    # ENRICHMENT
    parser_n = subparsers.add_parser(
        'enrichment',
        help='Perform an enrichment test over a sample set of annotated entities')
    parser_n.add_argument('-q', '--query', type=str,
                          help='query all genes for this class an use as subject')
    parser_n.add_argument('-H', '--hypotheses', nargs='*',
                          help='list of classes to test against')
    parser_n.add_argument('-s', '--sample_file', type=str,
                          help='file containing list of gene IDs in sample set')
    parser_n.add_argument('-b', '--background_file', type=str,
                          help='file containing list of gene IDs in background set')
    parser_n.add_argument('-t', '--threshold', type=float,
                          help='p-value threshold')
    parser_n.add_argument('sample_ids', nargs='*',
                          help='list of gene IDs in sample set')
    parser_n.set_defaults(function=run_enrichment_test)

    # PHENOLOG
    parser_n = subparsers.add_parser(
        'phenolog',
        help='Perform multiple enrichment tests, using a second ontology and assoc set to build gene sets')
    # The two help texts were swapped: run_phenolog feeds resource2 to the
    # ontology factory and file2 to the association factory.
    parser_n.add_argument('-R', '--resource2', type=str, required=True,
                          help='handle for second ontology')
    parser_n.add_argument('-F', '--file2', type=str, required=True,
                          help='path to second GAF')
    parser_n.set_defaults(function=run_phenolog)

    # QUERY
    parser_n = subparsers.add_parser(
        'query',
        help='Query for entities (e.g. genes) based on positive and negative terms')
    parser_n.add_argument('-q', '--query', nargs='*', help='positive classes')
    parser_n.add_argument('-N', '--negative', type=str, help='negative classes')
    parser_n.set_defaults(function=run_query)

    # QUERY ASSOCIATIONS
    parser_n = subparsers.add_parser(
        'associations',
        help='Query for associations for a set of entities (e.g. genes)')
    parser_n.add_argument('subjects', nargs='*', help='subject ids')
    parser_n.add_argument('-D', '--dendrogram', type=_str2bool, default=False)
    parser_n.set_defaults(function=run_query_associations)

    # INTERSECTIONS
    parser_n = subparsers.add_parser('intersections', help='Query intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids', type=_str2bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_intersections)

    # INTERSECTION DENDROGRAM (TODO: merge into previous?)
    parser_n = subparsers.add_parser('intersection-dendrogram',
                                     help='Plot dendrogram from intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids', type=_str2bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_term_intersection_dendrogram)

    # SIMILARITY MATRIX (may move to another module)
    parser_n = subparsers.add_parser(
        'simmatrix',
        help='Plot dendrogram for similarities between subjects')
    parser_n.add_argument('-X', '--xsubjects', nargs='*', help='x subjects')
    parser_n.add_argument('-Y', '--ysubjects', nargs='*', help='y subjects')
    parser_n.add_argument('--useids', type=_str2bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('subjects', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_simmatrix)

    args = parser.parse_args()

    # Sub-commands are optional for argparse; without one there is no
    # `function` default. Fail here with a usage message instead of an
    # AttributeError after the ontology and associations have been loaded.
    if getattr(args, 'function', None) is None:
        parser.error("a sub-command is required")

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)

    if not args.assocfile:
        if not args.taxon or not args.category:
            raise ValueError(
                "Must specify EITHER assocfile OR both taxon and category")

    logging.info("Welcome!")

    if args.yamlconfig is not None:
        logging.info("Setting config from: {}".format(args.yamlconfig))
        # note this sets a global:
        # we would not do this outside the context of a standalone script
        from ontobio.config import set_config
        set_config(args.yamlconfig)

    handle = args.resource

    # Ontology Factory
    ofactory = OntologyFactory()
    logging.info("Creating ont object from: {} {}".format(handle, ofactory))
    ont = ofactory.create(handle)
    logging.info("ont: {}".format(ont))

    evidence = args.evidence
    if evidence is not None and evidence.lower() == 'noiea':
        evidence = "-ECO:0000501"
    # NOTE(review): `evidence` is computed but not passed to either factory
    # call below - confirm whether it should be.

    # Association Factory
    afactory = AssociationSetFactory()
    aset = None
    if args.assocfile is not None:
        aset = afactory.create_from_file(file=args.assocfile,
                                         fmt=args.assocformat,
                                         ontology=ont)
    else:
        [subject_category, object_category] = args.category
        # create using GO/Monarch services
        aset = afactory.create(ontology=ont,
                               subject_category=subject_category,
                               object_category=object_category,
                               taxon=args.taxon)

    func = args.function
    func(ont, aset, args)
import pickle
import pandas
import numpy as np
from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory

# Taxon CURIE passed to the association factory below
HUMAN = 'NCBITaxon:9606'

#ontology paths
##''/Users/marcin/Documents/VIMSS/ontology/NCATS/HPO/hp.obo')#mondo#hp
ofactory = OntologyFactory()
afactory = AssociationSetFactory()

print("creating...")

# Ontology handle 'hp'; the 'mondo' alternative is kept commented out
ont = ofactory.create('hp')
#ont = ofactory.create('mondo')

# Disease-to-phenotype associations for the HUMAN taxon
aset = afactory.create(ontology=ont, subject_category='disease', object_category='phenotype', taxon=HUMAN)
###aset = afactory.create_from_gaf('my.gaf', ontology=ont)

# Sample identifiers; only phenotype_ids[1] is used in the visible code
disease_ids = ["DECIPHER:1", "DECIPHER:16", "OMIM:614696", "OMIM:614699", "Orphanet:99978"]
phenotype_ids = ["HP:0000007", "Orphanet:93299", "Orphanet:90794"]
print("annotations\t"+phenotype_ids[1])
def test_factory():
    """
    Load ANNFILE in 'hpoa' format and check that more than 40 subjects
    come back.
    """
    ontology = OntologyFactory().create(ONT)
    factory = AssociationSetFactory()
    aset = factory.create(ontology=ontology, fmt='hpoa', file=ANNFILE)

    print("SUBJS: {}".format(aset.subjects))
    subject_count = len(aset.subjects)
    assert subject_count > 40
def main():
    """
    Phenologs

    For every class resolved from the positional ``ids`` in ontology 1, the
    genes annotated to that class (phenotype associations) are tested for
    enrichment against the function associations of ontology 2, optionally
    restricted to a background class. Results are printed.
    """
    parser = argparse.ArgumentParser(
        description='Phenologs'
        """

        By default, ontologies are cached locally and synced from a remote sparql endpoint
        """,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-r', '--resource1', type=str, required=False,
                        help='Name of ontology1')
    parser.add_argument('-R', '--resource2', type=str, required=False,
                        help='Name of ontology2')
    parser.add_argument('-T', '--taxon', type=str, default='NCBITaxon:10090', required=False,
                        help='NCBITaxon ID')
    parser.add_argument('-s', '--search', type=str, default='', required=False,
                        help='Search type. p=partial, r=regex')
    parser.add_argument('-b', '--background', type=str, default=None, required=False,
                        help='Class to use for background')
    parser.add_argument('-p', '--pthreshold', type=float, default=0.05, required=False,
                        help='P-value threshold')
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')

    parser.add_argument('ids', nargs='*')

    args = parser.parse_args()

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    if args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    logging.info("Welcome!")

    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont1 = ofactory.create(args.resource1)
    ont2 = ofactory.create(args.resource2)
    logging.info("onts: {} {}".format(ont1, ont2))
    searchp = args.search

    category = 'gene'

    aset1 = afactory.create(ontology=ont1,
                            subject_category=category,
                            object_category='phenotype',
                            taxon=args.taxon)
    aset2 = afactory.create(ontology=ont2,
                            subject_category=category,
                            object_category='function',
                            taxon=args.taxon)

    bg_cls = None
    if args.background is not None:
        bg_ids = resolve(ont1, [args.background], searchp)
        if len(bg_ids) == 0:
            logging.error("Cannnot resolve: '{}' using {} in {}".format(
                args.background, searchp, ont1))
            sys.exit(1)
        elif len(bg_ids) > 1:
            logging.error("Multiple matches: '{}' using {} MATCHES={}".format(
                args.background, searchp, bg_ids))
            sys.exit(1)
        else:
            # Unpack first, then log: the message used to be emitted before
            # the assignment and therefore always reported None.
            [bg_cls] = bg_ids
            logging.info("Background: {}".format(bg_cls))

    for cls_id in resolve(ont1, args.ids, searchp):
        sample = aset1.query([cls_id], [])
        print("Gene set class:{} Gene set: {}".format(cls_id, sample))
        bg = None
        if bg_cls is not None:
            bg = aset1.query([bg_cls], [])
            print("BACKGROUND SUBJECTS: {}".format(bg))

        rs = aset2.enrichment_test(sample, bg, threshold=args.pthreshold, labels=True)
        print("RESULTS: {} < {}".format(len(rs), args.pthreshold))
        for r in rs:
            print(str(r))
# Write the sub-graph of the Wikidata ontology to an OBO file
renderer = GraphRenderer.create('obo')
renderer.outfile = './output/wd-ontology.obo'
# renderer.write(wd_ontology)
# >> AttributeError: 'EagerWikidataOntology' object has no attribute 'all_logical_definitions'
renderer.write_subgraph(wd_ontology, nodes, query_ids=qids)

# Get GO terms
# NOTE(review): `outfile` is never closed in the visible code - confirm it is
# closed later, or switch to a `with` block.
outfile = open('./output/go-terms.tsv', 'w')

[ptsd] = wd_ontology.search('Sickle Cell Anemia')
# NOTE(review): this first result is overwritten inside the loop below before it is read
proteins = wd.canned_query('disease2protein', ptsd)

go = onto_factory.create('go')

afactory = AssociationSetFactory()
aset = afactory.create(ontology=go, subject_category='gene', object_category='function', taxon='NCBITaxon:9606')

# For every node: collect the annotations of its proteins and write one
# "<annotation>\t<GO label>" row per annotation
for n in wd_ontology.nodes():
    proteins = wd.canned_query('disease2protein', n)
    anns = [a for p in proteins for a in aset.annotations(p)]
    if len(anns) > 0:
        print("{} {}".format(n, wd_ontology.label(n)))
        for a in anns:
            outfile.write("{}\t{}\n".format(a, go.label(a)))

# Endpoints
SCIGRAPH_ONTOLOGY = 'https://scigraph-ontology-dev.monarchinitiative.org/scigraph/'
class GenericSimilarity(object):
    """
    Jaccard similarity search over the subjects of an association set
    loaded with ``retrieve_associations``.
    """

    def __init__(self) -> None:
        self.associations = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def retrieve_associations(self, ont, group):
        """
        Create ontology ``ont`` and load the matching association set into
        ``self.associations``.

        :param ont: ontology handle; ``'go'`` takes the GAF branch, anything
                    else loads gene-phenotype associations
        :param group: ``'human'`` or ``'mouse'``
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(ont)
        p = GafParser()
        url = ''
        if ont == 'go':
            # Restrict to descendants of GO:0008150 and GO:0003674
            go_roots = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if group == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if group == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            # NOTE(review): a local human GAF is parsed regardless of `group`
            # and `url` is ignored - looks like a debugging leftover; confirm
            # before restoring `p.parse(url)`.
            assocs = p.parse('goa_human.gaf.gz')
            #assocs = p.parse(url)
            self.assocs = assocs  # unfiltered parse result
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[group])

    def compute_jaccard(self,
                        input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """
        Score every input gene against every subject of the loaded
        association set and keep the pairs scoring above ``lower_bound``.

        :param input_genes: dicts with keys ``sim_input_curie`` and ``input_symbol``
        :param lower_bound: exclusive minimum Jaccard score
        :return: one dict per hit (ids, symbols, score)
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map.keys():
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie)
                # Compare by value: `is not` tests object identity and is
                # true for two equal but distinct strings, so a gene was
                # still being compared with itself.
                if input_gene != subject_curie:
                    score = jaccard_similarity(self.associations, input_gene,
                                               subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """
        Strip the first four characters of a doubled ``MGI:MGI:`` input id
        when the subject id does not use the doubled form; otherwise
        return ``input_gene`` unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene
class GenericSimilarity(object):
    """
    Jaccard similarity search over the inferred annotations of an
    association set. ``self.ont`` has to be set to the ontology name
    before ``load_associations`` is called.
    """

    def __init__(self) -> None:
        self.associations = None
        self.ont = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon) -> None:
        """
        Create the ontology named by ``self.ont`` and load the matching
        association set into ``self.associations``.

        :param taxon: ``'human'`` or ``'mouse'``
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
            # CX: These are 2 out of 3 top-level terms in GO ontology.
            # CX: The excluded term is cellular_component (where gene carries out a molecular function)
            go_roots = set(self.ontology.descendants('GO:0008150') +
                           self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            self.assocs = assocs  # unfiltered parse result
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[taxon])

    @staticmethod
    def jaccard_similarity(aset: 'AssociationSet', s1: str,
                           s2: str) -> Tuple[float, list]:
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :return: ``(score, shared_terms)``; ``(0.0, [])`` when neither
                 subject has any inferred type
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, list()

        shared_terms = a1.intersection(a2)

        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    def compute_jaccard(self,
                        input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """
        Score every input gene against every subject of the loaded
        association set and keep the pairs scoring above ``lower_bound``.

        :param input_genes: dicts with keys ``sim_input_curie`` and ``input_symbol``
        :param lower_bound: exclusive minimum Jaccard score
        :return: one dict per hit (ids, symbols, score, shared terms and
                 their labels)
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map.keys():
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie)
                # Compare by value: `is not` tests object identity and is
                # true for two equal but distinct strings, so a gene was
                # still being compared with itself.
                if input_gene != subject_curie:
                    score, shared_terms = GenericSimilarity.jaccard_similarity(
                        self.associations, input_gene, subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        # CX: addition of human-readable labels aka "shared_term_names"
                        shared_term_names = [
                            self.associations.label(x) for x in shared_terms
                        ]
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                            'shared_term_names': shared_term_names
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        """
        Strip the first four characters of a doubled ``MGI:MGI:`` input id
        when the subject id does not use the doubled form; otherwise
        return ``input_gene`` unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene

    @staticmethod
    def sort_results(results) -> pd.DataFrame:
        """
        Turn ``results`` into a DataFrame; when it is not empty, drop rows
        whose hit equals the input gene and sort by descending score.
        """
        results = pd.DataFrame(results)
        if not results.empty:
            # CX: Some users need to know the scores that input genes have for each other.
            #     replacing code to remove GeneA input = GeneA output results
            results = \
                results[~(results.hit_id == results.input_id)]. \
                sort_values('score', ascending=False)
        return results
def test_remote_go_pombase():
    """
    Load POMBASE in 'gaf' format against the 'go' ontology and check that
    more than 100 subjects come back.
    """
    go_ontology = OntologyFactory().create('go')
    factory = AssociationSetFactory()
    aset = factory.create(ontology=go_ontology, fmt='gaf', file=POMBASE)

    print("SUBJS: {}".format(aset.subjects))
    subject_count = len(aset.subjects)
    assert subject_count > 100
def __init__(self) -> None:
    """
    Start with empty-string placeholders for the loaded state and create
    the association set factory.
    """
    # placeholders for associations / ontology / raw assocs
    self.associations = self.ontology = self.assocs = ''
    self.afactory = AssociationSetFactory()
class GenericSimilarity(object):
    """
    Jaccard similarity engine, cached per (ontology, taxon) pair, with an
    id-based registry of similarity computations.
    """

    # Class level singletons for similarity engines
    _ontology = {}

    # Class level cache for results of Jaccard similarity searches
    # NOTE(review): entries are never removed in the visible code.
    _jaccard_similarity_tasks = {}

    @classmethod
    def get_similarity_engine(cls, ontology, taxon):
        """
        Returns a singleton GenericSimilarity instance for use in Jaccard similarity computations

        :param ontology: should be 'go', 'hp' or 'mp'
        :param taxon: should be 'human' or 'mouse'
        :return: GenericSimilarity() singleton
        :raises OntologyServerException: for an unrecognized ontology or taxon
        """
        # format() instead of "+" so that a non-string argument (e.g. None)
        # still produces the intended exception rather than a TypeError.
        if ontology not in ['go', 'hp', 'mp']:
            raise OntologyServerException(
                "compute_jaccard() ERROR: ontology '{}' not recognized.".format(ontology))

        if taxon not in ['human', 'mouse']:
            raise OntologyServerException(
                "compute_jaccard() ERROR: taxon '{}' not recognized.".format(taxon))

        engines = cls._ontology.setdefault(ontology, {})
        if taxon not in engines:
            engines[taxon] = GenericSimilarity(ontology, taxon)
        return engines[taxon]

    def __init__(self, ont: str, taxon: str) -> None:
        """
        Store the ontology and taxon names and load the associations.
        """
        self.associations = None
        self.ont = ont
        self.taxon = taxon
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()
        self.load_associations()

    def load_associations(self) -> None:
        """
        Create the ontology named by ``self.ont`` and load the association
        set for ``self.taxon`` into ``self.associations``.
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
            # CX: These are 2 out of 3 top-level terms in GO ontology.
            # CX: The excluded term is cellular_component (where gene carries out a molecular function)
            go_roots = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if self.taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if self.taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            self.assocs = assocs  # unfiltered parse result
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[self.taxon])

    @staticmethod
    def jaccard_similarity(aset: 'AssociationSet', s1: str,
                           s2: str) -> Tuple[float, list]:
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :return: ``(score, shared_terms)``; ``(0.0, [])`` when neither
                 subject has any inferred type
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, list()

        shared_terms = a1.intersection(a2)

        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    async def compute_jaccard(self,
                              input_genes: List[dict],
                              lower_bound: float = 0.7) -> List[dict]:
        """
        Score every input gene against every subject of the loaded
        association set and keep the pairs scoring above ``lower_bound``.

        :param input_genes: items exposing ``sim_input_curie`` and
                            ``input_symbol`` as attributes
        :param lower_bound: exclusive minimum Jaccard score
        :return: one dict per hit (ids, symbols, score, shared terms and
                 their labels)
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map.keys():
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene.sim_input_curie,
                    subject_curie=subject_curie)
                # Compare by value: `is not` tests object identity and is
                # true for two equal but distinct strings, so a gene was
                # still being compared with itself.
                if input_gene != subject_curie:
                    score, shared_terms = GenericSimilarity.jaccard_similarity(
                        self.associations, input_gene, subject_curie)
                    if score > lower_bound:
                        subject_label = self.associations.label(subject_curie)
                        # CX: addition of human-readable labels aka "shared_term_names"
                        shared_term_names = [
                            self.associations.label(x) for x in shared_terms
                        ]
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene.input_symbol,
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                            'shared_term_names': shared_term_names
                        })
        return similarities

    async def compute_jaccard_task(self, uuid: str, input_genes: List[dict],
                                   lower_bound: float):
        """
        Wrap ``compute_jaccard`` in an asyncio task and register it under
        ``uuid`` in the class-level task cache.
        """
        self._jaccard_similarity_tasks[uuid] = asyncio.create_task(
            self.compute_jaccard(input_genes, lower_bound))

    def compute_jaccard_async(self, input_genes: List[dict],
                              lower_bound: float):
        """
        Register a Jaccard computation under a fresh UUID and return that
        id; the result is fetched with ``get_jaccard_similarity_result``.

        NOTE(review): ``asyncio.run`` blocks until ``compute_jaccard_task``
        returns and cannot be called from a running event loop - confirm
        this matches how the endpoint is served.
        """
        uuid = str(uuid4())
        asyncio.run(self.compute_jaccard_task(uuid, input_genes, lower_bound))
        return uuid

    @classmethod
    def get_jaccard_similarity_result(cls, computation_id: str):
        """
        Return the result of a registered computation.

        :raises JaccardSimilarityResultNotFound: unknown id, or the task was cancelled
        :raises JaccardSimilarityPending: the task has not finished yet
        :raises JaccardSimilarityComputationError: the task is in an invalid state
        """
        if computation_id not in cls._jaccard_similarity_tasks:
            raise JaccardSimilarityResultNotFound

        jaccard_similarity_task = cls._jaccard_similarity_tasks[computation_id]

        # Need to check if the result is ready to return, then return it
        if not jaccard_similarity_task.done():
            raise JaccardSimilarityPending

        try:
            return jaccard_similarity_task.result()
        except CancelledError:
            raise JaccardSimilarityResultNotFound
        except InvalidStateError:
            raise JaccardSimilarityComputationError

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        """
        Strip the first four characters of a doubled ``MGI:MGI:`` input id
        when the subject id does not use the doubled form; otherwise
        return ``input_gene`` unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene