def run_phenolog(ont, aset, args):
    """
    Like run_enrichment_test, but uses classes from a 2nd ontology/assocset
    to build the gene set.
    """
    ofactory = OntologyFactory()
    ont2 = ofactory.create(args.resource2)
    afactory = AssociationSetFactory()
    aset2 = afactory.create(ontology=ont2, file=args.file2)

    # only test for genes (or other subjects of statements) in common
    common = set(aset.subjects).intersection(aset2.subjects)
    num_common = len(common)
    logging.info("Genes in common between two KBs: {} /\ {} = {}".format(
        len(aset.subjects), len(aset2.subjects), num_common))
    if num_common < 2:
        logging.error("Too few genes in common between the two KBs")
        return None
    for n in aset.ontology.nodes():
        nl = ont.label(n, id_if_null=True)
        genes = aset.query([n])
        num_genes = len(genes)
        if num_genes > 2:
            logging.info("BASE: {} {} num={}".format(n, nl, num_genes))
            enr = aset2.enrichment_test(subjects=genes,
                                        background=aset2.subjects,
                                        labels=True)
            for r in enr:
                print("{:8.3g} {} {:20s} <-> {} {:20s}".format(
                    r['p'], n, nl, r['c'], str(r['n'])))
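# Usage sketch for run_phenolog outside the CLI wrapper below. The file paths
# and ontology handles are hypothetical placeholders; only the attribute names
# (resource2, file2) and the factory calls are taken from the function above.
from types import SimpleNamespace

ont = OntologyFactory().create('hp')
aset = AssociationSetFactory().create(ontology=ont, file='phenotypes.gaf')  # hypothetical GAF
args = SimpleNamespace(resource2='go', file2='functions.gaf')  # hypothetical second ontology/GAF
run_phenolog(ont, aset, args)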
def get(self):
    """
    Summary statistics for objects associated
    """
    args = parser.parse_args()

    M = GolrFields()
    ont = None
    ocat = args.get('object_category')
    ontid = args.get('ontology')
    if ontid is None:
        if ocat == 'function':
            ontid = 'go'
        if ocat == 'phenotype':
            # TODO: other phenotype ontologies
            ontid = 'hp'

    print("Loading: {}".format(ontid))
    ont = get_ontology(ontid)
    taxid = args.get('taxon')
    max_p_value = float(args.max_p_value)
    subjects = args.get('subject')
    background = args.get('background')

    afactory = AssociationSetFactory()
    aset = afactory.create(ontology=ont,
                           subject_category='gene',
                           object_category=ocat,
                           taxon=taxid)

    enr = aset.enrichment_test(subjects=subjects,
                               threshold=max_p_value,
                               labels=True)
    return {'results': enr}
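# The handler above returns JSON shaped like the enrichment results consumed
# in run_phenolog: each entry carries at least the p-value ('p'), the enriched
# class ID ('c'), and, with labels=True, its label ('n'). A sketch of the
# payload (the values shown are illustrative only):
#
#   {'results': [{'p': 1.2e-05, 'c': 'GO:0005634', 'n': 'nucleus'}, ...]}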
def test_factory():
    afa = AssociationSetFactory()
    ont = OntologyFactory().create(ONT)
    aset = afa.create_from_file(POMBASE, ontology=ont, skim=False)

    found = 0
    for s in aset.subjects:
        print('{} {}'.format(s, aset.label(s)))
        for c in aset.annotations(s):
            print('  {} {}'.format(c, ont.label(c)))
            for a in aset.associations(s, c):
                e = a['evidence']
                print('    {} {} {}'.format(e['type'],
                                            e['with_support_from'],
                                            e['has_supporting_reference']))
                if s == 'PomBase:SPBC2D10.10c' and c == 'GO:0005730':
                    if e['type'] == 'ISO':
                        if (e['with_support_from'] == ['SGD:S000002172'] and
                                e['has_supporting_reference'] == ['GO_REF:0000024']):
                            found += 1
                            logging.info('** FOUND: {}'.format(a))
                    if e['type'] == 'IDA':
                        if e['has_supporting_reference'] == ['PMID:16823372']:
                            found += 1
                            logging.info('** FOUND: {}'.format(a))

    assert len(aset.associations_by_subj) > 0
    assert found == 2
def test_semsearch():
    afa = AssociationSetFactory()
    f = POMBASE
    ont = OntologyFactory().create(ONT)
    parser = GafParser()
    assocs = parser.parse(POMBASE, skipheader=True)
    assocs = [a for a in assocs if a['subject']['label'] in GENES]
    aset = afa.create_from_assocs(assocs, ontology=ont)
    ont = aset.subontology()
    aset.ontology = ont
    logging.info('Genes={} Terms={}'.format(len(aset.subjects), len(ont.nodes())))
    print('STATS={}'.format(aset.as_dataframe().describe()))
    #genes = aset.subjects[0:5]
    sse = SemSearchEngine(assocmodel=aset)

    logging.info('Calculating all MICAs')
    sse.calculate_all_micas()

    #h5path = 'tests/resources/mica_ic.h5'
    #logging.info('Saving to {}'.format(h5path))
    #sse.mica_ic_df.to_hdf(h5path, key='mica_ic', mode='w')
    #logging.info('Saved to {}'.format(h5path))

    logging.info('Doing pairwise')
    for i in aset.subjects:
        for j in aset.subjects:
            sim = sse.pw_score_cosine(i, j)
            #print('{} x {} = {}'.format(i,j,sim))
            if i == j:
                assert sim > 0.9999
            tups = sse.pw_score_resnik_bestmatches(i, j)
            print('{} x {} = {} // {}'.format(i, j, sim, tups))
def __init__(self, ont: str, taxon: str) -> None:
    self.associations = None
    self.ont = ont
    self.taxon = taxon
    self.ontology = ''
    self.assocs = ''
    self.afactory = AssociationSetFactory()
    self.load_associations()
def test_create_from_file_no_fmt():
    """
    Test loading from gaf while setting fmt to None
    """
    ont = OntologyFactory().create('go')
    f = AssociationSetFactory()
    aset = f.create(ontology=ont, fmt=None, file=POMBASE)
    print("SUBJS: {}".format(aset.subjects))
    assert len(aset.subjects) > 100
def load_associations(self, group):
    p = GafParser()
    afactory = AssociationSetFactory()
    url = "http://geneontology.org/gene-associations/gene_association.{}.gz".format(group)
    if group == 'human':
        url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    assocs = p.parse(url)
    assocs = [x for x in assocs if 'header' not in x.keys()]
    self.associations = afactory.create_from_assocs(assocs, ontology=self.ontology)
def test_gaf():
    """
    Test loading from gaf
    """
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont = ofactory.create('go')
    aset = afactory.create_from_gaf(open(POMBASE, "r"), ontology=ont)
    print(str(aset))
    genes = aset.query([INTRACELLULAR])
    for g in genes:
        print("G={} '{}'".format(g, aset.label(g)))
    assert G1 in genes
def test_learn():
    afa = AssociationSetFactory()
    ont = OntologyFactory().create(ONT)
    aset = afa.create_from_file(file=GAF, ontology=ont)
    learner = ol.OntologyLearner(assocs=aset)
    print('L={}'.format(learner))

    subont = ont.subontology(relations=['subClassOf'])
    learner.split_assocs(CC, ontology=subont)
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))
    with open('target/index.md', 'w') as file:
        learner.fit_all(reportfile=file)
    print('L.targets={}'.format(learner.targets))
def test_remote_disease():
    """
    factory test
    """
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont = ofactory.create('doid')
    aset = afactory.create(ontology=ont,
                           subject_category='disease',
                           object_category='phenotype',
                           taxon=HUMAN)
    rs = aset.query_associations([PD])
    print("Gene Assocs to PD: {} {}".format(rs, len(rs)))
def test_learn():
    ont = OntologyFactory().create(ONT)
    tont = OntologyFactory().create(TONT)
    afa = AssociationSetFactory()
    aset = afa.create_from_file(file=GAF, ontology=ont)
    taset = afa.create_from_file(file=TGAF, ontology=tont)
    learner = ol.OntologyLearner(assocs=aset,
                                 target_assocs=taset,
                                 score_threshold=0.6)
    print('L={}'.format(learner))
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))
    with open('target/pheno_index.md', 'w') as file:
        learner.fit_all(reportfile=file)
    print('L.targets={}'.format(learner.targets))
def load_associations(self,
                      ontology_name: str = None,
                      subject_category: str = None,
                      object_category: str = None,
                      evidence=None,
                      taxon: str = None,
                      relation=None,
                      file: Union[str, TextIO] = None,
                      fmt: str = None,
                      skim: bool = False) -> None:
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ontology = ofactory.create(ontology_name, subject_category)
    self.associations = afactory.create(
        ontology=ontology,
        subject_category=subject_category,
        object_category=object_category,
        evidence=evidence,
        taxon=taxon,
        relation=relation,
        file=file,
        fmt=fmt,
        skim=skim
    )
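# Usage sketch for the method above. The owning object (`wrapper`) and the
# handle/taxon values are hypothetical placeholders; the keyword names come
# directly from the signature:
#
#   wrapper.load_associations(ontology_name='go',
#                             subject_category='gene',
#                             object_category='function',
#                             taxon='NCBITaxon:10090')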
def test_remote_go():
    """
    factory test
    """
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont = ofactory.create('go').subontology(relations=['subClassOf', PART_OF])
    aset = afactory.create(ontology=ont,
                           subject_category='gene',
                           object_category='function',
                           taxon=MOUSE)

    rs = aset.query([TRANSCRIPTION_FACTOR], [])
    print("Mouse genes annotated to TF: {} {}".format(rs, len(rs)))
    for g in rs:
        print("  Gene: {} {}".format(g, aset.label(g)))
    set_tf = rs

    rs = aset.query([NUCLEUS], [])
    print("Mouse genes annotated to nucleus: {} {}".format(rs, len(rs)))
    set_nucleus = rs
    assert len(rs) > 100

    rs = aset.query([TRANSCRIPTION_FACTOR, NUCLEUS], [])
    print("Mouse TF genes annotated to nucleus: {} {}".format(rs, len(rs)))
    assert len(rs) > 100
    set_nucleus_tf = rs
    assert len(rs) < len(set_nucleus)

    rs = aset.query([NUCLEUS], [TRANSCRIPTION_FACTOR])
    print("Mouse non-TF genes annotated to nucleus: {} {}".format(rs, len(rs)))
    assert len(rs) > 100
    set_nucleus_non_tf = rs
    assert len(rs) < len(set_nucleus)
    assert len(set_nucleus_tf) + len(set_nucleus_non_tf) == len(set_nucleus)

    enr = aset.enrichment_test(subjects=set_tf, labels=True)
    print("ENRICHMENT (tf): {}".format(enr))
    [match] = [x for x in enr if x['c'] == NUCLEUS]
    print("ENRICHMENT (tf) for NUCLEUS: {}".format(match))
    assert match['p'] < 0.00001
def test_learn_from_phenotype():
    """
    Learn GO from Phenotypes.

    Note: some phenotypes in FYPO have graph paths to GO classes, so GO will
    be used to predict GO. This may seem circular, but in fact the phenotype
    carries different information.
    """
    ont = OntologyFactory().create(ONT)
    tont = OntologyFactory().create(TONT)
    afa = AssociationSetFactory()
    aset = afa.create_from_file(file=GAF, ontology=ont)
    taset = afa.create_from_file(file=TGAF, ontology=tont)
    learner = ol.OntologyLearner(assocs=aset,
                                 target_assocs=taset,
                                 score_threshold=0.9)
    print('L={}'.format(learner))
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))
    dir = 'target/from_phenotype'
    with open(dir + '/index.md', 'w') as file:
        learner.fit_all(dir=dir, reportfile=file)
    print('L.targets={}'.format(learner.targets))
def __init__(self) -> None:
    self.associations = ''
    self.ontology = ''
    self.assocs = ''
    self.afactory = AssociationSetFactory()
def main():
    """
    Wrapper for OGR Assocs
    """
    parser = argparse.ArgumentParser(
        description='Wrapper for obographs assocmodel library'
        """
        By default, ontologies and assocs are cached locally and synced
        from a remote sparql endpoint
        """,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-r', '--resource', type=str, required=False,
                        help='Name of ontology')
    parser.add_argument('-f', '--assocfile', type=str, required=False,
                        help='Name of input file for associations')
    parser.add_argument('--assocformat', type=str, default='gaf', required=False,
                        help='Format of association file, if passed (default: gaf)')
    parser.add_argument('-o', '--outfile', type=str, required=False,
                        help='Path to output file')
    parser.add_argument('-t', '--to', type=str, required=False,
                        help='Output to (tree, dot, ...)')
    parser.add_argument('-d', '--direction', type=str, default='u', required=False,
                        help='u = up, d = down, ud = up and down')
    parser.add_argument('-e', '--evidence', type=str, required=False,
                        help='ECO class')
    parser.add_argument('-p', '--properties', nargs='*', type=str, required=False,
                        help='Properties')
    parser.add_argument('-P', '--plot', type=bool, default=False,
                        help='if set, plot output (requires plotly)')
    parser.add_argument('-y', '--yamlconfig', type=str, required=False,
                        help='Path to setup/configuration yaml file')
    parser.add_argument('-S', '--slim', type=str, default='', required=False,
                        help='Slim type. m=minimal')
    parser.add_argument('-c', '--container_properties', nargs='*', type=str, required=False,
                        help='Properties to nest in graph')
    parser.add_argument('-C', '--category', nargs=2, type=str, required=False,
                        help='category tuple (SUBJECT OBJECT)')
    parser.add_argument('-T', '--taxon', type=str, required=False,
                        help='Taxon of associations')
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')

    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')

    # EXTRACT ONTOLOGY
    parser_n = subparsers.add_parser(
        'subontology',
        help='Extract sub-ontology, include only annotated nodes or their descendants')
    parser_n.add_argument('-M', '--minimal', dest='minimal',
                          action='store_true', default=False,
                          help='If set, remove non-MRCA nodes')
    parser_n.set_defaults(function=extract_ontology)

    # ENRICHMENT
    parser_n = subparsers.add_parser(
        'enrichment',
        help='Perform an enrichment test over a sample set of annotated entities')
    parser_n.add_argument('-q', '--query', type=str,
                          help='query all genes for this class and use as subject')
    parser_n.add_argument('-H', '--hypotheses', nargs='*',
                          help='list of classes to test against')
    parser_n.add_argument('-s', '--sample_file', type=str,
                          help='file containing list of gene IDs in sample set')
    parser_n.add_argument('-b', '--background_file', type=str,
                          help='file containing list of gene IDs in background set')
    parser_n.add_argument('-t', '--threshold', type=float,
                          help='p-value threshold')
    parser_n.add_argument('sample_ids', nargs='*',
                          help='list of gene IDs in sample set')
    parser_n.set_defaults(function=run_enrichment_test)

    # PHENOLOG
    parser_n = subparsers.add_parser(
        'phenolog',
        help='Perform multiple enrichment tests, using a second ontology and assoc set to build gene sets')
    parser_n.add_argument('-R', '--resource2', type=str, required=True,
                          help='handle for second ontology')
    parser_n.add_argument('-F', '--file2', type=str, required=True,
                          help='path to second GAF')
    parser_n.set_defaults(function=run_phenolog)

    # QUERY
    parser_n = subparsers.add_parser(
        'query',
        help='Query for entities (e.g. genes) based on positive and negative terms')
    parser_n.add_argument('-q', '--query', nargs='*', help='positive classes')
    parser_n.add_argument('-N', '--negative', type=str, help='negative classes')
    parser_n.set_defaults(function=run_query)

    # QUERY ASSOCIATIONS
    parser_n = subparsers.add_parser(
        'associations',
        help='Query for associations for a set of entities (e.g. genes)')
    parser_n.add_argument('subjects', nargs='*', help='subject ids')
    parser_n.add_argument('-D', '--dendrogram', type=bool, default=False)
    parser_n.set_defaults(function=run_query_associations)

    # INTERSECTIONS
    parser_n = subparsers.add_parser('intersections', help='Query intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids', type=bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_intersections)

    # INTERSECTION DENDROGRAM (TODO: merge into previous?)
    parser_n = subparsers.add_parser('intersection-dendrogram',
                                     help='Plot dendrogram from intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids', type=bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_term_intersection_dendrogram)

    # SIMILARITY MATRIX (may move to another module)
    parser_n = subparsers.add_parser(
        'simmatrix',
        help='Plot dendrogram for similarities between subjects')
    parser_n.add_argument('-X', '--xsubjects', nargs='*', help='x subjects')
    parser_n.add_argument('-Y', '--ysubjects', nargs='*', help='y subjects')
    parser_n.add_argument('--useids', type=bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('subjects', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_simmatrix)

    args = parser.parse_args()

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)

    if not args.assocfile:
        if not args.taxon or not args.category:
            raise ValueError("Must specify EITHER assocfile OR both taxon and category")

    logging.info("Welcome!")

    if args.yamlconfig is not None:
        logging.info("Setting config from: {}".format(args.yamlconfig))
        # note this sets a global:
        # we would not do this outside the context of a standalone script
        from ontobio.config import set_config
        set_config(args.yamlconfig)

    handle = args.resource

    # Ontology Factory
    ofactory = OntologyFactory()
    logging.info("Creating ont object from: {} {}".format(handle, ofactory))
    ont = ofactory.create(handle)
    logging.info("ont: {}".format(ont))

    evidence = args.evidence
    if evidence is not None and evidence.lower() == 'noiea':
        evidence = "-ECO:0000501"

    # Association Factory
    afactory = AssociationSetFactory()
    aset = None
    if args.assocfile is not None:
        aset = afactory.create_from_file(file=args.assocfile,
                                         fmt=args.assocformat,
                                         ontology=ont)
    else:
        [subject_category, object_category] = args.category
        # create using GO/Monarch services
        aset = afactory.create(ontology=ont,
                               subject_category=subject_category,
                               object_category=object_category,
                               taxon=args.taxon)

    func = args.function
    func(ont, aset, args)
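# Hypothetical invocations of the CLI above. The script name and file paths
# are placeholders; the subcommands and flags are the ones registered above:
#
#   ogr-assoc.py -r go -f my.gaf enrichment -t 0.05 -s sample_ids.txt
#   ogr-assoc.py -r hp -T NCBITaxon:9606 -C disease phenotype query -q HP:0000707
#   ogr-assoc.py -r go -f my.gaf phenolog -R hp -F phenotypes.gaf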
def test_remote_go_pombase():
    ont = OntologyFactory().create('go')
    f = AssociationSetFactory()
    aset = f.create(ontology=ont, fmt='gaf', file=POMBASE)
    print("SUBJS: {}".format(aset.subjects))
    assert len(aset.subjects) > 100
import pickle

import pandas
import numpy as np

from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory

HUMAN = 'NCBITaxon:9606'

ofactory = OntologyFactory()
afactory = AssociationSetFactory()

print("creating...")
ont = ofactory.create('hp')
#ont = ofactory.create('mondo')
aset = afactory.create(ontology=ont,
                       subject_category='disease',
                       object_category='phenotype',
                       taxon=HUMAN)
#aset = afactory.create_from_gaf('my.gaf', ontology=ont)

disease_ids = ["DECIPHER:1", "DECIPHER:16", "OMIM:614696",
               "OMIM:614699", "Orphanet:99978"]
phenotype_ids = ["HP:0000007", "Orphanet:93299", "Orphanet:90794"]

print("annotations\t" + phenotype_ids[1])
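# Continuation sketch: look up the phenotype annotations for each disease ID.
# Whether these particular IDs are present in the remote association set is an
# assumption; annotations() and label() are the same accessors used in the
# test snippets above.
for d in disease_ids:
    print("{} {}".format(d, aset.label(d)))
    for ph in aset.annotations(d):
        print("  {} {}".format(ph, ont.label(ph)))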
def test_factory():
    ont = OntologyFactory().create(ONT)
    f = AssociationSetFactory()
    aset = f.create(ontology=ont, fmt='hpoa', file=ANNFILE)
    print("SUBJS: {}".format(aset.subjects))
    assert len(aset.subjects) > 40
def main():
    """
    Phenologs
    """
    parser = argparse.ArgumentParser(
        description='Phenologs'
        """
        By default, ontologies are cached locally and synced from a
        remote sparql endpoint
        """,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-r', '--resource1', type=str, required=False,
                        help='Name of ontology1')
    parser.add_argument('-R', '--resource2', type=str, required=False,
                        help='Name of ontology2')
    parser.add_argument('-T', '--taxon', type=str, default='NCBITaxon:10090', required=False,
                        help='NCBITaxon ID')
    parser.add_argument('-s', '--search', type=str, default='', required=False,
                        help='Search type. p=partial, r=regex')
    parser.add_argument('-b', '--background', type=str, default=None, required=False,
                        help='Class to use for background')
    parser.add_argument('-p', '--pthreshold', type=float, default=0.05, required=False,
                        help='P-value threshold')
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')
    parser.add_argument('ids', nargs='*')

    args = parser.parse_args()

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    if args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    logging.info("Welcome!")

    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    handle = args.resource1
    ont1 = ofactory.create(args.resource1)
    ont2 = ofactory.create(args.resource2)
    logging.info("onts: {} {}".format(ont1, ont2))

    searchp = args.search

    category = 'gene'
    aset1 = afactory.create(ontology=ont1,
                            subject_category=category,
                            object_category='phenotype',
                            taxon=args.taxon)
    aset2 = afactory.create(ontology=ont2,
                            subject_category=category,
                            object_category='function',
                            taxon=args.taxon)

    bg_cls = None
    if args.background is not None:
        bg_ids = resolve(ont1, [args.background], searchp)
        if len(bg_ids) == 0:
            logging.error("Cannot resolve: '{}' using {} in {}".format(
                args.background, searchp, ont1))
            sys.exit(1)
        elif len(bg_ids) > 1:
            logging.error("Multiple matches: '{}' using {} MATCHES={}".format(
                args.background, searchp, bg_ids))
            sys.exit(1)
        else:
            [bg_cls] = bg_ids
            logging.info("Background: {}".format(bg_cls))

    for id in resolve(ont1, args.ids, searchp):
        sample = aset1.query([id], [])
        print("Gene set class:{} Gene set: {}".format(id, sample))
        bg = None
        if bg_cls is not None:
            bg = aset1.query([bg_cls], [])
            print("BACKGROUND SUBJECTS: {}".format(bg))
        rs = aset2.enrichment_test(sample, bg,
                                   threshold=args.pthreshold, labels=True)
        print("RESULTS: {} < {}".format(len(rs), args.pthreshold))
        for r in rs:
            print(str(r))
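# Hypothetical invocation of the phenologs CLI above. The script name,
# ontology handles, and class ID are placeholders; the flags are the ones
# registered above:
#
#   phenolog.py -r mp -R go -T NCBITaxon:10090 -p 0.05 MP:0000001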