def npokb():
    index_graph = OntGraph(path=auth.get_path('ontology-local-repo') /
                           'ttl/generated/neurons/npokb-index.ttl')

    if index_graph.path.exists():
        index_graph.parse()

    # testing
    index_graph.bind('npokb', npokb)
    #[index_graph.add((npokb[str(i)], rdf.type, owl.Class)) for i in range(1, 11)]
    #[index_graph.add((npokb[str(i)], ilxtr.hasTemporaryId, TEMP[str(i)])) for i in range(1, 11)]

    ios = []
    for eff in ('common-usage-types', 'huang-2017', 'markram-2015', 'allen-cell-types'):
        path = auth.get_path('ontology-local-repo') / f'ttl/generated/neurons/{eff}.ttl'
        input_graph = OntGraph(path=path)
        input_graph.parse()
        output_graph = input_graph.mapTempToIndex(index_graph, npokb, TEMP)
        ios.append((input_graph, output_graph))

    input_graph, output_graph = ios[0]
    a, r, c = output_graph.subjectsChanged(input_graph)
    index_graph.write()
    # [o.write() for i, o, in ios]  # when ready
    #from sparcur.paths import Path
    #Path(index_graph.path).xopen()
    breakpoint()
def main():
    olr = auth.get_path('ontology-local-repo')
    resources = auth.get_path('resources')
    if not olr.exists():
        raise FileNotFoundError(f'{olr} does not exist cannot continue')

    if not resources.exists():
        raise FileNotFoundError(f'{resources} does not exist cannot continue')

    from docopt import docopt
    args = docopt(__doc__, version='parcellation 0.0.1')

    # import all ye submodules we have it sorted! LabelBase will find everything for us. :D
    if not args['--local']:
        from nifstd_tools.parcellation.aba import Artifacts as abaArts

    from nifstd_tools.parcellation.fsl import FSL  # Artifacts is attached to the class
    from nifstd_tools.parcellation.whs import Artifacts as whsArts
    from nifstd_tools.parcellation.berman import Artifacts as bermArts
    from nifstd_tools.parcellation.paxinos import Artifacts as paxArts
    from nifstd_tools.parcellation.swanson import Artifacts as swArts
    from nifstd_tools.parcellation.freesurfer import Artifacts as fsArts

    onts = getOnts()
    _ = *(print(ont) for ont in onts),
    out = build(*onts,
                parcBridge,
                fail=args['--fail'],
                n_jobs=int(args['--jobs']))
    if args['--stats']:
        breakpoint()
def _get_oauth_service(api='sheets', version='v4', readonly=True, SCOPES=None):
    """ Inner implementation for get oauth. If you see this function used
        directly anywhere other than in googapis it is almost certainly a mistake. """

    if readonly:  # FIXME the division isn't so clean for drive ...
        _auth_var = 'google-api-store-file-readonly'
    else:
        _auth_var = 'google-api-store-file'

    try:
        store_file = auth.get_path(_auth_var)
    except KeyError as e:
        _msg = (f'No value found for {_auth_var} in {auth._path}\n'
                'See the previous error for more details about the cause.')
        raise ValueError(_msg) from e

    if store_file is None:
        _p = 'RUNTIME_CONFIG' if auth._path is None else auth._path
        # FIXME bad error message, need to check whether the key is even in
        # the user config, and yes we need our way to update the user config
        # and warn about unexpected formats for orthauth configs
        msg = (f'No file exists at the path specified by {_auth_var} in {_p}')
        log.debug(auth._runtime_config)
        log.debug(auth.user_config._runtime_config)
        raise ValueError(msg)

    # TODO log which file it is writing to ...
    if store_file.exists():
        with open(store_file, 'rb') as f:
            try:
                creds = pickle.load(f)
            except pickle.UnpicklingError as e:
                # FIXME need better way to trace errors in a way
                # that won't leak secrets by default
                log.error(f'problem in file at path for {_auth_var}')
                raise e
    else:
        creds = None

    if SCOPES is None:
        raise TypeError('SCOPES has not been set, possibly because this is\n'
                        'being called by a function that expects the store file\n'
                        'to already exist. Please run `googapis auth` with the\n'
                        'appropriate scope.')

    if not creds or not creds.valid:
        # the first time you run this you will need to use the --noauth_local_webserver args
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            creds_file = auth.get_path('google-api-creds-file')
            flow = InstalledAppFlow.from_client_secrets_file((creds_file).as_posix(), SCOPES)
            creds = flow.run_console()

        with open(store_file, 'wb') as f:
            pickle.dump(creds, f)

    service = build(api, version, credentials=creds)
    return service
def catalog_extras(fetch=False):
    path = Path(auth.get_path('ontology-local-repo'), 'ttl')
    cat = (path / 'catalog-v001.xml').as_posix()
    with open((path / '../catalog-extras').as_posix(), 'rt') as ce, open(cat, 'rt') as c:
        clines = c.readlines()
        celines = ce.readlines()

    if clines[-2] != celines[-1]:
        with open(cat, 'wt') as f:
            f.writelines(clines[:-1] + celines + clines[-1:])
    else:
        print(tc.blue('INFO:'), 'extras already added to catalog doing nothing')

    if fetch:
        print(tc.blue('INFO:'), 'fetching extras')

        def fetch_and_save(url, loc):
            resp = requests.get(url)
            saveloc = (path / loc).as_posix()
            if resp.ok:
                with open(saveloc, 'wb') as f:
                    f.write(resp.content)

                print(tc.blue('INFO:'), f'{url:<60} written to {loc}')
            else:
                print(tc.red('WARNING:'), f'failed to fetch {url}')

        Async()(deferred(fetch_and_save)(url, loc)
                for line in celines
                for _, _, _, url, _, loc, _ in (line.split('"'),))
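# Hedged illustration (hypothetical entry, not taken from catalog-extras): the
# seven-way unpack in catalog_extras assumes each extras line is an XML <uri>
# element whose quoted attributes place the remote url 4th and the local file
# 6th when the line is split on double quotes.
_example_line = '<uri id="User Entered Import Resolution" name="http://example.org/x.ttl" uri="x.ttl"/>'
_, _, _, _url, _, _loc, _ = _example_line.split('"')
assert (_url, _loc) == ('http://example.org/x.ttl', 'x.ttl')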
def default(self):
    g = OntGraph().parse(
        auth.get_path('ontology-local-repo') / 'ttl/stimulation.ttl')
    preds = sorted(set(g.qname(p) for p in g.predicates()))
    header = [['id', 'rdf:type', 'rdfs:domain', 'rdfs:range', 'rdfs:label',
               'NIFRID:synonym', 'NIFRID:abbrev', 'definition:',
               'editorNote:', 'rdfs:comment']]
    _rows = []
    for type_ in (owl.ObjectProperty, owl.Class,):
        for s in sorted(g[:rdf.type:type_], key=natsort):
            if isinstance(s, rdflib.URIRef):
                row = [g.qname(s), g.qname(type_)] + [fun(g, s) for fun in funs]
                _rows.append(row)

    rows = header + _rows
    defs = Defs(readonly=not self.options.update)
    if self.options.update:
        defs.upsert(*rows)  # FIXME upsert broken on header reordering ?
        defs.commit()

    return rows
def test():
    snchf = SnchFile.fromYaml('../test/sneech-file.yaml')
    snchf.writeTtl(aug.RepoPath('../test/rando-sneech-ttl.ttl').resolve())
    rp = aug.RepoPath(auth.get_path('ontology-local-repo'))
    wrangler = SneechWrangler(aug.RepoPath('~/git/sneechenator').expanduser())
    dir_snchn = wrangler.dir_process / 'test-sneechening'
    if not dir_snchn.exists():  # FIXME bad workflow
        dir_snchn.mkdir()

    path_index = wrangler.path_index(snchf.index)
    if not path_index.exists():
        path_index = wrangler.new_index(snchf.index)  # FIXME move inside Sneechenator? or no
        path_index.commit_from_working_tree(f'new index {snchf.index}')

    org_index = OntResGit(path_index)
    expanded = snchf.write(dir_snchn)  # TODO commit
    expanded.commit_from_working_tree(f'expanded snch file')
    sncher = Sneechenator(org_index, snchf.namespaces, snchf.orgs)
    #sncher.preSneechUpon(dir_snchn)
    rg, maybe_sneeches = sncher.sneechReviewGraph()
    # commit here I think ?
    # consider using ilxtr.maybeHasIlxId ?
    # TODO modified maybe_sneeches file + maybe list -> update list
    breakpoint()
def main():
    with open(auth.get_path('curies'), 'rt') as f:
        curie_map = yaml.safe_load(f)

    curie_map['nlx_only'] = curie_map['']  # map nlx_only to 'http://uri.neuinfo.org/nif/nifstd/'

    g = rdflib.Graph()
    g.parse('http://ontology.neuinfo.org/NIF/ttl/NIF-Cell.ttl', format='turtle')
    curiespaces = {k: rdflib.Namespace(v) for k, v in curie_map.items()}
    namespaces = {c_prefix: rdflib.Namespace(iri_prefix)
                  for c_prefix, iri_prefix in g.namespaces()}
    subject = curiespaces['NIFCELL']['nifext_75']
    predicate = None
    object_ = None
    matches = [t for t in g.triples((subject, predicate, object_))]
    print(matches)
    if matches:
        predicate = matches[0][1].toPython()
        print(predicate)


if __name__ == '__main__':
    breakpoint()
class PaxSr_6(resSource):
    sourceFile = auth.get_path('resources') / 'paxinos09names.txt'
    artifact = Artifacts.PaxRat6

    @classmethod
    def loadData(cls):
        with open(cls.source, 'rt') as f:
            lines = [l.rsplit('#')[0].strip() for l in f.readlines()
                     if not l.startswith('#')]
        return [l.rsplit(' ', 1) for l in lines]

    @classmethod
    def processData(cls):
        structRecs = []
        out = {}
        for structure, abrv in cls.raw:
            structRecs.append((abrv, structure))
            if abrv in out:
                out[abrv][0].append(structure)
            else:
                out[abrv] = ([structure], ())

        return structRecs, out

    @classmethod
    def validate(cls, structRecs, out):
        print(Counter(_[0] for _ in structRecs).most_common()[:5])
        print(Counter(_[1] for _ in structRecs).most_common()[:5])
        assert len(structRecs) == len([s for sl, _ in out.values() for s in sl]), \
            'There are non-unique abbreviations'
        errata = {}
        return out, errata
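# Hedged illustration (hypothetical line, not from paxinos09names.txt):
# loadData above drops anything after a '#' and then right-splits once on a
# space, so each record is expected to read "<structure name> <abbreviation>".
_example = 'nucleus of the solitary tract Sol  # trailing note'
_cleaned = _example.rsplit('#')[0].strip()
assert _cleaned.rsplit(' ', 1) == ['nucleus of the solitary tract', 'Sol']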
def main():
    olr = auth.get_path('ontology-local-repo')
    ori = OntResIri('http://purl.obolibrary.org/obo/doid.owl')
    orp = OntResPath(olr / 'ttl/external/doid.owl')
    ort = ori

    g = ori.graph
    query = """
    SELECT DISTINCT ?s ?o ?l
    WHERE {
        ?s a owl:Class .
        ?s rdfs:subClassOf* <http://purl.obolibrary.org/obo/DOID_4> .
        ?s rdfs:subClassOf ?o .
        ?s rdfs:label ?l .
    }"""
    res = list(g.query(query))
    filt = [r for r in res if not isinstance(r[1], rdflib.BNode)]
    spath = 'ttl/generated/doidslim.ttl'
    go = OntGraph(path=olr / spath)
    # TODO prov record like the one we have for chebi
    go.bind('DOID', 'http://purl.obolibrary.org/obo/DOID_')
    s = rdflib.URIRef('http://ontology.neuinfo.org/NIF/' + spath)
    go.populate_from_triples(
        ((s, p, o) for p, o in
         ((rdf.type, owl.Ontology),
          (rdfs.label, rdflib.Literal("NIF DOID slim")),)))
    ds = rdflib.URIRef('http://purl.obolibrary.org/obo/DOID_4')
    go.add((ds, rdf.type, owl.Class))
    go.add((ds, rdfs.label, rdflib.Literal('disease')))
    go.populate_from_triples(
        (t for s, o, l in filt for t in
         ((s, rdf.type, owl.Class),
          (s, rdfs.subClassOf, o),
          (s, rdfs.label, l))))
    go.write()
def main():
    pi = PhenotypeIndicators()
    trips = list(pi.triples)
    #yield from PhenotypeIndicators().triples
    g = pi.asGraph()
    g.write(auth.get_path('ontology-local-repo') / f'ttl/{pi.name}.ttl')
def main():
    #InterLexSneechenator()
    test()
    return

    # testing
    index_graph.bind('ILX', ILX)
    #[index_graph.add((npokb[str(i)], rdf.type, owl.Class)) for i in range(1, 11)]
    #[index_graph.add((npokb[str(i)], ilxtr.hasTemporaryId, TEMP[str(i)])) for i in range(1, 11)]

    ios = []
    for eff in ('phenotype-core.ttl', 'phenotypes.ttl'):
        path = auth.get_path('ontology-local-repo') / eff
        input_graph = OntGraph(path=path)
        input_graph.parse()
        output_graph = input_graph.mapTempToIndex(index_graph, ILX, ilxtr)
        ios.append((input_graph, output_graph))

    input_graph, output_graph = ios[0]
    a, r, c = output_graph.subjectsChanged(input_graph)
    index_graph.write()
    # [o.write() for i, o, in ios]  # when ready
    #from sparcur.paths import Path
    #Path(index_graph.path).xopen()
    breakpoint()
def _todo(utr):
    # real output
    glb = auth.get_path('git-local-base')
    uberon_edit = aug.RepoPath(glb) / 'NOFORK/uberon/src/ontology/uberon-edit.obo'
    of = oio.OboFile(path=uberon_edit, strict=False)
    utr.submit_to_obofile(of, 'UBERON', uberon_id_range)
    of.write(overwrite=True, version=oio.OBO_VER_ROBOT)
def ncbigenemapping(may_need_ncbigene_added):
    #urlbase = 'https://www.ncbi.nlm.nih.gov/gene/?term=Mus+musculus+'
    urlbase = ('https://www.ncbi.nlm.nih.gov/gene?term='
               '({gene_name}[Gene%20Name])%20AND%20{taxon_suffix}[Taxonomy%20ID]&'
               'report=xml')
    urls = [urlbase.format(gene_name=n, taxon_suffix=10090)
            for n in may_need_ncbigene_added]
    done2 = {}
    for u in urls:
        if u not in done2:
            print(u)
            done2[u] = requests.get(u)

    base = auth.get_path('resources') / 'genesearch'
    if not base.exists():
        base.mkdir()

    for resp in done2.values():
        fn = OntId(resp.url).quoted
        with open(base / fn, 'wb') as f:
            f.write(resp.content)

    so_much_soup = [(resp.url, BeautifulSoup(resp.content, 'lxml'))
                    for resp in done2.values()]

    trees = []
    for i, (url, soup) in enumerate(so_much_soup):
        pre = soup.find_all('pre')
        if pre:
            for p in pre[0].text.split('\n\n'):
                if p:
                    tree = etree.fromstring(p)
                    trees.append((url, tree))
        else:
            print('WAT', urls[i])

    dimension = 'ilxtr:hasExpressionPhenotype'
    errors = []
    to_add = []
    mapping = {}
    for url, tree in trees:
        taxon = tree.xpath('//Org-ref//Object-id_id/text()')[0]
        geneid = tree.xpath('//Gene-track_geneid/text()')[0]
        genename = tree.xpath('//Gene-ref_locus/text()')[0]
        if genename in may_need_ncbigene_added and taxon == '10090':
            print(f'{genename} = Phenotype(\'NCBIGene:{geneid}\', {dimension!r}, label={genename!r}, override=True)')
            to_add.append(geneid)
            mapping[genename] = f'NCBIGene:{geneid}'
        else:
            errors.append((geneid, genename, taxon, url))

    print(errors)
    _ = [print('NCBIGene:' + ta) for ta in to_add]
    #wat.find_all('div', **{'class':'rprt-header'})
    #wat.find_all('div', **{'class':'ncbi-docsum'})
    return mapping, to_add, errors
def fix_file(path):
    with open(path, 'rt') as f:
        sin = f.read()

    sout = sin.replace('~/git/NIF-Ontology',
                       auth.get_path('ontology-local-repo').resolve().as_posix())

    with open(path, 'wt') as f:
        f.write(sout)

    return sin
def _mis_graph(self):
    """ for now easier to just get a fresh one, they are small """
    glb = pauth.get_path('git-local-base')
    olr = Path(glb / 'duplicates' / 'sparc-NIF-Ontology')
    graph = (rdflib.ConjunctiveGraph()
             .parse((olr / 'ttl/sparc-methods.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods-core.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods-helper.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods.ttl').as_posix(), format='turtle')
             )
    return graph
def npokb_mapping():
    index_graph = OntGraph(path=auth.get_path('ontology-local-repo') /
                           'ttl/generated/neurons/npokb-index.ttl')

    if index_graph.path.exists():
        index_graph.parse()

    # testing
    index_graph.bind('npokb', npokb)
    #[index_graph.add((npokb[str(i)], rdf.type, owl.Class)) for i in range(1, 11)]
    #[index_graph.add((npokb[str(i)], ilxtr.hasTemporaryId, TEMP[str(i)])) for i in range(1, 11)]

    ios = []
    for eff in ('common-usage-types', 'huang-2017', 'markram-2015', 'allen-cell-types',):
        # FIXME if the index id is already being used it is still added as a temp id incorrectly
        path = auth.get_path('ontology-local-repo') / f'ttl/generated/neurons/{eff}.ttl'
        org = OntResGit(path, ref='HEAD')  # HEAD is default but just for clarity set it explicitly here
        prev_graph = org.graph

        input_graph = OntGraph(path=path)
        input_graph.parse()
        mapped_graph = input_graph.mapStableIdentifiers(prev_graph, ilxtr.origLabel)
        output_graph = mapped_graph.mapTempToIndex(index_graph, npokb, TEMP)
        ios.append((mapped_graph, output_graph))

    mapped_graph, output_graph = ios[0]
    a, r, c = output_graph.subjectsChanged(mapped_graph)
    index_graph.write()
    [o.write() for i, o, in ios]  # when ready
    #from sparcur.paths import Path
    #Path(index_graph.path).xopen()
    breakpoint()
def scigraph_stress(rate, timeout=5, verbose=False, debug=False,
                    scigraph=auth.get('scigraph-api')):
    # TODO use the api classes
    with open((auth.get_path('resources') / 'chebi-subset-ids.txt').as_posix(), 'rt') as f:
        urls = [os.path.join(scigraph, f'vocabulary/id/{curie.strip()}')
                for curie in f.readlines()]

    print(urls)
    url_blaster(urls, rate, timeout, verbose, debug)
class ChebiIdsSrc(Source):
    source = auth.get_path('resources') / 'chebi-subset-ids.txt'
    source_original = True

    @classmethod
    def loadData(cls):
        ug = makeGraph('utilgraph', prefixes=uPREFIXES)
        with open(cls.source, 'rt') as f:
            ids_raw = set(_.strip() for _ in f.readlines())
            ids = set(ug.expand(_.strip()).toPython() for _ in ids_raw)
            return ids_raw, ids

    @classmethod
    def validate(cls, a):
        return a
class HCPMMPSrc(resSource):
    sourceFile = auth.get_path('resources') / 'human_connectome_project_2016.csv'
    source_original = True
    artifact = Artifacts.HCPMMP

    @classmethod
    def loadData(cls):
        with open(cls.source, 'rt') as f:
            return [r for r in csv.reader(f)][1:]  # skip header

    @classmethod
    def processData(cls):
        return cls.raw,

    @classmethod
    def validate(cls, d):
        return d
def ncbigene_make():
    IDS_FILE = auth.get_path('resources') / 'gene-subset-ids.txt'
    with open(IDS_FILE.as_posix(), 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
    #data = requests.get(url + id_).json()['result'][id_]
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        data['id'] = ','.join(idset),
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')

    ng = createOntology(
        'ncbigeneslim',
        'NIF NCBI Gene subset',
        makePrefixes('ilxtr', 'NIFRID', 'NCBIGene', 'NCBITaxon', 'skos', 'owl', 'SO'),
        'ncbigeneslim',
        f'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in {IDS_FILE}.',
        remote_base='http://ontology.neuinfo.org/NIF/')

    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)

    ng.write()
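# chunk_list is used above but not defined in this snippet; a minimal sketch of
# what it is assumed to do (split a sequence into fixed-size batches):
def _chunk_list_sketch(seq, size):
    return [seq[i:i + size] for i in range(0, len(seq), size)]

assert _chunk_list_sketch(list(range(5)), 2) == [[0, 1], [2, 3], [4]]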
class WHSSDSrc(resSource):
    sourceFile = lambda v: auth.get_path('resources') / f'WHS_SD_rat_atlas_v{v}.label'
    source_original = True
    artifact = lambda v: getattr(Artifacts, f'WHSSD{v}')

    @classmethod
    def loadData(cls):
        with open(cls.source, 'rt') as f:
            lines = [l.strip() for l in f.readlines() if not l.startswith('#')]
        return [(l[:3].strip(), l.split('"', 1)[1].strip('"')) for l in lines]

    @classmethod
    def processData(cls):
        return cls.raw,

    @classmethod
    def validate(cls, d):
        return d
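# Hedged illustration (hypothetical line, not from the atlas .label file):
# loadData above reads the region id from the first three characters and the
# region name from inside the first pair of double quotes.
_example = '  1    255  255  255        1  1  1    "Cerebellum"'
assert (_example[:3].strip(), _example.split('"', 1)[1].strip('"')) == ('1', 'Cerebellum')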
class WHSSDilfSrc(resSource):
    sourceFile = lambda v: auth.get_path('resources') / f'WHS_SD_rat_atlas_v{v}_labels.ilf'
    source_original = True
    artifact = lambda v: getattr(Artifacts, f'WHSSD{v}')
    predicates = lambda v: {ilxtr.labelPartOf: ilxtr[f'labelPartOf-whssd-{v}']}  # FIXME

    @classmethod
    def loadData(cls):
        tree = etree.parse(cls.source.as_posix())
        return tree

    @classmethod
    def processData(cls):
        tree = cls.raw

        def recurse(label_node, parent=None):
            name = label_node.get('name')
            abbrev = label_node.get('abbreviation')
            id = label_node.get('id')
            yield id, name, abbrev, parent
            for child in label_node.getchildren():
                if child.tag == 'label':
                    yield from recurse(child, parent=id)

        records = tuple()
        for structure in tree.xpath('//structure'):
            for lab in structure.getchildren():
                if lab.tag == 'label':
                    records += tuple(recurse(lab, None))

        return records,

    @classmethod
    def validate(cls, d):
        return d
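# Hedged sketch (hypothetical ilf fragment, assuming etree above is lxml.etree):
# processData expects nested <label> elements under each <structure>, each
# carrying id/name/abbreviation attributes; recurse() yields one record per
# label together with its parent id, e.g. ('2', 'Cerebellum', 'Cb', '1') for
# the inner label below.
from lxml import etree as _etree
_doc = _etree.fromstring(
    '<structure>'
    '<label id="1" name="Brain" abbreviation="Br">'
    '<label id="2" name="Cerebellum" abbreviation="Cb"/>'
    '</label>'
    '</structure>')
_outer = _doc.getchildren()[0]
assert (_outer.get('id'), _outer.get('name'), _outer.get('abbreviation')) == ('1', 'Brain', 'Br')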
only = tuple()
skip = tuple()
ci_skip = tuple()
network_tests = (  # reminder that these only skip mains
    'closed_namespaces',
    'hierarchies',
    'make_catalog',
    'scig',
    'scigraph_codegen',
    ['ontload', 'graph'],
    ['ontutils', 'deadlinks'],
    ['ontutils', 'version-iri'],
)  #requests.exceptions.SSLError

if auth.get_path('scigraph-services') is None:
    skip += ('scigraph_deploy',)  # this will fail
    # FIXME this should really only skip main not both main and import?

working_dir = get_working_dir(__file__)
if working_dir is None:
    # python setup.py test will run from the module_parent folder
    # I'm pretty sure the split was only implemented because I was trying
    # to run all tests from the working_dir in one shot, but that has
    # a number of problems with references to local vs installed packages
    import inspect
    sf = inspect.getsourcefile(pyontutils)
    working_dir = Path(sf).parent.parent
else:
def main():
    import rdflib
    from pyontutils.core import makeGraph, makePrefixes, log
    from pyontutils.config import auth

    ub = auth.get_path('ontology-local-repo') / 'ttl/bridge/uberon-bridge.ttl'
    ncrb = auth.get_path('ontology-local-repo') / 'ttl/NIF-Neuron-Circuit-Role-Bridge.ttl'
    if not ub.exists() or not ncrb.exists():
        # just skip this if we can't find the files
        log.warning(f'missing file {ub} or {ncrb}')
        return

    graph = rdflib.Graph()
    graph.parse(ub.as_posix(), format='turtle')
    graph.parse(ncrb.as_posix(), format='ttl')

    ecgraph = rdflib.Graph()
    oec = EquivalentClass()
    test = tuple(oec.parse(graph=graph))

    ft = oc_.full_combinator(test[0][0], test[0][1])
    ftng = makeGraph('thing3', prefixes=makePrefixes('owl', 'TEMP'))
    *ft.serialize(ftng.g),
    ftng.write()

    _roundtrip = list(test[0][1](test[0][0]))
    roundtrip = oc_(test[0][0], test[0][1])  # FIXME not quite there yet...
    for t in roundtrip:
        ecgraph.add(t)

    ecng = makeGraph('thing2', graph=ecgraph, prefixes=makePrefixes('owl', 'TEMP'))
    ecng.write()
    if __name__ == '__main__':
        breakpoint()

    return

    r = Restriction(rdfs.subClassOf)  #, scope=owl.allValuesFrom)  #NIFRID.has_proper_part)
    l = tuple(r.parse(graph=graph))
    for t in r.triples:
        graph.remove(t)

    ng = makeGraph('thing', graph=graph)
    ng.write()
    #print(l)

    restriction = Restriction(None)  #rdf.first)
    ll = List(lift_rules={owl.Restriction: restriction})
    trips = tuple(ll.parse(graph=graph))

    #subClassOf = PredicateCombinator(rdfs.subClassOf)  # TODO should be able to do POCombinator(rdfs.subClassOf, ObjectCombinator)
    subClassOf = POCombinator(rdfs.subClassOf, ObjectCombinator)
    superDuperClass = subClassOf(TEMP.superDuperClass)  # has to exist prior to triples
    ec = oec(TEMP.ec1,
             TEMP.ec2,
             restriction(TEMP.predicate0, TEMP.target1),
             restriction(TEMP.predicate1, TEMP.target2),)
    egraph = rdflib.Graph()
    acombinator = annotation((TEMP.testSubject, rdf.type, owl.Class), (TEMP.hoh, 'FUN'))
    ft = flattenTriples((
        acombinator((TEMP.annotation, 'annotation value')),
        acombinator((TEMP.anotherAnnotation, 'annotation value again')),
        oc_(TEMP.c1, superDuperClass),
        oc_(TEMP.c2, superDuperClass),
        oc_(TEMP.c3, superDuperClass),
        oc_(TEMP.c4, superDuperClass),
        oc_(TEMP.c5, superDuperClass),
        oc_(TEMP.wat, subClassOf(TEMP.watParent)),
        oc_(TEMP.testSubject),
        ec(TEMP.testSubject),
        oc_(TEMP.more, oec(TEMP.ec3, restriction(TEMP.predicate10, TEMP.target10))),
    ),)
    [egraph.add(t) for t in ft]
    eng = makeGraph('thing1', graph=egraph, prefixes=makePrefixes('owl', 'TEMP'))
    eng.write()
    if __name__ == '__main__':
        breakpoint()
class BermanSrc(resSource):
    run_ocr = False
    source_images = Path('~/files/cropped').expanduser()
    source = 'https://github.com/tgbugs/pyontutils.git'
    sourceFile = auth.get_path('resources') / 'berman'
    source_original = False
    artifact = Artifacts.BermanCat

    @classmethod
    def loadData(cls):
        """ Sigh, this was indeed a poorly conceived approach since it hard
            blocks when the files are not in the source so you can't easily
            bootstrap from another source and the cognitive overhead is way,
            way too high :/ Adding dry_run/bootstrap to __new__ sort of helps? """
        """ Have to run this out here because resSource is handicapped """
        data = []
        if cls.source_images.exists():
            for folder in cls.source_images.glob('*'):
                plate_num = int(folder.stem)
                text_file = cls.source / f'{plate_num}.txt'
                if not text_file.exists() or cls.run_ocr:
                    legends = []
                    raw_text = ''
                    for img in folder.glob('*.png'):
                        print('num', plate_num, img.stem)
                        p = subprocess.Popen(('tesseract', img.as_posix(), 'stdout',
                                              '-l', 'eng', '--oem', '2', '--psm', '6'),
                                             stdout=subprocess.PIPE)
                        bytes_text, err = p.communicate()
                        raw_text += bytes_text.decode() + '\n'

                    with open(text_file, 'wt') as f:
                        f.write(raw_text)
                else:
                    with open(text_file, 'rt') as f:
                        raw_text = f.read()

                legends = get_legends(raw_text)
                data.append((plate_num, legends))

        elif cls.source.exists():
            for text_file in cls.source.glob('*.txt'):
                plate_num = int(text_file.stem)
                with open(text_file, 'rt') as f:
                    raw_text = f.read()

                legends = get_legends(raw_text)
                data.append((plate_num, legends))

        return data

    @classmethod
    def processData(cls):
        data = cls.raw
        # ocr fixes
        # in theory could use the most frequent if > .75 are the same ...
        cor_l = {
            'abducens nerve': {'GN': '6N'},
            'alaminar spinal trigeminal nucleus, magnocellular division (14)': {'5SM': 'SSM'},
            'alaminar spinal trigeminal nucleus, parvocellular division (6)': {'5SP': 'SSP'},
            'central nucleus of the inferior colliculus (21)': {'1CC': 'ICC'},
            'cerebral cortex': {'¢': 'C'},
            'commissure of the inferior colliculi': {'1CO': 'ICO', 'I1CO': 'ICO'},
            'corpus callosum': {'198': 'CC'},
            'inferior central nucleus (13)': {'C': 'CI', 'Cl': 'CI'},
            'lateral tegmental field (3)': {'FIL': 'FTL'},
            'mesencephalic trigeminal nucleus (19)': {'SME': '5ME'},
            'motor trigeminal tract': {'SMT': '5MT'},
            'nucleus of the trapezoid body (15)': {'J': 'T'},
            'posterior interpeduncular nucleus, inner division': {'al': 'IPI'},  # wow ...
            'solitary tract': {'$': 'S'},
            'spinal trigeminal tract': {'SST': '5ST'},
            'statoacoustic nerve': {'BN': 'SN'},
            'superior central nucleus (22)': {'s': 'CS'},
            'trigeminal nerve': {'SN': '5N'},
            'zona incerta': {'Z1': 'ZI'},
        }
        cor_a = {
            #'1': {'ependymal layer', 'superficial layer'},
            #'2': {'intermediate layer', 'molecular layer'},
            #'3': {'deep layer', 'oculomotor nucleus (27)', 'pyramidal layer'},
            #'4': {'polymorph layer', 'trochlear nucleus (23)'},
            'KF': {'KollikerFuse nucleus (17)': 'KéllikerFuse nucleus (17)'},
            'SCS': {'superior colliculus, supertficial layer (25)':
                    'superior colliculus, superficial layer (25)'},
        }
        # close layer abbreviation issues
        # this of course means that abbrevs cannot be used as identifiers
        # but we already knew this
        abbrev_ok = {
            '1': {'superficial layer': 1, 'ependymal layer': 1},
            '2': {'intermediate layer': 1, 'molecular layer': 1},
            '3': {'oculomotor nucleus (27)': 4, 'deep layer': 1, 'pyramidal layer': 1},
            '4': {'polymorph layer': 1, 'trochlear nucleus (23)': 1},
        }

        by_abbrev = collections.defaultdict(list)
        by_label = collections.defaultdict(list)
        abbrev_index = collections.defaultdict(list)
        label_index = collections.defaultdict(list)
        for n, legends in sorted(data):
            for abbrev, label in legends:
                if label in cor_l and abbrev in cor_l[label]:
                    abbrev = cor_l[label][abbrev]
                if abbrev in cor_a and label in cor_a[abbrev]:
                    label = cor_a[abbrev][label]

                by_abbrev[abbrev].append(label)
                by_label[label].append(abbrev)
                abbrev_index[abbrev].append(n)
                label_index[label].append(n)

        def dorder(thing, type=lambda v: v):
            return {k: type(v) for k, v in
                    sorted(thing.items(), key=lambda kv: kv[0].lower())}

        by_abbrev = dorder(by_abbrev, collections.Counter)
        by_label = dorder(by_label, collections.Counter)

        prob_a = {k: v for k, v in by_abbrev.items() if len(v) > 1}
        prob_l = {k: v for k, v in by_label.items() if len(v) > 1}
        pnorma = {k: dict(v) for k, v in prob_a.items()}
        assert pnorma == abbrev_ok, f'problem in abbrevs\n{pnorma}\n{abbrev_ok}'
        assert not prob_l, f'problem in labels {prob_l}'

        index_abbrev = dorder(abbrev_index, tuple)
        index_label = dorder(label_index, tuple)

        ia = sorted(set([(tuple(l), a, index_label[list(l)[0]], index_abbrev[a])
                         for a, l in by_abbrev.items()
                         if a not in abbrev_ok
                         and index_label[list(l)[0]] != index_abbrev[a]]))
        assert not ia, f'oops {ia}'
        il = sorted(set([(l, tuple(a), index_label[l], index_abbrev[list(a)[0]])
                         for l, a in by_label.items()
                         if list(a)[0] not in abbrev_ok
                         and index_label[l] != index_abbrev[list(a)[0]]]))
        assert not il, f'oops {il}'

        def paren_thing(label):
            if '(' in label:
                label_ws, pthing_cp = label.split('(', 1)
                return label_ws.strip(), int(pthing_cp.rstrip(')'))
            else:
                return label, None

        data_out = tuple((*paren_thing(label), list(abbrev)[0], index_label[label])
                         for label, abbrev in by_label.items())

        return data_out,

    @classmethod
    def validate(cls, d):
        return d
        if not line:
            continue

        try:
            abbrev, label = line.split(' ', 1)
        except ValueError as e:
            print(repr(line))
            print(repr(raw_text))
            raise e
            continue

        abbrev = clean(abbrev)
        label = clean(label)
        legends.append((abbrev, label))

    return legends


resources = auth.get_path('resources')
if resources is not None:  # FIXME TODO this is a bad way to handle this ...
    with open(resources / 'brainmaps-cat-abbrevs.html', 'rt') as f:
        dat = f.read()

    asoup = BeautifulSoup(dat, 'lxml')


class BermanSrc(resSource):
    run_ocr = False
    source_images = Path('~/files/cropped').expanduser()
    source = 'https://github.com/tgbugs/pyontutils.git'
    sourceFile = auth.get_path('resources') / 'berman'
    source_original = False
    artifact = Artifacts.BermanCat
def default(self):
    out_path = self.options.out_path
    BUILD = self.options.BUILD

    glb = Path(auth.get_path('git-local-base'))
    theme_repo = glb / 'org-html-themes'
    theme = theme_repo / 'setup/theme-readtheorg-local.setup'
    prepare_paths(BUILD, out_path, theme_repo, theme)

    doc_config = self._doc_config
    names = tuple(doc_config['repos']) + tuple(self.options.repo)  # TODO fetch if missing ?
    repo_paths = [(glb / name).resolve() for name in names]
    repos = [p.repo for p in repo_paths]
    skip_folders = doc_config.get('skip-folders', tuple())
    rskip = doc_config.get('skip', {})

    # TODO move this into run_all
    docstring_kwargs = makeDocstrings(BUILD, repo_paths, skip_folders, rskip)
    wd_docs_kwargs = [docstring_kwargs]
    if self.options.docstring_only:
        [kwargs.update({'theme': theme}) for _, _, kwargs in wd_docs_kwargs]
        outname, rendered = render_docs(wd_docs_kwargs, out_path,
                                        titles=None, n_jobs=1,
                                        debug=self.options.debug)[0]
        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)

        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)

        return

    et = tuple()
    wd_docs_kwargs += [(rp, rp / f, makeKwargs(rp, f))
                       for rp in repo_paths
                       for f in rp.repo.git.ls_files().split('\n')
                       if Path(f).suffix in suffixFuncs
                       and only(rp, f)
                       and noneMembers(f, *skip_folders)
                       and f not in rskip.get(rp.name, et)]

    [kwargs.update({'theme': theme}) for _, _, kwargs in wd_docs_kwargs]

    if self.options.spell:
        spell((f.as_posix() for _, f, _ in wd_docs_kwargs))
        return

    titles = doc_config['titles']
    outname_rendered = render_docs(wd_docs_kwargs, out_path, titles,
                                   self.options.jobs,
                                   debug=self.options.debug)

    index = [f'<b class="{heading}">{heading}</b>'
             for heading in doc_config['index']]

    _NOTITLE = object()
    for outname, rendered in outname_rendered:
        apath = outname.relative_to(self.options.out_path)
        title = titles.get(apath.as_posix(), _NOTITLE)  # TODO parse out/add titles
        if title is not None:
            value = (hfn.atag(apath)
                     if title is _NOTITLE else
                     hfn.atag(apath, title))
            index.append(value)

        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)

        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)

    lt = list(titles)

    def title_key(a):
        title = a.split('"')[1]
        if title not in lt:
            msg = (f'{title} missing from {self.options.config}')
            raise ValueError(msg)

        return lt.index(title)

    index_body = '<br>\n'.join(['<h1>Documentation Index</h1>'] +
                               sorted(index, key=title_key))
    with open((out_path / 'index.html').as_posix(), 'wt') as f:
        f.write(hfn.htmldoc(index_body, title=doc_config['title']))
#!/usr/bin/env python3.7
import tempfile
from pyontutils.config import auth
from augpathlib import RepoPath as Path
temp_path = Path(tempfile.tempdir)
_ddconf = auth.get_path('resources') / 'doc-config.yaml'
_ddpath = temp_path / 'build-ont-docs' / 'docs'
__doc__ = f"""Compile all documentation from git repos.

Usage:
    ont-docs [options] [--repo=<REPO>...]

Options:
    -h --help             show this
    -c --config=<PATH>    path to doc-index.yaml [default: {_ddconf}]
    -o --out-path=<PATH>  path inside which docs are built [default: {_ddpath}]
    -b --html-root=<REL>  relative path to the html root [default: ..]
    -s --spell            run hunspell on all docs
    -d --docstring-only   build docstrings only
    -j --jobs=NJOBS       number of jobs [default: 9]
    -r --repo=<REPO>      additional repos to crawl for docs
    --debug               redirect stderr to debug pipeline errors

"""
import os
import re
import ast
import shutil
import logging
import subprocess
from pathlib import PurePath
def swanson():
    """ not really a parcellation scheme
        NOTE: the defining information up here is now deprecated
        it is kept around to keep the code further down happy """

    source = auth.get_path('resources') / 'swanson_aligned.txt'
    ONT_PATH = 'http://ontology.neuinfo.org/NIF/ttl/generated/'
    filename = 'swanson_hierarchies'
    ontid = ONT_PATH + filename + '.ttl'
    PREFIXES = SwansonLabels.prefixes
    new_graph = makeGraph(filename, PREFIXES, writeloc='/tmp/')
    new_graph.add_ont(ontid,
                      'Swanson brain partomies',
                      'Swanson 2014 Partonomies',
                      'This file is automatically generated from ' + source.as_posix() + '.' + '**FIXME**',
                      'now')

    # FIXME citations should really go on the ... anatomy? scheme artifact
    definingCitation = 'Swanson, Larry W. Neuroanatomical Terminology: a lexicon of classical origins and historical foundations. Oxford University Press, USA, 2014.'
    definingCitationID = 'ISBN:9780195340624'
    new_graph.add_trip(ontid, 'NIFRID:definingCitation', definingCitation)
    new_graph.add_trip(ontid, 'NIFRID:definingCitationID', definingCitationID)

    with open(source, 'rt') as f:
        lines = [l.strip() for l in f.readlines()]

    # join header on page 794
    lines[635] += ' ' + lines.pop(636)
    # fix for capitalization since this header is reused
    fixed = ' or '.join([' ('.join([n.capitalize() for n in _.split(' (')])
                         for _ in lines[635].lower().split(' or ')]).replace('human', 'HUMAN')
    lines[635] = fixed

    data = []
    for l in lines:
        if not l.startswith('#'):
            level = l.count('.' * 5)
            l = l.strip('.')
            if ' (' in l:
                if ') or' in l:
                    n1, l = l.split(') or')
                    area_name, citationP = n1.strip().split(' (')
                    citation = citationP.rstrip(')')
                    d = (level, area_name, citation, 'NEXT SYN')
                    data.append(d)
                    #print(tc.red(tc.bold(repr(d))))

                area_name, citationP = l.strip().split(' (')
                citation = citationP.rstrip(')')
            else:
                area_name = l
                citation = None

            d = (level, area_name, citation, None)
            #print(d)
            data.append(d)

    results = Async()(deferred(sgv.findByTerm)(d[1]) for d in data)
    #results = [None] * len(data)
    curies = [[r['curie'] for r in _ if 'curie' in r and 'UBERON' in r['curie']]
              if _ else []
              for _ in results]
    output = [_[0] if _ else None for _ in curies]

    header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon']
    zoop = [header] + [r for r in zip(*zip(*data), output)] + \
        [(0, 'Appendix END None', None, None, None)]  # needed to add last appendix

    # TODO annotate the appendicies and the classes with these
    appendix_root_mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?

    class SP(rowParse):
        def __init__(self):
            self.nodes = defaultdict(dict)
            self._appendix = 0
            self.appendicies = {}
            self._last_at_level = {}
            self.names = defaultdict(set)
            self.children = defaultdict(set)
            self.parents = defaultdict(set)
            self.next_syn = False
            super().__init__(zoop)

        def Depth(self, value):
            if self.next_syn:
                self.synonym = self.next_syn
            else:
                self.synonym = False

            self.depth = value

        def Name(self, value):
            self.name = value

        def Citation(self, value):
            self.citation = value

        def NextSyn(self, value):
            if value:
                self.next_syn = self._rowind
            else:
                self.next_syn = False

        def Uberon(self, value):
            self.uberon = value

        def _row_post(self):
            # check if we are in the next appendix
            # may want to xref ids between appendicies as well...
            if self.depth == 0:
                if self.name.startswith('Appendix'):
                    if self._appendix:
                        self.appendicies[self._appendix]['children'] = dict(self.children)
                        self.appendicies[self._appendix]['parents'] = dict(self.parents)
                        self._last_at_level = {}
                        self.children = defaultdict(set)
                        self.parents = defaultdict(set)

                    _, num, apname = self.name.split(' ', 2)
                    if num == 'END':
                        return

                    self._appendix = int(num)
                    self.appendicies[self._appendix] = {
                        'name': apname.capitalize(),
                        'type': self.citation.capitalize() if self.citation else None}
                    return
                else:
                    if ' [' in self.name:
                        name, taxonB = self.name.split(' [')
                        self.name = name
                        self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize()
                    else:  # top level is animalia
                        self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize()

                    self.name = self.name.capitalize()
                    self.citation = self.citation.capitalize()

            # nodes
            if self.synonym:
                self.nodes[self.synonym]['synonym'] = self.name
                self.nodes[self.synonym]['syn-cite'] = self.citation
                self.nodes[self.synonym]['syn-uberon'] = self.uberon
                return
            else:
                if self.citation:  # Transverse Longitudinal etc all @ lvl4
                    self.names[self.name + ' ' + self.citation].add(self._rowind)
                else:
                    self.name += str(self._appendix) + self.nodes[self._last_at_level[self.depth - 1]]['label']
                    #print(level, self.name)

                # can't return here because they are their own level
                # replace with actually doing something...
                self.nodes[self._rowind]['label'] = self.name
                self.nodes[self._rowind]['citation'] = self.citation
                self.nodes[self._rowind]['uberon'] = self.uberon

            # edges
            self._last_at_level[self.depth] = self._rowind
            # TODO will need something to deal with the Lateral/
            if self.depth > 0:
                try:
                    parent = self._last_at_level[self.depth - 1]
                except:
                    breakpoint()
                self.children[parent].add(self._rowind)
                self.parents[self._rowind].add(parent)

        def _end(self):
            replace = {}
            for asdf in [sorted(n) for k, n in self.names.items() if len(n) > 1]:
                replace_with, to_replace = asdf[0], asdf[1:]
                for r in to_replace:
                    replace[r] = replace_with

            for r, rw in replace.items():
                #print(self.nodes[rw])
                o = self.nodes.pop(r)
                #print(o)

            for vals in self.appendicies.values():
                children = vals['children']
                parents = vals['parents']
                # need reversed so children are corrected before swap
                for r, rw in reversed(sorted(replace.items())):
                    if r in parents:
                        child = r
                        new_child = rw
                        parent = parents.pop(child)
                        parents[new_child] = parent
                        parent = list(parent)[0]
                        children[parent].remove(child)
                        children[parent].add(new_child)
                    if r in children:
                        parent = r
                        new_parent = rw
                        childs = children.pop(parent)
                        children[new_parent] = childs
                        for child in childs:
                            parents[child] = {new_parent}

            self.nodes = dict(self.nodes)

    sp = SP()
    tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label']
                             for n in sp.nodes.values()])]
    #print('\n'.join(tp))
    #print(sp.appendicies[1].keys())
    #print(sp.nodes[1].keys())

    nbase = PREFIXES['SWAN'] + '%s'
    json_ = {'nodes': [], 'edges': []}
    parent = ilxtr.swansonBrainRegionConcept

    og = OntGraph()
    for node, anns in sp.nodes.items():
        nid = nbase % node
        new_graph.add_class(nid, parent, label=anns['label'])
        new_graph.add_trip(nid, 'NIFRID:definingCitation', anns['citation'])
        json_['nodes'].append({'lbl': anns['label'], 'id': 'SWA:' + str(node)})
        #if anns['uberon']:
            #new_graph.add_trip(nid, owl.equivalentClass, anns['uberon'])  # issues arise here...
        [og.add(t) for t in map_term(rdflib.URIRef(nid), anns['label'], prefix='UBERON')]

    og.write(auth.get_path('ontology-local-repo') /
             'ttl/generated/swanson-uberon-mapping.ttl')

    #hrm = [(anns['label'], gn(anns['label'])) for node, anns in sp.nodes.items()]
    #ok = [(h, test, term_source(h, test)) for h, test in hrm if test]
    #notok = [h for h, test in hrm if not test]

    for appendix, data in sp.appendicies.items():
        aid = PREFIXES['SWAA'] + str(appendix)
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_trip(aid, 'ilxtr:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = 'swanr:hasPart' + str(appendix)
        apo = 'swanr:partOf' + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_restriction(pid, ahp, cid)  # note hierarchy inverts direction
                new_graph.add_restriction(cid, apo, pid)
                json_['edges'].append({'sub': 'SWA:' + str(child), 'pred': apo, 'obj': 'SWA:' + str(parent)})

    return new_graph
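# Hedged illustration (hypothetical line, not from swanson_aligned.txt): the
# parser in swanson() counts each run of five dots as one level of depth and
# expects "name (citation)" once the leading dots are stripped.
_example = '..........Cerebral cortex (Swanson 2004)'
_level = _example.count('.' * 5)
_name, _citation = _example.strip('.').split(' (')
assert (_level, _name, _citation.rstrip(')')) == (2, 'Cerebral cortex', 'Swanson 2004')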
class SwansonAppendix(resSource):
    sourceFile = auth.get_path('resources') / 'swanson_aligned.txt'
    artifact = Artifacts.SwansonAppendix