def children_pull(self, existing_caches=tuple(), only=tuple(), skip=tuple()):
    # FIXME this is really a recursive pull for organization level only ...
    sname = lambda gen: sorted(gen, key=lambda c: c.name)

    def refresh(c):
        updated = c.meta.updated
        newc = c.refresh()
        if newc is None:
            return

        nupdated = newc.meta.updated
        if nupdated != updated:
            return newc

    existing = sname(existing_caches)
    if not self._debug:
        skipexisting = {
            e.id: e
            for e in Async(rate=self._async_rate)(
                deferred(refresh)(e) for e in existing)
            if e is not None
        }
    else:  # debug ...
        skipexisting = {
            e.id: e
            for e in (refresh(e) for e in existing)
            if e is not None
        }

    # FIXME
    # in theory the remote could change between these two loops
    # since we currently cannot do a single atomic pull for
    # a set of remotes and have them refresh existing files
    # in one shot

    if not self._debug:
        yield from (
            rc for d in Async(rate=self._async_rate)(
                deferred(child.bootstrap)(recursive=True, only=only, skip=skip)
                for child in sname(self.children)
                #if child.id in skipexisting
                # TODO when datasets have an 'anything in me updated'
                # field then we can use that to skip things that haven't
                # changed (hello git ...)
            )
            for rc in d)
    else:  # debug
        yield from (
            rc for d in (child.bootstrap(recursive=True, only=only, skip=skip)
                         for child in sname(self.children))
            #if child.id in skipexisting
            # TODO when datasets have an 'anything in me updated'
            # field then we can use that to skip things that haven't
            # changed (hello git ...)
            for rc in d)
def test_mbf_header(self):
    test_id = 'N:dataset:bec4d335-9377-4863-9017-ecd01170f354'
    test_dataset = [d.cache for d in self.test_datasets if d.cache.id == test_id][0]
    if not list(test_dataset.local.children):
        rchilds = list(test_dataset.rchildren)
        xmls = [c for c in rchilds if c.suffix == '.xml']
        Async(rate=5)(deferred(x.fetch)() for x in xmls if not x.exists())
        #[x.fetch() for x in xmls if not x.exists()]
        local_xmls = [x.local for x in xmls]
    else:
        local_xmls = list(test_dataset.local.rglob('*.xml'))

    if any(p for p in local_xmls if not p.exists()):
        raise BaseException('unfetched children')

    embfs = [exml.ExtractXml(x) for x in local_xmls]
    d = embfs[0].asDict()
    blob = [e.asDict() for e in embfs]
    errors = [b.pop('errors') for b in blob if 'errors' in b]
    error_types = set(e['validator'] for es in errors for e in es)
    if export:
        with open('mbf-test.json', 'wt') as f:
            json.dump(blob, f, indent=2, cls=JEncode)

        with open('mbf-errors.json', 'wt') as f:
            json.dump(errors, f, indent=2, cls=JEncode)

    assert error_types == {'not'} or not error_types, f'unexpected error type! {error_types}'
def zap_deleted(get_annos):
    annos = get_annos()
    new_annos = get_annos.get_annos_from_api(len(annos), 200)
    n_deleted = len([a for a in new_annos if a in annos])
    print('there are', n_deleted, 'potentially deleted annotations')
    missing = []
    h = get_annos.h()

    def thing(id):
        return id, h.head_annotation(id).ok

    # work backwards to cull deleted annotations
    size = 500
    n_chunks = len(annos) // size
    for i, anno_chunk in enumerate(chunk_list(list(reversed(annos)), size)):
        if i < 10:
            continue

        print('chunk size', size, 'number', i + 1, 'of', n_chunks, 'found', len(missing))
        if len(missing) >= n_deleted:
            break

        responses = Async(25)(deferred(thing)(a.id) for a in anno_chunk)
        missing += [id for id, ok in responses if not ok]

    # TODO actually remove them
    embed()
def catalog_extras(fetch=False):
    path = Path(auth.get_path('ontology-local-repo'), 'ttl')
    cat = (path / 'catalog-v001.xml').as_posix()
    with open((path / '../catalog-extras').as_posix(), 'rt') as ce, open(cat, 'rt') as c:
        clines = c.readlines()
        celines = ce.readlines()

    if clines[-2] != celines[-1]:
        with open(cat, 'wt') as f:
            f.writelines(clines[:-1] + celines + clines[-1:])
    else:
        print(tc.blue('INFO:'), 'extras already added to catalog doing nothing')

    if fetch:
        print(tc.blue('INFO:'), 'fetching extras')

        def fetch_and_save(url, loc):
            resp = requests.get(url)
            saveloc = (path / loc).as_posix()
            if resp.ok:
                with open(saveloc, 'wb') as f:
                    f.write(resp.content)

                print(tc.blue('INFO:'), f'{url:<60} written to {loc}')
            else:
                print(tc.red('WARNING:'), f'failed to fetch {url}')

        Async()(deferred(fetch_and_save)(url, loc)
                for line in celines
                for _, _, _, url, _, loc, _ in (line.split('"'),))
def boost(func: Callable, kwargs_list: List[dict], batch_size: int = 3, rate: int = None) -> iter:
    """ Async boost for a function/method and a list of kwargs for that function/method.

    :param func: Function/Method to be asynchronously called.
    :param kwargs_list: Function/Method parameters for each call.
    :param batch_size: Batch size. Default 3.
    :param rate: Inner batch size. Auto set to max possible.
    :returns: Generator of responses from func.

    >>> from ontquery.plugins.services.interlex_client import InterLexClient
    >>> ilx_cli = InterLexClient(base_url='https://test3.scicrunch.org/api/1/')
    >>> kwargs_list = [{'label': 'Label 1', 'type': 'term'}, {'label': 'Label 2', 'type': 'term'}]
    >>> self.boost(ilx_cli.add_entity, kwargs_list)
    """
    # InterLex specific batch size range
    if batch_size > 25:
        batch_size = 25  # trust me; this is MAX. Any more freaks out the php workers.
    if batch_size < 3:
        batch_size = 3  # Any less than 3 and async isn't worth it.
    # Worker
    gin = lambda kwargs: func(**kwargs)
    # Builds futures dynamically
    results = []
    for step in range(0, len(kwargs_list), batch_size):
        print('Step ->', step)  # So you can restart from the last step.
        results += Async(rate=rate)(deferred(gin)(kwargs)
                                    for kwargs in kwargs_list[step:step + batch_size])
    return results
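# A minimal usage sketch for boost, assuming an InterLexClient instance as in the
# docstring above; the entity labels below are hypothetical placeholders.
def boost_usage_sketch():
    from ontquery.plugins.services.interlex_client import InterLexClient
    ilx_cli = InterLexClient(base_url='https://test3.scicrunch.org/api/1/')
    kwargs_list = [{'label': 'Example neuron', 'type': 'term'},
                   {'label': 'Example region', 'type': 'term'}]
    # boost slices kwargs_list into batches and runs each batch through Async/deferred
    return boost(ilx_cli.add_entity, kwargs_list, batch_size=3)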
def triples(self):
    self.iri = rdflib.URIRef(f'https://apinatomy.org/uris/models/{self.id}')
    yield self.iri, rdf.type, readable.Graph
    yield self.iri, readable.name, rdflib.Literal(self.name)
    yield self.iri, readable.abbreviation, rdflib.Literal(self.abbreviation)
    externals = []
    for id, blob in self.resources.items():
        if 'class' not in blob:
            logd.warning(f'no class in\n{blob!r} for {id}')
            continue
        elif blob['class'] == 'Graph':
            continue

        obj = getattr(self, blob['class'])(blob, self.context, self.label_suffix)
        if blob['class'] == 'External':
            # defer lookup
            externals.append(obj)
            continue

        yield from obj.triples()

    Async()(deferred(lambda x: x._term)(e) for e in externals)
    for e in externals:
        yield from e.triples()
def export_identifier_metadata(self, dump_path, dataset_blobs):
    if (self.latest and self.latest_id_met_path.exists()):
        blob_id_met = self.latest_id_met
    else:
        def fetch(id):  # FIXME error proof version ...
            try:
                metadata = id.metadata()
                metadata['id'] = id.identifier  # FIXME normalization ...
                return metadata
            except requests.exceptions.HTTPError as e:
                logd.error(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.SSLError) as e:
                log.error(e)

        # retrieve doi metadata and materialize it in the dataset
        _dois = set([
            idlib.Auto(id) if not isinstance(id, idlib.Stream) else id
            for blob in dataset_blobs
            for id in chain(
                adops.get(blob, ['meta', 'protocol_url_or_doi'], on_failure=[]),
                adops.get(blob, ['meta', 'originating_article_doi'], on_failure=[]),
                # TODO data["links"]?
                [blob['meta']['doi']])
            if id is not None
        ])

        dois = [d for d in _dois if isinstance(d, idlib.Doi)]
        metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
        bads = [{'id': d, 'reason': 'no metadata'}  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None]
        metadatas = [m for m in metadatas if m is not None]
        blob_id_met = {
            'id': 'identifier-metadata',  # TODO is this ok ?
            'identifier_metadata': metadatas,
            'errors': bads,
            'meta': {'count': len(metadatas)},
            'prov': {
                'timestamp_export_start': self.timestamp,
                'export_system_identifier': Path.sysid,
                'export_hostname': gethostname(),
                'export_project_path': self.export_source_path.cache.anchor,
            },
        }

    with open(dump_path / 'identifier-metadata.json', 'wt') as f:
        json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

    return blob_id_met
def searchSquares(squares):

    def fetch(s):
        return s, list(query(label=s.label))

    return {s: match
            for s, match in Async(rate=10)(deferred(fetch)(s) for s in squares)}
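# A minimal sketch of calling searchSquares, assuming query is an ontquery-style
# search callable already in scope; Square is a hypothetical stand-in for whatever
# labeled objects the caller passes in.
def searchSquares_usage_sketch():
    from collections import namedtuple
    Square = namedtuple('Square', 'label')
    squares = [Square('forebrain'), Square('hindbrain')]
    return searchSquares(squares)  # {square: [matching records], ...}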
def main():
    terms = Terms(readonly=False)
    sgd = Dynamic(cache=True)
    ol = sgd.prod_sparc_organList()
    ol['nodes']
    ids = [n['id'] for n in ol['nodes']]
    res = Async()(deferred(by_organ)(i, sgd) for i in ids)
    nodes = [n for o, r in res for n in r['nodes']]
    rows = [(o, n['id'], n['lbl'], '\n'.join(syn(n)), defn(n))
            for o, r in res
            for n in r['nodes']]
    terms.upsert(*rows)
    terms.commit()
def loadData(cls):
    """ corresponds to the list of FMA ids from organParts
        for all organs in the sparc organsList """
    g = OntGraph()
    g.namespace_manager.populate_from(uPREFIXES)  # cls._ghead except fma doesn't define FMA:
    ol = cls.sgd.prod_sparc_organList()
    top_ids = [n['id'] for n in ol['nodes']]
    res = Async()(deferred(by_organ)(i, cls.sgd) for i in top_ids)
    #res = [by_organ(i, cls.sgd) for i in top_ids]
    #res_stats(res)  # why are there dupes? now we know!
    nodes = [n for o, r in res for n in r['nodes']]
    ids_raw = set(n['id'] for n in nodes
                  if not n['id'].startswith('_:') and n['id'] != 'owl:Nothing')
    ids = set(g.namespace_manager.expand(id).toPython() for id in ids_raw)
    return ids_raw, ids
def get_itrips(self):
    results = self.get_scigraph_onts()
    iris = sorted(set(r['iri'] for r in results))
    gin = lambda i: (i, self.sgg.getNeighbors(i, relationshipType='isDefinedBy',
                                              direction='OUTGOING'))
    nodes = Async()(deferred(gin)(i) for i in iris)
    imports = [(i, *[(e['obj'], 'owl:imports', e['sub'])
                     for e in n['edges']])
               for i, n in nodes if n]
    self.itrips = sorted(set(tuple(rdflib.URIRef(OntId(e).iri) for e in t)
                             for i, *ts in imports if ts
                             for t in ts))
    return self.itrips
def make_rt(to_review_tuples, retired=retired):
    def inner(u, l, retired=retired):
        ne = sgg.getNeighbors(u, relationshipType="isDefinedBy", depth=1)
        if ne:
            curie = help_graph.qname(u)
            help_graph.g.add(
                (URIRef(u),
                 ilxtr.SciGraphLookup,
                 URIRef(f'http://scigraph.olympiangods.org/scigraph/graph/{curie}')))
        if ne and ne['edges']:
            src = ' '.join([f'<{e["obj"]}>' for e in ne["edges"]])
        elif u in retired:
            src = retfile
        else:
            src = '<>'

        return f'{u:<70} {l:<50} {src}'

    out = Async(rate=3000)(deferred(inner)(u, l)
                           for u, l in sorted(to_review_tuples, key=lambda a: a[-1]))
    return '\n'.join(out)
def counts(self):
    if not hasattr(self, '_counts'):
        size = 0
        dirs = 0
        files = 0
        need_meta = []
        if not self.is_dir():
            gen = self,
        else:
            gen = self.rchildren

        for c in gen:
            if c.is_dir():
                dirs += 1
            else:
                files += 1
                # testing for broken symlinks is hard
                try:
                    maybe_size = c.cache.meta.size
                except AttributeError as e:
                    log.error(f'no cache or no meta for {c}\n{e}')
                    continue

                if maybe_size is None:
                    need_meta.append(c)
                else:
                    size += maybe_size

        if need_meta and self._refresh_on_missing:
            nl = '\n'
            log.info(f'refreshing {len(need_meta)} files with missing metadata in {self}'
                     f'\n{nl.join(_.as_posix() for _ in need_meta)}')
            new_caches = Async(rate=self.rate)(deferred(c.cache.refresh)()
                                               for c in need_meta)
            for c in new_caches:  # FIXME first time around meta doesn't get updated ??
                if c is None:
                    continue  # file was deleted (logged previously)

                if c.meta is None:
                    log.critical(f'missing metadata! {c}')
                    continue

                size += c.meta.size

        self._counts = dict(size=FileSize(size), dirs=dirs, files=files)

    return self._counts
def getOnts():
    # generate everything from these two so that they stay up to date
    # http://help.brain-map.org/display/api/Atlas+Drawings+and+Ontologies
    func = lambda url: requests.get(url).json()['msg']
    query = 'http://api.brain-map.org/api/v2/data/query.json?criteria=model::{model}'
    models = 'Atlas', 'Ontology', 'ReferenceSpace'
    res = Async(rate=10)(deferred(func)(query.format(model=model))
                         for model in models)
    _Atlas, _Ontology, _ReferenceSpace = res
    # FIXME looks like this API changed
    onts = {o['id']: o for o in _Ontology}
    refs = {r['id']: r for r in _ReferenceSpace}
    refs[None] = None
    onts[None] = None
    want_onts = set()
    # ontology metadata
    for at in _Atlas:
        at['name']
        at['description']
        ref = refs[at['reference_space_id']]
        try:
            ont = onts[at['structure_graph_id']]
        except KeyError as e:
            ont = dict(id=at['structure_graph_id'], organism_id=2)
            print('hey guys, could you please fix this missing ont?', e)

        if ont:
            want_onts.add(ont['id'])

        if ont and ref:
            assert ont['organism_id'] == ref['organism_id'], \
                f"\n{ont['organism_id']}\n{ref['organism_id']}"

    have_atlases = set(o['id'] for o in onts.values() if o and o['has_atlas'])
    try:
        assert want_onts == have_atlases, f'\n{sorted(want_onts)}\n{sorted(have_atlases)}'
    except AssertionError as e:
        print('needs more attention', e)

    for oid in want_onts:
        try:
            ont = onts[oid]
        except KeyError:  # FIXME
            continue

        ont['name']
        ont['description']
        ont['id']
def test_rate(self):
    out = Async(rate=10)(deferred(lambda a: a)('lol') for _ in range(10))
def test_fast(self):
    out = Async()(deferred(lambda a: a)('lol') for _ in range(1000))
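# The tests above exercise the calling convention used throughout these examples:
# deferred(f)(*args) packages a call without executing it, and Async(rate=...) runs
# the packaged calls concurrently and returns their results. A minimal sketch under
# those assumptions, with a hypothetical fetch_label worker; sgv is assumed to be a
# SciGraph vocabulary client as in the other examples here.
def async_deferred_usage_sketch():
    def fetch_label(curie):
        # one unit of work per identifier; return the input so results can be keyed
        return curie, sgv.findById(curie)

    results = Async(rate=10)(deferred(fetch_label)(c)
                             for c in ('UBERON:0000955', 'UBERON:0002240'))
    return dict(results)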
def swanson():
    """ not really a parcellation scheme
        NOTE: the defining information up here is now deprecated
        it is kept around to keep the code further down happy """

    source = auth.get_path('resources') / 'swanson_aligned.txt'
    ONT_PATH = 'http://ontology.neuinfo.org/NIF/ttl/generated/'
    filename = 'swanson_hierarchies'
    ontid = ONT_PATH + filename + '.ttl'
    PREFIXES = SwansonLabels.prefixes
    new_graph = makeGraph(filename, PREFIXES, writeloc='/tmp/')
    new_graph.add_ont(ontid,
                      'Swanson brain partomies',
                      'Swanson 2014 Partonomies',
                      'This file is automatically generated from ' + source.as_posix() + '.' + '**FIXME**',
                      'now')
    # FIXME citations should really go on the ... anatomy? scheme artifact
    definingCitation = 'Swanson, Larry W. Neuroanatomical Terminology: a lexicon of classical origins and historical foundations. Oxford University Press, USA, 2014.'
    definingCitationID = 'ISBN:9780195340624'
    new_graph.add_trip(ontid, 'NIFRID:definingCitation', definingCitation)
    new_graph.add_trip(ontid, 'NIFRID:definingCitationID', definingCitationID)

    with open(source, 'rt') as f:
        lines = [l.strip() for l in f.readlines()]

    # join header on page 794
    lines[635] += ' ' + lines.pop(636)
    # fix for capitalization since this header is reused
    fixed = ' or '.join([' ('.join([n.capitalize() for n in _.split(' (')])
                         for _ in lines[635].lower().split(' or ')]).replace('human', 'HUMAN')
    lines[635] = fixed

    data = []
    for l in lines:
        if not l.startswith('#'):
            level = l.count('.' * 5)
            l = l.strip('.')
            if ' (' in l:
                if ') or' in l:
                    n1, l = l.split(') or')
                    area_name, citationP = n1.strip().split(' (')
                    citation = citationP.rstrip(')')
                    d = (level, area_name, citation, 'NEXT SYN')
                    data.append(d)
                    #print(tc.red(tc.bold(repr(d))))

                area_name, citationP = l.strip().split(' (')
                citation = citationP.rstrip(')')
            else:
                area_name = l
                citation = None

            d = (level, area_name, citation, None)
            #print(d)
            data.append(d)

    results = Async()(deferred(sgv.findByTerm)(d[1]) for d in data)
    #results = [None] * len(data)
    curies = [[r['curie'] for r in _ if 'curie' in r and 'UBERON' in r['curie']]
              if _ else []
              for _ in results]
    output = [_[0] if _ else None for _ in curies]

    header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon']
    zoop = [header] + [r for r in zip(*zip(*data), output)] + \
        [(0, 'Appendix END None', None, None, None)]  # needed to add last appendix

    # TODO annotate the appendicies and the classes with these
    appendix_root_mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?

    class SP(rowParse):
        def __init__(self):
            self.nodes = defaultdict(dict)
            self._appendix = 0
            self.appendicies = {}
            self._last_at_level = {}
            self.names = defaultdict(set)
            self.children = defaultdict(set)
            self.parents = defaultdict(set)
            self.next_syn = False
            super().__init__(zoop)

        def Depth(self, value):
            if self.next_syn:
                self.synonym = self.next_syn
            else:
                self.synonym = False
            self.depth = value

        def Name(self, value):
            self.name = value

        def Citation(self, value):
            self.citation = value

        def NextSyn(self, value):
            if value:
                self.next_syn = self._rowind
            else:
                self.next_syn = False

        def Uberon(self, value):
            self.uberon = value

        def _row_post(self):
            # check if we are in the next appendix
            # may want to xref ids between appendicies as well...
            if self.depth == 0:
                if self.name.startswith('Appendix'):
                    if self._appendix:
                        self.appendicies[self._appendix]['children'] = dict(self.children)
                        self.appendicies[self._appendix]['parents'] = dict(self.parents)
                        self._last_at_level = {}
                        self.children = defaultdict(set)
                        self.parents = defaultdict(set)

                    _, num, apname = self.name.split(' ', 2)
                    if num == 'END':
                        return

                    self._appendix = int(num)
                    self.appendicies[self._appendix] = {
                        'name': apname.capitalize(),
                        'type': self.citation.capitalize() if self.citation else None}
                    return
                else:
                    if ' [' in self.name:
                        name, taxonB = self.name.split(' [')
                        self.name = name
                        self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize()
                    else:  # top level is animalia
                        self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize()

                    self.name = self.name.capitalize()
                    self.citation = self.citation.capitalize()

            # nodes
            if self.synonym:
                self.nodes[self.synonym]['synonym'] = self.name
                self.nodes[self.synonym]['syn-cite'] = self.citation
                self.nodes[self.synonym]['syn-uberon'] = self.uberon
                return
            else:
                if self.citation:  # Transverse Longitudinal etc all @ lvl4
                    self.names[self.name + ' ' + self.citation].add(self._rowind)
                else:
                    self.name += str(self._appendix) + self.nodes[self._last_at_level[self.depth - 1]]['label']
                    #print(level, self.name)

                # can't return here because they are their own level
                # replace with actually doing something...
                self.nodes[self._rowind]['label'] = self.name
                self.nodes[self._rowind]['citation'] = self.citation
                self.nodes[self._rowind]['uberon'] = self.uberon

            # edges
            self._last_at_level[self.depth] = self._rowind
            # TODO will need something to deal with the Lateral/
            if self.depth > 0:
                try:
                    parent = self._last_at_level[self.depth - 1]
                except:
                    breakpoint()

                self.children[parent].add(self._rowind)
                self.parents[self._rowind].add(parent)

        def _end(self):
            replace = {}
            for asdf in [sorted(n) for k, n in self.names.items() if len(n) > 1]:
                replace_with, to_replace = asdf[0], asdf[1:]
                for r in to_replace:
                    replace[r] = replace_with

            for r, rw in replace.items():
                #print(self.nodes[rw])
                o = self.nodes.pop(r)
                #print(o)

            for vals in self.appendicies.values():
                children = vals['children']
                parents = vals['parents']
                # need reversed so children are corrected before swap
                for r, rw in reversed(sorted(replace.items())):
                    if r in parents:
                        child = r
                        new_child = rw
                        parent = parents.pop(child)
                        parents[new_child] = parent
                        parent = list(parent)[0]
                        children[parent].remove(child)
                        children[parent].add(new_child)
                    if r in children:
                        parent = r
                        new_parent = rw
                        childs = children.pop(parent)
                        children[new_parent] = childs
                        for child in childs:
                            parents[child] = {new_parent}

            self.nodes = dict(self.nodes)

    sp = SP()
    tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label']
                             for n in sp.nodes.values()])]
    #print('\n'.join(tp))
    #print(sp.appendicies[1].keys())
    #print(sp.nodes[1].keys())
    nbase = PREFIXES['SWAN'] + '%s'
    json_ = {'nodes': [], 'edges': []}
    parent = ilxtr.swansonBrainRegionConcept

    og = OntGraph()
    for node, anns in sp.nodes.items():
        nid = nbase % node
        new_graph.add_class(nid, parent, label=anns['label'])
        new_graph.add_trip(nid, 'NIFRID:definingCitation', anns['citation'])
        json_['nodes'].append({'lbl': anns['label'], 'id': 'SWA:' + str(node)})
        #if anns['uberon']:
            #new_graph.add_trip(nid, owl.equivalentClass, anns['uberon'])  # issues arise here...
        [og.add(t) for t in
         map_term(rdflib.URIRef(nid), anns['label'], prefix='UBERON')]

    og.write(auth.get_path('ontology-local-repo') /
             'ttl/generated/swanson-uberon-mapping.ttl')

    #hrm = [(anns['label'], gn(anns['label'])) for node, anns in sp.nodes.items()]
    #ok = [(h, test, term_source(h, test)) for h, test in hrm if test]
    #notok = [h for h, test in hrm if not test]

    for appendix, data in sp.appendicies.items():
        aid = PREFIXES['SWAA'] + str(appendix)
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_trip(aid, 'ilxtr:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = 'swanr:hasPart' + str(appendix)
        apo = 'swanr:partOf' + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                # note hierarchy inverts direction
                new_graph.add_restriction(pid, ahp, cid)
                new_graph.add_restriction(cid, apo, pid)
                json_['edges'].append({'sub': 'SWA:' + str(child),
                                       'pred': apo,
                                       'obj': 'SWA:' + str(parent)})

    return new_graph
def export_identifier_metadata(self, dump_path, latest_path, dataset_blobs):
    latest_id_met_path = latest_path / self.id_metadata
    if (self.latest and latest_id_met_path.exists()):
        with open(latest_id_met_path, 'rt') as f:
            blob_id_met = json.load(f)
    else:
        import requests

        def fetch(id):  # FIXME error proof version ...
            try:
                metadata = id.metadata()
                metadata['id'] = id
                return metadata
            except (requests.exceptions.HTTPError, idlib.exc.RemoteError) as e:
                logd.error(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.SSLError,
                    idlib.exc.ResolutionError) as e:
                log.error(e)

        def autoid_report_error(id, blob):
            try:
                return idlib.Auto(id)
            except idlib.exc.MalformedIdentifierError as e:
                msg = f'{blob["id"]} bad id: {id}'
                logd.error(msg)
                return None

        # retrieve doi metadata and materialize it in the dataset
        _dois = set([
            id if isinstance(id, idlib.Stream) else
            (fromJson(id) if isinstance(id, dict) else autoid_report_error(id, blob))
            for blob in dataset_blobs
            for id in chain(
                adops.get(blob, ['meta', 'protocol_url_or_doi'], on_failure=[]),
                adops.get(blob, ['meta', 'originating_article_doi'], on_failure=[]),
                # TODO data["links"]?
                [blob['meta']['doi']] if 'doi' in blob['meta'] else [])
            if id is not None
        ])

        dois = [d for d in _dois if isinstance(d, idlib.Doi)]
        metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
        bads = [{'id': d, 'reason': 'no metadata'}  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None]
        metadatas = [m for m in metadatas if m is not None]
        blob_id_met = {
            'id': 'identifier-metadata',  # TODO is this ok ?
            'identifier_metadata': metadatas,
            'errors': bads,
            'meta': {'count': len(metadatas)},
            'prov': {
                'timestamp_export_start': self.timestamp,
                'export_system_identifier': Path.sysid,
                'export_hostname': gethostname(),
                'export_project_path': self.export_source_path.cache.anchor,
            },
        }

    with open(dump_path / self.id_metadata, 'wt') as f:
        json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

    return blob_id_met
def url_blaster(urls, rate, timeout=5, verbose=False, debug=False,
                method='head', fail=False, negative=False, ok_test=lambda r: r.ok):
    shuffle(urls)  # try to distribute timeout events evenly across workers
    if verbose:
        [print(u) for u in sorted(urls)]

    class Timedout:
        ok = False

        def __init__(self, url):
            self.url = url

    r_method = getattr(requests, method)

    def method_timeout(url, _method=r_method):
        try:
            return _method(url, timeout=timeout)
        except (requests.ConnectTimeout, requests.ReadTimeout) as e:
            print('Timedout:', url, e)
            return Timedout(url)

    s = time()
    collector = [] if debug else None
    all_ = Async(rate=rate, debug=verbose, collector=collector)(
        deferred(method_timeout)(url) for url in urls)
    o = time()
    not_ok = [_.url for _ in all_ if not ok_test(_)]
    d = o - s
    print(f'Actual time: {d} Effective rate: {len(urls) / d}Hz '
          f'diff: {(len(urls) / d) / rate if rate else 1}')
    print('Failed:')
    if not_ok:
        for nok in not_ok:
            print(nok)

        ln = len(not_ok)
        lt = len(urls)
        lo = lt - ln
        msg = f'{ln} urls out of {lt} ({ln / lt * 100:2.2f}%) are not ok. D:'
        print(msg)  # always print to get around joblib issues
        if negative and fail:
            if len(not_ok) == len(all_):
                raise AssertionError('Everything failed!')
        elif fail:
            raise AssertionError(f'{msg}\n' + '\n'.join(sorted(not_ok)))
    else:
        print(f'OK. All {len(urls)} urls passed! :D')

    if debug:
        from matplotlib.pyplot import plot, savefig, figure, show, legend, title
        from collections import defaultdict

        def asyncVis(collector):
            by_thread = defaultdict(lambda: [[], [], [], [], [], [], [], []])
            min_ = 0
            for thread, job, start, target_stop, stop, time_per_job, p, i, d in sorted(collector):
                if not min_:
                    min_ = stop

                by_thread[thread][0].append(job)
                #by_thread[thread][1].append(start - min_)
                by_thread[thread][2].append(target_stop - stop)
                by_thread[thread][3].append(stop - min_)
                by_thread[thread][4].append(time_per_job)
                by_thread[thread][5].append(p)
                by_thread[thread][6].append(i)
                by_thread[thread][7].append(d)

            for thread, (job, y1, y2, y3, y4, y5, y6, y7) in by_thread.items():
                figure()
                title(str(thread))
                plot(job, [0] * len(job), 'r-')
                #plot(job, y1, label=f'stop')
                plot(job, y2, label=f'early by')
                #plot(job, y3, label=f'stop')
                #plot(job, y4, label=f'time per job')  # now constant...
                plot(job, y5, label='P')
                plot(job, y6, label='I')
                plot(job, y7, label='D')
                legend()

            show()

        asyncVis(collector)
        breakpoint()
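# A minimal sketch of driving url_blaster, assuming the caller supplies the url
# list and rate; the example URLs are hypothetical placeholders.
def url_blaster_usage_sketch():
    urls = ['https://example.org/a', 'https://example.org/b']
    # issue HEAD requests at ~20 Hz and raise AssertionError if any url is not ok
    url_blaster(list(urls), rate=20, timeout=5, method='head', fail=True)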
def test_rate_empty(self):
    out = Async(rate=20)(deferred(lambda a: a)('lol') for _ in range(0))
def render(pred, root, direction=None, depth=10, local_filepath=None,
           branch='master', restriction=False, wgb='FIXME', local=False,
           verbose=False, flatten=False,):
    kwargs = {'local': local, 'verbose': verbose}
    prov = makeProv(pred, root, wgb)
    if local_filepath is not None:
        github_link = ('https://github.com/SciCrunch/NIF-Ontology/raw/'
                       f'{branch}/{local_filepath}')
        prov.append('<link rel="http://www.w3.org/ns/prov#wasDerivedFrom" '
                    f'href="{github_link}">')
        graph = graphFromGithub(github_link, verbose)
        qname = graph.namespace_manager._qhrm  # FIXME
        labels_index = {qname(s): str(o) for s, o in graph[:rdfs.label:]}
        if pred == 'subClassOf':
            pred = 'rdfs:subClassOf'  # FIXME qname properly?
        elif pred == 'subPropertyOf':
            pred = 'rdfs:subPropertyOf'

        try:
            kwargs['json'] = graph.asOboGraph(pred, restriction=restriction)
            kwargs['prefixes'] = {k: str(v) for k, v in graph.namespace_manager}
        except KeyError as e:
            if verbose:
                log.error(str(e))
            return abort(422, 'Unknown predicate.')
    else:
        kwargs['graph'] = sgg
        # FIXME this does not work for a generic scigraph load ...
        # and it should not be calculated every time anyway!
        # oh look, here we are needing a class again
        if False:
            versionIRI = [
                e['obj']
                for e in sgg.getNeighbors('http://ontology.neuinfo.org/'
                                          'NIF/ttl/nif.ttl')['edges']
                if e['pred'] == 'versionIRI'][0]
            #print(versionIRI)
            prov.append('<link rel="http://www.w3.org/ns/prov#wasDerivedFrom" '
                        f'href="{versionIRI}">')  # FIXME wrong and wont resolve

        prov.append('<meta name="representation" content="SciGraph">')  # FIXME :/

    kwargs['html_head'] = prov
    try:
        if root.startswith('http'):  # FIXME this codepath is completely busted?
            if 'prefixes' in kwargs:
                rec = None
                for k, v in kwargs.items():
                    if root.startswith(v):
                        rec = k + 'r:' + root.strip(v)  # FIXME what?!
                        break

                if rec is None:
                    raise KeyError(f'no prefix found for {root}')
            else:
                rec = sgv.findById(root)

            if 'curie' in rec:
                root_curie = rec['curie']
                # FIXME https://github.com/SciGraph/SciGraph/issues/268
                if not root_curie.endswith(':') and '/' not in root_curie:
                    root = root_curie
                else:
                    kwargs['curie'] = root_curie
        elif 'prefixes' not in kwargs and root.endswith(':'):
            kwargs['curie'] = root
            root = sgc._curies[root.rstrip(':')]  # also 268

        tree, extras = creatTree(*Query(root, pred, direction, depth), **kwargs)
        dematerialize(list(tree.keys())[0], tree)
        if flatten:
            if local_filepath is not None:
                def safe_find(n):
                    return {'labels': [labels_index[n]],
                            'deprecated': False  # FIXME inaccurate
                            }
            else:
                def safe_find(n):  # FIXME scigraph bug
                    if n.endswith(':'):
                        n = sgc._curies[n.rstrip(':')]
                    elif '/' in n:
                        prefix, suffix = n.split(':')
                        iriprefix = sgc._curies[prefix]
                        n = iriprefix + suffix

                    return sgv.findById(n)

            out = set(n for n in flatten_tree(extras.hierarchy))

            try:
                lrecs = Async()(deferred(safe_find)(n) for n in out)
            except RuntimeError:
                asyncio.set_event_loop(current_app.config['loop'])
                lrecs = Async()(deferred(safe_find)(n) for n in out)

            rows = sorted(((r['labels'][0] if r['labels'] else '') + ',' + n
                           for r, n in zip(lrecs, out)
                           # FIXME still stuff wrong, but better for non cache case
                           if not r['deprecated']),
                          key=lambda lid: lid.lower())
            return '\n'.join(rows), 200, {'Content-Type': 'text/plain;charset=utf-8'}
        else:
            return hfn.htmldoc(extras.html, other=prov, styles=hfn.tree_styles)

    except (KeyError, TypeError) as e:
        if verbose:
            log.error(f'{type(e)} {e}')

        if sgg.getNode(root):
            # FIXME distinguish these cases...
            message = 'Unknown predicate or no results.'
        elif 'json' in kwargs:
            message = 'Unknown root.'
            r = graph.namespace_manager.expand(root)
            for s in graph.subjects():
                if r == s:
                    message = ('No results. '
                               'You are querying a ttl file directly, '
                               'did you remember to set ?restriction=true?')
                    break
        else:
            message = 'Unknown root.'

        return abort(422, message)
def would_you_like_to_know_more_question_mark():

    # resolving differences between classes
    more_ids = set((
        'http://uri.neuinfo.org/nif/nifstd/readable/ChEBIid',
        'http://uri.neuinfo.org/nif/nifstd/readable/GOid',
        'http://uri.neuinfo.org/nif/nifstd/readable/MeshUid',
        'http://uri.neuinfo.org/nif/nifstd/readable/PMID',
        'http://uri.neuinfo.org/nif/nifstd/readable/UmlsCui',
        'http://uri.neuinfo.org/nif/nifstd/readable/bamsID',
        'http://uri.neuinfo.org/nif/nifstd/readable/bonfireID',
        'http://uri.neuinfo.org/nif/nifstd/readable/cell_ontology_ID',
        'http://uri.neuinfo.org/nif/nifstd/readable/definingCitationID',
        'http://uri.neuinfo.org/nif/nifstd/readable/definingCitationURI',
        'http://uri.neuinfo.org/nif/nifstd/readable/emapMouseStageDataID',
        'http://uri.neuinfo.org/nif/nifstd/readable/emapMouseStageDiagramID',
        'http://uri.neuinfo.org/nif/nifstd/readable/externalSourceId',
        'http://uri.neuinfo.org/nif/nifstd/readable/externalSourceURI',
        'http://uri.neuinfo.org/nif/nifstd/readable/gbifID',
        'http://uri.neuinfo.org/nif/nifstd/readable/gbifTaxonKeyID',
        'http://uri.neuinfo.org/nif/nifstd/readable/gene_Ontology_ID',
        #'http://uri.neuinfo.org/nif/nifstd/readable/hasExternalSource',
        'http://uri.neuinfo.org/nif/nifstd/readable/hasGenbankAccessionNumber',
        'http://uri.neuinfo.org/nif/nifstd/readable/imsrStandardStrainName',
        'http://uri.neuinfo.org/nif/nifstd/readable/isReplacedByClass',
        'http://uri.neuinfo.org/nif/nifstd/readable/jaxMiceID',
        'http://uri.neuinfo.org/nif/nifstd/readable/ncbiTaxID',
        'http://uri.neuinfo.org/nif/nifstd/readable/neuronamesID',
        'http://uri.neuinfo.org/nif/nifstd/readable/nifID',
        'http://uri.neuinfo.org/nif/nifstd/readable/sao_ID',
        'http://uri.neuinfo.org/nif/nifstd/readable/umls_ID',
        'http://www.geneontology.org/formats/oboInOwl#id',
    ))

    outside = []
    eee = {}
    resolver_not_ilx_only_but_not_in_scigraph = set()  # resources.ttl
    _res = Graph().parse((gitf / 'NIF-Ontology/ttl/resources.ttl').as_posix(), format='turtle')
    reslookup = {uri: [l] for uri, l in _res.subject_objects(rdfs.label)}
    for uri in chain(h_uris, resolver_not_ilx_only):
        if 'uri.neuinfo.org' in uri:
            try:
                meta = sgg.getNode(uri.toPython())['nodes'][0]['meta']
                asdf = {hng.qname(k): v for k, v in meta.items() if k in more_ids}
            except TypeError:
                resolver_not_ilx_only_but_not_in_scigraph.add(uri)  # resources.ttl ;)
                if uri in reslookup:  # no differentia
                    asdf = False
                else:
                    asdf = False
                    print('WTF', uri)

            if asdf:
                #print(uri, asdf)
                eee[uri] = asdf
                for l in asdf.values():
                    for e in l:
                        outside.append(e)

    outside_dupes = [v for v, c in Counter(outside).most_common() if c > 1]
    eee_dupes = {k: v for k, v in eee.items()
                 if anyMembers(outside_dupes, *(e for l in v.values() for e in l))}

    #for uri, meta in sorted(eee_dupes.items(), key=lambda a: sorted(a[1].values())):
        #print(uri.toPython(), sorted((e.replace('PMID: ', 'PMID:'), k)
                                      #for k, l in meta.items() for e in l))

    # attempt to deal with label mappings
    iexisting = defaultdict(set)
    iiexisting = {}
    for i, existing in zip(datal('ilx'), datal('iri')):
        #if 'uri.neuinfo.org' in existing:
        if 'interlex.org' not in existing and 'neurolex.org' not in existing:
            iexisting[i].add(URIRef(existing))
            iiexisting[URIRef(existing)] = i

    iexisting = {**iexisting}

    _ilabs = {k: l for k, l in zip(datal('ilx'), datal('label'))}

    def inner(iri):
        resp = sgv.findById(iri)
        if resp is not None:
            l = resp['labels']
        else:
            l = []  #_ilabs[iiexisting[iri]] + '** already in ilx **']
            #print('trouble?', iri)  # ilx only

        return iri, l

    #labs = {k: v[0] if v else '<--NO-LABEL-->'
    #        for k, v in Async()(deferred(inner)(id_)
    #                            for id_ in chain(h_uris,
    #                                             (e for s in iexisting.values() for e in s)))}
    labs = {k: v[0] if v else '<--NO-LABEL-->'
            for k, v in Async()(deferred(inner)(id_) for id_ in h_uris)}
    ilabs = {k: l.lower() for k, l in zip(datal('ilx'), datal('label'))}
    iilabs = {v: k for k, v in ilabs.items()}
    assert len(ilabs) == len(iilabs)
    missing_map = {k: iilabs[v.lower()] for k, v in labs.items()
                   if v and v.lower() in iilabs}  # XXX this is not valid

    missing_existing = {i: [m, *iexisting[i]] for m, i in missing_map.items() if i in iexisting}

    missing_equivs = {next(iter(iexisting[i])): i for m, i in missing_map.items() if i in iexisting}

    eid = NIFRID.externalSourceId.toPython()
    ded = owl.deprecated.toPython()
    # SP: -> swissprot vs uniprot
    mmr = []
    proto_mmr_1_to_1 = {}
    arrr = defaultdict(set)
    uniprot_iuphar = set()
    for uri, ilx_frag in {**missing_equivs, **missing_map}.items():
        uri = URIRef(uri)
        try:
            meta = sgg.getNode(uri.toPython())['nodes'][0]['meta']
        except TypeError:
            # just ignore these, they are ilx only :/
            meta = {}

        if eid in meta:
            src = meta[eid][0]
            if src.startswith('SP:'):
                src = tc.yellow(src.replace('SP:', 'http://www.uniprot.org/uniprot/'))
            #elif src.startswith('IUPHAR:'):
                #pass
            #else:
                #src = 'TODO'
        elif ded in meta and meta[ded]:
            src = tc.red('ded ')
        else:
            src = 'TODO'

        val = labs[uri] if uri in labs else _ilabs[ilx_frag] + ' **'

        if uri in eee:
            differentia = str(eee[uri])
            for v in eee[uri].values():
                for e in v:
                    arrr[e].add(uri)
                    if 'SP:' in e or 'IUPHAR:' in e:
                        uniprot_iuphar.add(uri)
        else:
            differentia = ''

        if uri in _ilx and uri in all_uris:
            ruri = SGG[hng.qname(uri)]
            ruri = tc.blue(f'{ruri:<60}')
        else:
            ruri = uri
            ruri = f'{ruri:<60}'

        v = ' '.join((f'{val:<60}', src, ruri, ilxb[ilx_frag], differentia))
        mmr.append(v)
        proto_mmr_1_to_1[uri] = v
        src = None

    arrr = {**arrr}
    arrr_not_1_to_1 = {k: v for k, v in arrr.items() if len(v) > 1}
    #arrr_n11_uris = set((u.toPython() for v in arrr_not_1_to_1.values() for u in v))
    arrr_n11_uris = set.union(*arrr_not_1_to_1.values())
    mmr_1_to_1 = {k: v for k, v in proto_mmr_1_to_1.items() if k not in arrr_n11_uris}
    no_uniprot = {k: v for k, v in proto_mmr_1_to_1.items() if k not in uniprot_iuphar}
    arrr_n11_text = '\n'.join(f'{k:<15} {sorted(_.toPython() for _ in v)}'
                              for k, v in arrr_not_1_to_1.items())
    mmr.sort()
    mmr_text = '\n'.join(mmr)
    mmr_1_to_1_text = '\n'.join(sorted(mmr_1_to_1.values()))
    no_uniprot_text = '\n'.join(sorted(no_uniprot.values()))